import os import re # ---------- 在第一个点前插入后缀(例:base.dict.yaml -> base.pro.dict.yaml) ---------- def add_suffix_before_extensions(filename: str, suffix: str) -> str: if not suffix: return filename i = filename.find('.') return (filename + suffix) if i == -1 else (filename[:i] + suffix + filename[i:]) # ========== 1) 从“单个 aux 文件”加载 字 -> 辅助码段列表 ========== # 行格式:字;段1;段2;... (保留空段,不偏移;段内逗号原样保留) def load_aux_table(aux_file_path): if not os.path.isfile(aux_file_path): raise FileNotFoundError(f"aux 文件不存在:{aux_file_path}") aux_map = {} print(f'加载辅助码表文件: {os.path.basename(aux_file_path)}') with open(aux_file_path, 'r', encoding='utf-8') as f: for raw in f: line = raw.strip() if not line or line.startswith('#'): continue parts = line.split('\t') if len(parts) < 2: continue ch = parts[0] aux_list = parts[1].split(';') # 保留空串占位(分号才是边界) aux_map[ch] = aux_list return aux_map # ========== 2) 区间选择(严格:第 N 段 = aux_list[N];N 从 1 起)========== # 不处理逗号:分号窗口原样拼接 def select_aux_segment(aux_list, start_idx, end_idx=None): if not aux_list: return '' s = max(1, start_idx) e = end_idx if end_idx is not None else len(aux_list) e = max(s, min(e, len(aux_list))) window = aux_list[s:e] # 允许空段 return ''.join(window) if window else '' DIGIT_RE = re.compile(r'^\d+$') # ========== 3) 处理单个词库(流式;空也占位“拼音;”)========== def process_file_for_range_streaming(in_file, out_file, aux_map, start_idx, end_idx, sep=';'): os.makedirs(os.path.dirname(out_file), exist_ok=True) try: fin = open(in_file, 'r', encoding='utf-8') except Exception as e: print(f'读取失败 {in_file}: {e}') return try: fout = open(out_file, 'w', encoding='utf-8') except Exception as e: fin.close() print(f'写入失败 {out_file}: {e}') return passthrough_set = { "的\td\t1000", "了\tl\t999", "吗\tm\t999", "吧\tb\t999", } processing = False for line in fin: if not processing: fout.write(line) if '...' in line: processing = True continue raw = line.rstrip('\n') if (not raw) or raw.lstrip().startswith('#'): fout.write(line) continue parts = raw.split('\t') if len(parts) == 1: fout.write(line) continue han = parts[0] col2 = parts[1] if len(parts) > 1 else '' col3 = parts[2] if len(parts) > 2 else '' col4 = parts[3] if len(parts) > 3 else '' # 第二列若是频率(全数字),挪到第三列 if DIGIT_RE.fullmatch(col2 or ''): col3, col2 = col2, '' # 特定行直通 if raw.strip() in passthrough_set: fout.write(raw + '\n') continue pinyins = col2.split(' ') if col2 else [] if len(pinyins) != len(han): warn = f"# 警告: 拼音数与字数不匹配({in_file}) => {raw}" print(warn) fout.write(warn + '\n') continue _get = aux_map.get new_cols = [] for i, ch in enumerate(han): aux_list = _get(ch) piece = select_aux_segment(aux_list, start_idx, end_idx) if aux_list is not None else '' new_cols.append(pinyins[i] + sep + piece) # 空也占位:拼音;片段 new_col2 = ' '.join(new_cols) if col4: fout.write(f"{han}\t{new_col2}\t{col3}\t{col4}\n" if col3 else f"{han}\t{new_col2}\t\t{col4}\n") else: fout.write(f"{han}\t{new_col2}\t{col3}\n" if col3 else f"{han}\t{new_col2}\n") fin.close() fout.close() print(f'已处理: {out_file}') # ========== 4) 扫目录 + 六套区间(按白名单)========== def process_batch(input_dir, aux_file_path, base_out_dir, index_mapping, files_whitelist=None, sep=';', output_suffix=""): aux_map = load_aux_table(aux_file_path) print(f'已加载辅助码条目:{len(aux_map)}') # 收集要处理的文件 to_process = [] for entry in os.scandir(input_dir): if not entry.is_file(): continue name = entry.name if files_whitelist and name not in files_whitelist: continue if not (name.endswith('.yaml') or name.endswith('.yml') or name.endswith('.txt')): continue to_process.append(entry.path) if not to_process: print("输入目录内没有匹配文件") return for s_idx, e_idx, subdir in index_mapping: out_dir = os.path.join(base_out_dir, subdir) os.makedirs(out_dir, exist_ok=True) print(f'\n=== 区间 ({s_idx}, {e_idx}) → {subdir} ===') for in_file in to_process: fn = os.path.basename(in_file) out_name = add_suffix_before_extensions(fn, output_suffix) out_file = os.path.join(out_dir, out_name) process_file_for_range_streaming(in_file, out_file, aux_map, s_idx, e_idx, sep=sep) # ========== 5) 入口 ========== if __name__ == '__main__': # 六套区间(第 N 段,从 1 起) index_mapping = [ (1, 2, "pro-moqi-fuzhu-dicts"), (2, 3, "pro-flypy-fuzhu-dicts"), (3, 4, "pro-zrm-fuzhu-dicts"), (4, 5, "pro-tiger-fuzhu-dicts"), (5, 6, "pro-wubi-fuzhu-dicts"), (6, 7, "pro-hanxin-fuzhu-dicts"), (7, None, "pro-shouyou-fuzhu-dicts"), ] # 路径 AUX_FILE = "custom/aux_code.txt" # ← 单个 aux 文件 INPUT_DIR = "dicts" # ← 词库文件夹 OUT_ROOT = "." # ← 输出根目录 # 仅处理这些文件 FILES = [ "jichu.dict.yaml", "zi.dict.yaml", "duoyin.dict.yaml", "cuoyin.dict.yaml", "diming.dict.yaml", "shici.dict.yaml", "lianxiang.dict.yaml", "renming.dict.yaml", "wuzhong.dict.yaml", "shuxue.dict.yaml", "dikuang.dict.yaml", "wu-hua-sheng-yi-yao.dict.yaml", ] # 输出文件在第一个点前插这个后缀(如 ".pro";设为空串则不加) OUTPUT_SUFFIX = ".pro" process_batch( INPUT_DIR, AUX_FILE, OUT_ROOT, index_mapping, files_whitelist=FILES, sep=';', output_suffix=OUTPUT_SUFFIX )