mirror of
https://github.com/d0zingcat/rime_wanxiang.git
synced 2026-05-13 23:16:54 +00:00
198 lines
6.7 KiB
Python
198 lines
6.7 KiB
Python
import os
|
||
import re
|
||
|
||
# ---------- 在第一个点前插入后缀(例:base.dict.yaml -> base.pro.dict.yaml) ----------
|
||
def add_suffix_before_extensions(filename: str, suffix: str) -> str:
|
||
if not suffix:
|
||
return filename
|
||
i = filename.find('.')
|
||
return (filename + suffix) if i == -1 else (filename[:i] + suffix + filename[i:])
|
||
|
||
# ========== 1) 从“单个 aux 文件”加载 字 -> 辅助码段列表 ==========
|
||
# 行格式:字<TAB>;段1;段2;... (保留空段,不偏移;段内逗号原样保留)
|
||
def load_aux_table(aux_file_path):
|
||
if not os.path.isfile(aux_file_path):
|
||
raise FileNotFoundError(f"aux 文件不存在:{aux_file_path}")
|
||
aux_map = {}
|
||
print(f'加载辅助码表文件: {os.path.basename(aux_file_path)}')
|
||
with open(aux_file_path, 'r', encoding='utf-8') as f:
|
||
for raw in f:
|
||
line = raw.strip()
|
||
if not line or line.startswith('#'):
|
||
continue
|
||
parts = line.split('\t')
|
||
if len(parts) < 2:
|
||
continue
|
||
ch = parts[0]
|
||
aux_list = parts[1].split(';') # 保留空串占位(分号才是边界)
|
||
aux_map[ch] = aux_list
|
||
return aux_map
|
||
|
||
# ========== 2) 区间选择(严格:第 N 段 = aux_list[N];N 从 1 起)==========
|
||
# 不处理逗号:分号窗口原样拼接
|
||
def select_aux_segment(aux_list, start_idx, end_idx=None):
|
||
if not aux_list:
|
||
return ''
|
||
s = max(1, start_idx)
|
||
e = end_idx if end_idx is not None else len(aux_list)
|
||
e = max(s, min(e, len(aux_list)))
|
||
window = aux_list[s:e] # 允许空段
|
||
return ''.join(window) if window else ''
|
||
|
||
DIGIT_RE = re.compile(r'^\d+$')
|
||
|
||
# ========== 3) 处理单个词库(流式;空也占位“拼音;”)==========
|
||
def process_file_for_range_streaming(in_file, out_file, aux_map, start_idx, end_idx, sep=';'):
|
||
os.makedirs(os.path.dirname(out_file), exist_ok=True)
|
||
try:
|
||
fin = open(in_file, 'r', encoding='utf-8')
|
||
except Exception as e:
|
||
print(f'读取失败 {in_file}: {e}')
|
||
return
|
||
try:
|
||
fout = open(out_file, 'w', encoding='utf-8')
|
||
except Exception as e:
|
||
fin.close()
|
||
print(f'写入失败 {out_file}: {e}')
|
||
return
|
||
|
||
passthrough_set = {
|
||
"的\td\t1000",
|
||
"了\tl\t999",
|
||
"吗\tm\t999",
|
||
"吧\tb\t999",
|
||
}
|
||
|
||
processing = False
|
||
for line in fin:
|
||
if not processing:
|
||
fout.write(line)
|
||
if '...' in line:
|
||
processing = True
|
||
continue
|
||
|
||
raw = line.rstrip('\n')
|
||
if (not raw) or raw.lstrip().startswith('#'):
|
||
fout.write(line)
|
||
continue
|
||
|
||
parts = raw.split('\t')
|
||
if len(parts) == 1:
|
||
fout.write(line)
|
||
continue
|
||
|
||
han = parts[0]
|
||
col2 = parts[1] if len(parts) > 1 else ''
|
||
col3 = parts[2] if len(parts) > 2 else ''
|
||
col4 = parts[3] if len(parts) > 3 else ''
|
||
|
||
# 第二列若是频率(全数字),挪到第三列
|
||
if DIGIT_RE.fullmatch(col2 or ''):
|
||
col3, col2 = col2, ''
|
||
|
||
# 特定行直通
|
||
if raw.strip() in passthrough_set:
|
||
fout.write(raw + '\n')
|
||
continue
|
||
|
||
pinyins = col2.split(' ') if col2 else []
|
||
if len(pinyins) != len(han):
|
||
warn = f"# 警告: 拼音数与字数不匹配({in_file}) => {raw}"
|
||
print(warn)
|
||
fout.write(warn + '\n')
|
||
continue
|
||
|
||
_get = aux_map.get
|
||
new_cols = []
|
||
for i, ch in enumerate(han):
|
||
aux_list = _get(ch)
|
||
piece = select_aux_segment(aux_list, start_idx, end_idx) if aux_list is not None else ''
|
||
new_cols.append(pinyins[i] + sep + piece) # 空也占位:拼音;片段
|
||
|
||
new_col2 = ' '.join(new_cols)
|
||
if col4:
|
||
fout.write(f"{han}\t{new_col2}\t{col3}\t{col4}\n" if col3 else f"{han}\t{new_col2}\t\t{col4}\n")
|
||
else:
|
||
fout.write(f"{han}\t{new_col2}\t{col3}\n" if col3 else f"{han}\t{new_col2}\n")
|
||
|
||
fin.close()
|
||
fout.close()
|
||
print(f'已处理: {out_file}')
|
||
|
||
# ========== 4) 扫目录 + 六套区间(按白名单)==========
|
||
def process_batch(input_dir, aux_file_path, base_out_dir, index_mapping, files_whitelist=None,
|
||
sep=';', output_suffix=""):
|
||
aux_map = load_aux_table(aux_file_path)
|
||
print(f'已加载辅助码条目:{len(aux_map)}')
|
||
|
||
# 收集要处理的文件
|
||
to_process = []
|
||
for entry in os.scandir(input_dir):
|
||
if not entry.is_file():
|
||
continue
|
||
name = entry.name
|
||
if files_whitelist and name not in files_whitelist:
|
||
continue
|
||
if not (name.endswith('.yaml') or name.endswith('.yml') or name.endswith('.txt')):
|
||
continue
|
||
to_process.append(entry.path)
|
||
|
||
if not to_process:
|
||
print("输入目录内没有匹配文件")
|
||
return
|
||
|
||
for s_idx, e_idx, subdir in index_mapping:
|
||
out_dir = os.path.join(base_out_dir, subdir)
|
||
os.makedirs(out_dir, exist_ok=True)
|
||
print(f'\n=== 区间 ({s_idx}, {e_idx}) → {subdir} ===')
|
||
for in_file in to_process:
|
||
fn = os.path.basename(in_file)
|
||
out_name = add_suffix_before_extensions(fn, output_suffix)
|
||
out_file = os.path.join(out_dir, out_name)
|
||
process_file_for_range_streaming(in_file, out_file, aux_map, s_idx, e_idx, sep=sep)
|
||
|
||
# ========== 5) 入口 ==========
|
||
if __name__ == '__main__':
|
||
# 六套区间(第 N 段,从 1 起)
|
||
index_mapping = [
|
||
(1, 2, "pro-moqi-fuzhu-dicts"),
|
||
(2, 3, "pro-flypy-fuzhu-dicts"),
|
||
(3, 4, "pro-zrm-fuzhu-dicts"),
|
||
(4, 5, "pro-tiger-fuzhu-dicts"),
|
||
(5, 6, "pro-wubi-fuzhu-dicts"),
|
||
(6, 7, "pro-hanxin-fuzhu-dicts"),
|
||
(7, None, "pro-shouyou-fuzhu-dicts"),
|
||
]
|
||
|
||
# 路径
|
||
AUX_FILE = "custom/aux_code.txt" # ← 单个 aux 文件
|
||
INPUT_DIR = "dicts" # ← 词库文件夹
|
||
OUT_ROOT = "." # ← 输出根目录
|
||
|
||
# 仅处理这些文件
|
||
FILES = [
|
||
"jichu.dict.yaml",
|
||
"zi.dict.yaml",
|
||
"duoyin.dict.yaml",
|
||
"cuoyin.dict.yaml",
|
||
"diming.dict.yaml",
|
||
"shici.dict.yaml",
|
||
"lianxiang.dict.yaml",
|
||
"renming.dict.yaml",
|
||
"wuzhong.dict.yaml",
|
||
"shuxue.dict.yaml",
|
||
"dikuang.dict.yaml",
|
||
"wu-hua-sheng-yi-yao.dict.yaml",
|
||
]
|
||
|
||
# 输出文件在第一个点前插这个后缀(如 ".pro";设为空串则不加)
|
||
OUTPUT_SUFFIX = ".pro"
|
||
|
||
process_batch(
|
||
INPUT_DIR, AUX_FILE, OUT_ROOT,
|
||
index_mapping,
|
||
files_whitelist=FILES,
|
||
sep=';',
|
||
output_suffix=OUTPUT_SUFFIX
|
||
)
|