Files
rime_wanxiang/.github/workflows/scripts/aux_go.py
2026-01-21 17:43:36 +08:00

198 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
# ---------- Insert a suffix before the first dot: base.dict.yaml -> base.pro.dict.yaml ----------
def add_suffix_before_extensions(filename: str, suffix: str) -> str:
    """Return *filename* with *suffix* inserted immediately before its first dot.

    An empty *suffix* returns *filename* unchanged. When *filename* contains
    no dot at all, *suffix* is simply appended to the end.
    """
    if suffix:
        # partition() yields (filename, '', '') when there is no dot, so the
        # concatenation below degrades to "filename + suffix" in that case.
        stem, dot, extensions = filename.partition('.')
        return stem + suffix + dot + extensions
    return filename
# ========== 1) Load the "character -> auxiliary-code segments" table from a single aux file ==========
# Line format: char<TAB>;seg1;seg2;...  (empty segments are kept so positions never shift;
# commas inside a segment are left untouched)
def load_aux_table(aux_file_path):
    """Parse the aux file at *aux_file_path* into ``{char: [segment, ...]}``.

    Each useful line is ``char<TAB>codes``; ``codes`` is split on ``;`` and
    every piece, including empty ones, is kept so that list positions stay
    stable. Blank lines, lines starting with ``#`` and lines without a TAB
    are skipped. Only the second TAB-separated field is used. When a
    character appears more than once, the last occurrence wins.

    Raises:
        FileNotFoundError: if *aux_file_path* is not an existing regular file.
    """
    if not os.path.isfile(aux_file_path):
        raise FileNotFoundError(f"aux 文件不存在:{aux_file_path}")
    aux_map = {}
    print(f'加载辅助码表文件: {os.path.basename(aux_file_path)}')
    # utf-8-sig transparently drops a leading BOM (and reads BOM-less files
    # exactly like utf-8), so the first key is never polluted with '\ufeff'.
    with open(aux_file_path, 'r', encoding='utf-8-sig') as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('\t')
            if len(parts) < 2:
                continue
            # Semicolons are the only segment boundary; empty strings are
            # kept as placeholders.
            aux_map[parts[0]] = parts[1].split(';')
    return aux_map
# ========== 2) Range selection (strict: segment N is aux_list[N], N counted from 1) ==========
# Commas are not interpreted: the selected semicolon-delimited pieces are concatenated as-is.
def select_aux_segment(aux_list, start_idx, end_idx=None):
    """Concatenate the pieces of *aux_list* in the half-open window [start, end).

    *start_idx* is raised to 1 when smaller. *end_idx* defaults to the list
    length, is capped at the list length, and is never allowed below the
    effective start, in which case the window is empty. A falsy *aux_list*
    yields ``''``. Empty pieces inside the window are allowed and contribute
    nothing to the result.
    """
    if not aux_list:
        return ''
    total = len(aux_list)
    lo = start_idx if start_idx > 1 else 1
    hi = total if end_idx is None else min(end_idx, total)
    if hi < lo:
        hi = lo
    # Joining an empty slice already gives '', so no separate empty check is needed.
    return ''.join(aux_list[lo:hi])
DIGIT_RE = re.compile(r'^\d+$')
# ========== 3) Process one dictionary file (streaming; an empty code still yields "pinyin;") ==========
def process_file_for_range_streaming(in_file, out_file, aux_map, start_idx, end_idx, sep=';'):
    """Rewrite dictionary *in_file* into *out_file*, appending aux codes to each pinyin.

    Lines are copied verbatim up to and including the first line containing
    ``...``. After that, each TAB-separated entry ``text<TAB>pinyin[<TAB>col3
    [<TAB>col4]]`` has every space-separated pinyin syllable rewritten as
    ``syllable + sep + code``. ``code`` is the window ``[start_idx, end_idx)``
    taken from ``aux_map[char]`` via ``select_aux_segment``, or ``''`` when
    the character is not in *aux_map*. Columns beyond the fourth are dropped.

    Blank lines, ``#`` comment lines, lines without a TAB and the fixed
    passthrough lines are copied unchanged. An entry whose syllable count
    differs from its character count is replaced by a ``#`` warning line,
    which is also printed.

    A failure to open either file is reported with ``print`` and the function
    returns without raising. Returns ``None``.
    """
    # dirname is '' for a bare filename, and os.makedirs('') raises, so only
    # create the directory when there is one to create.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    try:
        fin = open(in_file, 'r', encoding='utf-8')
    except Exception as e:
        print(f'读取失败 {in_file}: {e}')
        return
    # The with-blocks guarantee both handles are closed even if an exception
    # (e.g. a decode error) interrupts the loop.
    with fin:
        try:
            fout = open(out_file, 'w', encoding='utf-8')
        except Exception as e:
            print(f'写入失败 {out_file}: {e}')
            return
        with fout:
            passthrough_set = {
                "\td\t1000",
                "\tl\t999",
                "\tm\t999",
                "\tb\t999",
            }
            _get = aux_map.get
            processing = False
            for line in fin:
                if not processing:
                    # Still in the header: copy it, and switch to entry mode
                    # once the line containing '...' has been written.
                    fout.write(line)
                    if '...' in line:
                        processing = True
                    continue
                raw = line.rstrip('\n')
                if (not raw) or raw.lstrip().startswith('#'):
                    fout.write(line)
                    continue
                parts = raw.split('\t')
                if len(parts) == 1:
                    fout.write(line)
                    continue
                # Pad so that missing trailing columns read as ''.
                han, col2, col3, col4 = (parts + ['', '', ''])[:4]
                # If the second column is a frequency (all digits), move it to the third.
                if DIGIT_RE.fullmatch(col2):
                    col3, col2 = col2, ''
                # Specific lines pass straight through. The entries begin with
                # a TAB, which a full strip() would remove, so the line is also
                # compared with only its right side stripped; the previous
                # strip() comparison is kept for compatibility.
                if raw.rstrip() in passthrough_set or raw.strip() in passthrough_set:
                    fout.write(raw + '\n')
                    continue
                pinyins = col2.split(' ') if col2 else []
                if len(pinyins) != len(han):
                    warn = f"# 警告: 拼音数与字数不匹配({in_file}) => {raw}"
                    print(warn)
                    fout.write(warn + '\n')
                    continue
                new_cols = []
                for ch, syllable in zip(han, pinyins):
                    aux_list = _get(ch)
                    piece = '' if aux_list is None else select_aux_segment(aux_list, start_idx, end_idx)
                    # The separator is emitted even when the code is empty: "pinyin;".
                    new_cols.append(syllable + sep + piece)
                fields = [han, ' '.join(new_cols)]
                if col4:
                    # Keep column 3 (possibly empty) so column 4 stays in place.
                    fields += [col3, col4]
                elif col3:
                    fields.append(col3)
                fout.write('\t'.join(fields) + '\n')
    print(f'已处理: {out_file}')
# ========== 4) Scan the directory and run every configured range (whitelist-filtered) ==========
def process_batch(input_dir, aux_file_path, base_out_dir, index_mapping, files_whitelist=None,
                  sep=';', output_suffix=""):
    """Convert every matching dictionary in *input_dir* once per range in *index_mapping*.

    The aux table is loaded once from *aux_file_path*. Regular files directly
    inside *input_dir* are selected when they are in *files_whitelist* (if a
    non-empty whitelist is given) and end in ``.yaml``, ``.yml`` or ``.txt``.
    For each ``(start, end, subdir)`` in *index_mapping*, every selected file
    is written to ``base_out_dir/subdir`` under its own name with
    *output_suffix* inserted before the first dot.

    If nothing matches, a message is printed and the function returns early.
    """
    aux_map = load_aux_table(aux_file_path)
    print(f'已加载辅助码条目:{len(aux_map)}')

    # Collect the files to process.
    accepted_extensions = ('.yaml', '.yml', '.txt')
    to_process = [
        entry.path
        for entry in os.scandir(input_dir)
        if entry.is_file()
        and not (files_whitelist and entry.name not in files_whitelist)
        and entry.name.endswith(accepted_extensions)
    ]
    if not to_process:
        print("输入目录内没有匹配文件")
        return

    for s_idx, e_idx, subdir in index_mapping:
        out_dir = os.path.join(base_out_dir, subdir)
        os.makedirs(out_dir, exist_ok=True)
        print(f'\n=== 区间 ({s_idx}, {e_idx}) → {subdir} ===')
        for in_file in to_process:
            out_name = add_suffix_before_extensions(os.path.basename(in_file), output_suffix)
            process_file_for_range_streaming(
                in_file, os.path.join(out_dir, out_name), aux_map, s_idx, e_idx, sep=sep
            )
# ========== 5) Entry point ==========
if __name__ == '__main__':
    # Output folder of each auxiliary-code scheme, listed in segment order:
    # the N-th folder (N counted from 1) receives segment N of the aux list.
    scheme_dirs = [
        "pro-moqi-fuzhu-dicts",
        "pro-flypy-fuzhu-dicts",
        "pro-zrm-fuzhu-dicts",
        "pro-tiger-fuzhu-dicts",
        "pro-wubi-fuzhu-dicts",
        "pro-hanxin-fuzhu-dicts",
        "pro-shouyou-fuzhu-dicts",
    ]
    last_segment = len(scheme_dirs)
    # (start, end, subdir) per scheme: the window is [N, N + 1), except for the
    # final scheme, whose end is None, i.e. it runs to the end of the list.
    index_mapping = [
        (n, n + 1 if n < last_segment else None, subdir)
        for n, subdir in enumerate(scheme_dirs, start=1)
    ]

    # Only these dictionary files are processed.
    dict_files = [
        "jichu.dict.yaml",
        "zi.dict.yaml",
        "duoyin.dict.yaml",
        "cuoyin.dict.yaml",
        "diming.dict.yaml",
        "shici.dict.yaml",
        "lianxiang.dict.yaml",
        "renming.dict.yaml",
        "wuzhong.dict.yaml",
        "shuxue.dict.yaml",
        "dikuang.dict.yaml",
        "wu-hua-sheng-yi-yao.dict.yaml",
    ]

    process_batch(
        "dicts",                # folder holding the source dictionaries
        "custom/aux_code.txt",  # the single aux-code file
        ".",                    # root under which the per-scheme folders are created
        index_mapping,
        files_whitelist=dict_files,
        sep=';',
        # Inserted before the first dot of each output file name
        # (an empty string would add nothing).
        output_suffix=".pro",
    )