chore:新的根节点

This commit is contained in:
amzxyz
2026-01-21 17:43:36 +08:00
commit 274c2e8d00
94 changed files with 3245054 additions and 0 deletions

197
.github/workflows/scripts/aux_go.py vendored Normal file
View File

@@ -0,0 +1,197 @@
import os
import re
# ---------- 在第一个点前插入后缀base.dict.yaml -> base.pro.dict.yaml ----------
def add_suffix_before_extensions(filename: str, suffix: str) -> str:
if not suffix:
return filename
i = filename.find('.')
return (filename + suffix) if i == -1 else (filename[:i] + suffix + filename[i:])
# ========== 1) 从“单个 aux 文件”加载 字 -> 辅助码段列表 ==========
# 行格式:字<TAB>;段1;段2;... (保留空段,不偏移;段内逗号原样保留)
def load_aux_table(aux_file_path):
if not os.path.isfile(aux_file_path):
raise FileNotFoundError(f"aux 文件不存在:{aux_file_path}")
aux_map = {}
print(f'加载辅助码表文件: {os.path.basename(aux_file_path)}')
with open(aux_file_path, 'r', encoding='utf-8') as f:
for raw in f:
line = raw.strip()
if not line or line.startswith('#'):
continue
parts = line.split('\t')
if len(parts) < 2:
continue
ch = parts[0]
aux_list = parts[1].split(';') # 保留空串占位(分号才是边界)
aux_map[ch] = aux_list
return aux_map
# ========== 2) 区间选择(严格:第 N 段 = aux_list[N]N 从 1 起)==========
# 不处理逗号:分号窗口原样拼接
def select_aux_segment(aux_list, start_idx, end_idx=None):
if not aux_list:
return ''
s = max(1, start_idx)
e = end_idx if end_idx is not None else len(aux_list)
e = max(s, min(e, len(aux_list)))
window = aux_list[s:e] # 允许空段
return ''.join(window) if window else ''
DIGIT_RE = re.compile(r'^\d+$')
# ========== 3) 处理单个词库(流式;空也占位“拼音;”)==========
def process_file_for_range_streaming(in_file, out_file, aux_map, start_idx, end_idx, sep=';'):
os.makedirs(os.path.dirname(out_file), exist_ok=True)
try:
fin = open(in_file, 'r', encoding='utf-8')
except Exception as e:
print(f'读取失败 {in_file}: {e}')
return
try:
fout = open(out_file, 'w', encoding='utf-8')
except Exception as e:
fin.close()
print(f'写入失败 {out_file}: {e}')
return
passthrough_set = {
"\td\t1000",
"\tl\t999",
"\tm\t999",
"\tb\t999",
}
processing = False
for line in fin:
if not processing:
fout.write(line)
if '...' in line:
processing = True
continue
raw = line.rstrip('\n')
if (not raw) or raw.lstrip().startswith('#'):
fout.write(line)
continue
parts = raw.split('\t')
if len(parts) == 1:
fout.write(line)
continue
han = parts[0]
col2 = parts[1] if len(parts) > 1 else ''
col3 = parts[2] if len(parts) > 2 else ''
col4 = parts[3] if len(parts) > 3 else ''
# 第二列若是频率(全数字),挪到第三列
if DIGIT_RE.fullmatch(col2 or ''):
col3, col2 = col2, ''
# 特定行直通
if raw.strip() in passthrough_set:
fout.write(raw + '\n')
continue
pinyins = col2.split(' ') if col2 else []
if len(pinyins) != len(han):
warn = f"# 警告: 拼音数与字数不匹配({in_file}) => {raw}"
print(warn)
fout.write(warn + '\n')
continue
_get = aux_map.get
new_cols = []
for i, ch in enumerate(han):
aux_list = _get(ch)
piece = select_aux_segment(aux_list, start_idx, end_idx) if aux_list is not None else ''
new_cols.append(pinyins[i] + sep + piece) # 空也占位:拼音;片段
new_col2 = ' '.join(new_cols)
if col4:
fout.write(f"{han}\t{new_col2}\t{col3}\t{col4}\n" if col3 else f"{han}\t{new_col2}\t\t{col4}\n")
else:
fout.write(f"{han}\t{new_col2}\t{col3}\n" if col3 else f"{han}\t{new_col2}\n")
fin.close()
fout.close()
print(f'已处理: {out_file}')
# ========== 4) 扫目录 + 六套区间(按白名单)==========
def process_batch(input_dir, aux_file_path, base_out_dir, index_mapping, files_whitelist=None,
sep=';', output_suffix=""):
aux_map = load_aux_table(aux_file_path)
print(f'已加载辅助码条目:{len(aux_map)}')
# 收集要处理的文件
to_process = []
for entry in os.scandir(input_dir):
if not entry.is_file():
continue
name = entry.name
if files_whitelist and name not in files_whitelist:
continue
if not (name.endswith('.yaml') or name.endswith('.yml') or name.endswith('.txt')):
continue
to_process.append(entry.path)
if not to_process:
print("输入目录内没有匹配文件")
return
for s_idx, e_idx, subdir in index_mapping:
out_dir = os.path.join(base_out_dir, subdir)
os.makedirs(out_dir, exist_ok=True)
print(f'\n=== 区间 ({s_idx}, {e_idx}) → {subdir} ===')
for in_file in to_process:
fn = os.path.basename(in_file)
out_name = add_suffix_before_extensions(fn, output_suffix)
out_file = os.path.join(out_dir, out_name)
process_file_for_range_streaming(in_file, out_file, aux_map, s_idx, e_idx, sep=sep)
# ========== 5) 入口 ==========
if __name__ == '__main__':
# 六套区间(第 N 段,从 1 起)
index_mapping = [
(1, 2, "pro-moqi-fuzhu-dicts"),
(2, 3, "pro-flypy-fuzhu-dicts"),
(3, 4, "pro-zrm-fuzhu-dicts"),
(4, 5, "pro-tiger-fuzhu-dicts"),
(5, 6, "pro-wubi-fuzhu-dicts"),
(6, 7, "pro-hanxin-fuzhu-dicts"),
(7, None, "pro-shouyou-fuzhu-dicts"),
]
# 路径
AUX_FILE = "custom/aux_code.txt" # ← 单个 aux 文件
INPUT_DIR = "dicts" # ← 词库文件夹
OUT_ROOT = "." # ← 输出根目录
# 仅处理这些文件
FILES = [
"jichu.dict.yaml",
"zi.dict.yaml",
"duoyin.dict.yaml",
"cuoyin.dict.yaml",
"diming.dict.yaml",
"shici.dict.yaml",
"lianxiang.dict.yaml",
"renming.dict.yaml",
"wuzhong.dict.yaml",
"shuxue.dict.yaml",
"dikuang.dict.yaml",
"wu-hua-sheng-yi-yao.dict.yaml",
]
# 输出文件在第一个点前插这个后缀(如 ".pro";设为空串则不加)
OUTPUT_SUFFIX = ".pro"
process_batch(
INPUT_DIR, AUX_FILE, OUT_ROOT,
index_mapping,
files_whitelist=FILES,
sep=';',
output_suffix=OUTPUT_SUFFIX
)