mirror of
https://github.com/d0zingcat/rime_wanxiang.git
synced 2026-05-13 23:16:54 +00:00
553 lines
21 KiB
Lua
553 lines
21 KiB
Lua
--[[
|
||
super_replacer.lua 一个rime OpenCC替代品,更灵活地配置能力
|
||
https://github.com/amzxyz/rime_wanxiang
|
||
by amzxyz
|
||
路径检测:UserDir > SharedDir
|
||
支持 option: true (常驻启用)
|
||
super_replacer:
|
||
db_name: lua/replacer
|
||
delimiter: "|"
|
||
comment_format: "〔%s〕"
|
||
chain: true #true表示流水线作业,上一个option产出交给下一个处理,典型的s2t>t2hk=s2hk,false就是并行,直接用text转换
|
||
types:
|
||
# 场景1:输入 '哈哈' -> 变成 '1.哈哈 2.😄'
|
||
- option: emoji # 开关名称与上面开关名称保持一致
|
||
mode: append # 新增候选append 替换原候选replace 替换注释comment
|
||
comment_mode: none # 注释模式: "append"(原候选注释继承), "text"(原候选文本放在注释), "none"(空,默认)
|
||
tags: [abc] # 生效的tag
|
||
prefix: "_em_" # 前缀用于区分同一个数据库的不同用途数据
|
||
files:
|
||
- lua/data/emoji.txt
|
||
# 场景2:输入 'hello' -> 显示 'hello 〔你好 | 哈喽〕'
|
||
- option: chinese_english
|
||
mode: append # <--- 添加注释模式
|
||
comment_mode: none
|
||
tags: [abc]
|
||
prefix: "_en_"
|
||
files:
|
||
- lua/data/english_chinese.txt
|
||
- lua/data/chinese_english.txt
|
||
# 场景3:用于常驻的直接替换 option: true
|
||
- option: true
|
||
mode: append # <--- 新增候选模式
|
||
comment_mode: none
|
||
tags: [abc]
|
||
prefix: "_ot_"
|
||
files:
|
||
- lua/data/others.txt
|
||
# 场景4:用于简繁转换的直接替换
|
||
- option: [ s2t, s2hk, s2tw ] #后面依赖这条流水线有一个开关为true这条流水线就能工作
|
||
mode: replace # <--- 替换原候选模式
|
||
comment_mode: append
|
||
sentence: true # <--- 句子级别替换
|
||
tags: [abc]
|
||
prefix: "_s2t_"
|
||
files:
|
||
- lua/data/STCharacters.txt
|
||
- lua/data/STPhrases.txt
|
||
- option: s2hk
|
||
mode: replace # <--- 替换原候选模式
|
||
comment_mode: append
|
||
sentence: true # <--- 句子级别替换
|
||
tags: [abc]
|
||
prefix: "_s2hk_"
|
||
files:
|
||
- lua/data/HKVariants.txt
|
||
- lua/data/HKVariantsRevPhrases.txt
|
||
- option: s2tw
|
||
mode: replace # <--- 替换原候选模式
|
||
comment_mode: append
|
||
sentence: true # <--- 句子级别替换
|
||
tags: [abc]
|
||
prefix: "_s2tw_"
|
||
files:
|
||
- lua/data/TWVariants.txt
|
||
- lua/data/TWVariantsRevPhrases.txt
|
||
- option: [ abbrev_lazy, abbrev_always ]
|
||
mode: abbrev # <--- 新增的简码模式
|
||
tags: [abc]
|
||
prefix: "_abbr_"
|
||
files:
|
||
- lua/data/abbrev.txt # 格式:zm\t怎么|在吗
|
||
]]
|
||
|
||
local M = {}
|
||
|
||
-- 性能优化:本地化常用库函数
|
||
local insert = table.insert
|
||
local concat = table.concat
|
||
local s_match = string.match
|
||
local s_gmatch = string.gmatch
|
||
local s_format = string.format
|
||
local s_byte = string.byte
|
||
local s_sub = string.sub
|
||
local s_gsub = string.gsub
|
||
local s_upper = string.upper
|
||
local open = io.open
|
||
local type = type
|
||
local tonumber = tonumber
|
||
|
||
-- 基础依赖
|
||
local function safe_require(name)
|
||
local status, lib = pcall(require, name)
|
||
if status then return lib end
|
||
return nil
|
||
end
|
||
|
||
local userdb = safe_require("lib/userdb") or safe_require("userdb")
|
||
local wanxiang = safe_require("wanxiang")
|
||
|
||
-- 重建数据库 (仅在 wanxiang 版本变更时运行)
|
||
local function rebuild(tasks, db)
|
||
if db.empty then db:empty() end
|
||
for _, task in ipairs(tasks) do
|
||
local txt_path = task.path
|
||
local prefix = task.prefix
|
||
local f = open(txt_path, "r")
|
||
if f then
|
||
for line in f:lines() do
|
||
if line ~= "" and not s_match(line, "^%s*#") then
|
||
local k, v = s_match(line, "^(%S+)%s+(.+)")
|
||
if k and v then
|
||
v = s_match(v, "^%s*(.-)%s*$")
|
||
db:update(prefix .. k, v)
|
||
end
|
||
end
|
||
end
|
||
f:close()
|
||
else
|
||
if log and log.info then log.info("super_replacer: 无法读取文件: " .. txt_path) end
|
||
end
|
||
end
|
||
return true
|
||
end
|
||
|
||
-- UTF-8 辅助
|
||
local function get_utf8_offsets(text)
|
||
local offsets = {}
|
||
local len = #text
|
||
local i = 1
|
||
while i <= len do
|
||
insert(offsets, i)
|
||
local b = s_byte(text, i)
|
||
if b < 128 then i = i + 1
|
||
elseif b < 224 then i = i + 2
|
||
elseif b < 240 then i = i + 3
|
||
else i = i + 4 end
|
||
end
|
||
insert(offsets, len + 1)
|
||
return offsets
|
||
end
|
||
|
||
-- FMM 分词转换算法
|
||
local function segment_convert(text, db, prefix, split_pat)
|
||
local offsets = get_utf8_offsets(text)
|
||
local char_count = #offsets - 1
|
||
local result_parts = {}
|
||
local i = 1
|
||
local MAX_LOOKAHEAD = 6
|
||
|
||
while i <= char_count do
|
||
local matched = false
|
||
local max_j = i + MAX_LOOKAHEAD
|
||
if max_j > char_count + 1 then max_j = char_count + 1 end
|
||
|
||
for j = max_j - 1, i + 1, -1 do
|
||
local start_byte = offsets[i]
|
||
local end_byte = offsets[j] - 1
|
||
local sub_text = s_sub(text, start_byte, end_byte)
|
||
|
||
local val = db:fetch(prefix .. sub_text)
|
||
if val then
|
||
local first_val = s_match(val, split_pat)
|
||
insert(result_parts, first_val or sub_text)
|
||
i = j - 1
|
||
matched = true
|
||
break
|
||
end
|
||
end
|
||
|
||
if not matched then
|
||
local start_byte = offsets[i]
|
||
local end_byte = offsets[i+1] - 1
|
||
local char = s_sub(text, start_byte, end_byte)
|
||
local val = db:fetch(prefix .. char)
|
||
if val then
|
||
local first_val = s_match(val, split_pat)
|
||
insert(result_parts, first_val or char)
|
||
else
|
||
insert(result_parts, char)
|
||
end
|
||
end
|
||
i = i + 1
|
||
end
|
||
return concat(result_parts)
|
||
end
|
||
|
||
-- 模块接口
|
||
|
||
function M.init(env)
|
||
local ns = env.name_space
|
||
ns = s_gsub(ns, "^%*", "")
|
||
local config = env.engine.schema.config
|
||
|
||
local user_dir = rime_api:get_user_data_dir()
|
||
local shared_dir = rime_api:get_shared_data_dir()
|
||
|
||
-- 1. 基础配置
|
||
local db_name = config:get_string(ns .. "/db_name") or "lua/replacer"
|
||
local delim = config:get_string(ns .. "/delimiter") or "|"
|
||
env.delimiter = delim
|
||
env.comment_format = config:get_string(ns .. "/comment_format") or "〔%s〕"
|
||
|
||
-- 获取全局版本号
|
||
local current_version = "v0.0.0"
|
||
if wanxiang and wanxiang.version then
|
||
current_version = wanxiang.version
|
||
end
|
||
|
||
env.chain = config:get_bool(ns .. "/chain")
|
||
if env.chain == nil then env.chain = false end
|
||
|
||
if delim == " " then env.split_pattern = "%S+"
|
||
else local esc = s_gsub(delim, "[%-%.%+%[%]%(%)%$%^%%%?%*]", "%%%1"); env.split_pattern = "([^" .. esc .. "]+)" end
|
||
|
||
-- 2. 解析 Types
|
||
env.types = {}
|
||
local tasks = {} -- 仅在需要重建时使用
|
||
|
||
local function resolve_path(relative)
|
||
if not relative then return nil end
|
||
local user_path = user_dir .. "/" .. relative
|
||
local f = open(user_path, "r")
|
||
if f then f:close(); return user_path end
|
||
local shared_path = shared_dir .. "/" .. relative
|
||
f = open(shared_path, "r")
|
||
if f then f:close(); return shared_path end
|
||
return user_path
|
||
end
|
||
|
||
local types_path = ns .. "/types"
|
||
local type_list = config:get_list(types_path)
|
||
|
||
if type_list then
|
||
for i = 0, type_list.size - 1 do
|
||
local entry_path = types_path .. "/@" .. i
|
||
|
||
-- 解析 triggers
|
||
local triggers = {}
|
||
local opts_keys = {"option", "options"}
|
||
for _, key in ipairs(opts_keys) do
|
||
local key_path = entry_path .. "/" .. key
|
||
local list = config:get_list(key_path)
|
||
if list then
|
||
for k = 0, list.size - 1 do
|
||
local val = config:get_string(key_path .. "/@" .. k)
|
||
if val then insert(triggers, val) end
|
||
end
|
||
else
|
||
-- 1. 如果配置写的是 true (bool),get_bool 返回 true,我们插入布尔值 true。
|
||
-- 2. 如果配置写的是 s2t (string),get_bool 返回 false (或nil),我们进入 else 读字符串。
|
||
if config:get_bool(key_path) == true then
|
||
insert(triggers, true)
|
||
else
|
||
local val = config:get_string(key_path)
|
||
-- 只有当它不是 "true" 字符串时才插入,防止双重解析(虽然上面的if已经拦截了)
|
||
if val and val ~= "true" then
|
||
insert(triggers, val)
|
||
end
|
||
end
|
||
end
|
||
end
|
||
|
||
-- 解析 Tags
|
||
local target_tags = nil
|
||
local tag_keys = {"tag", "tags"}
|
||
for _, key in ipairs(tag_keys) do
|
||
local key_path = entry_path .. "/" .. key
|
||
local list = config:get_list(key_path)
|
||
if list then
|
||
if not target_tags then target_tags = {} end
|
||
for k = 0, list.size - 1 do
|
||
local val = config:get_string(key_path .. "/@" .. k)
|
||
if val then target_tags[val] = true end
|
||
end
|
||
else
|
||
local val = config:get_string(key_path)
|
||
if val then
|
||
if not target_tags then target_tags = {} end
|
||
target_tags[val] = true
|
||
end
|
||
end
|
||
end
|
||
|
||
if #triggers > 0 then
|
||
local prefix = config:get_string(entry_path .. "/prefix") or ""
|
||
local mode = config:get_string(entry_path .. "/mode") or "append"
|
||
local comment_mode = config:get_string(entry_path .. "/comment_mode")
|
||
if not comment_mode then comment_mode = "comment" end
|
||
local fmm = config:get_bool(entry_path .. "/sentence")
|
||
if fmm == nil then fmm = false end
|
||
|
||
insert(env.types, {
|
||
triggers = triggers,
|
||
tags = target_tags,
|
||
prefix = prefix,
|
||
mode = mode,
|
||
comment_mode = comment_mode,
|
||
fmm = fmm
|
||
})
|
||
|
||
-- 收集文件路径 (用于重建)
|
||
local keys_to_check = {"files", "file"}
|
||
for _, key in ipairs(keys_to_check) do
|
||
local d_path = entry_path .. "/" .. key
|
||
local list = config:get_list(d_path)
|
||
if list then
|
||
for j = 0, list.size - 1 do
|
||
local p = resolve_path(config:get_string(d_path .. "/@" .. j))
|
||
if p then insert(tasks, { path = p, prefix = prefix }) end
|
||
end
|
||
else
|
||
local p = resolve_path(config:get_string(d_path))
|
||
if p then insert(tasks, { path = p, prefix = prefix }) end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
|
||
-- 3. DB 初始化
|
||
if not userdb then return end
|
||
local ok, db = pcall(function() local d = userdb.LevelDb(db_name); d:open(); return d end)
|
||
|
||
if ok and db then
|
||
env.db = db
|
||
local db_version = db:meta_fetch("_wanxiang_ver") or ""
|
||
local old_delim = db:meta_fetch("_delim")
|
||
local need_rebuild = false
|
||
if current_version ~= db_version then need_rebuild = true end
|
||
if env.delimiter ~= old_delim then need_rebuild = true end
|
||
|
||
if need_rebuild then
|
||
if rebuild(tasks, db) then
|
||
db:meta_update("_wanxiang_ver", current_version)
|
||
db:meta_update("_delim", env.delimiter)
|
||
if log and log.info then
|
||
log.info("super_replacer: 检测到版本变更 (" .. db_version .. " -> " .. current_version .. "),数据已重建。")
|
||
end
|
||
end
|
||
end
|
||
else
|
||
env.db = nil
|
||
end
|
||
end
|
||
|
||
function M.fini(env)
|
||
if env.db then env.db:close(); env.db = nil end
|
||
end
|
||
|
||
-- [Core Function] 核心逻辑
|
||
function M.func(input, env)
|
||
if not env.types or #env.types == 0 or not env.db then
|
||
for cand in input:iter() do yield(cand) end
|
||
return
|
||
end
|
||
|
||
local ctx = env.engine.context
|
||
local db = env.db
|
||
local types = env.types
|
||
local split_pat = env.split_pattern
|
||
local comment_fmt = env.comment_format
|
||
local is_chain = env.chain
|
||
local input_code = ctx.input
|
||
|
||
local input_type = "unknown"
|
||
if wanxiang and wanxiang.get_input_method_type then
|
||
input_type = wanxiang.get_input_method_type(env)
|
||
end
|
||
|
||
local seg = ctx.composition:back()
|
||
local current_seg_tags = seg and seg.tags or {}
|
||
-- [Helper] 通用处理函数
|
||
local function process_rules(cand)
|
||
local current_text = cand.text
|
||
local show_main = true
|
||
local current_main_comment = cand.comment
|
||
|
||
local pending_candidates = {}
|
||
local comments = {}
|
||
|
||
for _, t in ipairs(types) do
|
||
if t.mode ~= "abbrev" then -- 跳过 abbrev 模式
|
||
local is_active = false
|
||
for _, trigger in ipairs(t.triggers) do
|
||
if trigger == true then is_active = true; break
|
||
elseif type(trigger) == "string" and ctx:get_option(trigger) then is_active = true; break end
|
||
end
|
||
|
||
local is_tag_match = true
|
||
if t.tags then
|
||
is_tag_match = false
|
||
for req_tag, _ in pairs(t.tags) do
|
||
if current_seg_tags[req_tag] then is_tag_match = true; break end
|
||
end
|
||
end
|
||
|
||
if is_active and is_tag_match then
|
||
local query_text = is_chain and current_text or cand.text
|
||
local key = t.prefix .. query_text
|
||
local val = db:fetch(key)
|
||
|
||
if not val and t.fmm then
|
||
local seg_result = segment_convert(query_text, db, t.prefix, split_pat)
|
||
if seg_result ~= query_text then val = seg_result end
|
||
end
|
||
|
||
if val then
|
||
local mode = t.mode
|
||
local rule_comment = ""
|
||
if t.comment_mode == "text" then rule_comment = cand.text
|
||
elseif t.comment_mode == "comment" then rule_comment = cand.comment end
|
||
|
||
if mode == "comment" then
|
||
local parts = {}
|
||
for p in s_gmatch(val, split_pat) do insert(parts, p) end
|
||
insert(comments, concat(parts, " "))
|
||
|
||
elseif mode == "replace" then
|
||
if is_chain then
|
||
local first = true
|
||
for p in s_gmatch(val, split_pat) do
|
||
if first then
|
||
current_text = p
|
||
if t.comment_mode == "none" then current_main_comment = ""
|
||
elseif t.comment_mode == "text" then current_main_comment = cand.text end
|
||
first = false
|
||
else
|
||
insert(pending_candidates, { text=p, comment=rule_comment })
|
||
end
|
||
end
|
||
else
|
||
show_main = false
|
||
for p in s_gmatch(val, split_pat) do
|
||
insert(pending_candidates, { text=p, comment=rule_comment })
|
||
end
|
||
end
|
||
elseif mode == "append" then
|
||
for p in s_gmatch(val, split_pat) do
|
||
insert(pending_candidates, { text=p, comment=rule_comment })
|
||
end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
|
||
if #comments > 0 then
|
||
local comment_str = concat(comments, " ")
|
||
local fmt = s_format(comment_fmt, comment_str)
|
||
if cand.comment and cand.comment ~= "" then
|
||
cand.comment = cand.comment .. fmt
|
||
else
|
||
cand.comment = fmt
|
||
end
|
||
end
|
||
|
||
if show_main then
|
||
if is_chain and current_text ~= cand.text then
|
||
local nc = Candidate(cand.type or "kv", cand.start, cand._end, current_text, current_main_comment)
|
||
nc.preedit = cand.preedit
|
||
nc.quality = cand.quality
|
||
yield(nc)
|
||
else
|
||
yield(cand)
|
||
end
|
||
end
|
||
|
||
for _, item in ipairs(pending_candidates) do
|
||
if not (show_main and item.text == current_text) then
|
||
local nc = Candidate("derived", cand.start, cand._end, item.text, item.comment)
|
||
nc.preedit = cand.preedit
|
||
nc.quality = cand.quality
|
||
yield(nc)
|
||
end
|
||
end
|
||
end
|
||
-- [Helper] 尝试触发简码
|
||
local function try_trigger_abbrev(is_empty_override)
|
||
for _, t in ipairs(types) do
|
||
-- 只有当模式是 abbrev 且 当前不是全拼 时,才进入逻辑
|
||
if t.mode == "abbrev" and input_type ~= "pinyin" then
|
||
local is_tag_match = true
|
||
if t.tags then
|
||
is_tag_match = false
|
||
for req_tag, _ in pairs(t.tags) do
|
||
if current_seg_tags[req_tag] then is_tag_match = true; break end
|
||
end
|
||
end
|
||
|
||
if is_tag_match then
|
||
local lazy_switch = t.triggers[1]
|
||
local always_switch = t.triggers[2]
|
||
local active_mode = "none"
|
||
|
||
if always_switch then
|
||
if always_switch == true or (type(always_switch) == "string" and ctx:get_option(always_switch)) then
|
||
active_mode = "always"
|
||
end
|
||
end
|
||
if active_mode == "none" and lazy_switch then
|
||
if lazy_switch == true or (type(lazy_switch) == "string" and ctx:get_option(lazy_switch)) then
|
||
active_mode = "lazy"
|
||
end
|
||
end
|
||
|
||
local should_trigger = false
|
||
if active_mode == "always" then should_trigger = true
|
||
elseif active_mode == "lazy" and is_empty_override then should_trigger = true
|
||
end
|
||
|
||
if should_trigger then
|
||
-- ... (查库逻辑) ...
|
||
local key = t.prefix .. input_code
|
||
local val = db:fetch(key)
|
||
if not val and not s_match(input_code, "[A-Z]") then
|
||
val = db:fetch(t.prefix .. s_upper(input_code))
|
||
end
|
||
if val then
|
||
for p in s_gmatch(val, split_pat) do
|
||
local abbrev_cand = Candidate("abbrev", 0, #input_code, p, "")
|
||
abbrev_cand.quality = 9999
|
||
process_rules(abbrev_cand)
|
||
end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
end
|
||
-- [Main Loop] 主循环
|
||
local cand_count = 0
|
||
|
||
for cand in input:iter() do
|
||
cand_count = cand_count + 1
|
||
|
||
if cand_count == 1 then
|
||
-- 判断是否为"空码" (raw / english / 纯字母 / 文本等于输入)
|
||
local has_letter = string.find(cand.text, "[a-zA-Z]")
|
||
local is_empty = (cand.type == "raw"
|
||
or cand.type == "english"
|
||
or cand.text == input_code
|
||
or has_letter) -- 只要包含字母,就算作空
|
||
try_trigger_abbrev(is_empty)
|
||
end
|
||
|
||
-- 原候选逻辑
|
||
process_rules(cand)
|
||
end
|
||
if cand_count == 0 then
|
||
try_trigger_abbrev(true)
|
||
end
|
||
end
|
||
return M |