Files
rime_wanxiang/lua/auto_phrase.lua
2026-01-21 17:44:49 +08:00

240 lines
7.0 KiB
Lua
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- @amzxyz https://github.com/amzxyz/rime_wanxiang
-- Automatic phrase learning.
-- Module table; every public entry point below is attached to it.
local AP = {}
-- Comment cache: candidate text -> candidate comment. It is filled while
-- candidates are listed and is only consulted when learning Chinese phrases.
local comment_cache = {}
-- Utility: reports whether `text` is a pure-ASCII word.
-- Returns true only when every byte is <= 127 and at least one byte is an
-- ASCII letter (A-Z or a-z). nil and the empty string yield false.
local function is_ascii_word(text)
  if not text or text == "" then
    return false
  end
  -- Any byte in 128..255 means the text is not plain ASCII.
  if text:find("[\128-\255]") then
    return false
  end
  -- Digits and punctuation alone are not enough: one letter is required.
  return text:find("[A-Za-z]") ~= nil
end
--- Reports whether `text` consists solely of CJK unified ideographs.
-- Accepted ranges: the basic block (U+4E00-U+9FFF), Extension A
-- (U+3400-U+4DBF), Extensions B through F (U+20000-U+2EBEF) and
-- Extension G (U+30000-U+3134F).
-- @param text string|nil  text to inspect
-- @return boolean  false for nil, the empty string, malformed UTF-8, or any
--                  character outside the ranges above; true otherwise
function AP.is_chinese_only(text)
  if not text or text == "" then
    return false
  end
  -- Quick reject: a single alphanumeric or punctuation character is enough.
  if text:match("[%w%p]") then
    return false
  end
  -- utf8.codes raises on malformed input; utf8.len reports it by returning
  -- nil, so validate first and answer "not Chinese" instead of erroring.
  if not utf8.len(text) then
    return false
  end
  for _, cp in utf8.codes(text) do
    local is_ideograph = (cp >= 0x4E00 and cp <= 0x9FFF)  -- CJK Unified Ideographs
      or (cp >= 0x3400 and cp <= 0x4DBF)                  -- Extension A
      or (cp >= 0x20000 and cp <= 0x2EBEF)                -- Extensions B-F
      or (cp >= 0x30000 and cp <= 0x3134F)                -- Extension G
    if not is_ideograph then
      return false
    end
  end
  return true
end
--- Sets up the dictionaries and notifier connections for this component.
-- The Chinese memory ("add_user_dict") is opened only when both
-- add_user_dict/enable_auto_phrase and add_user_dict/enable_user_dict are
-- true in the schema config; otherwise env.memory is nil.
-- The English memory ("wanxiang_english") is always requested, regardless of
-- those switches.
-- @param env  component environment; receives memory, en_memory,
--             _commit_conn and _delete_conn
function AP.init(env)
  local engine = env.engine
  local schema = engine.schema
  local settings = schema.config
  local context = engine.context

  local auto_phrase_on =
    settings:get_bool("add_user_dict/enable_auto_phrase") or false
  local user_dict_on =
    settings:get_bool("add_user_dict/enable_user_dict") or false

  -- Chinese side: gated by the two add_user_dict switches.
  local chinese_memory = nil
  if auto_phrase_on and user_dict_on then
    chinese_memory = Memory(engine, schema, "add_user_dict")
  end
  env.memory = chinese_memory

  -- English side: independent of the add_user_dict switches.
  env.en_memory = Memory(engine, schema, "wanxiang_english")

  -- Nothing to learn into, so no notifications are needed.
  if not (env.memory or env.en_memory) then
    return
  end

  local function on_commit(c)
    AP.commit_handler(c, env)
  end
  local function on_delete(_)
    comment_cache = {}
  end

  env._commit_conn = context.commit_notifier:connect(on_commit)
  env._delete_conn = context.delete_notifier:connect(on_delete)
end
--- Releases everything AP.init stored on `env`.
-- Each handle that is present is disconnected and its field reset to nil,
-- in this order: commit connection, delete connection, Chinese memory,
-- English memory.
-- @param env  component environment previously passed to AP.init
function AP.fini(env)
  local handle_fields = { "_commit_conn", "_delete_conn", "memory", "en_memory" }
  for _, field in ipairs(handle_fields) do
    local handle = env[field]
    if handle then
      handle:disconnect()
      env[field] = nil
    end
  end
end
--- Records a candidate's comment in the module-level comment cache.
-- The entry is keyed by `cand.text` and holds `genuine.comment`; nothing is
-- stored when either value is nil or empty.
-- @param cand     candidate whose text becomes the cache key
-- @param genuine  candidate whose comment becomes the cached value
function AP.save_comment_cache(cand, genuine)
  local key, value = cand.text, genuine.comment
  if not key or key == "" then
    return
  end
  if not value or value == "" then
    return
  end
  comment_cache[key] = value
end
--- Entry point: forwards every candidate unchanged.
-- When the Chinese memory is active (env.memory is set), each candidate's
-- text and its genuine candidate's comment are recorded through
-- AP.save_comment_cache before the candidate is yielded.
-- @param input  candidate source; iterated with input:iter()
-- @param env    component environment prepared by AP.init
function AP.func(input, env)
  -- Comments are only needed for Chinese phrase learning.
  local use_comment_cache = env.memory ~= nil
  for cand in input:iter() do
    if use_comment_cache then
      -- The genuine candidate is only looked up when its comment is wanted.
      AP.save_comment_cache(cand, cand:get_genuine())
    end
    yield(cand)
  end
end
-- Phrase learning.

-- Stores the committed English word in env.en_memory under the code typed
-- before the trailing backslash(es). Does nothing when that code is empty or
-- the English memory is unavailable.
local function learn_english_word(env, raw_input, commit_text)
  local code_body = raw_input:gsub("\\+$", "") -- drop the trailing run of '\'
  code_body = code_body:gsub("%s+$", "")       -- drop trailing whitespace
  if code_body == "" or not env.en_memory then
    return
  end

  local function save_entry(code)
    local entry = DictEntry()
    entry.text = commit_text        -- the committed English text itself
    entry.weight = 1
    entry.custom_code = code .. " " -- code followed by a space
    env.en_memory:update_userdict(entry, 1, "")
  end

  -- Always store the code exactly as typed.
  save_entry(code_body)
  -- If the code contains upper-case letters, store an all-lower-case copy too.
  local lower_code = code_body:lower()
  if lower_code ~= code_body then
    save_entry(lower_code)
  end
end

-- Stores the committed Chinese text in env.memory, using as its code the
-- cached comments of the selected candidates of every segment. Does nothing
-- unless all preconditions checked below hold.
local function learn_chinese_phrase(ctx, env, commit_text)
  if not env.memory then
    return
  end

  local segments = ctx.composition:toSegmentation():get_segments()
  local segments_count = #segments
  -- utf8.len returns nil for malformed UTF-8; treat that as "nothing to learn"
  -- rather than comparing nil with a number.
  local char_count = utf8.len(commit_text)
  -- Minimum unit: more than one segment and more than one character.
  if segments_count <= 1 or not char_count or char_count <= 1 then
    return
  end
  -- Content rule: ideographs only, and not already offered as a candidate.
  if not AP.is_chinese_only(commit_text) or comment_cache[commit_text] then
    return
  end

  local delimiter = env.engine.schema.config:get_string("speller/delimiter")
  -- An empty delimiter would produce the malformed pattern "[^]+", so fall
  -- back to the default exactly as for a missing setting.
  if not delimiter or delimiter == "" then
    delimiter = " '"
  end
  -- Only the first character of the delimiter setting is used for splitting;
  -- non-alphanumeric bytes are %-escaped so they are literal inside the class.
  local escaped_delimiter =
    utf8.char(utf8.codepoint(delimiter)):gsub("(%W)", "%%%1")
  local part_pattern = "[^" .. escaped_delimiter .. "]+"

  local code_parts = {}
  for i = 1, segments_count do
    local cand = segments[i]:get_selected_candidate()
    -- A missing candidate, or one without a cached comment, means the code
    -- cannot be reconstructed: give up on the whole phrase.
    local cached = cand and comment_cache[cand.text]
    if not cached then
      return
    end
    for part in cached:gmatch(part_pattern) do
      code_parts[#code_parts + 1] = part
    end
  end

  -- Second check: no code pieces were recovered, so there is nothing to store.
  if #code_parts == 0 then
    return
  end

  local entry = DictEntry()
  entry.text = commit_text
  entry.weight = 1
  entry.custom_code = table.concat(code_parts, " ") .. " "
  env.memory:update_userdict(entry, 1, "")
end

--- Commit notification handler: learns the committed text where applicable.
-- 1. English: when the raw input ends with '\' and the committed text is an
--    ASCII word, the word is stored in env.en_memory; Chinese learning is
--    then skipped, even if nothing could be stored.
-- 2. Chinese: otherwise, when env.memory exists, a multi-segment
--    ideograph-only commit is stored in env.memory.
-- The comment cache is emptied after every commit, whatever the outcome.
-- @param ctx  context passed by the commit notifier
-- @param env  component environment prepared by AP.init
function AP.commit_handler(ctx, env)
  if ctx and ctx.composition then
    local commit_text = ctx:get_commit_text() or ""
    local raw_input = ctx.input or ""
    -- sub(-1) of an empty string is "", so no separate emptiness test is needed.
    if raw_input:sub(-1) == "\\" and is_ascii_word(commit_text) then
      learn_english_word(env, raw_input, commit_text)
    else
      learn_chinese_phrase(ctx, env, commit_text)
    end
  end
  comment_cache = {}
end
-- Hand the module table back to whoever loads this chunk.
return AP