Files
rime_wanxiang/lua/auto_phrase.lua
2026-01-21 17:43:36 +08:00

232 lines
6.7 KiB
Lua
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- @amzxyz https://github.com/amzxyz/rime_wanxiang
-- 自动造词
local AP = {}
-- 注释缓存text -> comment只给中文造词用
local comment_cache = {}
-- 工具是否纯英文ASCII 且至少 1 个字母)
local function is_ascii_word(text)
if not text or text == "" then
return false
end
local has_alpha = false
for i = 1, #text do
local b = text:byte(i)
if b > 127 then
return false
end
if (b >= 65 and b <= 90) or (b >= 97 and b <= 122) then
has_alpha = true
end
end
return has_alpha
end
-- 判断字符是否为汉字(原逻辑)
function AP.is_chinese_only(text)
local non_chinese_pattern = "[%w%p]"
if not text or text == "" then
return false
end
if text:match(non_chinese_pattern) then
return false
end
for _, cp in utf8.codes(text) do
-- 常用汉字区 + 扩展 A/B/C/D/E/F/G
if not (
(cp >= 0x4E00 and cp <= 0x9FFF) or -- CJK Unified Ideographs
(cp >= 0x3400 and cp <= 0x4DBF) or -- CJK Ext-A
(cp >= 0x20000 and cp <= 0x2EBEF) -- CJK Ext-B~G
) then
return false
end
end
return true
end
function AP.init(env)
local config = env.engine.schema.config
local ctx = env.engine.context
-- 中文自动造词的开关(只控制 add_user_dict
local enable_auto_phrase =
config:get_bool("add_user_dict/enable_auto_phrase") or false
local enable_user_dict =
config:get_bool("add_user_dict/enable_user_dict") or false
-- 中文add_user_dict受 add_* 开关影响)
if enable_auto_phrase and enable_user_dict then
env.memory = Memory(env.engine, env.engine.schema, "add_user_dict")
else
env.memory = nil
end
-- 英文enuser不受 add_* 开关影响,始终尝试启用)
env.en_memory = Memory(env.engine, env.engine.schema, "wanxiang_mixedcode")
-- 只要有一边需要,就挂上 commit/delete 通知
if env.en_memory or env.memory then
env._commit_conn = ctx.commit_notifier:connect(function(c)
AP.commit_handler(c, env)
end)
env._delete_conn = ctx.delete_notifier:connect(function(_)
comment_cache = {}
end)
end
end
function AP.fini(env)
if env._commit_conn then
env._commit_conn:disconnect()
env._commit_conn = nil
end
if env._delete_conn then
env._delete_conn:disconnect()
env._delete_conn = nil
end
if env.memory then
env.memory:disconnect()
env.memory = nil
end
if env.en_memory then
env.en_memory:disconnect()
env.en_memory = nil
end
end
function AP.save_comment_cache(cand)
local comment = cand.comment
local comment_text = cand.text
if comment_text and comment_text ~= "" and comment and comment ~= "" then
comment_cache[comment_text] = comment
end
end
-- 入口lua_filter
function AP.func(input, env)
local config = env.engine.schema.config
local context = env.engine.context
local use_comment_cache = env.memory ~= nil -- 只有中文造词才需要缓存注释
for cand in input:iter() do
local genuine_cand = cand:get_genuine()
local preedit = genuine_cand.preedit or ""
local initial_comment = genuine_cand.comment
if use_comment_cache then
AP.save_comment_cache(cand)
end
yield(cand)
end
end
-- 造词(原逻辑 + 新增 '\' 英文造词)
function AP.commit_handler(ctx, env)
if not ctx or not ctx.composition then
comment_cache = {}
return
end
local segments = ctx.composition:toSegmentation():get_segments()
local segments_count = #segments
local commit_text = ctx:get_commit_text() or ""
local raw_input = ctx.input or ""
---------------------------------------------------
-- ① 英文 + '\' 造词 —— 始终启用,只依赖 env.en_memory
-- 条件:
-- - raw_input 末尾为 '\'
-- - commit_text 为“ASCII 且至少 1 字母”的英文
-- 行为:
-- - text = commit_text
-- - custom_code = 编码去掉末尾 '\' + 空格
---------------------------------------------------
if raw_input ~= "" and raw_input:sub(-1) == "\\" and is_ascii_word(commit_text) then
local code_body = raw_input:gsub("\\+$", "") -- 去掉末尾连续 '\'
code_body = code_body:gsub("%s+$", "") -- 去掉尾部空白
if code_body ~= "" and env.en_memory then
local entry = DictEntry()
entry.text = commit_text -- 上屏英文本身
entry.weight = 1
entry.custom_code = code_body .. " " -- 真实编码(无 '\') + 空格
env.en_memory:update_userdict(entry, 1, "")
-- log.info(string.format("[auto_phrase] EN 造词:[%s], code=[%s]", entry.text, entry.custom_code))
end
comment_cache = {}
return
end
---------------------------------------------------
-- ② 中文自动造词:只在 env.memory 存在时工作
---------------------------------------------------
if not env.memory then
-- 中文造词功能被关掉时,直接跳过这一段
comment_cache = {}
return
end
-- 检查是否符合最小造词单元要求
if segments_count <= 1 or utf8.len(commit_text) <= 1 then
comment_cache = {}
return
end
-- 检查是否符合造词内容要求
if not AP.is_chinese_only(commit_text) or comment_cache[commit_text] then
comment_cache = {}
return
end
local preedits_table = {}
local config = env.engine.schema.config
local delimiter = config:get_string("speller/delimiter") or " '"
local escaped_delimiter =
utf8.char(utf8.codepoint(delimiter)):gsub("(%W)", "%%%1")
for i = 1, segments_count do
local seg = segments[i]
local cand = seg:get_selected_candidate()
if cand then
local cand_text = cand.text
local preedit = comment_cache[cand_text]
if preedit and preedit ~= "" then
for part in preedit:gmatch("[^" .. escaped_delimiter .. "]+") do
table.insert(preedits_table, part)
end
end
end
end
if #preedits_table == 0 then
comment_cache = {}
return
end
local dictEntry = DictEntry()
dictEntry.text = commit_text
dictEntry.weight = 1
dictEntry.custom_code = table.concat(preedits_table, " ") .. " "
env.memory:update_userdict(dictEntry, 1, "")
comment_cache = {}
end
return AP