fix(super_lookup): 采用全新的词组匹配算法,支持有序模糊,词组匹配不再需要复杂的设计对应到某个字就像单字那样按序输入即可,pro支持词库辅助码数据,两类数据可方案中配置启用类型,支持两类顺序优先进而候选排序优先

This commit is contained in:
amzxyz
2025-12-15 18:55:26 +08:00
parent c1b162e512
commit 8e2f103b1b
4 changed files with 261 additions and 130 deletions

View File

@@ -99,8 +99,9 @@ engine:
- script_translator@add_user_dict #自造词之制造词汇入口
filters:
- reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜放在super_comment前面进一步被超级注释处理以获得拼音编码的提示
- lua_filter@*auto_phrase #无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词
- lua_filter@*super_comment_preedit #超级注释模块、超级preedit支持错词提示、辅助码显示部件组字读音注释有声调、无声调全拼编码的转换支持个性化配置和关闭相应的功能详情搜索super_comment_preedit进行详细配置
- lua_filter@*auto_phrase #comment前无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词
- lua_filter@*super_lookup #comment前字词输入中反查辅助筛选
- lua_filter@*super_comment_preedit #OpenCC前超级注释模块、超级preedit支持错词提示、辅助码显示部件组字读音注释有声调、无声调全拼编码的转换支持个性化配置和关闭相应的功能详情搜索super_comment_preedit进行详细配置
- simplifier@emoji #Emoji滤镜
- simplifier@s2t #简繁切换通繁
- simplifier@s2tw #简繁切换台繁
@@ -108,7 +109,6 @@ engine:
- simplifier@chinese_english #中英翻译滤镜
- lua_filter@*super_sequence*F #手动排序,对高亮候选 ctrl+j左移动 ctrl+k 右移动 ctrl+0 移除位移
- lua_filter@*super_filter #功能太多详见Lua文件
- lua_filter@*super_lookup #字词输入中反查辅助筛选
- uniquifier #去重
grammar:
@@ -447,6 +447,11 @@ wanxiang_lookup: #设置归属于super_lookup.lua
tags: [ abc ] # 检索当前tag的候选
key: "`" # 输入中反查引导符,要添加到 speller/alphabet
lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了
data_source: ['comment', 'db']
#data_source: ['db', 'comment']
#用来配置你的反查数据来自哪里:注释(comment)即词库携带的辅助编码,数据库(db)即反查方案携带的两分、多分、笔画
#谁在前,哪种权重就高,比如 身db dian即dm深comment dm 就能让相应的候选排在前面
#但注释在前有时候与笔画的感知会有一定冲突,可针对习惯使用一类数据库即可
# 处理符合特定规则的输入码,如网址、反查
recognizer:
@@ -571,6 +576,7 @@ key_binder:
- { when: composing, accept: "Control+g", toggle: charset_filter }
#通过快捷键Control+q切换中文、英文、混合模式
- { when: has_menu, accept: "Control+q", toggle: zh_only }
- { when: composing, accept: "Control+q", toggle: zh_only }
# 使用 tab 在不同音节之间跳转
- { when: has_menu, accept: "Tab", send: "Control+Right" }
- { when: composing, accept: "Tab", send: "Control+Right" }

View File

@@ -1,23 +1,24 @@
--@amzxyz https://github.com/amzxyz/rime_wanxiang
--wanxiang_lookup: #设置归属于super_lookup.lua
--tags: [ abc ] # 检索当前tag的候选
--key: "`" # 输入中反查引导符,要添加到 speller/alphabet
--lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了
--key: "`" # 输入中反查引导符
--lookup: [ wanxiang_reverse ] #反查滤镜数据库
--data_source: [ comment, db ] # 优先级:写在前面优先
------------------------------------------------------------
-- 工具函数
------------------------------------------------------------
-- Utility: escape Lua-pattern magic characters so the text can be embedded in
-- a pattern and matched literally.
-- @param s string to escape (may be nil/false)
-- @return the escaped string, or '' when `s` is nil or false
local function alt_lua_punc(s)
  if not s then
    return ''
  end
  -- gsub also returns a replacement count; binding to a single local drops it.
  local escaped = s:gsub('([%.%+%-%*%?%[%]%^%$%(%)%%])', '%%%1')
  return escaped
end
-- True when `s` is non-empty and consists solely of uppercase letters (%u).
local function is_all_upper(s)
  if s:match('^%u+$') then
    return true
  end
  return false
end
-- True when `s` is non-empty and consists solely of lowercase letters (%l).
local function is_all_lower(s)
  if s:match('^%l+$') then
    return true
  end
  return false
end
-- Count the UTF-8 characters in `s`.
-- When a `utf8` library exposing `len` is present, its result is returned
-- unchanged. Otherwise the count is the number of bytes outside \128-\193,
-- i.e. every byte that is not a UTF-8 continuation byte.
-- @param s string to measure
-- @return character count (whatever utf8.len yields when it is available)
local function get_utf8_len(s)
  local native_len = utf8 and utf8.len
  if native_len then
    return native_len(s)
  end
  -- Fallback: gsub's second result is the number of matched (lead/ASCII) bytes.
  return select(2, string.gsub(s, "[^\128-\193]", ""))
end
------------------------------------------------------------
-- 规则加载
------------------------------------------------------------
local function parse_and_separate_rules(schema_id)
if not schema_id or #schema_id == 0 then return nil, nil end
local schema = Schema(schema_id)
@@ -53,41 +54,31 @@ local function get_schema_rules(env)
return main_rules or {}, xlit_rules or {}
end
------------------------------------------------------------
-- 核心逻辑
------------------------------------------------------------
-- 【DB】构建编码
local function expand_code_variant(main_projection, xlit_projection, part)
local out, seen = {}, {}
local function add(s)
if s and #s > 0 and not seen[s] then
seen[s] = true
out[#out + 1] = s
local function add(s) if s and #s > 0 and not seen[s] then seen[s] = true out[#out + 1] = s end end
add(part)
if main_projection then local p = main_projection:apply(part, true) if p and #p > 0 then add(p) end end
local base = {}
for i = 1, #out do local elem = out[i] if elem:match('^%l+$') then base[#base + 1] = elem end end
-- 提取 1,3 位生成构造码
for _, s in ipairs(base) do
-- 安全检查:确保长度足够
if #s >= 3 and #s <= 4 and s:match('^%l+$') then
add(s:sub(1,1) .. s:sub(3,3))
end
end
add(part)
if main_projection then
local p = main_projection:apply(part, true)
if p and #p > 0 then add(p) end
end
local base = {}
for i = 1, #out do
local elem = out[i]
if is_all_lower(elem) then base[#base + 1] = elem end
end
for _, s in ipairs(base) do
if #s == 4 and is_all_lower(s) then
add(s:sub(1,1) .. s:sub(3,3))
end
end
if is_all_upper(part) and xlit_projection then
local xlit_result = xlit_projection:apply(part, true)
if xlit_result and #xlit_result > 0 then add(xlit_result) end
if part:match('^%u+$') and xlit_projection then
local xlit_result = xlit_projection:apply(part, true)
if xlit_result and #xlit_result > 0 then add(xlit_result) end
end
return out
end
-- 【DB】查表
local function build_reverse_group(main_projection, xlit_projection, db_table, text)
local group, seen = {}, {}
for _, db in ipairs(db_table) do
@@ -96,10 +87,10 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t
for part in code:gmatch('%S+') do
local variants = expand_code_variant(main_projection, xlit_projection, part)
for _, v in ipairs(variants) do
if not seen[v] then
seen[v] = true
group[#group + 1] = v
end
if not seen[v] then
seen[v] = true
group[#group + 1] = v
end
end
end
end
@@ -107,41 +98,130 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t
return group
end
-- 单字匹配 (Strict Prefix)
local function group_match(group, fuma)
if not group then return false end
for i = 1, #group do
if tostring(group[i]):sub(1, #fuma) == fuma then return true end
for i = 1, #group do
if string.sub(group[i], 1, #fuma) == fuma then return true end
end
return false
end
------------------------------------------------------------
-- 过滤器主体
------------------------------------------------------------
-- Recursive ordered-fuzzy matching engine, memoized on an integer state key.
--
-- Walks `codes_sequence` position by position. At each position every candidate
-- code greedily consumes as many upcoming bytes of `input_str` as appear, in
-- order, inside that code (a subsequence scan; consuming zero bytes is allowed).
-- The match succeeds once all of `input_str` has been consumed.
--
-- @param codes_sequence array; element i is the list of code strings for position i
-- @param idx            current position in `codes_sequence` (1-based)
-- @param input_str      the auxiliary code typed by the user
-- @param input_idx      index of the next unconsumed byte of `input_str` (1-based)
-- @param memo           table caching results per (idx, input_idx); pass a fresh
--                       table for every distinct (codes_sequence, input_str) pair
-- @param is_phrase_mode when truthy, codes longer than 3 bytes are ignored
-- @return true if the remaining input can be fully consumed, false otherwise
local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, memo, is_phrase_mode)
  local input_len = #input_str
  if input_idx > input_len then return true end
  if idx > #codes_sequence then return false end

  -- Stored states always have input_idx <= input_len, so a stride of
  -- input_len + 1 gives every (idx, input_idx) pair a unique key. The former
  -- fixed stride of 1000 aliased states once the input reached 1000 bytes.
  local state_key = idx * (input_len + 1) + input_idx
  local cached = memo[state_key]
  if cached ~= nil then return cached end

  local codes = codes_sequence[idx]
  local result = false
  if codes then
    for _, code in ipairs(codes) do
      local code_len = #code
      -- Phrase mode: skip full codes longer than 3 bytes.
      if not (is_phrase_mode and code_len > 3) then
        local i_curr = input_idx
        local c_curr = 1
        -- Greedy in-order scan: advance through the code, consuming an input
        -- byte whenever it equals the current code byte.
        while i_curr <= input_len and c_curr <= code_len do
          if input_str:byte(i_curr) == code:byte(c_curr) then i_curr = i_curr + 1 end
          c_curr = c_curr + 1
        end
        if match_fuzzy_recursive(codes_sequence, idx + 1, input_str, i_curr, memo, is_phrase_mode) then
          result = true
          break
        end
      end
    end
  else
    -- No code list at this position: move on without consuming input.
    if match_fuzzy_recursive(codes_sequence, idx + 1, input_str, input_idx, memo, is_phrase_mode) then
      result = true
    end
  end

  memo[state_key] = result
  return result
end
-- Parse a candidate comment into per-character code lists (strict validation,
-- whitespace-trimmed codes).
--
-- The comment is split into segments with `pattern` (or taken whole when
-- `target_len` is 1). Each segment must contain a ';'; the text after the
-- first ';' is a comma-separated list of codes.
--
-- @param comment    comment text; nil or "" yields nil
-- @param pattern    Lua pattern passed to gmatch to extract the segments
-- @param target_len expected number of segments
-- @return array where element i is the list of non-empty trimmed codes of
--         segment i, or nil when the segment count differs from `target_len`
--         or any segment lacks a ';'
local function parse_comment_codes(comment, pattern, target_len)
  if not comment or comment == "" then
    return nil
  end

  local segments
  if target_len == 1 then
    -- A single character: the whole comment is its one segment.
    segments = { comment }
  else
    segments = {}
    for piece in comment:gmatch(pattern) do
      segments[#segments + 1] = piece
    end
    if #segments ~= target_len then
      return nil
    end
  end

  local parsed = {}
  for index = 1, #segments do
    local segment = segments[index]
    local _, sep_end = segment:find(";")
    if not sep_end then
      return nil
    end
    local codes = {}
    for raw in segment:sub(sep_end + 1):gmatch("[^,]+") do
      -- Strip leading and trailing whitespace in one capture.
      local code = raw:match("^%s*(.-)%s*$")
      if #code > 0 then
        codes[#codes + 1] = code
      end
    end
    parsed[index] = codes
  end
  return parsed
end
local f = {}
function f.init(env)
local config = env.engine.schema.config
env.if_reverse_lookup = false
env.db_table = nil
local db = config:get_list("wanxiang_lookup/lookup")
if db and db.size > 0 then
env.db_table = {}
for i = 0, db.size - 1 do
table.insert(env.db_table, ReverseLookup(db:get_value_at(i).value))
local sources_list = config:get_list('wanxiang_lookup/data_source')
env.data_sources = {}
env.has_comment = false
env.has_db = false
if sources_list and sources_list.size > 0 then
for i = 0, sources_list.size - 1 do
local s = sources_list:get_value_at(i).value
table.insert(env.data_sources, s)
if s == 'comment' then env.has_comment = true end
if s == 'db' then env.has_db = true end
end
env.if_reverse_lookup = true
else
return
env.data_sources = { 'comment', 'db' }
env.has_comment = true
env.has_db = true
end
local main_rules, xlit_rules = get_schema_rules(env)
env.main_projection = (type(main_rules) == 'table' and #main_rules > 0) and Projection() or nil
if env.main_projection then env.main_projection:load(main_rules) end
env.xlit_projection = (type(xlit_rules) == 'table' and #xlit_rules > 0) and Projection() or nil
if env.xlit_projection then env.xlit_projection:load(xlit_rules) end
env.db_table = nil
if env.has_db then
local db_list = config:get_list("wanxiang_lookup/lookup")
if db_list and db_list.size > 0 then
env.db_table = {}
for i = 0, db_list.size - 1 do
table.insert(env.db_table, ReverseLookup(db_list:get_value_at(i).value))
end
local main_rules, xlit_rules = get_schema_rules(env)
env.main_projection = (type(main_rules) == 'table' and #main_rules > 0) and Projection() or nil
if env.main_projection then env.main_projection:load(main_rules) end
env.xlit_projection = (type(xlit_rules) == 'table' and #xlit_rules > 0) and Projection() or nil
if env.xlit_projection then env.xlit_projection:load(xlit_rules) end
else
env.has_db = false
end
end
if env.has_comment then
local delimiter = config:get_string('speller/delimiter') or " '"
if delimiter == "" then delimiter = " " end
-- 确保 " '" 中的所有字符都被加入排除列表 [^% %']+
env.comment_split_ptrn = "[^" .. alt_lua_punc(delimiter) .. "]+"
end
env.search_key_str = config:get_string('wanxiang_lookup/key') or '`'
env.search_key_alt = alt_lua_punc(env.search_key_str)
@@ -163,7 +243,7 @@ function f.init(env)
local preedit = ctx:get_preedit()
local no_search_string = input:match('^(.-)' .. env.search_key_alt)
local edit = preedit.text:match('^(.-)' .. env.search_key_alt)
if edit and edit:match('[%w;]') then
if edit and edit:match('[%w/]') then
ctx.input = no_search_string .. env.search_key_str
else
ctx.input = no_search_string
@@ -172,101 +252,144 @@ function f.init(env)
end
end)
-- 【安全缓存】初始化
env._global_group_cache = {}
env.cache_size = 0 -- 计数器
env._global_db_cache = {}
env._global_comment_cache = {}
env.cache_size = 0
end
function f.func(input, env)
if not env.if_reverse_lookup then
if #env.data_sources == 0 then
for cand in input:iter() do yield(cand) end
return
end
local ctx_input = env.engine.context.input
local s_start, s_end = ctx_input:find(env.search_key_alt, 1, false)
if not s_start then
for cand in input:iter() do yield(cand) end
return
end
if not s_start then for cand in input:iter() do yield(cand) end return end
local fuma = ctx_input:sub(s_end + 1)
-- 【惰性检查】无辅码,直接显示,不查库
if #fuma == 0 then
for cand in input:iter() do yield(cand) end
return
end
if #fuma == 0 then for cand in input:iter() do yield(cand) end return end
local fuma_segments = {}
for segment in fuma:gmatch('[^' .. env.search_key_alt .. ']+') do
table.insert(fuma_segments, string.lower(segment))
end
local if_single_char_first = env.engine.context:get_option('char_priority')
local buckets = {}
local max_len = 0
for i = 1, #env.data_sources do buckets[i] = {} end
local long_word_cands = {}
local cache = env._global_group_cache
-- 如果缓存条目超过 3000清空重来
-- 3000个字足以覆盖99.9%的日常输入,且仅占用极小内存
if env.cache_size > 3000 then
env._global_group_cache = {}
-- GC
if env.cache_size > 2000 then
env._global_db_cache = {}
env._global_comment_cache = {}
env.cache_size = 0
cache = env._global_group_cache -- 更新引用
end
local db_cache = env._global_db_cache
local comment_cache = env._global_comment_cache
for cand in input:iter() do
if cand.type == 'sentence' then goto skip end
local cand_text = cand.text
local cand_len = get_utf8_len(cand_text)
if not cand_len or cand_len == 0 then goto skip end
-- 西文跳过
local b = string.byte(cand_text, 1)
if b and b < 128 then goto skip end
local cand_len = utf8.len(cand_text)
local raw_data = {}
local characters = {}
local pos = 1
for i = 1, cand_len do
local next_pos = utf8.offset(cand_text, i + 1)
local char_str = cand_text:sub(pos, next_pos and next_pos - 1)
characters[i] = char_str
pos = next_pos
-- 【全局缓存】带计数
if not cache[char_str] then
cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str)
env.cache_size = env.cache_size + 1 -- 增加计数
end
end
local ok = true
if #fuma_segments == 1 and cand_len == 1 then
ok = group_match(cache[characters[1]], fuma_segments[1])
elseif #fuma_segments > 0 and cand_len > 1 then
local match_count = (#fuma_segments < cand_len) and #fuma_segments or cand_len
for i = 1, match_count do
if not group_match(cache[characters[i]], fuma_segments[i]) then
ok = false
break
-- 1. Comment Data
if env.has_comment then
local genuine = cand:get_genuine()
local comment_text = genuine and genuine.comment or ""
if comment_text ~= "" then
local cache_key = cand_text .. "_" .. comment_text
if not comment_cache[cache_key] then
comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len) or false
env.cache_size = env.cache_size + 1
end
if comment_cache[cache_key] then
raw_data.comment = comment_cache[cache_key]
end
end
else
if cand_len < #fuma_segments then ok = false end
end
if ok then
-- 2. DB Data
if env.has_db then
raw_data.db = {}
local pos = 1
local i = 0
for _, code_point in utf8.codes(cand_text) do
i = i + 1
local char_str = utf8.char(code_point)
if not db_cache[char_str] then
db_cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str)
env.cache_size = env.cache_size + 1
end
raw_data.db[i] = db_cache[char_str] or {}
end
end
-- 3. Match
local matched_idx = nil
for i, source_type in ipairs(env.data_sources) do
local codes_seq = raw_data[source_type]
if codes_seq then
local is_match = false
if source_type == 'comment' then
if cand_len == 1 then
if group_match(codes_seq[1], fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, false) then is_match = true end
end
elseif source_type == 'db' then
if cand_len == 1 then
if group_match(codes_seq[1], fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, true) then is_match = true end
end
end
if is_match then
matched_idx = i
break
end
end
end
if matched_idx then
if if_single_char_first and cand_len > 1 then
table.insert(long_word_cands, cand)
else
yield(cand)
if not buckets[matched_idx][cand_len] then buckets[matched_idx][cand_len] = {} end
table.insert(buckets[matched_idx][cand_len], cand)
if cand_len > max_len then max_len = cand_len end
end
end
::skip::
end
-- 输出 (Global Length Priority)
if if_single_char_first then
for i = 1, #env.data_sources do
if buckets[i][1] then for _, c in ipairs(buckets[i][1]) do yield(c) end end
end
for l = max_len, 2, -1 do
for i = 1, #env.data_sources do
if buckets[i][l] then for _, c in ipairs(buckets[i][l]) do yield(c) end end
end
end
else
for l = max_len, 1, -1 do
for i = 1, #env.data_sources do
if buckets[i][l] then for _, c in ipairs(buckets[i][l]) do yield(c) end end
end
end
end
for _, c in ipairs(long_word_cands) do yield(c) end
end
@@ -276,9 +399,10 @@ function f.tags_match(seg, env)
end
-- Filter teardown: release the notifier connection and drop cached state.
-- @param env filter environment table
function f.fini(env)
  -- Disconnect once whenever a notifier connection exists. The earlier
  -- `env.if_reverse_lookup and env.notifier` guard was a strict subset of this
  -- check, so keeping both only produced a redundant second disconnect.
  if env.notifier then env.notifier:disconnect() end
  -- Drop references to the reverse-lookup handles and every cache table so the
  -- explicit collection below can reclaim them.
  env.db_table = nil
  env._global_group_cache = nil
  env._global_db_cache = nil
  env._global_comment_cache = nil
  collectgarbage('collect')
end

View File

@@ -95,8 +95,9 @@ engine:
- script_translator@user_dict_set #使用自造词
filters:
- reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜放在super_comment前面进一步被超级注释处理以获得拼音编码的提示
- lua_filter@*auto_phrase #无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词
- lua_filter@*super_comment_preedit #超级注释模块、超级preedit支持错词提示、辅助码显示部件组字读音注释有声调、无声调全拼编码的转换支持个性化配置和关闭相应的功能详情搜索super_comment_preedit进行详细配置
- lua_filter@*auto_phrase #comment前无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词
- lua_filter@*super_lookup #comment前字词输入中反查辅助筛选
- lua_filter@*super_comment_preedit #OpenCC前超级注释模块、超级preedit支持错词提示、辅助码显示部件组字读音注释有声调、无声调全拼编码的转换支持个性化配置和关闭相应的功能详情搜索super_comment_preedit进行详细配置
- simplifier@emoji #Emoji滤镜
- simplifier@s2t #简繁切换通繁
- simplifier@s2tw #简繁切换台繁
@@ -104,7 +105,6 @@ engine:
- simplifier@chinese_english #中英翻译滤镜
- lua_filter@*super_sequence*F #手动排序,对高亮候选 ctrl+j左移动 ctrl+k 右移动 ctrl+0 移除位移
- lua_filter@*super_filter #功能太多详见Lua文件
- lua_filter@*super_lookup #字词输入中反查辅助筛选
- uniquifier # 去重
grammar:
@@ -449,6 +449,7 @@ wanxiang_lookup: #设置归属于super_lookup.lua
tags: [ abc ] # 检索当前tag的候选
key: "`" # 输入中反查引导符,要添加到 speller/alphabet
lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了
data_source: [db] #对于pro版本能从注释中加载词库辅助码详情见Pro版本base这里无需修改。
# 处理符合特定规则的输入码,如网址、反查
recognizer:
@@ -567,20 +568,19 @@ key_binder:
- { when: has_menu, accept: "Control+s", toggle: tone_display }
#通过快捷键Control+t开启超级tips
- { when: has_menu, accept: "Control+t", toggle: super_tips }
#通过快捷键Control+q开启超级tips
- { when: has_menu, accept: "Control+q", toggle: corrector }
#通过快捷键Control+g开启字符集过滤
- { when: has_menu, accept: "Control+g", toggle: charset_filter }
- { when: composing, accept: "Control+g", toggle: charset_filter }
#通过快捷键Control+q切换中文、英文、混合模式
- { when: has_menu, accept: "Control+q", toggle: zh_only }
- { when: composing, accept: "Control+q", toggle: zh_only }
# 使用 tab 在不同音节之间跳转
- { when: has_menu, accept: "Tab", send: "Control+Right" }
- { when: composing, accept: "Tab", send: "Control+Right" }
#当tab第一个字补码正确后可以使用Ctrl+tab进行上屏并依次补码
- { when: composing, accept: "Control+Tab", send_sequence: '{Home}{Shift+Right}{1}{Shift+Right}' }
#当输入编码后发现没有词,则通过双击``进入造词模式而且不需要删除编码,这个功能与``直接引导相呼应相配合
- { match: "^.*`$", accept: "`", send_sequence: '{BackSpace}{Home}{`}{`}{End}' }
#- { match: "^.*`$", accept: "`", send_sequence: '{BackSpace}{Home}{`}{`}{End}' }基础版暂时取消这个功能,
#斜杠被占用引导符号,因此输入本身设置为双击
- { match: "^/$", accept: "/", send_sequence: '{space}' }

View File

@@ -79,13 +79,13 @@ engine:
filters:
- lua_filter@*super_sequence*F #手动排序,高亮候选 ctrl+j左移动 ctrl+k 右移动 ctrl+0 移除位移
- reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜放在super_comment前面进一步被超级注释处理以获得拼音编码的提示
- lua_filter@*super_lookup #字词输入中反查辅助筛选
- lua_filter@*super_comment_preedit #超级注释模块、超级preedit支持错词提示、辅助码显示部件组字读音注释有声调、无声调全拼编码的转换支持个性化配置和关闭相应的功能详情搜索super_comment_preedit进行详细配置
- simplifier@emoji #Emoji滤镜
- simplifier@s2t #简繁切换通繁
- simplifier@s2tw #简繁切换台繁
- simplifier@s2hk #简繁切换港繁
- simplifier@chinese_english #中英翻译滤镜
- lua_filter@*super_lookup #字词输入中反查辅助筛选
- lua_filter@*super_filter #功能太多详见Lua文件
- uniquifier # 去重
@@ -273,6 +273,7 @@ wanxiang_lookup: #设置归属于super_lookup.lua
tags: [ abc ] # 检索当前tag的候选
key: "`" # 输入中反查引导符,要添加到 speller/alphabet
lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了
data_source: [db] #对于pro版本能从注释中加载词库辅助码详情见Pro版本base这里无需修改。
# 处理符合特定规则的输入码,如网址、反查
recognizer: