fix(super_lookup): 反查现已支持声调,与字母任意位置输入即可,词组根据输入的个数来从第一个汉字依次来匹配,输入位置也是随意的

This commit is contained in:
amzxyz
2026-01-13 00:59:59 +08:00
parent 3dd2ac1984
commit 5a47c9d492
5 changed files with 176 additions and 89 deletions

View File

@@ -3,21 +3,38 @@
--tags: [ abc ] # 检索当前tag的候选
--key: "`" # 输入中反查引导符
--lookup: [ wanxiang_reverse ] #反查滤镜数据库
--data_source: [ comment, db ] # 优先级:写在前面优先
--data_source: [ comment, db ] # 优先级:写在前面优先。即使只写db只要开启enable_tone也能从注释获取声调。
--enable_tone: true #启用声调反查
-- Utility: escape Lua pattern magic characters.
-- Every one of . + - * ? [ ] ^ $ ( ) % in `s` is prefixed with '%' so the
-- result can be embedded in a Lua pattern as literal text.
-- Returns '' when `s` is nil/false; always returns exactly one value.
local function alt_lua_punc(s)
    if not s then
        return ''
    end
    -- Assigning to a single local drops gsub's second result (the match count).
    local escaped = s:gsub('([%.%+%-%*%?%[%]%^%$%(%)%%])', '%%%1')
    return escaped or ''
end
-- Tone mapping table: maps each tone-marked pinyin vowel to a digit string.
-- Macron vowels (ā ō ē ī ū ǖ) -> "7", acute (á ...) -> "8",
-- caron (ǎ ...) -> "9", grave (à ...) -> "0".
-- Keys are multi-byte UTF-8 strings, so look them up with a plain-text
-- search rather than by single-byte indexing.
local tones_map = {
["ā"]="7", ["á"]="8", ["ǎ"]="9", ["à"]="0",
["ō"]="7", ["ó"]="8", ["ǒ"]="9", ["ò"]="0",
["ē"]="7", ["é"]="8", ["ě"]="9", ["è"]="0",
["ī"]="7", ["í"]="8", ["ǐ"]="9", ["ì"]="0",
["ū"]="7", ["ú"]="8", ["ǔ"]="9", ["ù"]="0",
["ǖ"]="7", ["ǘ"]="8", ["ǚ"]="9", ["ǜ"]="0"
}
-- Character count of a UTF-8 string.
-- When the `utf8` library is present its `utf8.len` is used and its
-- result(s) are passed through untouched. Otherwise the count is the number
-- of bytes outside the range 128-193, i.e. every byte except continuation
-- bytes and the 0xC0/0xC1 lead bytes.
local function get_utf8_len(s)
    local native_len = utf8 and utf8.len
    if native_len then
        return native_len(s)
    end
    -- gsub's second result is the number of substitutions performed.
    local count = select(2, string.gsub(s, "[^\128-\193]", ""))
    return count
end
-- Extract the tone digit from a pinyin string.
-- Looks for any tone-marked vowel listed in `tones_map` and returns the digit
-- mapped to it. When several tone marks are present, the one that occurs
-- earliest in the string wins, so the result does not depend on the
-- (unspecified) traversal order of `pairs`.
-- @param pinyin string|nil  pinyin text, typically a single syllable
-- @return string|nil  nil for nil/empty input; the mapped digit when a tone
--                     mark is found; "0" when there is no tone mark
--                     (toneless / neutral tone is filed under the same digit
--                     as the grave-accent tone)
local function get_tone_from_pinyin(pinyin)
    if not pinyin or #pinyin == 0 then return nil end
    local best_pos, best_tone
    for char, tone in pairs(tones_map) do
        -- Plain-text search: the keys are literal multi-byte sequences.
        local pos = string.find(pinyin, char, 1, true)
        if pos and (not best_pos or pos < best_pos) then
            best_pos, best_tone = pos, tone
        end
    end
    return best_tone or "0"
end
-- 规则加载
local function parse_and_separate_rules(schema_id)
if not schema_id or #schema_id == 0 then return nil, nil end
@@ -56,35 +73,25 @@ end
local function expand_code_variant(main_projection, xlit_projection, part)
local out, seen = {}, {}
-- Shared appender: pushes `s` onto `out` once, preserving first-seen order.
-- nil, empty strings and values already recorded in `seen` are ignored.
local function add(s)
    if not s or #s == 0 or seen[s] then
        return
    end
    seen[s] = true
    table.insert(out, s)
end
-- Generic odd-position extractor: keeps the characters at positions 1, 3, 5, ...
-- Applies only to a non-empty string made purely of lowercase letters whose
-- length is even; any other input yields nil.
local function extract_odd_positions(s)
    if not s then return nil end
    if not s:match("^%l+$") then return nil end
    if #s % 2 ~= 0 then return nil end
    local picked = {}
    for pos = 1, #s, 2 do
        picked[#picked + 1] = s:sub(pos, pos)
    end
    return table.concat(picked)
end
-- 检查奇偶位组合,如果是 [jqxy] + u则生成对应的 v 版本
local function get_v_variant(s)
if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end
local res = ""
local has_change = false
for i = 1, #s, 2 do
local char_odd = s:sub(i, i)
local char_even = s:sub(i+1, i+1)
if (char_odd == 'j' or char_odd == 'q' or char_odd == 'x' or char_odd == 'y') and char_even == 'v' then
res = res .. char_odd .. 'u'
has_change = true
@@ -95,7 +102,6 @@ local function expand_code_variant(main_projection, xlit_projection, part)
return has_change and res or nil
end
-- 预处理单引号特例 (ce'shi -> cs)全拼用
local _, quote_count = part:gsub("'", "")
if quote_count == 1 then
local s1, s2 = part:match("^([^']*)'([^']*)$")
@@ -103,38 +109,27 @@ local function expand_code_variant(main_projection, xlit_projection, part)
add(s1:sub(1,1) .. s2:sub(1,1))
end
end
-- 保留原始编码,用户可能加入直接的编码用
-- 只有纯小写字母 (ceui) 才保留,含符号(ce'shi)或大写(ABC)均不保留
if part:match("^%l+$") then
add(part)
end
-- 对“原始编码”进行奇位提取 (ceui -> cu)
-- extract_odd_positions 内部已经校验了 ^%l+$,所以这里直接调用即可
if part:match("^%l+$") then add(part) end
local raw_extracted = extract_odd_positions(part)
if raw_extracted then add(raw_extracted) end
-- 规则投影 (Main Projection)
if main_projection and not part:match('^%u+$') then
local p = main_projection:apply(part, true)
if p and #p > 0 then
-- A. 加入投影全码 (如 yuif)
add(p)
-- B. 生成 v 变体 (如 yuif -> yvif)
local v_variant = get_v_variant(p)
if v_variant then add(v_variant) end
-- C. 对投影全码提取奇位 (如 yuif -> yi)
local proj_extracted = extract_odd_positions(p)
if proj_extracted then add(proj_extracted) end
end
end
-- 大写反查 (Xlit)
if part:match('^%u+$') and xlit_projection then
local xlit_result = xlit_projection:apply(part, true)
if xlit_result and #xlit_result > 0 then add(xlit_result) end
end
return out
end
-- 【DB】查表
local function build_reverse_group(main_projection, xlit_projection, db_table, text)
local group, seen = {}, {}
for _, db in ipairs(db_table) do
@@ -154,7 +149,6 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t
return group
end
-- 单字匹配 (Strict Prefix)
local function group_match(group, fuma)
if not group then return false end
for i = 1, #group do
@@ -163,7 +157,6 @@ local function group_match(group, fuma)
return false
end
-- 递归匹配引擎 (优化:整数 Key)
local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, memo, is_phrase_mode)
if input_idx > #input_str then return true end
if idx > #codes_sequence then return false end
@@ -177,9 +170,8 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx,
if codes then
for _, code in ipairs(codes) do
local skip = false
-- 词组模式下,过滤掉 >3 的全码
if is_phrase_mode and #code > 3 then skip = true end
if code:match("^%d+$") then skip = true end
if not skip then
local i_curr = input_idx
local c_curr = 1
@@ -202,8 +194,15 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx,
return result
end
-- 注释解析 (严格校验 + Trim)
local function parse_comment_codes(comment, pattern, target_len)
-- Membership test on a sequence.
-- Returns true when some element of `list` equals `target` (compared with ==),
-- false otherwise, including when `list` is nil.
local function list_contains(list, target)
    local found = false
    if list then
        for _, item in ipairs(list) do
            if item == target then
                found = true
                break
            end
        end
    end
    return found
end
local function parse_comment_codes(comment, pattern, target_len, enable_tone)
if not comment or comment == "" then return nil end
local parts = {}
@@ -217,14 +216,31 @@ local function parse_comment_codes(comment, pattern, target_len)
local result = {}
for i, part in ipairs(parts) do
local p1, p2 = part:find(";")
if not p1 then return nil end
local pinyin_part
local codes_part
if p1 then
-- 有分号:前面是音/拼音,后面是码
pinyin_part = part:sub(1, p1 - 1)
codes_part = part:sub(p2 + 1)
else
-- 无分号:整体是音/拼音,无码
pinyin_part = part
codes_part = ""
end
local codes_part = part:sub(p2 + 1)
local codes_list = {}
for c in codes_part:gmatch("[^,]+") do
-- Trim
local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "")
if #trimmed > 0 then table.insert(codes_list, trimmed) end
if #codes_part > 0 then
for c in codes_part:gmatch("[^,]+") do
local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "")
if #trimmed > 0 then table.insert(codes_list, trimmed) end
end
end
if enable_tone then
local tone = get_tone_from_pinyin(pinyin_part)
if tone then
table.insert(codes_list, tone)
end
end
result[i] = codes_list
end
@@ -236,24 +252,34 @@ local f = {}
function f.init(env)
local config = env.engine.schema.config
-- 1. 先读取是否开启声调过滤 (默认为 true)
env.enable_tone = config:get_bool('wanxiang_lookup/enable_tone')
if env.enable_tone == nil then env.enable_tone = true end
-- 2. 读取数据源 data_source
local sources_list = config:get_list('wanxiang_lookup/data_source')
env.data_sources = {}
env.has_comment = false
-- 临时标记,判断配置里是否显式包含了 comment
local config_has_comment_source = false
env.has_db = false
if sources_list and sources_list.size > 0 then
for i = 0, sources_list.size - 1 do
local s = sources_list:get_value_at(i).value
table.insert(env.data_sources, s)
if s == 'comment' then env.has_comment = true end
if s == 'aux' then config_has_comment_source = true end
if s == 'db' then env.has_db = true end
end
else
env.data_sources = { 'comment', 'db' }
env.has_comment = true
env.data_sources = { 'aux', 'db' }
config_has_comment_source = true
env.has_db = true
end
-- 只要配置里用了 comment 做数据源,或者开启了 enable_tone (需要从注释借声调),都必须解析注释。
env.has_comment = config_has_comment_source or env.enable_tone
env.db_table = nil
if env.has_db then
local db_list = config:get_list("wanxiang_lookup/lookup")
@@ -275,7 +301,6 @@ function f.init(env)
if env.has_comment then
local delimiter = config:get_string('speller/delimiter') or " '"
if delimiter == "" then delimiter = " " end
-- 确保 " '" 中的所有字符都被加入排除列表 [^% %']+
env.comment_split_ptrn = "[^" .. alt_lua_punc(delimiter) .. "]+"
end
@@ -325,15 +350,25 @@ function f.func(input, env)
local fuma = ctx_input:sub(s_end + 1)
if #fuma == 0 then for cand in input:iter() do yield(cand) end return end
local if_single_char_first = env.engine.context:get_option('char_priority')
local buckets = {}
local max_len = 0
for i = 1, #env.data_sources do buckets[i] = {} end
local long_word_cands = {}
local tone_filter_seq = {}
local clean_fuma = ""
for i = 1, #fuma do
local char = fuma:sub(i, i)
if char == "7" or char == "8" or char == "9" or char == "0" then
table.insert(tone_filter_seq, char)
else
clean_fuma = clean_fuma .. char
end
end
local apply_tone_filter = env.enable_tone and (#tone_filter_seq > 0)
local if_single_char_first = env.engine.context:get_option('char_priority')
local buckets = {}
for i = 1, #env.data_sources do buckets[i] = {} end
local long_word_cands = {}
local max_len = 0
local has_any_match = false
-- GC
if env.cache_size > 2000 then
env._global_db_cache = {}
env._global_comment_cache = {}
@@ -344,24 +379,22 @@ function f.func(input, env)
for cand in input:iter() do
if cand.type == 'sentence' then goto skip end
local cand_text = cand.text
local cand_len = get_utf8_len(cand_text)
if not cand_len or cand_len == 0 then goto skip end
local b = string.byte(cand_text, 1)
if b and b < 128 then goto skip end
local raw_data = {}
-- 1. Comment Data
-- A: Comment Data
if env.has_comment then
local genuine = cand:get_genuine()
local comment_text = genuine and genuine.comment or ""
if comment_text ~= "" then
local cache_key = cand_text .. "_" .. comment_text
if not comment_cache[cache_key] then
comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len) or false
comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len, env.enable_tone) or false
env.cache_size = env.cache_size + 1
end
if comment_cache[cache_key] then
@@ -370,15 +403,13 @@ function f.func(input, env)
end
end
-- 2. DB Data
-- B: DB Data
if env.has_db then
raw_data.db = {}
local pos = 1
local i = 0
for _, code_point in utf8.codes(cand_text) do
i = i + 1
local char_str = utf8.char(code_point)
if not db_cache[char_str] then
db_cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str)
env.cache_size = env.cache_size + 1
@@ -387,36 +418,66 @@ function f.func(input, env)
end
end
-- 3. Match
-- 提取借用声调 (总是尝试从 raw_data.comment 提取,即使 data_source 只有 db)
local borrowed_tones = {}
if raw_data.comment then
for k, codes in ipairs(raw_data.comment) do
borrowed_tones[k] = {}
for _, c in ipairs(codes) do
if c:match("^%d+$") then borrowed_tones[k][c] = true end
end
end
end
local matched_idx = nil
for i, source_type in ipairs(env.data_sources) do
local codes_seq = raw_data[source_type]
if codes_seq then
local is_match = false
if source_type == 'comment' then
if cand_len == 1 then
if group_match(codes_seq[1], fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, false) then is_match = true end
end
elseif source_type == 'db' then
if cand_len == 1 then
if group_match(codes_seq[1], fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, true) then is_match = true end
local tone_match_pass = true
if apply_tone_filter then
for k, tone_input in ipairs(tone_filter_seq) do
if k > #codes_seq then break end
local has_tone = list_contains(codes_seq[k], tone_input)
-- 如果是 db 源且自身没匹配到声调,尝试查阅 borrowed_tones
if not has_tone and source_type == 'db' then
if borrowed_tones[k] and borrowed_tones[k][tone_input] then has_tone = true end
end
if not has_tone then
tone_match_pass = false
break
end
end
end
if is_match then
matched_idx = i
break
if tone_match_pass then
local is_match = false
if source_type == 'aux' then
if cand_len == 1 then
if group_match(codes_seq[1], clean_fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, false) then is_match = true end
end
elseif source_type == 'db' then
if cand_len == 1 then
if group_match(codes_seq[1], clean_fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, true) then is_match = true end
end
end
if is_match then
matched_idx = i
break
end
end
end
end
if matched_idx then
has_any_match = true
if if_single_char_first and cand_len > 1 then
table.insert(long_word_cands, cand)
else
@@ -428,7 +489,6 @@ function f.func(input, env)
::skip::
end
-- 输出 (Global Length Priority)
if if_single_char_first then
for i = 1, #env.data_sources do
if buckets[i][1] then for _, c in ipairs(buckets[i][1]) do yield(c) end end
@@ -447,6 +507,19 @@ function f.func(input, env)
end
for _, c in ipairs(long_word_cands) do yield(c) end
if not has_any_match and apply_tone_filter and #clean_fuma > 0 and env.has_db and env.db_table then
for _, db_obj in ipairs(env.db_table) do
local res_str = db_obj:lookup(clean_fuma)
if res_str and #res_str > 0 then
for word in res_str:gmatch("%S+") do
local cand = Candidate("wanxiang_shadow", s_end, #ctx_input, word, "")
cand.quality = 1
yield(cand)
end
end
end
end
end
function f.tags_match(seg, env)

View File

@@ -34,7 +34,8 @@ local P = {}
function P.init(env)
env.tone_state = "idle"
local config = env.engine.schema.config
env.lookup_key = config:get_string('wanxiang_lookup/key') or '`'
local ctx = env.engine and env.engine.context
if not ctx or not ctx.update_notifier then return end
@@ -105,6 +106,17 @@ function P.func(key, env)
-- 主键盘数字 0-9 标记为 compress
local r = key:repr() or ""
if r:match("^[0-9]$") then
local input = ctx.input or ""
local caret = (ctx.caret_pos ~= nil) and ctx.caret_pos or #input
if caret < 0 then caret = 0 end
if caret > #input then caret = #input end
local left = (caret > 0) and input:sub(1, caret) or ""
if left:find(env.lookup_key, 1, true) then
env.tone_state = "idle"
return wanxiang.RIME_PROCESS_RESULTS.kNoop
end
env.tone_state = "compress"
-- 这里用“预测压缩是否会发生”来决定要不要告诉 Rime “我处理了这个按键”