fix(super_lookup): 反查现已支持声调，与字母任意位置输入即可，词组根据输入的个数来从第一个汉字依次来匹配，输入位置也是随意的

2026-05-30 07:36:52 +00:00 · 2026-01-13 00:59:59 +08:00
parent 3dd2ac1984
commit 5a47c9d492
5 changed files with 176 additions and 89 deletions
--- a/lua/super_lookup.lua
+++ b/lua/super_lookup.lua
@@ -3,21 +3,38 @@
  --tags: [ abc ]  # 检索当前tag的候选
  --key: "`"       # 输入中反查引导符
  --lookup: [ wanxiang_reverse ] #反查滤镜数据库
-  --data_source: [ comment, db ] # 优先级：写在前面优先
+  --data_source: [ comment, db ] # 优先级：写在前面优先。即使只写db，只要开启enable_tone也能从注释获取声调。
+  --enable_tone: true  #启用声调反查

 -- 工具函数：转义正则特殊字符
 local function alt_lua_punc(s)
    return s and s:gsub('([%.%+%-%*%?%[%]%^%$%(%)%%])', '%%%1') or ''
 end
-
+-- 声调映射表
+local tones_map = {
+    ["ā"]="7", ["á"]="8", ["ǎ"]="9", ["à"]="0",
+    ["ō"]="7", ["ó"]="8", ["ǒ"]="9", ["ò"]="0",
+    ["ē"]="7", ["é"]="8", ["ě"]="9", ["è"]="0",
+    ["ī"]="7", ["í"]="8", ["ǐ"]="9", ["ì"]="0",
+    ["ū"]="7", ["ú"]="8", ["ǔ"]="9", ["ù"]="0",
+    ["ǖ"]="7", ["ǘ"]="8", ["ǚ"]="9", ["ǜ"]="0"
+}
 -- 高性能 UTF8 长度获取
 local function get_utf8_len(s)
-    -- 优先使用 Rime 内置的 utf8 库
    if utf8 and utf8.len then return utf8.len(s) end
    local _, count = string.gsub(s, "[^\128-\193]", "")
    return count
 end
-
+-- 提取声调数字 (无声调/轻声 -> 默认归为 4)
+local function get_tone_from_pinyin(pinyin)
+    if not pinyin or #pinyin == 0 then return nil end
+    for char, tone in pairs(tones_map) do
+        if string.find(pinyin, char, 1, true) then
+            return tone
+        end
+    end
+    return "0"
+end
 -- 规则加载
 local function parse_and_separate_rules(schema_id)
    if not schema_id or #schema_id == 0 then return nil, nil end
@@ -56,35 +73,25 @@ end

 local function expand_code_variant(main_projection, xlit_projection, part)
    local out, seen = {}, {}
-    
-    -- 统一添加函数 (负责去重 + 顺序保持)
    local function add(s) 
        if s and #s > 0 and not seen[s] then 
            seen[s] = true 
            table.insert(out, s) 
        end 
    end
-
-    -- 通用奇数位提取器 (1, 3, 5...)
-    -- 规则：纯小写字母 + 偶数长度
    local function extract_odd_positions(s)
        if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end
        local res = ""
        for i = 1, #s, 2 do res = res .. s:sub(i, i) end
        return res
    end
-
-    -- 检查奇偶位组合，如果是 [jqxy] + u，则生成对应的 v 版本
    local function get_v_variant(s)
        if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end
-        
        local res = ""
        local has_change = false
-        
        for i = 1, #s, 2 do
            local char_odd = s:sub(i, i)
            local char_even = s:sub(i+1, i+1)
-            
            if (char_odd == 'j' or char_odd == 'q' or char_odd == 'x' or char_odd == 'y') and char_even == 'v' then
                res = res .. char_odd .. 'u'
                has_change = true
@@ -95,7 +102,6 @@ local function expand_code_variant(main_projection, xlit_projection, part)
        return has_change and res or nil
    end

-    -- 预处理单引号特例 (ce'shi -> cs)全拼用
    local _, quote_count = part:gsub("'", "")
    if quote_count == 1 then
        local s1, s2 = part:match("^([^']*)'([^']*)$")
@@ -103,38 +109,27 @@ local function expand_code_variant(main_projection, xlit_projection, part)
            add(s1:sub(1,1) .. s2:sub(1,1))
        end
    end
-    -- 保留原始编码，用户可能加入直接的编码用
-    -- 只有纯小写字母 (ceui) 才保留，含符号(ce'shi)或大写(ABC)均不保留
-    if part:match("^%l+$") then
-        add(part)
-    end
-    -- 对“原始编码”进行奇位提取 (ceui -> cu)
-    -- extract_odd_positions 内部已经校验了 ^%l+$，所以这里直接调用即可
+    if part:match("^%l+$") then add(part) end
    local raw_extracted = extract_odd_positions(part)
    if raw_extracted then add(raw_extracted) end

-    -- 规则投影 (Main Projection)
    if main_projection and not part:match('^%u+$') then
        local p = main_projection:apply(part, true)
        if p and #p > 0 then
-            -- A. 加入投影全码 (如 yuif)
            add(p) 
-            -- B. 生成 v 变体 (如 yuif -> yvif)
            local v_variant = get_v_variant(p)
            if v_variant then add(v_variant) end
-            -- C. 对投影全码提取奇位 (如 yuif -> yi)
            local proj_extracted = extract_odd_positions(p)
            if proj_extracted then add(proj_extracted) end
        end
    end
-    -- 大写反查 (Xlit)
    if part:match('^%u+$') and xlit_projection then
        local xlit_result = xlit_projection:apply(part, true)
        if xlit_result and #xlit_result > 0 then add(xlit_result) end
    end
    return out
 end
-- 【DB】查表
+
 local function build_reverse_group(main_projection, xlit_projection, db_table, text)
    local group, seen = {}, {}
    for _, db in ipairs(db_table) do
@@ -154,7 +149,6 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t
    return group
 end

-- 单字匹配 (Strict Prefix)
 local function group_match(group, fuma)
    if not group then return false end
    for i = 1, #group do 
@@ -163,7 +157,6 @@ local function group_match(group, fuma)
    return false
 end

-- 递归匹配引擎 (优化：整数 Key)
 local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, memo, is_phrase_mode)
    if input_idx > #input_str then return true end
    if idx > #codes_sequence then return false end
@@ -177,9 +170,8 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx,
    if codes then
        for _, code in ipairs(codes) do
            local skip = false
-            -- 词组模式下，过滤掉 >3 的全码
            if is_phrase_mode and #code > 3 then skip = true end
-
+            if code:match("^%d+$") then skip = true end
            if not skip then
                local i_curr = input_idx
                local c_curr = 1
@@ -202,8 +194,15 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx,
    return result
 end

-- 注释解析 (严格校验 + Trim)
-local function parse_comment_codes(comment, pattern, target_len)
+local function list_contains(list, target)
+    if not list then return false end
+    for _, v in ipairs(list) do
+        if v == target then return true end
+    end
+    return false
+end
+
+local function parse_comment_codes(comment, pattern, target_len, enable_tone)
    if not comment or comment == "" then return nil end
    local parts = {}
    
@@ -217,14 +216,31 @@ local function parse_comment_codes(comment, pattern, target_len)
    local result = {}
    for i, part in ipairs(parts) do
        local p1, p2 = part:find(";")
-        if not p1 then return nil end
+        local pinyin_part
+        local codes_part
+        
+        if p1 then
+            -- 有分号：前面是音/拼音，后面是码
+            pinyin_part = part:sub(1, p1 - 1)
+            codes_part = part:sub(p2 + 1)
+        else
+            -- 无分号：整体是音/拼音，无码
+            pinyin_part = part
+            codes_part = ""
+        end
        
-        local codes_part = part:sub(p2 + 1)
        local codes_list = {}
-        for c in codes_part:gmatch("[^,]+") do 
-            -- Trim
-            local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "")
-            if #trimmed > 0 then table.insert(codes_list, trimmed) end
+        if #codes_part > 0 then
+            for c in codes_part:gmatch("[^,]+") do 
+                local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "")
+                if #trimmed > 0 then table.insert(codes_list, trimmed) end
+            end
+        end
+        if enable_tone then
+            local tone = get_tone_from_pinyin(pinyin_part)
+            if tone then
+                table.insert(codes_list, tone)
+            end
        end
        result[i] = codes_list
    end
@@ -236,24 +252,34 @@ local f = {}
 function f.init(env)
    local config = env.engine.schema.config
    
+    -- 1. 先读取是否开启声调过滤 (默认为 true)
+    env.enable_tone = config:get_bool('wanxiang_lookup/enable_tone')
+    if env.enable_tone == nil then env.enable_tone = true end
+
+    -- 2. 读取数据源 data_source
    local sources_list = config:get_list('wanxiang_lookup/data_source')
    env.data_sources = {}
-    env.has_comment = false
+    
+    -- 临时标记，判断配置里是否显式包含了 comment
+    local config_has_comment_source = false
    env.has_db = false
    
    if sources_list and sources_list.size > 0 then
        for i = 0, sources_list.size - 1 do
            local s = sources_list:get_value_at(i).value
            table.insert(env.data_sources, s)
-            if s == 'comment' then env.has_comment = true end
+            if s == 'aux' then config_has_comment_source = true end
            if s == 'db' then env.has_db = true end
        end
    else
-        env.data_sources = { 'comment', 'db' }
-        env.has_comment = true
+        env.data_sources = { 'aux', 'db' }
+        config_has_comment_source = true
        env.has_db = true
    end

+    -- 只要配置里用了 comment 做数据源，或者开启了 enable_tone (需要从注释借声调)，都必须解析注释。
+    env.has_comment = config_has_comment_source or env.enable_tone
+
    env.db_table = nil
    if env.has_db then
        local db_list = config:get_list("wanxiang_lookup/lookup")
@@ -275,7 +301,6 @@ function f.init(env)
    if env.has_comment then
        local delimiter = config:get_string('speller/delimiter') or " '"
        if delimiter == "" then delimiter = " " end
-        -- 确保 " '" 中的所有字符都被加入排除列表 [^% %']+
        env.comment_split_ptrn = "[^" .. alt_lua_punc(delimiter) .. "]+"
    end

@@ -325,15 +350,25 @@ function f.func(input, env)
    local fuma = ctx_input:sub(s_end + 1)
    if #fuma == 0 then for cand in input:iter() do yield(cand) end return end

-    local if_single_char_first = env.engine.context:get_option('char_priority')
-    
-    local buckets = {}
-    local max_len = 0
-    for i = 1, #env.data_sources do buckets[i] = {} end
-    
-    local long_word_cands = {}
+    local tone_filter_seq = {}
+    local clean_fuma = ""
+    for i = 1, #fuma do
+        local char = fuma:sub(i, i)
+        if char == "7" or char == "8" or char == "9" or char == "0" then
+            table.insert(tone_filter_seq, char)
+        else
+            clean_fuma = clean_fuma .. char
+        end
+    end
+    local apply_tone_filter = env.enable_tone and (#tone_filter_seq > 0)
+
+    local if_single_char_first = env.engine.context:get_option('char_priority')
+    local buckets = {}
+    for i = 1, #env.data_sources do buckets[i] = {} end
+    local long_word_cands = {}
+    local max_len = 0
+    local has_any_match = false 

-    -- GC
    if env.cache_size > 2000 then
        env._global_db_cache = {}
        env._global_comment_cache = {}
@@ -344,24 +379,22 @@ function f.func(input, env)

    for cand in input:iter() do
        if cand.type == 'sentence' then goto skip end
-        
        local cand_text = cand.text
        local cand_len = get_utf8_len(cand_text)
        if not cand_len or cand_len == 0 then goto skip end
-        
        local b = string.byte(cand_text, 1)
        if b and b < 128 then goto skip end

        local raw_data = {}
        
-        -- 1. Comment Data
+        -- A: Comment Data
        if env.has_comment then
            local genuine = cand:get_genuine()
            local comment_text = genuine and genuine.comment or ""
            if comment_text ~= "" then
                local cache_key = cand_text .. "_" .. comment_text
                if not comment_cache[cache_key] then
-                    comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len) or false
+                    comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len, env.enable_tone) or false
                    env.cache_size = env.cache_size + 1
                end
                if comment_cache[cache_key] then
@@ -370,15 +403,13 @@ function f.func(input, env)
            end
        end

-        -- 2. DB Data
+        -- B: DB Data
        if env.has_db then
            raw_data.db = {}
-            local pos = 1
            local i = 0
            for _, code_point in utf8.codes(cand_text) do
                i = i + 1
                local char_str = utf8.char(code_point)
-                
                if not db_cache[char_str] then
                    db_cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str)
                    env.cache_size = env.cache_size + 1 
@@ -387,36 +418,66 @@ function f.func(input, env)
            end
        end

-        -- 3. Match
+        -- 提取借用声调 (总是尝试从 raw_data.comment 提取，即使 data_source 只有 db)
+        local borrowed_tones = {} 
+        if raw_data.comment then
+            for k, codes in ipairs(raw_data.comment) do
+                borrowed_tones[k] = {}
+                for _, c in ipairs(codes) do
+                    if c:match("^%d+$") then borrowed_tones[k][c] = true end
+                end
+            end
+        end
+
        local matched_idx = nil
+
        for i, source_type in ipairs(env.data_sources) do
            local codes_seq = raw_data[source_type]
            if codes_seq then
-                local is_match = false
-                if source_type == 'comment' then
-                    if cand_len == 1 then
-                        if group_match(codes_seq[1], fuma) then is_match = true end
-                    else
-                        local memo = {}
-                        if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, false) then is_match = true end
-                    end
-                elseif source_type == 'db' then
-                    if cand_len == 1 then
-                         if group_match(codes_seq[1], fuma) then is_match = true end
-                    else
-                         local memo = {}
-                         if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, true) then is_match = true end
+                local tone_match_pass = true
+                if apply_tone_filter then
+                    for k, tone_input in ipairs(tone_filter_seq) do
+                        if k > #codes_seq then break end
+                        local has_tone = list_contains(codes_seq[k], tone_input)
+                        -- 如果是 db 源且自身没匹配到声调，尝试查阅 borrowed_tones
+                        if not has_tone and source_type == 'db' then
+                            if borrowed_tones[k] and borrowed_tones[k][tone_input] then has_tone = true end
+                        end
+                        if not has_tone then
+                            tone_match_pass = false
+                            break
+                        end
                    end
                end
-                
-                if is_match then
-                    matched_idx = i
-                    break 
+
+                if tone_match_pass then
+                    local is_match = false
+                    if source_type == 'aux' then
+                        if cand_len == 1 then
+                            if group_match(codes_seq[1], clean_fuma) then is_match = true end
+                        else
+                            local memo = {}
+                            if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, false) then is_match = true end
+                        end
+                    elseif source_type == 'db' then
+                        if cand_len == 1 then
+                             if group_match(codes_seq[1], clean_fuma) then is_match = true end
+                        else
+                             local memo = {}
+                             if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, true) then is_match = true end
+                        end
+                    end
+                    
+                    if is_match then
+                        matched_idx = i
+                        break 
+                    end
                end
            end
        end

        if matched_idx then
+            has_any_match = true
            if if_single_char_first and cand_len > 1 then
                table.insert(long_word_cands, cand)
            else
@@ -428,7 +489,6 @@ function f.func(input, env)
        ::skip::
    end

-    -- 输出 (Global Length Priority)
    if if_single_char_first then
        for i = 1, #env.data_sources do
            if buckets[i][1] then for _, c in ipairs(buckets[i][1]) do yield(c) end end
@@ -447,6 +507,19 @@ function f.func(input, env)
    end
    
    for _, c in ipairs(long_word_cands) do yield(c) end
+
+    if not has_any_match and apply_tone_filter and #clean_fuma > 0 and env.has_db and env.db_table then
+        for _, db_obj in ipairs(env.db_table) do
+            local res_str = db_obj:lookup(clean_fuma)
+            if res_str and #res_str > 0 then
+                for word in res_str:gmatch("%S+") do
+                    local cand = Candidate("wanxiang_shadow", s_end, #ctx_input, word, "")
+                    cand.quality = 1 
+                    yield(cand)
+                end
+            end
+        end
+    end
 end

 function f.tags_match(seg, env)
--- a/lua/tone_fallback.lua
+++ b/lua/tone_fallback.lua
@@ -34,7 +34,8 @@ local P = {}

 function P.init(env)
    env.tone_state = "idle"
-
+    local config = env.engine.schema.config
+    env.lookup_key = config:get_string('wanxiang_lookup/key') or '`'
    local ctx = env.engine and env.engine.context
    if not ctx or not ctx.update_notifier then return end

@@ -105,6 +106,17 @@ function P.func(key, env)
    -- 主键盘数字 0–9：标记为 compress
    local r = key:repr() or ""
    if r:match("^[0-9]$") then
+        local input = ctx.input or ""
+        local caret = (ctx.caret_pos ~= nil) and ctx.caret_pos or #input
+        if caret < 0 then caret = 0 end
+        if caret > #input then caret = #input end
+        local left = (caret > 0) and input:sub(1, caret) or ""
+
+        if left:find(env.lookup_key, 1, true) then
+            env.tone_state = "idle"
+            return wanxiang.RIME_PROCESS_RESULTS.kNoop
+        end
+
        env.tone_state = "compress"

        -- 这里用“预测压缩是否会发生”来决定要不要告诉 Rime “我处理了这个按键”