fix(super_lookup): 反查现已支持声调,与字母任意位置输入即可,词组根据输入的个数来从第一个汉字依次来匹配,输入位置也是随意的

This commit is contained in:
amzxyz
2026-01-13 00:59:59 +08:00
parent 3dd2ac1984
commit 5a47c9d492
5 changed files with 176 additions and 89 deletions

View File

@@ -470,10 +470,11 @@ wanxiang_lookup: #设置归属于super_lookup.lua
tags: [ abc ] # 检索当前tag的候选
key: "`" # 输入中反查引导符,要添加到 speller/alphabet
lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了
data_source: ['comment', 'db']
#data_source: ['db', 'comment']
#用来配置你的反查数据来自哪里注释comment即词库携带的辅助编码数据库db即反查方案携带的两分、多分、笔画
#谁在前哪种权重高比如 身db dian即dm深comment dm 就能让相应的候选排在前面
enable_tone: true #启用声调反查
data_source: ['aux', 'db']
#data_source: ['db', 'aux']
#用来配置你的反查数据来自哪里:注释(aux)即词库携带的辅助编码,数据库(db)即反查方案携带的两分、多分、笔画
#谁在前哪种权重高比如 身db dian即dm深aux dm 就能让相应的候选排在前面
#但注释在前有时候与笔画的感知会有一定冲突,可针对习惯使用一类数据库即可
# 处理符合特定规则的输入码,如网址、反查

View File

@@ -3,21 +3,38 @@
--tags: [ abc ] # 检索当前tag的候选
--key: "`" # 输入中反查引导符
--lookup: [ wanxiang_reverse ] #反查滤镜数据库
--data_source: [ comment, db ] # 优先级:写在前面优先
--data_source: [ aux, db ] # 优先级:写在前面优先。即使只写 db,只要开启 enable_tone,也能从注释获取声调。
--enable_tone: true #启用声调反查
-- Utility: escape every Lua-pattern magic character in `s` with a leading '%'
-- so the result can be embedded in a pattern and matched literally.
-- Returns an empty string when `s` is nil or false.
local function alt_lua_punc(s)
    if not s then
        return ''
    end
    -- Assigning to a single local keeps only gsub's first result (the string),
    -- dropping the replacement count.
    local escaped = s:gsub('([%.%+%-%*%?%[%]%^%$%(%)%%])', '%%%1')
    return escaped
end
-- Tone-mark lookup table: maps each tone-marked pinyin letter to the digit key
-- typed for that tone (7 = 1st tone, 8 = 2nd tone, 9 = 3rd tone, 0 = 4th tone).
local tones_map = {
    ["ā"]="7", ["á"]="8", ["ǎ"]="9", ["à"]="0",
    ["ō"]="7", ["ó"]="8", ["ǒ"]="9", ["ò"]="0",
    ["ē"]="7", ["é"]="8", ["ě"]="9", ["è"]="0",
    ["ī"]="7", ["í"]="8", ["ǐ"]="9", ["ì"]="0",
    ["ū"]="7", ["ú"]="8", ["ǔ"]="9", ["ù"]="0",
    ["ǖ"]="7", ["ǘ"]="8", ["ǚ"]="9", ["ǜ"]="0",
    -- Syllabic nasals carry the tone mark on n/m instead of a vowel; without
    -- these entries such syllables would be treated as having no tone mark.
    ["ń"]="8", ["ň"]="9", ["ǹ"]="0",
    ["ḿ"]="8", ["m̀"]="0",
}
-- Count the UTF-8 characters in `s`.
-- Uses the `utf8` library's `len` when it is available; otherwise falls back
-- to counting every byte that lies outside the 128-193 range (i.e. bytes that
-- are not UTF-8 continuation bytes).
local function get_utf8_len(s)
    if not (utf8 and utf8.len) then
        -- Fallback: gsub's second result is the number of matched bytes.
        local _, lead_bytes = string.gsub(s, "[^\128-\193]", "")
        return lead_bytes
    end
    -- Results of utf8.len are passed through untouched.
    return utf8.len(s)
end
-- Derive the tone digit of a pinyin string by looking for any tone-marked
-- letter listed in `tones_map`.
-- Returns nil for a nil/empty input, the mapped digit when a marked letter is
-- found, and "0" (the same digit the 4th-tone marks map to) when the string
-- carries no tone mark at all.
local function get_tone_from_pinyin(pinyin)
    if not pinyin or #pinyin == 0 then
        return nil
    end
    local digit = "0"
    for mark, mapped in pairs(tones_map) do
        -- Plain-text search: the marked letter is matched literally.
        if string.find(pinyin, mark, 1, true) then
            digit = mapped
            break
        end
    end
    return digit
end
-- 规则加载
local function parse_and_separate_rules(schema_id)
if not schema_id or #schema_id == 0 then return nil, nil end
@@ -56,35 +73,25 @@ end
local function expand_code_variant(main_projection, xlit_projection, part)
local out, seen = {}, {}
-- 统一添加函数 (负责去重 + 顺序保持)
local function add(s)
if s and #s > 0 and not seen[s] then
seen[s] = true
table.insert(out, s)
end
end
-- 通用奇数位提取器 (1, 3, 5...)
-- 规则:纯小写字母 + 偶数长度
local function extract_odd_positions(s)
if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end
local res = ""
for i = 1, #s, 2 do res = res .. s:sub(i, i) end
return res
end
-- 检查奇偶位组合,如果是 [jqxy] + u则生成对应的 v 版本
local function get_v_variant(s)
if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end
local res = ""
local has_change = false
for i = 1, #s, 2 do
local char_odd = s:sub(i, i)
local char_even = s:sub(i+1, i+1)
if (char_odd == 'j' or char_odd == 'q' or char_odd == 'x' or char_odd == 'y') and char_even == 'v' then
res = res .. char_odd .. 'u'
has_change = true
@@ -95,7 +102,6 @@ local function expand_code_variant(main_projection, xlit_projection, part)
return has_change and res or nil
end
-- 预处理单引号特例 (ce'shi -> cs)全拼用
local _, quote_count = part:gsub("'", "")
if quote_count == 1 then
local s1, s2 = part:match("^([^']*)'([^']*)$")
@@ -103,38 +109,27 @@ local function expand_code_variant(main_projection, xlit_projection, part)
add(s1:sub(1,1) .. s2:sub(1,1))
end
end
-- 保留原始编码,用户可能加入直接的编码用
-- 只有纯小写字母 (ceui) 才保留,含符号(ce'shi)或大写(ABC)均不保留
if part:match("^%l+$") then
add(part)
end
-- 对“原始编码”进行奇位提取 (ceui -> cu)
-- extract_odd_positions 内部已经校验了 ^%l+$,所以这里直接调用即可
if part:match("^%l+$") then add(part) end
local raw_extracted = extract_odd_positions(part)
if raw_extracted then add(raw_extracted) end
-- 规则投影 (Main Projection)
if main_projection and not part:match('^%u+$') then
local p = main_projection:apply(part, true)
if p and #p > 0 then
-- A. 加入投影全码 (如 yuif)
add(p)
-- B. 生成 v 变体 (如 yuif -> yvif)
local v_variant = get_v_variant(p)
if v_variant then add(v_variant) end
-- C. 对投影全码提取奇位 (如 yuif -> yi)
local proj_extracted = extract_odd_positions(p)
if proj_extracted then add(proj_extracted) end
end
end
-- 大写反查 (Xlit)
if part:match('^%u+$') and xlit_projection then
local xlit_result = xlit_projection:apply(part, true)
if xlit_result and #xlit_result > 0 then add(xlit_result) end
end
return out
end
-- 【DB】查表
local function build_reverse_group(main_projection, xlit_projection, db_table, text)
local group, seen = {}, {}
for _, db in ipairs(db_table) do
@@ -154,7 +149,6 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t
return group
end
-- 单字匹配 (Strict Prefix)
local function group_match(group, fuma)
if not group then return false end
for i = 1, #group do
@@ -163,7 +157,6 @@ local function group_match(group, fuma)
return false
end
-- 递归匹配引擎 (优化:整数 Key)
local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, memo, is_phrase_mode)
if input_idx > #input_str then return true end
if idx > #codes_sequence then return false end
@@ -177,9 +170,8 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx,
if codes then
for _, code in ipairs(codes) do
local skip = false
-- 词组模式下,过滤掉 >3 的全码
if is_phrase_mode and #code > 3 then skip = true end
if code:match("^%d+$") then skip = true end
if not skip then
local i_curr = input_idx
local c_curr = 1
@@ -202,8 +194,15 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx,
return result
end
-- 注释解析 (严格校验 + Trim)
local function parse_comment_codes(comment, pattern, target_len)
-- Report whether `target` occurs in the array-like table `list`.
-- A nil (or false) `list` is treated as containing nothing.
local function list_contains(list, target)
    local found = false
    if list then
        for _, item in ipairs(list) do
            if item == target then
                found = true
                break
            end
        end
    end
    return found
end
local function parse_comment_codes(comment, pattern, target_len, enable_tone)
if not comment or comment == "" then return nil end
local parts = {}
@@ -217,14 +216,31 @@ local function parse_comment_codes(comment, pattern, target_len)
local result = {}
for i, part in ipairs(parts) do
local p1, p2 = part:find(";")
if not p1 then return nil end
local pinyin_part
local codes_part
if p1 then
-- 有分号:前面是音/拼音,后面是码
pinyin_part = part:sub(1, p1 - 1)
codes_part = part:sub(p2 + 1)
else
-- 无分号:整体是音/拼音,无码
pinyin_part = part
codes_part = ""
end
local codes_part = part:sub(p2 + 1)
local codes_list = {}
for c in codes_part:gmatch("[^,]+") do
-- Trim
local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "")
if #trimmed > 0 then table.insert(codes_list, trimmed) end
if #codes_part > 0 then
for c in codes_part:gmatch("[^,]+") do
local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "")
if #trimmed > 0 then table.insert(codes_list, trimmed) end
end
end
if enable_tone then
local tone = get_tone_from_pinyin(pinyin_part)
if tone then
table.insert(codes_list, tone)
end
end
result[i] = codes_list
end
@@ -236,24 +252,34 @@ local f = {}
function f.init(env)
local config = env.engine.schema.config
-- 1. 先读取是否开启声调过滤 (默认为 true)
env.enable_tone = config:get_bool('wanxiang_lookup/enable_tone')
if env.enable_tone == nil then env.enable_tone = true end
-- 2. 读取数据源 data_source
local sources_list = config:get_list('wanxiang_lookup/data_source')
env.data_sources = {}
env.has_comment = false
-- 临时标记,判断配置里是否显式包含了 comment
local config_has_comment_source = false
env.has_db = false
if sources_list and sources_list.size > 0 then
for i = 0, sources_list.size - 1 do
local s = sources_list:get_value_at(i).value
table.insert(env.data_sources, s)
if s == 'comment' then env.has_comment = true end
if s == 'aux' then config_has_comment_source = true end
if s == 'db' then env.has_db = true end
end
else
env.data_sources = { 'comment', 'db' }
env.has_comment = true
env.data_sources = { 'aux', 'db' }
config_has_comment_source = true
env.has_db = true
end
-- 只要配置里用了 comment 做数据源,或者开启了 enable_tone (需要从注释借声调),都必须解析注释。
env.has_comment = config_has_comment_source or env.enable_tone
env.db_table = nil
if env.has_db then
local db_list = config:get_list("wanxiang_lookup/lookup")
@@ -275,7 +301,6 @@ function f.init(env)
if env.has_comment then
local delimiter = config:get_string('speller/delimiter') or " '"
if delimiter == "" then delimiter = " " end
-- 确保 " '" 中的所有字符都被加入排除列表 [^% %']+
env.comment_split_ptrn = "[^" .. alt_lua_punc(delimiter) .. "]+"
end
@@ -325,15 +350,25 @@ function f.func(input, env)
local fuma = ctx_input:sub(s_end + 1)
if #fuma == 0 then for cand in input:iter() do yield(cand) end return end
local if_single_char_first = env.engine.context:get_option('char_priority')
local buckets = {}
local max_len = 0
for i = 1, #env.data_sources do buckets[i] = {} end
local long_word_cands = {}
local tone_filter_seq = {}
local clean_fuma = ""
for i = 1, #fuma do
local char = fuma:sub(i, i)
if char == "7" or char == "8" or char == "9" or char == "0" then
table.insert(tone_filter_seq, char)
else
clean_fuma = clean_fuma .. char
end
end
local apply_tone_filter = env.enable_tone and (#tone_filter_seq > 0)
local if_single_char_first = env.engine.context:get_option('char_priority')
local buckets = {}
for i = 1, #env.data_sources do buckets[i] = {} end
local long_word_cands = {}
local max_len = 0
local has_any_match = false
-- GC
if env.cache_size > 2000 then
env._global_db_cache = {}
env._global_comment_cache = {}
@@ -344,24 +379,22 @@ function f.func(input, env)
for cand in input:iter() do
if cand.type == 'sentence' then goto skip end
local cand_text = cand.text
local cand_len = get_utf8_len(cand_text)
if not cand_len or cand_len == 0 then goto skip end
local b = string.byte(cand_text, 1)
if b and b < 128 then goto skip end
local raw_data = {}
-- 1. Comment Data
-- A: Comment Data
if env.has_comment then
local genuine = cand:get_genuine()
local comment_text = genuine and genuine.comment or ""
if comment_text ~= "" then
local cache_key = cand_text .. "_" .. comment_text
if not comment_cache[cache_key] then
comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len) or false
comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len, env.enable_tone) or false
env.cache_size = env.cache_size + 1
end
if comment_cache[cache_key] then
@@ -370,15 +403,13 @@ function f.func(input, env)
end
end
-- 2. DB Data
-- B: DB Data
if env.has_db then
raw_data.db = {}
local pos = 1
local i = 0
for _, code_point in utf8.codes(cand_text) do
i = i + 1
local char_str = utf8.char(code_point)
if not db_cache[char_str] then
db_cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str)
env.cache_size = env.cache_size + 1
@@ -387,36 +418,66 @@ function f.func(input, env)
end
end
-- 3. Match
-- 提取借用声调 (总是尝试从 raw_data.comment 提取,即使 data_source 只有 db)
local borrowed_tones = {}
if raw_data.comment then
for k, codes in ipairs(raw_data.comment) do
borrowed_tones[k] = {}
for _, c in ipairs(codes) do
if c:match("^%d+$") then borrowed_tones[k][c] = true end
end
end
end
local matched_idx = nil
for i, source_type in ipairs(env.data_sources) do
local codes_seq = raw_data[source_type]
if codes_seq then
local is_match = false
if source_type == 'comment' then
if cand_len == 1 then
if group_match(codes_seq[1], fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, false) then is_match = true end
end
elseif source_type == 'db' then
if cand_len == 1 then
if group_match(codes_seq[1], fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, true) then is_match = true end
local tone_match_pass = true
if apply_tone_filter then
for k, tone_input in ipairs(tone_filter_seq) do
if k > #codes_seq then break end
local has_tone = list_contains(codes_seq[k], tone_input)
-- 如果是 db 源且自身没匹配到声调,尝试查阅 borrowed_tones
if not has_tone and source_type == 'db' then
if borrowed_tones[k] and borrowed_tones[k][tone_input] then has_tone = true end
end
if not has_tone then
tone_match_pass = false
break
end
end
end
if is_match then
matched_idx = i
break
if tone_match_pass then
local is_match = false
if source_type == 'aux' then
if cand_len == 1 then
if group_match(codes_seq[1], clean_fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, false) then is_match = true end
end
elseif source_type == 'db' then
if cand_len == 1 then
if group_match(codes_seq[1], clean_fuma) then is_match = true end
else
local memo = {}
if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, true) then is_match = true end
end
end
if is_match then
matched_idx = i
break
end
end
end
end
if matched_idx then
has_any_match = true
if if_single_char_first and cand_len > 1 then
table.insert(long_word_cands, cand)
else
@@ -428,7 +489,6 @@ function f.func(input, env)
::skip::
end
-- 输出 (Global Length Priority)
if if_single_char_first then
for i = 1, #env.data_sources do
if buckets[i][1] then for _, c in ipairs(buckets[i][1]) do yield(c) end end
@@ -447,6 +507,19 @@ function f.func(input, env)
end
for _, c in ipairs(long_word_cands) do yield(c) end
if not has_any_match and apply_tone_filter and #clean_fuma > 0 and env.has_db and env.db_table then
for _, db_obj in ipairs(env.db_table) do
local res_str = db_obj:lookup(clean_fuma)
if res_str and #res_str > 0 then
for word in res_str:gmatch("%S+") do
local cand = Candidate("wanxiang_shadow", s_end, #ctx_input, word, "")
cand.quality = 1
yield(cand)
end
end
end
end
end
function f.tags_match(seg, env)

View File

@@ -34,7 +34,8 @@ local P = {}
function P.init(env)
env.tone_state = "idle"
local config = env.engine.schema.config
env.lookup_key = config:get_string('wanxiang_lookup/key') or '`'
local ctx = env.engine and env.engine.context
if not ctx or not ctx.update_notifier then return end
@@ -105,6 +106,17 @@ function P.func(key, env)
-- 主键盘数字 09标记为 compress
local r = key:repr() or ""
if r:match("^[0-9]$") then
local input = ctx.input or ""
local caret = (ctx.caret_pos ~= nil) and ctx.caret_pos or #input
if caret < 0 then caret = 0 end
if caret > #input then caret = #input end
local left = (caret > 0) and input:sub(1, caret) or ""
if left:find(env.lookup_key, 1, true) then
env.tone_state = "idle"
return wanxiang.RIME_PROCESS_RESULTS.kNoop
end
env.tone_state = "compress"
-- 这里用“预测压缩是否会发生”来决定要不要告诉 Rime “我处理了这个按键”

View File

@@ -470,6 +470,7 @@ wanxiang_lookup: #设置归属于super_lookup.lua
key: "`" # 输入中反查引导符,要添加到 speller/alphabet
lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了
data_source: [db] #对于pro版本能从注释中加载词库辅助码详情见Pro版本base这里无需修改。
enable_tone: true #启用声调反查
# 处理符合特定规则的输入码,如网址、反查
recognizer:

View File

@@ -81,7 +81,6 @@ engine:
- reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜放在super_comment前面进一步被超级注释处理以获得拼音编码的提示
- lua_filter@*super_lookup #字词输入中反查辅助筛选
- lua_filter@*super_filter #功能太多详见Lua文件
- lua_filter@*super_comment_preedit #超级注释模块、超级preedit支持错词提示、辅助码显示部件组字读音注释有声调、无声调全拼编码的转换支持个性化配置和关闭相应的功能详情搜索super_comment_preedit进行详细配置
- simplifier@emoji #Emoji滤镜
- simplifier@s2t #简繁切换通繁
- simplifier@s2tw #简繁切换台繁
@@ -91,7 +90,7 @@ engine:
- uniquifier # 去重
t9:
enable: true #启用仓、元书t9输入方式
isDisplayOriginalPreedit: false
@@ -195,8 +194,9 @@ translator:
initial_quality: 3 #初始质量拼音的权重应该比英文大
spelling_hints: 30 #将注释以词典code字符串形式完全暴露通过super_comment.lua完全接管灵活配置。
always_show_comments: true # Rime 默认在 preedit 等于 comment 时取消显示 comment这里强制一直显示供super_comment.lua做判断用。
comment_format: {comment} #将注释以词典字符串形式完全暴露通过super_preedit.lua完全接管灵活配置。
comment_format:
- xlit/āáǎàōóǒòēéěèīíǐìūúǔùǖǘǚǜüńňǹḿm̀/aaaaooooeeeeiiiiuuuuvvvvvnnnmmm/
- xform/^(.*);.*$/$1/
#disable_user_dict_for_patterns: #如果你开启调频需要一并考虑这个配置是否需要基本的6码3字不调频你可以自定义目前的逻辑是依然记录用户词但满足规则的不输出不被使用
# - "^[a-z]{1,6}"