diff --git a/custom/wanxiang_pro.schema.yaml b/custom/wanxiang_pro.schema.yaml index 99ff80f..5e23be1 100644 --- a/custom/wanxiang_pro.schema.yaml +++ b/custom/wanxiang_pro.schema.yaml @@ -99,8 +99,9 @@ engine: - script_translator@add_user_dict #自造词之制造词汇入口 filters: - reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜,放在super_comment前面,进一步被超级注释处理以获得拼音编码的提示 - - lua_filter@*auto_phrase #无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词 - - lua_filter@*super_comment_preedit #超级注释模块、超级preedit,支持错词提示、辅助码显示,部件组字读音注释,有声调、无声调全拼编码的转换,支持个性化配置和关闭相应的功能,详情搜索super_comment_preedit进行详细配置 + - lua_filter@*auto_phrase #comment前,无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词 + - lua_filter@*super_lookup #comment前,字词输入中反查辅助筛选 + - lua_filter@*super_comment_preedit #OpenCC前,超级注释模块、超级preedit,支持错词提示、辅助码显示,部件组字读音注释,有声调、无声调全拼编码的转换,支持个性化配置和关闭相应的功能,详情搜索super_comment_preedit进行详细配置 - simplifier@emoji #Emoji滤镜 - simplifier@s2t #简繁切换通繁 - simplifier@s2tw #简繁切换台繁 @@ -108,7 +109,6 @@ engine: - simplifier@chinese_english #中英翻译滤镜 - lua_filter@*super_sequence*F #手动排序,对高亮候选 ctrl+j左移动 ctrl+k 右移动 ctrl+0 移除位移 - lua_filter@*super_filter #功能太多详见Lua文件 - - lua_filter@*super_lookup #字词输入中反查辅助筛选 - uniquifier #去重 grammar: @@ -447,6 +447,11 @@ wanxiang_lookup: #设置归属于super_lookup.lua tags: [ abc ] # 检索当前tag的候选 key: "`" # 输入中反查引导符,要添加到 speller/alphabet lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了 + data_source: ['comment', 'db'] + #data_source: ['db', 'comment'] + #用来配置你的反查数据来自哪里,注释comment:即词库携带的辅助编码,数据库db:即反查方案携带的两分、多分、笔画 + #谁在前哪种权重高比如 身db dian即dm,深comment dm 就能让相应的候选排在前面 + #但注释在前有时候与笔画的感知会有一定冲突,可针对习惯使用一类数据库即可 # 处理符合特定规则的输入码,如网址、反查 recognizer: @@ -571,6 +576,7 @@ key_binder: - { when: composing, accept: "Control+g", toggle: charset_filter } #通过快捷键Control+q切换中文、英文、混合模式 - { when: has_menu, accept: "Control+q", toggle: zh_only } + - { when: composing, accept: "Control+q", toggle: zh_only } # 使用 tab 在不同音节之间跳转 - { when: has_menu, accept: "Tab", send: "Control+Right" } - { when: composing, accept: "Tab", send: "Control+Right" } diff --git a/lua/super_lookup.lua b/lua/super_lookup.lua index 3e38a66..a4e71d5 100644 --- a/lua/super_lookup.lua +++ b/lua/super_lookup.lua @@ -1,23 +1,24 @@ --@amzxyz https://github.com/amzxyz/rime_wanxiang - --wanxiang_lookup: #设置归属于super_lookup.lua --tags: [ abc ] # 检索当前tag的候选 - --key: "`" # 输入中反查引导符,要添加到 speller/alphabet - --lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了 + --key: "`" # 输入中反查引导符 + --lookup: [ wanxiang_reverse ] #反查滤镜数据库 + --data_source: [ comment, db ] # 优先级:写在前面优先 ------------------------------------------------------------- --- 工具函数 ------------------------------------------------------------- +-- 工具函数:转义正则特殊字符 local function alt_lua_punc(s) return s and s:gsub('([%.%+%-%*%?%[%]%^%$%(%)%%])', '%%%1') or '' end -local function is_all_upper(s) return s:match('^%u+$') ~= nil end -local function is_all_lower(s) return s:match('^%l+$') ~= nil end +-- 高性能 UTF8 长度获取 +local function get_utf8_len(s) + -- 优先使用 Rime 内置的 utf8 库 + if utf8 and utf8.len then return utf8.len(s) end + local _, count = string.gsub(s, "[^\128-\193]", "") + return count +end ------------------------------------------------------------- -- 规则加载 ------------------------------------------------------------- local function parse_and_separate_rules(schema_id) if not schema_id or #schema_id == 0 then return nil, nil end local schema = Schema(schema_id) @@ -53,41 +54,31 @@ local function get_schema_rules(env) return main_rules or {}, xlit_rules or {} end ------------------------------------------------------------- --- 核心逻辑 ------------------------------------------------------------- +-- 【DB】构建编码 local function expand_code_variant(main_projection, xlit_projection, part) local out, seen = {}, {} - local function add(s) - if s and #s > 0 and not seen[s] then - seen[s] = true - out[#out + 1] = s + local function add(s) if s and #s > 0 and not seen[s] then seen[s] = true out[#out + 1] = s end end + add(part) + if main_projection then local p = main_projection:apply(part, true) if p and #p > 0 then add(p) end end + local base = {} + for i = 1, #out do local elem = out[i] if elem:match('^%l+$') then base[#base + 1] = elem end end + + -- 提取 1,3 位生成构造码 + for _, s in ipairs(base) do + -- 安全检查:确保长度足够 + if #s >= 3 and #s <= 4 and s:match('^%l+$') then + add(s:sub(1,1) .. s:sub(3,3)) end end - add(part) - if main_projection then - local p = main_projection:apply(part, true) - if p and #p > 0 then add(p) end - end - - local base = {} - for i = 1, #out do - local elem = out[i] - if is_all_lower(elem) then base[#base + 1] = elem end - end - for _, s in ipairs(base) do - if #s == 4 and is_all_lower(s) then - add(s:sub(1,1) .. s:sub(3,3)) - end - end - if is_all_upper(part) and xlit_projection then - local xlit_result = xlit_projection:apply(part, true) - if xlit_result and #xlit_result > 0 then add(xlit_result) end + if part:match('^%u+$') and xlit_projection then + local xlit_result = xlit_projection:apply(part, true) + if xlit_result and #xlit_result > 0 then add(xlit_result) end end return out end +-- 【DB】查表 local function build_reverse_group(main_projection, xlit_projection, db_table, text) local group, seen = {}, {} for _, db in ipairs(db_table) do @@ -96,10 +87,10 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t for part in code:gmatch('%S+') do local variants = expand_code_variant(main_projection, xlit_projection, part) for _, v in ipairs(variants) do - if not seen[v] then - seen[v] = true - group[#group + 1] = v - end + if not seen[v] then + seen[v] = true + group[#group + 1] = v + end end end end @@ -107,41 +98,130 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t return group end +-- 单字匹配 (Strict Prefix) local function group_match(group, fuma) if not group then return false end - for i = 1, #group do - if tostring(group[i]):sub(1, #fuma) == fuma then return true end + for i = 1, #group do + if string.sub(group[i], 1, #fuma) == fuma then return true end end return false end ------------------------------------------------------------- --- 过滤器主体 ------------------------------------------------------------- +-- 递归匹配引擎 (优化:整数 Key) +local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, memo, is_phrase_mode) + if input_idx > #input_str then return true end + if idx > #codes_sequence then return false end + + local state_key = idx * 1000 + input_idx + if memo[state_key] ~= nil then return memo[state_key] end + + local codes = codes_sequence[idx] + local result = false + + if codes then + for _, code in ipairs(codes) do + local skip = false + -- 词组模式下,过滤掉 >3 的全码 + if is_phrase_mode and #code > 3 then skip = true end + + if not skip then + local i_curr = input_idx + local c_curr = 1 + local i_limit = #input_str + local c_limit = #code + while i_curr <= i_limit and c_curr <= c_limit do + if input_str:byte(i_curr) == code:byte(c_curr) then i_curr = i_curr + 1 end + c_curr = c_curr + 1 + end + if match_fuzzy_recursive(codes_sequence, idx + 1, input_str, i_curr, memo, is_phrase_mode) then + result = true + break + end + end + end + else + if match_fuzzy_recursive(codes_sequence, idx + 1, input_str, input_idx, memo, is_phrase_mode) then result = true end + end + memo[state_key] = result + return result +end + +-- 注释解析 (严格校验 + Trim) +local function parse_comment_codes(comment, pattern, target_len) + if not comment or comment == "" then return nil end + local parts = {} + + if target_len == 1 then + parts = { comment } + else + for seg in comment:gmatch(pattern) do table.insert(parts, seg) end + if #parts ~= target_len then return nil end + end + + local result = {} + for i, part in ipairs(parts) do + local p1, p2 = part:find(";") + if not p1 then return nil end + + local codes_part = part:sub(p2 + 1) + local codes_list = {} + for c in codes_part:gmatch("[^,]+") do + -- Trim + local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "") + if #trimmed > 0 then table.insert(codes_list, trimmed) end + end + result[i] = codes_list + end + return result +end + local f = {} function f.init(env) local config = env.engine.schema.config - env.if_reverse_lookup = false - env.db_table = nil - local db = config:get_list("wanxiang_lookup/lookup") - if db and db.size > 0 then - env.db_table = {} - for i = 0, db.size - 1 do - table.insert(env.db_table, ReverseLookup(db:get_value_at(i).value)) + local sources_list = config:get_list('wanxiang_lookup/data_source') + env.data_sources = {} + env.has_comment = false + env.has_db = false + + if sources_list and sources_list.size > 0 then + for i = 0, sources_list.size - 1 do + local s = sources_list:get_value_at(i).value + table.insert(env.data_sources, s) + if s == 'comment' then env.has_comment = true end + if s == 'db' then env.has_db = true end end - env.if_reverse_lookup = true else - return + env.data_sources = { 'comment', 'db' } + env.has_comment = true + env.has_db = true end - local main_rules, xlit_rules = get_schema_rules(env) - env.main_projection = (type(main_rules) == 'table' and #main_rules > 0) and Projection() or nil - if env.main_projection then env.main_projection:load(main_rules) end - - env.xlit_projection = (type(xlit_rules) == 'table' and #xlit_rules > 0) and Projection() or nil - if env.xlit_projection then env.xlit_projection:load(xlit_rules) end + env.db_table = nil + if env.has_db then + local db_list = config:get_list("wanxiang_lookup/lookup") + if db_list and db_list.size > 0 then + env.db_table = {} + for i = 0, db_list.size - 1 do + table.insert(env.db_table, ReverseLookup(db_list:get_value_at(i).value)) + end + local main_rules, xlit_rules = get_schema_rules(env) + env.main_projection = (type(main_rules) == 'table' and #main_rules > 0) and Projection() or nil + if env.main_projection then env.main_projection:load(main_rules) end + env.xlit_projection = (type(xlit_rules) == 'table' and #xlit_rules > 0) and Projection() or nil + if env.xlit_projection then env.xlit_projection:load(xlit_rules) end + else + env.has_db = false + end + end + + if env.has_comment then + local delimiter = config:get_string('speller/delimiter') or " '" + if delimiter == "" then delimiter = " " end + -- 确保 " '" 中的所有字符都被加入排除列表 [^% %']+ + env.comment_split_ptrn = "[^" .. alt_lua_punc(delimiter) .. "]+" + end env.search_key_str = config:get_string('wanxiang_lookup/key') or '`' env.search_key_alt = alt_lua_punc(env.search_key_str) @@ -163,7 +243,7 @@ function f.init(env) local preedit = ctx:get_preedit() local no_search_string = input:match('^(.-)' .. env.search_key_alt) local edit = preedit.text:match('^(.-)' .. env.search_key_alt) - if edit and edit:match('[%w;]') then + if edit and edit:match('[%w/]') then ctx.input = no_search_string .. env.search_key_str else ctx.input = no_search_string @@ -172,101 +252,144 @@ function f.init(env) end end) - -- 【安全缓存】初始化 - env._global_group_cache = {} - env.cache_size = 0 -- 计数器 + env._global_db_cache = {} + env._global_comment_cache = {} + env.cache_size = 0 end function f.func(input, env) - if not env.if_reverse_lookup then + if #env.data_sources == 0 then for cand in input:iter() do yield(cand) end return end local ctx_input = env.engine.context.input local s_start, s_end = ctx_input:find(env.search_key_alt, 1, false) - - if not s_start then - for cand in input:iter() do yield(cand) end - return - end - + if not s_start then for cand in input:iter() do yield(cand) end return end local fuma = ctx_input:sub(s_end + 1) - - -- 【惰性检查】无辅码,直接显示,不查库 - if #fuma == 0 then - for cand in input:iter() do yield(cand) end - return - end + if #fuma == 0 then for cand in input:iter() do yield(cand) end return end - local fuma_segments = {} - for segment in fuma:gmatch('[^' .. env.search_key_alt .. ']+') do - table.insert(fuma_segments, string.lower(segment)) - end - local if_single_char_first = env.engine.context:get_option('char_priority') + + local buckets = {} + local max_len = 0 + for i = 1, #env.data_sources do buckets[i] = {} end + local long_word_cands = {} - local cache = env._global_group_cache - -- 如果缓存条目超过 3000,清空重来 - -- 3000个字足以覆盖99.9%的日常输入,且仅占用极小内存 - if env.cache_size > 3000 then - env._global_group_cache = {} + -- GC + if env.cache_size > 2000 then + env._global_db_cache = {} + env._global_comment_cache = {} env.cache_size = 0 - cache = env._global_group_cache -- 更新引用 end + local db_cache = env._global_db_cache + local comment_cache = env._global_comment_cache for cand in input:iter() do if cand.type == 'sentence' then goto skip end local cand_text = cand.text + local cand_len = get_utf8_len(cand_text) + if not cand_len or cand_len == 0 then goto skip end - -- 西文跳过 local b = string.byte(cand_text, 1) if b and b < 128 then goto skip end - local cand_len = utf8.len(cand_text) + local raw_data = {} - local characters = {} - local pos = 1 - for i = 1, cand_len do - local next_pos = utf8.offset(cand_text, i + 1) - local char_str = cand_text:sub(pos, next_pos and next_pos - 1) - characters[i] = char_str - pos = next_pos - - -- 【全局缓存】带计数 - if not cache[char_str] then - cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str) - env.cache_size = env.cache_size + 1 -- 增加计数 - end - end - - local ok = true - if #fuma_segments == 1 and cand_len == 1 then - ok = group_match(cache[characters[1]], fuma_segments[1]) - elseif #fuma_segments > 0 and cand_len > 1 then - local match_count = (#fuma_segments < cand_len) and #fuma_segments or cand_len - for i = 1, match_count do - if not group_match(cache[characters[i]], fuma_segments[i]) then - ok = false - break + -- 1. Comment Data + if env.has_comment then + local genuine = cand:get_genuine() + local comment_text = genuine and genuine.comment or "" + if comment_text ~= "" then + local cache_key = cand_text .. "_" .. comment_text + if not comment_cache[cache_key] then + comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len) or false + env.cache_size = env.cache_size + 1 + end + if comment_cache[cache_key] then + raw_data.comment = comment_cache[cache_key] end end - else - if cand_len < #fuma_segments then ok = false end end - if ok then + -- 2. DB Data + if env.has_db then + raw_data.db = {} + local pos = 1 + local i = 0 + for _, code_point in utf8.codes(cand_text) do + i = i + 1 + local char_str = utf8.char(code_point) + + if not db_cache[char_str] then + db_cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str) + env.cache_size = env.cache_size + 1 + end + raw_data.db[i] = db_cache[char_str] or {} + end + end + + -- 3. Match + local matched_idx = nil + for i, source_type in ipairs(env.data_sources) do + local codes_seq = raw_data[source_type] + if codes_seq then + local is_match = false + if source_type == 'comment' then + if cand_len == 1 then + if group_match(codes_seq[1], fuma) then is_match = true end + else + local memo = {} + if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, false) then is_match = true end + end + elseif source_type == 'db' then + if cand_len == 1 then + if group_match(codes_seq[1], fuma) then is_match = true end + else + local memo = {} + if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, true) then is_match = true end + end + end + + if is_match then + matched_idx = i + break + end + end + end + + if matched_idx then if if_single_char_first and cand_len > 1 then table.insert(long_word_cands, cand) else - yield(cand) + if not buckets[matched_idx][cand_len] then buckets[matched_idx][cand_len] = {} end + table.insert(buckets[matched_idx][cand_len], cand) + if cand_len > max_len then max_len = cand_len end end end ::skip:: end + -- 输出 (Global Length Priority) + if if_single_char_first then + for i = 1, #env.data_sources do + if buckets[i][1] then for _, c in ipairs(buckets[i][1]) do yield(c) end end + end + for l = max_len, 2, -1 do + for i = 1, #env.data_sources do + if buckets[i][l] then for _, c in ipairs(buckets[i][l]) do yield(c) end end + end + end + else + for l = max_len, 1, -1 do + for i = 1, #env.data_sources do + if buckets[i][l] then for _, c in ipairs(buckets[i][l]) do yield(c) end end + end + end + end + for _, c in ipairs(long_word_cands) do yield(c) end end @@ -276,9 +399,10 @@ function f.tags_match(seg, env) end function f.fini(env) - if env.if_reverse_lookup and env.notifier then env.notifier:disconnect() end + if env.notifier then env.notifier:disconnect() end env.db_table = nil - env._global_group_cache = nil + env._global_db_cache = nil + env._global_comment_cache = nil collectgarbage('collect') end diff --git a/wanxiang.schema.yaml b/wanxiang.schema.yaml index f12b650..31d418d 100644 --- a/wanxiang.schema.yaml +++ b/wanxiang.schema.yaml @@ -95,8 +95,9 @@ engine: - script_translator@user_dict_set #使用自造词 filters: - reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜,放在super_comment前面,进一步被超级注释处理以获得拼音编码的提示 - - lua_filter@*auto_phrase #无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词 - - lua_filter@*super_comment_preedit #超级注释模块、超级preedit,支持错词提示、辅助码显示,部件组字读音注释,有声调、无声调全拼编码的转换,支持个性化配置和关闭相应的功能,详情搜索super_comment_preedit进行详细配置 + - lua_filter@*auto_phrase #comment前,无感造词,关闭调频的时候将汉字写入次翻译器,当没有英文候选的时候追加\上屏可完成英文造词 + - lua_filter@*super_lookup #comment前,字词输入中反查辅助筛选 + - lua_filter@*super_comment_preedit #OpenCC前,超级注释模块、超级preedit,支持错词提示、辅助码显示,部件组字读音注释,有声调、无声调全拼编码的转换,支持个性化配置和关闭相应的功能,详情搜索super_comment_preedit进行详细配置 - simplifier@emoji #Emoji滤镜 - simplifier@s2t #简繁切换通繁 - simplifier@s2tw #简繁切换台繁 @@ -104,7 +105,6 @@ engine: - simplifier@chinese_english #中英翻译滤镜 - lua_filter@*super_sequence*F #手动排序,对高亮候选 ctrl+j左移动 ctrl+k 右移动 ctrl+0 移除位移 - lua_filter@*super_filter #功能太多详见Lua文件 - - lua_filter@*super_lookup #字词输入中反查辅助筛选 - uniquifier # 去重 grammar: @@ -449,6 +449,7 @@ wanxiang_lookup: #设置归属于super_lookup.lua tags: [ abc ] # 检索当前tag的候选 key: "`" # 输入中反查引导符,要添加到 speller/alphabet lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了 + data_source: [db] #对于pro版本能从注释中加载词库辅助码,详情见Pro版本,base这里无需修改。 # 处理符合特定规则的输入码,如网址、反查 recognizer: @@ -567,20 +568,19 @@ key_binder: - { when: has_menu, accept: "Control+s", toggle: tone_display } #通过快捷键Control+t开启超级tips - { when: has_menu, accept: "Control+t", toggle: super_tips } -#通过快捷键Control+q开启超级tips - - { when: has_menu, accept: "Control+q", toggle: corrector } #通过快捷键Control+g开启字符集过滤 - { when: has_menu, accept: "Control+g", toggle: charset_filter } - { when: composing, accept: "Control+g", toggle: charset_filter } #通过快捷键Control+q切换中文、英文、混合模式 - { when: has_menu, accept: "Control+q", toggle: zh_only } + - { when: composing, accept: "Control+q", toggle: zh_only } # 使用 tab 在不同音节之间跳转 - { when: has_menu, accept: "Tab", send: "Control+Right" } - { when: composing, accept: "Tab", send: "Control+Right" } #当tab第一个字补码正确后,可以使用Ctrl+tab进行上屏并依次补码 - { when: composing, accept: "Control+Tab", send_sequence: '{Home}{Shift+Right}{1}{Shift+Right}' } #当输入编码后发现没有词,则通过双击``进入造词模式而且不需要删除编码,这个功能与``直接引导相呼应相配合 - - { match: "^.*`$", accept: "`", send_sequence: '{BackSpace}{Home}{`}{`}{End}' } + #- { match: "^.*`$", accept: "`", send_sequence: '{BackSpace}{Home}{`}{`}{End}' }基础版暂时取消这个功能, #斜杠被占用引导符号,因此输入本身设置为双击 - { match: "^/$", accept: "/", send_sequence: '{space}' } diff --git a/wanxiang_t9.schema.yaml b/wanxiang_t9.schema.yaml index a4682b6..0d7d49d 100644 --- a/wanxiang_t9.schema.yaml +++ b/wanxiang_t9.schema.yaml @@ -79,13 +79,13 @@ engine: filters: - lua_filter@*super_sequence*F #手动排序,高亮候选 ctrl+j左移动 ctrl+k 右移动 ctrl+0 移除位移 - reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜,放在super_comment前面,进一步被超级注释处理以获得拼音编码的提示 + - lua_filter@*super_lookup #字词输入中反查辅助筛选 - lua_filter@*super_comment_preedit #超级注释模块、超级preedit,支持错词提示、辅助码显示,部件组字读音注释,有声调、无声调全拼编码的转换,支持个性化配置和关闭相应的功能,详情搜索super_comment_preedit进行详细配置 - simplifier@emoji #Emoji滤镜 - simplifier@s2t #简繁切换通繁 - simplifier@s2tw #简繁切换台繁 - simplifier@s2hk #简繁切换港繁 - simplifier@chinese_english #中英翻译滤镜 - - lua_filter@*super_lookup #字词输入中反查辅助筛选 - lua_filter@*super_filter #功能太多详见Lua文件 - uniquifier # 去重 @@ -273,6 +273,7 @@ wanxiang_lookup: #设置归属于super_lookup.lua tags: [ abc ] # 检索当前tag的候选 key: "`" # 输入中反查引导符,要添加到 speller/alphabet lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了 + data_source: [db] #对于pro版本能从注释中加载词库辅助码,详情见Pro版本,base这里无需修改。 # 处理符合特定规则的输入码,如网址、反查 recognizer: