diff --git a/custom/wanxiang_pro.schema.yaml b/custom/wanxiang_pro.schema.yaml index ef143c7..5f9b16b 100644 --- a/custom/wanxiang_pro.schema.yaml +++ b/custom/wanxiang_pro.schema.yaml @@ -470,10 +470,11 @@ wanxiang_lookup: #设置归属于super_lookup.lua tags: [ abc ] # 检索当前tag的候选 key: "`" # 输入中反查引导符,要添加到 speller/alphabet lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了 - data_source: ['comment', 'db'] - #data_source: ['db', 'comment'] - #用来配置你的反查数据来自哪里,注释comment:即词库携带的辅助编码,数据库db:即反查方案携带的两分、多分、笔画 - #谁在前哪种权重高比如 身db dian即dm,深comment dm 就能让相应的候选排在前面 + enable_tone: true #启用声调反查 + data_source: ['aux', 'db'] + #data_source: ['db', 'aux'] + #用来配置你的反查数据来自哪里,注释aux:即词库携带的辅助编码,数据库db:即反查方案携带的两分、多分、笔画 + #谁在前哪种权重高比如 身db dian即dm,深aux dm 就能让相应的候选排在前面 #但注释在前有时候与笔画的感知会有一定冲突,可针对习惯使用一类数据库即可 # 处理符合特定规则的输入码,如网址、反查 diff --git a/lua/super_lookup.lua b/lua/super_lookup.lua index 2df77c7..d869a57 100644 --- a/lua/super_lookup.lua +++ b/lua/super_lookup.lua @@ -3,21 +3,38 @@ --tags: [ abc ] # 检索当前tag的候选 --key: "`" # 输入中反查引导符 --lookup: [ wanxiang_reverse ] #反查滤镜数据库 - --data_source: [ comment, db ] # 优先级:写在前面优先 + --data_source: [ comment, db ] # 优先级:写在前面优先。即使只写db,只要开启enable_tone也能从注释获取声调。 + --enable_tone: true #启用声调反查 -- 工具函数:转义正则特殊字符 local function alt_lua_punc(s) return s and s:gsub('([%.%+%-%*%?%[%]%^%$%(%)%%])', '%%%1') or '' end - +-- 声调映射表 +local tones_map = { + ["ā"]="7", ["á"]="8", ["ǎ"]="9", ["à"]="0", + ["ō"]="7", ["ó"]="8", ["ǒ"]="9", ["ò"]="0", + ["ē"]="7", ["é"]="8", ["ě"]="9", ["è"]="0", + ["ī"]="7", ["í"]="8", ["ǐ"]="9", ["ì"]="0", + ["ū"]="7", ["ú"]="8", ["ǔ"]="9", ["ù"]="0", + ["ǖ"]="7", ["ǘ"]="8", ["ǚ"]="9", ["ǜ"]="0" +} -- 高性能 UTF8 长度获取 local function get_utf8_len(s) - -- 优先使用 Rime 内置的 utf8 库 if utf8 and utf8.len then return utf8.len(s) end local _, count = string.gsub(s, "[^\128-\193]", "") return count end - +-- 提取声调数字 (无声调/轻声 -> 默认归为 4) +local function get_tone_from_pinyin(pinyin) + if not pinyin or #pinyin == 0 then return nil end + for char, tone in pairs(tones_map) do + if string.find(pinyin, char, 1, true) then + return tone + end + end + return "0" +end -- 规则加载 local function parse_and_separate_rules(schema_id) if not schema_id or #schema_id == 0 then return nil, nil end @@ -56,35 +73,25 @@ end local function expand_code_variant(main_projection, xlit_projection, part) local out, seen = {}, {} - - -- 统一添加函数 (负责去重 + 顺序保持) local function add(s) if s and #s > 0 and not seen[s] then seen[s] = true table.insert(out, s) end end - - -- 通用奇数位提取器 (1, 3, 5...) - -- 规则:纯小写字母 + 偶数长度 local function extract_odd_positions(s) if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end local res = "" for i = 1, #s, 2 do res = res .. s:sub(i, i) end return res end - - -- 检查奇偶位组合,如果是 [jqxy] + u,则生成对应的 v 版本 local function get_v_variant(s) if not s or not s:match("^%l+$") or #s % 2 ~= 0 then return nil end - local res = "" local has_change = false - for i = 1, #s, 2 do local char_odd = s:sub(i, i) local char_even = s:sub(i+1, i+1) - if (char_odd == 'j' or char_odd == 'q' or char_odd == 'x' or char_odd == 'y') and char_even == 'v' then res = res .. char_odd .. 'u' has_change = true @@ -95,7 +102,6 @@ local function expand_code_variant(main_projection, xlit_projection, part) return has_change and res or nil end - -- 预处理单引号特例 (ce'shi -> cs)全拼用 local _, quote_count = part:gsub("'", "") if quote_count == 1 then local s1, s2 = part:match("^([^']*)'([^']*)$") @@ -103,38 +109,27 @@ local function expand_code_variant(main_projection, xlit_projection, part) add(s1:sub(1,1) .. s2:sub(1,1)) end end - -- 保留原始编码,用户可能加入直接的编码用 - -- 只有纯小写字母 (ceui) 才保留,含符号(ce'shi)或大写(ABC)均不保留 - if part:match("^%l+$") then - add(part) - end - -- 对“原始编码”进行奇位提取 (ceui -> cu) - -- extract_odd_positions 内部已经校验了 ^%l+$,所以这里直接调用即可 + if part:match("^%l+$") then add(part) end local raw_extracted = extract_odd_positions(part) if raw_extracted then add(raw_extracted) end - -- 规则投影 (Main Projection) if main_projection and not part:match('^%u+$') then local p = main_projection:apply(part, true) if p and #p > 0 then - -- A. 加入投影全码 (如 yuif) add(p) - -- B. 生成 v 变体 (如 yuif -> yvif) local v_variant = get_v_variant(p) if v_variant then add(v_variant) end - -- C. 对投影全码提取奇位 (如 yuif -> yi) local proj_extracted = extract_odd_positions(p) if proj_extracted then add(proj_extracted) end end end - -- 大写反查 (Xlit) if part:match('^%u+$') and xlit_projection then local xlit_result = xlit_projection:apply(part, true) if xlit_result and #xlit_result > 0 then add(xlit_result) end end return out end --- 【DB】查表 + local function build_reverse_group(main_projection, xlit_projection, db_table, text) local group, seen = {}, {} for _, db in ipairs(db_table) do @@ -154,7 +149,6 @@ local function build_reverse_group(main_projection, xlit_projection, db_table, t return group end --- 单字匹配 (Strict Prefix) local function group_match(group, fuma) if not group then return false end for i = 1, #group do @@ -163,7 +157,6 @@ local function group_match(group, fuma) return false end --- 递归匹配引擎 (优化:整数 Key) local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, memo, is_phrase_mode) if input_idx > #input_str then return true end if idx > #codes_sequence then return false end @@ -177,9 +170,8 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, if codes then for _, code in ipairs(codes) do local skip = false - -- 词组模式下,过滤掉 >3 的全码 if is_phrase_mode and #code > 3 then skip = true end - + if code:match("^%d+$") then skip = true end if not skip then local i_curr = input_idx local c_curr = 1 @@ -202,8 +194,15 @@ local function match_fuzzy_recursive(codes_sequence, idx, input_str, input_idx, return result end --- 注释解析 (严格校验 + Trim) -local function parse_comment_codes(comment, pattern, target_len) +local function list_contains(list, target) + if not list then return false end + for _, v in ipairs(list) do + if v == target then return true end + end + return false +end + +local function parse_comment_codes(comment, pattern, target_len, enable_tone) if not comment or comment == "" then return nil end local parts = {} @@ -217,14 +216,31 @@ local function parse_comment_codes(comment, pattern, target_len) local result = {} for i, part in ipairs(parts) do local p1, p2 = part:find(";") - if not p1 then return nil end + local pinyin_part + local codes_part + + if p1 then + -- 有分号:前面是音/拼音,后面是码 + pinyin_part = part:sub(1, p1 - 1) + codes_part = part:sub(p2 + 1) + else + -- 无分号:整体是音/拼音,无码 + pinyin_part = part + codes_part = "" + end - local codes_part = part:sub(p2 + 1) local codes_list = {} - for c in codes_part:gmatch("[^,]+") do - -- Trim - local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "") - if #trimmed > 0 then table.insert(codes_list, trimmed) end + if #codes_part > 0 then + for c in codes_part:gmatch("[^,]+") do + local trimmed = c:gsub("^%s+", ""):gsub("%s+$", "") + if #trimmed > 0 then table.insert(codes_list, trimmed) end + end + end + if enable_tone then + local tone = get_tone_from_pinyin(pinyin_part) + if tone then + table.insert(codes_list, tone) + end end result[i] = codes_list end @@ -236,24 +252,34 @@ local f = {} function f.init(env) local config = env.engine.schema.config + -- 1. 先读取是否开启声调过滤 (默认为 true) + env.enable_tone = config:get_bool('wanxiang_lookup/enable_tone') + if env.enable_tone == nil then env.enable_tone = true end + + -- 2. 读取数据源 data_source local sources_list = config:get_list('wanxiang_lookup/data_source') env.data_sources = {} - env.has_comment = false + + -- 临时标记,判断配置里是否显式包含了 comment + local config_has_comment_source = false env.has_db = false if sources_list and sources_list.size > 0 then for i = 0, sources_list.size - 1 do local s = sources_list:get_value_at(i).value table.insert(env.data_sources, s) - if s == 'comment' then env.has_comment = true end + if s == 'aux' then config_has_comment_source = true end if s == 'db' then env.has_db = true end end else - env.data_sources = { 'comment', 'db' } - env.has_comment = true + env.data_sources = { 'aux', 'db' } + config_has_comment_source = true env.has_db = true end + -- 只要配置里用了 comment 做数据源,或者开启了 enable_tone (需要从注释借声调),都必须解析注释。 + env.has_comment = config_has_comment_source or env.enable_tone + env.db_table = nil if env.has_db then local db_list = config:get_list("wanxiang_lookup/lookup") @@ -275,7 +301,6 @@ function f.init(env) if env.has_comment then local delimiter = config:get_string('speller/delimiter') or " '" if delimiter == "" then delimiter = " " end - -- 确保 " '" 中的所有字符都被加入排除列表 [^% %']+ env.comment_split_ptrn = "[^" .. alt_lua_punc(delimiter) .. "]+" end @@ -325,15 +350,25 @@ function f.func(input, env) local fuma = ctx_input:sub(s_end + 1) if #fuma == 0 then for cand in input:iter() do yield(cand) end return end - local if_single_char_first = env.engine.context:get_option('char_priority') - - local buckets = {} - local max_len = 0 - for i = 1, #env.data_sources do buckets[i] = {} end - - local long_word_cands = {} + local tone_filter_seq = {} + local clean_fuma = "" + for i = 1, #fuma do + local char = fuma:sub(i, i) + if char == "7" or char == "8" or char == "9" or char == "0" then + table.insert(tone_filter_seq, char) + else + clean_fuma = clean_fuma .. char + end + end + local apply_tone_filter = env.enable_tone and (#tone_filter_seq > 0) + + local if_single_char_first = env.engine.context:get_option('char_priority') + local buckets = {} + for i = 1, #env.data_sources do buckets[i] = {} end + local long_word_cands = {} + local max_len = 0 + local has_any_match = false - -- GC if env.cache_size > 2000 then env._global_db_cache = {} env._global_comment_cache = {} @@ -344,24 +379,22 @@ function f.func(input, env) for cand in input:iter() do if cand.type == 'sentence' then goto skip end - local cand_text = cand.text local cand_len = get_utf8_len(cand_text) if not cand_len or cand_len == 0 then goto skip end - local b = string.byte(cand_text, 1) if b and b < 128 then goto skip end local raw_data = {} - -- 1. Comment Data + -- A: Comment Data if env.has_comment then local genuine = cand:get_genuine() local comment_text = genuine and genuine.comment or "" if comment_text ~= "" then local cache_key = cand_text .. "_" .. comment_text if not comment_cache[cache_key] then - comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len) or false + comment_cache[cache_key] = parse_comment_codes(comment_text, env.comment_split_ptrn, cand_len, env.enable_tone) or false env.cache_size = env.cache_size + 1 end if comment_cache[cache_key] then @@ -370,15 +403,13 @@ function f.func(input, env) end end - -- 2. DB Data + -- B: DB Data if env.has_db then raw_data.db = {} - local pos = 1 local i = 0 for _, code_point in utf8.codes(cand_text) do i = i + 1 local char_str = utf8.char(code_point) - if not db_cache[char_str] then db_cache[char_str] = build_reverse_group(env.main_projection, env.xlit_projection, env.db_table, char_str) env.cache_size = env.cache_size + 1 @@ -387,36 +418,66 @@ function f.func(input, env) end end - -- 3. Match + -- 提取借用声调 (总是尝试从 raw_data.comment 提取,即使 data_source 只有 db) + local borrowed_tones = {} + if raw_data.comment then + for k, codes in ipairs(raw_data.comment) do + borrowed_tones[k] = {} + for _, c in ipairs(codes) do + if c:match("^%d+$") then borrowed_tones[k][c] = true end + end + end + end + local matched_idx = nil + for i, source_type in ipairs(env.data_sources) do local codes_seq = raw_data[source_type] if codes_seq then - local is_match = false - if source_type == 'comment' then - if cand_len == 1 then - if group_match(codes_seq[1], fuma) then is_match = true end - else - local memo = {} - if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, false) then is_match = true end - end - elseif source_type == 'db' then - if cand_len == 1 then - if group_match(codes_seq[1], fuma) then is_match = true end - else - local memo = {} - if match_fuzzy_recursive(codes_seq, 1, fuma, 1, memo, true) then is_match = true end + local tone_match_pass = true + if apply_tone_filter then + for k, tone_input in ipairs(tone_filter_seq) do + if k > #codes_seq then break end + local has_tone = list_contains(codes_seq[k], tone_input) + -- 如果是 db 源且自身没匹配到声调,尝试查阅 borrowed_tones + if not has_tone and source_type == 'db' then + if borrowed_tones[k] and borrowed_tones[k][tone_input] then has_tone = true end + end + if not has_tone then + tone_match_pass = false + break + end end end - - if is_match then - matched_idx = i - break + + if tone_match_pass then + local is_match = false + if source_type == 'aux' then + if cand_len == 1 then + if group_match(codes_seq[1], clean_fuma) then is_match = true end + else + local memo = {} + if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, false) then is_match = true end + end + elseif source_type == 'db' then + if cand_len == 1 then + if group_match(codes_seq[1], clean_fuma) then is_match = true end + else + local memo = {} + if match_fuzzy_recursive(codes_seq, 1, clean_fuma, 1, memo, true) then is_match = true end + end + end + + if is_match then + matched_idx = i + break + end end end end if matched_idx then + has_any_match = true if if_single_char_first and cand_len > 1 then table.insert(long_word_cands, cand) else @@ -428,7 +489,6 @@ function f.func(input, env) ::skip:: end - -- 输出 (Global Length Priority) if if_single_char_first then for i = 1, #env.data_sources do if buckets[i][1] then for _, c in ipairs(buckets[i][1]) do yield(c) end end @@ -447,6 +507,19 @@ function f.func(input, env) end for _, c in ipairs(long_word_cands) do yield(c) end + + if not has_any_match and apply_tone_filter and #clean_fuma > 0 and env.has_db and env.db_table then + for _, db_obj in ipairs(env.db_table) do + local res_str = db_obj:lookup(clean_fuma) + if res_str and #res_str > 0 then + for word in res_str:gmatch("%S+") do + local cand = Candidate("wanxiang_shadow", s_end, #ctx_input, word, "") + cand.quality = 1 + yield(cand) + end + end + end + end end function f.tags_match(seg, env) diff --git a/lua/tone_fallback.lua b/lua/tone_fallback.lua index 07cd6f7..380dbb0 100644 --- a/lua/tone_fallback.lua +++ b/lua/tone_fallback.lua @@ -34,7 +34,8 @@ local P = {} function P.init(env) env.tone_state = "idle" - + local config = env.engine.schema.config + env.lookup_key = config:get_string('wanxiang_lookup/key') or '`' local ctx = env.engine and env.engine.context if not ctx or not ctx.update_notifier then return end @@ -105,6 +106,17 @@ function P.func(key, env) -- 主键盘数字 0–9:标记为 compress local r = key:repr() or "" if r:match("^[0-9]$") then + local input = ctx.input or "" + local caret = (ctx.caret_pos ~= nil) and ctx.caret_pos or #input + if caret < 0 then caret = 0 end + if caret > #input then caret = #input end + local left = (caret > 0) and input:sub(1, caret) or "" + + if left:find(env.lookup_key, 1, true) then + env.tone_state = "idle" + return wanxiang.RIME_PROCESS_RESULTS.kNoop + end + env.tone_state = "compress" -- 这里用“预测压缩是否会发生”来决定要不要告诉 Rime “我处理了这个按键” diff --git a/wanxiang.schema.yaml b/wanxiang.schema.yaml index 7cde524..c57b8fc 100644 --- a/wanxiang.schema.yaml +++ b/wanxiang.schema.yaml @@ -470,6 +470,7 @@ wanxiang_lookup: #设置归属于super_lookup.lua key: "`" # 输入中反查引导符,要添加到 speller/alphabet lookup: [ wanxiang_reverse ] #反查滤镜数据库,万象都合并为一个了 data_source: [db] #对于pro版本能从注释中加载词库辅助码,详情见Pro版本,base这里无需修改。 + enable_tone: true #启用声调反查 # 处理符合特定规则的输入码,如网址、反查 recognizer: diff --git a/wanxiang_t9.schema.yaml b/wanxiang_t9.schema.yaml index 15a90e4..1171894 100644 --- a/wanxiang_t9.schema.yaml +++ b/wanxiang_t9.schema.yaml @@ -81,7 +81,6 @@ engine: - reverse_lookup_filter@radical_reverse_lookup #部件拆字滤镜,放在super_comment前面,进一步被超级注释处理以获得拼音编码的提示 - lua_filter@*super_lookup #字词输入中反查辅助筛选 - lua_filter@*super_filter #功能太多详见Lua文件 - - lua_filter@*super_comment_preedit #超级注释模块、超级preedit,支持错词提示、辅助码显示,部件组字读音注释,有声调、无声调全拼编码的转换,支持个性化配置和关闭相应的功能,详情搜索super_comment_preedit进行详细配置 - simplifier@emoji #Emoji滤镜 - simplifier@s2t #简繁切换通繁 - simplifier@s2tw #简繁切换台繁 @@ -91,7 +90,7 @@ engine: - uniquifier # 去重 t9: - enable: true #启用仓、元书t9输入方式 + isDisplayOriginalPreedit: false @@ -195,8 +194,9 @@ translator: initial_quality: 3 #初始质量拼音的权重应该比英文大 spelling_hints: 30 #将注释以词典code字符串形式完全暴露,通过super_comment.lua完全接管,灵活配置。 always_show_comments: true # Rime 默认在 preedit 等于 comment 时取消显示 comment,这里强制一直显示,供super_comment.lua做判断用。 - comment_format: {comment} #将注释以词典字符串形式完全暴露,通过super_preedit.lua完全接管,灵活配置。 - + comment_format: + - xlit/āáǎàōóǒòēéěèīíǐìūúǔùǖǘǚǜüńňǹḿm̀/aaaaooooeeeeiiiiuuuuvvvvvnnnmmm/ + - xform/^(.*);.*$/$1/ #disable_user_dict_for_patterns: #如果你开启调频,需要一并考虑这个配置是否需要,基本的6码3字不调频,你可以自定义,目前的逻辑是依然记录用户词但满足规则的不输出不被使用 # - "^[a-z]{1,6}"