fix: 再次调整英文处理器

2026-05-13 15:10:03 +00:00 · 2026-01-20 00:36:33 +08:00
parent 3bd3b7cd79
commit 73484df7db
1 changed files with 103 additions and 159 deletions
--- a/lua/super_english.lua
+++ b/lua/super_english.lua
@@ -12,7 +12,6 @@

 local F = {}

-- 引入常用函数
 local byte = string.byte
 local find = string.find
 local gsub = string.gsub
@@ -21,8 +20,8 @@ local lower = string.lower
 local sub = string.sub
 local match = string.match
 local format = string.format
-local STICKY_BUFFER_SIZE = 2  --输入/\的情况下，继续输入3个单词不加空格，适合网址路径
-- 辅助函数：获取候选类型
+local STICKY_BUFFER_SIZE = 2
+
 local function fast_type(c)
    local t = c.type
    if t then return t end
@@ -30,58 +29,48 @@ local function fast_type(c)
    return (g and g.type) or ""
 end

-- 辅助函数：判断是否为置顶表词汇
 local function is_table_type(c)
    local t = fast_type(c)
    return t == "user_table" or t == "fixed"
 end
-- [Time] 封装统一的时间获取函数 (单位: 秒, 带小数)
+
 local function get_now()
-    -- 使用用户指定的原生 API (毫秒转秒，以便和配置文件里的 0.5 秒兼容)
    if rime_api and rime_api.get_time_ms then
        return rime_api.get_time_ms() / 1000
    end
-    --以此为保底，防止 API 不存在时报错
    return os.time()
 end

 local function pure(s)
    return gsub(s, "[^a-zA-Z]", ""):lower()
 end
+
 local no_spacing_words = {
-    ["http"]  = true,
-    ["https"] = true,
-    ["www"]   = true,
-    ["ftp"]   = true,
-    ["ssh"]   = true,
-    ["mailto"]= true,
-    ["file"]  = true,
-    ["tel"]   = true,
+    ["http"]  = true, ["https"] = true, ["www"]   = true, ["ftp"]   = true,
+    ["ssh"]   = true, ["mailto"]= true, ["file"]  = true, ["tel"]   = true,
 }
+
 local allowed_ascii_symbols = {
+    [32] = true,  -- space
    [33] = true,  -- !
-    [39] = true,  -- ' (Don't)
+    [39] = true,  -- ' 
    [44] = true,  -- ,
-    [45] = true,  -- - (Co-op)
+    [45] = true,  -- -
    [43] = true,  -- +
    [46] = true,  -- .
    [63] = true,  -- ?
    [92] = true,  -- \
-    -- 数字 0-9 (ASCII 48-57)
    [48]=true, [49]=true, [50]=true, [51]=true, [52]=true,
    [53]=true, [54]=true, [55]=true, [56]=true, [57]=true,
 }
-- 规则：只允许 字母(A-Za-z) 和 上面配置表里的符号
+
 local function is_ascii_phrase_fast(s)
    if not s or s == "" then return false end
    local len = #s
    for i = 1, len do
        local b = byte(s, i)
-        -- 1. 判断是否为大写字母 A-Z (65-90)
        local is_upper = (b >= 65 and b <= 90)
-        -- 2. 判断是否为小写字母 a-z (97-122)
        local is_lower = (b >= 97 and b <= 122)
-        -- 3. 判断是否为白名单符号
        local is_allowed_sym = allowed_ascii_symbols[b]
        if not (is_upper or is_lower or is_allowed_sym) then
            return false
@@ -94,16 +83,13 @@ local function has_letters(s)
    return find(s, "[a-zA-Z]")
 end

-- 序列匹配：返回 (首字母位置, 最后一个匹配字符的位置)
 local function find_target_in_text(text, start_pos, target_fp)
    local text_len = #text
    local target_len = #target_fp
    if target_len == 0 then return nil, nil end
-
-    local t_idx = 1
-    local scan_p = start_pos
-    local s_index = nil
-
+    local t_idx = 1       
+    local scan_p = start_pos 
+    local s_index = nil   
    while scan_p <= text_len and t_idx <= target_len do
        local char_txt = sub(text, scan_p, scan_p)
        if lower(char_txt) == sub(target_fp, t_idx, t_idx) then
@@ -112,18 +98,15 @@ local function find_target_in_text(text, start_pos, target_fp)
        end
        scan_p = scan_p + 1
    end
-
    if t_idx > target_len then
        return s_index, scan_p - 1
    end
    return nil, nil
 end

-- 2. 核心逻辑：格式化与还原
 local function restore_sentence_spacing(cand, split_pattern, check_pattern)
    local guide = cand.preedit or ""
    if not find(guide, check_pattern) then return cand end
-
    local text = cand.text
    local targets = {}
    for seg in string.gmatch(guide, split_pattern) do
@@ -131,7 +114,6 @@ local function restore_sentence_spacing(cand, split_pattern, check_pattern)
        if #t > 0 then table.insert(targets, t) end
    end
    if #targets == 0 then return cand end
-
    local starts = {}
    local p = 1
    for _, target in ipairs(targets) do
@@ -140,19 +122,16 @@ local function restore_sentence_spacing(cand, split_pattern, check_pattern)
        table.insert(starts, s)
        p = e + 1 
    end
-
    local parts = {}
    if starts[1] > 1 then
        table.insert(parts, sub(text, 1, starts[1] - 1))
    end
-
    for i = 1, #starts do
        local current_s = starts[i]
        local next_s = starts[i+1]
        local chunk_end = next_s and (next_s - 1) or #text
        table.insert(parts, sub(text, current_s, chunk_end))
    end
-
    local new_text = ""
    for i, part in ipairs(parts) do
        if i == 1 then
@@ -167,9 +146,7 @@ local function restore_sentence_spacing(cand, split_pattern, check_pattern)
        end
    end
    new_text = gsub(new_text, "%s%s+", " ") 
-    
    if new_text == "" then return cand end
-    
    local nc = Candidate(cand.type, cand.start, cand._end, new_text, cand.comment)
    nc.preedit = cand.preedit
    return nc
@@ -179,14 +156,11 @@ local NBSP = string.char(0xC2, 0xA0)

 local function apply_segment_formatting(text, input_code)
    if not input_code or input_code == "" then return text end
-    
    local parts = {}
    local p_code = 1 
-    
    for word in string.gmatch(text, "%S+") do
        local clean_word = pure(word)
        local w_len = #clean_word
-        
        if w_len > 0 then
            if find(word, "[\128-\255]") then
                local input_remain = #input_code - p_code + 1
@@ -200,7 +174,6 @@ local function apply_segment_formatting(text, input_code)
                    local check_len = (w_len < input_remain) and w_len or input_remain
                    local segment = sub(input_code, p_code, p_code + check_len - 1)
                    local is_pure_alpha = not find(word, "[^a-zA-Z]")
-                    
                    if find(segment, "^%u%u") and is_pure_alpha then
                        word = upper(word)
                    elseif find(segment, "^%u") then
@@ -212,7 +185,6 @@ local function apply_segment_formatting(text, input_code)
        end
        table.insert(parts, word)
    end
-    
    return table.concat(parts, " ")
 end

@@ -220,19 +192,13 @@ local function apply_formatting(cand, code_ctx)
    local text = cand.text
    if not text or text == "" then return cand end
    local changed = false
-    
    local norm = gsub(text, NBSP, " ")
    if norm ~= text then text = norm; changed = true end
-
    if is_ascii_phrase_fast(text) and has_letters(text) then
        if code_ctx.raw_input then
            local new_text = apply_segment_formatting(text, code_ctx.raw_input)
-            if new_text ~= text then 
-                text = new_text
-                changed = true 
-            end
+            if new_text ~= text then text = new_text; changed = true end
        end
-
        if code_ctx.spacing_mode and code_ctx.spacing_mode ~= "off" then
            local mode = code_ctx.spacing_mode
            if mode == "smart" then
@@ -246,87 +212,69 @@ local function apply_formatting(cand, code_ctx)
            end
        end
    end
-
    if not changed then return cand end
    local nc = Candidate(cand.type, cand.start, cand._end, text, cand.comment)
    nc.preedit = cand.preedit
    return nc
 end

-- 3. 状态管理 (Filter)
 function F.init(env)
    env.memory = {}
    local cfg = env.engine.schema.config
-    
-    -- 1. 配置读取
    env.english_spacing_mode = "off"
    env.spacing_timeout = 0 
    env.lookup_key = "`"
    if cfg then
        local str = cfg:get_string("wanxiang_english/english_spacing")
        if str then env.english_spacing_mode = str end
-        
-        -- 读取超时 (单位: 秒, 支持小数)
        local timeout = cfg:get_double("wanxiang_english/spacing_timeout")
        if timeout then env.spacing_timeout = timeout end
        local key = cfg:get_string("wanxiang_lookup/key")
        if key and key ~= "" then env.lookup_key = key end
    end
    env.lookup_key_esc = gsub(env.lookup_key, "([%%%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
-    -- 2. 动态获取分隔符
    local delimiter_str = " '" 
    if cfg then
        delimiter_str = cfg:get_string('speller/delimiter') or delimiter_str
    end
-    env.delimiter_char = sub(delimiter_str, 1, 1)  --提取自动分词符号
+    env.delimiter_char = sub(delimiter_str, 1, 1)
    local escaped_delims = gsub(delimiter_str, "([%%%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
    env.split_pattern = "[^" .. escaped_delims .. "]+"     
    env.delim_check_pattern = "[" .. escaped_delims .. "]" 
-
    env.prev_commit_is_eng = false
-    env.last_commit_time = 0   --记录上次提交时间
-    env.comp_start_time = nil  -- 记录本次输入开始的时间
+    env.last_commit_time = 0
+    env.comp_start_time = nil
    env.spacing_active = false  
    env.decision_locked = false 
-    env.sticky_countdown = 0    -- 粘性倒计时
+    env.sticky_countdown = 0
    if env.engine.context then
        env.update_notifier = env.engine.context.update_notifier:connect(function(ctx)
            local curr_input = ctx.input
-            -- 检测当前输入是否包含反查符
            if env.lookup_key and find(curr_input, env.lookup_key, 1, true) then
                env.block_derivation = true
            else
                env.block_derivation = false
            end
-            -- 如果输入框为空，重置开始时间
            if curr_input == "" then
                env.comp_start_time = nil
-            -- 如果输入框不为空，且还没记录开始时间，说明是“刚刚开始打字”
            elseif env.comp_start_time == nil then
                env.comp_start_time = get_now()
            end
        end)
        env.commit_notifier = env.engine.context.commit_notifier:connect(function(ctx)
            local commit_text = ctx:get_commit_text()
-            -- 1. 先剔除空格，防止死循环
            local text_no_space = gsub(commit_text, "%s", "")
            local is_eng = is_ascii_phrase_fast(text_no_space)
-            
-            -- 2. 粘性触发 (结尾是 / 或 \)
            if find(text_no_space, "[/\\\\]$") then
                env.sticky_countdown = STICKY_BUFFER_SIZE
                is_eng = false 
-            -- 3. 粘性缓冲期 (倒计时)
            elseif env.sticky_countdown > 0 then
                if is_eng then
-                    -- 只要是英文，消耗一次缓冲，并强制不加空格
                    env.sticky_countdown = env.sticky_countdown - 1
                    is_eng = false 
                else
-                    -- 遇到非英文(中文等)，打断缓冲
                    env.sticky_countdown = 0
                end
-            -- 4. 普通黑名单 (http等)
            elseif is_eng then
                local clean = gsub(commit_text, "%s+$", ""):lower()
                if no_spacing_words[clean] then
@@ -334,7 +282,6 @@ function F.init(env)
                end
            end
            env.prev_commit_is_eng = is_eng
-            -- 仅英文上屏更新时间戳 (使用 rime_api 获取)
            if is_eng then
                env.last_commit_time = get_now()
            else
@@ -345,14 +292,12 @@ function F.init(env)
        end)
    end
 end
-
 function F.fini(env)
    if env.update_notifier then env.update_notifier:disconnect(); env.update_notifier = nil end
    if env.commit_notifier then env.commit_notifier:disconnect(); env.commit_notifier = nil end
    env.memory = nil
 end

-- 4. 主逻辑 (Filter)
 function F.func(input, env)
    local ctx = env.engine.context
    local curr_input = ctx.input
@@ -360,7 +305,7 @@ function F.func(input, env)
    local best_candidate_saved = false
    local code_len = #curr_input

-    -- [Feature] 强制英文造词 (末尾 \\)
+    -- [Feature] Forced English word creation (input ends with two backslashes)
    if code_len > 2 and sub(curr_input, -2) == "\\\\" then
        local raw_text = sub(curr_input, 1, code_len - 2)
        if is_ascii_phrase_fast(raw_text) then
@@ -374,15 +319,12 @@ function F.func(input, env)
        end
    end
    
-    -- [Check 1] 外部脚本发来的打断信号
    local break_signal = (ctx:get_property("english_spacing") == "true")
    local effective_prev_is_eng = env.prev_commit_is_eng

    if break_signal then 
        effective_prev_is_eng = false
        env.prev_commit_is_eng = false
-        
-    -- [Check 2] 时间自然过期
    elseif effective_prev_is_eng and env.spacing_timeout > 0 then
        local check_time = env.comp_start_time or get_now()
        if (check_time - env.last_commit_time) > env.spacing_timeout then
@@ -397,16 +339,15 @@ function F.func(input, env)
        prev_is_eng = effective_prev_is_eng
    }

-    -- 1. 准备单字母候选
+    -- Single-letter candidate state. has_single_chars must stay declared here: it is read later in F.func and would otherwise resolve to a global.
+    local single_char_injected = false
    local single_chars = {}
    local has_single_chars = false
-    local single_char_injected = false 
    
    if code_len == 1 then
        local b = byte(curr_input)
        local is_upper = (b >= 65 and b <= 90)
        local is_lower = (b >= 97 and b <= 122)
-        -- 只有输入是字母时，才准备 A/a 候选
        if is_upper or is_lower then
            local t1 = curr_input
            local t2 = is_upper and lower(curr_input) or upper(curr_input)
@@ -418,50 +357,50 @@ function F.func(input, env)
        single_char_injected = true 
    end

-    -- 2. 流式遍历
    for cand in input:iter() do
-        local c_type = cand.type
-        local final_cand = cand 
-        local is_ascii = false 
-
-        if c_type ~= "phrase" then
-            local good_cand = restore_sentence_spacing(cand, env.split_pattern, env.delim_check_pattern)
-            final_cand = apply_formatting(good_cand, code_ctx)
-            is_ascii = is_ascii_phrase_fast(final_cand.text)
-        end
-        if final_cand.comment and find(final_cand.comment, "\226\152\175") then
-            local nc = Candidate(final_cand.type, final_cand.start, final_cand._end, final_cand.text, "")
-            nc.preedit = final_cand.preedit
-            final_cand = nc
-        end
-        local is_garbage = (c_type == "raw")
+        local good_cand = restore_sentence_spacing(cand, env.split_pattern, env.delim_check_pattern)
+        local fmt_cand = apply_formatting(good_cand, code_ctx)
        
+        -- [Restored] Blank the comment when it contains the yin-yang symbol (U+262F)
+        if fmt_cand.comment and find(fmt_cand.comment, "\226\152\175") then
+            local nc = Candidate(fmt_cand.type, fmt_cand.start, fmt_cand._end, fmt_cand.text, "")
+            nc.preedit = fmt_cand.preedit
+            fmt_cand = nc
+        end
+
+        local c_type = cand.type
+        local is_ascii = is_ascii_phrase_fast(fmt_cand.text) 
+        local is_tbl = is_table_type(cand)
+
+        -- [Garbage check] Keep symbols; only drop raw candidates and duplicates of a single typed letter
+        local is_garbage = (c_type == "raw") 
        if not is_garbage and code_len == 1 and has_letters(curr_input) then
-             if lower(final_cand.text) == lower(curr_input) then
+             if lower(fmt_cand.text) == lower(curr_input) then
                 is_garbage = true
             end
        end
-
+        
        if not is_garbage then
            has_valid_candidate = true
            
+            -- [VIP priority logic]
            local is_vip_type = (c_type == "user_table" or c_type == "fixed" or c_type == "phrase")
            local is_hidden_vip = (not is_vip_type) and (not is_ascii)
            local treat_as_vip = is_vip_type or is_hidden_vip

            if treat_as_vip then
-                -- VIP 通道 (汉字、符号、用户词)
+                -- VIP channel: not only user_table entries but also non-ASCII text such as Chinese characters are yielded directly, so single letters cannot jump ahead
                if not best_candidate_saved and cand.comment ~= "~" and not env.block_derivation then
                    env.memory[curr_input] = {
-                        text = final_cand.text,
-                        preedit = final_cand.preedit or curr_input
+                        text = fmt_cand.text,
+                        preedit = fmt_cand.preedit or curr_input
                    }
                    best_candidate_saved = true
                end
-                yield(final_cand)
+                yield(fmt_cand)

            else
-                -- 普通通道 (英文插队)
+                -- Normal channel: single-letter candidates may be inserted ahead of this candidate
                if has_single_chars and not single_char_injected then
                    if not best_candidate_saved then
                        env.memory[curr_input] = { text = single_chars[1].text, preedit = curr_input }
@@ -474,86 +413,91 @@ function F.func(input, env)
                
                if not best_candidate_saved and cand.comment ~= "~" and not env.block_derivation then
                    env.memory[curr_input] = {
-                        text = final_cand.text,
-                        preedit = final_cand.preedit or curr_input
+                        text = fmt_cand.text,
+                        preedit = fmt_cand.preedit or curr_input
                    }
                    best_candidate_saved = true
                end
-                yield(final_cand)
+                yield(fmt_cand)
            end
        end
    end

-    -- 3. 兜底逻辑
+    -- 3. Fallback: append the single-letter candidates if they were not injected yet
    if has_single_chars and not single_char_injected then
        if not best_candidate_saved then
-            env.memory[curr_input] = { text = single_chars[1].text, preedit = curr_input }
+            env.memory[curr_input] = { text = single_chars[1].text, preedit = single_chars[1].text }
            best_candidate_saved = true
        end
        for _, c in ipairs(single_chars) do yield(c) end
        has_valid_candidate = true
    end

-    -- [Phase 3] 构造补全
+    -- [Phase 3] Build a candidate by backtracking through remembered input prefixes (strictly a fallback)
+    -- [Restored feature] When no candidate was produced, try to build one from env.memory
    if not has_valid_candidate then
-        if env.block_derivation then return end
-        if find(curr_input, "^[/]") then return end
-        if not has_letters(curr_input) then return end
-        
-        local anchor = nil
-        local diff = ""
-        
-        for i = #curr_input - 1, 1, -1 do
-            local prefix = sub(curr_input, 1, i)
-            if env.memory[prefix] then
-                anchor = env.memory[prefix]
-                diff = sub(curr_input, i + 1)
-                break
+        if not env.block_derivation and has_letters(curr_input) and not find(curr_input, "^[/]") then
+            local anchor = nil
+            local diff = ""
+            for i = #curr_input - 1, 1, -1 do
+                local prefix = sub(curr_input, 1, i)
+                if env.memory[prefix] then
+                    anchor = env.memory[prefix]
+                    diff = sub(curr_input, i + 1)
+                    break
+                end
            end
-        end
-        
-        if anchor and diff ~= "" then
-            local has_spacing = find(anchor.text, " ")
-            local last_word = match(anchor.text, "(%S+)%s*$") or ""
-            local last_len = #last_word
            
-            local output_text = ""
-            local output_preedit = ""
-            
-            local is_code_mode = find(curr_input, "^[/\\]")
-            
-            if is_ascii_phrase_fast(anchor.text) then
-                if has_spacing then
-                    output_text = anchor.text .. diff
-                    output_preedit = (anchor.preedit or anchor.text) .. diff
-                elseif last_len > 3 then
+            if anchor and diff ~= "" then
+                local has_spacing = find(anchor.text, " ")
+                local last_word = match(anchor.text, "(%S+)%s*$") or ""
+                local last_len = #last_word
+                local output_text = ""
+                local output_preedit = ""
+                
+                local is_code_mode = find(curr_input, "^[/\\]")
+                
+                if is_ascii_phrase_fast(anchor.text) then
                    local spacer = " "
                    if sub(anchor.text, -1) == " " then spacer = "" end
-                    output_text = anchor.text .. spacer .. diff
-                    output_preedit = (anchor.preedit or anchor.text) .. spacer .. diff
+
+                    if has_spacing then
+                        output_text = anchor.text .. spacer .. diff
+                        output_preedit = (anchor.preedit or anchor.text) .. spacer .. diff
+                    elseif last_len > 3 then
+                        output_text = anchor.text .. spacer .. diff
+                        output_preedit = (anchor.preedit or anchor.text) .. spacer .. diff
+                    else
+                        output_text = curr_input
+                        output_preedit = curr_input
+                    end
+                elseif is_code_mode then
+                    output_text = anchor.text .. diff
+                    output_preedit = (anchor.preedit or anchor.text) .. diff
                else
-                    output_text = curr_input
-                    output_preedit = curr_input
+                    output_text = anchor.text
+                    output_preedit = (anchor.preedit or anchor.text) .. env.delimiter_char .. diff
                end
-            elseif is_code_mode then
-                output_text = anchor.text .. diff
-                output_preedit = (anchor.preedit or anchor.text) .. diff
+                
+                output_text = apply_segment_formatting(output_text, curr_input)
+                
+                local cand = Candidate("completion", 0, #curr_input, output_text, "~")
+                cand.preedit = output_preedit
+                cand.quality = 999
+                yield(cand)
            else
-                output_text = anchor.text
-                output_preedit = (anchor.preedit or anchor.text) .. env.delimiter_char .. diff
+                -- [Phase 4] Last resort: no usable anchor was found, so echo the raw input
+                local cand = Candidate("completion", 0, #curr_input, curr_input, "~")
+                cand.preedit = curr_input
+                yield(cand)
            end
-            
-            output_text = apply_segment_formatting(output_text, curr_input)
-            
-            local cand = Candidate("completion", 0, #curr_input, output_text, "~")
-            cand.preedit = output_preedit
-            cand.quality = 999
-            yield(cand)
        else
-            local cand = Candidate("completion", 0, #curr_input, curr_input, "~")
-            cand.preedit = curr_input
-            yield(cand)
+             -- Fallback for special symbols or when derivation is blocked: echo the raw input
+             local cand = Candidate("completion", 0, #curr_input, curr_input, "~")
+             cand.preedit = curr_input
+             yield(cand)
        end
    end
 end
+
 return F