Files
rime_wanxiang/lua/super_english.lua
2026-01-21 17:44:45 +08:00

517 lines
18 KiB
Lua
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- lua/super_english.lua
-- https://github.com/amzxyz/rime_wanxiang
-- @description: 英文全能处理器 (Filter Only: 锚点切分 + 动态分隔符 + 超时销毁)
-- @author: amzxyz
-- 核心功能清单:
-- 1. [Format] 语句级英文大写格式化,逐词大小写对应 (look HELLO -> look HELLO)
-- 2. [Spacing] 智能语句空格切分,智能单词上屏加空格 (Smart Spacing) 与无损分词还原
-- 3. [Memory] 全量历史缓存,完美解决回删乱码问题
-- 4. [Construct] 原生优先构造策略 (短词无分词则重置为原生输入)
-- 5. [Order] 单字母(a/A) 智能插队排序,补齐单字母候选
-- Module table: F.init, F.fini and F.func are attached to it further down and
-- the table is returned at the end of the file.
local F = {}
-- Local aliases for the string-library functions used throughout this file.
local byte = string.byte
local find = string.find
local gsub = string.gsub
local upper = string.upper
local lower = string.lower
local sub = string.sub
local match = string.match
local format = string.format
-- Initial value of env.sticky_countdown after a commit that ends in "/" or "\".
-- While the countdown is positive, each following English commit decrements it
-- and is recorded as "not English", which keeps the automatic space suppressed
-- (the original note describes this as: the next 3 words get no space, which
-- suits URLs and file paths).
local STICKY_BUFFER_SIZE = 2
-- Helper: resolve the type string of a candidate.
-- Returns c.type when it is set. Otherwise, if c exposes get_genuine, returns
-- the type of the genuine candidate. Falls back to "" when neither is available.
local function fast_type(c)
    local own_type = c.type
    if own_type then
        return own_type
    end
    local genuine = nil
    if c.get_genuine then
        genuine = c:get_genuine() or nil
    end
    local inner_type = genuine and genuine.type
    if inner_type then
        return inner_type
    end
    return ""
end
-- Helper: true when the candidate's resolved type is "user_table" or "fixed"
-- (the types treated as pinned/table entries by the filter below).
local function is_table_type(c)
    local kind = fast_type(c)
    if kind == "user_table" then
        return true
    end
    return kind == "fixed"
end
-- [Time] Single entry point for reading the current time, in seconds.
-- Prefers rime_api.get_time_ms (milliseconds, divided by 1000 so the result is
-- comparable with fractional timeouts such as 0.5 from the configuration).
-- When that API is missing, falls back to os.time() so the call never errors.
local function get_now()
    local api = rime_api
    if not (api and api.get_time_ms) then
        return os.time()
    end
    return api.get_time_ms() / 1000
end
-- Reduce a string to its ASCII letters only, lower-cased.
-- Everything outside a-z / A-Z is removed before lower-casing.
local function pure(s)
    local letters_only = gsub(s, "[^a-zA-Z]", "")
    return lower(letters_only)
end
-- Set of lower-case words after which no automatic space should follow
-- (URL schemes and similar prefixes). Used as a lookup table: word -> true.
local no_spacing_words = {}
for _, prefix in ipairs({ "http", "https", "www", "ftp", "ssh", "mailto", "file", "tel" }) do
    no_spacing_words[prefix] = true
end
-- Byte values (besides A-Z / a-z) that may appear in an "ASCII phrase".
local allowed_ascii_symbols = {
    [33] = true, -- !
    [39] = true, -- ' (Don't)
    [44] = true, -- ,
    [45] = true, -- - (Co-op)
    [46] = true, -- .
    [63] = true, -- ?
    [92] = true, -- \
}
-- Digits 0-9 (ASCII 48-57)
for digit_code = 48, 57 do
    allowed_ascii_symbols[digit_code] = true
end
-- Rule: a string qualifies only if every byte is a letter (A-Z, a-z) or one of
-- the entries in allowed_ascii_symbols. nil and "" do not qualify.
local function is_ascii_phrase_fast(s)
    if not s or s == "" then
        return false
    end
    for i = 1, #s do
        local code = byte(s, i)
        -- 65-90 is A-Z, 97-122 is a-z
        local is_letter = (code >= 65 and code <= 90) or (code >= 97 and code <= 122)
        if not is_letter and not allowed_ascii_symbols[code] then
            return false
        end
    end
    return true
end
-- Truthy when s contains at least one ASCII letter.
-- Returns the raw string.find results (start and end index of the first
-- letter), or nil when there is none.
local function has_letters(s)
    return find(s, "[a-zA-Z]", 1)
end
-- Subsequence match: scans text from start_pos and consumes the characters of
-- target_fp in order, comparing against the lower-cased text character and
-- skipping text characters that do not match.
-- Returns (position of the first matched character, position of the last
-- matched character), or nil, nil when target_fp is empty or cannot be fully
-- matched.
local function find_target_in_text(text, start_pos, target_fp)
    local wanted = #target_fp
    if wanted == 0 then
        return nil, nil
    end
    local first_hit = nil
    local matched = 0
    for pos = start_pos, #text do
        local expected = sub(target_fp, matched + 1, matched + 1)
        if lower(sub(text, pos, pos)) == expected then
            matched = matched + 1
            if matched == 1 then
                first_hit = pos
            end
            if matched == wanted then
                return first_hit, pos
            end
        end
    end
    return nil, nil
end
-- 2. Core logic: formatting and restoration
-- Re-inserts spaces into cand.text at the word boundaries indicated by the
-- delimiters found in cand.preedit.
--   cand          : candidate whose text/preedit are inspected
--   split_pattern : gmatch pattern yielding the preedit segments between delimiters
--   check_pattern : pattern that matches when the preedit contains a delimiter
-- Returns the original candidate untouched when the preedit has no delimiter,
-- no segment contains letters, a segment cannot be located in the text, or the
-- rebuilt text is empty. Otherwise returns a new Candidate with the spaced
-- text and the same type, range, comment and preedit.
local function restore_sentence_spacing(cand, split_pattern, check_pattern)
    local guide = cand.preedit or ""
    if find(guide, check_pattern) == nil then
        return cand
    end
    local text = cand.text

    -- Letter-only fingerprint of every preedit segment.
    local targets = {}
    for piece in string.gmatch(guide, split_pattern) do
        local letters = pure(piece)
        if letters ~= "" then
            targets[#targets + 1] = letters
        end
    end
    if #targets == 0 then
        return cand
    end

    -- Locate where each segment begins inside the candidate text.
    local starts = {}
    local cursor = 1
    for idx = 1, #targets do
        local first, last = find_target_in_text(text, cursor, targets[idx])
        if not first then
            return cand
        end
        starts[idx] = first
        cursor = last + 1
    end

    -- Cut the text at those starting positions; anything before the first
    -- start is kept as its own leading piece.
    local parts = {}
    if starts[1] > 1 then
        parts[1] = sub(text, 1, starts[1] - 1)
    end
    local total = #starts
    for idx = 1, total do
        local stop = (idx < total) and (starts[idx + 1] - 1) or #text
        parts[#parts + 1] = sub(text, starts[idx], stop)
    end

    -- Join with single spaces and collapse any run of whitespace.
    local spaced = gsub(table.concat(parts, " "), "%s%s+", " ")
    if spaced == "" then
        return cand
    end
    local rebuilt = Candidate(cand.type, cand.start, cand._end, spaced, cand.comment)
    rebuilt.preedit = cand.preedit
    return rebuilt
end
-- UTF-8 byte sequence of U+00A0 (no-break space).
local NBSP = string.char(0xC2, 0xA0)
-- Apply the capitalisation typed by the user to the words of a candidate.
--   text       : candidate text; it is split on whitespace into words
--   input_code : the raw typed code (may be nil or "")
-- Each word consumes as many letters of the typed code as it has ASCII letters.
-- If the consumed slice starts with two upper-case letters and the word is
-- purely alphabetic, the whole word is upper-cased; if it starts with one
-- upper-case letter, only the word's first letter is upper-cased. Words that
-- contain non-ASCII bytes consume their share of the code but are not changed.
-- Returns text unchanged when input_code is nil/empty, otherwise the words
-- joined by single spaces.
local function apply_segment_formatting(text, input_code)
    if not input_code or input_code == "" then return text end
    -- Word lengths are counted in letters only, so the typed code must be
    -- aligned on letters only as well. Otherwise an apostrophe, digit or
    -- delimiter in the code shifts every following word
    -- (e.g. code "Don'tGo" for text "don't go").
    local code_letters = gsub(input_code, "[^a-zA-Z]", "")
    local code_len = #code_letters
    local parts = {}
    local p_code = 1
    for word in string.gmatch(text, "%S+") do
        local out = word
        local word_letters = gsub(word, "[^a-zA-Z]", "")
        local w_len = #word_letters
        local input_remain = code_len - p_code + 1
        if w_len > 0 and input_remain > 0 then
            local check_len = (w_len < input_remain) and w_len or input_remain
            -- Words with non-ASCII bytes only advance the cursor.
            if not find(word, "[\128-\255]") then
                local segment = sub(code_letters, p_code, p_code + check_len - 1)
                local is_pure_alpha = not find(word, "[^a-zA-Z]")
                if find(segment, "^%u%u") and is_pure_alpha then
                    out = upper(word)
                elseif find(segment, "^%u") then
                    out = gsub(word, "^%a", upper)
                end
            end
            p_code = p_code + check_len
        end
        parts[#parts + 1] = out
    end
    return table.concat(parts, " ")
end
-- Produce the display form of a candidate.
--   cand     : candidate to format
--   code_ctx : table with raw_input (typed code), spacing_mode
--              ("off" / "smart" / "before" / "after") and prev_is_eng
-- Steps: replace no-break spaces with plain spaces; then, only for text that
-- passes is_ascii_phrase_fast and contains a letter, apply the typed
-- capitalisation and add the leading/trailing space required by the mode.
-- Returns cand itself when nothing changed, otherwise a new Candidate with the
-- same type, range, comment and preedit.
local function apply_formatting(cand, code_ctx)
    local text = cand.text
    if not text or text == "" then
        return cand
    end

    local dirty = false
    local normalized = gsub(text, NBSP, " ")
    if normalized ~= text then
        text, dirty = normalized, true
    end

    if is_ascii_phrase_fast(text) and has_letters(text) then
        local raw = code_ctx.raw_input
        if raw then
            local cased = apply_segment_formatting(text, raw)
            if cased ~= text then
                text, dirty = cased, true
            end
        end

        local mode = code_ctx.spacing_mode
        -- "before" always wants a leading space; "smart" only after an English commit.
        local wants_lead = (mode == "before") or (mode == "smart" and code_ctx.prev_is_eng)
        if wants_lead then
            if not find(text, "^%s") then
                text, dirty = " " .. text, true
            end
        elseif mode == "after" then
            if not find(text, "%s$") then
                text, dirty = text .. " ", true
            end
        end
    end

    if not dirty then
        return cand
    end
    local rebuilt = Candidate(cand.type, cand.start, cand._end, text, cand.comment)
    rebuilt.preedit = cand.preedit
    return rebuilt
end
-- 3. State management (Filter)
-- Filter initialiser. Populates env with:
--   memory                : table mapping a typed code to { text, preedit }
--   english_spacing_mode  : "wanxiang_english/english_spacing", default "off"
--   spacing_timeout       : "wanxiang_english/spacing_timeout" in seconds, default 0
--   lookup_key(_esc)      : "wanxiang_lookup/key" (default "`") and its pattern-escaped form
--   delimiter_char        : first character of "speller/delimiter" (default " '")
--   split_pattern / delim_check_pattern : patterns built from the delimiters
--   prev_commit_is_eng, last_commit_time, comp_start_time, spacing_active,
--   decision_locked, sticky_countdown : runtime state
-- and, when a context exists, connects the update and commit notifiers
-- (stored in env.update_notifier / env.commit_notifier).
function F.init(env)
    env.memory = {}
    local cfg = env.engine.schema.config
    -- 1. Read configuration
    env.english_spacing_mode = "off"
    env.spacing_timeout = 0
    env.lookup_key = "`"
    if cfg then
        local str = cfg:get_string("wanxiang_english/english_spacing")
        if str then env.english_spacing_mode = str end
        -- Timeout in seconds; fractional values are supported
        local timeout = cfg:get_double("wanxiang_english/spacing_timeout")
        if timeout then env.spacing_timeout = timeout end
        local key = cfg:get_string("wanxiang_lookup/key")
        if key and key ~= "" then env.lookup_key = key end
    end
    env.lookup_key_esc = gsub(env.lookup_key, "([%%%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
    -- 2. Read the delimiters from the schema
    local delimiter_str = " '"
    if cfg then
        delimiter_str = cfg:get_string('speller/delimiter') or delimiter_str
    end
    env.delimiter_char = sub(delimiter_str, 1, 1) -- the automatic segmentation character
    local escaped_delims = gsub(delimiter_str, "([%%%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
    env.split_pattern = "[^" .. escaped_delims .. "]+"
    env.delim_check_pattern = "[" .. escaped_delims .. "]"
    env.prev_commit_is_eng = false
    env.last_commit_time = 0   -- time of the last English commit
    env.comp_start_time = nil  -- time at which the current composition started
    env.spacing_active = false
    env.decision_locked = false
    env.sticky_countdown = 0   -- sticky countdown
    if env.engine.context then
        env.update_notifier = env.engine.context.update_notifier:connect(function(ctx)
            local curr_input = ctx.input
            -- Does the current input contain the lookup key? (plain-text search)
            if env.lookup_key and find(curr_input, env.lookup_key, 1, true) then
                env.block_derivation = true
            else
                env.block_derivation = false
            end
            if curr_input == "" then
                -- Empty input: forget the start time
                env.comp_start_time = nil
            elseif env.comp_start_time == nil then
                -- Non-empty input with no start time yet: typing has just begun
                env.comp_start_time = get_now()
            end
        end)
        env.commit_notifier = env.engine.context.commit_notifier:connect(function(ctx)
            local commit_text = ctx:get_commit_text()
            -- 1. Drop whitespace first, so spaces added by this filter do not
            --    affect the English check
            local text_no_space = gsub(commit_text, "%s", "")
            local is_eng = is_ascii_phrase_fast(text_no_space)
            if find(text_no_space, "[/\\\\]$") then
                -- 2. Sticky trigger: the commit ends in "/" or "\"
                env.sticky_countdown = STICKY_BUFFER_SIZE
                is_eng = false
            elseif env.sticky_countdown > 0 then
                -- 3. Sticky buffer period (countdown)
                if is_eng then
                    -- English commit: consume one slot and force "no space"
                    env.sticky_countdown = env.sticky_countdown - 1
                    is_eng = false
                else
                    -- Non-English commit interrupts the buffer
                    env.sticky_countdown = 0
                end
            elseif is_eng then
                -- 4. Plain blacklist (http etc.). Trim BOTH ends: in "smart" or
                --    "before" mode the committed text carries a leading space,
                --    which previously made this lookup miss.
                local clean = lower(match(commit_text, "^%s*(.-)%s*$"))
                if no_spacing_words[clean] then
                    is_eng = false
                end
            end
            env.prev_commit_is_eng = is_eng
            -- Only an English commit refreshes the timestamp
            if is_eng then
                env.last_commit_time = get_now()
            else
                env.last_commit_time = 0
            end
            ctx:set_property("english_spacing", "")
            env.block_derivation = false
        end)
    end
end
-- Filter finaliser: disconnects the update and commit notifier connections
-- (when present), clears their env fields, and drops the memory table.
function F.fini(env)
    for _, field in ipairs({ "update_notifier", "commit_notifier" }) do
        local connection = env[field]
        if connection then
            connection:disconnect()
            env[field] = nil
        end
    end
    env.memory = nil
end
-- 4. Main logic (Filter)
-- Filter body: consumes the candidates of `input` and yields the filtered stream.
--   input : candidate source, iterated with input:iter()
--   env   : filter environment prepared by F.init
-- Stages, in order:
--   * Input ending in two backslashes whose remainder is an allowed ASCII
--     phrase: yield a single "english" candidate with that text and stop.
--   * Decide whether the previous commit still counts as English (break
--     property, timeout).
--   * For a one-letter input, prepare two single-letter candidates (typed case
--     first) to be inserted into the stream.
--   * Pass every candidate through restore_sentence_spacing and
--     apply_formatting, remember the first usable one in env.memory, and yield it.
--   * When nothing usable was seen, build a completion from the longest
--     remembered prefix of the input.
function F.func(input, env)
local ctx = env.engine.context
local curr_input = ctx.input
local has_valid_candidate = false
local best_candidate_saved = false
local code_len = #curr_input
-- Word-making shortcut: the input ends with two backslashes
if code_len > 2 and sub(curr_input, -2) == "\\\\" then
local raw_text = sub(curr_input, 1, code_len - 2)
if is_ascii_phrase_fast(raw_text) then
if ctx.composition and not ctx.composition:empty() then
ctx.composition:back().prompt = "〔英文造词〕"
end
local cand = Candidate("english", 0, code_len, raw_text, "")
cand.preedit = raw_text
yield(cand)
return -- Stop here: this candidate is the only output
end
end
-- [Check 1] Break signal: the context property "english_spacing" equals "true"
-- (per the original note, set by another script)
local break_signal = (ctx:get_property("english_spacing") == "true")
local effective_prev_is_eng = env.prev_commit_is_eng
if break_signal then
effective_prev_is_eng = false
env.prev_commit_is_eng = false
-- [Check 2] Natural expiry by time
elseif effective_prev_is_eng and env.spacing_timeout > 0 then
-- Use the time the composition started, so the result stays stable while typing
local check_time = env.comp_start_time or get_now()
-- Gap = (time typing started - time of the last commit)
if (check_time - env.last_commit_time) > env.spacing_timeout then
effective_prev_is_eng = false
env.prev_commit_is_eng = false
end
end
-- Context handed to apply_formatting for every candidate
local code_ctx = {
raw_input = curr_input,
spacing_mode = env.english_spacing_mode,
prev_is_eng = effective_prev_is_eng
}
-- single_char_injected starts as true whenever there is nothing to inject
local single_char_injected = false
local single_chars = {}
if code_len == 1 then
local b = byte(curr_input)
local is_upper = (b >= 65 and b <= 90)
local is_lower = (b >= 97 and b <= 122)
if is_upper or is_lower then
-- Order follows the typed case: input N -> [N, n]; input n -> [n, N]
local t1 = curr_input
local t2 = is_upper and lower(curr_input) or upper(curr_input)
table.insert(single_chars, Candidate("completion", 0, 1, t1, ""))
table.insert(single_chars, Candidate("completion", 0, 1, t2, ""))
else
single_char_injected = true
end
else
single_char_injected = true
end
for cand in input:iter() do
local good_cand = restore_sentence_spacing(cand, env.split_pattern, env.delim_check_pattern)
local fmt_cand = apply_formatting(good_cand, code_ctx)
local is_ascii = is_ascii_phrase_fast(fmt_cand.text)
local is_tbl = is_table_type(cand)
-- user_table/fixed candidates are emitted first; the single letters are
-- inserted just before the first ASCII candidate that is not of those types
if not single_char_injected and is_ascii and #single_chars > 0 and not is_tbl then
if not best_candidate_saved then
env.memory[curr_input] = { text = single_chars[1].text, preedit = single_chars[1].text }
best_candidate_saved = true
end
for _, c in ipairs(single_chars) do yield(c) end
single_char_injected = true
has_valid_candidate = true
end
-- "Garbage": a raw candidate, or one whose text merely echoes the input
local is_garbage = (cand.type == "raw") or (fmt_cand.text == curr_input)
if not is_garbage then
has_valid_candidate = true
-- While derivation is blocked, do not write this data into memory
if not best_candidate_saved and cand.comment ~= "~" and not env.block_derivation then
env.memory[curr_input] = {
text = fmt_cand.text,
preedit = fmt_cand.preedit or fmt_cand.text
}
best_candidate_saved = true
end
end
yield(fmt_cand)
end
-- The stream ended without an insertion point: emit the single letters now
if not single_char_injected and #single_chars > 0 then
if not best_candidate_saved then
env.memory[curr_input] = { text = single_chars[1].text, preedit = single_chars[1].text }
best_candidate_saved = true
end
for _, c in ipairs(single_chars) do yield(c) end
has_valid_candidate = true
end
-- [Phase 3] Constructed completion
if not has_valid_candidate then
-- When the block flag is set (the update handler sets it while the input
-- contains the lookup key), do not derive even if memory has an entry
if env.block_derivation then return end
if not has_letters(curr_input) then return end
local anchor = nil
local diff = ""
-- Longest proper prefix of the input that has a remembered candidate
for i = #curr_input - 1, 1, -1 do
local prefix = sub(curr_input, 1, i)
if env.memory[prefix] then
anchor = env.memory[prefix]
diff = sub(curr_input, i + 1)
break
end
end
if anchor and diff ~= "" then
local has_spacing = find(anchor.text, " ")
local last_word = match(anchor.text, "(%S+)%s*$") or ""
local last_len = #last_word
local output_text = ""
local output_preedit = ""
-- Construction strategy for English anchors
if is_ascii_phrase_fast(anchor.text) then
-- English: append diff; an anchor whose last word is longer than 3
-- characters gets a separating space
if has_spacing then
output_text = anchor.text .. diff
output_preedit = (anchor.preedit or anchor.text) .. diff
elseif last_len > 3 then
local spacer = " "
if sub(anchor.text, -1) == " " then spacer = "" end
output_text = anchor.text .. spacer .. diff
output_preedit = (anchor.preedit or anchor.text) .. spacer .. diff
else
-- Short anchor without spacing: fall back to the raw input
output_text = curr_input
output_preedit = curr_input
end
else
-- Non-ASCII (Chinese) anchor: show only the remembered text and drop diff
-- e.g. typing "nil" with an anchor stored for "ni" leaves diff = "l"; comment is "~"
output_text = anchor.text
-- The preedit still keeps diff, joined with the automatic segmentation character
output_preedit = (anchor.preedit or anchor.text) .. env.delimiter_char .. diff
end
output_text = apply_segment_formatting(output_text, curr_input)
local cand = Candidate("completion", 0, #curr_input, output_text, "~")
cand.preedit = output_preedit
cand.quality = 999
yield(cand)
else
-- No anchor: echo the input as a completion marked with "~"
local cand = Candidate("completion", 0, #curr_input, curr_input, "~")
cand.preedit = curr_input
yield(cand)
end
end
end
-- Return the module table holding init, fini and func.
return F