Files
rime_wanxiang/lua/super_english.lua
2026-01-30 21:11:40 +08:00

513 lines
18 KiB
Lua
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- lua/super_english.lua
-- https://github.com/amzxyz/rime_wanxiang
-- @description: 英文全能处理器 (Filter Only: 锚点切分 + 动态分隔符 + 超时销毁)
-- @author: amzxyz
-- 核心功能清单:
-- 1. [Format] 语句级英文大写格式化,逐词大小写对应 (look HELLO -> look HELLO)
-- 2. [Spacing] 智能语句空格切分,智能单词上屏加空格 (Smart Spacing) 与无损分词还原
-- 3. [Memory] 全量历史缓存,完美解决回删乱码问题
-- 4. [Construct] 原生优先构造策略 (短词无分词则重置为原生输入)
-- 5. [Order] 单字母(a/A) 智能插队排序,补齐单字母候选
local F = {}
local byte = string.byte
local find = string.find
local gsub = string.gsub
local upper = string.upper
local lower = string.lower
local sub = string.sub
local match = string.match
local format = string.format
local STICKY_BUFFER_SIZE = 2
local function fast_type(c)
local t = c.type
if t then return t end
local g = c.get_genuine and c:get_genuine() or nil
return (g and g.type) or ""
end
local function is_table_type(c)
local t = fast_type(c)
return t == "user_table" or t == "fixed"
end
local function get_now()
if rime_api and rime_api.get_time_ms then
return rime_api.get_time_ms() / 1000
end
return os.time()
end
local function pure(s)
return gsub(s, "[^a-zA-Z]", ""):lower()
end
local no_spacing_words = {
["http"] = true, ["https"] = true, ["www"] = true, ["ftp"] = true,
["ssh"] = true, ["mailto"]= true, ["file"] = true, ["tel"] = true,
}
local allowed_ascii_symbols = {
[32] = true, -- space
[33] = true, -- !
[39] = true, -- '
[44] = true, -- ,
[45] = true, -- -
[43] = true, -- +
[46] = true, -- .
[63] = true, -- ?
[92] = true, -- \
[48]=true, [49]=true, [50]=true, [51]=true, [52]=true,
[53]=true, [54]=true, [55]=true, [56]=true, [57]=true,
}
local function is_ascii_phrase_fast(s)
if not s or s == "" then return false end
local len = #s
for i = 1, len do
local b = byte(s, i)
local is_upper = (b >= 65 and b <= 90)
local is_lower = (b >= 97 and b <= 122)
local is_allowed_sym = allowed_ascii_symbols[b]
if not (is_upper or is_lower or is_allowed_sym) then
return false
end
end
return true
end
local function has_letters(s)
return find(s, "[a-zA-Z]")
end
local function find_target_in_text(text, start_pos, target_fp)
local text_len = #text
local target_len = #target_fp
if target_len == 0 then return nil, nil end
local t_idx = 1
local scan_p = start_pos
local s_index = nil
while scan_p <= text_len and t_idx <= target_len do
local char_txt = sub(text, scan_p, scan_p)
if lower(char_txt) == sub(target_fp, t_idx, t_idx) then
if t_idx == 1 then s_index = scan_p end
t_idx = t_idx + 1
end
scan_p = scan_p + 1
end
if t_idx > target_len then
return s_index, scan_p - 1
end
return nil, nil
end
local function restore_sentence_spacing(cand, split_pattern, check_pattern)
local guide = cand.preedit or ""
if not find(guide, check_pattern) then return cand end
local text = cand.text
local targets = {}
for seg in string.gmatch(guide, split_pattern) do
local t = pure(seg)
if #t > 0 then table.insert(targets, t) end
end
if #targets == 0 then return cand end
local starts = {}
local p = 1
for _, target in ipairs(targets) do
local s, e = find_target_in_text(text, p, target)
if not s then return cand end
table.insert(starts, s)
p = e + 1
end
local parts = {}
if starts[1] > 1 then
table.insert(parts, sub(text, 1, starts[1] - 1))
end
for i = 1, #starts do
local current_s = starts[i]
local next_s = starts[i+1]
local chunk_end = next_s and (next_s - 1) or #text
table.insert(parts, sub(text, current_s, chunk_end))
end
local new_text = ""
for i, part in ipairs(parts) do
if i == 1 then
new_text = part
else
local last_char = sub(new_text, -1)
if last_char == "'" or last_char == "-" then
new_text = new_text .. part
else
new_text = new_text .. " " .. part
end
end
end
new_text = gsub(new_text, "%s%s+", " ")
if new_text == "" then return cand end
local nc = Candidate(cand.type, cand.start, cand._end, new_text, cand.comment)
nc.preedit = cand.preedit
return nc
end
local NBSP = string.char(0xC2, 0xA0)
local function apply_segment_formatting(text, input_code)
if not input_code or input_code == "" then return text end
local parts = {}
local p_code = 1
for word in string.gmatch(text, "%S+") do
local clean_word = pure(word)
local w_len = #clean_word
if w_len > 0 then
if find(word, "[\128-\255]") then
local input_remain = #input_code - p_code + 1
if input_remain > 0 then
local check_len = (w_len < input_remain) and w_len or input_remain
p_code = p_code + check_len
end
else
local input_remain = #input_code - p_code + 1
if input_remain > 0 then
local check_len = (w_len < input_remain) and w_len or input_remain
local segment = sub(input_code, p_code, p_code + check_len - 1)
local is_pure_alpha = not find(word, "[^a-zA-Z]")
if find(segment, "^%u%u") and is_pure_alpha then
word = upper(word)
elseif find(segment, "^%u") then
word = gsub(word, "^%a", upper)
end
p_code = p_code + check_len
end
end
end
table.insert(parts, word)
end
return table.concat(parts, " ")
end
local function apply_formatting(cand, code_ctx)
local text = cand.text
if not text or text == "" then return cand end
local changed = false
local norm = gsub(text, NBSP, " ")
if norm ~= text then text = norm; changed = true end
if is_ascii_phrase_fast(text) and has_letters(text) then
if code_ctx.raw_input then
local new_text = apply_segment_formatting(text, code_ctx.raw_input)
if new_text ~= text then text = new_text; changed = true end
end
if code_ctx.spacing_mode and code_ctx.spacing_mode ~= "off" then
local mode = code_ctx.spacing_mode
if mode == "smart" then
if code_ctx.prev_is_eng then
if not find(text, "^%s") then text = " " .. text; changed = true end
end
elseif mode == "before" then
if not find(text, "^%s") then text = " " .. text; changed = true end
elseif mode == "after" then
if not find(text, "%s$") then text = text .. " "; changed = true end
end
end
end
if not changed then return cand end
local nc = Candidate(cand.type, cand.start, cand._end, text, cand.comment)
nc.preedit = cand.preedit
return nc
end
function F.init(env)
env.memory = {}
local cfg = env.engine.schema.config
env.english_spacing_mode = "off"
env.spacing_timeout = 0
env.lookup_key = "`"
if cfg then
local str = cfg:get_string("wanxiang_english/english_spacing")
if str then env.english_spacing_mode = str end
local timeout = cfg:get_double("wanxiang_english/spacing_timeout")
if timeout then env.spacing_timeout = timeout end
local key = cfg:get_string("wanxiang_lookup/key")
if key and key ~= "" then env.lookup_key = key end
end
env.lookup_key_esc = gsub(env.lookup_key, "([%%%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
local delimiter_str = " '"
if cfg then
delimiter_str = cfg:get_string('speller/delimiter') or delimiter_str
end
env.delimiter_char = sub(delimiter_str, 1, 1)
local escaped_delims = gsub(delimiter_str, "([%%%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
env.split_pattern = "[^" .. escaped_delims .. "]+"
env.delim_check_pattern = "[" .. escaped_delims .. "]"
env.prev_commit_is_eng = false
env.last_commit_time = 0
env.comp_start_time = nil
env.spacing_active = false
env.decision_locked = false
env.sticky_countdown = 0
if env.engine.context then
env.update_notifier = env.engine.context.update_notifier:connect(function(ctx)
local curr_input = ctx.input
if env.lookup_key and find(curr_input, env.lookup_key, 1, true) then
env.block_derivation = true
else
env.block_derivation = false
end
if curr_input == "" then
env.comp_start_time = nil
elseif env.comp_start_time == nil then
env.comp_start_time = get_now()
end
end)
env.commit_notifier = env.engine.context.commit_notifier:connect(function(ctx)
local commit_text = ctx:get_commit_text()
local text_no_space = gsub(commit_text, "%s", "")
local is_eng = is_ascii_phrase_fast(text_no_space)
if find(text_no_space, "[/\\\\]$") then
env.sticky_countdown = STICKY_BUFFER_SIZE
is_eng = false
elseif env.sticky_countdown > 0 then
if is_eng then
env.sticky_countdown = env.sticky_countdown - 1
is_eng = false
else
env.sticky_countdown = 0
end
elseif is_eng then
local clean = gsub(commit_text, "%s+$", ""):lower()
if no_spacing_words[clean] then
is_eng = false
end
end
env.prev_commit_is_eng = is_eng
if is_eng then
env.last_commit_time = get_now()
else
env.last_commit_time = 0
end
ctx:set_property("english_spacing", "")
env.block_derivation = false
end)
end
end
function F.fini(env)
if env.update_notifier then env.update_notifier:disconnect(); env.update_notifier = nil end
if env.commit_notifier then env.commit_notifier:disconnect(); env.commit_notifier = nil end
env.memory = nil
end
function F.func(input, env)
local ctx = env.engine.context
local curr_input = ctx.input
if not has_letters(curr_input) then
for cand in input:iter() do
yield(cand)
end
return
end
-- ===
local has_valid_candidate = false
local best_candidate_saved = false
local code_len = #curr_input
-- [Feature] 强制英文造词
if code_len > 2 and sub(curr_input, -2) == "\\\\" then
local raw_text = sub(curr_input, 1, code_len - 2)
if is_ascii_phrase_fast(raw_text) then
if ctx.composition and not ctx.composition:empty() then
ctx.composition:back().prompt = "〔英文造词〕"
end
local cand = Candidate("english", 0, code_len, raw_text, "")
cand.preedit = raw_text
yield(cand)
return
end
end
local break_signal = (ctx:get_property("english_spacing") == "true")
local effective_prev_is_eng = env.prev_commit_is_eng
if break_signal then
effective_prev_is_eng = false
env.prev_commit_is_eng = false
elseif effective_prev_is_eng and env.spacing_timeout > 0 then
local check_time = env.comp_start_time or get_now()
if (check_time - env.last_commit_time) > env.spacing_timeout then
effective_prev_is_eng = false
env.prev_commit_is_eng = false
end
end
local code_ctx = {
raw_input = curr_input,
spacing_mode = env.english_spacing_mode,
prev_is_eng = effective_prev_is_eng
}
local single_char_injected = false
local single_chars = {}
if code_len == 1 then
local b = byte(curr_input)
local is_upper = (b >= 65 and b <= 90)
local is_lower = (b >= 97 and b <= 122)
if is_upper or is_lower then
local t1 = curr_input
local t2 = is_upper and lower(curr_input) or upper(curr_input)
table.insert(single_chars, Candidate("completion", 0, 1, t1, ""))
table.insert(single_chars, Candidate("completion", 0, 1, t2, ""))
has_single_chars = true
end
else
single_char_injected = true
end
for cand in input:iter() do
local good_cand = restore_sentence_spacing(cand, env.split_pattern, env.delim_check_pattern)
local fmt_cand = apply_formatting(good_cand, code_ctx)
-- 去除注释中的太极符号
if fmt_cand.comment and find(fmt_cand.comment, "\226\152\175") then
local nc = Candidate(fmt_cand.type, fmt_cand.start, fmt_cand._end, fmt_cand.text, "")
nc.preedit = fmt_cand.preedit
fmt_cand = nc
end
local c_type = cand.type
local is_ascii = is_ascii_phrase_fast(fmt_cand.text)
local is_tbl = is_table_type(cand)
-- [垃圾词判定]:保护符号,只去重单字母
local is_garbage = (c_type == "raw")
if not is_garbage and code_len == 1 and has_letters(curr_input) then
if lower(fmt_cand.text) == lower(curr_input) then
is_garbage = true
end
end
if not is_garbage then
has_valid_candidate = true
-- [VIP 优先逻辑]
local is_vip_type = (c_type == "user_table" or c_type == "fixed" or c_type == "phrase")
local is_hidden_vip = (not is_vip_type) and (not is_ascii)
local treat_as_vip = is_vip_type or is_hidden_vip
if treat_as_vip then
-- VIP 通道:不仅是 user_table包括汉字等都直接输出不让单字母插队
if not best_candidate_saved and cand.comment ~= "~" and not env.block_derivation then
env.memory[curr_input] = {
text = fmt_cand.text,
preedit = curr_input
}
best_candidate_saved = true
end
yield(fmt_cand)
else
-- 普通通道:允许单字母插队到前面
if has_single_chars and not single_char_injected then
if not best_candidate_saved then
env.memory[curr_input] = { text = single_chars[1].text, preedit = curr_input }
best_candidate_saved = true
end
for _, c in ipairs(single_chars) do yield(c) end
single_char_injected = true
has_valid_candidate = true
end
if not best_candidate_saved and cand.comment ~= "~" and not env.block_derivation then
env.memory[curr_input] = {
text = fmt_cand.text,
preedit = curr_input
}
best_candidate_saved = true
end
yield(fmt_cand)
end
end
end
-- 3. 兜底逻辑 (补单字母)
if has_single_chars and not single_char_injected then
if not best_candidate_saved then
env.memory[curr_input] = { text = single_chars[1].text, preedit = single_chars[1].text }
best_candidate_saved = true
end
for _, c in ipairs(single_chars) do yield(c) end
has_valid_candidate = true
end
-- [Phase 3] 历史回溯构造 (Strictly fallback)
if not has_valid_candidate then
if env.block_derivation then return end
if find(curr_input, "^[/]") then
return
end
if not env.block_derivation and has_letters(curr_input) then
local anchor = nil
local diff = ""
for i = #curr_input - 1, 1, -1 do
local prefix = sub(curr_input, 1, i)
if env.memory[prefix] then
anchor = env.memory[prefix]
diff = sub(curr_input, i + 1)
break
end
end
if anchor and diff ~= "" then
local has_spacing = find(anchor.text, " ")
local last_word = match(anchor.text, "(%S+)%s*$") or ""
local last_len = #last_word
local output_text = ""
local output_preedit = ""
local is_code_mode = find(curr_input, "^[/\\]")
if is_ascii_phrase_fast(anchor.text) then
local spacer = " "
if sub(anchor.text, -1) == " " then spacer = "" end
if has_spacing then
output_text = anchor.text .. spacer .. diff
output_preedit = (anchor.preedit or anchor.text) .. spacer .. diff
elseif last_len > 3 then
output_text = anchor.text .. spacer .. diff
output_preedit = (anchor.preedit or anchor.text) .. spacer .. diff
else
output_text = curr_input
output_preedit = curr_input
end
elseif is_code_mode then
output_text = anchor.text .. diff
output_preedit = (anchor.preedit or anchor.text) .. diff
else
output_text = anchor.text
output_preedit = (anchor.preedit or anchor.text) .. env.delimiter_char .. diff
end
output_text = apply_segment_formatting(output_text, curr_input)
local cand = Candidate("completion", 0, #curr_input, output_text, "~")
cand.preedit = output_preedit
cand.quality = 999
yield(cand)
else
-- [Phase 4] 真正的无解兜底
local cand = Candidate("completion", 0, #curr_input, curr_input, "~")
cand.preedit = curr_input
yield(cand)
end
else
-- 特殊符号或被拦截时的兜底
local cand = Candidate("completion", 0, #curr_input, curr_input, "~")
cand.preedit = curr_input
yield(cand)
end
end
end
return F