Merge pull request #846 from wumode/lexiannot

This commit is contained in:
jxxghp
2025-07-11 06:51:48 +08:00
committed by GitHub
3 changed files with 39 additions and 10 deletions

View File

@@ -481,11 +481,12 @@
"name": "美剧生词标注",
"description": "根据CEFR等级为英语影视剧标注高级词汇。",
"labels": "英语",
"version": "1.0",
"version": "1.0.1",
"icon": "LexiAnnot.png",
"author": "wumode",
"level": 1,
"history": {
"v1.0.1": "合并连字符词; 避免ARM平台依赖问题",
"v1.0": "新增LexiAnnot"
}
}

View File

@@ -15,8 +15,6 @@ from collections import Counter
from apscheduler.schedulers.background import BackgroundScheduler
import pysubs2
from pysubs2 import SSAFile, SSAEvent
import spacy
from spacy.tokens import Token
import pymediainfo
from langdetect import detect
@@ -44,7 +42,7 @@ class LexiAnnot(_PluginBase):
# 插件图标
plugin_icon = "https://raw.githubusercontent.com/wumode/LexiAnnot/refs/heads/master/LexiAnnot.png"
# 插件版本
plugin_version = "1.0"
plugin_version = "1.0.1"
# 插件作者
plugin_author = "wumode"
# 作者主页
@@ -139,9 +137,40 @@ class LexiAnnot(_PluginBase):
latest = self.__load_lexicon_version()
if not self._lexicon_version or StringUtils.compare_version(self._lexicon_version, '<', latest):
self.__load_lexicon()
# try to import spaCy
try:
import spacy
except ModuleNotFoundError:
logger.info('正在安装spaCy ...')
result, output = SystemUtils.execute_with_subprocess(
[sys.executable, "-m", "pip", "install", 'thinc==8.3.4']
)
if not result:
logger.error(f"无法安装spaCy, {output}")
return
result, output = SystemUtils.execute_with_subprocess(
[sys.executable, "-m", "pip", "install", 'spacy==3.8.7']
)
if not result:
logger.error(f"无法安装spaCy, {output}")
return
try:
import spacy
from spacy.util import compile_infix_regex
from spacy.tokenizer import Tokenizer
if self._nlp is None:
self._nlp = spacy.load(self._spacy_model_name)
infixes = list(self._nlp.Defaults.infixes)
infixes = [i for i in infixes if '-' not in i]
# 使用修改后的正则表达式重新创建 tokenizer
infix_re = compile_infix_regex(infixes)
self._nlp.tokenizer = Tokenizer(
self._nlp.vocab,
prefix_search=self._nlp.tokenizer.prefix_search,
suffix_search=self._nlp.tokenizer.suffix_search,
infix_finditer=infix_re.finditer,
token_match=self._nlp.tokenizer.token_match
)
except OSError:
self._nlp = LexiAnnot.__load_spacy_model(self._spacy_model_name)
if not (self._nlp and self._cefr_lexicon and self._coca2k_lexicon and self._swear_words):
@@ -863,6 +892,7 @@ class LexiAnnot(_PluginBase):
@staticmethod
def __load_spacy_model(model_name: str):
try:
import spacy
result = subprocess.run(
[sys.executable, "-m", "spacy", "download", model_name],
capture_output=True,
@@ -963,13 +993,13 @@ class LexiAnnot(_PluginBase):
return spacy_pos
@staticmethod
def get_cefr_by_spacy(token: Token, cefr_lexicon: Dict[str, Any]) -> Optional[str]:
result = LexiAnnot.query_cefr(token.lemma_, cefr_lexicon)
def get_cefr_by_spacy(lemma_: str, pos_: str, cefr_lexicon: Dict[str, Any]) -> Optional[str]:
result = LexiAnnot.query_cefr(lemma_, cefr_lexicon)
if result:
all_cefr = []
if len(result) > 0:
for entry in result:
if token.pos_ == LexiAnnot.convert_pos_to_spacy(entry['pos']):
if pos_ == LexiAnnot.convert_pos_to_spacy(entry['pos']):
return entry['cefr']
all_cefr.append(entry['cefr'])
return min(all_cefr)
@@ -1354,7 +1384,7 @@ class LexiAnnot(_PluginBase):
continue
if any(p.match(token.lemma_) for p in compiled_patterns):
continue
cefr = LexiAnnot.get_cefr_by_spacy(token, cefr_lexicon)
cefr = LexiAnnot.get_cefr_by_spacy(token.lemma_, token.pos_, cefr_lexicon)
if cefr and cefr in simple_vocabulary:
continue
res_of_coco = LexiAnnot.query_coca20k(token.lemma_, coca20k_lexicon)

View File

@@ -1,5 +1,3 @@
pysubs2~=1.8.0
thinc==8.3.4
spacy==3.8.7
langdetect~=1.0.9
pymediainfo~=7.0.1