From 4bd85865fc9c8d7bf31dec59670e10bb8a42a84c Mon Sep 17 00:00:00 2001 From: wumode Date: Fri, 11 Jul 2025 00:24:22 +0800 Subject: [PATCH] =?UTF-8?q?update(LexiAnnot)=20=E9=81=BF=E5=85=8DARM?= =?UTF-8?q?=E5=B9=B3=E5=8F=B0=E4=BE=9D=E8=B5=96=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.v2.json | 3 +- plugins.v2/lexiannot/__init__.py | 44 ++++++++++++++++++++++----- plugins.v2/lexiannot/requirements.txt | 2 -- 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/package.v2.json b/package.v2.json index ffe1643..058efbd 100644 --- a/package.v2.json +++ b/package.v2.json @@ -481,11 +481,12 @@ "name": "美剧生词标注", "description": "根据CEFR等级,为英语影视剧标注高级词汇。", "labels": "英语", - "version": "1.0", + "version": "1.0.1", "icon": "LexiAnnot.png", "author": "wumode", "level": 1, "history": { + "v1.0.1": "合并连字符词; 避免ARM平台依赖问题", "v1.0": "新增LexiAnnot" } } diff --git a/plugins.v2/lexiannot/__init__.py b/plugins.v2/lexiannot/__init__.py index c143024..fec6d93 100644 --- a/plugins.v2/lexiannot/__init__.py +++ b/plugins.v2/lexiannot/__init__.py @@ -15,8 +15,6 @@ from collections import Counter from apscheduler.schedulers.background import BackgroundScheduler import pysubs2 from pysubs2 import SSAFile, SSAEvent -import spacy -from spacy.tokens import Token import pymediainfo from langdetect import detect @@ -44,7 +42,7 @@ class LexiAnnot(_PluginBase): # 插件图标 plugin_icon = "https://raw.githubusercontent.com/wumode/LexiAnnot/refs/heads/master/LexiAnnot.png" # 插件版本 - plugin_version = "1.0" + plugin_version = "1.0.1" # 插件作者 plugin_author = "wumode" # 作者主页 @@ -139,9 +137,40 @@ class LexiAnnot(_PluginBase): latest = self.__load_lexicon_version() if not self._lexicon_version or StringUtils.compare_version(self._lexicon_version, '<', latest): self.__load_lexicon() + # try to import spaCy try: + import spacy + except ModuleNotFoundError: + logger.info('正在安装spaCy ...') + result, output = SystemUtils.execute_with_subprocess( + [sys.executable, "-m", "pip", "install", 'thinc==8.3.4'] + ) + if not result: + logger.error(f"无法安装spaCy, {output}") + return + result, output = SystemUtils.execute_with_subprocess( + [sys.executable, "-m", "pip", "install", 'spacy==3.8.7'] + ) + if not result: + logger.error(f"无法安装spaCy, {output}") + return + try: + import spacy + from spacy.util import compile_infix_regex + from spacy.tokenizer import Tokenizer if self._nlp is None: self._nlp = spacy.load(self._spacy_model_name) + infixes = list(self._nlp.Defaults.infixes) + infixes = [i for i in infixes if '-' not in i] + # 使用修改后的正则表达式重新创建 tokenizer + infix_re = compile_infix_regex(infixes) + self._nlp.tokenizer = Tokenizer( + self._nlp.vocab, + prefix_search=self._nlp.tokenizer.prefix_search, + suffix_search=self._nlp.tokenizer.suffix_search, + infix_finditer=infix_re.finditer, + token_match=self._nlp.tokenizer.token_match + ) except OSError: self._nlp = LexiAnnot.__load_spacy_model(self._spacy_model_name) if not (self._nlp and self._cefr_lexicon and self._coca2k_lexicon and self._swear_words): @@ -863,6 +892,7 @@ class LexiAnnot(_PluginBase): @staticmethod def __load_spacy_model(model_name: str): try: + import spacy result = subprocess.run( [sys.executable, "-m", "spacy", "download", model_name], capture_output=True, @@ -963,13 +993,13 @@ class LexiAnnot(_PluginBase): return spacy_pos @staticmethod - def get_cefr_by_spacy(token: Token, cefr_lexicon: Dict[str, Any]) -> Optional[str]: - result = LexiAnnot.query_cefr(token.lemma_, cefr_lexicon) + def get_cefr_by_spacy(lemma_: str, pos_: str, cefr_lexicon: Dict[str, Any]) -> Optional[str]: + result = LexiAnnot.query_cefr(lemma_, cefr_lexicon) if result: all_cefr = [] if len(result) > 0: for entry in result: - if token.pos_ == LexiAnnot.convert_pos_to_spacy(entry['pos']): + if pos_ == LexiAnnot.convert_pos_to_spacy(entry['pos']): return entry['cefr'] all_cefr.append(entry['cefr']) return min(all_cefr) @@ -1354,7 +1384,7 @@ class LexiAnnot(_PluginBase): continue if any(p.match(token.lemma_) for p in compiled_patterns): continue - cefr = LexiAnnot.get_cefr_by_spacy(token, cefr_lexicon) + cefr = LexiAnnot.get_cefr_by_spacy(token.lemma_, token.pos_, cefr_lexicon) if cefr and cefr in simple_vocabulary: continue res_of_coco = LexiAnnot.query_coca20k(token.lemma_, coca20k_lexicon) diff --git a/plugins.v2/lexiannot/requirements.txt b/plugins.v2/lexiannot/requirements.txt index f1dd2bf..88f3dde 100644 --- a/plugins.v2/lexiannot/requirements.txt +++ b/plugins.v2/lexiannot/requirements.txt @@ -1,5 +1,3 @@ pysubs2~=1.8.0 -thinc==8.3.4 -spacy==3.8.7 langdetect~=1.0.9 pymediainfo~=7.0.1 \ No newline at end of file