diff --git a/package.v2.json b/package.v2.json index 3b293b7..1adf0d7 100644 --- a/package.v2.json +++ b/package.v2.json @@ -543,11 +543,12 @@ "name": "美剧生词标注", "description": "根据CEFR等级,为英语影视剧标注高级词汇。", "labels": "英语", - "version": "1.2.0", + "version": "1.2.1", "icon": "LexiAnnot.png", "author": "wumode", "level": 1, "history": { + "v1.2.1": "改进字幕样式获取方法", "v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI", "v1.1.4": "优化字幕选择决策", "v1.1.3": "适配 Pydantic V2 (主程序版本需高于 2.8.1-1)", diff --git a/plugins.v2/lexiannot/__init__.py b/plugins.v2/lexiannot/__init__.py index 4034033..1ad3faa 100644 --- a/plugins.v2/lexiannot/__init__.py +++ b/plugins.v2/lexiannot/__init__.py @@ -61,7 +61,7 @@ class LexiAnnot(_PluginBase): # 插件图标 plugin_icon = "LexiAnnot.png" # 插件版本 - plugin_version = "1.2.0" + plugin_version = "1.2.1" # 插件作者 plugin_author = "wumode" # 作者主页 @@ -756,6 +756,7 @@ class LexiAnnot(_PluginBase): {"title": "0.3", "value": "0.3"}, {"title": "0.4", "value": "0.4"}, {"title": "0.5", "value": "0.5"}, + {"title": "1.0", "value": "1.0"}, ], }, } @@ -887,7 +888,7 @@ class LexiAnnot(_PluginBase): "ffmpeg_path": "", "english_only": True, "when_file_trans": True, - "model_temperature": "0.1", + "model_temperature": "0.3", "custom_files": "", "accent_color": "", "font_scaling": "1", @@ -1339,6 +1340,7 @@ class LexiAnnot(_PluginBase): ) ret_message = "" stat = None + ret_status: TaskStatus = TaskStatus.FAILED if embedded_subtitles: logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕") for embedded_subtitle in embedded_subtitles: @@ -1364,10 +1366,11 @@ class LexiAnnot(_PluginBase): ass_subtitle.save(str(ass_file)) ret_message = "字幕已保存" logger.info(f"字幕已保存:{str(ass_file)}") + ret_status = TaskStatus.COMPLETED + break except Exception as e: ret_message = f"字幕文件 {ass_file} 保存失败" logger.error(f"字幕文件 {ass_file} 保存失败, {e}") - break else: logger.info( f"处理字幕{embedded_subtitle['codec_id']}-{embedded_subtitle['stream_id']}失败" @@ -1378,7 +1381,7 @@ class LexiAnnot(_PluginBase): ret_message = "未能找到可提取的英文字幕" logger.info(f"✅ Finished: {path}") - return ProcessResult(status=TaskStatus.COMPLETED, message=ret_message, statistics=stat) + return ProcessResult(status=ret_status, message=ret_message, statistics=stat) @cached(maxsize=1, ttl=1800) def __load_lexicon_version(self) -> Optional[str]: @@ -1513,13 +1516,8 @@ class LexiAnnot(_PluginBase): mediainfo: MediaInfo | None = event_info.get("mediainfo") if self._english_only and mediainfo: - if mediainfo.original_language and mediainfo.original_language not in { - "en", - "eng", - }: - logger.info( - f"原始语言 ({mediainfo.original_language}) 不为英语, 跳过 {mediainfo.title}: " - ) + if mediainfo.original_language and mediainfo.original_language not in {"en","eng"}: + logger.info(f"原始语言 ({mediainfo.original_language}) 不为英语, 跳过 {mediainfo.title}: ") return for new_path in transfer_info.file_list_new or []: self.add_media_file(new_path) @@ -1537,10 +1535,7 @@ class LexiAnnot(_PluginBase): new_list = [] replacements.sort(key=lambda x: x["end"] - x["start"], reverse=True) for r in replacements: - if any( - (r["start"] >= new["start"] and r["end"] <= new["end"]) - for new in new_list - ): + if any((r["start"] >= new["start"] and r["end"] <= new["end"]) for new in new_list): continue new_list.append(r) return new_list @@ -1591,12 +1586,21 @@ class LexiAnnot(_PluginBase): @staticmethod def analyze_ass_language(ass_file: SSAFile): + + def _replace_with_spaces(_text): + """ + 使用等长的空格替换文本中的 (xxx) 模式。 + 例如:"(Hi)" 会被替换成 " " (4个空格) + """ + pattern = r"(\(.*?\)|\[.*?\])" + return re.sub(pattern, lambda match: " " * len(match.group(1)), _text) + styles = {} for style in ass_file.styles: styles[style] = {"text": [], "duration": 0, "text_size": 0, "times": 0} for dialogue in ass_file: style = dialogue.style - text = dialogue.plaintext + text = _replace_with_spaces(dialogue.plaintext) sub_text = text.split("\n") if style not in styles or not text: continue @@ -1638,13 +1642,11 @@ class LexiAnnot(_PluginBase): return style_language_analysis @staticmethod - def select_main_style_weighted( - language_analysis: Dict[str, Any], known_language: str, weights=None - ): + def select_main_style_weighted(analysis: Dict[str, Any], known_language: str, weights = None): """ 根据语言分析结果和已知的字幕语言,使用加权评分选择主要样式 - :params language_analysis: `analyze_ass_language` 函数的输出结果 + :params analysis: `analyze_ass_language` 函数的输出结果 :params known_language: 已知的字幕语言代码 :params weights: 各个维度的权重,权重之和应为 1 :returns: 主要字幕的样式名称,如果没有匹配的样式则返回 None @@ -1652,20 +1654,10 @@ class LexiAnnot(_PluginBase): if weights is None: weights = {"times": 0.5, "text_size": 0.4, "duration": 0.1} matching_styles = [] - max_times = max([analysis.get("times", 0) for _, analysis in language_analysis.items() if analysis]) or 1 - max_text_size = ( - max([analysis.get("text_size", 0) for _, analysis in language_analysis.items() if analysis]) or 1) - max_duration = ( - max( - [ - analysis.get("duration", 0) - for _, analysis in language_analysis.items() - if analysis - ] - ) - or 1 - ) - for style, analysis in language_analysis.items(): + max_times = max([analysis.get("times", 0) for _, analysis in analysis.items() if analysis]) or 1 + max_text_size = max([analysis.get("text_size", 0) for _, analysis in analysis.items() if analysis]) or 1 + max_duration = max([analysis.get("duration", 0) for _, analysis in analysis.items() if analysis]) or 1 + for style, analysis in analysis.items(): if not analysis: continue if analysis.get("main_language") == known_language: @@ -1898,7 +1890,7 @@ class LexiAnnot(_PluginBase): ) ) - # model_temperature = float(self._model_temperature) if self._model_temperature else 0.1 + model_temperature = float(self._model_temperature) if self._model_temperature else 0.3 logger.info("通过 spaCy 分词...") for seg in segments: if self._shutdown_event.is_set(): @@ -1925,7 +1917,7 @@ class LexiAnnot(_PluginBase): model_name=llm_model_name, base_url=llm_base_url, api_key=llm_apikey, - temperature=self._model_temperature, + temperature=model_temperature, max_retries=self._max_retries, proxy=self._use_proxy, ) @@ -1958,9 +1950,7 @@ class LexiAnnot(_PluginBase): ) # &H00FFFFFF& statistical_res = LexiAnnot.analyze_ass_language(ass_file) - main_style: str | None = LexiAnnot.select_main_style_weighted( - statistical_res, lang - ) + main_style: str | None = LexiAnnot.select_main_style_weighted(statistical_res, lang) if not main_style: logger.error("无法确定主要字幕样式") return None, None @@ -1996,16 +1986,8 @@ class LexiAnnot(_PluginBase): dialogue.start = main_processor[seg.index].start dialogue.end = main_processor[seg.index].end dialogue.style = "Annotation EN" - cefr_text = ( - f" {style_text('Annotation CEFR', word.cefr)}" - if word.cefr - else "" - ) - exam_text = ( - f" {style_text('Annotation EXAM', ' '.join(exams))}" - if exams - else "" - ) + cefr_text = f" {style_text('Annotation CEFR', word.cefr)}" if word.cefr else "" + exam_text = f" {style_text('Annotation EXAM', ' '.join(exams))}" if exams else "" phone_text = ( f"{__N}{style_text('Annotation PHONE', f'/{word.phonetics}/')}" if word.phonetics and self._show_phonetics @@ -2050,10 +2032,10 @@ class LexiAnnot(_PluginBase): ) if self._sentence_translation: chinese = seg.Chinese - if chinese and chinese[-1] in ["。", ","]: + if chinese and chinese[-1] in {"。", ","}: chinese = chinese[:-1] main_processor[seg.index].text = ( - main_processor[seg.index].text + f"\\N{{\\fs{int(main_style_fs * 0.75)}}}{chinese}{{\\r}}" + main_processor[seg.index].text + f"\\N{{\\fs{int(main_style_fs * 0.75)}}}{chinese}{{\\r}}" ) # 避免 Infuse 显示乱码 diff --git a/plugins.v2/lexiannot/pipeline.py b/plugins.v2/lexiannot/pipeline.py index 6aac3e3..c328886 100644 --- a/plugins.v2/lexiannot/pipeline.py +++ b/plugins.v2/lexiannot/pipeline.py @@ -500,7 +500,7 @@ Your goal is to identify **only** content that helps them reach native-level pro * Avoid repeating words already listed in `candidate_words`. * Must exist in the exact form in `context_text`. * Provide lemma and POS. - * **Do NOT include** simple high-frequency words, common fillers ('gonna', 'gotta'), or basic swear words. + * **Do NOT include** simple high-frequency words, common fillers ('gonna', 'gotta'), onomatopoeia, or basic swear words. ------------------------- You MUST return output strictly matching the provided Pydantic schema.