diff --git a/package.v2.json b/package.v2.json
index 3dca7b3..7262e07 100644
--- a/package.v2.json
+++ b/package.v2.json
@@ -470,11 +470,12 @@
         "name": "IMDb源",
         "description": "让探索,推荐和媒体识别支持IMDb数据源。",
         "labels": "探索",
-        "version": "1.6.5",
+        "version": "1.6.6",
         "icon": "IMDb_IOS-OSX_App.png",
         "author": "wumode",
         "level": 1,
         "history": {
+            "v1.6.6": "优化主页组件链接跳转",
             "v1.6.5": "仪表盘组件支持图片缓存",
             "v1.6.4": "为元数据增加背景图",
             "v1.6.3": "优化媒体识别速度; 适配 Pydantic V2 (主程序版本需高于 2.8.1-1)",
@@ -549,11 +550,12 @@
         "name": "美剧生词标注",
         "description": "根据CEFR等级,为英语影视剧标注高级词汇。",
         "labels": "英语",
-        "version": "1.2.1",
+        "version": "1.2.2",
         "icon": "LexiAnnot.png",
         "author": "wumode",
         "level": 1,
         "history": {
+            "v1.2.2": "优化提示词",
             "v1.2.1": "改进字幕样式获取方法",
             "v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI",
             "v1.1.4": "优化字幕选择决策",
diff --git a/plugins.v2/imdbsource/__init__.py b/plugins.v2/imdbsource/__init__.py
index cf444d4..7a89fe3 100644
--- a/plugins.v2/imdbsource/__init__.py
+++ b/plugins.v2/imdbsource/__init__.py
@@ -34,7 +34,7 @@ class ImdbSource(_PluginBase):
     # 插件图标
     plugin_icon = "IMDb_IOS-OSX_App.png"
     # 插件版本
-    plugin_version = "1.6.5"
+    plugin_version = "1.6.6"
     # 插件作者
     plugin_author = "wumode"
     # 作者主页
@@ -285,28 +285,19 @@ class ImdbSource(_PluginBase):
                         },
                         'content': [
                             {
-                                'component': 'RouterLink',
+                                'component': 'h1',
                                 'props': {
-                                    'to': mp_url,
-                                    'class': 'no-underline'
+                                    'class': 'mb-1 text-white text-shadow font-extrabold text-2xl line-clamp-2 overflow-hidden text-ellipsis ...'
                                 },
-                                'content': [
-                                    {
-                                        'component': 'h1',
-                                        'props': {
-                                            'class': 'mb-1 text-white text-shadow font-extrabold text-2xl line-clamp-2 overflow-hidden text-ellipsis ...'
-                                        },
-                                        'html': f"{entry.name} {year}",
-                                    },
-                                    {
-                                        'component': 'span',
-                                        'props': {
-                                            'class': 'text-shadow line-clamp-2 overflow-hidden text-ellipsis ...'
-                                        },
-                                        'html': imdb_title.plot_text,
-                                    }
-                                ]
+                                'html': f"{entry.name} {year}",
                             },
+                            {
+                                'component': 'span',
+                                'props': {
+                                    'class': 'text-shadow line-clamp-2 overflow-hidden text-ellipsis ...'
+                                },
+                                'html': imdb_title.plot_text,
+                            }
                         ]
                     }
                 ]
@@ -392,7 +383,8 @@ class ImdbSource(_PluginBase):
                     {
                         'component': 'a',
                         'props': {
-                            'href': f'#{mp_url}',
+                            'href': f"https://www.imdb.com/title/{entry.ttconst}",
+                            'target': '_blank',
                             'class': 'no-underline w-100',
                             'style': 'display: flex; justify-content: center;'
                         },
@@ -454,15 +446,17 @@ class ImdbSource(_PluginBase):
             {
                 'component': 'a',
                 'props': {
-                    'href': f"https://www.imdb.com/title/{entry.ttconst}",
-                    'target': '_blank',
+                    'href': f'#{mp_url}',
                     'rel': 'noopener noreferrer',
                     'class': 'text-h4 font-weight-bold mb-2 d-flex text-white align-center',
                 },
                 'content': [
                     {
                         'component': 'span',
-                        'html': f"{entry.name}"
+                        'html': f"{entry.name}",
+                        'props': {
+                            'class': 'line-clamp-2 overflow-hidden',
+                        }
                     },
                     {
                         'component': 'v-icon',
diff --git a/plugins.v2/lexiannot/__init__.py b/plugins.v2/lexiannot/__init__.py
index 98791a2..512e130 100644
--- a/plugins.v2/lexiannot/__init__.py
+++ b/plugins.v2/lexiannot/__init__.py
@@ -3,7 +3,6 @@ import os
 import json
 import queue
 import re
-import shutil
 import subprocess
 import sys
 import threading
@@ -61,7 +60,7 @@ class LexiAnnot(_PluginBase):
     # 插件图标
     plugin_icon = "LexiAnnot.png"
     # 插件版本
-    plugin_version = "1.2.1"
+    plugin_version = "1.2.2"
     # 插件作者
     plugin_author = "wumode"
     # 作者主页
@@ -163,10 +162,6 @@ class LexiAnnot(_PluginBase):
             self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0
             if self._delete_data:
                 # 删除不再保存在数据库的数据
-                self.del_data("cefr_lexicon")
-                self.del_data("coca2k_lexicon")
-                self.del_data("swear_words")
-                self.del_data("lexicon_version")
                 self.delete_data()
                 self._delete_data = False
         self._loaded = False
@@ -1064,15 +1059,6 @@ class LexiAnnot(_PluginBase):
                 logger.error(f"词典 {lexicon_path} 删除失败: {e}")
         self._load_lexicon_from_local.cache_clear()
 
-        # 删除虚拟环境
-        venv_dir = data_path / "venv_genai"
-        if os.path.exists(venv_dir):
-            try:
-                shutil.rmtree(venv_dir)
-                logger.info(f"虚拟环境 {venv_dir} 已删除")
-            except Exception as e:
-                logger.error(f"虚拟环境 {venv_dir} 删除失败: {e}")
-
         # 删除任务记录
         with self._tasks_lock:
             self._tasks = {}
@@ -1324,9 +1310,7 @@ class LexiAnnot(_PluginBase):
         ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else "ffmpeg"
         eng_mark = ["en", "en-US", "eng", "en-GB", "english", "en-AU"]
-        embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(
-            path, eng_mark, ffmpeg_path
-        )
+        embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path)
         if not embedded_subtitles:
             return ProcessResult(
                 status=TaskStatus.CANCELED, message="未找到嵌入式英文文本字幕"
             )
@@ -1345,22 +1329,14 @@ class LexiAnnot(_PluginBase):
         logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕")
         for embedded_subtitle in embedded_subtitles:
             if self._shutdown_event.is_set():
-                return ProcessResult(
-                    status=TaskStatus.CANCELED, message="任务已取消"
-                )
+                return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
-            ass_subtitle = SSAFile.from_string(
-                embedded_subtitle["subtitle"], format_="ass"
-            )
+            ass_subtitle = SSAFile.from_string(embedded_subtitle["subtitle"], format_="ass")
             if embedded_subtitle.get("codec_id") == "S_TEXT/UTF8":
                 ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle)
             ass_subtitle = self.__set_style(ass_subtitle)
-            ass_subtitle, stat = self.process_subtitles(
-                ass_subtitle, lexi, spacy_worker, mediainfo
-            )
+            ass_subtitle, stat = self.process_subtitles(ass_subtitle, lexi, spacy_worker, mediainfo)
             if self._shutdown_event.is_set():
-                return ProcessResult(
-                    status=TaskStatus.CANCELED, message="任务已取消"
-                )
+                return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
             if ass_subtitle:
                 try:
                     ass_subtitle.save(str(ass_file))
@@ -1810,7 +1786,7 @@ class LexiAnnot(_PluginBase):
     @staticmethod
     def _extract_subtitles_by_lang(
         video_path: str, lang: str | list = "en", ffmpeg: str = "ffmpeg"
-    ) -> Optional[List[Dict]]:
+    ) -> list[dict]:
         """
         提取视频文件中的内嵌英文字幕,使用 MediaInfo 查找字幕流。
         """
@@ -1853,21 +1829,25 @@ class LexiAnnot(_PluginBase):
                     }
                 )
             if subtitles:
-                return subtitles
-            else:
+                # remove outliers with abnormally short duration
+                if len(subtitles) > 1:
+                    durations = [sub["duration"] for sub in subtitles if sub["duration"] > 0]
+                    if durations:
+                        avg_duration = sum(durations) / len(durations)
+                        subtitles = [
+                            sub for sub in subtitles if sub["duration"] >= avg_duration * 0.2
+                        ]
+            if not subtitles:
                 logger.warn("未找到标记为英语的文本字幕流")
-                return None
         except FileNotFoundError:
             logger.error(f"找不到视频文件 '{video_path}'")
-            return None
         except subprocess.CalledProcessError as e:
             logger.error(f"错误:提取字幕失败。\n错误信息:{e}")
             logger.error(f"FFmpeg 输出 (stderr):\n{e.stderr}")
-            return None
         except Exception as e:
             logger.error(f"使用 MediaInfo 提取字幕时发生错误:{e}")
-            return None
+        return subtitles
 
     def _process_chain(
         self,
@@ -1885,12 +1865,9 @@ class LexiAnnot(_PluginBase):
         :param spacy_worker: spaCy 分词器
         :returns: 处理后的字幕行列表
         """
-        simple_vocabulary = set(
-            filter(
-                lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]
-            )
-        )
-
+        CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
+        simple_vocabulary = set(filter(lambda x: x < self._annot_level, CEFR_LEVELS))
+        learner_level = max(simple_vocabulary, default="A1")  # simple_vocabulary 可能为空(标注级别为 A1 时),避免 max() 报错
         model_temperature = float(self._model_temperature) if self._model_temperature else 0.3
         logger.info("通过 spaCy 分词...")
         for seg in segments:
@@ -1927,7 +1904,7 @@ class LexiAnnot(_PluginBase):
                 segments=segments,
                 shutdown_event=self._shutdown_event,
                 context_window=self._context_window,
-                leaner_level=self._annot_level,
+                learner_level=learner_level,
                 media_context=mediainfo,
                 translate_sentences=self._sentence_translation
diff --git a/plugins.v2/lexiannot/pipeline.py b/plugins.v2/lexiannot/pipeline.py
index c328886..7ca1c23 100644
--- a/plugins.v2/lexiannot/pipeline.py
+++ b/plugins.v2/lexiannot/pipeline.py
@@ -406,6 +406,7 @@ def _context_process_chain(
                 [
                     f"- {word.text} (WORD_ID: {word.meta.word_id}, LEMMA: {word.lemma}, CEFR: {word.cefr}, POS: {word.pos})"
                     for seg in segment_list
+                    if start <= seg.index <= end
                     for word in seg.candidate_words
                 ]
             ),
@@ -468,7 +469,7 @@ def _context_process_chain(
             )
             built_word = _update_word_via_lexicon(built_word, lexi)
 
-            if built_word.cefr and built_word.cefr < leaner_level:
+            if built_word.cefr and built_word.cefr <= leaner_level:
                 continue
 
             seg.candidate_words.append(built_word)
@@ -477,26 +478,31 @@ def _context_process_chain(
         (
             "system",
             """You are an expert in linguistics and language learning. Your task is to analyze subtitle segments.
-Please perform the following tasks for an English learner at {leaner_level} CEFR level.
+Please perform the following tasks for a non-native English learner.
 
-**CRITICAL INSTRUCTION**: The learner is advanced. They already know common daily vocabulary.
-Your goal is to identify **only** content that helps them reach native-level proficiency.
+**CRITICAL INSTRUCTION**: The learner is at the {leaner_level} level.
+They are proficient in vocabulary at or below this level.
+Your goal is two-fold:
+1. **Learning**: Identify content challenging for their current level.
+2. **Comprehension**: Ensure they understand **specific or low-frequency vocabulary** crucial for the narrative, even if it is not "core" vocabulary.
 
 1. **Review and Evaluate Candidate Words:**
-    * **Goal**: Filter out simple words and correct any errors in lemma/POS/text.
+    * **Goal**: Filter out words that are easy, BUT **retain** rare or specific words needed for understanding.
     * **Action**: Return feedback items **ONLY** for words that:
         1. Should be **discarded** (too simple, trivial filler, profanity without cultural value). Set `should_keep` to `False`.
         2. Need **correction** (wrong lemma, POS, or text boundary). Set `should_keep` to `True` and provide correct values.
     * **Implicit Rule**: If a word is appropriate for the learner and has correct info, **DO NOT** include it in the output list.
-    * **Keep criteria**: Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
-    * **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words below {leaner_level} level.
+    * **Keep criteria**:
+        * Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
+        * **Specific/Concrete Vocabulary**: Keep low-frequency words (e.g., 'chamomile' or 'cavernous' for a B2 learner) that are rare but essential for visualizing the scene or understanding the plot. **Do NOT discard these just because they are rare.**
+    * **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words well below {leaner_level} level (unless they fit the 'Keep criteria').
 
 2. **Identify Missed Words:**
-    * Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners. This specifically includes:
-        * **Slang or informal expressions.**
-        * **Internet terms or modern colloquialisms.**
-        * **Words or phrases that require specific cultural background knowledge to understand.**
-        * **Any other words or phrases that are challenging.**
+    * Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners or for **plot comprehension**.
+    * **Targets**:
+        * **Slang, idioms, or modern colloquialisms.**
+        * **Low-frequency words** (e.g., 'shimmer', 'rugged') missed by the algorithm.
+        * **Words requiring cultural background.**
     * Avoid repeating words already listed in `candidate_words`.
     * Must exist in the exact form in `context_text`.
     * Provide lemma and POS.
@@ -690,7 +696,7 @@ def llm_process_chain(
     segments: SegmentList,
     shutdown_event: threading.Event,
     context_window: int = 30,
-    leaner_level: str = "C1",
+    learner_level: str = "C1",
     media_context: Context | None = None,
     translate_sentences: bool = False,
 ) -> SegmentList:
@@ -702,7 +708,7 @@ def llm_process_chain(
     :param segments: 字幕片段
     :param shutdown_event: 关闭事件
     :param context_window: 上下文窗口大小
-    :param leaner_level: 学习者的 CEFR 水平
+    :param learner_level: 学习者的 CEFR 水平
     :param media_context: 媒体信息
     :param translate_sentences: 是否翻译句子
     :returns: 更新后的字幕片段列表
@@ -726,7 +732,7 @@ def llm_process_chain(
             )
             segments_list.extend(
                 _context_process_chain(
-                    lexi, llm, context, start, end, leaner_level, media_name, translate_sentences
+                    lexi, llm, context, start, end, learner_level, media_name, translate_sentences
                 )
             )