feat(lexiannot): optimize prompts and subtitle extraction

2026-03-27 10:05:57 +00:00 · 2026-01-07 12:42:02 +08:00
parent 24b9c2ec29
commit 6dbf539d88
3 changed files with 40 additions and 57 deletions
--- a/package.v2.json
+++ b/package.v2.json
@@ -549,11 +549,12 @@
    "name": "美剧生词标注",
    "description": "根据CEFR等级，为英语影视剧标注高级词汇。",
    "labels": "英语",
-    "version": "1.2.1",
+    "version": "1.2.2",
    "icon": "LexiAnnot.png",
    "author": "wumode",
    "level": 1,
    "history": {
+      "v1.2.2": "优化提示词",
      "v1.2.1": "改进字幕样式获取方法",
      "v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI",
      "v1.1.4": "优化字幕选择决策",
--- a/plugins.v2/lexiannot/init.py
+++ b/plugins.v2/lexiannot/init.py
@@ -3,7 +3,6 @@ import os
 import json
 import queue
 import re
-import shutil
 import subprocess
 import sys
 import threading
@@ -61,7 +60,7 @@ class LexiAnnot(_PluginBase):
    # 插件图标
    plugin_icon = "LexiAnnot.png"
    # 插件版本
-    plugin_version = "1.2.1"
+    plugin_version = "1.2.2"
    # 插件作者
    plugin_author = "wumode"
    # 作者主页
@@ -163,10 +162,6 @@ class LexiAnnot(_PluginBase):
            self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0
        if self._delete_data:
            # 删除不再保存在数据库的数据
-            self.del_data("cefr_lexicon")
-            self.del_data("coca2k_lexicon")
-            self.del_data("swear_words")
-            self.del_data("lexicon_version")
            self.delete_data()
            self._delete_data = False
            self._loaded = False
@@ -1064,15 +1059,6 @@ class LexiAnnot(_PluginBase):
            logger.error(f"词典 {lexicon_path} 删除失败: {e}")
        self._load_lexicon_from_local.cache_clear()

-        # 删除虚拟环境
-        venv_dir = data_path / "venv_genai"
-        if os.path.exists(venv_dir):
-            try:
-                shutil.rmtree(venv_dir)
-                logger.info(f"虚拟环境 {venv_dir} 已删除")
-            except Exception as e:
-                logger.error(f"虚拟环境 {venv_dir} 删除失败: {e}")
-
        # 删除任务记录
        with self._tasks_lock:
            self._tasks = {}
@@ -1324,9 +1310,7 @@ class LexiAnnot(_PluginBase):

        ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else "ffmpeg"
        eng_mark = ["en", "en-US", "eng", "en-GB", "english", "en-AU"]
-        embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(
-            path, eng_mark, ffmpeg_path
-        )
+        embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path)
        if not embedded_subtitles:
            return ProcessResult(
                status=TaskStatus.CANCELED, message="未找到嵌入式英文文本字幕"
@@ -1345,22 +1329,14 @@ class LexiAnnot(_PluginBase):
            logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕")
            for embedded_subtitle in embedded_subtitles:
                if self._shutdown_event.is_set():
-                    return ProcessResult(
-                        status=TaskStatus.CANCELED, message="任务已取消"
-                    )
-                ass_subtitle = SSAFile.from_string(
-                    embedded_subtitle["subtitle"], format_="ass"
-                )
+                    return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
+                ass_subtitle = SSAFile.from_string(embedded_subtitle["subtitle"], format_="ass")
                if embedded_subtitle.get("codec_id") == "S_TEXT/UTF8":
                    ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle)
                ass_subtitle = self.__set_style(ass_subtitle)
-                ass_subtitle, stat = self.process_subtitles(
-                    ass_subtitle, lexi, spacy_worker, mediainfo
-                )
+                ass_subtitle, stat = self.process_subtitles(ass_subtitle, lexi, spacy_worker, mediainfo)
                if self._shutdown_event.is_set():
-                    return ProcessResult(
-                        status=TaskStatus.CANCELED, message="任务已取消"
-                    )
+                    return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
                if ass_subtitle:
                    try:
                        ass_subtitle.save(str(ass_file))
@@ -1810,7 +1786,7 @@ class LexiAnnot(_PluginBase):
    @staticmethod
    def _extract_subtitles_by_lang(
            video_path: str, lang: str | list = "en", ffmpeg: str = "ffmpeg"
-    ) -> Optional[List[Dict]]:
+    ) -> list[dict]:
        """
        提取视频文件中的内嵌英文字幕，使用 MediaInfo 查找字幕流。
        """
@@ -1853,21 +1829,25 @@ class LexiAnnot(_PluginBase):
                            }
                        )
            if subtitles:
-                return subtitles
-            else:
+                # remove outliers with abnormally short duration
+                if len(subtitles) > 1:
+                    durations = [sub["duration"] for sub in subtitles if sub["duration"] > 0]
+                    if durations:
+                        avg_duration = sum(durations) / len(durations)
+                        subtitles = [
+                            sub for sub in subtitles if sub["duration"] >= avg_duration * 0.2
+                        ]
+            if not subtitles:
                logger.warn("未找到标记为英语的文本字幕流")
-                return None

        except FileNotFoundError:
            logger.error(f"找不到视频文件 '{video_path}'")
-            return None
        except subprocess.CalledProcessError as e:
            logger.error(f"错误：提取字幕失败。\n错误信息：{e}")
            logger.error(f"FFmpeg 输出 (stderr):\n{e.stderr}")
-            return None
        except Exception as e:
            logger.error(f"使用 MediaInfo 提取字幕时发生错误：{e}")
-            return None
+        return subtitles

    def _process_chain(
            self,
@@ -1884,12 +1864,8 @@ class LexiAnnot(_PluginBase):
        :param spacy_worker: spaCy 分词器
        :returns: 处理后的字幕行列表
        """
-        simple_vocabulary = set(
-            filter(
-                lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]
-            )
-        )
-
+        simple_vocabulary = set(filter(lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]))
+        learner_level = max(simple_vocabulary)
        model_temperature = float(self._model_temperature) if self._model_temperature else 0.3
        logger.info("通过 spaCy 分词...")
        for seg in segments:
@@ -1927,7 +1903,7 @@ class LexiAnnot(_PluginBase):
                segments=segments,
                shutdown_event=self._shutdown_event,
                context_window=self._context_window,
-                leaner_level=self._annot_level,
+                leaner_level=learner_level,
                media_context=mediainfo,
                translate_sentences=self._sentence_translation
            )
--- a/plugins.v2/lexiannot/pipeline.py
+++ b/plugins.v2/lexiannot/pipeline.py
@@ -406,6 +406,7 @@ def _context_process_chain(
                [
                    f"- {word.text} (WORD_ID: {word.meta.word_id}, LEMMA: {word.lemma}, CEFR: {word.cefr}, POS: {word.pos})"
                    for seg in segment_list
+                    if start <= seg.index <= end
                    for word in seg.candidate_words
                ]
            ),
@@ -468,7 +469,7 @@ def _context_process_chain(
                )

                built_word = _update_word_via_lexicon(built_word, lexi)
-                if built_word.cefr and built_word.cefr < leaner_level:
+                if built_word.cefr and built_word.cefr <= leaner_level:
                    continue
                seg.candidate_words.append(built_word)

@@ -477,26 +478,31 @@ def _context_process_chain(
            (
                "system",
                """You are an expert in linguistics and language learning. Your task is to analyze subtitle segments.
-Please perform the following tasks for an English learner at {leaner_level} CEFR level.
+Please perform the following tasks for a non-native English learner.

-**CRITICAL INSTRUCTION**: The learner is advanced. They already know common daily vocabulary.
-Your goal is to identify **only** content that helps them reach native-level proficiency.
+**CRITICAL INSTRUCTION**: The learner is at the {leaner_level} level.
+They are proficient in vocabulary at or below this level.
+Your goal is two-fold:
+1.  **Learning**: Identify content challenging for their current level.
+2.  **Comprehension**: Ensure they understand **specific or low-frequency vocabulary** crucial for the narrative, even if it is not "core" vocabulary.

 1.  **Review and Evaluate Candidate Words:**
-    *   **Goal**: Filter out simple words and correct any errors in lemma/POS/text.
+    *   **Goal**: Filter out words that are easy, BUT **retain** rare or specific words needed for understanding.
    *   **Action**: Return feedback items **ONLY** for words that:
        1.  Should be **discarded** (too simple, trivial filler, profanity without cultural value). Set `should_keep` to `False`.
        2.  Need **correction** (wrong lemma, POS, or text boundary). Set `should_keep` to `True` and provide correct values.
    *   **Implicit Rule**: If a word is appropriate for the learner and has correct info, **DO NOT** include it in the output list.
-    *   **Keep criteria**: Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
-    *   **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words below {leaner_level} level.
+    *   **Keep criteria**:
+        *   Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
+        *   **Specific/Concrete Vocabulary**: Keep low-frequency words (e.g., 'chamomile', 'cavernous' for B2) that are rare but essential for visualizing the scene or understanding the plot. **Do NOT discard these just because they are rare.**
+    *   **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words well below {leaner_level} level (unless they fit the 'Keep criteria').

 2.  **Identify Missed Words:**
-    *   Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners. This specifically includes:
-        *   **Slang or informal expressions.**
-        *   **Internet terms or modern colloquialisms.**
-        *   **Words or phrases that require specific cultural background knowledge to understand.**
-        *   **Any other words or phrases that are challenging.**
+    *   Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners or for **plot comprehension**.
+    *   **Targets**:
+        *   **Slang, idioms, or modern colloquialisms.**
+        *   **Low-frequency words** (e.g., 'shimmer', 'rugged') missed by the algorithm.
+        *   **Words requiring cultural background.**
    *   Avoid repeating words already listed in `candidate_words`.
    *   Must exist in the exact form in `context_text`.
    *   Provide lemma and POS.