mirror of
https://github.com/jxxghp/MoviePilot-Plugins.git
synced 2026-03-27 10:05:57 +00:00
feat(lexiannot): optimize prompts and subtitle extraction
This commit is contained in:
@@ -549,11 +549,12 @@
|
||||
"name": "美剧生词标注",
|
||||
"description": "根据CEFR等级,为英语影视剧标注高级词汇。",
|
||||
"labels": "英语",
|
||||
"version": "1.2.1",
|
||||
"version": "1.2.2",
|
||||
"icon": "LexiAnnot.png",
|
||||
"author": "wumode",
|
||||
"level": 1,
|
||||
"history": {
|
||||
"v1.2.2": "优化提示词",
|
||||
"v1.2.1": "改进字幕样式获取方法",
|
||||
"v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI",
|
||||
"v1.1.4": "优化字幕选择决策",
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
import json
|
||||
import queue
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
@@ -61,7 +60,7 @@ class LexiAnnot(_PluginBase):
|
||||
# 插件图标
|
||||
plugin_icon = "LexiAnnot.png"
|
||||
# 插件版本
|
||||
plugin_version = "1.2.1"
|
||||
plugin_version = "1.2.2"
|
||||
# 插件作者
|
||||
plugin_author = "wumode"
|
||||
# 作者主页
|
||||
@@ -163,10 +162,6 @@ class LexiAnnot(_PluginBase):
|
||||
self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0
|
||||
if self._delete_data:
|
||||
# 删除不再保存在数据库的数据
|
||||
self.del_data("cefr_lexicon")
|
||||
self.del_data("coca2k_lexicon")
|
||||
self.del_data("swear_words")
|
||||
self.del_data("lexicon_version")
|
||||
self.delete_data()
|
||||
self._delete_data = False
|
||||
self._loaded = False
|
||||
@@ -1064,15 +1059,6 @@ class LexiAnnot(_PluginBase):
|
||||
logger.error(f"词典 {lexicon_path} 删除失败: {e}")
|
||||
self._load_lexicon_from_local.cache_clear()
|
||||
|
||||
# 删除虚拟环境
|
||||
venv_dir = data_path / "venv_genai"
|
||||
if os.path.exists(venv_dir):
|
||||
try:
|
||||
shutil.rmtree(venv_dir)
|
||||
logger.info(f"虚拟环境 {venv_dir} 已删除")
|
||||
except Exception as e:
|
||||
logger.error(f"虚拟环境 {venv_dir} 删除失败: {e}")
|
||||
|
||||
# 删除任务记录
|
||||
with self._tasks_lock:
|
||||
self._tasks = {}
|
||||
@@ -1324,9 +1310,7 @@ class LexiAnnot(_PluginBase):
|
||||
|
||||
ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else "ffmpeg"
|
||||
eng_mark = ["en", "en-US", "eng", "en-GB", "english", "en-AU"]
|
||||
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(
|
||||
path, eng_mark, ffmpeg_path
|
||||
)
|
||||
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path)
|
||||
if not embedded_subtitles:
|
||||
return ProcessResult(
|
||||
status=TaskStatus.CANCELED, message="未找到嵌入式英文文本字幕"
|
||||
@@ -1345,22 +1329,14 @@ class LexiAnnot(_PluginBase):
|
||||
logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕")
|
||||
for embedded_subtitle in embedded_subtitles:
|
||||
if self._shutdown_event.is_set():
|
||||
return ProcessResult(
|
||||
status=TaskStatus.CANCELED, message="任务已取消"
|
||||
)
|
||||
ass_subtitle = SSAFile.from_string(
|
||||
embedded_subtitle["subtitle"], format_="ass"
|
||||
)
|
||||
return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
|
||||
ass_subtitle = SSAFile.from_string(embedded_subtitle["subtitle"], format_="ass")
|
||||
if embedded_subtitle.get("codec_id") == "S_TEXT/UTF8":
|
||||
ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle)
|
||||
ass_subtitle = self.__set_style(ass_subtitle)
|
||||
ass_subtitle, stat = self.process_subtitles(
|
||||
ass_subtitle, lexi, spacy_worker, mediainfo
|
||||
)
|
||||
ass_subtitle, stat = self.process_subtitles(ass_subtitle, lexi, spacy_worker, mediainfo)
|
||||
if self._shutdown_event.is_set():
|
||||
return ProcessResult(
|
||||
status=TaskStatus.CANCELED, message="任务已取消"
|
||||
)
|
||||
return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
|
||||
if ass_subtitle:
|
||||
try:
|
||||
ass_subtitle.save(str(ass_file))
|
||||
@@ -1810,7 +1786,7 @@ class LexiAnnot(_PluginBase):
|
||||
@staticmethod
|
||||
def _extract_subtitles_by_lang(
|
||||
video_path: str, lang: str | list = "en", ffmpeg: str = "ffmpeg"
|
||||
) -> Optional[List[Dict]]:
|
||||
) -> list[dict]:
|
||||
"""
|
||||
提取视频文件中的内嵌英文字幕,使用 MediaInfo 查找字幕流。
|
||||
"""
|
||||
@@ -1853,21 +1829,25 @@ class LexiAnnot(_PluginBase):
|
||||
}
|
||||
)
|
||||
if subtitles:
|
||||
return subtitles
|
||||
else:
|
||||
# remove outliers with abnormally short duration
|
||||
if len(subtitles) > 1:
|
||||
durations = [sub["duration"] for sub in subtitles if sub["duration"] > 0]
|
||||
if durations:
|
||||
avg_duration = sum(durations) / len(durations)
|
||||
subtitles = [
|
||||
sub for sub in subtitles if sub["duration"] >= avg_duration * 0.2
|
||||
]
|
||||
if not subtitles:
|
||||
logger.warn("未找到标记为英语的文本字幕流")
|
||||
return None
|
||||
|
||||
except FileNotFoundError:
|
||||
logger.error(f"找不到视频文件 '{video_path}'")
|
||||
return None
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"错误:提取字幕失败。\n错误信息:{e}")
|
||||
logger.error(f"FFmpeg 输出 (stderr):\n{e.stderr}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"使用 MediaInfo 提取字幕时发生错误:{e}")
|
||||
return None
|
||||
return subtitles
|
||||
|
||||
def _process_chain(
|
||||
self,
|
||||
@@ -1884,12 +1864,8 @@ class LexiAnnot(_PluginBase):
|
||||
:param spacy_worker: spaCy 分词器
|
||||
:returns: 处理后的字幕行列表
|
||||
"""
|
||||
simple_vocabulary = set(
|
||||
filter(
|
||||
lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
)
|
||||
)
|
||||
|
||||
simple_vocabulary = set(filter(lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]))
|
||||
learner_level = max(simple_vocabulary)
|
||||
model_temperature = float(self._model_temperature) if self._model_temperature else 0.3
|
||||
logger.info("通过 spaCy 分词...")
|
||||
for seg in segments:
|
||||
@@ -1927,7 +1903,7 @@ class LexiAnnot(_PluginBase):
|
||||
segments=segments,
|
||||
shutdown_event=self._shutdown_event,
|
||||
context_window=self._context_window,
|
||||
leaner_level=self._annot_level,
|
||||
leaner_level=learner_level,
|
||||
media_context=mediainfo,
|
||||
translate_sentences=self._sentence_translation
|
||||
)
|
||||
|
||||
@@ -406,6 +406,7 @@ def _context_process_chain(
|
||||
[
|
||||
f"- {word.text} (WORD_ID: {word.meta.word_id}, LEMMA: {word.lemma}, CEFR: {word.cefr}, POS: {word.pos})"
|
||||
for seg in segment_list
|
||||
if start <= seg.index <= end
|
||||
for word in seg.candidate_words
|
||||
]
|
||||
),
|
||||
@@ -468,7 +469,7 @@ def _context_process_chain(
|
||||
)
|
||||
|
||||
built_word = _update_word_via_lexicon(built_word, lexi)
|
||||
if built_word.cefr and built_word.cefr < leaner_level:
|
||||
if built_word.cefr and built_word.cefr <= leaner_level:
|
||||
continue
|
||||
seg.candidate_words.append(built_word)
|
||||
|
||||
@@ -477,26 +478,31 @@ def _context_process_chain(
|
||||
(
|
||||
"system",
|
||||
"""You are an expert in linguistics and language learning. Your task is to analyze subtitle segments.
|
||||
Please perform the following tasks for an English learner at {leaner_level} CEFR level.
|
||||
Please perform the following tasks for a non-native English learner.
|
||||
|
||||
**CRITICAL INSTRUCTION**: The learner is advanced. They already know common daily vocabulary.
|
||||
Your goal is to identify **only** content that helps them reach native-level proficiency.
|
||||
**CRITICAL INSTRUCTION**: The learner is at the {leaner_level} level.
|
||||
They are proficient in vocabulary at or below this level.
|
||||
Your goal is two-fold:
|
||||
1. **Learning**: Identify content challenging for their current level.
|
||||
2. **Comprehension**: Ensure they understand **specific or low-frequency vocabulary** crucial for the narrative, even if it is not "core" vocabulary.
|
||||
|
||||
1. **Review and Evaluate Candidate Words:**
|
||||
* **Goal**: Filter out simple words and correct any errors in lemma/POS/text.
|
||||
* **Goal**: Filter out words that are easy, BUT **retain** rare or specific words needed for understanding.
|
||||
* **Action**: Return feedback items **ONLY** for words that:
|
||||
1. Should be **discarded** (too simple, trivial filler, profanity without cultural value). Set `should_keep` to `False`.
|
||||
2. Need **correction** (wrong lemma, POS, or text boundary). Set `should_keep` to `True` and provide correct values.
|
||||
* **Implicit Rule**: If a word is appropriate for the learner and has correct info, **DO NOT** include it in the output list.
|
||||
* **Keep criteria**: Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
|
||||
* **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words below {leaner_level} level.
|
||||
* **Keep criteria**:
|
||||
* Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
|
||||
* **Specific/Concrete Vocabulary**: Keep low-frequency words (e.g., like 'chamomile', 'cavernous' for B2) that are rare but essential for visualizing the scene or understanding the plot. **Do NOT discard these just because they are rare.**
|
||||
* **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words well below {leaner_level} level (unless they fit the 'Keep criteria').
|
||||
|
||||
2. **Identify Missed Words:**
|
||||
* Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners. This specifically includes:
|
||||
* **Slang or informal expressions.**
|
||||
* **Internet terms or modern colloquialisms.**
|
||||
* **Words or phrases that require specific cultural background knowledge to understand.**
|
||||
* **Any other words or phrases that are challenging.**
|
||||
* Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners or for **plot comprehension**.
|
||||
* **Targets**:
|
||||
* **Slang, idioms, or modern colloquialisms.**
|
||||
* **Low-frequency words** (e.g., 'shimmer', 'rugged') missed by the algorithm.
|
||||
* **Words requiring cultural background.**
|
||||
* Avoid repeating words already listed in `candidate_words`.
|
||||
* Must exist in the exact form in `context_text`.
|
||||
* Provide lemma and POS.
|
||||
|
||||
Reference in New Issue
Block a user