mirror of
https://github.com/jxxghp/MoviePilot-Plugins.git
synced 2026-03-27 10:05:57 +00:00
Merge pull request #966 from wumode/lexiannot
This commit is contained in:
@@ -470,11 +470,12 @@
|
||||
"name": "IMDb源",
|
||||
"description": "让探索,推荐和媒体识别支持IMDb数据源。",
|
||||
"labels": "探索",
|
||||
"version": "1.6.5",
|
||||
"version": "1.6.6",
|
||||
"icon": "IMDb_IOS-OSX_App.png",
|
||||
"author": "wumode",
|
||||
"level": 1,
|
||||
"history": {
|
||||
"v1.6.6": "优化主页组件链接跳转",
|
||||
"v1.6.5": "仪表盘组件支持图片缓存",
|
||||
"v1.6.4": "为元数据增加背景图",
|
||||
"v1.6.3": "优化媒体识别速度; 适配 Pydantic V2 (主程序版本需高于 2.8.1-1)",
|
||||
@@ -549,11 +550,12 @@
|
||||
"name": "美剧生词标注",
|
||||
"description": "根据CEFR等级,为英语影视剧标注高级词汇。",
|
||||
"labels": "英语",
|
||||
"version": "1.2.1",
|
||||
"version": "1.2.2",
|
||||
"icon": "LexiAnnot.png",
|
||||
"author": "wumode",
|
||||
"level": 1,
|
||||
"history": {
|
||||
"v1.2.2": "优化提示词",
|
||||
"v1.2.1": "改进字幕样式获取方法",
|
||||
"v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI",
|
||||
"v1.1.4": "优化字幕选择决策",
|
||||
|
||||
@@ -34,7 +34,7 @@ class ImdbSource(_PluginBase):
|
||||
# 插件图标
|
||||
plugin_icon = "IMDb_IOS-OSX_App.png"
|
||||
# 插件版本
|
||||
plugin_version = "1.6.5"
|
||||
plugin_version = "1.6.6"
|
||||
# 插件作者
|
||||
plugin_author = "wumode"
|
||||
# 作者主页
|
||||
@@ -285,28 +285,19 @@ class ImdbSource(_PluginBase):
|
||||
},
|
||||
'content': [
|
||||
{
|
||||
'component': 'RouterLink',
|
||||
'component': 'h1',
|
||||
'props': {
|
||||
'to': mp_url,
|
||||
'class': 'no-underline'
|
||||
'class': 'mb-1 text-white text-shadow font-extrabold text-2xl line-clamp-2 overflow-hidden text-ellipsis ...'
|
||||
},
|
||||
'content': [
|
||||
{
|
||||
'component': 'h1',
|
||||
'props': {
|
||||
'class': 'mb-1 text-white text-shadow font-extrabold text-2xl line-clamp-2 overflow-hidden text-ellipsis ...'
|
||||
},
|
||||
'html': f"{entry.name} <span class='text-base font-normal'>{year}</span>",
|
||||
},
|
||||
{
|
||||
'component': 'span',
|
||||
'props': {
|
||||
'class': 'text-shadow line-clamp-2 overflow-hidden text-ellipsis ...'
|
||||
},
|
||||
'html': imdb_title.plot_text,
|
||||
}
|
||||
]
|
||||
'html': f"{entry.name} <span class='text-base font-normal'>{year}</span>",
|
||||
},
|
||||
{
|
||||
'component': 'span',
|
||||
'props': {
|
||||
'class': 'text-shadow line-clamp-2 overflow-hidden text-ellipsis ...'
|
||||
},
|
||||
'html': imdb_title.plot_text,
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
@@ -392,7 +383,8 @@ class ImdbSource(_PluginBase):
|
||||
{
|
||||
'component': 'a',
|
||||
'props': {
|
||||
'href': f'#{mp_url}',
|
||||
'href': f"https://www.imdb.com/title/{entry.ttconst}",
|
||||
'target': '_blank',
|
||||
'class': 'no-underline w-100',
|
||||
'style': 'display: flex; justify-content: center;'
|
||||
},
|
||||
@@ -454,15 +446,17 @@ class ImdbSource(_PluginBase):
|
||||
{
|
||||
'component': 'a',
|
||||
'props': {
|
||||
'href': f"https://www.imdb.com/title/{entry.ttconst}",
|
||||
'target': '_blank',
|
||||
'href': f'#{mp_url}',
|
||||
'rel': 'noopener noreferrer',
|
||||
'class': 'text-h4 font-weight-bold mb-2 d-flex text-white align-center',
|
||||
},
|
||||
'content': [
|
||||
{
|
||||
'component': 'span',
|
||||
'html': f"{entry.name}"
|
||||
'html': f"{entry.name}",
|
||||
'props': {
|
||||
'class': 'line-clamp-2 overflow-hidden',
|
||||
}
|
||||
},
|
||||
{
|
||||
'component': 'v-icon',
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
import json
|
||||
import queue
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
@@ -61,7 +60,7 @@ class LexiAnnot(_PluginBase):
|
||||
# 插件图标
|
||||
plugin_icon = "LexiAnnot.png"
|
||||
# 插件版本
|
||||
plugin_version = "1.2.1"
|
||||
plugin_version = "1.2.2"
|
||||
# 插件作者
|
||||
plugin_author = "wumode"
|
||||
# 作者主页
|
||||
@@ -163,10 +162,6 @@ class LexiAnnot(_PluginBase):
|
||||
self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0
|
||||
if self._delete_data:
|
||||
# 删除不再保存在数据库的数据
|
||||
self.del_data("cefr_lexicon")
|
||||
self.del_data("coca2k_lexicon")
|
||||
self.del_data("swear_words")
|
||||
self.del_data("lexicon_version")
|
||||
self.delete_data()
|
||||
self._delete_data = False
|
||||
self._loaded = False
|
||||
@@ -1064,15 +1059,6 @@ class LexiAnnot(_PluginBase):
|
||||
logger.error(f"词典 {lexicon_path} 删除失败: {e}")
|
||||
self._load_lexicon_from_local.cache_clear()
|
||||
|
||||
# 删除虚拟环境
|
||||
venv_dir = data_path / "venv_genai"
|
||||
if os.path.exists(venv_dir):
|
||||
try:
|
||||
shutil.rmtree(venv_dir)
|
||||
logger.info(f"虚拟环境 {venv_dir} 已删除")
|
||||
except Exception as e:
|
||||
logger.error(f"虚拟环境 {venv_dir} 删除失败: {e}")
|
||||
|
||||
# 删除任务记录
|
||||
with self._tasks_lock:
|
||||
self._tasks = {}
|
||||
@@ -1324,9 +1310,7 @@ class LexiAnnot(_PluginBase):
|
||||
|
||||
ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else "ffmpeg"
|
||||
eng_mark = ["en", "en-US", "eng", "en-GB", "english", "en-AU"]
|
||||
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(
|
||||
path, eng_mark, ffmpeg_path
|
||||
)
|
||||
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path)
|
||||
if not embedded_subtitles:
|
||||
return ProcessResult(
|
||||
status=TaskStatus.CANCELED, message="未找到嵌入式英文文本字幕"
|
||||
@@ -1345,22 +1329,14 @@ class LexiAnnot(_PluginBase):
|
||||
logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕")
|
||||
for embedded_subtitle in embedded_subtitles:
|
||||
if self._shutdown_event.is_set():
|
||||
return ProcessResult(
|
||||
status=TaskStatus.CANCELED, message="任务已取消"
|
||||
)
|
||||
ass_subtitle = SSAFile.from_string(
|
||||
embedded_subtitle["subtitle"], format_="ass"
|
||||
)
|
||||
return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
|
||||
ass_subtitle = SSAFile.from_string(embedded_subtitle["subtitle"], format_="ass")
|
||||
if embedded_subtitle.get("codec_id") == "S_TEXT/UTF8":
|
||||
ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle)
|
||||
ass_subtitle = self.__set_style(ass_subtitle)
|
||||
ass_subtitle, stat = self.process_subtitles(
|
||||
ass_subtitle, lexi, spacy_worker, mediainfo
|
||||
)
|
||||
ass_subtitle, stat = self.process_subtitles(ass_subtitle, lexi, spacy_worker, mediainfo)
|
||||
if self._shutdown_event.is_set():
|
||||
return ProcessResult(
|
||||
status=TaskStatus.CANCELED, message="任务已取消"
|
||||
)
|
||||
return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
|
||||
if ass_subtitle:
|
||||
try:
|
||||
ass_subtitle.save(str(ass_file))
|
||||
@@ -1810,7 +1786,7 @@ class LexiAnnot(_PluginBase):
|
||||
@staticmethod
|
||||
def _extract_subtitles_by_lang(
|
||||
video_path: str, lang: str | list = "en", ffmpeg: str = "ffmpeg"
|
||||
) -> Optional[List[Dict]]:
|
||||
) -> list[dict]:
|
||||
"""
|
||||
提取视频文件中的内嵌英文字幕,使用 MediaInfo 查找字幕流。
|
||||
"""
|
||||
@@ -1853,21 +1829,25 @@ class LexiAnnot(_PluginBase):
|
||||
}
|
||||
)
|
||||
if subtitles:
|
||||
return subtitles
|
||||
else:
|
||||
# remove outliers with abnormally short duration
|
||||
if len(subtitles) > 1:
|
||||
durations = [sub["duration"] for sub in subtitles if sub["duration"] > 0]
|
||||
if durations:
|
||||
avg_duration = sum(durations) / len(durations)
|
||||
subtitles = [
|
||||
sub for sub in subtitles if sub["duration"] >= avg_duration * 0.2
|
||||
]
|
||||
if not subtitles:
|
||||
logger.warn("未找到标记为英语的文本字幕流")
|
||||
return None
|
||||
|
||||
except FileNotFoundError:
|
||||
logger.error(f"找不到视频文件 '{video_path}'")
|
||||
return None
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"错误:提取字幕失败。\n错误信息:{e}")
|
||||
logger.error(f"FFmpeg 输出 (stderr):\n{e.stderr}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"使用 MediaInfo 提取字幕时发生错误:{e}")
|
||||
return None
|
||||
return subtitles
|
||||
|
||||
def _process_chain(
|
||||
self,
|
||||
@@ -1884,12 +1864,9 @@ class LexiAnnot(_PluginBase):
|
||||
:param spacy_worker: spaCy 分词器
|
||||
:returns: 处理后的字幕行列表
|
||||
"""
|
||||
simple_vocabulary = set(
|
||||
filter(
|
||||
lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
)
|
||||
)
|
||||
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
simple_vocabulary = set(filter(lambda x: x < self._annot_level, CEFR_LEVELS))
|
||||
learner_level = max(simple_vocabulary)
|
||||
model_temperature = float(self._model_temperature) if self._model_temperature else 0.3
|
||||
logger.info("通过 spaCy 分词...")
|
||||
for seg in segments:
|
||||
@@ -1927,7 +1904,7 @@ class LexiAnnot(_PluginBase):
|
||||
segments=segments,
|
||||
shutdown_event=self._shutdown_event,
|
||||
context_window=self._context_window,
|
||||
leaner_level=self._annot_level,
|
||||
leaner_level=learner_level,
|
||||
media_context=mediainfo,
|
||||
translate_sentences=self._sentence_translation
|
||||
)
|
||||
|
||||
@@ -406,6 +406,7 @@ def _context_process_chain(
|
||||
[
|
||||
f"- {word.text} (WORD_ID: {word.meta.word_id}, LEMMA: {word.lemma}, CEFR: {word.cefr}, POS: {word.pos})"
|
||||
for seg in segment_list
|
||||
if start <= seg.index <= end
|
||||
for word in seg.candidate_words
|
||||
]
|
||||
),
|
||||
@@ -468,7 +469,7 @@ def _context_process_chain(
|
||||
)
|
||||
|
||||
built_word = _update_word_via_lexicon(built_word, lexi)
|
||||
if built_word.cefr and built_word.cefr < leaner_level:
|
||||
if built_word.cefr and built_word.cefr <= leaner_level:
|
||||
continue
|
||||
seg.candidate_words.append(built_word)
|
||||
|
||||
@@ -477,26 +478,31 @@ def _context_process_chain(
|
||||
(
|
||||
"system",
|
||||
"""You are an expert in linguistics and language learning. Your task is to analyze subtitle segments.
|
||||
Please perform the following tasks for an English learner at {leaner_level} CEFR level.
|
||||
Please perform the following tasks for an non-native English learner.
|
||||
|
||||
**CRITICAL INSTRUCTION**: The learner is advanced. They already know common daily vocabulary.
|
||||
Your goal is to identify **only** content that helps them reach native-level proficiency.
|
||||
**CRITICAL INSTRUCTION**: The learner is at the {leaner_level} level.
|
||||
They are proficient in vocabulary at or below this level.
|
||||
Your goal is two-fold:
|
||||
1. **Learning**: Identify content challenging for their current level.
|
||||
2. **Comprehension**: Ensure they understand **specific or low-frequency vocabulary** crucial for the narrative, even if it is not "core" vocabulary.
|
||||
|
||||
1. **Review and Evaluate Candidate Words:**
|
||||
* **Goal**: Filter out simple words and correct any errors in lemma/POS/text.
|
||||
* **Goal**: Filter out words that are easy, BUT **retain** rare or specific words needed for understanding.
|
||||
* **Action**: Return feedback items **ONLY** for words that:
|
||||
1. Should be **discarded** (too simple, trivial filler, profanity without cultural value). Set `should_keep` to `False`.
|
||||
2. Need **correction** (wrong lemma, POS, or text boundary). Set `should_keep` to `True` and provide correct values.
|
||||
* **Implicit Rule**: If a word is appropriate for the learner and has correct info, **DO NOT** include it in the output list.
|
||||
* **Keep criteria**: Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
|
||||
* **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words below {leaner_level} level.
|
||||
* **Keep criteria**:
|
||||
* Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
|
||||
* **Specific/Concrete Vocabulary**: Keep low-frequency words (e.g., like 'chamomile', 'cavernous' for B2) that are rare but essential for visualizing the scene or understanding the plot. **Do NOT discard these just because they are rare.**
|
||||
* **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words well below {leaner_level} level (unless they fit the 'Keep criteria').
|
||||
|
||||
2. **Identify Missed Words:**
|
||||
* Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners. This specifically includes:
|
||||
* **Slang or informal expressions.**
|
||||
* **Internet terms or modern colloquialisms.**
|
||||
* **Words or phrases that require specific cultural background knowledge to understand.**
|
||||
* **Any other words or phrases that are challenging.**
|
||||
* Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners or for **plot comprehension**.
|
||||
* **Targets**:
|
||||
* **Slang, idioms, or modern colloquialisms.**
|
||||
* **Low-frequency words** (e.g., 'shimmer', 'rugged') missed by the algorithm.
|
||||
* **Words requiring cultural background.**
|
||||
* Avoid repeating words already listed in `candidate_words`.
|
||||
* Must exist in the exact form in `context_text`.
|
||||
* Provide lemma and POS.
|
||||
@@ -690,7 +696,7 @@ def llm_process_chain(
|
||||
segments: SegmentList,
|
||||
shutdown_event: threading.Event,
|
||||
context_window: int = 30,
|
||||
leaner_level: str = "C1",
|
||||
learner_level: str = "C1",
|
||||
media_context: Context | None = None,
|
||||
translate_sentences: bool = False,
|
||||
) -> SegmentList:
|
||||
@@ -702,7 +708,7 @@ def llm_process_chain(
|
||||
:param segments: 字幕片段
|
||||
:param shutdown_event: 关闭事件
|
||||
:param context_window: 上下文窗口大小
|
||||
:param leaner_level: 学习者的 CEFR 水平
|
||||
:param learner_level: 学习者的 CEFR 水平
|
||||
:param media_context: 媒体信息
|
||||
:param translate_sentences: 是否翻译句子
|
||||
:returns: 更新后的字幕片段列表
|
||||
@@ -726,7 +732,7 @@ def llm_process_chain(
|
||||
)
|
||||
segments_list.extend(
|
||||
_context_process_chain(
|
||||
lexi, llm, context, start, end, leaner_level, media_name, translate_sentences
|
||||
lexi, llm, context, start, end, learner_level, media_name, translate_sentences
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user