Merge pull request #966 from wumode/lexiannot

This commit is contained in:
jxxghp
2026-01-07 16:40:08 +08:00
committed by GitHub
4 changed files with 64 additions and 85 deletions

View File

@@ -470,11 +470,12 @@
"name": "IMDb源",
"description": "让探索推荐和媒体识别支持IMDb数据源。",
"labels": "探索",
"version": "1.6.5",
"version": "1.6.6",
"icon": "IMDb_IOS-OSX_App.png",
"author": "wumode",
"level": 1,
"history": {
"v1.6.6": "优化主页组件链接跳转",
"v1.6.5": "仪表盘组件支持图片缓存",
"v1.6.4": "为元数据增加背景图",
"v1.6.3": "优化媒体识别速度; 适配 Pydantic V2 (主程序版本需高于 2.8.1-1)",
@@ -549,11 +550,12 @@
"name": "美剧生词标注",
"description": "根据CEFR等级为英语影视剧标注高级词汇。",
"labels": "英语",
"version": "1.2.1",
"version": "1.2.2",
"icon": "LexiAnnot.png",
"author": "wumode",
"level": 1,
"history": {
"v1.2.2": "优化提示词",
"v1.2.1": "改进字幕样式获取方法",
"v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI",
"v1.1.4": "优化字幕选择决策",

View File

@@ -34,7 +34,7 @@ class ImdbSource(_PluginBase):
# 插件图标
plugin_icon = "IMDb_IOS-OSX_App.png"
# 插件版本
plugin_version = "1.6.5"
plugin_version = "1.6.6"
# 插件作者
plugin_author = "wumode"
# 作者主页
@@ -285,28 +285,19 @@ class ImdbSource(_PluginBase):
},
'content': [
{
'component': 'RouterLink',
'component': 'h1',
'props': {
'to': mp_url,
'class': 'no-underline'
'class': 'mb-1 text-white text-shadow font-extrabold text-2xl line-clamp-2 overflow-hidden text-ellipsis ...'
},
'content': [
{
'component': 'h1',
'props': {
'class': 'mb-1 text-white text-shadow font-extrabold text-2xl line-clamp-2 overflow-hidden text-ellipsis ...'
},
'html': f"{entry.name} <span class='text-base font-normal'>{year}</span>",
},
{
'component': 'span',
'props': {
'class': 'text-shadow line-clamp-2 overflow-hidden text-ellipsis ...'
},
'html': imdb_title.plot_text,
}
]
'html': f"{entry.name} <span class='text-base font-normal'>{year}</span>",
},
{
'component': 'span',
'props': {
'class': 'text-shadow line-clamp-2 overflow-hidden text-ellipsis ...'
},
'html': imdb_title.plot_text,
}
]
}
]
@@ -392,7 +383,8 @@ class ImdbSource(_PluginBase):
{
'component': 'a',
'props': {
'href': f'#{mp_url}',
'href': f"https://www.imdb.com/title/{entry.ttconst}",
'target': '_blank',
'class': 'no-underline w-100',
'style': 'display: flex; justify-content: center;'
},
@@ -454,15 +446,17 @@ class ImdbSource(_PluginBase):
{
'component': 'a',
'props': {
'href': f"https://www.imdb.com/title/{entry.ttconst}",
'target': '_blank',
'href': f'#{mp_url}',
'rel': 'noopener noreferrer',
'class': 'text-h4 font-weight-bold mb-2 d-flex text-white align-center',
},
'content': [
{
'component': 'span',
'html': f"{entry.name}"
'html': f"{entry.name}",
'props': {
'class': 'line-clamp-2 overflow-hidden',
}
},
{
'component': 'v-icon',

View File

@@ -3,7 +3,6 @@ import os
import json
import queue
import re
import shutil
import subprocess
import sys
import threading
@@ -61,7 +60,7 @@ class LexiAnnot(_PluginBase):
# 插件图标
plugin_icon = "LexiAnnot.png"
# 插件版本
plugin_version = "1.2.1"
plugin_version = "1.2.2"
# 插件作者
plugin_author = "wumode"
# 作者主页
@@ -163,10 +162,6 @@ class LexiAnnot(_PluginBase):
self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0
if self._delete_data:
# 删除不再保存在数据库的数据
self.del_data("cefr_lexicon")
self.del_data("coca2k_lexicon")
self.del_data("swear_words")
self.del_data("lexicon_version")
self.delete_data()
self._delete_data = False
self._loaded = False
@@ -1064,15 +1059,6 @@ class LexiAnnot(_PluginBase):
logger.error(f"词典 {lexicon_path} 删除失败: {e}")
self._load_lexicon_from_local.cache_clear()
# 删除虚拟环境
venv_dir = data_path / "venv_genai"
if os.path.exists(venv_dir):
try:
shutil.rmtree(venv_dir)
logger.info(f"虚拟环境 {venv_dir} 已删除")
except Exception as e:
logger.error(f"虚拟环境 {venv_dir} 删除失败: {e}")
# 删除任务记录
with self._tasks_lock:
self._tasks = {}
@@ -1324,9 +1310,7 @@ class LexiAnnot(_PluginBase):
ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else "ffmpeg"
eng_mark = ["en", "en-US", "eng", "en-GB", "english", "en-AU"]
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(
path, eng_mark, ffmpeg_path
)
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path)
if not embedded_subtitles:
return ProcessResult(
status=TaskStatus.CANCELED, message="未找到嵌入式英文文本字幕"
@@ -1345,22 +1329,14 @@ class LexiAnnot(_PluginBase):
logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕")
for embedded_subtitle in embedded_subtitles:
if self._shutdown_event.is_set():
return ProcessResult(
status=TaskStatus.CANCELED, message="任务已取消"
)
ass_subtitle = SSAFile.from_string(
embedded_subtitle["subtitle"], format_="ass"
)
return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
ass_subtitle = SSAFile.from_string(embedded_subtitle["subtitle"], format_="ass")
if embedded_subtitle.get("codec_id") == "S_TEXT/UTF8":
ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle)
ass_subtitle = self.__set_style(ass_subtitle)
ass_subtitle, stat = self.process_subtitles(
ass_subtitle, lexi, spacy_worker, mediainfo
)
ass_subtitle, stat = self.process_subtitles(ass_subtitle, lexi, spacy_worker, mediainfo)
if self._shutdown_event.is_set():
return ProcessResult(
status=TaskStatus.CANCELED, message="任务已取消"
)
return ProcessResult(status=TaskStatus.CANCELED, message="任务已取消")
if ass_subtitle:
try:
ass_subtitle.save(str(ass_file))
@@ -1810,7 +1786,7 @@ class LexiAnnot(_PluginBase):
@staticmethod
def _extract_subtitles_by_lang(
video_path: str, lang: str | list = "en", ffmpeg: str = "ffmpeg"
) -> Optional[List[Dict]]:
) -> list[dict]:
"""
提取视频文件中的内嵌英文字幕,使用 MediaInfo 查找字幕流。
"""
@@ -1853,21 +1829,25 @@ class LexiAnnot(_PluginBase):
}
)
if subtitles:
return subtitles
else:
# remove outliers with abnormally short duration
if len(subtitles) > 1:
durations = [sub["duration"] for sub in subtitles if sub["duration"] > 0]
if durations:
avg_duration = sum(durations) / len(durations)
subtitles = [
sub for sub in subtitles if sub["duration"] >= avg_duration * 0.2
]
if not subtitles:
logger.warn("未找到标记为英语的文本字幕流")
return None
except FileNotFoundError:
logger.error(f"找不到视频文件 '{video_path}'")
return None
except subprocess.CalledProcessError as e:
logger.error(f"错误:提取字幕失败。\n错误信息:{e}")
logger.error(f"FFmpeg 输出 (stderr):\n{e.stderr}")
return None
except Exception as e:
logger.error(f"使用 MediaInfo 提取字幕时发生错误:{e}")
return None
return subtitles
def _process_chain(
self,
@@ -1884,12 +1864,9 @@ class LexiAnnot(_PluginBase):
:param spacy_worker: spaCy 分词器
:returns: 处理后的字幕行列表
"""
simple_vocabulary = set(
filter(
lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"]
)
)
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
simple_vocabulary = set(filter(lambda x: x < self._annot_level, CEFR_LEVELS))
learner_level = max(simple_vocabulary)
model_temperature = float(self._model_temperature) if self._model_temperature else 0.3
logger.info("通过 spaCy 分词...")
for seg in segments:
@@ -1927,7 +1904,7 @@ class LexiAnnot(_PluginBase):
segments=segments,
shutdown_event=self._shutdown_event,
context_window=self._context_window,
leaner_level=self._annot_level,
leaner_level=learner_level,
media_context=mediainfo,
translate_sentences=self._sentence_translation
)

View File

@@ -406,6 +406,7 @@ def _context_process_chain(
[
f"- {word.text} (WORD_ID: {word.meta.word_id}, LEMMA: {word.lemma}, CEFR: {word.cefr}, POS: {word.pos})"
for seg in segment_list
if start <= seg.index <= end
for word in seg.candidate_words
]
),
@@ -468,7 +469,7 @@ def _context_process_chain(
)
built_word = _update_word_via_lexicon(built_word, lexi)
if built_word.cefr and built_word.cefr < leaner_level:
if built_word.cefr and built_word.cefr <= leaner_level:
continue
seg.candidate_words.append(built_word)
@@ -477,26 +478,31 @@ def _context_process_chain(
(
"system",
"""You are an expert in linguistics and language learning. Your task is to analyze subtitle segments.
Please perform the following tasks for an English learner at {leaner_level} CEFR level.
Please perform the following tasks for a non-native English learner.
**CRITICAL INSTRUCTION**: The learner is advanced. They already know common daily vocabulary.
Your goal is to identify **only** content that helps them reach native-level proficiency.
**CRITICAL INSTRUCTION**: The learner is at the {leaner_level} level.
They are proficient in vocabulary at or below this level.
Your goal is two-fold:
1. **Learning**: Identify content challenging for their current level.
2. **Comprehension**: Ensure they understand **specific or low-frequency vocabulary** crucial for the narrative, even if it is not "core" vocabulary.
1. **Review and Evaluate Candidate Words:**
* **Goal**: Filter out simple words and correct any errors in lemma/POS/text.
* **Goal**: Filter out words that are easy, BUT **retain** rare or specific words needed for understanding.
* **Action**: Return feedback items **ONLY** for words that:
1. Should be **discarded** (too simple, trivial filler, profanity without cultural value). Set `should_keep` to `False`.
2. Need **correction** (wrong lemma, POS, or text boundary). Set `should_keep` to `True` and provide correct values.
* **Implicit Rule**: If a word is appropriate for the learner and has correct info, **DO NOT** include it in the output list.
* **Keep criteria**: Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
* **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words below {leaner_level} level.
* **Keep criteria**:
* Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense.
* **Specific/Concrete Vocabulary**: Keep low-frequency words (e.g., like 'chamomile', 'cavernous' for B2) that are rare but essential for visualizing the scene or understanding the plot. **Do NOT discard these just because they are rare.**
* **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words well below {leaner_level} level (unless they fit the 'Keep criteria').
2. **Identify Missed Words:**
* Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners. This specifically includes:
* **Slang or informal expressions.**
* **Internet terms or modern colloquialisms.**
* **Words or phrases that require specific cultural background knowledge to understand.**
* **Any other words or phrases that are challenging.**
* Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners or for **plot comprehension**.
* **Targets**:
* **Slang, idioms, or modern colloquialisms.**
* **Low-frequency words** (e.g., 'shimmer', 'rugged') missed by the algorithm.
* **Words requiring cultural background.**
* Avoid repeating words already listed in `candidate_words`.
* Must exist in the exact form in `context_text`.
* Provide lemma and POS.
@@ -690,7 +696,7 @@ def llm_process_chain(
segments: SegmentList,
shutdown_event: threading.Event,
context_window: int = 30,
leaner_level: str = "C1",
learner_level: str = "C1",
media_context: Context | None = None,
translate_sentences: bool = False,
) -> SegmentList:
@@ -702,7 +708,7 @@ def llm_process_chain(
:param segments: 字幕片段
:param shutdown_event: 关闭事件
:param context_window: 上下文窗口大小
:param leaner_level: 学习者的 CEFR 水平
:param learner_level: 学习者的 CEFR 水平
:param media_context: 媒体信息
:param translate_sentences: 是否翻译句子
:returns: 更新后的字幕片段列表
@@ -726,7 +732,7 @@ def llm_process_chain(
)
segments_list.extend(
_context_process_chain(
lexi, llm, context, start, end, leaner_level, media_name, translate_sentences
lexi, llm, context, start, end, learner_level, media_name, translate_sentences
)
)