From 00c65a098384e91a007d7daebf0519fee8cbc8d8 Mon Sep 17 00:00:00 2001 From: wumode Date: Wed, 10 Dec 2025 21:22:29 +0800 Subject: [PATCH] feat(lexiannot): Integrate LLM for advanced vocabulary processing --- package.v2.json | 3 +- plugins.v2/lexiannot/README.md | 36 +- plugins.v2/lexiannot/__init__.py | 2816 +++++++++++++------------ plugins.v2/lexiannot/agenttool.py | 67 + plugins.v2/lexiannot/lexicon.py | 116 + plugins.v2/lexiannot/pipeline.py | 736 +++++++ plugins.v2/lexiannot/query_gemini.py | 111 - plugins.v2/lexiannot/requirements.txt | 3 +- plugins.v2/lexiannot/schemas.py | 394 ++++ plugins.v2/lexiannot/spacyworker.py | 36 +- plugins.v2/lexiannot/subtitle.py | 44 + 11 files changed, 2854 insertions(+), 1508 deletions(-) create mode 100644 plugins.v2/lexiannot/agenttool.py create mode 100644 plugins.v2/lexiannot/lexicon.py create mode 100644 plugins.v2/lexiannot/pipeline.py delete mode 100644 plugins.v2/lexiannot/query_gemini.py create mode 100644 plugins.v2/lexiannot/schemas.py create mode 100644 plugins.v2/lexiannot/subtitle.py diff --git a/package.v2.json b/package.v2.json index cf17c9e..5152513 100644 --- a/package.v2.json +++ b/package.v2.json @@ -540,11 +540,12 @@ "name": "美剧生词标注", "description": "根据CEFR等级,为英语影视剧标注高级词汇。", "labels": "英语", - "version": "1.1.4", + "version": "1.2.0", "icon": "LexiAnnot.png", "author": "wumode", "level": 1, "history": { + "v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI", "v1.1.4": "优化字幕选择决策", "v1.1.3": "适配 Pydantic V2 (主程序版本需高于 2.8.1-1)", "v1.1.2": "使用子进程避免 spaCy 模型常驻内存", diff --git a/plugins.v2/lexiannot/README.md b/plugins.v2/lexiannot/README.md index 1bb9055..22a68f3 100644 --- a/plugins.v2/lexiannot/README.md +++ b/plugins.v2/lexiannot/README.md @@ -1,26 +1,32 @@ # 美剧生词标注 根据CEFR等级,为英语影视剧标注高级词汇。 +___ +在影视剧入库后,LexiAnnot 会读取媒体文件的MediaInfo和文件列表,如果视频的原始语言为英语并且包含英文文本字幕,LexiAnnot将为其生成包含词汇注释的`.en.ass`字幕文件。 -在影视剧入库后,LexiAnnot会读取媒体文件的MediaInfo和文件列表,如果视频的原始语言为英语并且包含英文文本字幕,LexiAnnot将为其生成包含词汇注释的.ass字幕文件。 +## 主要功能 ![](https://images2.imgbox.com/d6/b6/kZu6EH2a_o.png) ![](https://images2.imgbox.com/c8/3a/rEJBWu5v_o.png) -![](https://images2.imgbox.com/97/b7/d6RXFtwD_o.png) +![](https://images2.imgbox.com/56/c0/FBhJMvRD_o.jpg) ![](https://images2.imgbox.com/8a/d4/AtgOe265_o.jpg) -# Gemini +- 识别视频的原始语言和字幕语言 +- 自动适应原字幕样式 +- 俚语 / 自造词 / 熟词生义标注和解释 -- **[获取APIKEY](https://aistudio.google.com/app/apikey)** -- **[速率限制](https://ai.google.dev/gemini-api/docs/rate-limits)** +## 使用配置 -**确保可以正常访问下面的域名** +- spaCy 模型 + - spaCy 用于词形还原、POS 标注和命名实体识别,`en_core_web_sm`或`en_core_web_md` 已足够满足需求。 +- LLM 设置 + - 一集影视剧的字幕通常包含数千个单词,建议使用支持长文本输入的模型,选择一个适当的上下文窗口大小。 + - 处理 60 min 的影视剧字幕大约会消耗 `60K`~`80K` token,具体取决于字幕内容。 + - 配置请参考 MoviePilot 智能助手的设置部分。 +- Agent 工具 + - 在聊天中使用 `/ai` 命令告诉智能助手你要标注的影视剧。 -- googleapis.com -- google.dev -- aistudio.google.com - -# CEFR +## CEFR CEFR全称是Common European Framework of Reference for Languages。 @@ -36,20 +42,18 @@ CEFR全称是Common European Framework of Reference for Languages。 - **C1** (高级/Advanced):能够理解各种较长、要求较高的文本,并能识别隐含意义,表达流利、自然,能灵活有效地使用语言来应对各种目的。 - **C2** (精通/Proficient):能够轻松理解几乎所有听到的或读到的内容,能够非常流利、准确、精细地表达自己,即使在复杂的情况下也能区分细微的含义。 -# 计划 +## 计划 - 双语字幕支持 - ~~考试词汇标注~~ -# FAQ +## FAQ -- **为什么需要用到Gemini** - - LexiAnnot使用的词典仅包含约18000个单词,无法覆盖影视剧中的海量的俚语、习语、流行语等更广泛的表达形式 - **只能处理已有字幕的视频吗?** - 是的,视频需要包含**英文文本字幕** - **为什么无法处理一些包含字幕视频** - 目前无法识别基于图片的字幕(通常是特效字幕) -# 感谢 +## 感谢 - [coca-vocabulary-20000](https://github.com/llt22/coca-vocabulary-20000) \ No newline at end of file diff --git a/plugins.v2/lexiannot/__init__.py 
b/plugins.v2/lexiannot/__init__.py index 51049e0..7c375aa 100644 --- a/plugins.v2/lexiannot/__init__.py +++ b/plugins.v2/lexiannot/__init__.py @@ -5,19 +5,16 @@ import re import shutil import subprocess import sys -import time import threading -import uuid from collections import Counter from datetime import datetime -from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Tuple, Optional +from typing import Any, Dict, List, Tuple, Optional, Literal -import pysubs2 import pymediainfo from langdetect import detect -from pysubs2 import SSAFile, SSAEvent +from langchain_community.callbacks import get_openai_callback +from pysubs2 import SSAFile, SSAEvent, SSAStyle, Color, Alignment from app.core.config import settings from app.helper.directory import DirectoryHelper @@ -25,54 +22,34 @@ from app.log import logger from app.plugins import _PluginBase from app.core.cache import cached from app.core.event import eventmanager, Event -from app.schemas.types import NotificationType +from app.schemas import Response +from app.schemas.types import NotificationType, MediaType from app.utils.http import RequestUtils from app.utils.string import StringUtils -from app.schemas import TransferInfo +from app.schemas import TransferInfo, Context from app.schemas.types import EventType from app.core.context import MediaInfo -from app.plugins.lexiannot.query_gemini import ( - DialogueTranslationTask, VocabularyTranslationTask, Vocabulary, Context, TranslationTasks, translate, T +from app.chain.media import MediaChain + +from .agenttool import VocabularyAnnotatingTool +from .lexicon import Lexicon +from .schemas import ( + IDGenerator, + TaskStatus, + Task, + TasksApiParams, + ProcessResult, + SegmentList, + TaskParams, SegmentStatistics, +) +from .spacyworker import SpacyWorker +from .subtitle import SubtitleProcessor, style_text +from .pipeline import ( + extract_advanced_words, + llm_process_chain, + initialize_llm, + UNIVERSAL_POS_MAP, ) -from app.plugins.lexiannot.spacyworker import SpacyWorker - - -class TaskStatus(Enum): - PENDING = "pending" - RUNNING = "running" - COMPLETED = "completed" - FAILED = "failed" - CANCELED = "canceled" - IGNORED = "ignored" - - -class Task: - - def __init__(self, video_path: str, - task_id: Optional[str] = None, - status: TaskStatus = TaskStatus.PENDING, - add_time: Optional[datetime] = None, - complete_time: Optional[datetime] = None, - tokens_used: int = 0): - self.task_id = task_id or str(uuid.uuid4()) - self.video_path = video_path - self.status: TaskStatus = status - self.add_time: Optional[datetime] = add_time - self.complete_time: Optional[datetime] = complete_time - self.tokens_used: int = tokens_used - - def __repr__(self): - return f"" - - def to_dict(self): - return { - "task_id": self.task_id, - "video_path": self.video_path, - "status": self.status.value, - "add_time": self.add_time.isoformat() if self.add_time else None, - "complete_time": self.complete_time.isoformat() if self.complete_time else None, - "tokens_used": self.tokens_used - } class LexiAnnot(_PluginBase): @@ -83,7 +60,7 @@ class LexiAnnot(_PluginBase): # 插件图标 plugin_icon = "LexiAnnot.png" # 插件版本 - plugin_version = "1.1.4" + plugin_version = "1.2.0" # 插件作者 plugin_author = "wumode" # 作者主页 @@ -96,7 +73,7 @@ class LexiAnnot(_PluginBase): auth_level = 1 _enabled: bool = False - _annot_level = '' + _annot_level = "" _send_notify = False _onlyonce = False _show_vocabulary_detail = False @@ -104,32 +81,35 @@ class LexiAnnot(_PluginBase): _sentence_translation = False _in_place = 
False _enable_gemini = False - _gemini_model = '' - _gemini_apikey = '' + _gemini_model = "" + _gemini_apikey: str | None = None + _llm_provider = "google" + _llm_base_url = "" + _context_window: int = 0 _max_retries: int = 0 - _request_interval: int = 0 - _ffmpeg_path: str = 'ffmpeg' + _ffmpeg_path: str = "ffmpeg" _english_only = False _when_file_trans = False - _model_temperature = '' - _custom_files = '' - _accent_color = '' - _font_scaling = '' - _opacity = '' + _model_temperature = "" + _custom_files = "" + _accent_color = "" + _font_scaling = "" + _opacity = "" _exam_tags: List[str] = [] - _spacy_model: str = '' + _spacy_model: str = "" _delete_data: bool = False _libraries: List[str] = [] + _use_mp_agent: bool = False + _use_proxy: bool = False # protected variables - _lexicon_repo = 'https://raw.githubusercontent.com/wumode/LexiAnnot/' + _lexicon_repo = "https://raw.githubusercontent.com/wumode/LexiAnnot/" _worker_thread = None _task_queue: queue.Queue[Task] = queue.Queue() _shutdown_event = None - _total_token_count = 0 _venv_python = None - _query_gemini_script = '' + _query_gemini_script = "" _gemini_available = False _accent_color_rgb = None _color_alpha = 0 @@ -142,42 +122,50 @@ class LexiAnnot(_PluginBase): self.stop_service() if config: self._enabled = bool(config.get("enabled")) - self._annot_level = config.get("annot_level") or 'C1' + self._annot_level = config.get("annot_level") or "C1" self._send_notify = config.get("send_notify") self._onlyonce = config.get("onlyonce") self._show_vocabulary_detail = config.get("show_vocabulary_detail") self._sentence_translation = config.get("sentence_translation") self._in_place = config.get("in_place") self._enable_gemini = config.get("enable_gemini") - self._gemini_model = config.get("gemini_model") or 'gemini-2.0-flash' - self._gemini_apikey = config.get("gemini_apikey") or '' + self._gemini_model = config.get("gemini_model") or "gemini-2.5-flash" + self._gemini_apikey = config.get("gemini_apikey") or "" self._context_window = int(config.get("context_window") or 10) + self._context_window = max(5, min(self._context_window, 50)) self._max_retries = int(config.get("max_retries") or 3) - self._request_interval = int(config.get("request_interval") or 3) - self._ffmpeg_path = config.get("ffmpeg_path") or 'ffmpeg' + self._ffmpeg_path = config.get("ffmpeg_path") or "ffmpeg" self._english_only = config.get("english_only") self._when_file_trans = config.get("when_file_trans") - self._model_temperature = config.get("model_temperature") or '0.3' + self._model_temperature = config.get("model_temperature") or "0.3" self._show_phonetics = config.get("show_phonetics") - self._custom_files = config.get("custom_files") + self._custom_files = config.get("custom_files") or "" self._accent_color = config.get("accent_color") - self._font_scaling = config.get("font_scaling") or '1' - self._opacity = config.get("opacity") or '0' - self._spacy_model = config.get("spacy_model") or 'en_core_web_sm' + self._font_scaling = config.get("font_scaling") or "1" + self._opacity = config.get("opacity") or "0" + self._spacy_model = config.get("spacy_model") or "en_core_web_sm" self._exam_tags = config.get("exam_tags") or [] self._delete_data = config.get("delete_data") or False self._libraries = config.get("libraries") or [] + self._llm_base_url = config.get("llm_base_url") or "" + self._llm_provider = config.get("llm_provider") or "google" + self._use_mp_agent = config.get("use_mp_agent") or False + self._use_proxy = config.get("use_proxy") or False - libraries = 
[library.name for library in DirectoryHelper().get_library_dirs()] - self._libraries = [library for library in self._libraries if library in libraries] - self._accent_color_rgb = LexiAnnot.hex_to_rgb(self._accent_color) or (255, 255, 0) + libraries = [ + library.name for library in DirectoryHelper().get_library_dirs() + ] + self._libraries = [ + library for library in self._libraries if library in libraries + ] + self._accent_color_rgb = LexiAnnot.hex_to_rgb(self._accent_color) or (255, 255, 0,) self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0 if self._delete_data: # 删除不再保存在数据库的数据 - self.del_data('cefr_lexicon') - self.del_data('coca2k_lexicon') - self.del_data('swear_words') - self.del_data('lexicon_version') + self.del_data("cefr_lexicon") + self.del_data("coca2k_lexicon") + self.del_data("swear_words") + self.del_data("lexicon_version") self.delete_data() self._delete_data = False self._loaded = False @@ -196,10 +184,10 @@ class LexiAnnot(_PluginBase): if task.status == TaskStatus.PENDING: self._task_queue.put(task) - self._query_gemini_script = str(settings.ROOT_PATH / "app" / "plugins" / "lexiannot" / "query_gemini.py") - self._shutdown_event = threading.Event() - self._worker_thread = threading.Thread(target=self.__process_tasks, daemon=True) + self._worker_thread = threading.Thread( + target=self.__process_tasks, daemon=True + ) self._worker_thread.start() if self._onlyonce: @@ -215,691 +203,749 @@ class LexiAnnot(_PluginBase): """ 拼装插件配置页面,需要返回两块数据:1、页面配置;2、数据结构 """ - library_options = [{'title': library.name,'value': library.name} - for library in DirectoryHelper().get_library_dirs()] + library_options = [ + {"title": library.name, "value": library.name} + for library in DirectoryHelper().get_library_dirs() + ] return [ { - 'component': 'VForm', - 'content': [ + "component": "VForm", + "content": [ { - 'component': 'VRow', - 'content': [ + "component": "VRow", + "content": [ { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3 - }, - 'content': [ + "component": "VCol", + "props": {"cols": 12, "md": 3}, + "content": [ { - 'component': 'VSwitch', - 'props': { - 'model': 'enabled', - 'label': '启用插件', - } - } - ] - }, - - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'send_notify', - 'label': '发送通知', - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'onlyonce', - 'label': '手动运行一次', - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'delete_data', - 'label': '插件数据清理', - } - } - ] - } - ] - }, - { - 'component': 'VTabs', - 'props': { - 'model': '_tabs', - 'style': { - 'margin-top': '8px', - 'margin-bottom': '16px' - }, - 'stacked': True, - 'fixed-tabs': True - }, - 'content': [ - { - 'component': 'VTab', - 'props': { - 'value': 'base_tab' - }, - 'text': '基本设置' - }, { - 'component': 'VTab', - 'props': { - 'value': 'subtitle_tab' - }, - 'text': '字幕设置' - }, { - 'component': 'VTab', - 'props': { - 'value': 'gemini_tab' - }, - 'text': 'Gemini设置' - } - ] - }, - { - 'component': 'VWindow', - 'props': { - 'model': '_tabs' - }, - 'content': [ - { - 'component': 'VWindowItem', - 'props': { - 'value': 'base_tab' - }, - 'content': [ - { - 'component': 'VRow', - 'props': { - 'style': { - 'margin-top': '0px' - } + "component": "VSwitch", + "props": { + "model": 
"enabled", + "label": "启用插件", }, - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'when_file_trans', - 'label': '监控入库', - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'spacy_model', - 'label': 'spaCy模型', - 'hint': 'spaCy 模型用于分词和词性标注,推荐使用 Small', - 'items': [ - {'title': 'Small (~12 MB)', 'value': 'en_core_web_sm'}, - {'title': 'Medium (~30 MB)', 'value': 'en_core_web_md'}, - {'title': 'Large (700+ MB)', 'value': 'en_core_web_lg'}, - {'title': 'Transformer (400+ MB)', - 'value': 'en_core_web_trf'}, - ] - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'annot_level', - 'label': '标注词汇的最低CEFR等级', - 'items': [ - {'title': 'B1', 'value': 'B1'}, - {'title': 'B2', 'value': 'B2'}, - {'title': 'C1', 'value': 'C1'}, - {'title': 'C2', 'value': 'C2'}, - {'title': 'C2+', 'value': 'C2+'} - ] - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'english_only', - 'label': '仅英语影视剧', - 'hint': '检查入库影视剧原语言' - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 8 - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'exam_tags', - 'label': '考试词汇标签', - 'chips': True, - 'multiple': True, - 'items': [ - {'title': '四级', 'value': 'CET-4'}, - {'title': '六级', 'value': 'CET-6'}, - {'title': '考研', 'value': 'NPEE'}, - {'title': '雅思', 'value': 'IELTS'}, - {'title': '托福', 'value': 'TOEFL'}, - {'title': '专四', 'value': 'TEM-4'}, - {'title': '专八', 'value': 'TEM-8'}, - {'title': 'GRE', 'value': 'GRE'}, - {'title': 'PET', 'value': 'PET'}, - ] - } - } - ] - } - ] - }, - { - 'component': 'VRow', - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - }, - 'content': [ - { - 'component': 'VTextField', - 'props': { - 'model': 'ffmpeg_path', - 'label': 'FFmpeg 路径', - 'placeholder': 'ffmpeg' - } - } - ] - } - ] } - ] + ], }, { - 'component': 'VWindowItem', - 'props': { - 'value': 'subtitle_tab' - }, - 'content': [ + "component": "VCol", + "props": {"cols": 12, "md": 3}, + "content": [ { - 'component': 'VRow', - 'props': { - 'style': { - 'margin-top': '0px' - } + "component": "VSwitch", + "props": { + "model": "send_notify", + "label": "发送通知", }, - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'font_scaling', - 'label': '字体缩放', - 'items': [ - {'title': '50%', 'value': '0.5'}, - {'title': '75%', 'value': '0.75'}, - {'title': '100%', 'value': '1'}, - {'title': '125%', 'value': '1.25'}, - {'title': '150%', 'value': '1.5'}, - {'title': '200%', 'value': '2'} - ] - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VTextField', - 'props': { - 'model': 'accent_color', - 'label': '强调色', - 'placeholder': '#FFFF00' - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'opacity', - 'label': '不透明度', - 'items': [ - {'title': '0', 'value': '0'}, - {'title': '25%', 'value': '63'}, - {'title': '50%', 'value': '127'}, - {'title': '75%', 'value': '191'}, - {'title': '100%', 'value': '255'}, - ] - } - } - ] - } - 
] - }, - { - 'component': 'VRow', - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'show_phonetics', - 'label': '标注音标', - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'in_place', - 'label': '在原字幕插入注释', - } - } - ] - }, - - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 4 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'show_vocabulary_detail', - 'label': '显示完整释义', - } - } - ] - }, - - ] - }, - ] + } + ], }, { - 'component': 'VWindowItem', - 'props': { - 'value': 'gemini_tab' - }, - 'content': [ + "component": "VCol", + "props": {"cols": 12, "md": 3}, + "content": [ { - 'component': 'VRow', - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 6, - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'enable_gemini', - 'label': '启用Gemini翻译', - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 6 - }, - 'content': [ - { - 'component': 'VSwitch', - 'props': { - 'model': 'sentence_translation', - 'label': '整句翻译', - } - } - ] - } - ] - }, - { - 'component': 'VRow', - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 6, - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'gemini_model', - 'label': '模型', - 'items': [ - {'title': 'gemini-2.5-flash', - 'value': 'gemini-2.5-flash'}, - {'title': 'gemini-2.5-flash-lite', - 'value': 'gemini-2.5-flash-lite'}, - {'title': 'gemini-2.5-pro', - 'value': 'gemini-2.5-pro'}, - {'title': 'gemini-2.0-flash', - 'value': 'gemini-2.0-flash'}, - {'title': 'gemini-2.0-flash-lite', - 'value': 'gemini-2.0-flash-lite'}, - ] - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 6, - }, - 'content': [ - { - 'component': 'VTextField', - 'props': { - 'model': 'gemini_apikey', - 'label': 'Gemini APIKEY', - 'placeholder': '' - } - } - ] - }, - ] - }, - { - 'component': 'VRow', - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3, - }, - 'content': [ - { - 'component': 'VTextField', - 'props': { - 'model': 'context_window', - 'label': '上下文窗口大小', - 'placeholder': '10', - 'type': 'number', - 'max': 100, - 'min': 1, - 'hint': '向Gemini发送的上下文长度' - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3 - }, - 'content': [ - { - 'component': 'VSelect', - 'props': { - 'model': 'model_temperature', - 'label': '模型温度', - 'items': [ - {'title': '0', 'value': '0'}, - {'title': '0.1', 'value': '0.1'}, - {'title': '0.2', 'value': '0.2'}, - {'title': '0.3', 'value': '0.3'}, - ] - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3, - }, - 'content': [ - { - 'component': 'VTextField', - 'props': { - 'model': 'max_retries', - 'label': '请求重试次数', - 'placeholder': '3', - 'type': 'number', - 'min': 1, - 'hint': '请求失败重试次数' - } - } - ] - }, - { - 'component': 'VCol', - 'props': { - 'cols': 12, - 'md': 3, - }, - 'content': [ - { - 'component': 'VTextField', - 'props': { - 'model': 'request_interval', - 'label': '请求间隔', - 'type': 'number', - 'placeholder': 5, - 'min': 1, - 'suffix': '秒', - 'hint': '请求间隔时间,建议不少于3秒' - } - } - ] - }, - ] - } - ] - } - ] - }, - { - 'component': 'VRow', - 'props': { - 'style': { - 'margin-top': '0px' - } - }, - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - }, - 'content': [ - { - 'component': 
'VSelect', - 'props': { - 'chips': True, - 'multiple': True, - 'model': 'libraries', - 'label': '监控入库', - 'items': library_options - } - } - ] - } - ] - }, - { - 'component': 'VRow', - 'props': { - 'style': { - 'margin-top': '0px' - } - }, - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - }, - 'content': [ - { - 'component': 'VTextarea', - 'props': { - 'model': 'custom_files', - 'label': '手动处理视频路径', - 'rows': 3, - 'placeholder': '# 每行一个文件' - } - } - ] - }, - ] - }, - { - 'component': 'VRow', - 'content': [ - { - 'component': 'VCol', - 'props': { - 'cols': 12, - }, - 'content': [ - { - 'component': 'VAlert', - 'props': { - 'type': 'success', - 'variant': 'tonal' + "component": "VSwitch", + "props": { + "model": "onlyonce", + "label": "手动运行一次", }, - 'content': [ + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 3}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "delete_data", + "label": "插件数据清理", + }, + } + ], + }, + ], + }, + { + "component": "VTabs", + "props": { + "model": "_tabs", + "style": {"margin-top": "8px", "margin-bottom": "16px"}, + "stacked": True, + "fixed-tabs": True, + }, + "content": [ + { + "component": "VTab", + "props": {"value": "base_tab"}, + "text": "基本设置", + }, + { + "component": "VTab", + "props": {"value": "subtitle_tab"}, + "text": "字幕设置", + }, + { + "component": "VTab", + "props": {"value": "gemini_tab"}, + "text": "LLM 设置", + }, + ], + }, + { + "component": "VWindow", + "props": {"model": "_tabs"}, + "content": [ + { + "component": "VWindowItem", + "props": {"value": "base_tab"}, + "content": [ + { + "component": "VRow", + "props": {"style": {"margin-top": "0px"}}, + "content": [ { - 'component': 'span', - 'text': '配置说明:' + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "when_file_trans", + "label": "监控入库", + }, + } + ], }, { - 'component': 'a', - 'props': { - 'href': 'https://github.com/jxxghp/MoviePilot-Plugins/tree/main/plugins.v2/lexiannot/README.md', - 'target': '_blank' - }, - 'content': [ + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ { - 'component': 'u', - 'text': 'README' + "component": "VSelect", + "props": { + "model": "spacy_model", + "label": "spaCy模型", + "hint": "用于分词和词性标注,推荐使用「md」", + "items": [ + { + "title": "sm (~12 MB)", + "value": "en_core_web_sm", + }, + { + "title": "md (~30 MB)", + "value": "en_core_web_md", + }, + { + "title": "lg (700+ MB)", + "value": "en_core_web_lg", + }, + { + "title": "Transformer (400+ MB)", + "value": "en_core_web_trf", + }, + ], + }, } - ] + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSelect", + "props": { + "model": "annot_level", + "label": "标注词汇的最低CEFR等级", + "items": [ + { + "title": "B1", + "value": "B1", + }, + { + "title": "B2", + "value": "B2", + }, + { + "title": "C1", + "value": "C1", + }, + { + "title": "C2", + "value": "C2", + }, + { + "title": "C2+", + "value": "C2+", + }, + ], + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "english_only", + "label": "仅英语影视剧", + "hint": "检查入库影视剧原语言", + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 8}, + "content": [ + { + "component": "VSelect", + "props": { + "model": "exam_tags", + "label": "考试词汇标签", + "chips": True, + "multiple": True, + "items": [ + { + "title": "四级", + "value": "CET-4", + }, + { + "title": "六级", 
+ "value": "CET-6", + }, + { + "title": "考研", + "value": "NPEE", + }, + { + "title": "雅思", + "value": "IELTS", + }, + { + "title": "托福", + "value": "TOEFL", + }, + { + "title": "专四", + "value": "TEM-4", + }, + { + "title": "专八", + "value": "TEM-8", + }, + { + "title": "GRE", + "value": "GRE", + }, + { + "title": "PET", + "value": "PET", + }, + ], + }, + } + ], + }, + ], + }, + { + "component": "VRow", + "content": [ + { + "component": "VCol", + "props": { + "cols": 12, + }, + "content": [ + { + "component": "VTextField", + "props": { + "model": "ffmpeg_path", + "label": "FFmpeg 路径", + "placeholder": "ffmpeg", + }, + } + ], } - ] + ], + }, + ], + }, + { + "component": "VWindowItem", + "props": {"value": "subtitle_tab"}, + "content": [ + { + "component": "VRow", + "props": {"style": {"margin-top": "0px"}}, + "content": [ + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSelect", + "props": { + "model": "font_scaling", + "label": "字体缩放", + "items": [ + { + "title": "50%", + "value": "0.5", + }, + { + "title": "75%", + "value": "0.75", + }, + { + "title": "100%", + "value": "1", + }, + { + "title": "125%", + "value": "1.25", + }, + { + "title": "150%", + "value": "1.5", + }, + { + "title": "200%", + "value": "2", + }, + ], + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VTextField", + "props": { + "model": "accent_color", + "label": "强调色", + "placeholder": "#FFFF00", + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSelect", + "props": { + "model": "opacity", + "label": "透明度", + "items": [ + { + "title": "0", + "value": "0", + }, + { + "title": "25%", + "value": "63", + }, + { + "title": "50%", + "value": "127", + }, + { + "title": "75%", + "value": "191", + }, + { + "title": "100%", + "value": "255", + }, + ], + }, + } + ], + }, + ], + }, + { + "component": "VRow", + "content": [ + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "show_phonetics", + "label": "标注音标", + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "in_place", + "label": "在原字幕插入注释", + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "show_vocabulary_detail", + "label": "显示完整释义", + }, + } + ], + }, + ], + }, + ], + }, + { + "component": "VWindowItem", + "props": {"value": "gemini_tab"}, + "content": [ + { + "component": "VRow", + "content": [ + { + "component": "VCol", + "props": { + "cols": 12, + "md": 3, + }, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "enable_gemini", + "label": "启用 LLM", + }, + } + ], + }, + { + "component": "VCol", + "props": { + "cols": 12, + "md": 3, + }, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "use_mp_agent", + "label": "使用系统 Agent 配置", + }, + } + ], + }, + { + "component": "VCol", + "props": { + "cols": 12, + "md": 3, + }, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "use_proxy", + "label": "使用系统代理", + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 3}, + "content": [ + { + "component": "VSwitch", + "props": { + "model": "sentence_translation", + "label": "整句翻译", + }, + } + ], + }, + ], + }, + { + "component": "VRow", + "content": [ + { + "component": 
"VCol", + "props": { + "cols": 12, + "md": 6, + }, + "content": [ + { + "component": "VSelect", + "props": { + "model": "llm_provider", + "label": "提供商", + "disabled": "use_mp_agent", + "items": [ + { + "title": "Google", + "value": "google", + }, + { + "title": "OpenAI", + "value": "openai", + }, + { + "title": "DeepSeek", + "value": "deepseek", + }, + ], + }, + } + ], + }, + { + "component": "VCol", + "props": { + "cols": 12, + "md": 6, + }, + "content": [ + { + "component": "VTextField", + "props": { + "model": "llm_base_url", + "disabled": "use_mp_agent", + "placeholder": "https://api.deepseek.com", + "label": "基础 URL", + "hint": "参考 MoviePilot Agent 配置", + }, + } + ], + }, + { + "component": "VCol", + "props": { + "cols": 12, + "md": 6, + }, + "content": [ + { + "component": "VCombobox", + "props": { + "model": "gemini_model", + "disabled": "use_mp_agent", + "label": "模型名称", + "items": [ + "gemini-2.5-flash", + "gemini-2.5-flash-lite", + "gemini-2.5-pro", + "gemini-2.0-flash", + "gemini-2.0-flash-lite", + "deepseek-ai/DeepSeek-V3.2", + "deepseek-ai/DeepSeek-R1" + ], + }, + } + ], + }, + { + "component": "VCol", + "props": { + "cols": 12, + "md": 6, + }, + "content": [ + { + "component": "VTextField", + "props": { + "model": "gemini_apikey", + "label": "API-KEY", + "disabled": "use_mp_agent", + }, + } + ], + }, + ], + }, + { + "component": "VRow", + "content": [ + { + "component": "VCol", + "props": { + "cols": 12, + "md": 4, + }, + "content": [ + { + "component": "VTextField", + "props": { + "model": "context_window", + "label": "上下文窗口大小", + "placeholder": "10", + "type": "number", + "max": 50, + "min": 1, + "hint": "向大模型发送的对话数量", + }, + } + ], + }, + { + "component": "VCol", + "props": {"cols": 12, "md": 4}, + "content": [ + { + "component": "VSelect", + "props": { + "model": "model_temperature", + "label": "模型温度", + "items": [ + {"title": "0", "value": "0"}, + {"title": "0.1", "value": "0.1"}, + {"title": "0.2", "value": "0.2"}, + {"title": "0.3", "value": "0.3"}, + {"title": "0.4", "value": "0.4"}, + {"title": "0.5", "value": "0.5"}, + ], + }, + } + ], + }, + { + "component": "VCol", + "props": { + "cols": 12, + "md": 4, + }, + "content": [ + { + "component": "VTextField", + "props": { + "model": "max_retries", + "label": "请求重试次数", + "placeholder": "3", + "type": "number", + "min": 1, + "hint": "请求失败重试次数", + }, + } + ], + }, + ], + }, + ], + }, + ], + }, + { + "component": "VRow", + "props": {"style": {"margin-top": "0px"}}, + "content": [ + { + "component": "VCol", + "props": { + "cols": 12, + }, + "content": [ + { + "component": "VSelect", + "props": { + "chips": True, + "multiple": True, + "model": "libraries", + "label": "监控入库", + "items": library_options, + }, } - ] + ], } - ] - } - ] + ], + }, + { + "component": "VRow", + "props": {"style": {"margin-top": "0px"}}, + "content": [ + { + "component": "VCol", + "props": { + "cols": 12, + }, + "content": [ + { + "component": "VTextarea", + "props": { + "model": "custom_files", + "label": "手动处理视频路径", + "rows": 3, + "placeholder": "# 每行一个文件", + }, + } + ], + }, + ], + }, + { + "component": "VRow", + "content": [ + { + "component": "VCol", + "props": { + "cols": 12, + }, + "content": [ + { + "component": "VAlert", + "props": { + "type": "success", + "variant": "tonal", + }, + "content": [ + {"component": "span", "text": "配置说明:"}, + { + "component": "a", + "props": { + "href": "https://github.com/jxxghp/MoviePilot-Plugins/tree/main/plugins.v2/lexiannot/README.md", + "target": "_blank", + }, + "content": [ + {"component": "u", "text": 
"README"} + ], + }, + ], + } + ], + } + ], + }, + ], } ], { "enabled": False, - "annot_level": 'C1', + "annot_level": "C1", "send_notify": False, "onlyonce": False, "show_vocabulary_detail": False, @@ -907,42 +953,52 @@ class LexiAnnot(_PluginBase): "sentence_translation": False, "in_place": False, "enable_gemini": False, - "gemini_model": 'gemini-2.0-flash', - "gemini_apikey": '', + "gemini_model": "gemini-2.0-flash", + "gemini_apikey": "", "context_window": 10, "max_retries": 3, - 'request_interval': 3, + "request_interval": 3, "ffmpeg_path": "", "english_only": True, "when_file_trans": True, - "model_temperature": '0.3', - "custom_files": '', - "accent_color": '', - "font_scaling": '1', - "opacity": '0', - "spacy_model": 'en_core_web_sm', + "model_temperature": "0.1", + "custom_files": "", + "accent_color": "", + "font_scaling": "1", + "opacity": "0", + "spacy_model": "en_core_web_sm", "exam_tags": [], "delete_data": False, - "libraries": [] + "libraries": [], + "llm_provider": "google", + "llm_base_url": "", + "use_mp_agent": False, + "use_proxy": False, } def get_api(self) -> List[Dict[str, Any]]: - pass + return [ + { + "path": "/tasks", + "endpoint": self.task_interface, + "methods": ["POST"], + "summary": "任务操作", + "description": "任务操作", + } + ] def get_page(self) -> List[dict]: headers = [ - {'title': '添加时间', 'key': 'add_time', 'sortable': True}, - {'title': '视频文件', 'key': 'video_path', 'sortable': True}, - {'title': '消耗 Tokens', 'key': 'tokens_used', 'sortable': True}, - {'title': '完成时间', 'key': 'complete_time', 'sortable': True}, - {'title': '任务状态', 'key': 'status', 'sortable': True}, + {"title": "添加时间", "key": "add_time", "sortable": True}, + {"title": "视频文件", "key": "video_path", "sortable": True}, + {"title": "消耗 Tokens", "key": "tokens_used", "sortable": True}, + {"title": "完成时间", "key": "complete_time", "sortable": True}, + {"title": "任务状态", "key": "status", "sortable": True}, ] items = [] with self._tasks_lock: sorted_tasks = sorted( - self._tasks.items(), - key=lambda x: x[1].add_time, - reverse=True + self._tasks.items(), key=lambda x: x[1].add_time, reverse=True ) status_map = { @@ -951,52 +1007,82 @@ class LexiAnnot(_PluginBase): TaskStatus.COMPLETED: "已完成", TaskStatus.IGNORED: "已忽略", TaskStatus.FAILED: "失败", - TaskStatus.CANCELED: "已取消" + TaskStatus.CANCELED: "已取消", } for task_id, task in sorted_tasks: status_text = status_map.get(task.status, task.status) item = { - 'task_id': task_id, - 'status': status_text, - 'video_path': task.video_path, - 'add_time': task.add_time.strftime("%Y-%m-%d %H:%M:%S") if task.add_time else '-', - 'tokens_used': task.tokens_used, - 'complete_time': task.complete_time.strftime("%Y-%m-%d %H:%M:%S") if task.complete_time else '-', + "task_id": task_id, + "status": status_text, + "video_path": task.video_path, + "add_time": task.add_time if task.add_time else "-", + "tokens_used": task.tokens_used, + "complete_time": task.complete_time if task.complete_time else "-", } items.append(item) return [ { - 'component': 'VRow', - 'props': { - 'style': { - 'overflow': 'hidden', + "component": "div", + "props": {"class": "d-flex align-center"}, + "content": [ + { + "component": "h2", + "props": {"class": "page-title m-0"}, + "text": "任务记录", + }, + {"component": "VSpacer"}, + { + "component": "VBtn", + "props": { + "prepend-icon": "mdi-delete-circle", + "variant": "tonal", + }, + "text": "清空任务记录", + "events": { + "click": { + "api": f"plugin/{self.__class__.__name__}/tasks?apikey={settings.API_TOKEN}", + "method": "post", + "params": { + "operation": 
"DELETE", + "task_id": None, + }, + } + }, + }, + ], + }, + { + "component": "VRow", + "props": { + "style": { + "overflow": "hidden", } }, - 'content': [ + "content": [ { - 'component': 'VCol', - 'props': { - 'cols': 12, + "component": "VCol", + "props": { + "cols": 12, }, - 'content': [ + "content": [ { - 'component': 'VDataTableVirtual', - 'props': { - 'class': 'text-sm', - 'headers': headers, - 'items': items, - 'height': '30rem', - 'density': 'compact', - 'fixed-header': True, - 'hide-no-data': True, - 'hover': True - } + "component": "VDataTableVirtual", + "props": { + "class": "text-sm", + "headers": headers, + "items": items, + "height": "30rem", + "density": "compact", + "fixed-header": True, + "hide-no-data": True, + "hover": True, + }, } - ] + ], } - ] - } + ], + }, ] @staticmethod @@ -1009,6 +1095,13 @@ class LexiAnnot(_PluginBase): """ return self._enabled + def get_agent_tools(self) -> List[type]: + """ + 获取插件智能体工具 + 返回工具类列表,每个工具类必须继承自 MoviePilotTool + """ + return [VocabularyAnnotatingTool] + def stop_service(self): """ 退出插件 @@ -1034,7 +1127,7 @@ class LexiAnnot(_PluginBase): def delete_data(self): # 删除词典 data_path = self.get_data_path() - lexicon_path = data_path / 'lexicon.json' + lexicon_path = data_path / "lexicon.json" try: os.remove(lexicon_path) logger.info(f"词典 {lexicon_path} 已删除") @@ -1042,7 +1135,7 @@ class LexiAnnot(_PluginBase): pass except Exception as e: logger.error(f"词典 {lexicon_path} 删除失败: {e}") - self.__load_lexicon_from_local.cache_clear() + self._load_lexicon_from_local.cache_clear() # 删除虚拟环境 venv_dir = data_path / "venv_genai" @@ -1059,19 +1152,11 @@ class LexiAnnot(_PluginBase): self.save_tasks() def load_tasks(self) -> Dict[str, Task]: - raw_tasks = self.get_data('tasks') or {} + raw_tasks = self.get_data("tasks") or {} tasks = {} for task_id, task_dict in raw_tasks.items(): try: - task = Task( - video_path=task_dict.get('video_path'), - task_id=task_dict.get('task_id'), - status=TaskStatus(task_dict.get('status')), - add_time=datetime.fromisoformat(task_dict.get('add_time')) if task_dict.get('add_time') else None, - complete_time=datetime.fromisoformat(task_dict.get('complete_time')) if task_dict.get( - 'complete_time') else None, - tokens_used=task_dict.get('tokens_used', 0) - ) + task = Task.model_validate(task_dict) tasks[task_id] = task except Exception as e: logger.error(f"加载任务失败:{e}") @@ -1079,18 +1164,25 @@ class LexiAnnot(_PluginBase): def save_tasks(self): with self._tasks_lock: - tasks_dict = {task_id: task.to_dict() for task_id, task in self._tasks.items()} + tasks_dict = { + task_id: task.model_dump(mode="json") + for task_id, task in self._tasks.items() + } self.save_data("tasks", tasks_dict) - def add_task(self, video_file: str): - task = Task(video_path=video_file, add_time=datetime.now()) + def add_task(self, video_file: str, skip_existing=True): + task = Task( + video_path=video_file, + add_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + params=TaskParams(skip_existing=skip_existing), + ) with self._tasks_lock: self._tasks[task.task_id] = task self._task_queue.put(task) self.save_tasks() logger.info(f"加入任务队列: {video_file}") - def add_media_file(self, path: str): + def add_media_file(self, path: str, skip_existing: bool = True): """ 添加新任务 """ @@ -1099,39 +1191,118 @@ class LexiAnnot(_PluginBase): else: raise RuntimeError("Plugin is shutting down. 
Cannot add new tasks.") + def delete_tasks(self, task_id: str | None): + historical_status = { + TaskStatus.COMPLETED, + TaskStatus.FAILED, + TaskStatus.CANCELED, + TaskStatus.IGNORED, + } + with self._tasks_lock: + if task_id is None: + tasks_to_delete = [ + task_id + for task_id, task in self._tasks.items() + if task.status in historical_status + ] + else: + task = self._tasks.get(task_id) + if task and task.status in historical_status: + tasks_to_delete = [task_id] + else: + tasks_to_delete = [] + for task_id in tasks_to_delete: + del self._tasks[task_id] + self.save_tasks() + + def task_interface(self, params: TasksApiParams) -> Response: + if params.operation == "DELETE": + logger.info("清空任务记录") + self.delete_tasks(params.task_id) + return Response(success=True) + def __update_config(self): with self._config_updating_lock: self.update_config( { - 'enabled': self._enabled, - 'annot_level': self._annot_level, - 'send_notify': self._send_notify, - 'onlyonce': self._onlyonce, - 'show_vocabulary_detail': self._show_vocabulary_detail, - 'sentence_translation': self._sentence_translation, - 'in_place': self._in_place, - 'enable_gemini': self._enable_gemini, - 'gemini_model': self._gemini_model, - 'gemini_apikey': self._gemini_apikey, - 'context_window': self._context_window, - 'max_retries': self._max_retries, - 'request_interval': self._request_interval, - 'ffmpeg_path': self._ffmpeg_path, - 'english_only': self._english_only, - 'when_file_trans': self._when_file_trans, - 'model_temperature': self._model_temperature, - 'show_phonetics': self._show_phonetics, - 'custom_files': self._custom_files, - 'accent_color': self._accent_color, - 'font_scaling': self._font_scaling, - 'opacity': self._opacity, - 'spacy_model': self._spacy_model, - 'exam_tags': self._exam_tags, - 'delete_data': self._delete_data, - 'libraries': self._libraries + "enabled": self._enabled, + "annot_level": self._annot_level, + "send_notify": self._send_notify, + "onlyonce": self._onlyonce, + "show_vocabulary_detail": self._show_vocabulary_detail, + "sentence_translation": self._sentence_translation, + "in_place": self._in_place, + "enable_gemini": self._enable_gemini, + "gemini_model": self._gemini_model, + "gemini_apikey": self._gemini_apikey, + "context_window": self._context_window, + "max_retries": self._max_retries, + "ffmpeg_path": self._ffmpeg_path, + "english_only": self._english_only, + "when_file_trans": self._when_file_trans, + "model_temperature": self._model_temperature, + "show_phonetics": self._show_phonetics, + "custom_files": self._custom_files, + "accent_color": self._accent_color, + "font_scaling": self._font_scaling, + "opacity": self._opacity, + "spacy_model": self._spacy_model, + "exam_tags": self._exam_tags, + "delete_data": self._delete_data, + "libraries": self._libraries, + "llm_provider": self._llm_provider, + "llm_base_url": self._llm_base_url, + "use_mp_agent": self._use_mp_agent, + "use_proxy": self._use_proxy, } ) + def _send_message( + self, + task: Task, + phase: Literal["start", "end"], + context: Context | None = None, + process_result: ProcessResult | None = None, + ): + if not self._send_notify: + return + video_path = Path(task.video_path) + media_name = video_path.name + if context and context.media_info and context.meta_info: + media_info = context.media_info + if media_info.type == MediaType.TV: + media_name = ( + f"{media_info.title_year} {context.meta_info.season_episode}" + ) + else: + media_name = f"{media_info.title_year}.{video_path.suffix}" + message = f"标题: {media_name}" + 
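+        # 通知分两个阶段:start 在任务开始处理前发送标题与海报;end 在任务结束后发送,附带处理结果、备注与 token 统计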
if phase == "start":
+            self.post_message(
+                title=f"【{self.plugin_name}】 任务开始",
+                image=context.media_info.get_message_image()
+                if context and context.media_info
+                else None,
+                mtype=NotificationType.Plugin,
+                text=f"{message}",
+            )
+        else:
+            result = "完成"
+            if process_result and process_result.status == TaskStatus.FAILED:
+                result = "失败"
+            elif process_result and process_result.status == TaskStatus.CANCELED:
+                result = "取消"
+            stat_str = f"\n{task.statistics.to_string()}" if task.statistics else ""
+            self.post_message(
+                title=f"【{self.plugin_name}】 任务{result}",
+                mtype=NotificationType.Plugin,
+                image=context.media_info.get_message_image()
+                if context and context.media_info
+                else None,
+                text=f"{message}\n备注:{process_result.message if process_result else ''}\n"
+                f"Tokens:{task.tokens_used:,}{stat_str}",
+            )
+
     def __process_tasks(self):
         """
         后台线程:处理任务队列
@@ -1140,7 +1311,7 @@ class LexiAnnot(_PluginBase):
         self.__load_data()
 
         if not self._loaded:
-            logger.warn('插件数据未加载')
+            logger.warn("插件数据未加载")
             self._enabled = False
             self.__update_config()
             logger.debug("🛑 Worker exiting...")
@@ -1148,7 +1319,7 @@ class LexiAnnot(_PluginBase):
         if self._enable_gemini:
             self._gemini_available = True
             if not self._gemini_apikey:
-                logger.warn("未提供GEMINI APIKEY")
+                logger.warn("未提供 APIKEY")
                 self._gemini_available = False
 
         while not self._shutdown_event.is_set():
@@ -1156,143 +1327,178 @@ class LexiAnnot(_PluginBase):
                 task = self._task_queue.get(timeout=1)
                 if task is None:
                     continue
-                tokens = self._total_token_count
+                context = MediaChain().recognize_by_path(path=task.video_path)
+                cb = None
+                res = ProcessResult(status=TaskStatus.FAILED, message="未知错误")
                 try:
                     task.status = TaskStatus.RUNNING
+                    self._send_message(task, "start", context)
                     with SpacyWorker(self._spacy_model) as worker:
-                        task.status = self.__process_file(task.video_path, worker)
+                        with get_openai_callback() as cb:
+                            res = self._process_file(
+                                task.video_path,
+                                worker,
+                                context,
+                                task.params.skip_existing,
+                            )
+                    task.status = res.status
+                    task.message = res.message
+                    task.statistics = res.statistics
                 except Exception as e:
                     task.status = TaskStatus.FAILED
-                    logger.error(f"处理 {task} 出错: {e}")
+                    task.message = str(e)
+                    logger.error(f"处理 {task.task_id} 出错: {e}")
+                    res = ProcessResult(status=TaskStatus.FAILED, message=str(e))
                 finally:
                     self._task_queue.task_done()
-                    task.complete_time = datetime.now()
-                    task.tokens_used = self._total_token_count - tokens
+                    task.complete_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    if cb:
+                        task.tokens_used = cb.total_tokens
+                        logger.info(f"任务 {task.task_id} 消耗 Tokens: "
+                                    f"Input ({cb.prompt_tokens:,}), Output ({cb.completion_tokens:,})")
                     self.save_tasks()
+                    self._send_message(task, "end", context, res)
             except queue.Empty:
                 continue
-        logger.debug(f"🛑 Worker thread {threading.get_ident():#x} received shutdown signal, exiting...")
+        logger.debug(
+            f"🛑 Worker thread {threading.get_ident():#x} received shutdown signal, exiting..."
+ ) - def __process_file(self, path: str, spacy_worker: SpacyWorker) -> TaskStatus: + def _process_file( + self, + path: str, + spacy_worker: SpacyWorker, + mediainfo: Context | None = None, + skip_existing: bool = True + ) -> ProcessResult: """ 处理视频文件 """ if not self._loaded: - return TaskStatus.FAILED - lexicon = self.__load_lexicon_from_local() - if not lexicon: + return ProcessResult(status=TaskStatus.FAILED, message="插件数据未加载") + lexi = self._load_lexicon_from_local() + if not lexi: logger.error("字典加载失败") - return TaskStatus.FAILED + return ProcessResult(status=TaskStatus.FAILED, message="字典加载失败") video = Path(path) if video.suffix.lower() not in settings.RMT_MEDIAEXT: - return TaskStatus.CANCELED + return ProcessResult(status=TaskStatus.CANCELED, message="不支持的文件格式") if not video.exists() or not video.is_file(): logger.warn(f"文件 {str(video)} 不存在, 跳过") - return TaskStatus.FAILED - subtitle = video.with_suffix(".en.ass") - if subtitle.exists(): - logger.warn(f"字幕文件 ({subtitle}) 已存在, 跳过") - return TaskStatus.IGNORED + return ProcessResult(status=TaskStatus.FAILED, message="文件不存在") + ass_file = video.with_suffix(".en.ass") + if ass_file.exists() and skip_existing: + logger.warn(f"字幕文件 ({ass_file}) 已存在, 跳过") + return ProcessResult(status=TaskStatus.IGNORED, message="字幕文件已存在") logger.info(f"📂 Processing file: {path}") - if self._send_notify: - message = f"正在处理文件: {path}" - self.post_message(title=f"【{self.plugin_name}】", - mtype=NotificationType.Plugin, - text=f"{message}") - ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else 'ffmpeg' - eng_mark = ['en', 'en-US', 'eng', 'en-GB', 'english', 'en-AU'] - embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path) + + ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else "ffmpeg" + eng_mark = ["en", "en-US", "eng", "en-GB", "english", "en-AU"] + embedded_subtitles = LexiAnnot._extract_subtitles_by_lang( + path, eng_mark, ffmpeg_path + ) if not embedded_subtitles: - return TaskStatus.CANCELED + return ProcessResult( + status=TaskStatus.CANCELED, message="未找到嵌入式英文文本字幕" + ) # order factor = 0, if 'SDH' in track['title'] # order factor = track['duration'], otherwise - embedded_subtitles = sorted(embedded_subtitles, - key=lambda track: track['duration']*(1-int('SDH' in track['title'])), - reverse=True) - ret_message = '' + embedded_subtitles = sorted( + embedded_subtitles, + key=lambda track: track["duration"] * (1 - int("SDH" in track["title"])), + reverse=True, + ) + ret_message = "" + stat = None if embedded_subtitles: - logger.info(f'提取到 {len(embedded_subtitles)} 条英语文本字幕') + logger.info(f"提取到 {len(embedded_subtitles)} 条英语文本字幕") for embedded_subtitle in embedded_subtitles: if self._shutdown_event.is_set(): - return TaskStatus.CANCELED - ass_subtitle = pysubs2.SSAFile.from_string(embedded_subtitle['subtitle'], format_='ass') - if embedded_subtitle.get('codec_id') == 'S_TEXT/UTF8': + return ProcessResult( + status=TaskStatus.CANCELED, message="任务已取消" + ) + ass_subtitle = SSAFile.from_string( + embedded_subtitle["subtitle"], format_="ass" + ) + if embedded_subtitle.get("codec_id") == "S_TEXT/UTF8": ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle) ass_subtitle = self.__set_style(ass_subtitle) - ass_subtitle = self.process_subtitles(ass_subtitle, lexicon.get('cefr'), lexicon.get('coca20k'), - lexicon.get('examinations'), lexicon.get('swear_words'), - spacy_worker) + ass_subtitle, stat = self.process_subtitles( + ass_subtitle, lexi, spacy_worker, mediainfo + ) if self._shutdown_event.is_set(): - return 
TaskStatus.CANCELED + return ProcessResult( + status=TaskStatus.CANCELED, message="任务已取消" + ) if ass_subtitle: try: - ass_subtitle.save(str(subtitle)) - ret_message = f"字幕已保存:{str(subtitle)}" - logger.info(f"字幕已保存:{str(subtitle)}") + ass_subtitle.save(str(ass_file)) + ret_message = "字幕已保存" + logger.info(f"字幕已保存:{str(ass_file)}") except Exception as e: - ret_message = f"字幕文件 {subtitle} 保存失败, {e}" - logger.error(f"字幕文件 {subtitle} 保存失败, {e}") + ret_message = f"字幕文件 {ass_file} 保存失败" + logger.error(f"字幕文件 {ass_file} 保存失败, {e}") break else: - logger.info(f"处理字幕{embedded_subtitle['codec_id']}-{embedded_subtitle['stream_id']}失败") + logger.info( + f"处理字幕{embedded_subtitle['codec_id']}-{embedded_subtitle['stream_id']}失败" + ) else: logger.warn(f"未能在{path}中找到可提取的英文字幕") if not ret_message: - ret_message = f"未能在{path}中找到可提取的英文字幕" + ret_message = "未能找到可提取的英文字幕" logger.info(f"✅ Finished: {path}") - if self._send_notify: - self.post_message(title=f"【{self.plugin_name}】", - mtype=NotificationType.Plugin, - text=f"{ret_message}") - return TaskStatus.COMPLETED + return ProcessResult(status=TaskStatus.COMPLETED, message=ret_message, statistics=stat) @cached(maxsize=1, ttl=1800) def __load_lexicon_version(self) -> Optional[str]: logger.info("正在检查远程词典文件版本...") - url = f'{self._lexicon_repo}master/version' + url = f"{self._lexicon_repo}master/version" version = RequestUtils().get(url, headers=settings.REPO_GITHUB_HEADERS()) if version is None: return None return version.strip() - @cached(maxsize=1, ttl=3600*6) - def __load_lexicon_from_local(self) -> Optional[Dict[str, Any]]: + @cached(maxsize=1, ttl=3600 * 24) + def _load_lexicon_from_local(self) -> Lexicon | None: data_path = self.get_data_path() - lexicon = {} try: - lexicon_path = data_path / 'lexicon.json' - with open(lexicon_path, 'r', encoding='utf-8') as f: - lexicon = json.load(f) + lexicon_path = data_path / "lexicon.json" + with open(lexicon_path, "r", encoding="utf-8") as f: + content = f.read() + lexicon_model = Lexicon.model_validate_json(content) except Exception as e: - logger.debug(f"词典文件读取失败: {e}") - lexicon_files = ('cefr', 'coca20k', 'swear_words', 'examinations', 'version') - if any(file not in lexicon for file in lexicon_files): + logger.error(f"词典文件加载失败: {e}") return None - return lexicon + return lexicon_model - def __retrieve_lexicon_online(self, version: str) -> Optional[Dict[str, Any]]: - logger.info('开始下载词典文件...') - lexicon_files = ['cefr', 'coca20k', 'swear_words', 'examinations'] - lexicon = {} + def _retrieve_lexicon_online(self, version: str) -> Lexicon | None: + logger.info("开始下载词典文件...") + lexicon_files = ["cefr", "coca20k", "swear_words", "examinations"] + lexicon_dict = {} for file in lexicon_files: - url = f'{self._lexicon_repo}master/{file}.json' + url = f"{self._lexicon_repo}master/{file}.json" res = RequestUtils().get_res(url, headers=settings.REPO_GITHUB_HEADERS()) + if not res: + return None if res.status_code == 200: - lexicon[file] = res.json() - if any(file not in lexicon for file in lexicon_files): + lexicon_dict[file] = res.json() + if any(file not in lexicon_dict for file in lexicon_files): return None logger.info(f"词典文件 (v{version}) 下载完成") data_path = self.get_data_path() - lexicon['version'] = version + lexicon_dict["version"] = version try: - lexicon_path = data_path / 'lexicon.json' - with open(lexicon_path, 'w', encoding='utf-8') as f: - json.dump(lexicon, f, ensure_ascii=False, indent=2) + lexicon_path = data_path / "lexicon.json" + with open(lexicon_path, "w", encoding="utf-8") as f: + json.dump(lexicon_dict, 
f, ensure_ascii=False, indent=2) + lexi = Lexicon.model_validate(lexicon_dict) except Exception as e: logger.warn(f"词典文件保存失败: {e}") - return lexicon + return None + return lexi def __load_data(self): """ @@ -1305,17 +1511,19 @@ class LexiAnnot(_PluginBase): except RuntimeError: nlp = LexiAnnot.__download_spacy_model(self._spacy_model) - lexicon = self.__load_lexicon_from_local() - latest = self.__load_lexicon_version() or '0.0.0' - if not lexicon or StringUtils.compare_version(lexicon.get('version') or '0.0.0', '<', latest): - lexicon = self.__retrieve_lexicon_online(latest) - - if not (nlp and lexicon): + lexi = self._load_lexicon_from_local() + latest = self.__load_lexicon_version() or "0.0.0" + if not lexi or StringUtils.compare_version( + lexi.version or "0.0.0", "<", latest + ): + lexi = self._retrieve_lexicon_online(latest) + self._load_lexicon_from_local.cache_clear() + if not (nlp and lexi): self._loaded = False logger.warn("插件数据加载失败") else: self._loaded = True - logger.info(f"当前词典文件版本: {lexicon.get('version')}") + logger.info(f"当前词典文件版本: {lexi.version}") @staticmethod def __download_spacy_model(model_name: str) -> bool: @@ -1325,7 +1533,7 @@ class LexiAnnot(_PluginBase): [sys.executable, "-m", "spacy", "download", model_name], capture_output=True, text=True, - check=True + check=True, ) with SpacyWorker(model_name): nlp = True @@ -1351,16 +1559,25 @@ class LexiAnnot(_PluginBase): # 入库数据 transfer_info: TransferInfo | None = event_info.get("transferinfo") - if not transfer_info or not transfer_info.target_diritem or not transfer_info.target_diritem.path: + if ( + not transfer_info + or not transfer_info.target_diritem + or not transfer_info.target_diritem.path + ): return # 检查是否为选择的媒体库 in_libraries = False - libraries = {library.name: library.library_path for library in DirectoryHelper().get_library_dirs()} + libraries = { + library.name: library.library_path + for library in DirectoryHelper().get_library_dirs() + } for library_name in self._libraries: if library_name in libraries: ll = libraries[library_name] - if ll and Path(transfer_info.target_diritem.path).is_relative_to(Path(ll)): + if ll and Path(transfer_info.target_diritem.path).is_relative_to( + Path(ll) + ): in_libraries = True break if not in_libraries: @@ -1368,87 +1585,17 @@ class LexiAnnot(_PluginBase): mediainfo: MediaInfo | None = event_info.get("mediainfo") if self._english_only and mediainfo: - if mediainfo.original_language and mediainfo.original_language != 'en': - logger.info(f"原始语言 ({mediainfo.original_language}) 不为英语, 跳过 {mediainfo.title}: ") + if mediainfo.original_language and mediainfo.original_language not in { + "en", + "eng", + }: + logger.info( + f"原始语言 ({mediainfo.original_language}) 不为英语, 跳过 {mediainfo.title}: " + ) return for new_path in transfer_info.file_list_new or []: self.add_media_file(new_path) - @staticmethod - def query_cefr(word, cefr_lexicon): - word = word.lower().strip("-*'") - if word in cefr_lexicon: - return cefr_lexicon[word] - else: - return None - - @staticmethod - def query_coca20k(word: str, lexicon: Dict[str, Any]): - word = word.lower().strip("-*'") - return lexicon.get(word) - - @staticmethod - def query_examinations(word: str, lexicon: Dict[str, Any]) -> Dict[str, Any]: - res = {} - for examination, exam_lexicon in lexicon.items(): - if word in exam_lexicon: - res[examination] = exam_lexicon[word] - return res - - @staticmethod - def convert_pos_to_spacy(pos: str): - """ - 将给定的词性列表转换为 spaCy 库中使用的词性标签 - :param pos: 字符串形式词性 - :returns: 一个包含对应spaCy词性标签的列表。对于无法直接映射的词性,将返回None - 
""" - spacy_pos_map = { - 'noun': 'NOUN', - 'adjective': 'ADJ', - 'adverb': 'ADV', - 'verb': 'VERB', - 'preposition': 'ADP', - 'conjunction': 'CCONJ', - 'determiner': 'DET', - 'pronoun': 'PRON', - 'interjection': 'INTJ', - 'number': 'NUM' - } - - pos_lower = pos.lower() - if pos_lower in spacy_pos_map: - spacy_pos = spacy_pos_map[pos_lower] - elif pos_lower == 'be-verb': - spacy_pos = 'AUX' # Auxiliary verb (e.g., be, do, have) - elif pos_lower == 'vern': - spacy_pos = 'VERB' # Assuming 'vern' is a typo for 'verb' - elif pos_lower == 'modal auxiliary': - spacy_pos = 'AUX' # Modal verbs are also auxiliaries - elif pos_lower == 'do-verb': - spacy_pos = 'AUX' - elif pos_lower == 'have-verb': - spacy_pos = 'AUX' - elif pos_lower == 'infinitive-to': - spacy_pos = 'PART' # Particle (e.g., to in "to go") - elif not pos_lower: # Handle empty strings - spacy_pos = None - else: - spacy_pos = None # For unmapped POS tags - return spacy_pos - - @staticmethod - def get_cefr_by_spacy(lemma_: str, pos_: str, cefr_lexicon: Dict[str, Any]) -> Optional[str]: - result = LexiAnnot.query_cefr(lemma_, cefr_lexicon) - if result: - all_cefr = [] - if len(result) > 0: - for entry in result: - if pos_ == LexiAnnot.convert_pos_to_spacy(entry['pos']): - return entry['cefr'] - all_cefr.append(entry['cefr']) - return min(all_cefr) - return None - @staticmethod def format_duration(ms): total_seconds, milliseconds = divmod(ms, 1000) @@ -1457,6 +1604,19 @@ class LexiAnnot(_PluginBase): hundredths = milliseconds // 10 return f"{hours}:{minutes:02}:{seconds:02}.{hundredths:02}" + @staticmethod + def _remove_substring(replacements: list[dict]): + new_list = [] + replacements.sort(key=lambda x: x["end"] - x["start"], reverse=True) + for r in replacements: + if any( + (r["start"] >= new["start"] and r["end"] <= new["end"]) + for new in new_list + ): + continue + new_list.append(r) + return new_list + @staticmethod def replace_by_plaintext_positions(line: SSAEvent, replacements: List[dict]): """ @@ -1488,7 +1648,7 @@ class LexiAnnot(_PluginBase): mapping[p_index] = t_index p_index += 1 t_index += 1 - + replacements = LexiAnnot._remove_substring(replacements) # 按照 mapping 执行替换(倒序替换防止位置错位) new_text = text for r in sorted(replacements, key=lambda x: x["start"], reverse=True): @@ -1505,27 +1665,27 @@ class LexiAnnot(_PluginBase): def analyze_ass_language(ass_file: SSAFile): styles = {} for style in ass_file.styles: - styles[style] = {'text': [], 'duration': 0, 'text_size': 0, 'times': 0} + styles[style] = {"text": [], "duration": 0, "text_size": 0, "times": 0} for dialogue in ass_file: style = dialogue.style text = dialogue.plaintext - sub_text = text.split('\n') + sub_text = text.split("\n") if style not in styles or not text: continue - styles[style]['text'].extend(sub_text) - styles[style]['duration'] += dialogue.duration - styles[style]['text_size'] += len(text) - styles[style]['times'] += 1 + styles[style]["text"].extend(sub_text) + styles[style]["duration"] += dialogue.duration + styles[style]["text_size"] += len(text) + styles[style]["times"] += 1 style_language_analysis = {} for style_name, data in styles.items(): - all_text = ' '.join(data['text']) + all_text = " ".join(data["text"]) if not all_text.strip(): style_language_analysis[style_name] = None continue languages = [] # 对每个文本片段进行语言检测 - for text_fragment in data['text']: + for text_fragment in data["text"]: try: lang = detect(text_fragment) languages.append(lang) @@ -1537,19 +1697,22 @@ class LexiAnnot(_PluginBase): if languages: language_counts = Counter(languages) 
most_common_language = language_counts.most_common(1)[0] - style_language_analysis[style_name] = {"main_language": most_common_language[0], - "proportion": most_common_language[1] / len(languages), - "duration": data['duration'], - "text_size": data['text_size'], - "times": data['times']} + style_language_analysis[style_name] = { + "main_language": most_common_language[0], + "proportion": most_common_language[1] / len(languages), + "duration": data["duration"], + "text_size": data["text_size"], + "times": data["times"], + } else: style_language_analysis[style_name] = None return style_language_analysis @staticmethod - def select_main_style_weighted(language_analysis: Dict[str, Any], known_language: str, - weights=None): + def select_main_style_weighted( + language_analysis: Dict[str, Any], known_language: str, weights=None + ): """ 根据语言分析结果和已知的字幕语言,使用加权评分选择主要样式 @@ -1559,23 +1722,32 @@ class LexiAnnot(_PluginBase): :returns: 主要字幕的样式名称,如果没有匹配的样式则返回 None """ if weights is None: - weights = {'times': 0.5, 'text_size': 0.4, 'duration': 0.1} + weights = {"times": 0.5, "text_size": 0.4, "duration": 0.1} matching_styles = [] - max_times = max([analysis.get('times', 0) for _, analysis in language_analysis.items() if analysis]) or 1 - max_text_size = max( - [analysis.get('text_size', 0) for _, analysis in language_analysis.items() if analysis]) or 1 - max_duration = max([analysis.get('duration', 0) for _, analysis in language_analysis.items() if analysis]) or 1 + max_times = max([analysis.get("times", 0) for _, analysis in language_analysis.items() if analysis]) or 1 + max_text_size = ( + max([analysis.get("text_size", 0) for _, analysis in language_analysis.items() if analysis]) or 1) + max_duration = ( + max( + [ + analysis.get("duration", 0) + for _, analysis in language_analysis.items() + if analysis + ] + ) + or 1 + ) for style, analysis in language_analysis.items(): if not analysis: continue - if analysis.get('main_language') == known_language: + if analysis.get("main_language") == known_language: # 跳过多语言 - if analysis.get('proportion', 0) < 0.5: + if analysis.get("proportion", 0) < 0.5: continue score = 0 - score += analysis.get('times', 0) * weights.get('times', 0) / max_times - score += analysis.get('text_size', 0) * weights.get('text_size', 0) / max_text_size - score += analysis.get('duration', 0) * weights.get('duration', 0) / max_duration + score += analysis.get("times", 0) * weights.get("times", 0) / max_times + score += analysis.get("text_size", 0) * weights.get("text_size", 0) / max_text_size + score += analysis.get("duration", 0) * weights.get("duration", 0) / max_duration matching_styles.append((style, score)) if not matching_styles: @@ -1586,67 +1758,80 @@ class LexiAnnot(_PluginBase): @staticmethod def set_srt_style(ass: SSAFile) -> SSAFile: - ass.info['ScaledBorderAndShadow'] = 'no' - play_res_y = int(ass.info['PlayResY']) - if 'Default' in ass.styles: - ass.styles['Default'].marginv = play_res_y // 16 - ass.styles['Default'].fontname = 'Microsoft YaHei' - ass.styles['Default'].fontsize = play_res_y // 16 + ass.info["ScaledBorderAndShadow"] = "no" + play_res_y = int(ass.info["PlayResY"]) + if "Default" in ass.styles: + ass.styles["Default"].marginv = play_res_y // 16 + ass.styles["Default"].fontname = "Microsoft YaHei" + ass.styles["Default"].fontsize = play_res_y // 16 return ass def __set_style(self, ass: SSAFile) -> SSAFile: - font_scaling = float(self._font_scaling) if self._font_scaling and len(self._font_scaling) else 1 - play_res_y = int(ass.info['PlayResY']) - play_res_x 
= int(ass.info['PlayResX']) + font_scaling = ( + float(self._font_scaling) + if self._font_scaling and len(self._font_scaling) + else 1 + ) + play_res_y = int(ass.info["PlayResY"]) + play_res_x = int(ass.info["PlayResX"]) # 创建一个新样式 fs = play_res_y // 16 * font_scaling - new_style = pysubs2.SSAStyle() - new_style.name = 'Annotation EN' - new_style.fontname = 'Times New Roman' + new_style = SSAStyle() + new_style.name = "Annotation EN" + new_style.fontname = "Times New Roman" new_style.fontsize = fs - new_style.primarycolor = pysubs2.Color(self._accent_color_rgb[0], - self._accent_color_rgb[1], - self._accent_color_rgb[2], - self._color_alpha) # 黄色 (BGR, alpha) + new_style.primarycolor = Color( + self._accent_color_rgb[0], + self._accent_color_rgb[1], + self._accent_color_rgb[2], + self._color_alpha, + ) # 黄色 (BGR, alpha) new_style.bold = True new_style.italic = False new_style.outline = 1 new_style.shadow = 0 - new_style.alignment = pysubs2.Alignment.TOP_LEFT + new_style.alignment = Alignment.TOP_LEFT new_style.marginl = play_res_x // 20 new_style.marginr = play_res_x // 20 new_style.marginv = int(fs) - ass.styles['Annotation EN'] = new_style + ass.styles["Annotation EN"] = new_style zh_style = new_style.copy() - zh_style.name = 'Annotation ZH' - zh_style.fontname = 'Microsoft YaHei' - zh_style.primarycolor = pysubs2.Color(255, 255, 255, self._color_alpha) - ass.styles['Annotation ZH'] = zh_style + zh_style.name = "Annotation ZH" + zh_style.fontname = "Microsoft YaHei" + zh_style.primarycolor = Color(255, 255, 255, self._color_alpha) + ass.styles["Annotation ZH"] = zh_style + + usage_style = zh_style.copy() + usage_style.name = "Annotation USAGE" + usage_style.fontsize = fs * 0.5 + usage_style.italic = True + usage_style.primarycolor = Color(224, 224, 224, self._color_alpha) + ass.styles["Annotation USAGE"] = usage_style pos_style = zh_style.copy() - pos_style.name = 'Annotation POS' - pos_style.fontname = 'Times New Roman' + pos_style.name = "Annotation POS" + pos_style.fontname = "Times New Roman" pos_style.fontsize = fs * 0.75 pos_style.italic = True - ass.styles['Annotation POS'] = pos_style + ass.styles["Annotation POS"] = pos_style phone_style = pos_style.copy() - phone_style.name = 'Annotation PHONE' - phone_style.fontname = 'Arial' + phone_style.name = "Annotation PHONE" + phone_style.fontname = "Arial" phone_style.fontsize = fs * 0.75 phone_style.bold = False phone_style.italic = False - ass.styles['Annotation PHONE'] = phone_style + ass.styles["Annotation PHONE"] = phone_style pos_def_cn_style = zh_style.copy() - pos_def_cn_style.name = 'DETAIL CN' + pos_def_cn_style.name = "DETAIL CN" pos_def_cn_style.fontsize = fs * 0.7 - ass.styles['DETAIL CN'] = pos_def_cn_style + ass.styles["DETAIL CN"] = pos_def_cn_style pos_def_pos_style = pos_style.copy() - pos_def_pos_style.name = 'DETAIL POS' + pos_def_pos_style.name = "DETAIL POS" pos_def_pos_style.fontsize = fs * 0.6 - ass.styles['DETAIL POS'] = pos_def_pos_style + ass.styles["DETAIL POS"] = pos_def_pos_style cefr_style = pos_style.copy() cefr_style.name = "Annotation CEFR" @@ -1654,55 +1839,58 @@ class LexiAnnot(_PluginBase): cefr_style.fontsize = fs * 0.5 cefr_style.bold = True cefr_style.italic = False - cefr_style.primarycolor = pysubs2.Color(self._accent_color_rgb[0], - self._accent_color_rgb[1], - self._accent_color_rgb[2], - self._color_alpha) + cefr_style.primarycolor = Color( + self._accent_color_rgb[0], + self._accent_color_rgb[1], + self._accent_color_rgb[2], + self._color_alpha, + ) cefr_style.outline = 1 cefr_style.shadow 
= 0 - ass.styles['Annotation CEFR'] = cefr_style - ass.styles['Annotation EXAM'] = cefr_style + ass.styles["Annotation CEFR"] = cefr_style + ass.styles["Annotation EXAM"] = cefr_style return ass @staticmethod def hex_to_rgb(hex_color) -> Optional[Tuple]: if not hex_color: return None - pattern = r'^#[0-9a-fA-F]{6}$' + pattern = r"^#[0-9a-fA-F]{6}$" if re.match(pattern, hex_color) is None: return None - hex_color = hex_color.lstrip('#') # 去掉前面的 # - return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4)) + hex_color = hex_color.lstrip("#") # 去掉前面的 # + return tuple(int(hex_color[i: i + 2], 16) for i in (0, 2, 4)) @staticmethod - def __extract_subtitle(video_path: str, - subtitle_stream_index: str, - ffmpeg_path: str = 'ffmpeg', - sub_format='ass') -> Optional[str]: - if sub_format not in ['srt', 'ass']: - raise ValueError('Invalid subtitle format') + def __extract_subtitle( + video_path: str, + subtitle_stream_index: str, + ffmpeg_path: str = "ffmpeg", + sub_format="ass", + ) -> Optional[str]: + if sub_format not in ["srt", "ass"]: + raise ValueError("Invalid subtitle format") try: map_parameter = f"0:s:{subtitle_stream_index}" - command = [ - ffmpeg_path, - '-i', video_path, - '-map', map_parameter, - '-f', sub_format, - '-' - ] - result = subprocess.run(command, capture_output=True, text=True, encoding='utf-8', check=True) + command = [ffmpeg_path, "-i", video_path, "-map", map_parameter, "-f", sub_format, "-"] + result = subprocess.run( + command, capture_output=True, text=True, encoding="utf-8", check=True + ) return result.stdout except FileNotFoundError: logger.warn(f"错误:找不到视频文件 '{video_path}'") return None except subprocess.CalledProcessError as e: logger.warn(f"错误:提取字幕失败。\n错误信息:{e}") - logger.warn(f"FFmpeg 输出 (stderr):\n{e.stderr.decode('utf-8', errors='ignore')}") + logger.warn( + f"FFmpeg 输出 (stderr):\n{e.stderr.decode('utf-8', errors='ignore')}" + ) return None @staticmethod - def _extract_subtitles_by_lang(video_path: str, lang: str | list = 'en', ffmpeg: str = 'ffmpeg' - ) -> Optional[List[Dict]]: + def _extract_subtitles_by_lang( + video_path: str, lang: str | list = "en", ffmpeg: str = "ffmpeg" + ) -> Optional[List[Dict]]: """ 提取视频文件中的内嵌英文字幕,使用 MediaInfo 查找字幕流。 """ @@ -1712,28 +1900,42 @@ class LexiAnnot(_PluginBase): return track_lang in lang return track_lang == lang - supported_codec = ['S_TEXT/UTF8', 'S_TEXT/ASS'] + supported_codec = ["S_TEXT/UTF8", "S_TEXT/ASS"] subtitles = [] try: media_info: pymediainfo.MediaInfo = pymediainfo.MediaInfo.parse(video_path) for track in media_info.tracks: - if (track.track_type == 'Text' and check_lang(track_lang=track.language) - and track.codec_id in supported_codec): - subtitle_stream_index = track.stream_identifier # MediaInfo 的 stream_id 从 1 开始,ffmpeg 从 0 开始 - subtitle = LexiAnnot.__extract_subtitle(video_path, subtitle_stream_index, ffmpeg) + if ( + track.track_type == "Text" + and check_lang(track_lang=track.language) + and track.codec_id in supported_codec + ): + subtitle_stream_index = ( + track.stream_identifier + ) # MediaInfo 的 stream_id 从 1 开始,ffmpeg 从 0 开始 + extracted_subtitle = LexiAnnot.__extract_subtitle( + video_path, subtitle_stream_index, ffmpeg + ) duration = 0 - if hasattr(track, 'duration'): + if hasattr(track, "duration"): try: duration = int(float(track.duration)) except (ValueError, TypeError): pass - if subtitle: - subtitles.append({'title': track.title or '', 'subtitle': subtitle, 'codec_id': track.codec_id, - 'stream_id': subtitle_stream_index, 'duration': duration}) + if extracted_subtitle: + 
subtitles.append( + { + "title": track.title or "", + "subtitle": extracted_subtitle, + "codec_id": track.codec_id, + "stream_id": subtitle_stream_index, + "duration": duration, + } + ) if subtitles: return subtitles else: - logger.warn('未找到标记为英语的文本字幕流') + logger.warn("未找到标记为英语的文本字幕流") return None except FileNotFoundError: @@ -1747,309 +1949,189 @@ class LexiAnnot(_PluginBase): logger.error(f"使用 MediaInfo 提取字幕时发生错误:{e}") return None - def __query_gemini( + def _process_chain( self, - tasks: TranslationTasks, - api_key: str, - system_instruction: str, - model: str, - temperature: float - ) -> List[T]: - response = translate( - api_key=api_key, - translation_tasks=tasks, - system_instruction=system_instruction, - gemini_model=model, - temperature=temperature, - max_retries=self._max_retries + segments: SegmentList, + lexi: Lexicon, + spacy_worker: SpacyWorker, + mediainfo: Context | None = None, + ) -> SegmentList: + """ + 处理字幕行 + + :param segments: 待处理的字幕 + :param lexi: 词典对象 + :param spacy_worker: spaCy 分词器 + :returns: 处理后的字幕行列表 + """ + simple_vocabulary = set( + filter( + lambda x: x < self._annot_level, ["A1", "A2", "B1", "B2", "C1", "C2"] + ) ) - if not response.success: - logger.warning(f"Error in response: {response.message}") - return tasks.tasks - - self._total_token_count += response.total_token_count - return response.tasks - - def __process_by_ai(self, lines_to_process: List[Dict[str, Any]], - cefr_lexicon: Dict[str, Any], - coca20k_lexicon: Dict[str, Any], - exams_lexicon: Dict[str, Any], - swear_words: List[str], - spacy_worker: SpacyWorker): - - def __replace_with_spaces(_text): - """ - 使用等长的空格替换文本中的 [xxx] 模式。 - 例如:"[Hi]" 会被替换成 " " (4个空格) - """ - pattern = r'(\[.*?\])' - return re.sub(pattern, lambda match: ' ' * len(match.group(1)), _text) - - simple_vocabulary = list(filter(lambda x: x < self._annot_level, ['A1', 'A2', 'B1', 'B2', 'C1', 'C2'])) - patterns = [r'\d+th|\d?1st|\d?2nd|\d?3rd', r"\w+'s$", r"\w+'d$", r"\w+'t$", "[Ii]'m$", r"\w+'re$", r"\w+'ve$", r"\w+'ll$"] - compiled_patterns = [re.compile(p) for p in patterns] - model_temperature = float(self._model_temperature) if self._model_temperature else 0.3 + # model_temperature = float(self._model_temperature) if self._model_temperature else 0.1 logger.info("通过 spaCy 分词...") - vocabulary_trans_instruction = '''You are an expert translator. You will be given a list of English words along with their context, formatted as JSON. For each entry, provide the most appropriate translation in Simplified Chinese based on the context. -Only complete the `Chinese` field. 
Do not include pinyin, explanations, or any additional information.''' - # 使用nlp分词 - for line_data in lines_to_process: + for seg in segments: if self._shutdown_event.is_set(): - return lines_to_process - text_raw = line_data.get('raw_subtitle') - text = text_raw.replace('\n', ' ') - text = __replace_with_spaces(text) - new_vocab = [] - doc = spacy_worker.submit(text) - last_end_pos = 0 - lemma_to_query = [] - for token in doc: - if len(token['text']) == 1: - continue - if token['lemma_'] in swear_words: - continue - if token['pos_'] not in ('NOUN', 'AUX', 'VERB', 'ADJ', 'ADV', 'ADP', 'CCONJ', 'SCONJ'): - continue - striped = token['lemma_'].strip('-[') - if any(p.match(striped) for p in compiled_patterns): - continue - cefr = LexiAnnot.get_cefr_by_spacy(striped, token['pos_'], cefr_lexicon) - if cefr and cefr in simple_vocabulary: - continue - res_of_coco = LexiAnnot.query_coca20k(striped, coca20k_lexicon) - if res_of_coco and not cefr: - cefr = '' - res_of_exams = self.query_examinations(striped, exams_lexicon) - exam_tags = [] - if res_of_exams: - exam_tags = [exam_id for exam_id in res_of_exams if exam_id in self._exam_tags] - if striped in lemma_to_query: - continue - else: - lemma_to_query.append(striped) - striped_text = token['text'].strip('-*[') - start_pos = text.find(striped_text, last_end_pos) - end_pos = start_pos + len(striped_text) - phonetics = '' - pos_defs = [] - if res_of_exams: - for exam, value in res_of_exams.items(): - phonetics = value.get('ipa_uk') or '' - defs = {} - for pos_def in value.get('defs', []): - pos = pos_def.get('pos', '') - definition_cn = pos_def.get('definition_cn', '') - defs.setdefault(pos, []).append(definition_cn) - pos_defs = [{'pos': pos, 'meanings': meanings} for pos, meanings in defs.items() if pos] - break - elif res_of_coco: - phonetics = res_of_coco.get('phonetics_1') or '' - pos_defs = res_of_coco.get('pos_defs') or [] - last_end_pos = end_pos - new_vocab.append({'start': start_pos, 'end': end_pos, 'text': striped_text, 'lemma': striped, - 'pos': token['pos_'], 'cefr': cefr, 'Chinese': '', 'phonetics': phonetics, - 'pos_defs': pos_defs, 'exam_tags': exam_tags}) - line_data['new_vocab'] = new_vocab - # 查询词汇翻译 - task_bulk: List[VocabularyTranslationTask] = [] - i = 0 + return segments + seg.candidate_words = extract_advanced_words( + segment=seg, + lexi=lexi, + spacy_worker=spacy_worker, + simple_level=simple_vocabulary, + exams=self._exam_tags, + ) if self._gemini_available: - logger.info("查询词汇翻译...") - for line_data in lines_to_process: - if self._shutdown_event.is_set(): - return lines_to_process - if not self._gemini_available: - break - i += 1 - if not (len(line_data["new_vocab"]) or (i == len(lines_to_process) and len(task_bulk))): - continue - new_vocab = [Vocabulary(lemma=new_vocab['lemma'], Chinese='') for new_vocab in line_data['new_vocab']] - task_bulk.append( - VocabularyTranslationTask( - index=line_data['index'], - id=f"{line_data['index']}", - vocabulary=new_vocab, - context=Context( - original_text=line_data['raw_subtitle'].replace('\n', ' ') - ) - ) + if self._use_mp_agent: + llm_apikey = settings.LLM_API_KEY + llm_base_url = settings.LLM_BASE_URL + llm_model_name = settings.LLM_MODEL + llm_provider = settings.LLM_PROVIDER.lower() + else: + llm_apikey = self._gemini_apikey + llm_base_url = self._llm_base_url + llm_model_name = self._gemini_model + llm_provider = self._llm_provider.lower() + llm = initialize_llm( + provider=llm_provider, + model_name=llm_model_name, + base_url=llm_base_url, + api_key=llm_apikey, + 
temperature=self._model_temperature, + max_retries=self._max_retries, + proxy=self._use_proxy, ) - if len(task_bulk) >= self._context_window or (len(task_bulk) and i == len(lines_to_process)): - logger.info(f"processing dialogues: " - f"{LexiAnnot.format_duration(lines_to_process[task_bulk[0].index]['time_code'][0])} -> " - f"{LexiAnnot.format_duration(lines_to_process[i - 1]['time_code'][1])}") - answer: List[VocabularyTranslationTask] = self.__query_gemini( - TranslationTasks[VocabularyTranslationTask](tasks=task_bulk), - self._gemini_apikey, - vocabulary_trans_instruction, - self._gemini_model, - model_temperature - ) - if not answer: - continue - time.sleep(self._request_interval) - for answer_line in answer: - answer_lemma = tuple(v.lemma for v in answer_line.vocabulary) - filtered_raw = [x for x in lines_to_process if x.get('index') == answer_line.index] - if not len(filtered_raw): - logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.context.original_text}') - available_answer = False - for item in filtered_raw: - lemma = tuple(v['lemma'] for v in item['new_vocab']) - if lemma == answer_lemma: - available_answer = True - for i_, v in enumerate(item['new_vocab']): - v['Chinese'] = answer_line.vocabulary[i_].Chinese - break - if not available_answer: - logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.context.original_text}') - task_bulk = [] - if not self._sentence_translation: - return lines_to_process - if self._gemini_available: - logger.info("查询整句翻译...") - # 查询整句翻译 - translation_tasks: List[DialogueTranslationTask] = [] - for line_data in lines_to_process: - translation_tasks.append( - DialogueTranslationTask( - id=f"{line_data['index']}", - index=line_data['index'], - original_text=line_data['raw_subtitle'].replace('\n', ' '), - Chinese='' - ) + segments = llm_process_chain( + lexi=lexi, + llm=llm, + segments=segments, + shutdown_event=self._shutdown_event, + context_window=self._context_window, + leaner_level=self._annot_level, + media_context=mediainfo, + translate_sentences=self._sentence_translation ) - i = 0 - dialog_trans_instruction = '''You are an expert translator. You will be given a list of dialogue translation tasks in JSON format. For each entry, provide the most appropriate translation in Simplified Chinese based on the context. -Only complete the `Chinese` field. 
Do not include pinyin, explanations, or any additional information.''' - while i < len(translation_tasks): - if self._shutdown_event.is_set(): - return lines_to_process - if not self._gemini_available: - break - start_index = max(0, i - 1) - end_index = min(len(translation_tasks), i + self._context_window + 1) - task_bulk: List[DialogueTranslationTask] = translation_tasks[start_index:end_index] - logger.info(f"processing dialogues: " - f"{LexiAnnot.format_duration(lines_to_process[i]['time_code'][0])} -> " - f"{LexiAnnot.format_duration(lines_to_process[min(len(translation_tasks), i + self._context_window) - 1]['time_code'][1])}") - answer: List[DialogueTranslationTask] = self.__query_gemini( - TranslationTasks[DialogueTranslationTask](tasks=task_bulk), - self._gemini_apikey, - dialog_trans_instruction, - self._gemini_model, - model_temperature - ) - time.sleep(self._request_interval) - for answer_line in answer: - if answer_line.index not in range(i, i + self._context_window): - continue - filtered_raw = [x for x in lines_to_process if x.get('index') == answer_line.index] - if not len(filtered_raw): - logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.original_text}') - available_answer = False - for item in filtered_raw: - if item['raw_subtitle'].replace('\n', ' ') == answer_line.original_text: - available_answer = True - item['Chinese'] = answer_line.Chinese - break - if not available_answer: - logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.original_text}') - i += self._context_window - return lines_to_process + return segments - def process_subtitles(self, ass_file: SSAFile, - cefr_lexicon: Dict[str, Any], - coca20k_lexicon: Dict[str, Any], - exams_lexicon: Dict[str, Any], - swear_words: List[str], - spacy_worker: SpacyWorker) -> Optional[SSAFile]: + def process_subtitles( + self, + ass_file: SSAFile, + lexi: Lexicon, + spacy_worker: SpacyWorker, + mediainfo: Context | None = None, + ) -> tuple[SSAFile | None, SegmentStatistics | None]: """ 处理字幕内容,标记词汇并添加翻译。 """ - lang = 'en' - abgr_str = (f'&H{self._color_alpha:02x}{self._accent_color_rgb[2]:02x}' - f'{self._accent_color_rgb[1]:02x}{self._accent_color_rgb[0]:02x}&') # &H00FFFFFF& - pos_map = { - 'NOUN': 'n.', - 'AUX': 'aux.', - 'VERB': 'v.', - 'ADJ': 'adj.', - 'ADV': 'adv.', - 'ADP': 'prep.', - 'CCONJ': 'conj.', - 'SCONJ': 'conj.' 
- } + lang = "en" + abgr_str = ( + f"&H{self._color_alpha:02x}{self._accent_color_rgb[2]:02x}" + f"{self._accent_color_rgb[1]:02x}{self._accent_color_rgb[0]:02x}&" + ) # &H00FFFFFF& + statistical_res = LexiAnnot.analyze_ass_language(ass_file) - main_style: str | None = LexiAnnot.select_main_style_weighted(statistical_res, lang) + main_style: str | None = LexiAnnot.select_main_style_weighted( + statistical_res, lang + ) if not main_style: - logger.error('无法确定主要字幕样式') - return None - index = 0 - lines_to_process = [] - main_dialogue: Dict[int, SSAEvent] = {} + logger.error("无法确定主要字幕样式") + return None, None + # main_dialogue: Dict[int, SSAEvent] = {} + main_processor = SubtitleProcessor() + IDGenerator().reset() for dialogue in ass_file: if dialogue.style != main_style: continue - time_code = (dialogue.start, dialogue.end) - text_raw = dialogue.plaintext - line_data = {'index': index, 'time_code': time_code, 'raw_subtitle': text_raw, 'new_vocab': [], - 'Chinese': ''} - lines_to_process.append(line_data) - main_dialogue[index] = dialogue - index += 1 - lines_to_process = self.__process_by_ai(lines_to_process, cefr_lexicon, coca20k_lexicon, exams_lexicon, - swear_words, spacy_worker) - + main_processor.append(dialogue) + segments = SegmentList(root=list(main_processor.segment_generator())) + segments = self._process_chain( + segments=segments, lexi=lexi, spacy_worker=spacy_worker, mediainfo=mediainfo + ) # 在原字幕添加标注 main_style_fs = ass_file.styles[main_style].fontsize - for line_data in lines_to_process: + __N = r"\N" + for seg in segments: if self._shutdown_event.is_set(): - return None - if line_data['new_vocab']: - replacements = line_data['new_vocab'] - for replacement in replacements: - part_of_speech = f"{{\\fnTimes New Roman\\fs{int(main_style_fs * 0.75)}\\i1}}{pos_map[replacement['pos']]}{{\\r}}" - new_text = f"{{\\c{abgr_str}}}{replacement['text']}{{\\r}}" + return None, None + if seg.candidate_words: + replacements = [] + for word in seg.candidate_words: + exams = [exam for exam in word.exams if exam in self._exam_tags] + new_text = f"{{\\c{abgr_str}}}{word.text}{{\\r}}" if self._in_place: - new_text = new_text + f" ({replacement['Chinese']} {part_of_speech})" if replacement[ - 'Chinese'] else "" + part_of_speech = f"{{\\fnTimes New Roman\\fs{int(main_style_fs * 0.75)}\\i1}}{UNIVERSAL_POS_MAP[word.pos] or ''}{{\\r}}" + new_text = ( + new_text + f" ({word.llm_translation} {part_of_speech})" + if word.llm_translation + else "" + ) else: - dialogue = pysubs2.SSAEvent() - dialogue.start = main_dialogue[line_data['index']].start - dialogue.end = main_dialogue[line_data['index']].end - dialogue.style = 'Annotation EN' - cefr_text = f" {{\\rAnnotation CEFR}}{replacement['cefr']}{{\\r}}" \ - if replacement['cefr'] else "" - exam_text = f" {{\\rAnnotation EXAM}}{' '.join(replacement['exam_tags'])}{{\\r}}" \ - if replacement['exam_tags'] else "" - __N = r'\N' - phone_text = f"{__N}{{\\rAnnotation PHONE}}/{replacement['phonetics']}/{{\\r}}" if replacement['phonetics'] and self._show_phonetics else "" - annot_text = f"{replacement['lemma']} {{\\rAnnotation POS}}{pos_map[replacement['pos']]}{{\\r}} {{\\rAnnotation ZH}}{replacement['Chinese']}{{\\r}}{cefr_text}{exam_text}{phone_text}" + dialogue = SSAEvent() + dialogue.start = main_processor[seg.index].start + dialogue.end = main_processor[seg.index].end + dialogue.style = "Annotation EN" + cefr_text = ( + f" {style_text('Annotation CEFR', word.cefr)}" + if word.cefr + else "" + ) + exam_text = ( + f" {style_text('Annotation EXAM', ' 
'.join(exams))}" + if exams + else "" + ) + phone_text = ( + f"{__N}{style_text('Annotation PHONE', f'/{word.phonetics}/')}" + if word.phonetics and self._show_phonetics + else "" + ) + annot_text = f"{word.lemma} {style_text('Annotation POS', UNIVERSAL_POS_MAP[word.pos] or '')} {style_text('Annotation ZH', word.llm_translation or '')}{cefr_text}{exam_text}{phone_text}" dialogue.text = annot_text ass_file.append(dialogue) - if self._show_vocabulary_detail and replacement['pos_defs']: - dialogue = pysubs2.SSAEvent() - dialogue.start = main_dialogue[line_data['index']].start - dialogue.end = main_dialogue[line_data['index']].end - dialogue.style = 'DETAIL CN' - detail_text = [] - for pos_def in replacement['pos_defs']: - meaning_str = ', '.join(pos_def['meanings']) - pos_text = f"{{\\rDETAIL POS}}{pos_def['pos']}{{\\r}} {meaning_str}" - detail_text.append(pos_text) - dialogue.text = '\\N'.join(detail_text) + if word.llm_usage_context: + dialogue = SSAEvent( + start=main_processor[seg.index].start, + style="DETAIL CN", + end=main_processor[seg.index].end, + text=style_text( + "Annotation USAGE", word.llm_usage_context + ), + ) ass_file.append(dialogue) - replacement['new_text'] = new_text - LexiAnnot.replace_by_plaintext_positions(main_dialogue[line_data['index']], replacements) + if self._show_vocabulary_detail and word.pos_defs: + dialogue = SSAEvent( + start=main_processor[seg.index].start, + style="DETAIL CN", + end=main_processor[seg.index].end, + ) + detail_text = [] + for pos_def in word.pos_defs: + meaning_str = ", ".join(pos_def.meanings) + pos_text = f"{style_text('DETAIL POS', pos_def.pos)} {meaning_str}" + detail_text.append(pos_text) + dialogue.text = "\\N".join(detail_text) + ass_file.append(dialogue) + replacement = { + "start": word.meta.start_pos, + "end": word.meta.end_pos, + "new_text": new_text, + } + replacements.append(replacement) + LexiAnnot.replace_by_plaintext_positions( + main_processor[seg.index], replacements + ) if self._sentence_translation: - chinese = line_data['Chinese'] - if chinese and chinese[-1] in ['。', ',']: + chinese = seg.Chinese + if chinese and chinese[-1] in ["。", ","]: chinese = chinese[:-1] - main_dialogue[line_data['index']].text = main_dialogue[line_data['index']].text + f"\\N{chinese}" + main_processor[seg.index].text = ( + main_processor[seg.index].text + f"\\N{{\\fs{int(main_style_fs * 0.75)}}}{chinese}{{\\r}}" + ) # 避免 Infuse 显示乱码 - unexplainable_line = pysubs2.SSAEvent(start=0, end=0, text=f"{{\\rAnnotation ZH}}{self.plugin_name}{{\\r}}") + unexplainable_line = SSAEvent( + start=0, end=0, text=f"{style_text('Annotation ZH', self.plugin_name)}" + ) ass_file.insert(0, unexplainable_line) - return ass_file + return ass_file, segments.statistics diff --git a/plugins.v2/lexiannot/agenttool.py b/plugins.v2/lexiannot/agenttool.py new file mode 100644 index 0000000..4bdc01f --- /dev/null +++ b/plugins.v2/lexiannot/agenttool.py @@ -0,0 +1,67 @@ +import asyncio +from typing import Optional, Type + +from pydantic import BaseModel + +from app.agent.tools.base import MoviePilotTool +from app.core.plugin import PluginManager +from .schemas import VocabularyAnnotatingToolInput + + +class VocabularyAnnotatingTool(MoviePilotTool): + """自定义工具示例""" + + # 工具名称 + name: str = "vocabulary_annotating_tool" + # 工具描述 + description: str = ( + "Add new vocabulary annotation task to plugin LexiAnnot's task queue." 
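+        # Descriptive comment (assumption: MoviePilotTool follows the usual
+        # LangChain tool contract): this description is surfaced to the LLM
+        # when it decides whether to invoke the tool, so keep it short and
+        # action-oriented.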
+ ) + # 输入参数模型 + args_schema: Type[BaseModel] = VocabularyAnnotatingToolInput + + def get_tool_message(self, **kwargs) -> Optional[str]: + """根据订阅参数生成友好的提示消息""" + skip_existing = kwargs.get("skip_existing", False) + video_path = kwargs.get("video_path", "") + message = f"正在添加字幕任务: {video_path!r}" + if skip_existing: + message += "(覆写方式:跳过已存在的字幕文件)" + else: + message += "(覆写方式:覆盖已存在的字幕文件)" + return message + + async def run(self, video_path: str, skip_existing: bool = True, **kwargs) -> str: + """ + 实现工具的核心逻辑(异步方法) + + :param video_path: Path to the video file + :param skip_existing: Whether to skip existing subtitle files + :param kwargs: 其他参数,包含 explanation(工具使用说明) + :return: 工具执行结果,返回字符串格式 + """ + try: + # 执行工具逻辑 + result = await self._perform_operation(video_path, skip_existing) + + # 返回执行结果 + if not result: + return f"成功添加词汇标注任务: {video_path!r}" + else: + return f"添加任务出错: {result}" + except Exception as e: + return f"执行失败: {str(e)}" + + async def _perform_operation( + self, video_path: str, skip_existing: bool + ) -> str | None: + """内部方法,执行具体操作""" + # 实现具体业务逻辑 + plugins = PluginManager().running_plugins + plugin_instance = plugins.get("LexiAnnot") + if not plugin_instance: + return "LexiAnnot 插件未运行" + await asyncio.to_thread( + plugin_instance.add_task, video_file=video_path, skip_existing=skip_existing + ) + return None diff --git a/plugins.v2/lexiannot/lexicon.py b/plugins.v2/lexiannot/lexicon.py new file mode 100644 index 0000000..9c4f114 --- /dev/null +++ b/plugins.v2/lexiannot/lexicon.py @@ -0,0 +1,116 @@ +from typing import Literal + +from pydantic import BaseModel, Field, RootModel + +from .schemas import PosDef, Cefr + + +class CefrEntry(BaseModel): + pos: Literal[ + "noun", + "adverb", + "interjection", + "preposition", + "determiner", + "have-verb", + "modal auxiliary", + "adjective", + "number", + "be-verb", + "verb", + "conjunction", + "do-verb", + "infinitive-to", + "vern", + "pos", + "pronoun", + ] = Field(..., description="Part of speech") + cefr: Cefr = Field(..., description="CEFR level") + notes: str | None = Field(default=None, description="Notes") + + +class CefrDictionary(RootModel): + root: dict[str, list[CefrEntry]] + + def get(self, word: str) -> list[CefrEntry] | None: + return self.root.get(word) + + +class Coca20KEntry(BaseModel): + index: int = Field(..., description="Index of the entry") + phonetics_1: str = Field(..., description="Phonetics style 1") + phonetics_2: str = Field(..., description="Phonetics style 2") + pos_defs: list[PosDef] = Field( + ..., description="List of part of speech definitions" + ) + + +class Coca20KDictionary(RootModel): + root: dict[str, Coca20KEntry] + + def get(self, word: str) -> Coca20KEntry | None: + return self.root.get(word) + + +class ShanBayDef(BaseModel): + # 'n.', 'v.', 'adv.', 'adj.', 'phrase.', 'int.', 'pron.', 'prep.', '.', 'conj.', 'num.', 'phrase v.', 'linkv.', + # 'det.', 'ordnumber.', 'prefix.', 'un.', 'vt.', 'mod. v.', 'abbr.', 'auxv.', 'modalv.', 'vi.', 'aux. 
v.', + # 'interj.', 'article.', 'infinitive.', 'suff.', 'ord.', 'art.', 'exclam.', 'n.[C]' + pos: str = Field(..., description="Part of speech") + definition_cn: str = Field(..., description="Definition in Chinese") + + +class ShanbayEntry(BaseModel): + ipa_uk: str = Field(..., description="UK IPA pronunciation") + ipa_us: str = Field(..., description="US IPA pronunciation") + defs: list[ShanBayDef] = Field(..., description="List of definitions") + + +class ShanbayDictionary(BaseModel): + """Dictionary entries for various examinations.""" + + cet4: dict[str, ShanbayEntry] = Field( + ..., alias="CET-4", description="CET-4 dictionary entries" + ) + cet6: dict[str, ShanbayEntry] = Field( + ..., alias="CET-6", description="CET-6 dictionary entries" + ) + npee: dict[str, ShanbayEntry] = Field( + ..., alias="NPEE", description="NPEE dictionary entries" + ) + ielts: dict[str, ShanbayEntry] = Field( + ..., alias="IELTS", description="IELTS dictionary entries" + ) + toefl: dict[str, ShanbayEntry] = Field( + ..., alias="TOEFL", description="TOEFL dictionary entries" + ) + gre: dict[str, ShanbayEntry] = Field( + ..., alias="GRE", description="GRE dictionary entries" + ) + tem4: dict[str, ShanbayEntry] = Field( + ..., alias="TEM-4", description="TEM-4 dictionary entries" + ) + tem8: dict[str, ShanbayEntry] = Field( + ..., alias="TEM-8", description="TEM-8 dictionary entries" + ) + pet: dict[str, ShanbayEntry] = Field( + ..., alias="PET", description="PET dictionary entries" + ) + + def query(self, word: str) -> dict[str, ShanbayEntry]: + result = {} + for field_name, field_info in ShanbayDictionary.model_fields.items(): + value = getattr(self, field_name) + if word in value: + result[field_info.alias] = value[word] + return result + + +class Lexicon(BaseModel): + cefr: CefrDictionary = Field(..., description="CEFR dictionary") + coca20k: Coca20KDictionary = Field(..., description="COCA 20K dictionary") + examinations: ShanbayDictionary = Field( + ..., description="Shanbay examinations dictionary" + ) + swear_words: list[str] = Field(..., description="List of swear words") + version: str = Field(..., description="Version of the lexicon") diff --git a/plugins.v2/lexiannot/pipeline.py b/plugins.v2/lexiannot/pipeline.py new file mode 100644 index 0000000..657d59c --- /dev/null +++ b/plugins.v2/lexiannot/pipeline.py @@ -0,0 +1,736 @@ +import re +import threading + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import ChatPromptTemplate +from langchain.output_parsers import PydanticOutputParser +from pydantic import SecretStr + +from app.core.config import settings +from app.schemas import Context +from app.schemas.types import MediaType +from app.log import logger +from .lexicon import CefrDictionary, Lexicon, Coca20KDictionary +from .schemas import ( + SubtitleSegment, + PosDef, + Word, + Cefr, + WordMetadata, + SegmentList, + LlmFeedback, + UniversalPos, + LlmEnrichmentResult, + LlmTranslationResult, +) +from .spacyworker import SpacyWorker + + +_patterns = [ + r"\d+th|\d?1st|\d?2nd|\d?3rd", + r"\w+'s$", + r"\w+'d$", + r"\w+'t$", + "[Ii]'m$", + r"\w+'re$", + r"\w+'ve$", + r"\w+'ll$", +] +filter_patterns: list[re.Pattern] = [re.compile(p) for p in _patterns] +pos_interests = {"NOUN", "VERB", "ADJ", "ADV", "ADP", "CCONJ", "SCONJ"} + +UNIVERSAL_POS_MAP: dict[UniversalPos, str] = { + UniversalPos.ADJ: "adj.", + UniversalPos.ADV: "adv.", + UniversalPos.INTJ: "int.", + UniversalPos.NOUN: "n.", + UniversalPos.PROPN: "n.", + UniversalPos.VERB: "v.", + 
UniversalPos.AUX: "aux.",
+    UniversalPos.ADP: "prep.",
+    UniversalPos.CCONJ: "conj.",
+    UniversalPos.SCONJ: "conj.",
+    UniversalPos.DET: "det.",
+    UniversalPos.NUM: "num.",
+    UniversalPos.PART: "part.",
+    UniversalPos.PRON: "pron.",
+    UniversalPos.PUNCT: None,
+    UniversalPos.SYM: None,
+    UniversalPos.X: None,
+}
+
+
+def initialize_llm(
+    provider: str,
+    api_key: str,
+    model_name: str,
+    base_url: str | None,
+    temperature: float = 0.1,
+    max_retries: int = 3,
+    proxy: bool = False,
+) -> BaseChatModel:
+    """初始化LLM模型"""
+
+    if provider == "google":
+        if proxy:
+            # 通过 Gemini 的 OpenAI 兼容端点访问,以便请求走 HTTP 代理;
+            # 使用调用方传入的参数,而不是重新读取全局 settings
+            from langchain_openai import ChatOpenAI
+
+            return ChatOpenAI(
+                model=model_name,
+                api_key=SecretStr(api_key),
+                max_retries=max_retries,
+                base_url="https://generativelanguage.googleapis.com/v1beta/openai",
+                temperature=temperature,
+                openai_proxy=settings.PROXY_HOST,
+            )
+        from langchain_google_genai import ChatGoogleGenerativeAI
+
+        return ChatGoogleGenerativeAI(
+            model=model_name,
+            google_api_key=api_key,  # noqa
+            max_retries=max_retries,
+            temperature=temperature,
+        )
+    elif provider == "deepseek":
+        from langchain_deepseek import ChatDeepSeek
+
+        return ChatDeepSeek(
+            model=model_name,
+            api_key=SecretStr(api_key),
+            max_retries=max_retries,
+            temperature=temperature,
+        )
+    else:
+        from langchain_openai import ChatOpenAI
+
+        return ChatOpenAI(
+            model=model_name,
+            api_key=SecretStr(api_key),
+            max_retries=max_retries,
+            base_url=base_url,
+            temperature=temperature,
+            openai_proxy=settings.PROXY_HOST if proxy else None,
+        )
+
+
+def convert_pos_to_spacy(pos: str):
+    """
+    将给定的词性转换为 spaCy 库中使用的词性标签
+
+    :param pos: 字符串形式词性
+    :returns: 对应的 spaCy 词性标签;对于无法直接映射的词性,返回 None
+    """
+    spacy_pos_map = {
+        "noun": "NOUN",
+        "adjective": "ADJ",
+        "adverb": "ADV",
+        "verb": "VERB",
+        "preposition": "ADP",
+        "conjunction": "CCONJ",
+        "determiner": "DET",
+        "pronoun": "PRON",
+        "interjection": "INTJ",
+        "number": "NUM",
+    }
+
+    pos_lower = pos.lower()
+    if pos_lower in spacy_pos_map:
+        spacy_pos = spacy_pos_map[pos_lower]
+    elif pos_lower == "be-verb":
+        spacy_pos = "AUX"  # Auxiliary verb (e.g., be, do, have)
+    elif pos_lower == "vern":
+        spacy_pos = "VERB"  # Assuming 'vern' is a typo for 'verb'
+    elif pos_lower == "modal auxiliary":
+        spacy_pos = "AUX"  # Modal verbs are also auxiliaries
+    elif pos_lower == "do-verb":
+        spacy_pos = "AUX"
+    elif pos_lower == "have-verb":
+        spacy_pos = "AUX"
+    elif pos_lower == "infinitive-to":
+        spacy_pos = "PART"  # Particle (e.g., to in "to go")
+    else:
+        spacy_pos = None  # Empty strings and unmapped POS tags
+    return spacy_pos
+
+
+def convert_spacy_to_universal(spacy_pos: str) -> UniversalPos:
+    """
+    将 spaCy POS 标签转换为 UniversalPos 枚举
+    """
+    # 创建映射字典
+    pos_mapping = {
+        "ADJ": UniversalPos.ADJ,
+        "ADV": UniversalPos.ADV,
+        "INTJ": UniversalPos.INTJ,
+        "NOUN": UniversalPos.NOUN,
+        "PROPN": UniversalPos.PROPN,
+        "VERB": UniversalPos.VERB,
+        "AUX": UniversalPos.AUX,
+        # 介词/后置词
+        "ADP": UniversalPos.ADP,
+        # 连词
+        "CCONJ": UniversalPos.CCONJ,
+        "SCONJ": UniversalPos.SCONJ,
+        # 限定词
+        "DET": UniversalPos.DET,
+        # 数词
+        "NUM": UniversalPos.NUM,
+        # 代词
+        "PRON": UniversalPos.PRON,
+        # 小品词
+        "PART": UniversalPos.PART,
+        # 标点
+        "PUNCT": UniversalPos.PUNCT,
+        # 符号
+        "SYM": UniversalPos.SYM,
+        # 其他
+        "X": UniversalPos.X,
+        # 特殊处理:spaCy 可能返回的其他标签
+        "SPACE": UniversalPos.PUNCT,  # 空格当作标点处理
+        "CONJ": UniversalPos.CCONJ,  # 旧版 spaCy 的连词标签
+    }
+
+    # 转换为大写,确保一致
+    spacy_pos = spacy_pos.upper()
+
+    # 如果直接匹配,返回对应枚举
+    if spacy_pos in pos_mapping:
+        return pos_mapping[spacy_pos]
+
+    # 处理特殊情况:以特定前缀开头的标签
+    if spacy_pos.startswith("ADJ"):
+        return UniversalPos.ADJ
+    elif spacy_pos.startswith("ADV"):
+        return UniversalPos.ADV
+    elif spacy_pos.startswith("NOUN"):
+        return UniversalPos.NOUN
+    elif spacy_pos.startswith("VERB"):
+        return UniversalPos.VERB
+    elif spacy_pos.startswith("PROPN"):
+        return UniversalPos.PROPN
+    elif spacy_pos.startswith("PRON"):
+        return UniversalPos.PRON
+
+    # 默认返回 X(未知)
+    return UniversalPos.X
+
+
+def get_cefr_by_spacy(
+    lemma_: str, pos_: str, cefr_lexicon: CefrDictionary
+) -> Cefr | None:
+    word = lemma_.lower().strip("-*'")
+
+    result = cefr_lexicon.get(word)
+    if result:
+        all_cefr: list[Cefr] = []
+        for entry in result:
+            if pos_ == convert_pos_to_spacy(entry.pos):
+                return entry.cefr
+            all_cefr.append(entry.cefr)
+        # CEFR 等级按字符串比较即为正确顺序 ("A1" < "A2" < ... < "C2")
+        return min(all_cefr)
+    return None
+
+
+def query_coca20k(word: str, coca20k: Coca20KDictionary):
+    word = word.lower().strip("-*'")
+    return coca20k.get(word)
+
+
+def _update_word_via_lexicon(word: Word, lexi: Lexicon) -> Word:
+    """
+    使用词典信息更新单词对象
+
+    :param word: 需要更新的单词对象
+    :param lexi: 词典对象
+    :returns: 更新后的单词对象
+    """
+    # query dictionary
+    cefr = get_cefr_by_spacy(word.lemma, word.pos.value, lexi.cefr)
+    res_of_coca = query_coca20k(word.lemma, lexi.coca20k)
+    res_of_exams = lexi.examinations.query(word.lemma)
+    # 保留全部考试标签;按用户所选考试过滤在渲染阶段进行
+    exam_tags = list(res_of_exams)
+    pos_defs = []
+    phonetics = ""
+    if res_of_exams:
+        for exam, value in res_of_exams.items():
+            phonetics = value.ipa_uk
+            defs = {}
+            for pos_def in value.defs:
+                pos = pos_def.pos
+                definition_cn = pos_def.definition_cn
+                defs.setdefault(pos, []).append(definition_cn)
+            for pos, meanings in defs.items():
+                pos_defs.append(PosDef(pos=pos, meanings=meanings))
+            break
+    elif res_of_coca:
+        phonetics = res_of_coca.phonetics_1
+        pos_defs = res_of_coca.pos_defs
+    word.exams = exam_tags
+    word.cefr = cefr
+    word.pos_defs = pos_defs
+    word.phonetics = phonetics
+    return word
+
+
+def extract_advanced_words(
+    segment: SubtitleSegment,
+    lexi: Lexicon,
+    spacy_worker: SpacyWorker,
+    simple_level: set[Cefr],
+    exams: list[str],
+) -> list[Word]:
+    # NOTE: `exams` 目前未在此处使用;考试标签由 _update_word_via_lexicon 附加,
+    # 并在渲染时按用户选择过滤
+    text = segment.clean_text
+    doc = spacy_worker.submit(text)
+    last_end_pos = 0
+    lemma_to_query = []
+    words = []
+    for token in doc.tokens:
+        # filter tokens
+        if (
+            len(token.text) == 1
+            or token.is_stop
+            or token.is_punct
+            or token.ent_iob_ != "O"
+        ):
+            continue
+        if token.pos_ not in pos_interests:
+            continue
+        if token.lemma_ in lexi.swear_words:
+            continue
+
+        striped = token.lemma_.strip("-[")
+        if any(p.match(striped) for p in filter_patterns):
+            continue
+
+        if striped in lemma_to_query:
+            continue
+        lemma_to_query.append(striped)
+        striped_text = token.text.strip("-*[")
+        start_pos = text.find(striped_text, last_end_pos)
+        end_pos = start_pos + len(striped_text)
+
+        last_end_pos = end_pos
+        word = Word(
+            text=striped_text,
+            lemma=striped,
+            pos=convert_spacy_to_universal(token.pos_),
+            meta=WordMetadata(
+                start_pos=start_pos, end_pos=end_pos, context_id=segment.index
+            ),
+        )
+        word = _update_word_via_lexicon(word, lexi)
+        if word.cefr and word.cefr in simple_level:
+            continue
+        words.append(word)
+    return words
+
+
+def _find_segment_by_word_id(
+    segments: list[SubtitleSegment], word_id: int
+) -> SubtitleSegment | None:
+    for segment in segments:
+        for word in segment.candidate_words:
+            if word.meta.word_id == word_id:
+                return segment
+    return None
+
+
+def _update_word_metadata(
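+    # Slides a window of len(new_text) characters around the original span to
+    # relocate the (possibly LLM-corrected) surface form in the segment text.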
+ new_text: str, meta: WordMetadata, segment: SubtitleSegment +) -> WordMetadata | None: + """ + 更新单词的元数据 + + :param new_text: 新的单词文本 + :param meta: 单词的元数据对象 + :param segment: 字幕片段对象 + """ + text = segment.clean_text + p_end = meta.end_pos + new_len = len(new_text) + i = meta.start_pos - new_len + 1 + i = max(0, i) + j = p_end + min(0, (len(text) - (p_end + new_len))) + + for x in range(i, j + 1): + text_view = text[x : (x + new_len)] + if text_view == new_text: + return WordMetadata( + start_pos=x, + end_pos=x + new_len, + context_id=segment.index, + word_id=meta.word_id, + ) + return None + + +def format_time_extended(milliseconds: int): + """ + 将秒数转换为时间格式 + + :param milliseconds: 整数,表示毫秒数 + :return: 字符串,格式为 HH:MM:SS 或 HH:MM:SS.mmm + """ + if milliseconds < 0: + sign = "-" + milliseconds = abs(milliseconds) + else: + sign = "" + + hours = int(milliseconds // 3600000) + minutes = int((milliseconds % 3600000) // 60000) + seconds = (milliseconds % 60000) // 1000 + milliseconds_remainder = milliseconds % 1000 + return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds_remainder:03d}" + + +def _context_process_chain( + lexi: Lexicon, + llm: BaseChatModel, + segments: list[SubtitleSegment], + start: int, + end: int, + leaner_level: str = "C1", + media_name: str | None = None, + translate_sentences: bool = False +): + feedback_parser = PydanticOutputParser(pydantic_object=LlmFeedback) + + def format_input(segment_list: list[SubtitleSegment]): + media_name_prefix = ( + f"The following subtitles are from '{media_name}'.\n" if media_name else "" + ) + return { + "media_name_prefix": media_name_prefix, + "context_text": " ".join([seg.clean_text for seg in segment_list]), + "candidate_words": "\n".join( + [ + f"- {word.text} (WORD_ID: {word.meta.word_id}, LEMMA: {word.lemma}, CEFR: {word.cefr}, POS: {word.pos})" + for seg in segment_list + for word in seg.candidate_words + ] + ), + "leaner_level": leaner_level, + "format_instructions": feedback_parser.get_format_instructions(), + } + + def refactor_by_feedback(feedback: LlmFeedback): + # Process LLM feedback to update segments + for word in feedback.candidate_words_feedback: + seg = _find_segment_by_word_id(segments, word.word_id) + if not seg or seg.index < start or seg.index > end: + continue + # Update word info based on feedback + if not word.should_keep: + seg.candidate_words = [ + w for w in seg.candidate_words if w.meta.word_id != word.word_id + ] + continue + for w in seg.candidate_words: + if w.meta.word_id == word.word_id: + word_text = word.text + if word_text is not None and word.text != w.text: + # Update metadata if text changed + if word.text not in seg.clean_text: + # If the word text is not found in the segment, skip updating metadata + continue + new_meta = _update_word_metadata(word_text, w.meta, seg) + if not new_meta: + continue + w.meta = new_meta + w.text = word_text + if word.pos: + w.pos = word.pos + if word.lemma: + w.lemma = word.lemma + + # Add new words identified by LLM + for new_word in feedback.llm_identified_words: + for seg in segments: + if seg.index < start or seg.index > end: + continue + start_pos = seg.clean_text.find(new_word.text) + if start_pos == -1: + continue + if any(w.text == new_word.text for w in seg.candidate_words): + continue + new_meta = WordMetadata( + start_pos=start_pos, + end_pos=start_pos + len(new_word.text), + context_id=seg.index + ) + built_word = Word( + text=new_word.text, + lemma=new_word.lemma, + pos=new_word.pos, + meta=new_meta + ) + built_word = 
_update_word_via_lexicon(built_word, lexi) + if built_word.cefr and built_word.cefr < leaner_level: + continue + seg.candidate_words.append(built_word) + + prompt_template = ChatPromptTemplate.from_messages( + [ + ( + "system", + """You are an expert in linguistics and language learning. Your task is to analyze subtitle segments. +Please perform the following tasks for an English learner at {leaner_level} CEFR level. + +**CRITICAL INSTRUCTION**: The learner is advanced. They already know common daily vocabulary. +Your goal is to identify **only** content that helps them reach native-level proficiency. + +1. **Review and Evaluate Candidate Words:** + * **Goal**: Filter out simple words and correct any errors in lemma/POS/text. + * **Action**: Return feedback items **ONLY** for words that: + 1. Should be **discarded** (too simple, trivial filler, profanity without cultural value). Set `should_keep` to `False`. + 2. Need **correction** (wrong lemma, POS, or text boundary). Set `should_keep` to `True` and provide correct values. + * **Implicit Rule**: If a word is appropriate for the learner and has correct info, **DO NOT** include it in the output list. + * **Keep criteria**: Keep simple words **ONLY IF** used in a non-literal, metaphorical, or idiomatic sense. + * **Discard criteria**: Discard trivial conversational fillers ('gonna', 'wanna'), simple interjections, common profanity, and words below {leaner_level} level. + +2. **Identify Missed Words:** + * Identify any additional single words or phrases (typically 1-3 words) from the `context_text` that may be important for {leaner_level} learners. This specifically includes: + * **Slang or informal expressions.** + * **Internet terms or modern colloquialisms.** + * **Words or phrases that require specific cultural background knowledge to understand.** + * **Any other words or phrases that are challenging.** + * Avoid repeating words already listed in `candidate_words`. + * Must exist in the exact form in `context_text`. + * Provide lemma and POS. + * **Do NOT include** simple high-frequency words, common fillers ('gonna', 'gotta'), or basic swear words unless necessary for context. + +------------------------- +You MUST return output strictly matching the provided Pydantic schema. +Return ONLY valid JSON. + +**Here are the output format instructions you MUST follow strictly:** +{format_instructions} +""", + ), + ( + "human", + """{media_name_prefix}Here is the context from the subtitles: +--- +{context_text} +--- +Here are the candidate words identified by a basic algorithm: +{candidate_words} +""", + ), + ] + ) + feedback_chain = ( + format_input | prompt_template | llm.with_structured_output(LlmFeedback).with_retry(stop_after_attempt=3) + ) + result: LlmFeedback = feedback_chain.invoke(segments) # type: ignore + refactor_by_feedback(result) + + # 丰富词义 + if any(segment.candidate_words for segment in segments): + enrichment_prompt_template = ChatPromptTemplate.from_messages( + [ + ( + "system", + """You are a linguistics and English-learning expert. Your goal is to enhance vocabulary learning for Chinese users.\n +For each word (identified by `WORD_ID`), provide: +1. **Translation:** A concise Chinese translation. +2. 
**Usage or Cultural Context (optional, in Chinese)**: + * ONLY include if: + - The word has a specific meaning in this context that differs from its common definition; + - It is slang, idiom, phrasal, metaphorical, or culturally loaded; + * ONLY provide this context when learners would likely struggle to understand the word's usage without it. + +**For each word, provide the `word_id` to ensure proper mapping.** +**Your judgment should be based strictly on the provided subtitle context. DO NOT fabricate context or forced explanation.** + +------------------------- +You MUST return output strictly matching the provided Pydantic schema. +Return ONLY valid JSON. + +**Here are the output format instructions you MUST follow strictly:** +{format_instructions} +""", + ), + ( + "human", + """{media_name_prefix}Here is the context from the subtitles: +--- +{context_text} +--- +Here are the words you need to enrich: +{words_to_enrich} +""", + ), + ] + ) + enrichment_parser = PydanticOutputParser(pydantic_object=LlmEnrichmentResult) + + def format_enrichment_input(segment_list: list[SubtitleSegment]): + media_name_prefix = ( + f"The following subtitles are from '{media_name}'.\n" + if media_name + else "" + ) + words_to_enrich = [] + for seg in segment_list: + if start <= seg.index <= end: + for w in seg.candidate_words: + words_to_enrich.append( + f"- {w.text} (WORD_ID: {w.meta.word_id}, LEMMA: {w.lemma}, POS: {w.pos}, DEFINITIONS: {w.pos_defs_plaintext})" + ) + return { + "media_name_prefix": media_name_prefix, + "context_text": " ".join([seg.clean_text for seg in segment_list]), + "words_to_enrich": "\n".join(words_to_enrich), + "format_instructions": enrichment_parser.get_format_instructions(), + } + + enrichment_chain = ( + format_enrichment_input + | enrichment_prompt_template + | llm.with_structured_output(LlmEnrichmentResult).with_retry(stop_after_attempt=3) + ) + + enrichment_result: LlmEnrichmentResult = enrichment_chain.invoke(segments) # type: ignore + + for enriched_word_data in enrichment_result.enriched_words: + for segment in segments: + if segment.index < start or segment.index > end: + continue + for candidate_word in segment.candidate_words: + if candidate_word.meta.word_id == enriched_word_data.word_id: + candidate_word.llm_translation = enriched_word_data.translation + candidate_word.llm_usage_context = enriched_word_data.usage_context + break + # 整句翻译 + if translate_sentences: + translation_parser = PydanticOutputParser(pydantic_object=LlmTranslationResult) + + translation_prompt_template = ChatPromptTemplate.from_messages( + [ + ( + "system", + """You are a professional subtitle translator. Your task is to translate English subtitle segments into natural, idiomatic Chinese. + +**Guidelines:** +1. **Tone & Style:** Maintain the original tone (e.g., casual, formal, humorous, dramatic). +2. **Context:** Use the surrounding segments to ensure continuity and correct meaning. +3. **Conciseness:** Subtitles have space constraints. Keep translations concise but accurate. +4. **Formatting:** Return the result strictly matching the provided JSON schema. + +------------------------- +You MUST return output strictly matching the provided Pydantic schema. +Return ONLY valid JSON. 
+ +**Here are the output format instructions you MUST follow strictly:** +{format_instructions} +""", + ), + ( + "human", + """{media_name_prefix}Here are the segments to translate: +--- +{segments_text} +--- +""", + ), + ] + ) + + def format_translation_input(segment_list: list[SubtitleSegment]): + media_name_prefix = ( + f"The following subtitles are from '{media_name}'.\n" + if media_name + else "" + ) + # Only translate segments within the current batch range (start to end) + segments_text_lines = [] + for seg in segment_list: + if start <= seg.index <= end: + segments_text_lines.append(f"ID {seg.index}: {seg.clean_text}") + + return { + "media_name_prefix": media_name_prefix, + "segments_text": "\n".join(segments_text_lines), + "format_instructions": translation_parser.get_format_instructions(), + } + + translation_chain = ( + format_translation_input + | translation_prompt_template + | llm.with_structured_output(LlmTranslationResult).with_retry(stop_after_attempt=3) + ) + + try: + translation_result: LlmTranslationResult = translation_chain.invoke(segments) # type: ignore + + # Map translations back to segments + trans_map = { + t.index: t.translation for t in translation_result.translations + } + for segment in segments: + if segment.index in trans_map: + segment.Chinese = trans_map[segment.index] + except Exception as e: + logger.error(f"Error during sentence translation: {e}") + + return [segment for segment in segments if start <= segment.index <= end] + + +def llm_process_chain( + lexi: Lexicon, + llm: BaseChatModel, + segments: SegmentList, + shutdown_event: threading.Event, + context_window: int = 30, + leaner_level: str = "C1", + media_context: Context | None = None, + translate_sentences: bool = False, +) -> SegmentList: + """ + 根据 LLM 的反馈更新字幕片段中的单词信息 + + :param lexi: 词典对象 + :param llm: 大语言模型对象 + :param segments: 字幕片段 + :param shutdown_event: 关闭事件 + :param context_window: 上下文窗口大小 + :param leaner_level: 学习者的 CEFR 水平 + :param media_context: 媒体信息 + :param translate_sentences: 是否翻译句子 + :returns: 更新后的字幕片段列表 + """ + media_name = None + if media_context and media_context.media_info and media_context.meta_info: + media_info = media_context.media_info + if media_info.type == MediaType.TV: + media_name = ( + f"{media_info.title_year} {media_context.meta_info.season_episode}" + ) + else: + media_name = f"{media_info.title_year}" + + segments_list = [] + for context, (start, end) in segments.context_generator( + context_window=context_window, extra_len=2 + ): + if shutdown_event.is_set(): + break + logger.info( + f"Processing segments {format_time_extended(context[0].start_time)} ({context[0].index}) ->" + f" {format_time_extended(context[-1].end_time)} ({context[-1].index}) via LLM..." 
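+            # context_generator yields overlapping windows padded with extra_len
+            # neighbouring segments for continuity; _context_process_chain only
+            # updates and returns segments whose index falls within [start, end].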
+ ) + segments_list.extend( + _context_process_chain( + lexi, llm, context, start, end, leaner_level, media_name, translate_sentences + ) + ) + + return SegmentList(root=segments_list) diff --git a/plugins.v2/lexiannot/query_gemini.py b/plugins.v2/lexiannot/query_gemini.py deleted file mode 100644 index e583985..0000000 --- a/plugins.v2/lexiannot/query_gemini.py +++ /dev/null @@ -1,111 +0,0 @@ -import time -from typing import Generic, List, TypeVar - -from google import genai -from google.genai import types -from pydantic import BaseModel - - -class Context(BaseModel): - original_text: str - - -class Vocabulary(BaseModel): - lemma: str - Chinese: str - - -class TaskBase(BaseModel): - id: str - - -class VocabularyTranslationTask(TaskBase): - vocabulary: List[Vocabulary] - context: Context - index: int - - -class DialogueTranslationTask(TaskBase): - original_text: str - Chinese: str - index: int - - -T = TypeVar("T", bound=TaskBase) - - -class TranslationTasks(BaseModel, Generic[T]): - tasks: List[T] - - -class GeminiResponse(BaseModel, Generic[T]): - tasks: List[T] - total_token_count: int - success: bool - message: str = "" - - -def translate( - api_key: str, - translation_tasks: TranslationTasks[T], - system_instruction: str, - gemini_model: str = "gemini-2.0-flash", - temperature: float = 0.3, - max_retries: int = 3, - retry_delay: int = 10, -) -> GeminiResponse[T]: - """ - Query the Gemini API for translation tasks with retry logic. - - :param api_key: Gemini API key - :param translation_tasks: Translation tasks - :param system_instruction: System instruction - :param gemini_model: Model name to use - :param temperature: Generation temperature - :param max_retries: Number of retry attempts - :param retry_delay: Delay between retries in seconds - - returns: GeminiResponse containing the results - """ - - - messages = [] - - response_schema = type(translation_tasks) - - for attempt in range(1, max_retries + 1): - try: - client = genai.Client(api_key=api_key) - response = client.models.generate_content( - model=gemini_model, - contents=translation_tasks.model_dump_json(), - config=types.GenerateContentConfig( - system_instruction=system_instruction, - response_mime_type="application/json", - response_schema=response_schema, - temperature=temperature, - ), - ) - - if not response.parsed: - raise ValueError("Empty response from Gemini API") - - translation_res = response.parsed - total_token_count = response.usage_metadata.total_token_count - return GeminiResponse( - tasks=translation_res.tasks, - total_token_count=total_token_count or 0, - success=True, - ) - - except Exception as e: - messages.append(f"Attempt {attempt} failed: {str(e)}") - if attempt < max_retries: - time.sleep(attempt*retry_delay) - - return GeminiResponse( - tasks=[], - total_token_count=0, - success=False, - message="All retry attempts failed. 
" + "\n".join(messages), - ) \ No newline at end of file diff --git a/plugins.v2/lexiannot/requirements.txt b/plugins.v2/lexiannot/requirements.txt index bf64a6e..a82c2a2 100644 --- a/plugins.v2/lexiannot/requirements.txt +++ b/plugins.v2/lexiannot/requirements.txt @@ -1,5 +1,4 @@ pysubs2~=1.8.0 langdetect~=1.0.9 pymediainfo~=7.0.1 -spacy~=3.8.7 -google-genai~=1.48.0 \ No newline at end of file +spacy~=3.8.11 \ No newline at end of file diff --git a/plugins.v2/lexiannot/schemas.py b/plugins.v2/lexiannot/schemas.py new file mode 100644 index 0000000..32c18ff --- /dev/null +++ b/plugins.v2/lexiannot/schemas.py @@ -0,0 +1,394 @@ +import re +import uuid +from collections import Counter +from enum import Enum +from typing import Literal, Generator, Iterator + +from pydantic import BaseModel, Field, RootModel, model_validator + +from app.utils.singleton import Singleton + + +Cefr = Literal["C2", "C1", "B2", "B1", "A2", "A1"] + + +class UniversalPos(str, Enum): + """Universal Part-of-Speech tags""" + + ADJ = "ADJ" # Adjective + ADV = "ADV" # Adverb + INTJ = "INTJ" # Interjection + NOUN = "NOUN" # Noun + PROPN = "PROPN" # Proper noun + VERB = "VERB" # Verb + ADP = "ADP" # Adposition (preposition/postposition) + AUX = "AUX" # Auxiliary verb + CCONJ = "CCONJ" # Coordinating conjunction + DET = "DET" # Determiner + NUM = "NUM" # Numeral + PART = "PART" # Particle + PRON = "PRON" # Pronoun + SCONJ = "SCONJ" # Subordinating conjunction + PUNCT = "PUNCT" # Punctuation + SYM = "SYM" # Symbol + X = "X" # Other/unknown + + +class IDGenerator(metaclass=Singleton): + """Singleton class for generating unique IDs.""" + + _counter = 0 + + def next_id(self): + self._counter += 1 + return self._counter + + def reset(self): + self._counter = 0 + + +class TaskStatus(Enum): + PENDING = "pending" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + CANCELED = "canceled" + IGNORED = "ignored" + + +class TaskParams(BaseModel): + skip_existing: bool = Field( + default=True, description="Whether to skip existing subtitle files" + ) + + +class TasksApiParams(BaseModel): + operation: Literal["DELETE", "RETRY", "IGNORE"] = Field( + ..., description="Operation to perform on the tasks" + ) + task_id: str | None = Field( + default=None, description="Unique identifier for the task" + ) + + +class SegmentStatistics(BaseModel): + total_segments: int = Field(default=0, description="Total number of subtitle segments") + total_words: int = Field(default=0, description="Total number of candidate words") + cefr_distribution: dict[str, int] = Field( + default_factory=dict, description="Distribution of words by CEFR level" + ) + pos_distribution: dict[str, int] = Field( + default_factory=dict, description="Distribution of words by Part of Speech" + ) + exam_distribution: dict[str, int] = Field( + default_factory=dict, description="Distribution of words by Examination" + ) + + def to_string(self) -> str: + cefr_str = ", ".join( + [f"{level}({count})" for level, count in self.cefr_distribution.items()] + ) + pos_str = ", ".join( + [f"{pos}({count})" for pos, count in self.pos_distribution.items()] + ) + exam_str = ", ".join([f"{exam}({count})" for exam, count in self.exam_distribution.items()]) + return ( + f"Total Segments: {self.total_segments}\n" + f"Total Words: {self.total_words}\n" + f"CEFR Distribution: {cefr_str if cefr_str else 'N/A'}\n" + f"POS Distribution: {pos_str if pos_str else 'N/A'}\n" + f"Exam Distribution: {exam_str if exam_str else 'N/A'}" + ) + + +class ProcessResult(BaseModel): + """Result of 
processing a task."""
+
+    message: str | None = Field(
+        default=None, description="Additional message or error information"
+    )
+    status: TaskStatus = Field(
+        default=TaskStatus.PENDING, description="Current status of the task"
+    )
+    statistics: SegmentStatistics | None = Field(default=None, description="Statistics of the task")
+
+
+class Task(BaseModel):
+    video_path: str = Field(..., description="Path to the video file")
+    task_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for the task",
+    )
+    status: TaskStatus = Field(
+        default=TaskStatus.PENDING, description="Current status of the task"
+    )
+    add_time: str | None = Field(
+        default=None, description="Add time of the task, format %Y-%m-%d %H:%M:%S"
+    )
+    complete_time: str | None = Field(
+        default=None, description="Complete time of the task"
+    )
+    tokens_used: int = Field(default=0, description="Number of used tokens")
+    message: str | None = Field(
+        default=None, description="Additional message or error information"
+    )
+    params: TaskParams = Field(
+        default_factory=TaskParams, description="Parameters for the task"
+    )
+    statistics: SegmentStatistics | None = Field(default=None, description="Statistics of the task")
+
+
+class WordMetadata(BaseModel):
+    start_pos: int = Field(
+        ..., description="Start position of the word in the context sentence"
+    )
+    end_pos: int = Field(
+        ..., description="End position of the word in the context sentence"
+    )
+    context_id: int = Field(..., description="Identifier of the context sentence")
+    word_id: int = Field(
+        default_factory=lambda: IDGenerator().next_id(),
+        description="Identifier of the word in the context",
+    )
+
+
+class PosDef(BaseModel):
+    # 'art.', 'v.', 'aux.', 'conj.', 'prep.', 'adv.', 'adj.', 'n.', 'vt.', 'pron.', 'det.', 'vi.', 'int.'
+    # 'num.', 'abbr.', 'na.', 'quant.', 'phr.'
+    pos: str = Field(..., description="Part of speech")
+    meanings: list[str] = Field(..., description="List of definitions")
+
+    @property
+    def plaintext(self):
+        return f"{self.pos} {'; '.join(self.meanings)}"
+
+
+class WordBase(BaseModel):
+    text: str = Field(..., description="The word or phrase")
+    lemma: str = Field(..., description="Lemma form of the word")
+    pos: UniversalPos = Field(
+        default=UniversalPos.X, description="Universal POS tag of the word"
+    )
+
+
+class Word(WordBase):
+    phonetics: str | None = Field(
+        default=None, description="Phonetic transcription of the word"
+    )
+    # meta is required: WordMetadata has mandatory fields, so a no-arg default
+    # factory would raise a ValidationError as soon as it was invoked
+    meta: WordMetadata = Field(..., description="Additional metadata")
+    cefr: Cefr | None = Field(default=None, description="CEFR level")
+    exams: list[str] = Field(
+        default_factory=list,
+        description="Exams whose vocabulary syllabi include this word",
+    )
+    pos_defs: list[PosDef] = Field(
+        default_factory=list, description="Part of speech definitions"
+    )
+    llm_translation: str | None = Field(
+        default=None, description="LLM-generated Chinese translation"
+    )
+    llm_usage_context: str | None = Field(
+        default=None, description="LLM-generated cultural context"
+    )
+    llm_example_sentences: list[str] = Field(
+        default_factory=list, description="LLM-generated example sentences"
+    )
+
+    @property
+    def pos_defs_plaintext(self) -> str:
+        return " ".join(
+            [
+                f"{index}. {pos_def.plaintext}"
+                # 1-based numbering for display
+                for index, pos_def in enumerate(self.pos_defs, 1)
+            ]
+        )
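+
+    # Illustrative construction (hypothetical values): the pipeline is expected
+    # to create words with explicit metadata tying them to a context sentence:
+    #
+    #     Word(
+    #         text="aplomb",
+    #         lemma="aplomb",
+    #         pos=UniversalPos.NOUN,
+    #         cefr="C2",
+    #         meta=WordMetadata(start_pos=12, end_pos=18, context_id=3),
+    #     )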
+
+
+class SubtitleSegment(BaseModel):
+    index: int = Field(..., description="Index of the subtitle segment")
+    start_time: int = Field(
+        ..., description="Start time of the subtitle segment in milliseconds"
+    )
+    end_time: int = Field(
+        ..., description="End time of the subtitle segment in milliseconds"
+    )
+    plaintext: str = Field(..., description="Text content of the subtitle segment")
+    Chinese: str | None = Field(
+        default=None, description="Chinese translation of the subtitle segment"
+    )
+    candidate_words: list[Word] = Field(
+        default_factory=list, description="List of words worth learning in the segment"
+    )
+
+    def words_append(self, word: Word):
+        """
+        Append a word to the segment's candidate_words list.
+
+        :param word: the word object to append
+        """
+        self.candidate_words.append(word)
+
+    @staticmethod
+    def _replace_with_spaces(_text):
+        """
+        Replace [xxx] patterns in the text with spaces of equal length.
+        For example, "[Hi]" becomes "    " (4 spaces).
+        """
+        pattern = r"(\[.*?\])"
+        return re.sub(pattern, lambda match: " " * len(match.group(1)), _text)
+
+    @property
+    def clean_text(self) -> str:
+        """
+        Return the cleaned text: newlines removed and [xxx] patterns replaced
+        with spaces, keeping character offsets stable.
+        """
+        return SubtitleSegment._replace_with_spaces(self.plaintext.replace("\n", " "))
+
+    def __lt__(self, other: object):
+        if not isinstance(other, SubtitleSegment):
+            return NotImplemented
+        return self.index < other.index
+
+
+class SegmentList(RootModel):
+    root: list[SubtitleSegment] = Field(
+        default_factory=list, description="List of subtitle segments"
+    )
+
+    @property
+    def statistics(self) -> SegmentStatistics:
+        all_words = [word for seg in self.root for word in seg.candidate_words]
+
+        cefr_counts = Counter(word.cefr if word.cefr else "Other" for word in all_words)
+        pos_counts = Counter(word.pos.value if word.pos else "Other" for word in all_words)
+        exam_counts = Counter(exam for word in all_words for exam in word.exams)
+
+        return SegmentStatistics(
+            total_segments=len(self.root),
+            total_words=len(all_words),
+            cefr_distribution=dict(cefr_counts),
+            pos_distribution=dict(pos_counts),
+            exam_distribution=dict(exam_counts),
+        )
+
+    def context_generator(
+        self, context_window: int, extra_len: int = 1
+    ) -> Generator[tuple[list[SubtitleSegment], tuple[int, int]], None, None]:
+        """
+        Yield subtitle segments in batches padded with a context window.
+
+        :param context_window: context window size
+        :param extra_len: extra segments added on both sides of the window
+        :yield: the padded segment list and the (start, end) index range to process
+        """
+        total_segments = len(self.root)
+        for i in range(total_segments // context_window + 1):
+            real_start = i * context_window
+            # Guard against an empty trailing batch when total_segments is a
+            # multiple of context_window (root[real_start] would otherwise
+            # raise an IndexError)
+            if real_start >= total_segments:
+                break
+            real_end = min(total_segments, (i + 1) * context_window) - 1
+            start_index = max(0, i * context_window - extra_len)
+            end_index = min(total_segments, (i + 1) * context_window + extra_len)
+            yield (
+                self.root[start_index:end_index],
+                (self.root[real_start].index, self.root[real_end].index),
+            )
+
+    def sort(self):
+        self.root.sort()
+
+    @model_validator(mode="after")
+    def sort_root(self):
+        self.root.sort()
+        return self
+
+    def __iter__(self) -> Iterator[SubtitleSegment]:
+        return iter(self.root)
+
+
+class SpacyToken(BaseModel):
+    lemma_: str = Field(..., description="Lemma form of the word (string)")
+    pos_: str = Field(..., description="POS tag of the word")
+    text: str = Field(..., description="Text of the word")
+    is_stop: bool = Field(
+        default=False, description="Indicates if the word is a stop word"
+    )
+    is_punct: bool = Field(
+        default=False, description="Indicates if the word is punctuation"
+    )
+    ent_iob_: str = Field(..., description="Entity IOB")
+
+
+class SpacyNamedEntity(BaseModel):
+    text: str = Field(..., description="Text 
of the entity") + label_: str = Field(..., description="Label of the entity") + + +class NlpResult(BaseModel): + tokens: list[SpacyToken] = Field(default_factory=list, description="List of tokens") + entities: list[SpacyNamedEntity] = Field( + default_factory=list, description="List of named entities" + ) + + +class LlmFeedbackAboutCandidateWord(BaseModel): + should_keep: bool = Field( + ..., description="Indicates whether to keep the candidate word" + ) + # reason: str | None = Field(default=None, description="Concise reason for the decision") + word_id: int = Field(..., description="Identifier of the word in the context") + text: str | None = Field(default=None, description="The vocabulary word or phrase") + lemma: str | None = Field(default=None, description="Lemma form of the word") + pos: UniversalPos | None = Field( + default=None, + description="Universal POS tag of the word. Options: ADJ, ADV, INTJ, NOUN, PROPN, " + "VERB, ADP, AUX, CCONJ, DET, NUM, PART, PRON, SCONJ, PUNCT, SYM, X", + ) + + +class LlmFeedback(BaseModel): + candidate_words_feedback: list[LlmFeedbackAboutCandidateWord] = Field( + default_factory=list, description="Feedback about candidate words." + ) + llm_identified_words: list[WordBase] = Field( + default_factory=list, description="List of words identified by the LLM." + ) + + +class LlmWordEnrichment(BaseModel): + word_id: int = Field(..., description="Identifier of the word in the context") + translation: str | None = Field( + default=None, description="Chinese translation of the word" + ) + usage_context: str | None = Field( + default=None, description="Usage or Cultural Context" + ) + + +class LlmEnrichmentResult(BaseModel): + enriched_words: list[LlmWordEnrichment] = Field( + default_factory=list, description="List of enriched word data." 
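+        # Each entry's word_id is expected to echo WordMetadata.word_id, so
+        # enrichment results can be matched back to their Word objects.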
+    )
+
+
+class LlmSegmentTranslation(BaseModel):
+    index: int = Field(..., description="Index of the subtitle segment")
+    translation: str = Field(
+        ..., description="Natural Chinese translation of the segment"
+    )
+
+
+class LlmTranslationResult(BaseModel):
+    translations: list[LlmSegmentTranslation] = Field(
+        default_factory=list, description="List of segment translations"
+    )
+
+
+class VocabularyAnnotatingToolInput(BaseModel):
+    explanation: str = Field(
+        ...,
+        description="This is a tool for adding a new vocabulary-annotating task to LexiAnnot.",
+    )
+    video_path: str = Field(..., description="Path to the video file")
+    skip_existing: bool = Field(
+        default=True, description="Whether to skip existing subtitle files"
+    )
diff --git a/plugins.v2/lexiannot/spacyworker.py b/plugins.v2/lexiannot/spacyworker.py
index 496c1e7..f861325 100644
--- a/plugins.v2/lexiannot/spacyworker.py
+++ b/plugins.v2/lexiannot/spacyworker.py
@@ -1,29 +1,28 @@
 from multiprocessing import Process, Queue
-from typing import Dict, List
 
 import spacy
 from spacy.tokenizer import Tokenizer
 
 from app.core.cache import cached
 from app.log import logger
+from .schemas import SpacyNamedEntity, SpacyToken, NlpResult
 
 
 class SpacyWorker:
-
-    def __init__(self, model='en_core_web_sm'):
+    def __init__(self, model="en_core_web_sm"):
         self.task_q = Queue()
         self.result_q = Queue()
         self.status_q = Queue()
         self.model = model
         # 启动子进程
-        logger.info(f"正在启动 SpacyWorker 子进程...")
+        logger.info("正在启动 SpacyWorker 子进程...")
         self.proc = Process(target=self.run, args=(self.model,))
         self.proc.start()
         # 等待子进程返回模型加载状态
         status, info = self.status_q.get()
-        if status == 'error':
+        if status == "error":
             self.proc.join()
             raise RuntimeError(f"spaCy 模型加载失败: {info}")
         else:
@@ -39,35 +38,50 @@
         try:
             nlp = SpacyWorker.load_nlp(model)
             infixes = list(nlp.Defaults.infixes)
-            infixes = [i for i in infixes if '-' not in i]
+            infixes = [i for i in infixes if "-" not in i]
             infix_re = spacy.util.compile_infix_regex(infixes)
             nlp.tokenizer = Tokenizer(
                 nlp.vocab,
                 prefix_search=nlp.tokenizer.prefix_search,
                 suffix_search=nlp.tokenizer.suffix_search,
                 infix_finditer=infix_re.finditer,
-                token_match=nlp.tokenizer.token_match
+                token_match=nlp.tokenizer.token_match,
             )
         except Exception as e:
-            self.status_q.put(('error', str(e)))
+            self.status_q.put(("error", str(e)))
             return
         # 告诉主进程加载成功
-        self.status_q.put(('ok', None))
+        self.status_q.put(("ok", None))
         while True:
             text = self.task_q.get()
             if text is None:
                 break
             doc = nlp(text)
-            self.result_q.put([{'text': token.text, 'pos_': token.pos_, 'lemma_': token.lemma_} for token in doc])
+            tokens = []
+            entities = []
+            for token in doc:
+                tokens.append(
+                    SpacyToken(
+                        lemma_=token.lemma_,
+                        pos_=token.pos_,
+                        text=token.text,
+                        is_stop=token.is_stop,
+                        is_punct=token.is_punct,
+                        ent_iob_=token.ent_iob_,
+                    )
+                )
+            for ent in doc.ents:
+                entities.append(SpacyNamedEntity(text=ent.text, label_=ent.label_))
+            self.result_q.put(NlpResult(tokens=tokens, entities=entities))
 
     @staticmethod
     @cached(maxsize=1, ttl=3600 * 6)
     def load_nlp(model: str) -> spacy.Language:
         return spacy.load(model)
 
-    def submit(self, text: str) -> List[Dict[str, str]]:
+    def submit(self, text: str) -> NlpResult:
         """
         提交任务并等待结果
         """
diff --git a/plugins.v2/lexiannot/subtitle.py b/plugins.v2/lexiannot/subtitle.py
new file mode 100644
index 0000000..9477713
--- /dev/null
+++ b/plugins.v2/lexiannot/subtitle.py
@@ -0,0 +1,44 @@
+from typing import Generator, Any, overload
+
+from pysubs2 import SSAEvent
+
+from .schemas import SubtitleSegment
+
+
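+# SubtitleProcessor decouples raw pysubs2 events from the pipeline's Pydantic
+# models: it collects SSAEvent objects and exposes them as indexed
+# SubtitleSegment data for the downstream spaCy and LLM stages.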
+class SubtitleProcessor:
+    """Collects subtitle events and yields them as indexed SubtitleSegment models."""
+
+    def __init__(self):
+        self._events: list[SSAEvent] = []
+
+    def append(self, event: SSAEvent):
+        self._events.append(event)
+
+    def segment_generator(self) -> Generator[SubtitleSegment, None, None]:
+        for index, event in enumerate(self._events):
+            yield SubtitleSegment(
+                index=index,
+                start_time=event.start,
+                end_time=event.end,
+                plaintext=event.plaintext,
+            )
+
+    @overload
+    def __getitem__(self, item: int) -> SSAEvent: ...
+
+    @overload
+    def __getitem__(self, item: slice) -> list[SSAEvent]: ...
+
+    def __getitem__(self, item: Any) -> Any:
+        return self._events[item]
+
+
+def style_text(style: str, text: str) -> str:
+    """
+    Wrap text in the given ASS style override.
+
+    :param style: style name
+    :param text: text to wrap
+    :return: the styled text
+    """
+    return f"{{\\r{style}}}{text}{{\\r}}"
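+
+
+# --- Minimal usage sketch (illustrative only, not part of the plugin) ---
+# Shows the intended flow: load a subtitle file with pysubs2, collect its
+# events, then iterate the generated SubtitleSegment objects. The file path
+# and the "Annot" style name are hypothetical placeholders.
+if __name__ == "__main__":
+    from pysubs2 import SSAFile
+
+    subs = SSAFile.load("episode.en.srt")
+    processor = SubtitleProcessor()
+    for ssa_event in subs.events:
+        processor.append(ssa_event)
+    for segment in processor.segment_generator():
+        # clean_text strips newlines and masks [bracketed] cues with spaces
+        print(segment.index, segment.clean_text)
+    # Wrap an annotation in an ASS style override tag:
+    print(style_text("Annot", "aplomb n. 沉着"))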