mirror of https://github.com/jxxghp/MoviePilot-Plugins.git
synced 2026-03-27 10:05:57 +00:00
1844 lines · 90 KiB · Python
from collections import Counter
import os
import re
import sys
import json
import subprocess
import time
import threading
import queue
import shutil
from typing import Any, List, Dict, Tuple, Optional, Union, Type, TypeVar
import venv
from pathlib import Path

import pysubs2
from pysubs2 import SSAFile, SSAEvent
import pymediainfo
from langdetect import detect
import spacy
from spacy.util import compile_infix_regex
from spacy.tokenizer import Tokenizer

from app.core.config import settings
from app.log import logger
from app.plugins import _PluginBase
from app.core.cache import cached
from app.core.event import eventmanager, Event
from app.utils.system import SystemUtils
from app.schemas.types import NotificationType, EventType
from app.utils.http import RequestUtils
from app.utils.string import StringUtils
from app.schemas import TransferInfo
from app.core.context import MediaInfo
from app.plugins.lexiannot.query_gemini import DialogueTranslationTask, VocabularyTranslationTask, Vocabulary, Context

T = TypeVar('T', VocabularyTranslationTask, DialogueTranslationTask)

class LexiAnnot(_PluginBase):
    # Plugin name
    plugin_name = "美剧生词标注"
    # Plugin description
    plugin_desc = "根据CEFR等级,为英语影视剧标注高级词汇。"
    # Plugin icon
    plugin_icon = "LexiAnnot.png"
    # Plugin version
    plugin_version = "1.1.0"
    # Plugin author
    plugin_author = "wumode"
    # Author's homepage
    author_url = "https://github.com/wumode"
    # Prefix for the plugin's config keys
    plugin_config_prefix = "lexiannot_"
    # Load order
    plugin_order = 50
    # Required user auth level
    auth_level = 1

    _enabled: bool = False
    _annot_level = ''
    _send_notify = False
    _onlyonce = False
    _show_vocabulary_detail = False
    _show_phonetics = False
    _sentence_translation = False
    _in_place = False
    _enable_gemini = False
    _gemini_model = ''
    _gemini_apikey = ''
    _context_window: int = 0
    _max_retries: int = 0
    _request_interval: int = 0
    _ffmpeg_path = ''
    _english_only = False
    _when_file_trans = False
    _model_temperature = ''
    _custom_files = ''
    _accent_color = ''
    _font_scaling = ''
    _opacity = ''
    _exam_tags: List[str] = []
    _spacy_model: str = ''
    _delete_data: bool = False

    # protected variables
    _lexicon_repo = 'https://raw.githubusercontent.com/wumode/LexiAnnot/'
    _worker_thread = None
    _task_queue = None
    _shutdown_event = None
    _client = None
    _total_token_count = 0
    _venv_python = None
    _query_gemini_script = ''
    _gemini_available = False
    _accent_color_rgb = None
    _color_alpha = 0
    _loaded = False
    _config_updating_lock: Optional[threading.Lock] = None

    def init_plugin(self, config=None):
        self._task_queue = queue.Queue()
        self._config_updating_lock = threading.Lock()
        self.stop_service()
        if config:
            self._enabled = config.get("enabled")
            self._annot_level = config.get("annot_level") or 'C1'
            self._send_notify = config.get("send_notify")
            self._onlyonce = config.get("onlyonce")
            self._show_vocabulary_detail = config.get("show_vocabulary_detail")
            self._sentence_translation = config.get("sentence_translation")
            self._in_place = config.get("in_place")
            self._enable_gemini = config.get("enable_gemini")
            self._gemini_model = config.get("gemini_model") or 'gemini-2.0-flash'
            self._gemini_apikey = config.get("gemini_apikey") or ''
            self._context_window = int(config.get("context_window") or 10)
            self._max_retries = int(config.get("max_retries") or 3)
            self._request_interval = int(config.get("request_interval") or 3)
            self._ffmpeg_path = config.get("ffmpeg_path")
            self._english_only = config.get("english_only")
            self._when_file_trans = config.get("when_file_trans")
            self._model_temperature = config.get("model_temperature") or '0.3'
            self._show_phonetics = config.get("show_phonetics")
            # default to an empty string so splitting below never hits None
            self._custom_files = config.get("custom_files") or ''
            self._accent_color = config.get("accent_color")
            self._font_scaling = config.get("font_scaling") or '1'
            self._opacity = config.get("opacity") or '0'
            self._spacy_model = config.get("spacy_model") or 'en_core_web_sm'
            self._exam_tags = config.get("exam_tags") or []
            self._delete_data = config.get("delete_data") or False

        self._accent_color_rgb = LexiAnnot.hex_to_rgb(self._accent_color) or (255, 255, 0)
        self._color_alpha = int(self._opacity) if self._opacity and len(self._opacity) else 0
        if self._delete_data:
            # Remove data that is no longer stored in the database
            self.del_data('cefr_lexicon')
            self.del_data('coca2k_lexicon')
            self.del_data('swear_words')
            self.del_data('lexicon_version')
            self.delete_data()
            self._delete_data = False
        self._loaded = False
        if self._enabled:
            self._query_gemini_script = str(settings.ROOT_PATH / "app" / "plugins" / "lexiannot" / "query_gemini.py")

            self._shutdown_event = threading.Event()
            self._worker_thread = threading.Thread(target=self.__process_tasks, daemon=True)
            self._worker_thread.start()

            if self._onlyonce:
                for file_path in self._custom_files.split("\n"):
                    if not file_path:
                        continue
                    self.add_media_file(file_path)
                self._onlyonce = False
        self.__update_config()

    def get_form(self) -> Tuple[List[dict], Dict[str, Any]]:
        """
        Build the plugin's configuration page. Returns two blocks of data:
        1. the page layout; 2. the default values.
        """
        return [
            {
                'component': 'VForm',
                'content': [
                    {
                        'component': 'VRow',
                        'content': [
                            {
                                'component': 'VCol',
                                'props': {'cols': 12, 'md': 3},
                                'content': [{'component': 'VSwitch',
                                             'props': {'model': 'enabled', 'label': '启用插件'}}]
                            },
                            {
                                'component': 'VCol',
                                'props': {'cols': 12, 'md': 3},
                                'content': [{'component': 'VSwitch',
                                             'props': {'model': 'send_notify', 'label': '发送通知'}}]
                            },
                            {
                                'component': 'VCol',
                                'props': {'cols': 12, 'md': 3},
                                'content': [{'component': 'VSwitch',
                                             'props': {'model': 'onlyonce', 'label': '手动运行一次'}}]
                            },
                            {
                                'component': 'VCol',
                                'props': {'cols': 12, 'md': 3},
                                'content': [{'component': 'VSwitch',
                                             'props': {'model': 'delete_data', 'label': '插件数据清理'}}]
                            }
                        ]
                    },
                    {
                        'component': 'VTabs',
                        'props': {
                            'model': '_tabs',
                            'style': {'margin-top': '8px', 'margin-bottom': '16px'},
                            'stacked': True,
                            'fixed-tabs': True
                        },
                        'content': [
                            {'component': 'VTab', 'props': {'value': 'base_tab'}, 'text': '基本设置'},
                            {'component': 'VTab', 'props': {'value': 'subtitle_tab'}, 'text': '字幕设置'},
                            {'component': 'VTab', 'props': {'value': 'gemini_tab'}, 'text': 'Gemini设置'}
                        ]
                    },
                    {
                        'component': 'VWindow',
                        'props': {'model': '_tabs'},
                        'content': [
                            {
                                'component': 'VWindowItem',
                                'props': {'value': 'base_tab'},
                                'content': [
                                    {
                                        'component': 'VRow',
                                        'props': {'style': {'margin-top': '0px'}},
                                        'content': [
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'when_file_trans',
                                                                       'label': '监控入库'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'spacy_model',
                                                        'label': 'spaCy模型',
                                                        'hint': 'spaCy 模型用于分词和词性标注,推荐使用 Small',
                                                        'items': [
                                                            {'title': 'Small (~12 MB)', 'value': 'en_core_web_sm'},
                                                            {'title': 'Medium (~30 MB)', 'value': 'en_core_web_md'},
                                                            {'title': 'Large (700+ MB)', 'value': 'en_core_web_lg'},
                                                            {'title': 'Transformer (400+ MB)',
                                                             'value': 'en_core_web_trf'},
                                                        ]
                                                    }
                                                }]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'annot_level',
                                                        'label': '标注词汇的最低CEFR等级',
                                                        'items': [
                                                            {'title': 'B1', 'value': 'B1'},
                                                            {'title': 'B2', 'value': 'B2'},
                                                            {'title': 'C1', 'value': 'C1'},
                                                            {'title': 'C2', 'value': 'C2'},
                                                            {'title': 'C2+', 'value': 'C2+'}
                                                        ]
                                                    }
                                                }]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'english_only',
                                                                       'label': '仅英语影视剧',
                                                                       'hint': '检查入库影视剧原语言'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 8},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'exam_tags',
                                                        'label': '考试词汇标签',
                                                        'chips': True,
                                                        'multiple': True,
                                                        'items': [
                                                            {'title': '四级', 'value': 'CET-4'},
                                                            {'title': '六级', 'value': 'CET-6'},
                                                            {'title': '考研', 'value': 'NPEE'},
                                                            {'title': '雅思', 'value': 'IELTS'},
                                                            {'title': '托福', 'value': 'TOEFL'},
                                                            {'title': '专四', 'value': 'TEM-4'},
                                                            {'title': '专八', 'value': 'TEM-8'},
                                                            {'title': 'GRE', 'value': 'GRE'},
                                                            {'title': 'PET', 'value': 'PET'},
                                                        ]
                                                    }
                                                }]
                                            }
                                        ]
                                    },
                                    {
                                        'component': 'VRow',
                                        'content': [{
                                            'component': 'VCol',
                                            'props': {'cols': 12},
                                            'content': [{'component': 'VTextField',
                                                         'props': {'model': 'ffmpeg_path',
                                                                   'label': 'FFmpeg 路径',
                                                                   'placeholder': 'ffmpeg'}}]
                                        }]
                                    }
                                ]
                            },
                            {
                                'component': 'VWindowItem',
                                'props': {'value': 'subtitle_tab'},
                                'content': [
                                    {
                                        'component': 'VRow',
                                        'props': {'style': {'margin-top': '0px'}},
                                        'content': [
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'font_scaling',
                                                        'label': '字体缩放',
                                                        'items': [
                                                            {'title': '50%', 'value': '0.5'},
                                                            {'title': '75%', 'value': '0.75'},
                                                            {'title': '100%', 'value': '1'},
                                                            {'title': '125%', 'value': '1.25'},
                                                            {'title': '150%', 'value': '1.5'},
                                                            {'title': '200%', 'value': '2'}
                                                        ]
                                                    }
                                                }]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{'component': 'VTextField',
                                                             'props': {'model': 'accent_color',
                                                                       'label': '强调色',
                                                                       'placeholder': '#FFFF00'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'opacity',
                                                        'label': '不透明度',
                                                        'items': [
                                                            {'title': '0', 'value': '0'},
                                                            {'title': '25%', 'value': '63'},
                                                            {'title': '50%', 'value': '127'},
                                                            {'title': '75%', 'value': '191'},
                                                            {'title': '100%', 'value': '255'},
                                                        ]
                                                    }
                                                }]
                                            }
                                        ]
                                    },
                                    {
                                        'component': 'VRow',
                                        'content': [
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'show_phonetics',
                                                                       'label': '标注音标'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'in_place',
                                                                       'label': '在原字幕插入注释'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 4},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'show_vocabulary_detail',
                                                                       'label': '显示完整释义'}}]
                                            },
                                        ]
                                    },
                                ]
                            },
                            {
                                'component': 'VWindowItem',
                                'props': {'value': 'gemini_tab'},
                                'content': [
                                    {
                                        'component': 'VRow',
                                        'content': [
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 6},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'enable_gemini',
                                                                       'label': '启用Gemini翻译'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 6},
                                                'content': [{'component': 'VSwitch',
                                                             'props': {'model': 'sentence_translation',
                                                                       'label': '整句翻译'}}]
                                            }
                                        ]
                                    },
                                    {
                                        'component': 'VRow',
                                        'content': [
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 6},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'gemini_model',
                                                        'label': '模型',
                                                        'items': [
                                                            {'title': 'gemini-2.5-flash',
                                                             'value': 'gemini-2.5-flash'},
                                                            {'title': 'gemini-2.5-flash-lite',
                                                             'value': 'gemini-2.5-flash-lite'},
                                                            {'title': 'gemini-2.5-pro',
                                                             'value': 'gemini-2.5-pro'},
                                                            {'title': 'gemini-2.0-flash',
                                                             'value': 'gemini-2.0-flash'},
                                                            {'title': 'gemini-2.0-flash-lite',
                                                             'value': 'gemini-2.0-flash-lite'},
                                                        ]
                                                    }
                                                }]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 6},
                                                'content': [{'component': 'VTextField',
                                                             'props': {'model': 'gemini_apikey',
                                                                       'label': 'Gemini APIKEY',
                                                                       'placeholder': ''}}]
                                            },
                                        ]
                                    },
                                    {
                                        'component': 'VRow',
                                        'content': [
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 3},
                                                'content': [{'component': 'VTextField',
                                                             'props': {'model': 'context_window',
                                                                       'label': '上下文窗口大小',
                                                                       'placeholder': '10',
                                                                       'type': 'number',
                                                                       'max': 20,
                                                                       'min': 1,
                                                                       'hint': '向Gemini发送的上下文长度'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 3},
                                                'content': [{
                                                    'component': 'VSelect',
                                                    'props': {
                                                        'model': 'model_temperature',
                                                        'label': '模型温度',
                                                        'items': [
                                                            {'title': '0', 'value': '0'},
                                                            {'title': '0.1', 'value': '0.1'},
                                                            {'title': '0.2', 'value': '0.2'},
                                                            {'title': '0.3', 'value': '0.3'},
                                                        ]
                                                    }
                                                }]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 3},
                                                'content': [{'component': 'VTextField',
                                                             'props': {'model': 'max_retries',
                                                                       'label': '请求重试次数',
                                                                       'placeholder': '3',
                                                                       'type': 'number',
                                                                       'min': 1,
                                                                       'hint': '请求失败重试次数'}}]
                                            },
                                            {
                                                'component': 'VCol',
                                                'props': {'cols': 12, 'md': 3},
                                                'content': [{'component': 'VTextField',
                                                             'props': {'model': 'request_interval',
                                                                       'label': '请求间隔',
                                                                       'type': 'number',
                                                                       'placeholder': '5',
                                                                       'min': 1,
                                                                       'suffix': '秒',
                                                                       'hint': '请求间隔时间,建议不少于3秒'}}]
                                            },
                                        ]
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        'component': 'VRow',
                        'props': {'style': {'margin-top': '0px'}},
                        'content': [{
                            'component': 'VCol',
                            'props': {'cols': 12},
                            'content': [{'component': 'VTextarea',
                                         'props': {'model': 'custom_files',
                                                   'label': '手动处理视频路径',
                                                   'rows': 3,
                                                   'placeholder': '每行一个文件'}}]
                        }]
                    },
                    {
                        'component': 'VRow',
                        'content': [{
                            'component': 'VCol',
                            'props': {'cols': 12},
                            'content': [{
                                'component': 'VAlert',
                                'props': {'type': 'success', 'variant': 'tonal'},
                                'content': [
                                    {'component': 'span', 'text': '配置说明:'},
                                    {
                                        'component': 'a',
                                        'props': {
                                            'href': 'https://github.com/jxxghp/MoviePilot-Plugins/tree/main/plugins.v2/lexiannot/README.md',
                                            'target': '_blank'
                                        },
                                        'content': [{'component': 'u', 'text': 'README'}]
                                    }
                                ]
                            }]
                        }]
                    }
                ]
            }
        ], {
            "enabled": False,
            "annot_level": 'C1',
            "send_notify": False,
            "onlyonce": False,
            "show_vocabulary_detail": False,
            "show_phonetics": False,
            "sentence_translation": False,
            "in_place": False,
            "enable_gemini": False,
            "gemini_model": 'gemini-2.0-flash',
            "gemini_apikey": '',
            "context_window": 10,
            "max_retries": 3,
            "request_interval": 3,
            "ffmpeg_path": "",
            "english_only": True,
            "when_file_trans": True,
            "model_temperature": '0.3',
            "custom_files": '',
            "accent_color": '',
            "font_scaling": '1',
            "opacity": '0',
            "spacy_model": 'en_core_web_sm',
            "exam_tags": [],
            "delete_data": False
        }

    def get_api(self) -> List[Dict[str, Any]]:
        pass

    def get_page(self) -> List[dict]:
        pass

    @staticmethod
    def get_command() -> List[Dict[str, Any]]:
        pass

    def get_state(self) -> bool:
        """
        Return the plugin state: True while the plugin is enabled and running.
        """
        return self._enabled

    def stop_service(self):
        """
        Exit the plugin.
        """
        try:
            self.shutdown()
        except Exception as e:
            logger.error(f"退出插件失败:{e}")

    def shutdown(self):
        """
        Shut down the plugin.
        """
        if self._worker_thread and self._worker_thread.is_alive():
            logger.debug("🔻 Stopping existing worker thread...")
            self._shutdown_event.set()
            self._worker_thread.join()
            logger.debug("✅ Existing worker thread stopped.")
            self._worker_thread = None
        else:
            logger.debug("ℹ️ No running worker thread to stop.")

    def delete_data(self):
        data_path = self.get_data_path()
        lexicon_path = data_path / 'lexicon.json'
        try:
            os.remove(lexicon_path)
            logger.info(f"词典 {lexicon_path} 已删除")
        except FileNotFoundError:
            pass
        except Exception as e:
            logger.error(f"词典 {lexicon_path} 删除失败: {e}")

        venv_dir = data_path / "venv_genai"
        if os.path.exists(venv_dir):
            try:
                shutil.rmtree(venv_dir)
                logger.info(f"虚拟环境 {venv_dir} 已删除")
            except Exception as e:
                logger.error(f"虚拟环境 {venv_dir} 删除失败: {e}")

    def add_media_file(self, path: str):
        """
        Enqueue a new task.
        """
        if not self._shutdown_event.is_set():
            self._task_queue.put(path)
        else:
            raise RuntimeError("Plugin is shutting down. Cannot add new tasks.")

    def __update_config(self):
        with self._config_updating_lock:
            self.update_config({'enabled': self._enabled,
                                'annot_level': self._annot_level,
                                'send_notify': self._send_notify,
                                'onlyonce': self._onlyonce,
                                'show_vocabulary_detail': self._show_vocabulary_detail,
                                'sentence_translation': self._sentence_translation,
                                'in_place': self._in_place,
                                'enable_gemini': self._enable_gemini,
                                'gemini_model': self._gemini_model,
                                'gemini_apikey': self._gemini_apikey,
                                'context_window': self._context_window,
                                'max_retries': self._max_retries,
                                'request_interval': self._request_interval,
                                'ffmpeg_path': self._ffmpeg_path,
                                'english_only': self._english_only,
                                'when_file_trans': self._when_file_trans,
                                'model_temperature': self._model_temperature,
                                'show_phonetics': self._show_phonetics,
                                'custom_files': self._custom_files,
                                'accent_color': self._accent_color,
                                'font_scaling': self._font_scaling,
                                'opacity': self._opacity,
                                'spacy_model': self._spacy_model,
                                'exam_tags': self._exam_tags,
                                })

    def __process_tasks(self):
        """
        Background thread: consume the task queue.
        """
        logger.debug("👷 Worker thread started.")

        self.__load_data()
        if not self._loaded:
            logger.warn('插件数据未加载')
            self._enabled = False
            self.__update_config()
            logger.debug("🛑 Worker exiting...")
            return
        if self._enable_gemini:
            self._gemini_available = True
            res = self.init_venv()
            if not res:
                self._gemini_available = False
            if not self._gemini_apikey:
                logger.warn("未提供GEMINI APIKEY")
                self._gemini_available = False
        while not self._shutdown_event.is_set():
            try:
                task = self._task_queue.get(timeout=1)  # wait at most 1 second
                if task is None:
                    continue
                self.__process_file(task)
            except queue.Empty:
                continue
        logger.debug("🛑 Worker received shutdown signal, exiting...")

    def __process_file(self, path: str):
        """
        Process one video file.
        """
        if not self._loaded:
            return
        lexicon = self.__load_lexicon_from_local()
        if not lexicon:
            logger.error("字典加载失败")
            return
        try:
            nlp = spacy.load(self._spacy_model)
            infixes = list(nlp.Defaults.infixes)
            infixes = [i for i in infixes if '-' not in i]
            # Rebuild the tokenizer with the modified infix patterns so
            # hyphenated words are kept as single tokens
            infix_re = compile_infix_regex(infixes)
            nlp.tokenizer = Tokenizer(
                nlp.vocab,
                prefix_search=nlp.tokenizer.prefix_search,
                suffix_search=nlp.tokenizer.suffix_search,
                infix_finditer=infix_re.finditer,
                token_match=nlp.tokenizer.token_match
            )
        except Exception as e:
            logger.error(f"spaCy 模型 {self._spacy_model} 加载失败: {e}")
            return
        video = Path(path)
        if video.suffix.lower() not in settings.RMT_MEDIAEXT:
            return
        if not video.exists() or not video.is_file():
            logger.warn(f"文件 {str(video)} 不存在, 跳过")
            return
        subtitle = video.with_suffix(".en.ass")
        if subtitle.exists():
            logger.warn(f"字幕文件 ({subtitle}) 已存在, 跳过")
            return
        logger.info(f"📂 Processing file: {path}")
        if self._send_notify:
            message = f"正在处理文件: {path}"
            self.post_message(title=f"【{self.plugin_name}】",
                              mtype=NotificationType.Plugin,
                              text=f"{message}")
        ffmpeg_path = self._ffmpeg_path if self._ffmpeg_path else 'ffmpeg'
        # The extractor returns None when nothing matches; normalize to a list
        embedded_subtitles = LexiAnnot.__extract_subtitles_by_lang(path, 'en', ffmpeg_path) or []
        # Prefer non-SDH tracks: sort SDH tracks to the end (track titles may be None)
        embedded_subtitles = sorted(embedded_subtitles, key=lambda track: 'SDH' in (track['title'] or ''))
        ret_message = ''
        if embedded_subtitles:
            logger.info(f'提取到 {len(embedded_subtitles)} 条英语文本字幕')
            for embedded_subtitle in embedded_subtitles:
                if self._shutdown_event.is_set():
                    return
                ass_subtitle = pysubs2.SSAFile.from_string(embedded_subtitle['subtitle'], format_='ass')
                if embedded_subtitle.get('codec_id') == 'S_TEXT/UTF8':
                    ass_subtitle = LexiAnnot.set_srt_style(ass_subtitle)
                ass_subtitle = self.__set_style(ass_subtitle)
                ass_subtitle = self.process_subtitles(ass_subtitle, lexicon.get('cefr'), lexicon.get('coca20k'),
                                                      lexicon.get('examinations'), lexicon.get('swear_words'), nlp)
                if self._shutdown_event.is_set():
                    return
                if ass_subtitle:
                    try:
                        ass_subtitle.save(str(subtitle))
                        ret_message = f"字幕已保存:{str(subtitle)}"
                        logger.info(f"字幕已保存:{str(subtitle)}")
                    except Exception as e:
                        ret_message = f"字幕文件 {subtitle} 保存失败, {e}"
                        logger.error(f"字幕文件 {subtitle} 保存失败, {e}")
                    break
                else:
                    logger.info(f"处理字幕{embedded_subtitle['codec_id']}-{embedded_subtitle['stream_id']}失败")
        else:
            logger.warn(f"未能在{path}中找到可提取的英文字幕")
        if not ret_message:
            ret_message = f"未能在{path}中找到可提取的英文字幕"
        logger.info(f"✅ Finished: {path}")
        if self._send_notify:
            self.post_message(title=f"【{self.plugin_name}】",
                              mtype=NotificationType.Plugin,
                              text=f"{ret_message}")

    @cached(maxsize=1000, ttl=1800)
    def __load_lexicon_version(self) -> Optional[str]:
        logger.info("正在检查远程词典文件版本...")
        url = f'{self._lexicon_repo}master/version'
        version = RequestUtils().get(url, headers=settings.REPO_GITHUB_HEADERS())
        if version is None:
            return None
        return version.strip()

    def __load_lexicon_from_local(self) -> Optional[Dict[str, Any]]:
        data_path = self.get_data_path()
        lexicon = {}
        try:
            lexicon_path = data_path / 'lexicon.json'
            with open(lexicon_path, 'r', encoding='utf-8') as f:
                lexicon = json.load(f)
        except Exception as e:
            logger.debug(f"词典文件读取失败: {e}")
        lexicon_files = ('cefr', 'coca20k', 'swear_words', 'examinations')
        if any(file not in lexicon for file in lexicon_files):
            return None
        return lexicon

    def __retrieve_lexicon_online(self, version: str) -> Optional[Dict[str, Any]]:
        logger.info('开始下载词典文件...')
        lexicon_files = ['cefr', 'coca20k', 'swear_words', 'examinations']
        lexicon = {}
        for file in lexicon_files:
            url = f'{self._lexicon_repo}master/{file}.json'
            res = RequestUtils().get_res(url, headers=settings.REPO_GITHUB_HEADERS())
            # get_res may return None on network failure
            if res is not None and res.status_code == 200:
                lexicon[file] = res.json()
        if any(file not in lexicon for file in lexicon_files):
            return None
        logger.info(f"词典文件 (v{version}) 下载完成")
        data_path = self.get_data_path()
        lexicon['version'] = version
        try:
            lexicon_path = data_path / 'lexicon.json'
            with open(lexicon_path, 'w', encoding='utf-8') as f:
                json.dump(lexicon, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.warn(f"词典文件保存失败: {e}")
        return lexicon

    def __load_data(self):
        """
        Verify that the plugin's data can be loaded.
        """
        try:
            nlp = spacy.load(self._spacy_model)
        except OSError:
            nlp = LexiAnnot.__load_spacy_model(self._spacy_model)
        lexicon = self.__load_lexicon_from_local()
        latest = self.__load_lexicon_version() or '0.0.0'
        if not lexicon or StringUtils.compare_version(lexicon.get('version'), '<', latest):
            lexicon = self.__retrieve_lexicon_online(latest)

        if not (nlp and lexicon):
            self._loaded = False
            logger.warn("插件数据加载失败")
        else:
            self._loaded = True
            logger.info(f"当前词典文件版本: {lexicon.get('version')}")

    @staticmethod
    def __load_spacy_model(model_name: str):
        try:
            subprocess.run(
                [sys.executable, "-m", "spacy", "download", model_name],
                capture_output=True,
                text=True,
                check=True
            )
            nlp = spacy.load(model_name)
            logger.info(f"spaCy 模型 '{model_name}' 加载成功!")
            return nlp
        except subprocess.CalledProcessError as e:
            logger.error(f"下载 spaCy 模型 '{model_name}' 失败。")
            logger.error(f"命令返回非零退出码:{e.returncode}")
            logger.error(f"Stdout:\n{e.stdout}")
            logger.error(f"Stderr:\n{e.stderr}")
            return None
        except Exception as e:
            logger.error(f"下载或加载 spaCy 模型时发生意外错误:{e}")
            return None

    @eventmanager.register(EventType.TransferComplete)
    def check_media(self, event: Event):
        if not self._enabled or not self._when_file_trans:
            return
        event_info: dict = event.event_data
        if not event_info:
            return

        # Transfer data
        transfer_info: TransferInfo = event_info.get("transferinfo")
        if not transfer_info or not transfer_info.target_diritem or not transfer_info.target_diritem.path:
            return
        mediainfo: MediaInfo = event_info.get("mediainfo")
        if self._english_only:
            if not mediainfo:
                return
            if mediainfo.original_language != 'en':
                logger.info(f"原始语言 ({mediainfo.original_language}) 不为英语, 跳过 {mediainfo.title}")
                return
        for new_path in transfer_info.file_list_new:
            self.add_media_file(new_path)

    @staticmethod
    def query_cefr(word, cefr_lexicon):
        word = word.lower().strip("-*'")
        if word in cefr_lexicon:
            return cefr_lexicon[word]
        else:
            return None

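    # Example lookup (hypothetical entry, matching the shape consumed by
    # get_cefr_by_spacy below):
    #   cefr_lexicon = {'meticulous': [{'pos': 'adjective', 'cefr': 'C2'}]}
    #   LexiAnnot.query_cefr('Meticulous', cefr_lexicon)   # case-folded lookup
    #   -> [{'pos': 'adjective', 'cefr': 'C2'}]            # unknown words -> None
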
    @staticmethod
    def query_coca20k(word: str, lexicon: Dict[str, Any]):
        word = word.lower().strip("-*'")
        return lexicon.get(word)

    @staticmethod
    def query_examinations(word: str, lexicon: Dict[str, Any]) -> Dict[str, Any]:
        res = {}
        for examination, exam_lexicon in lexicon.items():
            if word in exam_lexicon:
                res[examination] = exam_lexicon[word]
        return res

    @staticmethod
    def convert_pos_to_spacy(pos: str):
        """
        Convert a lexicon part-of-speech label to the tag set used by spaCy.

        :param pos: part of speech as a string
        :returns: the corresponding spaCy POS tag, or None when there is no direct mapping
        """
        spacy_pos_map = {
            'noun': 'NOUN',
            'adjective': 'ADJ',
            'adverb': 'ADV',
            'verb': 'VERB',
            'preposition': 'ADP',
            'conjunction': 'CCONJ',
            'determiner': 'DET',
            'pronoun': 'PRON',
            'interjection': 'INTJ',
            'number': 'NUM'
        }

        pos_lower = pos.lower()
        if pos_lower in spacy_pos_map:
            spacy_pos = spacy_pos_map[pos_lower]
        elif pos_lower == 'be-verb':
            spacy_pos = 'AUX'  # Auxiliary verb (e.g., be, do, have)
        elif pos_lower == 'vern':
            spacy_pos = 'VERB'  # Assuming 'vern' is a typo for 'verb'
        elif pos_lower == 'modal auxiliary':
            spacy_pos = 'AUX'  # Modal verbs are also auxiliaries
        elif pos_lower == 'do-verb':
            spacy_pos = 'AUX'
        elif pos_lower == 'have-verb':
            spacy_pos = 'AUX'
        elif pos_lower == 'infinitive-to':
            spacy_pos = 'PART'  # Particle (e.g., to in "to go")
        elif not pos_lower:  # Handle empty strings
            spacy_pos = None
        else:
            spacy_pos = None  # For unmapped POS tags
        return spacy_pos

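    # Examples of the mapping above (values follow spaCy's Universal POS tags):
    #   convert_pos_to_spacy('noun') -> 'NOUN'
    #   convert_pos_to_spacy('be-verb') -> 'AUX'
    #   convert_pos_to_spacy('infinitive-to') -> 'PART'
    #   convert_pos_to_spacy('article') -> None   # unmapped labels fall through
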
    @staticmethod
    def get_cefr_by_spacy(lemma_: str, pos_: str, cefr_lexicon: Dict[str, Any]) -> Optional[str]:
        result = LexiAnnot.query_cefr(lemma_, cefr_lexicon)
        if result:
            all_cefr = []
            for entry in result:
                if pos_ == LexiAnnot.convert_pos_to_spacy(entry['pos']):
                    return entry['cefr']
                all_cefr.append(entry['cefr'])
            # No POS match: fall back to the easiest level. CEFR labels
            # ('A1' < 'A2' < ... < 'C2') order correctly as plain strings.
            return min(all_cefr)
        return None

    @staticmethod
    def format_duration(ms):
        total_seconds, milliseconds = divmod(ms, 1000)
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        hundredths = milliseconds // 10
        return f"{hours}:{minutes:02}:{seconds:02}.{hundredths:02}"

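    # Worked example: 3723450 ms -> 3723 s + 450 ms -> 1 h, 2 min, 3 s, 45/100 s,
    # so format_duration(3723450) -> "1:02:03.45" (the H:MM:SS.cc timestamp
    # format used by ASS subtitles).
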
    @staticmethod
    def replace_by_plaintext_positions(line: SSAEvent, replacements: List[dict]):
        """
        Replace content in line.text using positions expressed in plaintext coordinates.

        :param line: SSAEvent line
        :param replacements: [{'start': int, 'end': int, 'old_text': str, 'new_text': str}, ...]
        """
        text = line.text
        tag_pattern = re.compile(r"{.*?}")  # matches {xxx} override tags
        special_pattern = re.compile(r"\\[Nh]")
        # Build a mapping from plaintext indices to indices in the tagged text
        mapping = {}  # plaintext_index -> text_index
        p_index = 0  # current plaintext index
        t_index = 0  # current text index

        while t_index < len(text):
            if text[t_index] == "{":
                # Skip override tags
                match = tag_pattern.match(text, t_index)
                if match:
                    t_index = match.end()
                    continue
            elif text[t_index] == "\\":
                match = special_pattern.match(text, t_index)
                if match:
                    t_index = match.end() - 1
                    continue
            # A non-formatting character
            mapping[p_index] = t_index
            p_index += 1
            t_index += 1

        # Apply the replacements via the mapping, in reverse order so earlier
        # offsets stay valid
        new_text = text
        for r in sorted(replacements, key=lambda x: x["start"], reverse=True):
            start = mapping.get(r["start"])
            end = mapping.get(r["end"] - 1)
            if start is None or end is None:
                continue
            end += 1
            new_text = new_text[:start] + r["new_text"] + new_text[end:]

        line.text = new_text

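    # Worked example (hypothetical line): with
    #   line.text = r"{\i1}Hello{\i0} world"          (plaintext: "Hello world")
    #   replacements = [{'start': 6, 'end': 11, 'old_text': 'world',
    #                    'new_text': r'{\c&H0000ffff&}world{\r}'}]
    # plaintext indices 6..10 map past the {\i1}/{\i0} tags, so the result is
    #   line.text == r"{\i1}Hello{\i0} {\c&H0000ffff&}world{\r}"
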
    @staticmethod
    def analyze_ass_language(ass_file: SSAFile):
        styles = {}
        for style in ass_file.styles:
            styles[style] = {'text': [], 'duration': 0, 'text_size': 0, 'times': 0}
        for dialogue in ass_file:
            style = dialogue.style
            text = dialogue.plaintext
            sub_text = text.split('\n')
            if style not in styles or not text:
                continue
            styles[style]['text'].extend(sub_text)
            styles[style]['duration'] += dialogue.duration
            styles[style]['text_size'] += len(text)
            styles[style]['times'] += 1
        style_language_analysis = {}
        for style_name, data in styles.items():
            all_text = ' '.join(data['text'])
            if not all_text.strip():
                style_language_analysis[style_name] = None
                continue

            languages = []
            # Detect the language of each text fragment
            for text_fragment in data['text']:
                try:
                    lang = detect(text_fragment)
                    languages.append(lang)
                except Exception:
                    pass  # undetectable text

            if languages:
                language_counts = Counter(languages)
                most_common_language = language_counts.most_common(1)[0]
                style_language_analysis[style_name] = {"main_language": most_common_language[0],
                                                       "proportion": most_common_language[1] / len(languages),
                                                       "duration": data['duration'],
                                                       "text_size": data['text_size'],
                                                       "times": data['times']}
            else:
                style_language_analysis[style_name] = None

        return style_language_analysis

    @staticmethod
    def select_main_style_weighted(language_analysis: Dict[str, Any], known_language: str,
                                   weights=None):
        """
        Pick the main subtitle style from the language-analysis result using a
        weighted score.

        :param language_analysis: output of `analyze_ass_language`
        :param known_language: the known subtitle language code
        :param weights: weight of each dimension; the weights should sum to 1
        :returns: the name of the main subtitle style, or None when no style matches
        """
        if weights is None:
            weights = {'times': 0.5, 'text_size': 0.4, 'duration': 0.1}
        matching_styles = []
        # default=0 keeps max() from raising when every analysis is None
        max_times = max([analysis.get('times', 0) for _, analysis in language_analysis.items() if analysis],
                        default=0) or 1
        max_text_size = max([analysis.get('text_size', 0) for _, analysis in language_analysis.items() if analysis],
                            default=0) or 1
        max_duration = max([analysis.get('duration', 0) for _, analysis in language_analysis.items() if analysis],
                           default=0) or 1
        for style, analysis in language_analysis.items():
            if not analysis:
                continue
            if analysis.get('main_language') == known_language:
                # Skip mixed-language styles
                if analysis.get('proportion', 0) < 0.5:
                    continue
                score = 0
                score += analysis.get('times', 0) * weights.get('times', 0) / max_times
                score += analysis.get('text_size', 0) * weights.get('text_size', 0) / max_text_size
                score += analysis.get('duration', 0) * weights.get('duration', 0) / max_duration
                matching_styles.append((style, score))

        if not matching_styles:
            return None

        sorted_styles = sorted(matching_styles, key=lambda item: item[1], reverse=True)
        return sorted_styles[0][0]

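    # Worked example (hypothetical analysis): with the default weights and two
    # English styles A (times=100, text_size=4000, duration=300000) and
    # B (times=20, text_size=800, duration=60000), the maxima all come from A, so
    #   score(A) = 0.5*1.0 + 0.4*1.0 + 0.1*1.0 = 1.0
    #   score(B) = 0.5*0.2 + 0.4*0.2 + 0.1*0.2 = 0.2
    # and select_main_style_weighted picks style A as the main dialogue style.
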
    @staticmethod
    def set_srt_style(ass: SSAFile) -> SSAFile:
        ass.info['ScaledBorderAndShadow'] = 'no'
        play_res_y = int(ass.info.get('PlayResY'))
        if 'Default' in ass.styles:
            ass.styles['Default'].marginv = play_res_y // 16
            ass.styles['Default'].fontname = 'Microsoft YaHei'
            ass.styles['Default'].fontsize = play_res_y // 16
        return ass

    def __set_style(self, ass: SSAFile) -> SSAFile:
        font_scaling = float(self._font_scaling) if self._font_scaling and len(self._font_scaling) else 1
        play_res_y = int(ass.info.get('PlayResY'))
        play_res_x = int(ass.info.get('PlayResX'))
        # Create the annotation styles
        fs = play_res_y // 16 * font_scaling
        new_style = pysubs2.SSAStyle()
        new_style.name = 'Annotation EN'
        new_style.fontname = 'Times New Roman'
        new_style.fontsize = fs
        new_style.primarycolor = pysubs2.Color(self._accent_color_rgb[0],
                                               self._accent_color_rgb[1],
                                               self._accent_color_rgb[2],
                                               self._color_alpha)  # accent color (R, G, B) + alpha
        new_style.bold = True
        new_style.italic = False
        new_style.outline = 1
        new_style.shadow = 0
        new_style.alignment = pysubs2.Alignment.TOP_LEFT
        new_style.marginl = play_res_x // 20
        new_style.marginr = play_res_x // 20
        new_style.marginv = fs
        ass.styles['Annotation EN'] = new_style
        zh_style = new_style.copy()
        zh_style.name = 'Annotation ZH'
        zh_style.fontname = 'Microsoft YaHei'
        zh_style.primarycolor = pysubs2.Color(255, 255, 255, self._color_alpha)
        ass.styles['Annotation ZH'] = zh_style

        pos_style = zh_style.copy()
        pos_style.name = 'Annotation POS'
        pos_style.fontname = 'Times New Roman'
        pos_style.fontsize = fs * 0.75
        pos_style.italic = True
        ass.styles['Annotation POS'] = pos_style

        phone_style = pos_style.copy()
        phone_style.name = 'Annotation PHONE'
        phone_style.fontname = 'Arial'
        phone_style.fontsize = fs * 0.75
        phone_style.bold = False
        phone_style.italic = False
        ass.styles['Annotation PHONE'] = phone_style

        pos_def_cn_style = zh_style.copy()
        pos_def_cn_style.name = 'DETAIL CN'
        pos_def_cn_style.fontsize = fs * 0.7
        ass.styles['DETAIL CN'] = pos_def_cn_style

        pos_def_pos_style = pos_style.copy()
        pos_def_pos_style.name = 'DETAIL POS'
        pos_def_pos_style.fontsize = fs * 0.6
        ass.styles['DETAIL POS'] = pos_def_pos_style

        cefr_style = pos_style.copy()
        cefr_style.name = "Annotation CEFR"
        cefr_style.fontname = "Times New Roman"
        cefr_style.fontsize = fs * 0.5
        cefr_style.bold = True
        cefr_style.italic = False
        cefr_style.primarycolor = pysubs2.Color(self._accent_color_rgb[0],
                                                self._accent_color_rgb[1],
                                                self._accent_color_rgb[2],
                                                self._color_alpha)
        cefr_style.outline = 1
        cefr_style.shadow = 0
        ass.styles['Annotation CEFR'] = cefr_style
        # Exam tags reuse the CEFR badge style
        ass.styles['Annotation EXAM'] = cefr_style
        return ass

    @staticmethod
    def hex_to_rgb(hex_color) -> Optional[Tuple]:
        if not hex_color:
            return None
        pattern = r'^#[0-9a-fA-F]{6}$'
        if re.match(pattern, hex_color) is None:
            return None
        hex_color = hex_color.lstrip('#')  # drop the leading '#'
        return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))

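    # Examples:
    #   LexiAnnot.hex_to_rgb('#FFFF00') -> (255, 255, 0)
    #   LexiAnnot.hex_to_rgb('FFFF00') -> None    # the '#' prefix is required
    #   LexiAnnot.hex_to_rgb('#FFF') -> None      # shorthand form is rejected
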
    @staticmethod
    def __extract_subtitle(video_path: str,
                           subtitle_stream_index: str,
                           ffmpeg_path: str = 'ffmpeg',
                           sub_format='ass') -> Optional[str]:
        if sub_format not in ['srt', 'ass']:
            raise ValueError('Invalid subtitle format')
        try:
            map_parameter = f"0:s:{subtitle_stream_index}"
            command = [
                ffmpeg_path,
                '-i', video_path,
                '-map', map_parameter,
                '-f', sub_format,
                '-'
            ]
            result = subprocess.run(command, capture_output=True, text=True, encoding='utf-8', check=True)
            return result.stdout
        except FileNotFoundError:
            logger.warn(f"错误:找不到视频文件 '{video_path}'")
            return None
        except subprocess.CalledProcessError as e:
            logger.warn(f"错误:提取字幕失败。\n错误信息:{e}")
            # subprocess.run was called with text=True, so e.stderr is already str
            logger.warn(f"FFmpeg 输出 (stderr):\n{e.stderr}")
            return None

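    # The command built above is equivalent to running, e.g.:
    #   ffmpeg -i movie.mkv -map 0:s:0 -f ass -
    # which decodes the selected subtitle stream and writes it as ASS to stdout
    # (captured via capture_output=True). 'movie.mkv' is a hypothetical path.
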
    @staticmethod
    def __extract_subtitles_by_lang(video_path: str, lang: str = 'en', ffmpeg: str = 'ffmpeg') -> Optional[List[Dict]]:
        """
        Extract embedded text subtitles in the given language, using MediaInfo
        to locate the subtitle streams.
        """
        supported_codec = ['S_TEXT/UTF8', 'S_TEXT/ASS']
        subtitles = []
        try:
            media_info: pymediainfo.MediaInfo = pymediainfo.MediaInfo.parse(video_path)
            for track in media_info.tracks:
                if track.track_type == 'Text' and track.language == lang and track.codec_id in supported_codec:
                    # stream_identifier is zero-based, matching ffmpeg's 0:s:N selector
                    subtitle_stream_index = track.stream_identifier
                    subtitle = LexiAnnot.__extract_subtitle(video_path, subtitle_stream_index, ffmpeg)
                    if subtitle:
                        subtitles.append({'title': track.title, 'subtitle': subtitle, 'codec_id': track.codec_id,
                                          'stream_id': subtitle_stream_index})
            if subtitles:
                return subtitles
            else:
                logger.warn('未找到标记为英语的文本字幕流')
                return None

        except FileNotFoundError:
            logger.error(f"找不到视频文件 '{video_path}'")
            return None
        except subprocess.CalledProcessError as e:
            logger.error(f"错误:提取字幕失败。\n错误信息:{e}")
            logger.error(f"FFmpeg 输出 (stderr):\n{e.stderr}")
            return None
        except Exception as e:
            logger.error(f"使用 MediaInfo 提取字幕时发生错误:{e}")
            return None

    def init_venv(self) -> bool:
        venv_dir = os.path.join(self.get_data_path(), "venv_genai")
        python_path = os.path.join(venv_dir, "bin", "python") if os.name != "nt" \
            else os.path.join(venv_dir, "Scripts", "python.exe")
        # Create the virtual environment
        try:
            if not os.path.exists(venv_dir):
                logger.info(f"为 google-genai 初始化虚拟环境: {venv_dir}")
                venv.create(venv_dir, with_pip=True, symlinks=True, clear=True)
                logger.info(f"虚拟环境创建成功: {venv_dir}")
                SystemUtils.execute_with_subprocess([python_path, "-m", "pip", "install", 'google-genai'])
        except subprocess.CalledProcessError as e:
            logger.warn(f"虚拟环境创建失败: {e}")
            shutil.rmtree(venv_dir)
            return False
        self._venv_python = python_path

        return True

    def __query_gemini(
            self,
            tasks: List[T],
            task_type: Type[T],
            api_key: str,
            system_instruction: str,
            model: str,
            temperature: float
    ) -> List[T]:
        input_dict = {
            'tasks': [task.dict() for task in tasks],  # keep the payload JSON-serializable
            'params': {
                'api_key': api_key,
                'system_instruction': system_instruction,
                'schema': task_type.__name__,
                'model': model,
                'temperature': temperature,
                'max_retries': self._max_retries
            }
        }

        try:
            result = subprocess.run(
                [self._venv_python, self._query_gemini_script],
                input=json.dumps(input_dict),
                capture_output=True,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            logger.warning(f"Subprocess failed: {str(e)}")
            return tasks

        try:
            response = json.loads(result.stdout)
        except json.JSONDecodeError:
            logger.warning(f"Invalid JSON from subprocess:\n{result.stdout}")
            return tasks

        if not response.get("success"):
            logger.warning(f"Error in subprocess response: {response.get('message')}")
            return tasks

        try:
            return [task_type(**task_data) for task_data in response["data"]["tasks"]]
        except Exception as e:
            logger.warning(f"Failed to reconstruct tasks: {str(e)}")
            return tasks

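    # The subprocess protocol, as implied by the code above: query_gemini.py
    # reads one JSON object on stdin shaped like
    #   {"tasks": [...],
    #    "params": {"api_key": "...", "system_instruction": "...",
    #               "schema": "VocabularyTranslationTask", "model": "...",
    #               "temperature": 0.3, "max_retries": 3}}
    # and answers on stdout with
    #   {"success": true, "message": "...", "data": {"tasks": [...]}}
    # On any failure the original, untranslated tasks are returned unchanged.
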
    def __process_by_ai(self, lines_to_process: List[Dict[str, Any]],
                        cefr_lexicon: Dict[str, Any],
                        coca20k_lexicon: Dict[str, Any],
                        exams_lexicon: Dict[str, Any],
                        swear_words: List[str],
                        nlp: spacy.Language):

        def __replace_with_spaces(_text):
            """
            Replace [xxx] patterns in the text with spaces of equal length,
            e.g. "[Hi]" becomes "    " (4 spaces), so positions stay aligned.
            """
            pattern = r'(\[.*?\])'
            return re.sub(pattern, lambda match: ' ' * len(match.group(1)), _text)

        # CEFR labels sort lexicographically, so string comparison selects the
        # levels below the annotation threshold
        simple_vocabulary = list(filter(lambda x: x < self._annot_level, ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))
        patterns = [r'\d+th|\d?1st|\d?2nd|\d?3rd', r"\w+'s$", r"\w+'t$", "[Ii]'m$", r"\w+'re$", r"\w+'ve$", r"\w+'ll$"]
        compiled_patterns = [re.compile(p) for p in patterns]
        model_temperature = float(self._model_temperature) if self._model_temperature else 0.3
        logger.info("通过 spaCy 分词...")
        vocabulary_trans_instruction = '''You are an expert translator. You will be given a list of English words along with their context, formatted as JSON. For each entry, provide the most appropriate translation in Simplified Chinese based on the context.
Only complete the `Chinese` field. Do not include pinyin, explanations, or any additional information.'''
        # Tokenize with spaCy
        for line_data in lines_to_process:
            if self._shutdown_event.is_set():
                return lines_to_process
            text_raw = line_data.get('raw_subtitle')
            text = text_raw.replace('\n', ' ')
            text = __replace_with_spaces(text)
            new_vocab = []
            doc = nlp(text)
            last_end_pos = 0
            lemma_to_query = []
            for token in doc:
                if len(token.text) == 1:
                    continue
                if token.lemma_ in swear_words:
                    continue
                if token.pos_ not in ('NOUN', 'AUX', 'VERB', 'ADJ', 'ADV', 'ADP', 'CCONJ', 'SCONJ'):
                    continue
                striped = token.lemma_.strip('-[')
                if any(p.match(striped) for p in compiled_patterns):
                    continue
                cefr = LexiAnnot.get_cefr_by_spacy(striped, token.pos_, cefr_lexicon)
                if cefr and cefr in simple_vocabulary:
                    continue
                res_of_coco = LexiAnnot.query_coca20k(striped, coca20k_lexicon)
                if res_of_coco and not cefr:
                    cefr = ''
                res_of_exams = self.query_examinations(striped, exams_lexicon)
                exam_tags = []
                if res_of_exams:
                    exam_tags = [exam_id for exam_id in res_of_exams if exam_id in self._exam_tags]
                if striped in lemma_to_query:
                    continue
                else:
                    lemma_to_query.append(striped)
                striped_text = token.text.strip('-*[')
                start_pos = text.find(striped_text, last_end_pos)
                end_pos = start_pos + len(striped_text)
                phonetics = ''
                pos_defs = []
                if res_of_exams:
                    for exam, value in res_of_exams.items():
                        phonetics = value.get('ipa_uk') or ''
                        defs = {}
                        for pos_def in value.get('defs', []):
                            pos = pos_def.get('pos', '')
                            definition_cn = pos_def.get('definition_cn', '')
                            defs.setdefault(pos, []).append(definition_cn)
                        pos_defs = [{'pos': pos, 'meanings': meanings} for pos, meanings in defs.items() if pos]
                        break
                elif res_of_coco:
                    phonetics = res_of_coco.get('phonetics_1') or ''
                    pos_defs = res_of_coco.get('pos_defs') or []
                last_end_pos = end_pos
                new_vocab.append({'start': start_pos, 'end': end_pos, 'text': striped_text, 'lemma': striped,
                                  'pos': token.pos_, 'cefr': cefr, 'Chinese': '', 'phonetics': phonetics,
                                  'pos_defs': pos_defs, 'exam_tags': exam_tags})
            line_data['new_vocab'] = new_vocab
        # Query vocabulary translations
        task_bulk: List[Union[VocabularyTranslationTask, DialogueTranslationTask]] = []
        i = 0
        if self._gemini_available:
            logger.info("查询词汇翻译...")
        for line_data in lines_to_process:
            if self._shutdown_event.is_set():
                return lines_to_process
            if not self._gemini_available:
                break
            i += 1
            if not (len(line_data["new_vocab"]) or (i == len(lines_to_process) and len(task_bulk))):
                continue
            new_vocab = [Vocabulary(lemma=new_vocab['lemma'], Chinese='') for new_vocab in line_data['new_vocab']]
            task_bulk.append(VocabularyTranslationTask(index=line_data['index'],
                                                       vocabulary=new_vocab,
                                                       context=Context(
                                                           original_text=line_data['raw_subtitle'].replace('\n', ' ')
                                                       )))
            if len(task_bulk) >= self._context_window or (len(task_bulk) and i == len(lines_to_process)):
                logger.info(f"processing dialogues: "
                            f"{LexiAnnot.format_duration(lines_to_process[task_bulk[0].index]['time_code'][0])} -> "
                            f"{LexiAnnot.format_duration(lines_to_process[i - 1]['time_code'][1])}")
                answer: Optional[List[VocabularyTranslationTask]] = self.__query_gemini(task_bulk,
                                                                                        VocabularyTranslationTask,
                                                                                        self._gemini_apikey,
                                                                                        vocabulary_trans_instruction,
                                                                                        self._gemini_model,
                                                                                        model_temperature)
                if not answer:
                    continue
                time.sleep(self._request_interval)
                for answer_line in answer:
                    answer_lemma = tuple(v.lemma for v in answer_line.vocabulary)
                    filtered_raw = [x for x in lines_to_process if x.get('index') == answer_line.index]
                    if not len(filtered_raw):
                        logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.context.original_text}')
                    available_answer = False
                    for item in filtered_raw:
                        lemma = tuple(v['lemma'] for v in item['new_vocab'])
                        if lemma == answer_lemma:
                            available_answer = True
                            for i_, v in enumerate(item['new_vocab']):
                                v['Chinese'] = answer_line.vocabulary[i_].Chinese
                            break
                    if not available_answer:
                        logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.context.original_text}')
                task_bulk = []
        if not self._sentence_translation:
            return lines_to_process
        if self._gemini_available:
            logger.info("查询整句翻译...")
        # Query full-sentence translations
        translation_tasks: List[DialogueTranslationTask] = []
        for line_data in lines_to_process:
            translation_tasks.append(DialogueTranslationTask(index=line_data['index'],
                                                             original_text=line_data['raw_subtitle'].replace('\n', ' '),
                                                             Chinese=''))
        i = 0
        dialog_trans_instruction = '''You are an expert translator. You will be given a list of dialogue translation tasks in JSON format. For each entry, provide the most appropriate translation in Simplified Chinese based on the context.
Only complete the `Chinese` field. Do not include pinyin, explanations, or any additional information.'''
        while i < len(translation_tasks):
            if self._shutdown_event.is_set():
                return lines_to_process
            if not self._gemini_available:
                break
            start_index = max(0, i - 1)
            end_index = min(len(translation_tasks), i + self._context_window + 1)
            task_bulk: List[DialogueTranslationTask] = translation_tasks[start_index:end_index]
            logger.info(f"processing dialogues: "
                        f"{LexiAnnot.format_duration(lines_to_process[i]['time_code'][0])} -> "
                        f"{LexiAnnot.format_duration(lines_to_process[min(len(translation_tasks), i + self._context_window) - 1]['time_code'][1])}")
            answer: List[DialogueTranslationTask] = self.__query_gemini(task_bulk,
                                                                        DialogueTranslationTask,
                                                                        self._gemini_apikey,
                                                                        dialog_trans_instruction,
                                                                        self._gemini_model,
                                                                        model_temperature)
            time.sleep(self._request_interval)
            for answer_line in answer:
                if answer_line.index not in range(i, i + self._context_window):
                    continue
                filtered_raw = [x for x in lines_to_process if x.get('index') == answer_line.index]
                if not len(filtered_raw):
                    logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.original_text}')
                available_answer = False
                for item in filtered_raw:
                    if item['raw_subtitle'].replace('\n', ' ') == answer_line.original_text:
                        available_answer = True
                        item['Chinese'] = answer_line.Chinese
                        break
                if not available_answer:
                    logger.warn(f'Unknown answer: {answer_line.index}: {answer_line.original_text}')
            i += self._context_window
        return lines_to_process

    def process_subtitles(self, ass_file: SSAFile,
                          cefr_lexicon: Dict[str, Any],
                          coca20k_lexicon: Dict[str, Any],
                          exams_lexicon: Dict[str, Any],
                          swear_words: List[str],
                          nlp: spacy.Language) -> Optional[SSAFile]:
        """
        Process the subtitle content: highlight vocabulary and add translations.
        """
        lang = 'en'
        abgr_str = (f'&H{self._color_alpha:02x}{self._accent_color_rgb[2]:02x}'
                    f'{self._accent_color_rgb[1]:02x}{self._accent_color_rgb[0]:02x}&')  # e.g. &H00FFFFFF&
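        # ASS inline colors are little-endian &HAABBGGRR&: alpha first, then
        # blue, green, red. Worked example with the default accent (255, 255, 0)
        # and alpha 0: f'&H{0:02x}{0:02x}{255:02x}{255:02x}&' -> '&H0000ffff&',
        # i.e. fully opaque yellow.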
        pos_map = {
            'NOUN': 'n.',
            'AUX': 'aux.',
            'VERB': 'v.',
            'ADJ': 'adj.',
            'ADV': 'adv.',
            'ADP': 'prep.',
            'CCONJ': 'conj.',
            'SCONJ': 'conj.'
        }
        statistical_res = LexiAnnot.analyze_ass_language(ass_file)
        main_style = LexiAnnot.select_main_style_weighted(statistical_res, lang)
        if not main_style:
            logger.error('无法确定主要字幕样式')
            return None
        index = 0
        lines_to_process = []
        main_dialogue: Dict[int, SSAEvent] = {}
        for dialogue in ass_file:
            if dialogue.style != main_style:
                continue
            time_code = (dialogue.start, dialogue.end)
            text_raw = dialogue.plaintext
            line_data = {'index': index, 'time_code': time_code, 'raw_subtitle': text_raw, 'new_vocab': [],
                         'Chinese': ''}
            lines_to_process.append(line_data)
            main_dialogue[index] = dialogue
            index += 1
        lines_to_process = self.__process_by_ai(lines_to_process, cefr_lexicon, coca20k_lexicon, exams_lexicon,
                                                swear_words, nlp)

        # Insert the annotations into the subtitle
        main_style_fs = ass_file.styles[main_style].fontsize
        for line_data in lines_to_process:
            if self._shutdown_event.is_set():
                return None
            if line_data['new_vocab']:
                replacements = line_data['new_vocab']
                for replacement in replacements:
                    part_of_speech = f"{{\\fnTimes New Roman\\fs{int(main_style_fs * 0.75)}\\i1}}{pos_map[replacement['pos']]}{{\\r}}"
                    new_text = f"{{\\c{abgr_str}}}{replacement['text']}{{\\r}}"
                    if self._in_place:
                        # Append the inline gloss only when a translation is
                        # available; the highlighted word itself is always kept
                        new_text = new_text + (f" ({replacement['Chinese']} {part_of_speech})"
                                               if replacement['Chinese'] else "")
                    else:
                        dialogue = pysubs2.SSAEvent()
                        dialogue.start = main_dialogue[line_data['index']].start
                        dialogue.end = main_dialogue[line_data['index']].end
                        dialogue.style = 'Annotation EN'
                        cefr_text = f" {{\\rAnnotation CEFR}}{replacement['cefr']}{{\\r}}" \
                            if replacement['cefr'] else ""
                        exam_text = f" {{\\rAnnotation EXAM}}{' '.join(replacement['exam_tags'])}{{\\r}}" \
                            if replacement['exam_tags'] else ""
                        __N = r'\N'
                        phone_text = f"{__N}{{\\rAnnotation PHONE}}/{replacement['phonetics']}/{{\\r}}" \
                            if replacement['phonetics'] and self._show_phonetics else ""
                        annot_text = (f"{replacement['lemma']} "
                                      f"{{\\rAnnotation POS}}{pos_map[replacement['pos']]}{{\\r}} "
                                      f"{{\\rAnnotation ZH}}{replacement['Chinese']}{{\\r}}"
                                      f"{cefr_text}{exam_text}{phone_text}")
                        dialogue.text = annot_text
                        ass_file.append(dialogue)
                        if self._show_vocabulary_detail and replacement['pos_defs']:
                            dialogue = pysubs2.SSAEvent()
                            dialogue.start = main_dialogue[line_data['index']].start
                            dialogue.end = main_dialogue[line_data['index']].end
                            dialogue.style = 'DETAIL CN'
                            detail_text = []
                            for pos_def in replacement['pos_defs']:
                                meaning_str = ', '.join(pos_def['meanings'])
                                pos_text = f"{{\\rDETAIL POS}}{pos_def['pos']}{{\\r}} {meaning_str}"
                                detail_text.append(pos_text)
                            dialogue.text = '\\N'.join(detail_text)
                            ass_file.append(dialogue)
                    replacement['new_text'] = new_text
                LexiAnnot.replace_by_plaintext_positions(main_dialogue[line_data['index']], replacements)
            if self._sentence_translation:
                chinese = line_data['Chinese']
                if chinese and chinese[-1] in ['。', ',']:
                    chinese = chinese[:-1]
                main_dialogue[line_data['index']].text = main_dialogue[line_data['index']].text + f"\\N{chinese}"
        return ass_file