feat(LexiAnnot): Improve subtitle selection strategy

This commit is contained in:
wumode
2025-11-19 21:15:56 +08:00
parent e96eece117
commit a9830202e8
5 changed files with 48 additions and 39 deletions

View File

@@ -533,11 +533,12 @@
"name": "美剧生词标注",
"description": "根据CEFR等级为英语影视剧标注高级词汇。",
"labels": "英语",
"version": "1.1.3",
"version": "1.1.4",
"icon": "LexiAnnot.png",
"author": "wumode",
"level": 1,
"history": {
"v1.1.4": "优化字幕选择决策",
"v1.1.3": "适配 Pydantic V2 (主程序版本需高于 2.8.1-1)",
"v1.1.2": "使用子进程避免 spaCy 模型常驻内存",
"v1.1.1": "添加任务页面; 改进 spaCy 模型加载逻辑",

View File

@@ -13,7 +13,7 @@ from app.core.config import settings
from app.core.event import eventmanager, Event
from app.log import logger
from app.schemas.types import EventType, NotificationType
from app.scheduler import Scheduler
from app.core.config import global_vars
from .api import ClashRuleProviderApi, apis
from .base import _ClashRuleProviderBase
@@ -92,11 +92,7 @@ class ClashRuleProvider(_ClashRuleProviderBase):
self.state.ruleset_rules_manager.clear()
if ClashRuleProvider.event_loop is None:
try:
loop = asyncio.get_running_loop()
except RuntimeError:
loop = Scheduler().loop
ClashRuleProvider.event_loop = loop
ClashRuleProvider.event_loop = global_vars.loop
self.scheduler = AsyncIOScheduler(timezone=settings.TZ, event_loop=ClashRuleProvider.event_loop)
self.services = ClashRuleProviderService(self.__class__.__name__, self.config, self.state, self.store,
self.scheduler)

View File

@@ -83,7 +83,7 @@ class LexiAnnot(_PluginBase):
# 插件图标
plugin_icon = "LexiAnnot.png"
# 插件版本
plugin_version = "1.1.3"
plugin_version = "1.1.4"
# 插件作者
plugin_author = "wumode"
# 作者主页
@@ -109,7 +109,7 @@ class LexiAnnot(_PluginBase):
_context_window: int = 0
_max_retries: int = 0
_request_interval: int = 0
_ffmpeg_path = ''
_ffmpeg_path: str = 'ffmpeg'
_english_only = False
_when_file_trans = False
_model_temperature = ''
@@ -154,7 +154,7 @@ class LexiAnnot(_PluginBase):
self._context_window = int(config.get("context_window") or 10)
self._max_retries = int(config.get("max_retries") or 3)
self._request_interval = int(config.get("request_interval") or 3)
self._ffmpeg_path = config.get("ffmpeg_path")
self._ffmpeg_path = config.get("ffmpeg_path") or 'ffmpeg'
self._english_only = config.get("english_only")
self._when_file_trans = config.get("when_file_trans")
self._model_temperature = config.get("model_temperature") or '0.3'
@@ -975,31 +975,23 @@ class LexiAnnot(_PluginBase):
},
'content': [
{
'component': 'VRow',
'component': 'VCol',
'props': {
'class': 'd-none d-sm-block',
'cols': 12,
},
'content': [
{
'component': 'VCol',
'component': 'VDataTableVirtual',
'props': {
'cols': 12,
},
'content': [
{
'component': 'VDataTableVirtual',
'props': {
'class': 'text-sm',
'headers': headers,
'items': items,
'height': '30rem',
'density': 'compact',
'fixed-header': True,
'hide-no-data': True,
'hover': True
}
}
]
'class': 'text-sm',
'headers': headers,
'items': items,
'height': '30rem',
'density': 'compact',
'fixed-header': True,
'hide-no-data': True,
'hover': True
}
}
]
}
@@ -1213,7 +1205,11 @@ class LexiAnnot(_PluginBase):
embedded_subtitles = LexiAnnot._extract_subtitles_by_lang(path, eng_mark, ffmpeg_path)
if not embedded_subtitles:
return TaskStatus.CANCELED
embedded_subtitles = sorted(embedded_subtitles, key=lambda track: 'SDH' in track['title'])
# order factor = 0, if 'SDH' in track['title']
# order factor = track['duration'], otherwise
embedded_subtitles = sorted(embedded_subtitles,
key=lambda track: track['duration']*(1-int('SDH' in track['title'])),
reverse=True)
ret_message = ''
if embedded_subtitles:
logger.info(f'提取到 {len(embedded_subtitles)} 条英语文本字幕')
@@ -1705,7 +1701,8 @@ class LexiAnnot(_PluginBase):
return None
@staticmethod
def _extract_subtitles_by_lang(video_path: str, lang: str | list = 'en', ffmpeg: str = 'ffmpeg') -> Optional[List[Dict]]:
def _extract_subtitles_by_lang(video_path: str, lang: str | list = 'en', ffmpeg: str = 'ffmpeg'
) -> Optional[List[Dict]]:
"""
提取视频文件中的内嵌英文字幕,使用 MediaInfo 查找字幕流。
"""
@@ -1720,12 +1717,22 @@ class LexiAnnot(_PluginBase):
try:
media_info: pymediainfo.MediaInfo = pymediainfo.MediaInfo.parse(video_path)
for track in media_info.tracks:
if track.track_type == 'Text' and check_lang(track_lang=track.language) and track.codec_id in supported_codec:
if (track.track_type == 'Text' and check_lang(track_lang=track.language)
and track.codec_id in supported_codec):
subtitle_stream_index = track.stream_identifier # MediaInfo 的 stream_id 从 1 开始ffmpeg 从 0 开始
subtitle = LexiAnnot.__extract_subtitle(video_path, subtitle_stream_index, ffmpeg)
# Normalise the track duration to an int so it can serve as a sort factor
# when choosing between subtitle tracks. Missing or unparseable values
# fall back to 0.
# NOTE(review): the duration is assumed to arrive as an int, a float or a
# numeric string depending on how MediaInfo reports it -- confirm against
# pymediainfo. Previously a float value fell through to 0.
raw_duration = getattr(track, 'duration', None)
if isinstance(raw_duration, int):
    duration = raw_duration
else:
    try:
        # float() accepts both real floats and numeric strings such as
        # "5400000.000"; int() then truncates the fractional part.
        duration = int(float(raw_duration))
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and infinities all end up here.
        duration = 0
if subtitle:
subtitles.append({'title': track.title or '', 'subtitle': subtitle, 'codec_id': track.codec_id,
'stream_id': subtitle_stream_index})
'stream_id': subtitle_stream_index, 'duration': duration})
if subtitles:
return subtitles
else:
@@ -1761,7 +1768,7 @@ class LexiAnnot(_PluginBase):
)
if not response.success:
logger.warning(f"Error in subprocess response: {response.message}")
logger.warning(f"Error in response: {response.message}")
return tasks.tasks
self._total_token_count += response.total_token_count
@@ -1918,7 +1925,7 @@ Only complete the `Chinese` field. Do not include pinyin, explanations, or any a
)
i = 0
dialog_trans_instruction = '''You are an expert translator. You will be given a list of dialogue translation tasks in JSON format. For each entry, provide the most appropriate translation in Simplified Chinese based on the context.
Only complete the `Chinese` field. Do not include pinyin, explanations, or any additional information.'''
Only complete the `Chinese` field. Do not include pinyin, explanations, or any additional information.'''
while i < len(translation_tasks):
if self._shutdown_event.is_set():
return lines_to_process
@@ -2044,4 +2051,8 @@ Only complete the `Chinese` field. Do not include pinyin, explanations, or any a
if chinese and chinese[-1] in ['', '']:
chinese = chinese[:-1]
main_dialogue[line_data['index']].text = main_dialogue[line_data['index']].text + f"\\N{chinese}"
# 避免 Infuse 显示乱码
unexplainable_line = pysubs2.SSAEvent(start=0, end=0, text=f"{{\\rAnnotation ZH}}{self.plugin_name}{{\\r}}")
ass_file.insert(0, unexplainable_line)
return ass_file

View File

@@ -68,13 +68,14 @@ def translate(
returns: GeminiResponse containing the results
"""
client = genai.Client(api_key=api_key)
messages = []
response_schema = type(translation_tasks)
for attempt in range(1, max_retries + 1):
try:
client = genai.Client(api_key=api_key)
response = client.models.generate_content(
model=gemini_model,
contents=translation_tasks.model_dump_json(),
@@ -100,7 +101,7 @@ def translate(
except Exception as e:
messages.append(f"Attempt {attempt} failed: {str(e)}")
if attempt < max_retries:
time.sleep(retry_delay)
time.sleep(attempt*retry_delay)
return GeminiResponse(
tasks=[],

View File

@@ -1 +1 @@
eyI1MnB0LnNpdGUiOiBbIjUycHQuc2l0ZSJdLCAiYXVkaWVuY2VzLm1lIjogWyJ0LmF1ZGllbmNlcy5tZSIsICJ0cmFja2VyLmNpbmVmaWxlcy5pbmZvIl0sICJidHNjaG9vbC5jbHViIjogWyJwdC5idHNjaG9vbC5jbHViIl0sICJieXIucHQiOiBbInRyYWNrZXIuYnlyLnB0Il0sICJjYXJwdC5uZXQiOiBbInRyYWNrZXIuY2FycHQubmV0Il0sICJjcmFicHQudmlwIjogWyJjcmFicHQudmlwIl0sICJjc3B0LnRvcCI6IFsidHJhY2tlci5jc3B0LnRvcCIsICJ0cmFja2VyLmNzcHQuY2MiLCAidHJhY2tlci5jc3B0LmRhdGUiXSwgImRpc2NmYW4ubmV0IjogWyJkaXNjZmFuLnh5eiJdLCAiZWFzdGdhbWUub3JnIjogWyJwdC5lYXN0Z2FtZS5vcmciXSwgImV0OC5vcmciOiBbImV0OC5vcmciLCAidC5ldDgub3JnIl0sICJnYW1lZ2FtZXB0LmNvbSI6IFsid3d3LmdhbWVnYW1lcHQuY29tIl0sICJoZGFyZWEuY2x1YiI6IFsidHJhY2tlci5oZGFyZWEuY2x1YiJdLCAiaGRkb2xieS5jb20iOiBbInQuaGRkb2xieS5jb20iXSwgImhkZmFucy5vcmciOiBbImhkZmFucy5vcmciXSwgImhka3lsLmluIjogWyJ0cmFja2VyLmhka3lsLmluIl0sICJoZHRpbWUub3JnIjogWyJoZHRpbWUub3JnIl0sICJoaXRwdC5jb20iOiBbImhpdHB0LmNvbSJdLCAiaHVkYnQuaHVzdC5lZHUuY24iOiBbImh1ZGJ0Lmh1c3QuZWR1LmNuIl0sICJpY2MyMDIyLmNvbSI6IFsidHJhY2tlci5pY2MyMDIyLnh5eiJdLCAiaWxvbGljb24uY29tIjogWyJ0cmFja2VyLmlsb2xpY29uLmNjIl0sICJrZWVwZnJkcy5jb20iOiBbInRyYWNrZXIua2VlcGZyZHMuY29tIl0sICJtLXRlYW0uY2MiOiBbInRyYWNrZXIubS10ZWFtLmNjIiwgInRyYWNrZXIubS10ZWFtLmlvIl0sICJtb25pa2FkZXNpZ24udWsiOiBbInRyYWNrZXIubW9uaWthZGVzaWduLnVrIiwgImRhaWtpcmFpLm1vbmlrYWRlc2lnbi51ayIsICJhbmltZS1uby1pbmRleC5jb20iXSwgIm5pY2VwdC5uZXQiOiBbInd3dy5uaWNlcHQubmV0Il0sICJva3B0Lm5ldCI6IFsid3d3Lm9rcHQubmV0Il0sICJwdGhvbWUubmV0IjogWyJwdGhvbWUubmV0Il0sICJwdGxncy5vcmciOiBbInB0bC5ncyIsICJyZWxheTAxLnB0bC5ncyJdLCAicHRzYmFvLmNsdWIiOiBbInB0c2Jhby5jbHViIl0sICJwdHRpbWUub3JnIjogWyJ3d3cucHR0aW1lLm9yZyJdLCAicHR6b25lLnh5eiI6IFsicHR6b25lLnh5eiJdLCAicWluZ3dhcHQuY29tIjogWyJ0cmFja2VyLnFpbmd3YS5wcm8iLCAidHJhY2tlci5xaW5nd2FwdC5jb20iXSwgInJhaW5nZmgudG9wIjogWyJyYWluZ2ZoLnRvcCJdLCAicm91c2kuemlwIjogWyJoaXRwdC5jb20iXSwgInNwcmluZ3N1bmRheS5uZXQiOiBbIm9uNi5zcHJpbmdzdW5kYXkubmV0IiwgIm9uLnNwcmluZ3N1bmRheS5uZXQiXSwgInRqdXB0Lm9yZyI6IFsidHJhY2tlci1wdWJsaWMudGp1cHQub3JnIl0sICJ0b3RoZWdsb3J5LmltIjogWyJ0cmFja2VyLnRvdGhlZ2xvcnkuaW0iXSwgInUyLmRtaHkub3JnIjogWyJkYXlk
cmVhbS5kbWh5LmJlc3QiXSwgInhpbmd5dW5nZS50b3AiOiBbInRyYWNrZXIueGluZ3l1bmdlLnRvcCIsICJ0cmFja2VyLnhpbmd5dW5nZS5zYnMiXSwgInptcHQuY2MiOiBbInptcHQuY2MiXSwgImhoYW5jbHViLnRvcCI6IFsidHJhY2tlci5oaGFuY2x1Yi50b3AiXSwgImhkY2l0eS5jaXR5IjogWyJzeW5jLmxlbml0ZXIub3JnIl19
eyI1MnB0LnNpdGUiOiBbIjUycHQuc2l0ZSJdLCAiYXVkaWVuY2VzLm1lIjogWyJ0LmF1ZGllbmNlcy5tZSIsICJ0cmFja2VyLmNpbmVmaWxlcy5pbmZvIl0sICJidHNjaG9vbC5jbHViIjogWyJwdC5idHNjaG9vbC5jbHViIl0sICJieXIucHQiOiBbInRyYWNrZXIuYnlyLnB0Il0sICJjYXJwdC5uZXQiOiBbInRyYWNrZXIuY2FycHQubmV0Il0sICJjcmFicHQudmlwIjogWyJjcmFicHQudmlwIl0sICJjc3B0LnRvcCI6IFsidHJhY2tlci5jc3B0LnRvcCIsICJ0cmFja2VyLmNzcHQuY2MiLCAidHJhY2tlci5jc3B0LmRhdGUiXSwgImRpc2NmYW4ubmV0IjogWyJkaXNjZmFuLnh5eiJdLCAiZWFzdGdhbWUub3JnIjogWyJwdC5lYXN0Z2FtZS5vcmciXSwgImV0OC5vcmciOiBbImV0OC5vcmciLCAidC5ldDgub3JnIl0sICJnYW1lZ2FtZXB0LmNvbSI6IFsid3d3LmdhbWVnYW1lcHQuY29tIl0sICJoZGFyZWEuY2x1YiI6IFsidHJhY2tlci5oZGFyZWEuY2x1YiJdLCAiaGRkb2xieS5jb20iOiBbInQuaGRkb2xieS5jb20iXSwgImhkZmFucy5vcmciOiBbImhkZmFucy5vcmciXSwgImhka3lsLmluIjogWyJ0cmFja2VyLmhka3lsLmluIl0sICJoZHRpbWUub3JnIjogWyJoZHRpbWUub3JnIl0sICJoaXRwdC5jb20iOiBbImhpdHB0LmNvbSJdLCAiaHVkYnQuaHVzdC5lZHUuY24iOiBbImh1ZGJ0Lmh1c3QuZWR1LmNuIl0sICJpY2MyMDIyLmNvbSI6IFsidHJhY2tlci5pY2MyMDIyLnh5eiJdLCAiaWxvbGljb24uY29tIjogWyJ0cmFja2VyLmlsb2xpY29uLmNjIl0sICJrZWVwZnJkcy5jb20iOiBbInRyYWNrZXIua2VlcGZyZHMuY29tIl0sICJtLXRlYW0uY2MiOiBbInRyYWNrZXIubS10ZWFtLmNjIiwgInRyYWNrZXIubS10ZWFtLmlvIl0sICJtb25pa2FkZXNpZ24udWsiOiBbInRyYWNrZXIubW9uaWthZGVzaWduLnVrIiwgImRhaWtpcmFpLm1vbmlrYWRlc2lnbi51ayIsICJhbmltZS1uby1pbmRleC5jb20iXSwgIm5pY2VwdC5uZXQiOiBbInd3dy5uaWNlcHQubmV0Il0sICJva3B0Lm5ldCI6IFsid3d3Lm9rcHQubmV0Il0sICJwdGhvbWUubmV0IjogWyJwdGhvbWUubmV0Il0sICJwdGxncy5vcmciOiBbInB0bC5ncyIsICJyZWxheTAxLnB0bC5ncyJdLCAicHRzYmFvLmNsdWIiOiBbInB0c2Jhby5jbHViIl0sICJwdHRpbWUub3JnIjogWyJ3d3cucHR0aW1lLm9yZyJdLCAicHR6b25lLnh5eiI6IFsicHR6b25lLnh5eiJdLCAicWluZ3dhcHQuY29tIjogWyJ0cmFja2VyLnFpbmd3YS5wcm8iLCAidHJhY2tlci5xaW5nd2FwdC5jb20iLCAidHJhY2tlci5xaW5nd2FwdC5vcmciXSwgInJhaW5nZmgudG9wIjogWyJyYWluZ2ZoLnRvcCJdLCAicm91c2kuemlwIjogWyJoaXRwdC5jb20iXSwgInNwcmluZ3N1bmRheS5uZXQiOiBbIm9uNi5zcHJpbmdzdW5kYXkubmV0IiwgIm9uLnNwcmluZ3N1bmRheS5uZXQiXSwgInRqdXB0Lm9yZyI6IFsidHJhY2tlci1wdWJsaWMudGp1cHQub3JnIl0sICJ0b3RoZWdsb3J5LmltIjogWyJ0cmFja2VyLnRvdGhlZ2xvcnkuaW0i
XSwgInUyLmRtaHkub3JnIjogWyJkYXlkcmVhbS5kbWh5LmJlc3QiXSwgInhpbmd5dW5nZS50b3AiOiBbInRyYWNrZXIueGluZ3l1bmdlLnRvcCIsICJ0cmFja2VyLnhpbmd5dW5nZS5zYnMiXSwgInptcHQuY2MiOiBbInptcHQuY2MiXSwgImhoYW5jbHViLnRvcCI6IFsidHJhY2tlci5oaGFuY2x1Yi50b3AiXSwgImhkY2l0eS5jaXR5IjogWyJzeW5jLmxlbml0ZXIub3JnIl19