mirror of
https://github.com/d0zingcat/MoviePilot-Plugins.git
synced 2026-05-13 15:09:12 +00:00
add: LexiAnnot
This commit is contained in:
BIN
icons/LexiAnnot.png
Normal file
BIN
icons/LexiAnnot.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 52 KiB |
@@ -450,5 +450,17 @@
|
||||
"history": {
|
||||
"v0.1.0": "新增ClashRuleProvider"
|
||||
}
|
||||
},
|
||||
"LexiAnnot": {
|
||||
"name": "美剧生词标注",
|
||||
"description": "根据CEFR等级,为英语影视剧标注高级词汇。",
|
||||
"labels": "英语",
|
||||
"version": "1.0",
|
||||
"icon": "LexiAnnot.png",
|
||||
"author": "wumode",
|
||||
"level": 1,
|
||||
"history": {
|
||||
"v1.0": "新增LexiAnnot"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
54
plugins.v2/lexiannot/README.md
Normal file
54
plugins.v2/lexiannot/README.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# 美剧生词标注
|
||||
|
||||
根据CEFR等级,为英语影视剧标注高级词汇。
|
||||
|
||||
在影视剧入库后,LexiAnnot会读取媒体文件的MediaInfo和文件列表,如果视频的原始语言为英语并且包含英文文本字幕,LexiAnnot将为其生成包含词汇注释的.ass字幕文件。
|
||||
|
||||

|
||||

|
||||

|
||||
|
||||
# Gemini
|
||||
|
||||
- **[获取APIKEY](https://aistudio.google.com/app/apikey)**
|
||||
- **[速率限制](https://ai.google.dev/gemini-api/docs/rate-limits)**
|
||||
|
||||
**确保可以正常访问下面的域名**
|
||||
|
||||
- googleapis.com
|
||||
- google.dev
|
||||
- aistudio.google.com
|
||||
|
||||
# CEFR
|
||||
|
||||
CEFR全称是Common European Framework of Reference for Languages。
|
||||
|
||||
它是一个国际标准,用于描述语言学习者的语言能力水平。CEFR 将语言能力分为六个级别,并进一步归类为三大使用者类型:
|
||||
|
||||
- **A - 基础使用者 (Basic User)**
|
||||
- **A1** (初学者/Beginner):能够理解并使用日常熟悉的表达和非常基本的短语。
|
||||
- **A2** (初级/Elementary):能够理解基本的表达方式,并以简单的方式进行交流。
|
||||
- **B - 独立使用者 (Independent User)**
|
||||
- **B1** (中级/Intermediate):能够理解熟悉主题的主要观点,可以处理旅行中可能遇到的多数情况,并能就熟悉的话题发表意见和描述。
|
||||
- **B2** (中高级/Upper-Intermediate):能够理解复杂文本的主要思想,并能与母语者进行一定程度的流利、自然的互动,可以就广泛的主题进行清晰、详细的阐述。
|
||||
- **C - 熟练使用者 (Proficient User)**
|
||||
- **C1** (高级/Advanced):能够理解各种较长、要求较高的文本,并能识别隐含意义,表达流利、自然,能灵活有效地使用语言来应对各种目的。
|
||||
- **C2** (精通/Proficient):能够轻松理解几乎所有听到的或读到的内容,能够非常流利、准确、精细地表达自己,即使在复杂的情况下也能区分细微的含义。
|
||||
|
||||
# 计划
|
||||
|
||||
- 双语字幕支持
|
||||
- 考试词汇标注
|
||||
|
||||
# FAQ
|
||||
|
||||
- **为什么需要用到Gemini？**
|
||||
- LexiAnnot使用的词典仅包含约18000个单词，无法覆盖影视剧中海量的俚语、习语、流行语等更广泛的表达形式
|
||||
- **只能处理已有字幕的视频吗?**
|
||||
- 是的,视频需要包含**英文文本字幕**
|
||||
- **为什么无法处理一些包含字幕的视频？**
|
||||
- 目前无法识别基于图片的字幕(通常是特效字幕)
|
||||
|
||||
# 感谢
|
||||
|
||||
- [coca-vocabulary-20000](https://github.com/llt22/coca-vocabulary-20000)
|
||||
1561
plugins.v2/lexiannot/__init__.py
Normal file
1561
plugins.v2/lexiannot/__init__.py
Normal file
File diff suppressed because it is too large
Load Diff
220
plugins.v2/lexiannot/query_gemini.py
Normal file
220
plugins.v2/lexiannot/query_gemini.py
Normal file
@@ -0,0 +1,220 @@
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from typing import List, Dict, Any, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ValidationError
|
||||
|
||||
|
||||
class Context(BaseModel):
    """Dialogue context attached to a vocabulary translation task."""

    # The original subtitle line the vocabulary items were taken from.
    original_text: str
|
||||
|
||||
|
||||
class Vocabulary(BaseModel):
    """A single vocabulary item to translate."""

    # Dictionary (base) form of the word.
    lemma: str
    # Simplified-Chinese translation; empty on input, filled in by the model.
    Chinese: str
|
||||
|
||||
|
||||
class VocabularyTranslationTask(BaseModel):
    """One vocabulary-translation request: a word list plus its dialogue context."""

    # Position of this task within the submitted batch.
    index: int
    # Words to translate for this context.
    vocabulary: List[Vocabulary]
    # The dialogue line the words appear in.
    context: Context
|
||||
|
||||
|
||||
class DialogueTranslationTask(BaseModel):
    """One dialogue-translation request/response pair."""

    # Position of this task within the submitted batch.
    index: int
    # Source dialogue line.
    original_text: str
    # Simplified-Chinese translation; empty on input, filled in by the model.
    Chinese: str
|
||||
|
||||
|
||||
class GeminiResponse(BaseModel):
    """Outcome of a query_gemini() call."""

    # Parsed task objects returned by the model (empty list on failure).
    tasks: List[Union[VocabularyTranslationTask, DialogueTranslationTask]]
    # Total token usage reported by the API (0 on failure).
    total_token_count: int
    # True when the API call succeeded and the response parsed cleanly.
    success: bool
    # Error details when success is False.
    message: str = ""
|
||||
|
||||
|
||||
def validate_input_data(request_data: Dict[str, Any]) -> None:
    """Ensure the request payload has the expected structure.

    Args:
        request_data: Parsed JSON payload from stdin.

    Raises:
        ValueError: If the payload is not a dict, lacks a top-level
            'tasks' or 'params' key, or 'params' is missing one of
            'api_key', 'system_instruction', 'schema'.
    """
    if not isinstance(request_data, dict):
        raise ValueError("Input data must be a dictionary")

    # Top-level keys, checked in a fixed order so error messages are stable.
    for top_key in ("tasks", "params"):
        if top_key not in request_data:
            raise ValueError(f"Missing '{top_key}' in input data")

    params = request_data["params"]
    for name in ("api_key", "system_instruction", "schema"):
        if name not in params:
            raise ValueError(f"Missing required parameter: {name}")
|
||||
|
||||
|
||||
def get_task_schema(schema_name: str) -> Type[BaseModel]:
    """Resolve a schema name to its pydantic task model class.

    Args:
        schema_name: Either 'DialogueTranslationTask' or
            'VocabularyTranslationTask'.

    Raises:
        ValueError: If *schema_name* is not a known task schema.
    """
    try:
        return {
            'DialogueTranslationTask': DialogueTranslationTask,
            'VocabularyTranslationTask': VocabularyTranslationTask,
        }[schema_name]
    except KeyError:
        raise ValueError(f"Unknown schema name: {schema_name}") from None
|
||||
|
||||
|
||||
def query_gemini(
        api_key: str,
        translation_tasks: List[Dict[str, Any]],
        task_schema: Type[Union[VocabularyTranslationTask, DialogueTranslationTask]],
        system_instruction: str,
        gemini_model: str = "gemini-2.0-flash",
        temperature: float = 0.3,
        max_retries: int = 3,
        retry_delay: int = 10
) -> GeminiResponse:
    """
    Query the Gemini API for translation tasks with retry logic.

    The raw task dicts are serialized as JSON for the prompt, and the model is
    constrained (via ``response_schema``) to return a JSON list matching
    *task_schema*. Never raises: all failures are reported through the
    returned ``GeminiResponse``.

    Args:
        api_key: Gemini API key
        translation_tasks: List of translation tasks (plain dicts)
        task_schema: Pydantic model for the task type
        system_instruction: System instruction for the model
        gemini_model: Model name to use
        temperature: Generation temperature
        max_retries: Number of retry attempts
        retry_delay: Delay between retries in seconds

    Returns:
        GeminiResponse containing the results
    """
    # Imported lazily so the module can be loaded without google-genai installed.
    from google import genai
    from google.genai import types
    from google.genai.types import SchemaUnion
    client = genai.Client(api_key=api_key)
    messages = []
    translation_res = []
    total_token_count = 0

    # Validate input tasks before sending to API
    try:
        # NOTE(review): the validated instances are discarded — the raw dicts
        # are serialized below. This only checks the input fits the schema.
        translation_res = [task_schema(**task) for task in translation_tasks]
    except ValidationError as e:
        return GeminiResponse(
            tasks=[],
            total_token_count=0,
            success=False,
            message=f"Input validation failed: {str(e)}"
        )

    for attempt in range(1, max_retries + 1):
        try:
            response = client.models.generate_content(
                model=gemini_model,
                contents=json.dumps(translation_tasks, ensure_ascii=False),
                config=types.GenerateContentConfig(
                    system_instruction=system_instruction,
                    response_mime_type="application/json",
                    # Ask the SDK to parse the reply into a list of task models.
                    response_schema=list[task_schema],
                    temperature=temperature
                ),
            )

            # An empty/unparsable payload is treated as a retryable failure.
            if not response.parsed:
                raise ValueError("Empty response from Gemini API")

            translation_res = response.parsed
            total_token_count = response.usage_metadata.total_token_count
            return GeminiResponse(
                tasks=translation_res,
                total_token_count=total_token_count,
                success=True
            )

        except Exception as e:
            # Record each failure; sleep only between attempts, not after the last.
            messages.append(f"Attempt {attempt} failed: {str(e)}")
            if attempt < max_retries:
                time.sleep(retry_delay)

    # All attempts exhausted: report every accumulated error message.
    return GeminiResponse(
        tasks=[],
        total_token_count=0,
        success=False,
        message="All retry attempts failed. " + "\n".join(messages)
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: read a JSON request from stdin, query Gemini, print JSON.

    Expected stdin payload::

        {
            "tasks": [{
                "index": 0,
                "original_text": "That was eight years ago.",
                "Chinese": ""
            }, {
                "index": 1,
                "original_text": "Much has changed.",
                "Chinese": ""
            }],
            "params": {
                "api_key": "",
                "system_instruction": "...",
                "schema": "DialogueTranslationTask"
            }
        }

    Optional ``params`` keys: ``model``, ``temperature``, ``max_retries``,
    ``retry_delay``. Always prints exactly one JSON object: either
    ``{"success": true, "data": {...}}`` or ``{"success": false, "message": "..."}``.
    """
    try:
        input_text = sys.stdin.read()
        if not input_text:
            raise ValueError("No input provided")

        request_data = json.loads(input_text)
        validate_input_data(request_data)

        # Extract parameters
        tasks = request_data["tasks"]
        params = request_data["params"]

        # Get schema and make API call
        schema = get_task_schema(params["schema"])
        response = query_gemini(
            api_key=params["api_key"],
            translation_tasks=tasks,
            task_schema=schema,
            system_instruction=params["system_instruction"],
            gemini_model=params.get("model", "gemini-2.0-flash"),
            temperature=float(params.get("temperature", 0.3)),
            max_retries=int(params.get("max_retries", 3)),
            # Fix: retry_delay was accepted by query_gemini but never
            # forwarded from params; default matches query_gemini's (10s).
            retry_delay=int(params.get("retry_delay", 10))
        )

        # Prepare output
        if response.success:
            result = {
                "success": True,
                "data": {
                    "tasks": [task.model_dump() for task in response.tasks],
                    "total_token_count": response.total_token_count
                }
            }
        else:
            result = {
                "success": False,
                "message": response.message
            }

        print(json.dumps(result, ensure_ascii=False))

    except json.JSONDecodeError as e:
        error = {
            "success": False,
            "message": f"Invalid JSON input: {str(e)}"
        }
        # ensure_ascii=False keeps error output consistent with the success path.
        print(json.dumps(error, ensure_ascii=False))
    except Exception as e:
        error = {
            "success": False,
            "message": f"Unexpected error: {str(e)}"
        }
        print(json.dumps(error, ensure_ascii=False))


if __name__ == "__main__":
    main()
|
||||
5
plugins.v2/lexiannot/requirements.txt
Normal file
5
plugins.v2/lexiannot/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
pysubs2~=1.8.0
|
||||
thinc==8.3.4
|
||||
spacy==3.8.7
|
||||
langdetect~=1.0.9
|
||||
pymediainfo~=7.0.1
|
||||
Reference in New Issue
Block a user