add: LexiAnnot

This commit is contained in:
wumode
2025-06-09 23:59:43 +08:00
parent 8d8c7b88a8
commit 1cb8934288
6 changed files with 1852 additions and 0 deletions

BIN
icons/LexiAnnot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

View File

@@ -450,5 +450,17 @@
"history": {
"v0.1.0": "新增ClashRuleProvider"
}
},
"LexiAnnot": {
"name": "美剧生词标注",
"description": "根据CEFR等级为英语影视剧标注高级词汇。",
"labels": "英语",
"version": "1.0",
"icon": "LexiAnnot.png",
"author": "wumode",
"level": 1,
"history": {
"v1.0": "新增LexiAnnot"
}
}
}

View File

@@ -0,0 +1,54 @@
# 美剧生词标注
根据CEFR等级为英语影视剧标注高级词汇。
在影视剧入库后,LexiAnnot会读取媒体文件的MediaInfo和文件列表;如果视频的原始语言为英语,并且包含英文文本字幕,LexiAnnot将为其生成包含词汇注释的.ass字幕文件。
![](https://images2.imgbox.com/d6/b6/kZu6EH2a_o.png)
![](https://images2.imgbox.com/c8/3a/rEJBWu5v_o.png)
![](https://images2.imgbox.com/97/b7/d6RXFtwD_o.png)
# Gemini
- **[获取APIKEY](https://aistudio.google.com/app/apikey)**
- **[速率限制](https://ai.google.dev/gemini-api/docs/rate-limits)**
**确保可以正常访问下面的域名**
- googleapis.com
- google.dev
- aistudio.google.com
# CEFR
CEFR全称是Common European Framework of Reference for Languages。
它是一个国际标准,用于描述语言学习者的语言能力水平。CEFR 将语言能力分为六个级别,并进一步归类为三大使用者类型:
- **A - 基础使用者 (Basic User)**
- **A1** (初学者/Beginner):能够理解并使用日常熟悉的表达和非常基本的短语。
- **A2** (初级/Elementary):能够理解基本的表达方式,并以简单的方式进行交流。
- **B - 独立使用者 (Independent User)**
- **B1** (中级/Intermediate):能够理解熟悉主题的主要观点,可以处理旅行中可能遇到的多数情况,并能就熟悉的话题发表意见和描述。
- **B2** (中高级/Upper-Intermediate):能够理解复杂文本的主要思想,并能与母语者进行一定程度的流利、自然的互动,可以就广泛的主题进行清晰、详细的阐述。
- **C - 熟练使用者 (Proficient User)**
- **C1** (高级/Advanced):能够理解各种较长、要求较高的文本,并能识别隐含意义,表达流利、自然,能灵活有效地使用语言来应对各种目的。
- **C2** (精通/Proficient):能够轻松理解几乎所有听到的或读到的内容,能够非常流利、准确、精细地表达自己,即使在复杂的情况下也能区分细微的含义。
# 计划
- 双语字幕支持
- 考试词汇标注
# FAQ
- **为什么需要用到Gemini?**
- LexiAnnot使用的词典仅包含约18000个单词,无法覆盖影视剧中海量的俚语、习语、流行语等更广泛的表达形式
- **只能处理已有字幕的视频吗?**
- 是的,视频需要包含**英文文本字幕**
- **为什么无法处理一些包含字幕的视频?**
- 目前无法识别基于图片的字幕(通常是特效字幕)
# 感谢
- [coca-vocabulary-20000](https://github.com/llt22/coca-vocabulary-20000)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,220 @@
import sys
import json
import time
from typing import List, Dict, Any, Type, Union
from pydantic import BaseModel, ValidationError
# Context attached to a vocabulary task (used as VocabularyTranslationTask.context).
# NOTE: documented with comments rather than a docstring on purpose — pydantic
# copies class docstrings into the generated JSON schema, and this model is part
# of the response schema handed to the Gemini API in query_gemini.
class Context(BaseModel):
    # Source text the vocabulary belongs to — presumably the subtitle line; confirm against caller.
    original_text: str
# One vocabulary entry inside a VocabularyTranslationTask.
# Comments (not a docstring) are used so the JSON schema derived from this
# model stays unchanged.
class Vocabulary(BaseModel):
    # Word to be translated; the field name suggests its dictionary (lemma) form — TODO confirm.
    lemma: str
    # Chinese rendering of the entry — presumably filled in by the model, like
    # DialogueTranslationTask.Chinese; verify against the system instruction used by the caller.
    Chinese: str
# Task schema selected by the name 'VocabularyTranslationTask' in get_task_schema:
# a group of vocabulary entries together with the text they come from.
# Comments (not a docstring) are used so the JSON schema derived from this
# model stays unchanged.
class VocabularyTranslationTask(BaseModel):
    # Task identifier supplied by the caller — presumably used to match results to inputs; confirm.
    index: int
    # Vocabulary entries belonging to this task.
    vocabulary: List[Vocabulary]
    # Text in which the vocabulary occurs.
    context: Context
# Task schema selected by the name 'DialogueTranslationTask' in get_task_schema:
# one line of dialogue to translate (see the example payload in main).
# Comments (not a docstring) are used so the JSON schema derived from this
# model stays unchanged.
class DialogueTranslationTask(BaseModel):
    # Task identifier supplied by the caller.
    index: int
    # Dialogue text to translate.
    original_text: str
    # Translation slot; sent as an empty string in the example request in main
    # and completed by the model.
    Chinese: str
# Result envelope returned by query_gemini.
class GeminiResponse(BaseModel):
    # Completed tasks on success; query_gemini returns an empty list on failure.
    tasks: List[Union[VocabularyTranslationTask, DialogueTranslationTask]]
    # Token usage reported for the successful request; 0 on failure.
    total_token_count: int
    # True when the API produced a parsed result, False otherwise.
    success: bool
    # Failure details (validation error or per-attempt errors); left empty on success.
    message: str = ""
def validate_input_data(request_data: Dict[str, Any]) -> None:
    """Validate the structure of a decoded request payload.

    The payload must be a dictionary with a ``tasks`` list and a ``params``
    dictionary; ``params`` must provide ``api_key``, ``system_instruction``
    and ``schema``. Only the structure is checked, not the values.

    Args:
        request_data: The JSON-decoded request.

    Raises:
        ValueError: If the payload or one of its sections has the wrong type,
            or if a required section or parameter is missing.
    """
    if not isinstance(request_data, dict):
        raise ValueError("Input data must be a dictionary")

    for section in ("tasks", "params"):
        if section not in request_data:
            raise ValueError(f"Missing '{section}' in input data")

    if not isinstance(request_data["tasks"], list):
        raise ValueError("'tasks' must be a list")

    params = request_data["params"]
    # Without this check a string would be searched by substring (and could
    # pass), while None would raise TypeError instead of ValueError.
    if not isinstance(params, dict):
        raise ValueError("'params' must be a dictionary")

    for param in ("api_key", "system_instruction", "schema"):
        if param not in params:
            raise ValueError(f"Missing required parameter: {param}")
def get_task_schema(schema_name: str) -> Type[BaseModel]:
    """Resolve a schema name from the request to its pydantic task model.

    Args:
        schema_name: Either 'DialogueTranslationTask' or
            'VocabularyTranslationTask'.

    Returns:
        The model class registered under ``schema_name``.

    Raises:
        ValueError: If ``schema_name`` is not one of the known names.
    """
    known_schemas = {
        'DialogueTranslationTask': DialogueTranslationTask,
        'VocabularyTranslationTask': VocabularyTranslationTask
    }
    schema_cls = known_schemas.get(schema_name)
    if schema_cls is None:
        raise ValueError(f"Unknown schema name: {schema_name}")
    return schema_cls
def query_gemini(
    api_key: str,
    translation_tasks: List[Dict[str, Any]],
    task_schema: Type[Union[VocabularyTranslationTask, DialogueTranslationTask]],
    system_instruction: str,
    gemini_model: str = "gemini-2.0-flash",
    temperature: float = 0.3,
    max_retries: int = 3,
    retry_delay: int = 10
) -> GeminiResponse:
    """Send translation tasks to the Gemini API, retrying on failure.

    The tasks are first validated locally against ``task_schema``; invalid
    input is reported without contacting the API. The request asks for a JSON
    response shaped as ``list[task_schema]``.

    Args:
        api_key: Gemini API key.
        translation_tasks: Tasks to send, as plain dictionaries.
        task_schema: Pydantic model describing a single task.
        system_instruction: System instruction for the model.
        gemini_model: Model name to use.
        temperature: Generation temperature.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        A GeminiResponse. ``success`` is True with the parsed tasks and the
        reported token count when an attempt succeeds; otherwise ``success``
        is False and ``message`` carries the validation error or the error of
        every failed attempt.
    """
    # The SDK is imported at call time, not at module import time.
    from google import genai
    from google.genai import types

    client = genai.Client(api_key=api_key)

    # Validate input tasks before sending to API. TypeError covers entries
    # that are not mappings and therefore cannot be unpacked as keyword args.
    try:
        for task in translation_tasks:
            task_schema(**task)
    except (ValidationError, TypeError) as e:
        return GeminiResponse(
            tasks=[],
            total_token_count=0,
            success=False,
            message=f"Input validation failed: {str(e)}"
        )

    messages = []
    for attempt in range(1, max_retries + 1):
        try:
            response = client.models.generate_content(
                model=gemini_model,
                contents=json.dumps(translation_tasks, ensure_ascii=False),
                config=types.GenerateContentConfig(
                    system_instruction=system_instruction,
                    response_mime_type="application/json",
                    response_schema=list[task_schema],
                    temperature=temperature
                ),
            )
            if not response.parsed:
                raise ValueError("Empty response from Gemini API")

            # Usage metadata and its token count are optional in the SDK;
            # fall back to 0 so a usable response is not discarded (and
            # retried) just because usage was not reported.
            usage = response.usage_metadata
            total_token_count = (usage.total_token_count if usage else None) or 0

            return GeminiResponse(
                tasks=response.parsed,
                total_token_count=total_token_count,
                success=True
            )
        except Exception as e:
            # Any failure (transport, API, parsing) counts as a failed attempt.
            messages.append(f"Attempt {attempt} failed: {str(e)}")
            if attempt < max_retries:
                time.sleep(retry_delay)

    return GeminiResponse(
        tasks=[],
        total_token_count=0,
        success=False,
        message="All retry attempts failed. " + "\n".join(messages)
    )
def main():
    """Read a JSON request from stdin, query Gemini, print a JSON result.

    Example request::

        {
            "tasks": [{
                "index": 0,
                "original_text": "That was eight years ago.",
                "Chinese": ""
            }, {
                "index": 1,
                "original_text": "Much has changed.",
                "Chinese": ""
            }],
            "params": {
                "api_key": "",
                "system_instruction": "You are an expert translator. You will be given a list of dialogue translation tasks in JSON format. For each entry, provide the most appropriate translation in Simplified Chinese based on the context. \\nOnly complete the `Chinese` field. Do not include pinyin, explanations, or any additional information.",
                "schema": "DialogueTranslationTask"
            }
        }

    Optional ``params`` keys: ``model``, ``temperature``, ``max_retries`` and
    ``retry_delay`` (seconds between attempts).

    Exactly one JSON object is written to stdout: on success
    ``{"success": true, "data": {"tasks": [...], "total_token_count": N}}``,
    otherwise ``{"success": false, "message": "..."}``. Errors are reported
    through that object rather than raised.
    """
    try:
        # Read and parse input
        input_text = sys.stdin.read()
        if not input_text:
            raise ValueError("No input provided")
        request_data = json.loads(input_text)
        validate_input_data(request_data)

        # Extract parameters
        tasks = request_data["tasks"]
        params = request_data["params"]

        # Get schema and make API call
        schema = get_task_schema(params["schema"])
        response = query_gemini(
            api_key=params["api_key"],
            translation_tasks=tasks,
            task_schema=schema,
            system_instruction=params["system_instruction"],
            gemini_model=params.get("model", "gemini-2.0-flash"),
            temperature=float(params.get("temperature", 0.3)),
            max_retries=int(params.get("max_retries", 3)),
            # Defaults to query_gemini's own default when not supplied.
            retry_delay=int(params.get("retry_delay", 10))
        )

        # Prepare output
        if response.success:
            result = {
                "success": True,
                "data": {
                    "tasks": [task.model_dump() for task in response.tasks],
                    "total_token_count": response.total_token_count
                }
            }
        else:
            result = {
                "success": False,
                "message": response.message
            }
        print(json.dumps(result, ensure_ascii=False))
    except json.JSONDecodeError as e:
        error = {
            "success": False,
            "message": f"Invalid JSON input: {str(e)}"
        }
        # Error output stays ASCII-escaped so it can always be written,
        # whatever the stdout encoding is.
        print(json.dumps(error))
    except Exception as e:
        # Top-level boundary: every other failure is reported as JSON too.
        error = {
            "success": False,
            "message": f"Unexpected error: {str(e)}"
        }
        print(json.dumps(error))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,5 @@
pysubs2~=1.8.0
thinc==8.3.4
spacy==3.8.7
langdetect~=1.0.9
pymediainfo~=7.0.1