add: LexiAnnot

2026-05-13 15:09:12 +00:00 · 2025-06-09 23:59:43 +08:00
parent 8d8c7b88a8
commit 1cb8934288
6 changed files with 1852 additions and 0 deletions
--- a/icons/LexiAnnot.png
+++ b/icons/LexiAnnot.png
--- a/package.v2.json
+++ b/package.v2.json
@@ -450,5 +450,17 @@
    "history": {
      "v0.1.0": "新增ClashRuleProvider"
    }
+  },
+  "LexiAnnot": {
+    "name": "美剧生词标注",
+    "description": "根据CEFR等级，为英语影视剧标注高级词汇。",
+    "labels": "英语",
+    "version": "1.0",
+    "icon": "LexiAnnot.png",
+    "author": "wumode",
+    "level": 1,
+    "history": {
+      "v1.0": "新增LexiAnnot"
+    }
  }
 }
--- a/plugins.v2/lexiannot/README.md
+++ b/plugins.v2/lexiannot/README.md
@@ -0,0 +1,54 @@
+# 美剧生词标注
+
+根据CEFR等级，为英语影视剧标注高级词汇。
+
+在影视剧入库后，LexiAnnot会读取媒体文件的MediaInfo和文件列表，如果视频的原始语言为英语并且包含英文文本字幕，LexiAnnot将为其生成包含词汇注释的.ass字幕文件。
+
+![](https://images2.imgbox.com/d6/b6/kZu6EH2a_o.png)
+![](https://images2.imgbox.com/c8/3a/rEJBWu5v_o.png)
+![](https://images2.imgbox.com/97/b7/d6RXFtwD_o.png)
+
+# Gemini
+
+- **[获取APIKEY](https://aistudio.google.com/app/apikey)**
+- **[速率限制](https://ai.google.dev/gemini-api/docs/rate-limits)**
+
+**确保可以正常访问下面的域名**
+
+- googleapis.com
+- google.dev
+- aistudio.google.com
+
+# CEFR
+
+CEFR全称是Common European Framework of Reference for Languages。
+
+它是一个国际标准，用于描述语言学习者的语言能力水平。CEFR 将语言能力分为六个级别，并进一步归类为三大使用者类型：
+
+- **A - 基础使用者 (Basic User)**
+  - **A1** (初学者/Beginner)：能够理解并使用日常熟悉的表达和非常基本的短语。
+  - **A2** (初级/Elementary)：能够理解基本的表达方式，并以简单的方式进行交流。
+- **B - 独立使用者 (Independent User)**
+  - **B1** (中级/Intermediate)：能够理解熟悉主题的主要观点，可以处理旅行中可能遇到的多数情况，并能就熟悉的话题发表意见和描述。
+  - **B2** (中高级/Upper-Intermediate)：能够理解复杂文本的主要思想，并能与母语者进行一定程度的流利、自然的互动，可以就广泛的主题进行清晰、详细的阐述。
+- **C - 熟练使用者 (Proficient User)**
+  - **C1** (高级/Advanced)：能够理解各种较长、要求较高的文本，并能识别隐含意义，表达流利、自然，能灵活有效地使用语言来应对各种目的。
+  - **C2** (精通/Proficient)：能够轻松理解几乎所有听到的或读到的内容，能够非常流利、准确、精细地表达自己，即使在复杂的情况下也能区分细微的含义。
+
+# 计划
+
+- 双语字幕支持
+- 考试词汇标注
+
+# FAQ
+
+- **为什么需要用到Gemini？**
+  - LexiAnnot使用的词典仅包含约18000个单词，无法覆盖影视剧中海量的俚语、习语、流行语等更广泛的表达形式。
+- **只能处理已有字幕的视频吗？**
+  - 是的，视频需要包含**英文文本字幕**
+- **为什么无法处理一些包含字幕的视频？**
+  - 目前无法识别基于图片的字幕(通常是特效字幕)
+
+# 感谢
+
+- [coca-vocabulary-20000](https://github.com/llt22/coca-vocabulary-20000)
--- a/plugins.v2/lexiannot/init.py
+++ b/plugins.v2/lexiannot/init.py
--- a/plugins.v2/lexiannot/query_gemini.py
+++ b/plugins.v2/lexiannot/query_gemini.py
@@ -0,0 +1,220 @@
+import sys
+import json
+import time
+from typing import List, Dict, Any, Type, Union
+
+from pydantic import BaseModel, ValidationError
+
+
+# Context attached to a VocabularyTranslationTask (its `context` field).
+# `#` comments are used here instead of a docstring on purpose.
+class Context(BaseModel):
+    # Original text the vocabulary is taken from — presumably the subtitle line; confirm with caller.
+    original_text: str
+
+
+# One entry of VocabularyTranslationTask.vocabulary.
+class Vocabulary(BaseModel):
+    # Base (dictionary) form of the word.
+    lemma: str
+    # Presumably the Chinese translation of the lemma filled in by the model — confirm with caller.
+    Chinese: str
+
+
+# Task schema selectable by the name 'VocabularyTranslationTask' in get_task_schema.
+class VocabularyTranslationTask(BaseModel):
+    # Task position/identifier supplied by the caller.
+    index: int
+    # Vocabulary entries belonging to this task.
+    vocabulary: List[Vocabulary]
+    # Text the vocabulary entries appear in.
+    context: Context
+
+
+# Task schema selectable by the name 'DialogueTranslationTask' in get_task_schema.
+class DialogueTranslationTask(BaseModel):
+    # Task position/identifier supplied by the caller.
+    index: int
+    # Dialogue line to translate.
+    original_text: str
+    # Translation slot; the sample request in main() sends it empty for the model to complete.
+    Chinese: str
+
+
+# Result object returned by query_gemini for both success and failure.
+class GeminiResponse(BaseModel):
+    # Parsed tasks from the API; query_gemini returns an empty list on failure.
+    tasks: List[Union[VocabularyTranslationTask, DialogueTranslationTask]]
+    # Token usage reported by the API; query_gemini returns 0 on failure.
+    total_token_count: int
+    # True only when the API call produced a parsed response.
+    success: bool
+    # Failure details (validation error or per-attempt errors); empty on success.
+    message: str = ""
+
+
+def validate_input_data(request_data: Dict[str, Any]) -> None:
+    """Validate the input data structure"""
+    if not isinstance(request_data, dict):
+        raise ValueError("Input data must be a dictionary")
+    if "tasks" not in request_data:
+        raise ValueError("Missing 'tasks' in input data")
+    if "params" not in request_data:
+        raise ValueError("Missing 'params' in input data")
+
+    params = request_data["params"]
+    required_params = ["api_key", "system_instruction", "schema"]
+    for param in required_params:
+        if param not in params:
+            raise ValueError(f"Missing required parameter: {param}")
+
+
+def get_task_schema(schema_name: str) -> Type[BaseModel]:
+    """Get the appropriate schema class based on the schema name"""
+    schema_map = {
+        'DialogueTranslationTask': DialogueTranslationTask,
+        'VocabularyTranslationTask': VocabularyTranslationTask
+    }
+    if schema_name not in schema_map:
+        raise ValueError(f"Unknown schema name: {schema_name}")
+    return schema_map[schema_name]
+
+
+def query_gemini(
+        api_key: str,
+        translation_tasks: List[Dict[str, Any]],
+        task_schema: Type[Union[VocabularyTranslationTask, DialogueTranslationTask]],
+        system_instruction: str,
+        gemini_model: str = "gemini-2.0-flash",
+        temperature: float = 0.3,
+        max_retries: int = 3,
+        retry_delay: int = 10
+) -> GeminiResponse:
+    """
+    Query the Gemini API for translation tasks with retry logic.
+
+    Args:
+        api_key: Gemini API key
+        translation_tasks: List of translation tasks
+        task_schema: Pydantic model for the task type
+        system_instruction: System instruction for the model
+        gemini_model: Model name to use
+        temperature: Generation temperature
+        max_retries: Number of retry attempts
+        retry_delay: Delay between retries in seconds
+
+    Returns:
+        GeminiResponse containing the results
+    """
+    from google import genai
+    from google.genai import types
+    from google.genai.types import SchemaUnion
+    client = genai.Client(api_key=api_key)
+    messages = []
+    translation_res = []
+    total_token_count = 0
+
+    # Validate input tasks before sending to API
+    try:
+        translation_res = [task_schema(**task) for task in translation_tasks]
+    except ValidationError as e:
+        return GeminiResponse(
+            tasks=[],
+            total_token_count=0,
+            success=False,
+            message=f"Input validation failed: {str(e)}"
+        )
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            response = client.models.generate_content(
+                model=gemini_model,
+                contents=json.dumps(translation_tasks, ensure_ascii=False),
+                config=types.GenerateContentConfig(
+                    system_instruction=system_instruction,
+                    response_mime_type="application/json",
+                    response_schema=list[task_schema],
+                    temperature=temperature
+                ),
+            )
+
+            if not response.parsed:
+                raise ValueError("Empty response from Gemini API")
+
+            translation_res = response.parsed
+            total_token_count = response.usage_metadata.total_token_count
+            return GeminiResponse(
+                tasks=translation_res,
+                total_token_count=total_token_count,
+                success=True
+            )
+
+        except Exception as e:
+            messages.append(f"Attempt {attempt} failed: {str(e)}")
+            if attempt < max_retries:
+                time.sleep(retry_delay)
+
+    return GeminiResponse(
+        tasks=[],
+        total_token_count=0,
+        success=False,
+        message="All retry attempts failed. " + "\n".join(messages)
+    )
+
+
+def main():
+    try:
+        # Read and parse input
+        '''{
+        	"tasks": [{
+        		"index": 0,
+        		"original_text": "That was eight years ago.",
+        		"Chinese": ""
+        	}, {
+        		"index": 1,
+        		"original_text": "Much has changed.",
+        		"Chinese": ""
+        	}],
+        	"params": {
+        		"api_key": "",
+        		"system_instruction": "You are an expert translator. You will be given a list of dialogue translation tasks in JSON format. For each entry, provide the most appropriate translation in Simplified Chinese based on the context. \\nOnly complete the `Chinese` field. Do not include pinyin, explanations, or any additional information.",
+        		"schema": "DialogueTranslationTask"
+        	}
+        }'''
+        input_text = sys.stdin.read()
+        if not input_text:
+            raise ValueError("No input provided")
+
+        request_data = json.loads(input_text)
+        validate_input_data(request_data)
+
+        # Extract parameters
+        tasks = request_data["tasks"]
+        params = request_data["params"]
+
+        # Get schema and make API call
+        schema = get_task_schema(params["schema"])
+        response = query_gemini(
+            api_key=params["api_key"],
+            translation_tasks=tasks,
+            task_schema=schema,
+            system_instruction=params["system_instruction"],
+            gemini_model=params.get("model", "gemini-2.0-flash"),
+            temperature=float(params.get("temperature", 0.3)),
+            max_retries=int(params.get("max_retries", 3))
+        )
+
+        # Prepare output
+        if response.success:
+            result = {
+                "success": True,
+                "data": {
+                    "tasks": [task.model_dump() for task in response.tasks],
+                    "total_token_count": response.total_token_count
+                }
+            }
+        else:
+            result = {
+                "success": False,
+                "message": response.message
+            }
+
+        print(json.dumps(result, ensure_ascii=False))
+
+    except json.JSONDecodeError as e:
+        error = {
+            "success": False,
+            "message": f"Invalid JSON input: {str(e)}"
+        }
+        print(json.dumps(error))
+    except Exception as e:
+        error = {
+            "success": False,
+            "message": f"Unexpected error: {str(e)}"
+        }
+        print(json.dumps(error))
+
+
+# Script entry point: main() reads the request from stdin and prints the JSON result.
+if __name__ == "__main__":
+    main()
--- a/plugins.v2/lexiannot/requirements.txt
+++ b/plugins.v2/lexiannot/requirements.txt
@@ -0,0 +1,5 @@
+pysubs2~=1.8.0
+thinc==8.3.4
+spacy==3.8.7
+langdetect~=1.0.9
+pymediainfo~=7.0.1