fix(LexiAnnot): 避免潜在的数据校验错误

2026-03-27 10:05:57 +00:00 · 2026-01-16 22:30:21 +08:00
parent 0ac725383e
commit 1f80e3b078
4 changed files with 15 additions and 16 deletions
--- a/package.v2.json
+++ b/package.v2.json
@@ -554,11 +554,12 @@
    "name": "美剧生词标注",
    "description": "根据CEFR等级，为英语影视剧标注高级词汇。",
    "labels": "英语",
-    "version": "1.2.3",
+    "version": "1.2.4",
    "icon": "LexiAnnot.png",
    "author": "wumode",
    "level": 1,
    "history": {
+      "v1.2.4": "增强数据校验",
      "v1.2.3": "优化提示词",
      "v1.2.1": "改进字幕样式获取方法",
      "v1.2.0": "引入大模型候选词决策和词义丰富处理链; 支持读取系统智能体配置; 添加智能体工具; 优化通知样式; 改进 UI",
--- a/plugins.v2/lexiannot/__init__.py
+++ b/plugins.v2/lexiannot/__init__.py
@@ -60,7 +60,7 @@ class LexiAnnot(_PluginBase):
    # 插件图标
    plugin_icon = "LexiAnnot.png"
    # 插件版本
-    plugin_version = "1.2.3"
+    plugin_version = "1.2.4"
    # 插件作者
    plugin_author = "wumode"
    # 作者主页
--- a/plugins.v2/lexiannot/pipeline.py
+++ b/plugins.v2/lexiannot/pipeline.py
@@ -509,10 +509,6 @@ Your goal is two-fold:
    *   **Do NOT include** simple high-frequency words, common fillers ('gonna', 'gotta'), onomatopoeia, or basic swear words.

 -------------------------
-You MUST return output strictly matching the provided Pydantic schema. 
-Return ONLY valid JSON.
-
-**Here are the output format instructions you MUST follow strictly:**
 {format_instructions}
 """,
            ),
@@ -556,10 +552,6 @@ For each word (identified by `WORD_ID`), provide:
 **Your judgment should be based strictly on the provided subtitle context. DO NOT fabricate context or forced explanation.**

 -------------------------
-You MUST return output strictly matching the provided Pydantic schema.
-Return ONLY valid JSON. 
-
-**Here are the output format instructions you MUST follow strictly:**
 {format_instructions}
 """,
                ),
--- a/plugins.v2/lexiannot/schemas.py
+++ b/plugins.v2/lexiannot/schemas.py
@@ -1,10 +1,10 @@
 import re
 import uuid
 from collections import Counter
-from enum import Enum
+from enum import Enum, StrEnum
 from typing import Literal, Generator, Iterator

-from pydantic import BaseModel, Field, RootModel, model_validator
+from pydantic import BaseModel, Field, RootModel, model_validator, field_validator

 from app.utils.singleton import Singleton

@@ -12,9 +12,8 @@ from app.utils.singleton import Singleton
 Cefr = Literal["C2", "C1", "B2", "B1", "A2", "A1"]


-class UniversalPos(str, Enum):
+class UniversalPos(StrEnum):
    """Universal Part-of-Speech tags"""
-
    ADJ = "ADJ"  # Adjective
    ADV = "ADV"  # Adverb
    INTJ = "INTJ"  # Interjection
@@ -34,9 +33,8 @@ class UniversalPos(str, Enum):
    X = "X"  # Other/unknown


-class LexicalFeatures(str, Enum):
+class LexicalFeatures(StrEnum):
    """Lexical features for words."""
-
    FORMAL = "formal"
    INFORMAL = "informal"
    SLANG = "slang"
@@ -333,6 +331,14 @@ class LlmWordEnrichment(BaseModel):
    usage_context: str | None = Field(default=None, description="Usage or Cultural Context")
    lexical_features: list[LexicalFeatures] = Field(default_factory=list, description="Lexical features")

+    @field_validator("lexical_features", mode="before")
+    @classmethod
+    def filter_invalid_lexical_features(cls, v):
+        if isinstance(v, list):
+            valid_values = {f.value for f in LexicalFeatures}
+            return [item for item in v if item in valid_values]
+        return v
+

 class LlmEnrichmentResult(BaseModel):
    enriched_words: list[LlmWordEnrichment] = Field(default_factory=list, description="List of enriched word data")