Files
archived-MoviePilot-Plugins/plugins.v2/lexiannot/schemas.py
2025-12-12 19:15:21 +08:00

362 lines
14 KiB
Python

import re
import uuid
from collections import Counter
from enum import Enum
from typing import Literal, Generator, Iterator
from pydantic import BaseModel, Field, RootModel, model_validator
from app.utils.singleton import Singleton
# CEFR level labels accepted wherever a model field is typed as ``Cefr``.
Cefr = Literal[
    "C2",
    "C1",
    "B2",
    "B1",
    "A2",
    "A1",
]
class UniversalPos(str, Enum):
    """Universal Part-of-Speech tags"""

    # Every member's value equals its name; because the enum also derives
    # from ``str``, members compare equal to the plain tag string.

    # Adjective
    ADJ = "ADJ"
    # Adverb
    ADV = "ADV"
    # Interjection
    INTJ = "INTJ"
    # Noun
    NOUN = "NOUN"
    # Proper noun
    PROPN = "PROPN"
    # Verb
    VERB = "VERB"
    # Adposition (preposition/postposition)
    ADP = "ADP"
    # Auxiliary verb
    AUX = "AUX"
    # Coordinating conjunction
    CCONJ = "CCONJ"
    # Determiner
    DET = "DET"
    # Numeral
    NUM = "NUM"
    # Particle
    PART = "PART"
    # Pronoun
    PRON = "PRON"
    # Subordinating conjunction
    SCONJ = "SCONJ"
    # Punctuation
    PUNCT = "PUNCT"
    # Symbol
    SYM = "SYM"
    # Other/unknown
    X = "X"
class LexicalFeatures(str, Enum):
    """Lexical features for words."""

    # Values are the lower-case form of the member names. The enum also
    # derives from ``str``, so members compare equal to those plain strings.

    # Register / style
    FORMAL = "formal"
    INFORMAL = "informal"
    SLANG = "slang"
    COLLOQUIAL = "colloquial"

    # Currency / origin
    ARCHAIC = "archaic"
    DIALECT = "dialect"
    TECHNICAL = "technical"
    LITERARY = "literary"

    # Word-formation and special categories
    ABBREVIATION = "abbreviation"
    NAME = "name"
    IDIOMATIC = "idiomatic"
    NEOLOGISM = "neologism"
    GIBBERISH = "gibberish"
    COMPOUND = "compound"
class IDGenerator(metaclass=Singleton):
    """Singleton class for generating unique IDs."""

    # Most recently issued ID; 0 means none has been issued since
    # creation or the last reset().
    # NOTE(review): sharing one counter across all IDGenerator() calls relies
    # on the project's Singleton metaclass returning the same instance.
    _counter = 0

    def next_id(self) -> int:
        """Advance the counter by one and return the new value.

        :return: the newly issued ID (1 on the first call after a reset).
        """
        issued = self._counter + 1
        self._counter = issued
        return issued

    def reset(self) -> None:
        """Set the counter back to 0 so the next issued ID is 1 again."""
        self._counter = 0
# Status values stored on Task.status and ProcessResult.status
# (both of which default to PENDING).
class TaskStatus(Enum):
    # Initial state
    PENDING = "pending"

    # In progress
    RUNNING = "running"

    # Remaining states
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELED = "canceled"
    IGNORED = "ignored"
# Per-task options; this is the type of the Task.params field.
class TaskParams(BaseModel):
    # Defaults to True.
    skip_existing: bool = Field(
        default=True,
        description="Whether to skip existing subtitle files",
    )
# Request payload for a task-management operation.
class TasksApiParams(BaseModel):
    # Required; must be exactly one of the three listed operation names.
    operation: Literal["DELETE", "RETRY", "IGNORE"] = Field(
        ...,
        description="Operation to perform on the tasks",
    )

    # Optional task identifier; None when omitted.
    # NOTE(review): how a missing task_id is interpreted is decided by the
    # caller, not by this schema.
    task_id: str | None = Field(
        default=None,
        description="Unique identifier for the task",
    )
# Aggregate counts describing a list of subtitle segments and their
# candidate words. Every field has a zero/empty default.
class SegmentStatistics(BaseModel):
    total_segments: int = Field(
        default=0,
        description="Total number of subtitle segments",
    )
    total_words: int = Field(
        default=0,
        description="Total number of candidate words",
    )

    # Each distribution maps a label to the number of words carrying it.
    cefr_distribution: dict[str, int] = Field(
        default_factory=dict,
        description="Distribution of words by CEFR level",
    )
    pos_distribution: dict[str, int] = Field(
        default_factory=dict,
        description="Distribution of words by Part of Speech",
    )
    exam_distribution: dict[str, int] = Field(
        default_factory=dict,
        description="Distribution of words by Examination",
    )

    def to_string(self) -> str:
        """Render the statistics as a five-line, human-readable summary.

        Each distribution is written as ``label(count)`` entries joined by
        ", " in the dict's iteration order; an empty distribution is shown
        as ``N/A``.

        :return: the formatted multi-line string (no trailing newline).
        """

        def summarize(distribution: dict[str, int]) -> str:
            # An empty dict joins to "", which falls through to the placeholder.
            rendered = ", ".join(
                f"{label}({count})" for label, count in distribution.items()
            )
            return rendered or "N/A"

        cefr_summary = summarize(self.cefr_distribution)
        pos_summary = summarize(self.pos_distribution)
        exam_summary = summarize(self.exam_distribution)
        return (
            f"Total Segments: {self.total_segments}\n"
            f"Total Words: {self.total_words}\n"
            f"CEFR Distribution: {cefr_summary}\n"
            f"POS Distribution: {pos_summary}\n"
            f"Exam Distribution: {exam_summary}"
        )
class ProcessResult(BaseModel):
    """Result of processing a task."""

    # Optional free-text message; None when there is nothing to report.
    message: str | None = Field(
        default=None,
        description="Additional message or error information",
    )

    # Defaults to PENDING.
    status: TaskStatus = Field(
        default=TaskStatus.PENDING,
        description="Current status of the task",
    )

    # Optional statistics; None by default.
    statistics: SegmentStatistics | None = Field(
        default=None,
        description="Statistics of the task",
    )
# One annotation task for a single video file. Only ``video_path`` is
# required; every other field has a default.
class Task(BaseModel):
    video_path: str = Field(
        ...,
        description="Path to the video file",
    )

    # A fresh random UUID4 string is generated when no ID is supplied.
    task_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique identifier for the task",
    )

    # Defaults to PENDING.
    status: TaskStatus = Field(
        default=TaskStatus.PENDING,
        description="Current status of the task",
    )

    # Timestamps are kept as plain strings; no format is enforced here.
    add_time: str | None = Field(
        default=None,
        description="Add time of the task, format %Y-%m-%d %H:%M:%S",
    )
    complete_time: str | None = Field(
        default=None,
        description="Complete time of the task",
    )

    tokens_used: int = Field(
        default=0,
        description="Number of used tokens",
    )
    message: str | None = Field(
        default=None,
        description="Additional message or error information",
    )

    # A default-constructed TaskParams is used when none is supplied.
    params: TaskParams = Field(
        default_factory=TaskParams,
        description="Parameters for the task",
    )
    statistics: SegmentStatistics | None = Field(
        default=None,
        description="Statistics of the task",
    )
# Locates a word inside its context sentence and gives it an identifier.
# The two positions and the context ID are required.
class WordMetadata(BaseModel):
    # NOTE(review): whether end_pos is inclusive or exclusive is not visible
    # in this file; confirm against the code that fills these in.
    start_pos: int = Field(
        ...,
        description="Start position of the word in the context sentence",
    )
    end_pos: int = Field(
        ...,
        description="End position of the word in the context sentence",
    )
    context_id: int = Field(
        ...,
        description="Identifier of the context sentence",
    )

    # When not supplied, the ID is taken from IDGenerator().next_id().
    word_id: int = Field(
        default_factory=lambda: IDGenerator().next_id(),
        description="Identifier of the word in the context",
    )
# A part-of-speech label together with its list of definitions.
class PosDef(BaseModel):
    # Example ``pos`` labels:
    # 'art.', 'v.', 'aux.', 'conj.', 'prep.', 'adv.', 'adj.', 'n.', 'vt.', 'pron.', 'det.', 'vi.', 'int.'
    # 'num.', 'abbr.', 'na.', 'quant.', 'phr.'
    pos: str = Field(
        ...,
        description="Part of speech",
    )
    meanings: list[str] = Field(
        ...,
        description="List of definitions",
    )

    @property
    def plaintext(self) -> str:
        """Return ``"<pos> <meaning>; <meaning>; ..."`` as a single line."""
        joined_meanings = "; ".join(self.meanings)
        return f"{self.pos} {joined_meanings}"
# Minimal description of a word: surface text, lemma and POS tag.
# Base class of Word and the element type of LlmFeedback.llm_identified_words.
class WordBase(BaseModel):
    text: str = Field(
        ...,
        description="The word or phrase",
    )
    lemma: str = Field(
        ...,
        description="Lemma form of the word",
    )

    # Falls back to UniversalPos.X when no tag is given.
    pos: UniversalPos = Field(
        default=UniversalPos.X,
        description="Universal POS tag of the word",
    )
# A candidate word with its position metadata, dictionary data and
# LLM-generated enrichment. Extends WordBase (text, lemma, pos).
class Word(WordBase):
    phonetics: str | None = Field(
        default=None,
        description="Phonetic transcription of the word",
    )

    # Required. The previous ``default_factory=WordMetadata`` could never
    # produce a value because WordMetadata itself has required fields, so an
    # omitted ``meta`` failed inside the factory. Declaring the field as
    # required reports the missing field on Word directly.
    meta: WordMetadata = Field(
        ...,
        description="Additional metadata",
    )

    cefr: Cefr | None = Field(
        default=None,
        description="CEFR level",
    )
    exams: list[str] = Field(
        default_factory=list,
        description="Exams whose vocabulary syllabus include this word",
    )
    pos_defs: list[PosDef] = Field(
        default_factory=list,
        description="Part of speech definitions",
    )

    # Fields filled from LLM output; all empty/None by default.
    llm_translation: str | None = Field(
        default=None,
        description="LLM generated Chinese translation",
    )
    llm_usage_context: str | None = Field(
        default=None,
        description="LLM generated cultural context",
    )
    lexical_features: list[LexicalFeatures] = Field(
        default_factory=list,
        description="Lexical features",
    )
    llm_example_sentences: list[str] = Field(
        default_factory=list,
        description="LLM generated example sentences",
    )

    @property
    def pos_defs_plaintext(self) -> str:
        """Join all part-of-speech definitions into one numbered line.

        Each entry is rendered as ``"<index>. <PosDef.plaintext>"`` and the
        entries are separated by a single space. An empty ``pos_defs`` list
        yields an empty string.

        :return: the combined definitions text.
        """
        # NOTE(review): numbering starts at 0 (plain enumerate). Kept as-is
        # because consumers may depend on the current output; confirm whether
        # 1-based numbering was intended.
        return " ".join(
            f"{index}. {pos_def.plaintext}"
            for index, pos_def in enumerate(self.pos_defs)
        )
# One subtitle cue: its index, timing, text, optional Chinese translation
# and the candidate words found in it. Segments order by ``index``.
class SubtitleSegment(BaseModel):
    index: int = Field(
        ...,
        description="Index of the subtitle segment",
    )
    start_time: int = Field(
        ...,
        description="Start time of the subtitle segment in milliseconds",
    )
    end_time: int = Field(
        ...,
        description="End time of the subtitle segment in milliseconds",
    )
    plaintext: str = Field(
        ...,
        description="Text content of the subtitle segment",
    )
    Chinese: str | None = Field(
        default=None,
        description="Chinese translation of the subtitle segment",
    )
    candidate_words: list[Word] = Field(
        default_factory=list,
        description="List of words worth learning in the segment",
    )

    def words_append(self, word: Word):
        """
        Append a word to this segment's ``candidate_words`` list.
        :param word: the word object to add.
        """
        self.candidate_words.append(word)

    @staticmethod
    def _replace_with_spaces(_text):
        """
        Replace every ``[xxx]`` pattern in the text with spaces of equal length.
        For example, "[Hi]" is replaced by four spaces, so the overall length
        of the text does not change.
        """
        pattern = r"(\[.*?\])"

        def blank_out(match):
            # Group 1 is the whole bracketed span, brackets included.
            return " " * len(match.group(1))

        return re.sub(pattern, blank_out, _text)

    @property
    def clean_text(self) -> str:
        """
        Return the cleaned text: newlines become single spaces, then every
        ``[xxx]`` pattern is replaced with spaces.
        """
        single_line = self.plaintext.replace("\n", " ")
        return SubtitleSegment._replace_with_spaces(single_line)

    def __lt__(self, other: object):
        """Order segments by ``index``; defer to Python for any other type."""
        if isinstance(other, SubtitleSegment):
            return self.index < other.index
        return NotImplemented
# Root model wrapping a list of SubtitleSegment objects. The list is sorted
# (by segment index, via SubtitleSegment.__lt__) after validation.
class SegmentList(RootModel):
    root: list[SubtitleSegment] = Field(
        default_factory=list,
        description="List of subtitle segments",
    )

    @property
    def statistics(self) -> SegmentStatistics:
        """Compute aggregate statistics over all candidate words.

        Words without a CEFR level (or without a POS tag) are counted under
        the label ``"Other"``. A word contributes once to the exam
        distribution for every entry in its ``exams`` list.

        :return: a freshly built SegmentStatistics instance.
        """
        all_words = [word for seg in self.root for word in seg.candidate_words]
        cefr_counts = Counter(word.cefr if word.cefr else "Other" for word in all_words)
        pos_counts = Counter(word.pos.value if word.pos else "Other" for word in all_words)
        exam_counts = Counter(exam for word in all_words for exam in word.exams)
        return SegmentStatistics(
            total_segments=len(self.root),
            total_words=len(all_words),
            cefr_distribution=dict(cefr_counts),
            pos_distribution=dict(pos_counts),
            exam_distribution=dict(exam_counts),
        )

    def context_generator(
        self, context_window: int, extra_len: int = 1
    ) -> Generator[tuple[list[SubtitleSegment], tuple[int, int]], None, None]:
        """
        Yield the segments in consecutive windows, each padded with context.

        The list is cut into chunks of ``context_window`` segments. For every
        chunk, up to ``extra_len`` neighbouring segments are added on each
        side (clamped to the list bounds).

        :param context_window: number of segments per chunk; must be >= 1.
        :param extra_len: number of extra segments to include before and
            after each chunk.
        :yield: a tuple ``(segments, (first_index, last_index))`` where
            ``segments`` is the padded slice and the index pair holds the
            ``index`` attribute of the first and last segment of the
            unpadded chunk.
        :raises ValueError: if ``context_window`` is smaller than 1. Because
            this is a generator, the error surfaces on first iteration.
        """
        # A window of 0 used to fail with ZeroDivisionError and a negative
        # one silently produced nothing; reject both explicitly.
        if context_window < 1:
            raise ValueError("context_window must be a positive integer")

        total_segments = len(self.root)
        for real_start in range(0, total_segments, context_window):
            chunk_stop = min(total_segments, real_start + context_window)
            real_end = chunk_stop - 1
            start_index = max(0, real_start - extra_len)
            end_index = min(total_segments, real_start + context_window + extra_len)
            yield (
                self.root[start_index:end_index],
                (self.root[real_start].index, self.root[real_end].index),
            )

    def sort(self):
        """Sort the segments in place (ascending ``index``)."""
        self.root.sort()

    @model_validator(mode="after")
    def sort_root(self):
        """Validator hook: sort the segments once the model has been validated."""
        self.root.sort()
        return self

    def __iter__(self) -> Iterator[SubtitleSegment]:
        """Iterate directly over the wrapped segments."""
        return iter(self.root)
# Snapshot of per-token attributes; the field names (including trailing
# underscores) follow the attribute names used for tokens.
class SpacyToken(BaseModel):
    # Required string attributes
    lemma_: str = Field(
        ...,
        description="Lemma form of the word (string)",
    )
    pos_: str = Field(
        ...,
        description="POS tag of the word",
    )
    text: str = Field(
        ...,
        description="Text of the word",
    )

    # Boolean flags, False unless provided
    is_stop: bool = Field(
        default=False,
        description="Indicates if the word is a stop word",
    )
    is_punct: bool = Field(
        default=False,
        description="Indicates if the word is punctuation",
    )

    # Required
    ent_iob_: str = Field(
        ...,
        description="Entity IOB",
    )
# A named entity: its text and its label. Both fields are required.
class SpacyNamedEntity(BaseModel):
    text: str = Field(
        ...,
        description="Text of the entity",
    )
    label_: str = Field(
        ...,
        description="Label of the entity",
    )
# Container for the tokens and named entities of one analysed text.
# Both lists default to empty.
class NlpResult(BaseModel):
    tokens: list[SpacyToken] = Field(
        default_factory=list,
        description="List of tokens",
    )
    entities: list[SpacyNamedEntity] = Field(
        default_factory=list,
        description="List of named entities",
    )
# Verdict on a single candidate word, identified by ``word_id``.
# ``should_keep`` and ``word_id`` are required; the rest are optional.
class LlmFeedbackAboutCandidateWord(BaseModel):
    should_keep: bool = Field(
        ...,
        description="Indicates whether to keep the candidate word",
    )
    # reason: str | None = Field(default=None, description="Concise reason for the decision")
    word_id: int = Field(
        ...,
        description="Identifier of the word in the context",
    )

    # NOTE(review): presumably these let the LLM correct the word's text,
    # lemma or POS; confirm against the code consuming this feedback.
    text: str | None = Field(
        default=None,
        description="The vocabulary word or phrase",
    )
    lemma: str | None = Field(
        default=None,
        description="Lemma form of the word",
    )
    pos: UniversalPos | None = Field(
        default=None,
        description="Universal POS tag of the word. Options: ADJ, ADV, INTJ, NOUN, PROPN, "
        "VERB, ADP, AUX, CCONJ, DET, NUM, PART, PRON, SCONJ, PUNCT, SYM, X",
    )
# Feedback payload: verdicts on existing candidate words plus any additional
# words (as WordBase entries). Both lists default to empty.
class LlmFeedback(BaseModel):
    candidate_words_feedback: list[LlmFeedbackAboutCandidateWord] = Field(
        default_factory=list,
        description="Feedback about candidate words.",
    )
    llm_identified_words: list[WordBase] = Field(
        default_factory=list,
        description="List of words identified by the LLM.",
    )
# Enrichment data for one word, keyed by ``word_id`` (the only required field).
class LlmWordEnrichment(BaseModel):
    word_id: int = Field(
        ...,
        description="Identifier of the word in the context",
    )
    translation: str | None = Field(
        default=None,
        description="Chinese translation of the word",
    )
    usage_context: str | None = Field(
        default=None,
        description="Usage or Cultural Context",
    )
    lexical_features: list[LexicalFeatures] = Field(
        default_factory=list,
        description="Lexical features",
    )
# Wrapper holding a list of LlmWordEnrichment entries (empty by default).
class LlmEnrichmentResult(BaseModel):
    enriched_words: list[LlmWordEnrichment] = Field(
        default_factory=list,
        description="List of enriched word data",
    )
# Translation of one subtitle segment, referenced by the segment's index.
# Both fields are required.
class LlmSegmentTranslation(BaseModel):
    index: int = Field(
        ...,
        description="Index of the subtitle segment",
    )
    translation: str = Field(
        ...,
        description="Natural Chinese translation of the segment",
    )
# Wrapper holding a list of LlmSegmentTranslation entries (empty by default).
class LlmTranslationResult(BaseModel):
    translations: list[LlmSegmentTranslation] = Field(
        default_factory=list,
        description="List of segment translations",
    )
# Input schema for the tool that adds a vocabulary-annotating task.
# ``explanation`` and ``video_path`` are required.
class VocabularyAnnotatingToolInput(BaseModel):
    explanation: str = Field(
        ...,
        description="This is a tool for adding a new vocabulary-annotating task to AnnotLexi",
    )
    video_path: str = Field(
        ...,
        description="Path to the video file",
    )

    # Same option and default as TaskParams.skip_existing.
    skip_existing: bool = Field(
        default=True,
        description="Whether to skip existing subtitle files",
    )
# Input schema for the tool that queries the latest annotation tasks.
class QueryAnnotationTasksToolInput(BaseModel):
    # Upper bound on the number of tasks returned; 5 when not specified.
    count: int = Field(
        default=5,
        description="The maximum number of returned annotation tasks",
    )

    # Required.
    explanation: str = Field(
        ...,
        description="This is a tool for querying the latest annotation tasks in AnnotLexi",
    )