archived-MoviePilot-Plugins/plugins.v2/lexiannot/schemas.py

import re
import uuid
from collections import Counter
from enum import Enum
from typing import Literal, Generator, Iterator

from pydantic import BaseModel, Field, RootModel, model_validator

from app.utils.singleton import Singleton


# Allowed CEFR proficiency labels, written from the highest level (C2) down to A1.
# Used as the type of ``Word.cefr``; any other string fails validation there.
Cefr = Literal["C2", "C1", "B2", "B1", "A2", "A1"]


# The ``str`` mixin makes every member compare equal to its tag string
# (e.g. ``UniversalPos.NOUN == "NOUN"``), so members can be used wherever a
# plain tag string is expected. The grouping comments below follow the
# Universal Dependencies UPOS classification.
class UniversalPos(str, Enum):
    """Universal Part-of-Speech tags"""

    # Open-class words
    ADJ = "ADJ"  # Adjective
    ADV = "ADV"  # Adverb
    INTJ = "INTJ"  # Interjection
    NOUN = "NOUN"  # Noun
    PROPN = "PROPN"  # Proper noun
    VERB = "VERB"  # Verb
    # Closed-class words
    ADP = "ADP"  # Adposition (preposition/postposition)
    AUX = "AUX"  # Auxiliary verb
    CCONJ = "CCONJ"  # Coordinating conjunction
    DET = "DET"  # Determiner
    NUM = "NUM"  # Numeral
    PART = "PART"  # Particle
    PRON = "PRON"  # Pronoun
    SCONJ = "SCONJ"  # Subordinating conjunction
    # Everything else
    PUNCT = "PUNCT"  # Punctuation
    SYM = "SYM"  # Symbol
    X = "X"  # Other/unknown


# Closed set of labels that can be attached to a word. Values are lowercase
# strings and, thanks to the ``str`` mixin, compare equal to those strings.
# Used as the element type of the ``lexical_features`` lists in this module.
class LexicalFeatures(str, Enum):
    """Lexical features for words."""

    FORMAL = "formal"
    INFORMAL = "informal"
    SLANG = "slang"
    COLLOQUIAL = "colloquial"
    ARCHAIC = "archaic"
    DIALECT = "dialect"
    TECHNICAL = "technical"
    LITERARY = "literary"
    ABBREVIATION = "abbreviation"
    NAME = "name"
    IDIOMATIC = "idiomatic"
    NEOLOGISM = "neologism"
    GIBBERISH = "gibberish"
    COMPOUND = "compound"


class IDGenerator(metaclass=Singleton):
    """Counter that hands out increasing integer IDs.

    The class is built with the project's ``Singleton`` metaclass, which is
    expected to make every ``IDGenerator()`` call return the same instance so
    that all callers share one counter.
    """

    # Class-level starting value; the first call to next_id() shadows it with
    # an instance attribute of the same name.
    _counter = 0

    def next_id(self):
        """Advance the counter by one and return the new value (first ID is 1)."""
        issued = self._counter + 1
        self._counter = issued
        return issued

    def reset(self):
        """Put the counter back to 0 so that the next issued ID is 1 again."""
        self._counter = 0


# Lifecycle states of an annotation task.
# This is a plain ``Enum`` (no ``str`` mixin), so members do NOT compare equal
# to their raw strings; use ``.value`` when a string is needed.
class TaskStatus(Enum):
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELED = "canceled"
    IGNORED = "ignored"


# Per-task options; stored on ``Task.params``.
class TaskParams(BaseModel):
    # Defaults to True, i.e. existing subtitle files are skipped unless the caller opts out.
    skip_existing: bool = Field(default=True, description="Whether to skip existing subtitle files")


# Request payload for the task-management API.
class TasksApiParams(BaseModel):
    # Required; must be exactly one of the three upper-case literals.
    operation: Literal["DELETE", "RETRY", "IGNORE"] = Field(
        ..., description="Operation to perform on the tasks"
    )
    # Optional. NOTE(review): presumably None means "apply to all tasks" -
    # confirm against the API handler, which is not part of this module.
    task_id: str | None = Field(default=None, description="Unique identifier for the task")


# Aggregate counters describing a processed subtitle file. Every field has a
# zero/empty default, so ``SegmentStatistics()`` is a valid "nothing counted" value.
class SegmentStatistics(BaseModel):
    total_segments: int = Field(default=0, description="Total number of subtitle segments")
    total_words: int = Field(default=0, description="Total number of candidate words")
    cefr_distribution: dict[str, int] = Field(
        default_factory=dict, description="Distribution of words by CEFR level"
    )
    pos_distribution: dict[str, int] = Field(
        default_factory=dict, description="Distribution of words by Part of Speech"
    )
    exam_distribution: dict[str, int] = Field(
        default_factory=dict, description="Distribution of words by Examination"
    )

    def to_string(self) -> str:
        """
        Render the statistics as a five-line, human-readable report.

        Each distribution is printed as ``key(count)`` pairs joined by ", " in
        dictionary order; an empty distribution is printed as ``N/A``.

        :return: The report text, lines separated by newlines (no trailing newline).
        """

        def render(distribution: dict[str, int]) -> str:
            # An empty mapping joins to "", which falls through to the placeholder.
            pairs = [f"{label}({count})" for label, count in distribution.items()]
            return ", ".join(pairs) or "N/A"

        cefr_text = render(self.cefr_distribution)
        pos_text = render(self.pos_distribution)
        exam_text = render(self.exam_distribution)
        return (
            f"Total Segments: {self.total_segments}\n"
            f"Total Words: {self.total_words}\n"
            f"CEFR Distribution: {cefr_text}\n"
            f"POS Distribution: {pos_text}\n"
            f"Exam Distribution: {exam_text}"
        )


class ProcessResult(BaseModel):
    """Result of processing a task."""

    # All fields are optional: a bare ProcessResult() is PENDING with no
    # message and no statistics.
    message: str | None = Field(default=None, description="Additional message or error information")
    status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status of the task")
    statistics: SegmentStatistics | None = Field(default=None, description="Statistics of the task")


# One annotation job for a single video file. Only ``video_path`` is required;
# every other field has a default.
class Task(BaseModel):
    video_path: str = Field(..., description="Path to the video file")
    # A fresh random UUID4 string is generated for each Task that does not supply one.
    task_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique identifier for the task",
    )
    status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status of the task")
    # Timestamps are stored as plain strings, not datetime objects; nothing
    # in this model validates their format.
    add_time: str | None = Field(default=None, description="Add time of the task, format %Y-%m-%d %H:%M:%S")
    complete_time: str | None = Field(default=None, description="Complete time of the task")
    tokens_used: int = Field(default=0, description="Number of used tokens")
    message: str | None = Field(default=None, description="Additional message or error information")
    # default_factory gives each Task its own TaskParams instance.
    params: TaskParams = Field(default_factory=TaskParams, description="Parameters for the task")
    statistics: SegmentStatistics | None = Field(default=None, description="Statistics of the task")


# Location of a word inside its context sentence, plus a numeric word ID.
# start_pos, end_pos and context_id are required, so this model cannot be
# constructed without arguments.
class WordMetadata(BaseModel):
    # NOTE(review): whether end_pos is inclusive or exclusive is not visible
    # in this module - confirm against the code that fills it in.
    start_pos: int = Field(..., description="Start position of the word in the context sentence")
    end_pos: int = Field(..., description="End position of the word in the context sentence")
    context_id: int = Field(..., description="Identifier of the context sentence")
    # When omitted, the ID is taken from the shared IDGenerator counter, so
    # creating a WordMetadata without word_id advances that counter.
    word_id: int = Field(
        default_factory=lambda: IDGenerator().next_id(),
        description="Identifier of the word in the context",
    )


# The definitions of a word under one part of speech.
class PosDef(BaseModel):
    # ``pos`` is a free-form abbreviation rather than a UniversalPos member. Values include:
    # 'art.', 'v.', 'aux.', 'conj.', 'prep.', 'adv.', 'adj.', 'n.', 'vt.', 'pron.', 'det.', 'vi.', 'int.'
    # 'num.', 'abbr.', 'na.', 'quant.', 'phr.'
    pos: str = Field(..., description="Part of speech")
    meanings: list[str] = Field(..., description="List of definitions")

    @property
    def plaintext(self):
        """Return the abbreviation, a space, then the meanings joined by '; '."""
        joined_meanings = "; ".join(self.meanings)
        return f"{self.pos} {joined_meanings}"


# Minimal description of a word: surface text, lemma and POS tag.
# Base class of ``Word`` and element type of ``LlmFeedback.llm_identified_words``.
class WordBase(BaseModel):
    text: str = Field(..., description="The word or phrase")
    lemma: str = Field(..., description="Lemma form of the word")
    # Falls back to X (other/unknown) when no tag is supplied.
    pos: UniversalPos = Field(default=UniversalPos.X, description="Universal POS tag of the word")


# A candidate vocabulary word with its dictionary data, position metadata and
# LLM-generated enrichment. Extends WordBase (text, lemma, pos).
class Word(WordBase):
    phonetics: str | None = Field(default=None, description="Phonetic transcription of the word")
    # Required. WordMetadata has mandatory fields (start_pos, end_pos,
    # context_id), so a no-argument ``default_factory=WordMetadata`` could never
    # succeed: omitting ``meta`` always failed, but with a confusing error
    # attributed to WordMetadata. Declaring the field as required keeps the
    # same exception type (ValidationError) and reports the real problem.
    meta: WordMetadata = Field(..., description="Additional metadata")
    cefr: Cefr | None = Field(default=None, description="CEFR level")
    exams: list[str] = Field(
        default_factory=list,
        description="Exams whose vocabulary syllabus include this word",
    )
    pos_defs: list[PosDef] = Field(default_factory=list, description="Part of speech definitions")
    llm_translation: str | None = Field(default=None, description="LLM generated Chinese translation")
    llm_usage_context: str | None = Field(default=None, description="LLM generated cultural context")
    lexical_features: list[LexicalFeatures] = Field(default_factory=list, description="Lexical features")
    llm_example_sentences: list[str] = Field(default_factory=list, description="LLM generated example sentences")

    @property
    def pos_defs_plaintext(self) -> str:
        """
        Render all part-of-speech definitions on a single line.

        Each entry is formatted as ``"<n>. <pos> <meaning>; <meaning>"`` and the
        entries are separated by one space. An empty ``pos_defs`` list yields
        an empty string.

        :return: The joined definitions text.
        """
        # NOTE(review): numbering starts at 0 (enumerate's default). Kept as-is
        # because consumers may rely on the current output - confirm whether
        # 1-based numbering was intended.
        return " ".join(
            f"{index}. {pos_def.plaintext}"
            for index, pos_def in enumerate(self.pos_defs)
        )


# One subtitle cue: timing, text, optional translation and the candidate
# words found in it. Segments order by ``index`` (see ``__lt__``).
class SubtitleSegment(BaseModel):
    index: int = Field(..., description="Index of the subtitle segment")
    start_time: int = Field(..., description="Start time of the subtitle segment in milliseconds")
    end_time: int = Field(..., description="End time of the subtitle segment in milliseconds")
    plaintext: str = Field(..., description="Text content of the subtitle segment")
    Chinese: str | None = Field(default=None, description="Chinese translation of the subtitle segment")
    candidate_words: list[Word] = Field(
        default_factory=list,
        description="List of words worth learning in the segment",
    )

    def words_append(self, word: Word):
        """
        Append a word to this segment's ``candidate_words`` list.

        :param word: The word object to add.
        """
        self.candidate_words.append(word)

    @staticmethod
    def _replace_with_spaces(_text):
        """
        Blank out every ``[xxx]`` span in the text with spaces of equal length.

        Character offsets of the remaining text are therefore unchanged.
        Example: "[Hi]" becomes "    " (4 spaces).
        The match is non-greedy and does not cross newline characters.
        """

        def blank_out(hit):
            # Group 1 spans the whole bracketed match, brackets included.
            return " " * len(hit.group(1))

        return re.sub(r"(\[.*?\])", blank_out, _text)

    @property
    def clean_text(self) -> str:
        """
        Return ``plaintext`` with newlines turned into single spaces and every
        ``[xxx]`` span replaced by spaces of the same length.
        """
        single_line = self.plaintext.replace("\n", " ")
        return SubtitleSegment._replace_with_spaces(single_line)

    def __lt__(self, other: object):
        """Order segments by ``index``; defer to the other operand for foreign types."""
        if isinstance(other, SubtitleSegment):
            return self.index < other.index
        return NotImplemented


# Root model wrapping a list of subtitle segments. The list is sorted by
# segment index after validation and can be iterated directly.
class SegmentList(RootModel):
    root: list[SubtitleSegment] = Field(
        default_factory=list, description="List of subtitle segments"
    )

    @property
    def statistics(self) -> SegmentStatistics:
        """
        Count the candidate words of all segments by CEFR level, POS tag and exam.

        Words without a CEFR level (or POS tag) are counted under "Other".
        A word listed for several exams is counted once per exam.

        :return: A freshly built SegmentStatistics object.
        """
        cefr_tally = Counter()
        pos_tally = Counter()
        exam_tally = Counter()
        word_total = 0

        for segment in self.root:
            for word in segment.candidate_words:
                word_total += 1
                cefr_tally[word.cefr if word.cefr else "Other"] += 1
                pos_tally[word.pos.value if word.pos else "Other"] += 1
                exam_tally.update(word.exams)

        return SegmentStatistics(
            total_segments=len(self.root),
            total_words=word_total,
            cefr_distribution=dict(cefr_tally),
            pos_distribution=dict(pos_tally),
            exam_distribution=dict(exam_tally),
        )

    def context_generator(
        self, context_window: int, extra_len: int = 1
    ) -> Generator[tuple[list[SubtitleSegment], tuple[int, int]], None, None]:
        """
        Walk the segments in consecutive windows, each padded with neighbouring context.

        :param context_window: Number of segments in each window (the last one may be shorter).
        :param extra_len: Number of extra segments to include before and after each window.
        :yield: A tuple ``(segments, (first_index, last_index))`` where ``segments``
            is the window plus its padding and the index pair holds the ``index``
            values of the first and last segments of the unpadded window.
        """
        segment_count = len(self.root)
        # Ceiling division: number of windows needed to cover every segment.
        window_count = (segment_count + context_window - 1) // context_window
        for window_no in range(window_count):
            first = window_no * context_window
            last = min(segment_count, first + context_window) - 1
            # Padding is clamped to the list bounds.
            padded_from = max(0, first - extra_len)
            padded_to = min(segment_count, first + context_window + extra_len)
            yield (
                self.root[padded_from:padded_to],
                (self.root[first].index, self.root[last].index),
            )

    def sort(self):
        """Sort the segments in place by their ``index``."""
        self.root.sort()

    @model_validator(mode="after")
    def sort_root(self):
        """Validator hook: keep the list ordered by segment index after validation."""
        self.root.sort()
        return self

    def __iter__(self) -> Iterator[SubtitleSegment]:
        """Iterate over the wrapped segments rather than over model fields."""
        return iter(self.root)


# Plain-data snapshot of one NLP token. The field names (including the trailing
# underscores) follow spaCy's ``Token`` attribute names.
class SpacyToken(BaseModel):
    lemma_: str = Field(..., description="Lemma form of the word (string)")
    # Kept as a raw string; it is not validated against UniversalPos here.
    pos_: str = Field(..., description="POS tag of the word")
    text: str = Field(..., description="Text of the word")
    is_stop: bool = Field(default=False, description="Indicates if the word is a stop word")
    is_punct: bool = Field(default=False, description="Indicates if the word is punctuation")
    ent_iob_: str = Field(..., description="Entity IOB")


# Plain-data snapshot of one named entity: its text and its label string.
class SpacyNamedEntity(BaseModel):
    text: str = Field(..., description="Text of the entity")
    label_: str = Field(..., description="Label of the entity")


# Result of an NLP pass: the tokens and the named entities. Both lists
# default to empty.
class NlpResult(BaseModel):
    tokens: list[SpacyToken] = Field(default_factory=list, description="List of tokens")
    entities: list[SpacyNamedEntity] = Field(default_factory=list, description="List of named entities")


# The LLM's verdict on one candidate word, matched to the word via ``word_id``.
# ``should_keep`` and ``word_id`` are required; text, lemma and pos are optional.
# NOTE(review): the optional fields presumably carry the LLM's corrections to
# the original word - confirm against the code that applies this feedback.
class LlmFeedbackAboutCandidateWord(BaseModel):
    should_keep: bool = Field(..., description="Indicates whether to keep the candidate word")
    # reason: str | None = Field(default=None, description="Concise reason for the decision")
    word_id: int = Field(..., description="Identifier of the word in the context")
    text: str | None = Field(default=None, description="The vocabulary word or phrase")
    lemma: str | None = Field(default=None, description="Lemma form of the word")
    # The description spells out every UniversalPos value.
    pos: UniversalPos | None = Field(
        default=None,
        description="Universal POS tag of the word. Options: ADJ, ADV, INTJ, NOUN, PROPN, "
        "VERB, ADP, AUX, CCONJ, DET, NUM, PART, PRON, SCONJ, PUNCT, SYM, X",
    )


# Complete LLM feedback: verdicts on existing candidate words plus any
# additional words the LLM identified itself. Both lists default to empty.
class LlmFeedback(BaseModel):
    candidate_words_feedback: list[LlmFeedbackAboutCandidateWord] = Field(
        default_factory=list, description="Feedback about candidate words."
    )
    # These entries are WordBase objects (text, lemma, pos), not full Word
    # objects, so they carry no position metadata or word_id.
    llm_identified_words: list[WordBase] = Field(
        default_factory=list, description="List of words identified by the LLM."
    )


# LLM-generated enrichment for one word, matched to the word via ``word_id``.
# Only ``word_id`` is required.
class LlmWordEnrichment(BaseModel):
    word_id: int = Field(..., description="Identifier of the word in the context")
    translation: str | None = Field(default=None, description="Chinese translation of the word")
    usage_context: str | None = Field(default=None, description="Usage or Cultural Context")
    # Each element must be a valid LexicalFeatures value.
    lexical_features: list[LexicalFeatures] = Field(default_factory=list, description="Lexical features")


# Wrapper holding a list of per-word enrichments; the list defaults to empty.
class LlmEnrichmentResult(BaseModel):
    enriched_words: list[LlmWordEnrichment] = Field(default_factory=list, description="List of enriched word data")


# Translation of a single subtitle segment; both fields are required.
class LlmSegmentTranslation(BaseModel):
    # NOTE(review): presumably matches SubtitleSegment.index - confirm against
    # the code that merges translations back into segments.
    index: int = Field(..., description="Index of the subtitle segment")
    translation: str = Field(..., description="Natural Chinese translation of the segment")


# Wrapper holding a list of per-segment translations; the list defaults to empty.
class LlmTranslationResult(BaseModel):
    translations: list[LlmSegmentTranslation] = Field(default_factory=list, description="List of segment translations")


# Input schema of the tool that adds a new annotation task.
# ``explanation`` and ``video_path`` are required.
class VocabularyAnnotatingToolInput(BaseModel):
    # The description text becomes part of the generated schema; do not edit it casually.
    explanation: str = Field(
        ...,
        description="This is a tool for adding a new vocabulary-annotating task to AnnotLexi",
    )
    video_path: str = Field(..., description="Path to the video file")
    # Same name, default and description as TaskParams.skip_existing.
    skip_existing: bool = Field(default=True, description="Whether to skip existing subtitle files")


# Input schema of the tool that lists recent annotation tasks.
class QueryAnnotationTasksToolInput(BaseModel):
    # Upper bound on returned tasks; defaults to 5. No range check is applied
    # here, so zero or negative values pass validation.
    count: int = Field(default=5, description="The maximum number of returned annotation tasks")
    # Required. The description text becomes part of the generated schema.
    explanation: str = Field(..., description="This is a tool for querying the latest annotation tasks in AnnotLexi")