import re
import uuid
from collections import Counter
from enum import Enum, StrEnum
from typing import Literal, Generator, Iterator

from pydantic import BaseModel, Field, RootModel, model_validator, field_validator

from app.utils.singleton import Singleton


Cefr = Literal["C2", "C1", "B2", "B1", "A2", "A1"]


class UniversalPos(StrEnum):
    """Universal Part-of-Speech tags"""
    ADJ = "ADJ"  # Adjective
    ADV = "ADV"  # Adverb
    INTJ = "INTJ"  # Interjection
    NOUN = "NOUN"  # Noun
    PROPN = "PROPN"  # Proper noun
    VERB = "VERB"  # Verb
    ADP = "ADP"  # Adposition (preposition/postposition)
    AUX = "AUX"  # Auxiliary verb
    CCONJ = "CCONJ"  # Coordinating conjunction
    DET = "DET"  # Determiner
    NUM = "NUM"  # Numeral
    PART = "PART"  # Particle
    PRON = "PRON"  # Pronoun
    SCONJ = "SCONJ"  # Subordinating conjunction
    PUNCT = "PUNCT"  # Punctuation
    SYM = "SYM"  # Symbol
    X = "X"  # Other/unknown


class LexicalFeatures(StrEnum):
    """Lexical features for words."""
    FORMAL = "formal"
    INFORMAL = "informal"
    SLANG = "slang"
    COLLOQUIAL = "colloquial"
    ARCHAIC = "archaic"
    DIALECT = "dialect"
    TECHNICAL = "technical"
    LITERARY = "literary"
    ABBREVIATION = "abbreviation"
    NAME = "name"
    IDIOMATIC = "idiomatic"
    NEOLOGISM = "neologism"
    GIBBERISH = "gibberish"
    COMPOUND = "compound"


class IDGenerator(metaclass=Singleton):
    """Singleton class for generating unique IDs."""

    _counter = 0

    def next_id(self):
        self._counter += 1
        return self._counter

    def reset(self):
        self._counter = 0


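# Usage sketch (illustrative; assumes the Singleton metaclass from app.utils.singleton
# returns one shared instance per process):
#
#   gen = IDGenerator()
#   gen.next_id()              # 1
#   IDGenerator().next_id()    # 2 -- same underlying counter
#   gen.reset()                # counting restarts from 1 on the next call
#
# WordMetadata.word_id below relies on this to get a fresh id for each word.

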
class TaskStatus(Enum):
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELED = "canceled"
    IGNORED = "ignored"


class TaskParams(BaseModel):
    skip_existing: bool = Field(default=True, description="Whether to skip existing subtitle files")


class TasksApiParams(BaseModel):
    operation: Literal["DELETE", "RETRY", "IGNORE"] = Field(
        ..., description="Operation to perform on the tasks"
    )
    task_id: str | None = Field(default=None, description="Unique identifier for the task")


class SegmentStatistics(BaseModel):
    total_segments: int = Field(default=0, description="Total number of subtitle segments")
    total_words: int = Field(default=0, description="Total number of candidate words")
    cefr_distribution: dict[str, int] = Field(
        default_factory=dict, description="Distribution of words by CEFR level"
    )
    pos_distribution: dict[str, int] = Field(
        default_factory=dict, description="Distribution of words by Part of Speech"
    )
    exam_distribution: dict[str, int] = Field(
        default_factory=dict, description="Distribution of words by Examination"
    )

    def to_string(self) -> str:
        cefr_str = ", ".join(
            [f"{level}({count})" for level, count in self.cefr_distribution.items()]
        )
        pos_str = ", ".join(
            [f"{pos}({count})" for pos, count in self.pos_distribution.items()]
        )
        exam_str = ", ".join([f"{exam}({count})" for exam, count in self.exam_distribution.items()])
        return (
            f"Total Segments: {self.total_segments}\n"
            f"Total Words: {self.total_words}\n"
            f"CEFR Distribution: {cefr_str if cefr_str else 'N/A'}\n"
            f"POS Distribution: {pos_str if pos_str else 'N/A'}\n"
            f"Exam Distribution: {exam_str if exam_str else 'N/A'}"
        )


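# Example output of to_string() (illustrative, hypothetical counts):
#
#   Total Segments: 42
#   Total Words: 120
#   CEFR Distribution: B2(30), C1(12), Other(78)
#   POS Distribution: NOUN(60), VERB(40), ADJ(20)
#   Exam Distribution: CET-4(25), IELTS(18)

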
class ProcessResult(BaseModel):
    """Result of processing a task."""

    message: str | None = Field(default=None, description="Additional message or error information")
    status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status of the task")
    statistics: SegmentStatistics | None = Field(default=None, description="Statistics of the task")


class Task(BaseModel):
    video_path: str = Field(..., description="Path to the video file")
    task_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique identifier for the task",
    )
    status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status of the task")
    add_time: str | None = Field(default=None, description="Time the task was added, format %Y-%m-%d %H:%M:%S")
    complete_time: str | None = Field(default=None, description="Time the task completed")
    tokens_used: int = Field(default=0, description="Number of used tokens")
    message: str | None = Field(default=None, description="Additional message or error information")
    params: TaskParams = Field(default_factory=TaskParams, description="Parameters for the task")
    statistics: SegmentStatistics | None = Field(default=None, description="Statistics of the task")


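# Construction sketch (illustrative, hypothetical path): only video_path is required;
# task_id, status and params fall back to their declared defaults.
#
#   task = Task(video_path="/media/example/movie.mkv")
#   task.status                # TaskStatus.PENDING
#   task.params.skip_existing  # True
#   task.task_id               # random UUID string

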
class WordMetadata(BaseModel):
    start_pos: int = Field(..., description="Start position of the word in the context sentence")
    end_pos: int = Field(..., description="End position of the word in the context sentence")
    context_id: int = Field(..., description="Identifier of the context sentence")
    word_id: int = Field(
        default_factory=lambda: IDGenerator().next_id(),
        description="Identifier of the word in the context",
    )


class PosDef(BaseModel):
    # 'art.', 'v.', 'aux.', 'conj.', 'prep.', 'adv.', 'adj.', 'n.', 'vt.', 'pron.', 'det.', 'vi.', 'int.'
    # 'num.', 'abbr.', 'na.', 'quant.', 'phr.'
    pos: str = Field(..., description="Part of speech")
    meanings: list[str] = Field(..., description="List of definitions")

    @property
    def plaintext(self):
        return f"{self.pos} {'; '.join(self.meanings)}"


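# Example (illustrative values): PosDef(pos="n.", meanings=["apple", "apple tree"]).plaintext
# evaluates to "n. apple; apple tree".

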
class WordBase(BaseModel):
    text: str = Field(..., description="The word or phrase")
    lemma: str = Field(..., description="Lemma form of the word")
    pos: UniversalPos = Field(default=UniversalPos.X, description="Universal POS tag of the word")


class Word(WordBase):
    phonetics: str | None = Field(default=None, description="Phonetic transcription of the word")
    meta: WordMetadata = Field(default_factory=WordMetadata, description="Additional metadata")
    cefr: Cefr | None = Field(default=None, description="CEFR level")
    exams: list[str] = Field(
        default_factory=list,
        description="Exams whose vocabulary syllabus includes this word",
    )
    pos_defs: list[PosDef] = Field(default_factory=list, description="Part of speech definitions")
    llm_translation: str | None = Field(default=None, description="LLM generated Chinese translation")
    llm_usage_context: str | None = Field(default=None, description="LLM generated cultural context")
    lexical_features: list[LexicalFeatures] = Field(default_factory=list, description="Lexical features")
    llm_example_sentences: list[str] = Field(default_factory=list, description="LLM generated example sentences")

    @property
    def pos_defs_plaintext(self) -> str:
        return " ".join(
            [
                f"{index}. {pos_def.plaintext}"
                for index, pos_def in enumerate(self.pos_defs)
            ]
        )


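# Example (illustrative values): with two PosDef entries, pos_defs_plaintext numbers them
# via enumerate(), which starts at 0:
#
#   "0. n. definition A 1. v. definition B"

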
class SubtitleSegment(BaseModel):
    index: int = Field(..., description="Index of the subtitle segment")
    start_time: int = Field(
        ..., description="Start time of the subtitle segment in milliseconds"
    )
    end_time: int = Field(..., description="End time of the subtitle segment in milliseconds")
    plaintext: str = Field(..., description="Text content of the subtitle segment")
    Chinese: str | None = Field(default=None, description="Chinese translation of the subtitle segment")
    candidate_words: list[Word] = Field(
        default_factory=list, description="List of words worth learning in the segment"
    )

    def words_append(self, word: Word):
        """
        Append a word to the segment's candidate_words list.

        :param word: The word object to add.
        """
        self.candidate_words.append(word)

    @staticmethod
    def _replace_with_spaces(_text):
        """
        Replace every [xxx] pattern in the text with spaces of equal length.

        For example, "[Hi]" becomes "    " (4 spaces).
        """
        pattern = r"(\[.*?\])"
        return re.sub(pattern, lambda match: " " * len(match.group(1)), _text)

    @property
    def clean_text(self) -> str:
        """
        Return the cleaned text: newlines are replaced with spaces and [xxx] patterns
        are blanked out with spaces of equal length.
        """
        return SubtitleSegment._replace_with_spaces(self.plaintext.replace("\n", " "))

    def __lt__(self, other: object):
        if not isinstance(other, SubtitleSegment):
            return NotImplemented
        return self.index < other.index


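# Example (illustrative): for a segment whose plaintext is "[SIGHS]\nI'm fine.", clean_text
# replaces the newline with a space and "[SIGHS]" with seven spaces, yielding
# "        I'm fine." -- the equal-length padding presumably keeps character offsets
# (WordMetadata.start_pos / end_pos) stable with respect to the original text.

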
class SegmentList(RootModel):
    root: list[SubtitleSegment] = Field(
        default_factory=list, description="List of subtitle segments"
    )

    @property
    def statistics(self) -> SegmentStatistics:
        all_words = [word for seg in self.root for word in seg.candidate_words]

        cefr_counts = Counter(word.cefr if word.cefr else "Other" for word in all_words)
        pos_counts = Counter(word.pos.value if word.pos else "Other" for word in all_words)
        exam_counts = Counter(exam for word in all_words for exam in word.exams)

        return SegmentStatistics(
            total_segments=len(self.root),
            total_words=len(all_words),
            cefr_distribution=dict(cefr_counts),
            pos_distribution=dict(pos_counts),
            exam_distribution=dict(exam_counts)
        )

    def context_generator(
        self, context_window: int, extra_len: int = 1
    ) -> Generator[tuple[list[SubtitleSegment], tuple[int, int]], None, None]:
        """
        Yield chunks of subtitle segments wrapped in a context window.

        :param context_window: Number of core segments per chunk.
        :param extra_len: Extra segments included on each side of the chunk for context.
        :yield: A tuple of (segments including context, (index of first core segment, index of last core segment)).
        """
        total_segments = len(self.root)
        for i in range((total_segments + context_window - 1) // context_window):
            real_start = i * context_window
            real_end = min(total_segments, (i + 1) * context_window) - 1
            start_index = max(0, i * context_window - extra_len)
            end_index = min(total_segments, (i + 1) * context_window + extra_len)
            yield (
                self.root[start_index:end_index],
                (self.root[real_start].index, self.root[real_end].index),
            )

    def sort(self):
        self.root.sort()

    @model_validator(mode="after")
    def sort_root(self):
        self.root.sort()
        return self

    def __iter__(self) -> Iterator[SubtitleSegment]:
        return iter(self.root)


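# Windowing sketch (illustrative): with 10 segments, context_window=4 and extra_len=1,
# context_generator yields three chunks:
#
#   root[0:5]  with core index range (root[0].index, root[3].index)
#   root[3:9]  with core index range (root[4].index, root[7].index)
#   root[7:10] with core index range (root[8].index, root[9].index)
#
# Each chunk carries up to one extra segment on either side for surrounding context,
# while the returned index pair marks the segments the chunk is actually responsible for.

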
class SpacyToken(BaseModel):
    lemma_: str = Field(..., description="Lemma form of the word (string)")
    pos_: str = Field(..., description="POS tag of the word")
    text: str = Field(..., description="Text of the word")
    is_stop: bool = Field(default=False, description="Indicates if the word is a stop word")
    is_punct: bool = Field(default=False, description="Indicates if the word is punctuation")
    ent_iob_: str = Field(..., description="Entity IOB tag")


class SpacyNamedEntity(BaseModel):
    text: str = Field(..., description="Text of the entity")
    label_: str = Field(..., description="Label of the entity")


class NlpResult(BaseModel):
    tokens: list[SpacyToken] = Field(default_factory=list, description="List of tokens")
    entities: list[SpacyNamedEntity] = Field(default_factory=list, description="List of named entities")


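# Mapping sketch (illustrative; assumes spaCy and an installed model such as
# en_core_web_sm -- this module itself does not import spaCy):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   doc = nlp("Paris is lovely in spring.")
#   result = NlpResult(
#       tokens=[
#           SpacyToken(lemma_=t.lemma_, pos_=t.pos_, text=t.text,
#                      is_stop=t.is_stop, is_punct=t.is_punct, ent_iob_=t.ent_iob_)
#           for t in doc
#       ],
#       entities=[SpacyNamedEntity(text=e.text, label_=e.label_) for e in doc.ents],
#   )

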
class LlmFeedbackAboutCandidateWord(BaseModel):
    should_keep: bool = Field(..., description="Indicates whether to keep the candidate word")
    # reason: str | None = Field(default=None, description="Concise reason for the decision")
    word_id: int = Field(..., description="Identifier of the word in the context")
    text: str | None = Field(default=None, description="The vocabulary word or phrase")
    lemma: str | None = Field(default=None, description="Lemma form of the word")
    pos: UniversalPos | None = Field(
        default=None,
        description="Universal POS tag of the word. Options: ADJ, ADV, INTJ, NOUN, PROPN, "
        "VERB, ADP, AUX, CCONJ, DET, NUM, PART, PRON, SCONJ, PUNCT, SYM, X",
    )


class LlmFeedback(BaseModel):
    candidate_words_feedback: list[LlmFeedbackAboutCandidateWord] = Field(
        default_factory=list, description="Feedback about candidate words."
    )
    llm_identified_words: list[WordBase] = Field(
        default_factory=list, description="List of words identified by the LLM."
    )


class LlmWordEnrichment(BaseModel):
    word_id: int = Field(..., description="Identifier of the word in the context")
    translation: str | None = Field(default=None, description="Chinese translation of the word")
    usage_context: str | None = Field(default=None, description="Usage or Cultural Context")
    lexical_features: list[LexicalFeatures] = Field(default_factory=list, description="Lexical features")

    @field_validator("lexical_features", mode="before")
    @classmethod
    def filter_invalid_lexical_features(cls, v):
        if isinstance(v, list):
            valid_values = {f.value for f in LexicalFeatures}
            return [item for item in v if item in valid_values]
        return v


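# Validator sketch (illustrative): the "before" validator silently drops labels that are
# not LexicalFeatures members, so an LLM response with an unsupported value still parses:
#
#   LlmWordEnrichment(word_id=1, lexical_features=["slang", "poetic", "informal"])
#   # lexical_features == [LexicalFeatures.SLANG, LexicalFeatures.INFORMAL]
#   # ("poetic" is not a LexicalFeatures member and is filtered out)

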
class LlmEnrichmentResult(BaseModel):
    enriched_words: list[LlmWordEnrichment] = Field(default_factory=list, description="List of enriched word data")


class LlmSegmentTranslation(BaseModel):
    index: int = Field(..., description="Index of the subtitle segment")
    translation: str = Field(..., description="Natural Chinese translation of the segment")


class LlmTranslationResult(BaseModel):
    translations: list[LlmSegmentTranslation] = Field(default_factory=list, description="List of segment translations")


class VocabularyAnnotatingToolInput(BaseModel):
    explanation: str = Field(
        ...,
        description="This is a tool for adding a new vocabulary-annotating task to AnnotLexi",
    )
    video_path: str = Field(..., description="Path to the video file")
    skip_existing: bool = Field(default=True, description="Whether to skip existing subtitle files")


class QueryAnnotationTasksToolInput(BaseModel):
    count: int = Field(default=5, description="The maximum number of returned annotation tasks")
    explanation: str = Field(..., description="This is a tool for querying the latest annotation tasks in AnnotLexi")