feat: accelerate metainfo parsing with rust

This commit is contained in:
jxxghp
2026-05-23 17:45:39 +08:00
parent ad38f51d6b
commit ed0f8c471b
8 changed files with 3634 additions and 0 deletions

View File

@@ -12,6 +12,7 @@ from app.core.meta.infopath import (
from app.core.meta.words import WordsMatcher
from app.log import logger
from app.schemas.types import MediaType
from app.utils import rust_accel
_ANIME_BRACKET_RE = re.compile(r'【[+0-9XVPI-]+】\s*【', re.IGNORECASE)
@@ -122,6 +123,87 @@ def _build_meta_info(
return meta
def _rust_parse_options(custom_words: List[str] = None) -> dict:
    """
    Gather the runtime configuration the Rust meta parser needs, so that the
    Rust layer never has to reach into the database or ``settings`` itself.

    :param custom_words: custom identifier words; when ``None`` the stored
        ``CustomIdentifiers`` system config value is used instead
    :return: dict with the keys ``custom_words``, ``media_exts``,
        ``release_groups`` and ``customization``
    """
    from app.core.meta.customization import CustomizationMatcher
    from app.core.meta.releasegroup import ReleaseGroupsMatcher
    from app.db.systemconfig_oper import SystemConfigOper
    from app.schemas.types import SystemConfigKey

    config = SystemConfigOper()

    # User-defined release groups: drop empty entries when stored as a list.
    extra_groups = config.get(SystemConfigKey.CustomReleaseGroups)
    if isinstance(extra_groups, list):
        extra_groups = [group for group in extra_groups if group]

    # The built-in pattern is read from the matcher's name-mangled private attribute.
    group_pattern = ReleaseGroupsMatcher()._ReleaseGroupsMatcher__release_groups
    if extra_groups:
        joined_extra = "|".join(extra_groups)
        group_pattern = f"{group_pattern}|{joined_extra}"

    customization = CustomizationMatcher._normalize_customization(
        config.get(SystemConfigKey.Customization)
    )

    # Explicit words win; only fall back to the stored identifiers when none were given.
    if custom_words is None:
        words = config.get(SystemConfigKey.CustomIdentifiers) or []
    else:
        words = custom_words

    media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT
    return {
        "custom_words": words or [],
        "media_exts": media_exts,
        "release_groups": group_pattern,
        "customization": customization,
    }
def _meta_from_rust(parsed: dict) -> Optional[MetaBase]:
    """
    Copy a Rust parse result onto a regular MetaVideo/MetaAnime object so that
    downstream code keeps seeing the attributes and methods it already relies on.

    :param parsed: result dict returned by the Rust parser, may be empty or ``None``
    :return: a populated meta object, or ``None`` when ``parsed`` is falsy
    """
    if not parsed:
        return None

    if parsed.get("kind") == "anime":
        meta = MetaAnime("")
    else:
        meta = MetaVideo("")

    # The Rust side reports the media type by its enum value; map it back to the enum.
    media_types = {
        member.value: member
        for member in (MediaType.MOVIE, MediaType.TV, MediaType.COLLECTION, MediaType.UNKNOWN)
    }
    # Fields whose falsy values are replaced by a fixed fallback.
    fallbacks = {
        "isfile": False,
        "title": "",
        "total_season": 0,
        "total_episode": 0,
        "apply_words": [],
    }
    # Assignment order is kept stable on purpose.
    ordered_fields = (
        "isfile",
        "title",
        "org_string",
        "subtitle",
        "type",
        "cn_name",
        "en_name",
        "original_name",
        "year",
        "total_season",
        "begin_season",
        "end_season",
        "total_episode",
        "begin_episode",
        "end_episode",
        "part",
        "resource_type",
        "resource_effect",
        "resource_pix",
        "resource_team",
        "customization",
        "web_source",
        "video_encode",
        "video_bit",
        "audio_encode",
        "apply_words",
        "tmdbid",
        "doubanid",
        "fps",
    )

    resolved = []
    for name in ordered_fields:
        raw = parsed.get(name)
        if name == "type":
            resolved.append((name, media_types.get(raw, MediaType.UNKNOWN)))
        elif name in fallbacks:
            resolved.append((name, raw or fallbacks[name]))
        else:
            resolved.append((name, raw))

    for name, value in resolved:
        setattr(meta, name, value)
    return meta
def MetaInfo(title: str, subtitle: Optional[str] = None, custom_words: List[str] = None) -> MetaBase:
"""
根据标题和副标题识别元数据
@@ -130,6 +212,11 @@ def MetaInfo(title: str, subtitle: Optional[str] = None, custom_words: List[str]
:param custom_words: 自定义识别词列表
:return: MetaAnime、MetaVideo
"""
rust_meta = _meta_from_rust(
rust_accel.parse_metainfo(title, subtitle, _rust_parse_options(custom_words))
)
if rust_meta:
return rust_meta
meta = _build_meta_info(title=title, subtitle=subtitle, custom_words=custom_words)
if meta.apply_words:
original_meta = _build_meta_info(title=title, subtitle=subtitle)
@@ -145,6 +232,11 @@ def MetaInfoPath(path: Path, custom_words: List[str] = None) -> MetaBase:
:param path: 路径
:param custom_words: 自定义识别词列表
"""
rust_meta = _meta_from_rust(
rust_accel.parse_metainfo_path(str(path), _rust_parse_options(custom_words))
)
if rust_meta:
return rust_meta
# 文件元数据,不包含后缀
file_meta = MetaInfo(title=path.name, custom_words=custom_words)
if should_use_parent_title_for_file_stem(path.stem, path.parent.name, file_meta):
@@ -185,6 +277,9 @@ def find_metainfo(title: str) -> Tuple[str, dict]:
"""
从标题中提取媒体信息
"""
rust_result = rust_accel.find_metainfo(title)
if rust_result:
return rust_result["title"], rust_result["metainfo"]
metainfo = _empty_metainfo()
if not title:
return title, metainfo

View File

@@ -81,6 +81,48 @@ def parse_rss_items(xml_text: str, max_items: int = 1000) -> Optional[List[dict]
return None
def parse_metainfo(title: str, subtitle: Optional[str] = None, options: Optional[dict] = None) -> Optional[dict]:
    """
    Parse MetaInfo from the title entry point through the Rust extension.

    :param title: title to parse
    :param subtitle: optional subtitle passed along to the Rust parser
    :param options: runtime options for the Rust parser; a falsy value is sent as ``{}``
    :return: the Rust parse result, or ``None`` when the extension is unavailable
        or the call failed (the caller then falls back to the Python parser)
    """
    if _moviepilot_rust:
        try:
            return _moviepilot_rust.parse_metainfo_fast(title, subtitle, options if options else {})
        except BaseException as err:
            # Per its docstring, this helper keeps user-interrupt / process-exit
            # semantics intact and only lets Rust extension failures be swallowed.
            _raise_non_rust_panic(err)
            logger.debug(f"Rust MetaInfo解析失败使用 Python 解析兜底:{err}")
    return None
def parse_metainfo_path(path: str, options: Optional[dict] = None) -> Optional[dict]:
    """
    Parse MetaInfoPath from the path entry point through the Rust extension.

    :param path: path string to parse
    :param options: runtime options for the Rust parser; a falsy value is sent as ``{}``
    :return: the Rust parse result, or ``None`` when the extension is unavailable
        or the call failed (the caller then falls back to the Python parser)
    """
    if _moviepilot_rust:
        try:
            return _moviepilot_rust.parse_metainfo_path_fast(path, options if options else {})
        except BaseException as err:
            # Per its docstring, this helper keeps user-interrupt / process-exit
            # semantics intact and only lets Rust extension failures be swallowed.
            _raise_non_rust_panic(err)
            logger.debug(f"Rust MetaInfoPath解析失败使用 Python 解析兜底:{err}")
    return None
def find_metainfo(title: str) -> Optional[dict]:
    """
    Extract the explicit media tags embedded in a title through the Rust extension.

    :param title: title to scan
    :return: the Rust result, or ``None`` when the extension is unavailable
        or the call failed (the caller then falls back to the Python parser)
    """
    if _moviepilot_rust:
        try:
            return _moviepilot_rust.find_metainfo_fast(title)
        except BaseException as err:
            # Per its docstring, this helper keeps user-interrupt / process-exit
            # semantics intact and only lets Rust extension failures be swallowed.
            _raise_non_rust_panic(err)
            logger.debug(f"Rust 显式媒体标签解析失败,使用 Python 解析兜底:{err}")
    return None
def _raise_non_rust_panic(err: BaseException) -> None:
"""
只吞掉 Rust 扩展 panic/异常,保留用户中断和进程退出语义。

View File

@@ -20,6 +20,12 @@ dependencies = [
"libc",
]
[[package]]
name = "anitomy-pure"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33f525032668d2aff5dff115958157db7aecf1dc2fd5f5df93cf1be1452dfd4a"
[[package]]
name = "autocfg"
version = "1.5.0"
@@ -170,6 +176,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fst"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
[[package]]
name = "futf"
version = "0.1.5"
@@ -375,6 +387,16 @@ dependencies = [
"rustversion",
]
[[package]]
name = "inputx-pinyin"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fa1bf26d5923bb73e34c63e7ef21fc976c37e016ea13872045bf72335f0a43c"
dependencies = [
"fst",
"phf",
]
[[package]]
name = "itoa"
version = "1.0.18"
@@ -483,7 +505,9 @@ dependencies = [
name = "moviepilot-rust"
version = "0.1.0"
dependencies = [
"anitomy-pure",
"chrono",
"inputx-pinyin",
"minijinja",
"once_cell",
"pyo3",

View File

@@ -8,6 +8,8 @@ name = "moviepilot_rust"
crate-type = ["cdylib"]
[dependencies]
anitomy-pure = "0.1"
inputx-pinyin = "1.0.2"
minijinja = "2.20"
chrono = "0.4"
once_cell = "1.20"

View File

@@ -1,5 +1,6 @@
mod filter;
mod indexer;
mod metainfo;
mod rss;
mod utils;
@@ -17,6 +18,9 @@ fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(is_available, m)?)?;
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
m.add_function(wrap_pyfunction!(metainfo::parse_metainfo_fast, m)?)?;
m.add_function(wrap_pyfunction!(metainfo::parse_metainfo_path_fast, m)?)?;
m.add_function(wrap_pyfunction!(metainfo::find_metainfo_fast, m)?)?;
m.add_function(wrap_pyfunction!(rss::parse_rss_items_fast, m)?)?;
Ok(())
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,116 @@
import argparse
import statistics
import sys
import time
from contextlib import contextmanager
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))
from app.core import metainfo as metainfo_module
from app.core.metainfo import MetaInfo, MetaInfoPath
from tests.cases.meta import meta_cases
def build_inputs(repeat: int):
    """
    Build the benchmark workload covering both MetaInfo and MetaInfoPath.

    :param repeat: how many times the whole ``meta_cases`` list is repeated
    :return: list of ``(kind, value, subtitle)`` tuples, ``kind`` being
        ``"path"`` or ``"title"``
    """

    def as_entry(case):
        # Cases carrying a truthy "path" go through the path entry point.
        if case.get("path"):
            return "path", case["path"], case.get("subtitle")
        return "title", case["title"], case.get("subtitle")

    return [as_entry(case) for _ in range(repeat) for case in meta_cases]
def disabled_rust_parse(*_args, **_kwargs):
    """
    Stand-in for the Rust MetaInfo fast path that always reports "no result".

    Accepts and ignores any arguments; used to measure the legacy Python chain.
    """
    return None
@contextmanager
def selected_meta_parser(use_rust: bool):
    """
    Switch between the Rust entry points and the legacy Python implementation.

    While the context is active and ``use_rust`` is false, the three Rust hooks
    on ``metainfo_module.rust_accel`` are replaced by ``disabled_rust_parse``.
    The original hooks are always put back on exit.

    :param use_rust: keep the Rust hooks in place when true
    """
    hook_names = ("parse_metainfo", "parse_metainfo_path", "find_metainfo")
    saved_hooks = [(name, getattr(metainfo_module.rust_accel, name)) for name in hook_names]

    if not use_rust:
        for name in hook_names:
            setattr(metainfo_module.rust_accel, name, disabled_rust_parse)

    try:
        yield
    finally:
        for name, original in saved_hooks:
            setattr(metainfo_module.rust_accel, name, original)
def parse_all(inputs, use_rust: bool):
    """
    Run one full pass over the workload through the public MetaInfo/MetaInfoPath entries.

    :param inputs: iterable of ``(kind, value, subtitle)`` tuples
    :param use_rust: whether the Rust fast path stays enabled during the pass
    :return: list with one parse result per input, in input order
    """

    def parse_one(kind, value, subtitle):
        if kind == "path":
            return MetaInfoPath(Path(value))
        return MetaInfo(title=value, subtitle=subtitle, custom_words=["#"])

    with selected_meta_parser(use_rust):
        return [parse_one(kind, value, subtitle) for kind, value, subtitle in inputs]
def measure(inputs, use_rust: bool, loops: int, repeats: int):
    """
    Time the public MetaInfo entry points over several repeats.

    Every repeat runs ``parse_all`` ``loops`` times and records the average
    wall-clock time of one loop in milliseconds; the median of those samples
    is reported.

    :param inputs: workload passed to ``parse_all``
    :param use_rust: forwarded to ``parse_all`` to select the parser chain
    :param loops: number of ``parse_all`` runs per repeat, must be at least 1
    :param repeats: number of timed samples, must be at least 1
    :return: tuple ``(median_ms_per_loop, parsed_count)`` where ``parsed_count``
        is the number of results of the last ``parse_all`` run
    :raises ValueError: if ``loops`` or ``repeats`` is smaller than 1
    """
    # Without this guard loops=0 fails on the unbound ``parsed`` variable and
    # repeats=0 fails inside statistics.median on an empty sample list.
    if loops < 1 or repeats < 1:
        raise ValueError("loops and repeats must both be at least 1")

    samples = []
    parsed_count = 0
    for _ in range(repeats):
        start = time.perf_counter()
        for _ in range(loops):
            parsed = parse_all(inputs, use_rust)
        elapsed = time.perf_counter() - start
        parsed_count = len(parsed)
        samples.append(elapsed * 1000 / loops)
    return statistics.median(samples), parsed_count
def parse_args():
    """
    Parse the benchmark's command line arguments.

    :return: namespace with the integer attributes ``repeat_inputs``, ``loops`` and ``repeats``
    """
    parser = argparse.ArgumentParser(description="Benchmark MetaInfo parsing through public entries")
    # (flag, default, help text) for every integer option.
    int_options = (
        ("--repeat-inputs", 20, "Repeat meta cases per loop"),
        ("--loops", 10, "Loops per repeat"),
        ("--repeats", 5, "Repeat count"),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    return parser.parse_args()
def main() -> int:
    """
    Benchmark the MetaInfo entry chain with the Rust fast path and with the Python fallback.

    Measures the Rust chain first, then the Python chain, and prints the
    per-loop timings together with the resulting speedup.

    :return: process exit code, always ``0``
    """
    args = parse_args()
    inputs = build_inputs(args.repeat_inputs)
    timing = {"loops": args.loops, "repeats": args.repeats}

    rust_ms, rust_count = measure(inputs, use_rust=True, **timing)
    python_ms, python_count = measure(inputs, use_rust=False, **timing)

    # Guard against a zero Rust timing before dividing.
    if rust_ms:
        speedup = python_ms / rust_ms
    else:
        speedup = 0

    report = (
        f"items_per_loop={len(inputs)} loops={args.loops} repeats={args.repeats}",
        f"rust_items={rust_count} python_items={python_count}",
        f"rust_chain_ms_per_loop={rust_ms:.3f}",
        f"python_chain_ms_per_loop={python_ms:.3f}",
        f"speedup={speedup:.2f}x",
    )
    for line in report:
        print(line)
    return 0
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -5,7 +5,13 @@ import pytest
from app.helper import rss as rss_module
from app.helper.rss import RssHelper
from app.core import metainfo as metainfo_module
from app.core.config import settings
from app.core.meta.customization import CustomizationMatcher
from app.core.meta.releasegroup import ReleaseGroupsMatcher
from app.db.systemconfig_oper import SystemConfigOper
from app.modules.indexer.spider import SiteSpider
from app.schemas.types import SystemConfigKey
from app.schemas.types import MediaType
from app.utils import rust_accel
@@ -155,6 +161,107 @@ def test_rss_helper_parse_uses_rust_parser(monkeypatch):
assert int(result[0]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 10, 30, tzinfo=timezone.utc).timestamp())
def _metainfo_options(custom_words=None):
    """
    Build the options dict the Rust MetaInfo tests pass to the parser, following
    the same recipe as the production entry point.

    Note: a missing ``custom_words`` becomes an empty list here; stored custom
    identifiers are not consulted.

    :param custom_words: custom identifier words, ``None`` means no words
    :return: dict with ``custom_words``, ``media_exts``, ``release_groups`` and ``customization``
    """
    config = SystemConfigOper()

    # User-defined release groups: drop empty entries when stored as a list.
    extra_groups = config.get(SystemConfigKey.CustomReleaseGroups)
    if isinstance(extra_groups, list):
        extra_groups = [group for group in extra_groups if group]

    # The built-in pattern is read from the matcher's name-mangled private attribute.
    group_pattern = ReleaseGroupsMatcher()._ReleaseGroupsMatcher__release_groups
    if extra_groups:
        joined_extra = "|".join(extra_groups)
        group_pattern = f"{group_pattern}|{joined_extra}"

    customization = CustomizationMatcher._normalize_customization(
        config.get(SystemConfigKey.Customization)
    )
    media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT
    return {
        "custom_words": custom_words or [],
        "media_exts": media_exts,
        "release_groups": group_pattern,
        "customization": customization,
    }
def test_rust_metainfo_parser_handles_video_from_entry():
    """
    The Rust MetaInfo entry should fully recognise an ordinary film/TV title.
    """
    result = rust_accel.parse_metainfo(
        "The Long Season 2017 2160p WEB-DL H265 120FPS AAC-XXX",
        options=_metainfo_options(),
    )

    # Checked field by field, in this order.
    expected = {
        "kind": "video",
        "type": "未知",
        "en_name": "The Long Season",
        "year": "2017",
        "resource_type": "WEB-DL",
        "resource_pix": "2160p",
        "video_encode": "H265",
        "audio_encode": "AAC",
        "fps": 120,
    }
    for field, wanted in expected.items():
        assert result[field] == wanted
def test_rust_metainfo_parser_handles_anime_from_entry():
    """
    The Rust MetaInfo entry should fully recognise an anime-style title.
    """
    result = rust_accel.parse_metainfo(
        "[ANi] OVERLORD 第四季 - 04 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4",
        options=_metainfo_options(),
    )

    # Checked field by field, in this order.
    expected = {
        "kind": "anime",
        "type": "电视剧",
        "en_name": "Overlord",
        "begin_season": 4,
        "begin_episode": 4,
        "resource_pix": "1080p",
        "video_encode": "AVC",
        "audio_encode": "AAC",
    }
    for field, wanted in expected.items():
        assert result[field] == wanted
def test_rust_metainfo_path_parser_merges_parent_title():
    """
    The Rust MetaInfoPath entry should merge the parent directory title inside Rust.
    """
    result = rust_accel.parse_metainfo_path(
        "/Marty Supreme 2025 2160p DoVi HDR Atmos TrueHD 7.1 x265-PbK/简英双语特效.mp4",
        options=_metainfo_options(),
    )

    # Checked field by field, in this order.
    expected = {
        "kind": "video",
        "en_name": "Marty Supreme",
        "year": "2025",
        "original_name": "Marty Supreme",
        "resource_pix": "2160p",
    }
    for field, wanted in expected.items():
        assert result[field] == wanted
def test_metainfo_public_entry_uses_rust(monkeypatch):
    """
    The public MetaInfo entry should call the Rust parser instead of going
    straight into the legacy Python parsing logic.
    """
    seen_titles = []
    real_parse = metainfo_module.rust_accel.parse_metainfo

    def recording_parse(*args, **kwargs):
        """
        Record the title handed to the Rust entry and pass the call through.
        """
        seen_titles.append(args[0])
        return real_parse(*args, **kwargs)

    monkeypatch.setattr(metainfo_module.rust_accel, "parse_metainfo", recording_parse)

    rule = "旧名 => 新名 && 第 <> 集 >> EP+1"
    meta = metainfo_module.MetaInfo("旧名 第03集", custom_words=[rule])

    assert seen_titles == ["旧名 第03集"]
    assert meta.name == "新名"
    assert meta.episode == "E04"
    assert meta.apply_words == [rule]
def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links():
"""
Rust indexer 解析应覆盖普通站点配置的 Jinja、PyQuery selector 和过滤器。