mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-08 23:16:49 +00:00
feat: accelerate rss parsing with rust
This commit is contained in:
@@ -9,6 +9,7 @@ from lxml import etree
|
||||
from app.core.config import settings
|
||||
from app.helper.browser import PlaywrightHelper
|
||||
from app.log import logger
|
||||
from app.utils import rust_accel
|
||||
from app.utils.http import RequestUtils
|
||||
from app.utils.string import StringUtils
|
||||
|
||||
@@ -298,6 +299,12 @@ class RssHelper:
|
||||
logger.error("RSS内容不是有效的XML格式")
|
||||
return False
|
||||
|
||||
rust_items = rust_accel.parse_rss_items(ret_xml, self.MAX_RSS_ITEMS + 1)
|
||||
if rust_items is not None:
|
||||
if len(rust_items) > self.MAX_RSS_ITEMS:
|
||||
logger.warning(f"RSS条目过多: 超过{self.MAX_RSS_ITEMS},仅处理前{self.MAX_RSS_ITEMS}个")
|
||||
return rust_items[:self.MAX_RSS_ITEMS]
|
||||
|
||||
# 使用lxml.etree解析XML
|
||||
parser = None
|
||||
try:
|
||||
|
||||
@@ -67,6 +67,20 @@ def parse_indexer_torrents(
|
||||
return None
|
||||
|
||||
|
||||
def parse_rss_items(xml_text: str, max_items: int = 1000) -> Optional[List[dict]]:
    """
    Parse RSS/Atom entries through the Rust extension.

    :param xml_text: RSS/Atom document text
    :param max_items: upper bound on the number of entries handed back by Rust
    :return: list of entry dicts, or None when the extension is not loaded or
             the Rust call raised (the caller then falls back to Python parsing)
    """
    if _moviepilot_rust:
        try:
            return _moviepilot_rust.parse_rss_items_fast(xml_text, max_items)
        except BaseException as err:
            # Re-raises anything that is not a Rust extension failure, so user
            # interrupts and process-exit semantics are preserved.
            _raise_non_rust_panic(err)
            logger.debug(f"Rust RSS解析失败,使用 Python 解析兜底:{err}")
    return None
|
||||
|
||||
|
||||
def _raise_non_rust_panic(err: BaseException) -> None:
|
||||
"""
|
||||
只吞掉 Rust 扩展 panic/异常,保留用户中断和进程退出语义。
|
||||
|
||||
10
rust/moviepilot_rust/Cargo.lock
generated
10
rust/moviepilot_rust/Cargo.lock
generated
@@ -487,6 +487,7 @@ dependencies = [
|
||||
"minijinja",
|
||||
"once_cell",
|
||||
"pyo3",
|
||||
"quick-xml",
|
||||
"regex",
|
||||
"scraper",
|
||||
"url",
|
||||
@@ -693,6 +694,15 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.38.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
|
||||
@@ -12,6 +12,7 @@ minijinja = "2.20"
|
||||
chrono = "0.4"
|
||||
once_cell = "1.20"
|
||||
pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }
|
||||
quick-xml = "0.38"
|
||||
regex = "1.11"
|
||||
scraper = "0.24"
|
||||
url = "2.5"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
mod filter;
|
||||
mod indexer;
|
||||
mod rss;
|
||||
mod utils;
|
||||
|
||||
use pyo3::prelude::*;
|
||||
@@ -16,5 +17,6 @@ fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||
m.add_function(wrap_pyfunction!(is_available, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?;
|
||||
m.add_function(wrap_pyfunction!(rss::parse_rss_items_fast, m)?)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
384
rust/moviepilot_rust/src/rss.rs
Normal file
384
rust/moviepilot_rust/src/rss.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc};
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyAny, PyDict, PyList};
|
||||
use quick_xml::events::{BytesRef, BytesStart, Event};
|
||||
use quick_xml::name::QName;
|
||||
use quick_xml::Reader;
|
||||
|
||||
/// One RSS `<item>` / Atom `<entry>` while it is being collected and after it
/// has been normalised by `finalize_item`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
struct RssItem {
    /// Text of the first `title` element found inside the item.
    title: String,
    /// Text of the first `description` or `summary` element.
    description: String,
    /// Page address: `href` attribute of a `link` element, or its text content.
    link: String,
    /// Download address from the `enclosure` element's `url` attribute;
    /// `finalize_item` falls back to `link` when it is empty.
    enclosure: String,
    /// Value of the enclosure `length` attribute; 0 when absent or not a number.
    size: i64,
    /// Raw text of the first `pubDate` / `published` / `updated` element.
    pubdate: String,
    /// Text of the first `creator` element (e.g. `dc:creator`).
    nickname: String,
}
|
||||
|
||||
/// Identifies which `RssItem` string field the text events currently being
/// read should be appended to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TextField {
    /// `RssItem::title`
    Title,
    /// `RssItem::description`
    Description,
    /// `RssItem::link`
    Link,
    /// `RssItem::pubdate`
    Pubdate,
    /// `RssItem::nickname`
    Nickname,
}
|
||||
|
||||
/// 解析 RSS/Atom 文本并返回 MoviePilot 现有调用方兼容的条目字典。
|
||||
#[pyfunction]
|
||||
#[pyo3(signature = (xml_text, max_items=1000))]
|
||||
pub(crate) fn parse_rss_items_fast(
|
||||
py: Python<'_>,
|
||||
xml_text: &str,
|
||||
max_items: usize,
|
||||
) -> PyResult<Option<PyObject>> {
|
||||
let parsed = parse_rss_items(xml_text, max_items)?;
|
||||
let result = PyList::empty(py);
|
||||
for item in parsed {
|
||||
result.append(item_to_py(py, &item)?)?;
|
||||
}
|
||||
Ok(Some(result.into()))
|
||||
}
|
||||
|
||||
/// 使用 quick-xml 流式读取 RSS/Atom,避免 lxml XPath 对每个 item 的重复遍历。
|
||||
fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult<Vec<RssItem>> {
|
||||
let mut reader = Reader::from_str(xml_text);
|
||||
|
||||
let mut results = Vec::new();
|
||||
let mut current_item: Option<RssItem> = None;
|
||||
let mut item_depth = 0usize;
|
||||
let mut current_field: Option<(TextField, usize)> = None;
|
||||
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Ok(Event::Start(event)) => {
|
||||
let local = local_name(event.name());
|
||||
if current_item.is_none() && is_item_node(&local) {
|
||||
current_item = Some(RssItem::default());
|
||||
item_depth = 1;
|
||||
current_field = None;
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(item) = current_item.as_mut() {
|
||||
item_depth += 1;
|
||||
handle_start_field(&event, &local, item, item_depth, &mut current_field)?;
|
||||
}
|
||||
}
|
||||
Ok(Event::Empty(event)) => {
|
||||
let local = local_name(event.name());
|
||||
if let Some(item) = current_item.as_mut() {
|
||||
handle_empty_field(&event, &local, item)?;
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(event)) => {
|
||||
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
|
||||
let text = event.decode().map_err(to_py_value_error)?.to_string();
|
||||
append_text_field(item, field, &text);
|
||||
}
|
||||
}
|
||||
Ok(Event::CData(event)) => {
|
||||
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
|
||||
let text = event.decode().map_err(to_py_value_error)?.to_string();
|
||||
append_text_field(item, field, &text);
|
||||
}
|
||||
}
|
||||
Ok(Event::GeneralRef(event)) => {
|
||||
if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) {
|
||||
let text = resolve_general_ref(&event)?;
|
||||
append_text_field(item, field, &text);
|
||||
}
|
||||
}
|
||||
Ok(Event::End(event)) => {
|
||||
let local = local_name(event.name());
|
||||
if current_item.is_some() && item_depth == 1 && is_item_node(&local) {
|
||||
if let Some(item) = current_item.take() {
|
||||
if let Some(item) = finalize_item(item) {
|
||||
results.push(item);
|
||||
if results.len() >= max_items {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
item_depth = 0;
|
||||
current_field = None;
|
||||
continue;
|
||||
}
|
||||
|
||||
if current_item.is_some() && item_depth > 0 {
|
||||
if current_field
|
||||
.map(|(_, depth)| depth == item_depth)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
current_field = None;
|
||||
}
|
||||
item_depth = item_depth.saturating_sub(1);
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(err) => {
|
||||
return Err(to_py_value_error(err));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// 处理开始标签,记录当前需要采集文本的字段和链接属性。
|
||||
fn handle_start_field(
|
||||
event: &BytesStart<'_>,
|
||||
local: &str,
|
||||
item: &mut RssItem,
|
||||
depth: usize,
|
||||
current_field: &mut Option<(TextField, usize)>,
|
||||
) -> PyResult<()> {
|
||||
if local == "enclosure" {
|
||||
fill_enclosure(event, item)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if local == "link" {
|
||||
fill_link_from_href(event, item)?;
|
||||
}
|
||||
|
||||
if current_field.is_none() {
|
||||
if let Some(field) = pick_text_field(local, item) {
|
||||
*current_field = Some((field, depth));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 处理空标签,覆盖 Atom 的 link href 和 RSS 的 enclosure。
|
||||
fn handle_empty_field(event: &BytesStart<'_>, local: &str, item: &mut RssItem) -> PyResult<()> {
|
||||
match local {
|
||||
"enclosure" => fill_enclosure(event, item)?,
|
||||
"link" => fill_link_from_href(event, item)?,
|
||||
_ => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 根据标签名和已采集状态选择当前文本字段。
|
||||
fn pick_text_field(local: &str, item: &RssItem) -> Option<TextField> {
|
||||
match local {
|
||||
"title" if item.title.is_empty() => Some(TextField::Title),
|
||||
"description" | "summary" if item.description.is_empty() => Some(TextField::Description),
|
||||
"link" if item.link.is_empty() => Some(TextField::Link),
|
||||
"pubDate" | "published" | "updated" if item.pubdate.is_empty() => Some(TextField::Pubdate),
|
||||
"creator" if item.nickname.is_empty() => Some(TextField::Nickname),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// 追加文本字段内容,兼容 CDATA 和带内联标签的描述。
|
||||
fn append_text_field(item: &mut RssItem, field: TextField, text: &str) {
|
||||
if text.is_empty() {
|
||||
return;
|
||||
}
|
||||
match field {
|
||||
TextField::Title => item.title.push_str(text),
|
||||
TextField::Description => item.description.push_str(text),
|
||||
TextField::Link => item.link.push_str(text),
|
||||
TextField::Pubdate => item.pubdate.push_str(text),
|
||||
TextField::Nickname => item.nickname.push_str(text),
|
||||
}
|
||||
}
|
||||
|
||||
/// 解析 XML 通用实体,保留未识别实体的原始文本以便 Python 兜底时可复查。
|
||||
fn resolve_general_ref(event: &BytesRef<'_>) -> PyResult<String> {
|
||||
if let Some(value) = event.resolve_char_ref().map_err(to_py_value_error)? {
|
||||
return Ok(value.to_string());
|
||||
}
|
||||
let name = event.decode().map_err(to_py_value_error)?;
|
||||
let resolved = match name.as_ref() {
|
||||
"amp" => "&".to_string(),
|
||||
"lt" => "<".to_string(),
|
||||
"gt" => ">".to_string(),
|
||||
"apos" => "'".to_string(),
|
||||
"quot" => "\"".to_string(),
|
||||
other => format!("&{other};"),
|
||||
};
|
||||
Ok(resolved)
|
||||
}
|
||||
|
||||
/// 从 enclosure 标签读取下载链接和大小。
|
||||
fn fill_enclosure(event: &BytesStart<'_>, item: &mut RssItem) -> PyResult<()> {
|
||||
if !item.enclosure.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
if let Some(url) = attr_value(event, b"url")? {
|
||||
item.enclosure = url;
|
||||
}
|
||||
if let Some(length) = attr_value(event, b"length")? {
|
||||
item.size = length.trim().parse::<i64>().unwrap_or(0);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 从 Atom link 的 href 属性读取页面地址。
|
||||
fn fill_link_from_href(event: &BytesStart<'_>, item: &mut RssItem) -> PyResult<()> {
|
||||
if !item.link.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
if let Some(href) = attr_value(event, b"href")? {
|
||||
item.link = href;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 读取并反转义指定属性值。
|
||||
fn attr_value(event: &BytesStart<'_>, name: &[u8]) -> PyResult<Option<String>> {
|
||||
for attr in event.attributes().with_checks(false) {
|
||||
let attr = attr.map_err(to_py_value_error)?;
|
||||
if attr.key.as_ref().eq_ignore_ascii_case(name) {
|
||||
let value = attr
|
||||
.decode_and_unescape_value(event.decoder())
|
||||
.map_err(to_py_value_error)?;
|
||||
return Ok(Some(value.trim().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// 完成单条 RSS item 的兼容性整理,保留原 Python 逻辑的跳过条件。
|
||||
fn finalize_item(mut item: RssItem) -> Option<RssItem> {
|
||||
item.title = item.title.trim().to_string();
|
||||
item.description = item.description.trim().to_string();
|
||||
item.link = item.link.trim().to_string();
|
||||
item.enclosure = item.enclosure.trim().to_string();
|
||||
item.pubdate = item.pubdate.trim().to_string();
|
||||
item.nickname = item.nickname.trim().to_string();
|
||||
|
||||
if item.title.is_empty() {
|
||||
return None;
|
||||
}
|
||||
if item.enclosure.is_empty() {
|
||||
if item.link.is_empty() {
|
||||
return None;
|
||||
}
|
||||
item.enclosure = item.link.clone();
|
||||
}
|
||||
Some(item)
|
||||
}
|
||||
|
||||
/// 将 Rust 条目转换为 Python dict,字段名保持与 RssHelper.parse 原返回一致。
|
||||
fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult<PyObject> {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("title", &item.title)?;
|
||||
dict.set_item("enclosure", &item.enclosure)?;
|
||||
dict.set_item("size", item.size)?;
|
||||
dict.set_item("description", &item.description)?;
|
||||
dict.set_item("link", &item.link)?;
|
||||
if let Some(timestamp) = parse_pubdate_timestamp(&item.pubdate) {
|
||||
dict.set_item("pubdate", py_datetime_from_timestamp(py, timestamp)?)?;
|
||||
} else {
|
||||
dict.set_item("pubdate", "")?;
|
||||
}
|
||||
if !item.nickname.is_empty() {
|
||||
dict.set_item("nickname", &item.nickname)?;
|
||||
}
|
||||
Ok(dict.into())
|
||||
}
|
||||
|
||||
/// 将 Unix 时间戳转换为本地时区 Python datetime,匹配原 astimezone(tz=None) 语义。
|
||||
fn py_datetime_from_timestamp<'py>(py: Python<'py>, timestamp: i64) -> PyResult<Bound<'py, PyAny>> {
|
||||
let datetime_mod = py.import("datetime")?;
|
||||
let datetime_cls = datetime_mod.getattr("datetime")?;
|
||||
let timezone_cls = datetime_mod.getattr("timezone")?;
|
||||
let utc = timezone_cls.getattr("utc")?;
|
||||
let utc_dt = datetime_cls.call_method1("fromtimestamp", (timestamp, utc))?;
|
||||
utc_dt.call_method0("astimezone")
|
||||
}
|
||||
|
||||
/// 解析 RSS/Atom 常见日期格式并返回时间戳。
|
||||
fn parse_pubdate_timestamp(value: &str) -> Option<i64> {
|
||||
let trimmed = value.trim();
|
||||
if trimmed.is_empty() {
|
||||
return None;
|
||||
}
|
||||
if let Ok(datetime) = DateTime::parse_from_rfc2822(trimmed) {
|
||||
return Some(datetime.timestamp());
|
||||
}
|
||||
if let Ok(datetime) = DateTime::parse_from_rfc3339(trimmed) {
|
||||
return Some(datetime.timestamp());
|
||||
}
|
||||
if let Some(timestamp) = parse_utc_suffix_datetime(trimmed) {
|
||||
return Some(timestamp);
|
||||
}
|
||||
parse_local_naive_datetime(trimmed)
|
||||
}
|
||||
|
||||
/// 兼容部分站点输出的 UTC/GMT 文本后缀。
|
||||
fn parse_utc_suffix_datetime(value: &str) -> Option<i64> {
|
||||
for suffix in [" UTC", " GMT"] {
|
||||
let Some(stripped) = value.strip_suffix(suffix) else {
|
||||
continue;
|
||||
};
|
||||
for format in [
|
||||
"%a, %d %b %Y %H:%M:%S",
|
||||
"%d %b %Y %H:%M:%S",
|
||||
"%Y-%m-%d %H:%M:%S",
|
||||
"%Y-%m-%dT%H:%M:%S",
|
||||
] {
|
||||
if let Ok(naive) = NaiveDateTime::parse_from_str(stripped.trim(), format) {
|
||||
return Some(Utc.from_utc_datetime(&naive).timestamp());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// 解析不带时区的日期格式,并按系统本地时区解释。
|
||||
fn parse_local_naive_datetime(value: &str) -> Option<i64> {
|
||||
for format in [
|
||||
"%Y-%m-%d %H:%M:%S",
|
||||
"%Y-%m-%dT%H:%M:%S",
|
||||
"%Y-%m-%d %H:%M",
|
||||
"%Y/%m/%d %H:%M:%S",
|
||||
"%Y/%m/%d %H:%M",
|
||||
"%d %b %Y %H:%M:%S",
|
||||
"%a, %d %b %Y %H:%M:%S",
|
||||
] {
|
||||
if let Ok(naive) = NaiveDateTime::parse_from_str(value, format) {
|
||||
return local_timestamp(naive);
|
||||
}
|
||||
}
|
||||
for format in ["%Y-%m-%d", "%Y/%m/%d", "%d %b %Y"] {
|
||||
if let Ok(date) = NaiveDate::parse_from_str(value, format) {
|
||||
return local_timestamp(NaiveDateTime::new(date, NaiveTime::MIN));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// 将本地无时区时间转换为时间戳,处理夏令时歧义时取较早值。
|
||||
fn local_timestamp(naive: NaiveDateTime) -> Option<i64> {
|
||||
Local
|
||||
.from_local_datetime(&naive)
|
||||
.single()
|
||||
.or_else(|| Local.from_local_datetime(&naive).earliest())
|
||||
.map(|datetime| datetime.timestamp())
|
||||
}
|
||||
|
||||
/// Returns `true` when the local tag name is an RSS `item` or an Atom `entry`.
fn is_item_node(local: &str) -> bool {
    local == "item" || local == "entry"
}
|
||||
|
||||
/// 提取 XML 名称的本地部分,用于兼容 dc:creator 这类命名空间字段。
|
||||
fn local_name(name: QName<'_>) -> String {
|
||||
let raw = std::str::from_utf8(name.as_ref()).unwrap_or_default();
|
||||
raw.rsplit_once(':')
|
||||
.map(|(_, local)| local)
|
||||
.unwrap_or(raw)
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// 将 quick-xml 错误转换为 Python ValueError 交给 Python 包装层判断是否兜底。
|
||||
fn to_py_value_error<E: std::fmt::Display>(err: E) -> PyErr {
|
||||
pyo3::exceptions::PyValueError::new_err(err.to_string())
|
||||
}
|
||||
156
scripts/benchmark_rss_rust.py
Normal file
156
scripts/benchmark_rss_rust.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import argparse
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from app.helper import rss as rss_module
|
||||
from app.helper.rss import RssHelper
|
||||
from app.utils import rust_accel
|
||||
|
||||
|
||||
class FakeRequestUtils:
    """
    Stand-in for RequestUtils used by the benchmark: every request returns the
    RSS text held in memory on the class.
    """

    # RSS document served by get_res; set by the benchmark before parsing.
    xml_text = ""

    def __init__(self, **_kwargs):
        """
        Accept and ignore keyword arguments so construction matches how the
        real RequestUtils is called.
        """

    def get_res(self, _url):
        """
        Return the minimal response object that RssHelper.parse reads.
        """
        body = self.xml_text
        fields = {
            "status_code": 200,
            "content": body.encode("utf-8"),
            "text": body,
            "apparent_encoding": "utf-8",
            "encoding": "utf-8",
        }
        return SimpleNamespace(**fields)
|
||||
|
||||
|
||||
def build_rss_xml(items: int) -> str:
|
||||
"""
|
||||
构造覆盖标题、描述、链接、enclosure、日期和 creator 的 RSS 文本。
|
||||
"""
|
||||
rows = []
|
||||
for index in range(items):
|
||||
rows.append(f"""
|
||||
<item>
|
||||
<title>MoviePilot Benchmark {index}</title>
|
||||
<description><![CDATA[Benchmark description {index} <b>tag</b>]]></description>
|
||||
<link>https://example.com/details/{index}</link>
|
||||
<enclosure url="https://example.com/download/{index}.torrent" length="{1024 + index}" />
|
||||
<pubDate>Tue, 19 May 2026 08:30:00 GMT</pubDate>
|
||||
<dc:creator>bench-user-{index}</dc:creator>
|
||||
</item>
|
||||
""")
|
||||
return f"""
|
||||
<rss xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<channel>
|
||||
{''.join(rows)}
|
||||
</channel>
|
||||
</rss>
|
||||
"""
|
||||
|
||||
|
||||
@contextmanager
def patched_request_utils(xml_text: str):
    """
    Temporarily swap the request layer of the rss module for FakeRequestUtils,
    so the benchmark exercises the real RssHelper.parse parsing path without
    any network access. The original class is restored on exit.
    """
    saved = rss_module.RequestUtils
    FakeRequestUtils.xml_text = xml_text
    setattr(rss_module, "RequestUtils", FakeRequestUtils)
    try:
        yield
    finally:
        setattr(rss_module, "RequestUtils", saved)
|
||||
|
||||
|
||||
def disabled_rust_parse(_xml_text: str, _max_items: int = 1000):
    """
    Replacement for rust_accel.parse_rss_items that always reports "no result",
    which sends RssHelper.parse down its Python lxml path so both parsers are
    measured through the same call chain.
    """
    return None
|
||||
|
||||
|
||||
@contextmanager
def selected_rss_parser(use_rust: bool):
    """
    Choose between the Rust fast path and the Python lxml parser while keeping
    request and encoding costs identical.

    When use_rust is False, rust_accel.parse_rss_items is replaced by
    disabled_rust_parse for the duration of the block. The original function is
    always restored on exit.
    """
    accel = rss_module.rust_accel
    saved = accel.parse_rss_items
    try:
        if not use_rust:
            accel.parse_rss_items = disabled_rust_parse
        yield
    finally:
        accel.parse_rss_items = saved
|
||||
|
||||
|
||||
def parse_chain(xml_text: str, use_rust: bool):
    """
    Run RssHelper.parse once against the in-memory RSS text and return the
    parsed entries.
    """
    with patched_request_utils(xml_text):
        with selected_rss_parser(use_rust):
            helper = RssHelper()
            return helper.parse("https://example.com/rss")
|
||||
|
||||
|
||||
def measure_chain(xml_text: str, use_rust: bool, loops: int, repeats: int):
    """
    Time RssHelper.parse over several rounds and check that every call parses
    the same number of entries.

    :param xml_text: RSS document fed to the parser
    :param use_rust: True to allow the Rust fast path, False to force Python
    :param loops: parse calls per round (must be >= 1)
    :param repeats: number of rounds (must be >= 1)
    :return: (median milliseconds per parse call across rounds, entry count)
    :raises ValueError: when loops or repeats is smaller than 1
    :raises RuntimeError: when parsing fails or the entry count changes
    """
    if loops < 1 or repeats < 1:
        raise ValueError("loops and repeats must both be at least 1")

    samples = []
    parsed_count = None
    for _ in range(repeats):
        start = time.perf_counter()
        for _ in range(loops):
            parsed = parse_chain(xml_text, use_rust)
            # RssHelper.parse signals failure with a non-list value; timing
            # a failing parse would produce meaningless numbers.
            if parsed is None or parsed is False:
                raise RuntimeError("RssHelper.parse returned no result; benchmark aborted")
            if parsed_count is None:
                parsed_count = len(parsed)
            elif len(parsed) != parsed_count:
                raise RuntimeError(
                    f"unstable item count: expected {parsed_count}, got {len(parsed)}"
                )
        samples.append((time.perf_counter() - start) * 1000 / loops)
    return statistics.median(samples), parsed_count
|
||||
|
||||
|
||||
def parse_args():
    """
    Parse the benchmark's command-line options (--items, --loops, --repeats).
    """
    parser = argparse.ArgumentParser(description="Benchmark RSS parsing through RssHelper.parse")
    options = (
        ("--items", 200, "RSS item count"),
        ("--loops", 50, "Loops per repeat"),
        ("--repeats", 5, "Repeat count"),
    )
    for flag, default, help_text in options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """
    Benchmark the Rust and Python RSS parsing chains and print a summary.

    :return: process exit code (always 0)
    """
    args = parse_args()
    xml_text = build_rss_xml(args.items)
    timing = {"loops": args.loops, "repeats": args.repeats}
    rust_ms, rust_count = measure_chain(xml_text, use_rust=True, **timing)
    python_ms, python_count = measure_chain(xml_text, use_rust=False, **timing)
    # Guard against a zero denominator.
    speedup = python_ms / rust_ms if rust_ms else 0

    report = [
        f"rust_available={rust_accel.is_available()}",
        f"items={args.items} loops={args.loops} repeats={args.repeats}",
        f"rust_items={rust_count} python_items={python_count}",
        f"rust_chain_ms_per_loop={rust_ms:.3f}",
        f"python_chain_ms_per_loop={python_ms:.3f}",
        f"speedup={speedup:.2f}x",
    ]
    for line in report:
        print(line)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,5 +1,10 @@
|
||||
from datetime import datetime, timezone
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from app.helper import rss as rss_module
|
||||
from app.helper.rss import RssHelper
|
||||
from app.modules.indexer.spider import SiteSpider
|
||||
from app.schemas.types import MediaType
|
||||
from app.utils import rust_accel
|
||||
@@ -29,6 +34,127 @@ def test_rust_filter_rule_parser_handles_parentheses_and_or():
|
||||
assert result == [[["CNSUB", "and", ["4K", "or", "1080P"]], "and", ["not", "BLU"]]]
|
||||
|
||||
|
||||
def test_rust_rss_parser_extracts_rss_and_atom_items():
|
||||
"""
|
||||
Rust RSS解析应覆盖 RSS item、Atom entry、命名空间和日期字段。
|
||||
"""
|
||||
xml = """
|
||||
<root xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<rss>
|
||||
<channel>
|
||||
<item>
|
||||
<title>Movie & Show</title>
|
||||
<description><![CDATA[Desc <b>bold</b>]]></description>
|
||||
<link>https://example.com/details/1</link>
|
||||
<enclosure url="https://example.com/download/1.torrent" length="123456" />
|
||||
<pubDate>Tue, 19 May 2026 08:30:00 GMT</pubDate>
|
||||
<dc:creator>豆瓣用户</dc:creator>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
<feed>
|
||||
<entry>
|
||||
<title>Atom Title</title>
|
||||
<summary>Atom Summary</summary>
|
||||
<link href="https://example.com/atom/2" />
|
||||
<updated>2026-05-19T09:30:00Z</updated>
|
||||
</entry>
|
||||
</feed>
|
||||
</root>
|
||||
"""
|
||||
|
||||
result = rust_accel.parse_rss_items(xml, max_items=100)
|
||||
|
||||
assert len(result) == 2
|
||||
assert result[0]["title"] == "Movie & Show"
|
||||
assert result[0]["description"] == "Desc <b>bold</b>"
|
||||
assert result[0]["link"] == "https://example.com/details/1"
|
||||
assert result[0]["enclosure"] == "https://example.com/download/1.torrent"
|
||||
assert result[0]["size"] == 123456
|
||||
assert result[0]["nickname"] == "豆瓣用户"
|
||||
assert int(result[0]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 8, 30, tzinfo=timezone.utc).timestamp())
|
||||
assert result[1]["title"] == "Atom Title"
|
||||
assert result[1]["description"] == "Atom Summary"
|
||||
assert result[1]["link"] == "https://example.com/atom/2"
|
||||
assert result[1]["enclosure"] == "https://example.com/atom/2"
|
||||
assert int(result[1]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 9, 30, tzinfo=timezone.utc).timestamp())
|
||||
|
||||
|
||||
def test_rust_rss_parser_skips_incomplete_items():
|
||||
"""
|
||||
Rust RSS解析应保持原逻辑,跳过无标题或无链接的条目。
|
||||
"""
|
||||
xml = """
|
||||
<rss>
|
||||
<channel>
|
||||
<item><title></title><link>https://example.com/a</link></item>
|
||||
<item><title>No Link</title></item>
|
||||
<item><title>OK</title><link>https://example.com/ok</link></item>
|
||||
</channel>
|
||||
</rss>
|
||||
"""
|
||||
|
||||
result = rust_accel.parse_rss_items(xml, max_items=100)
|
||||
|
||||
assert result == [{
|
||||
"title": "OK",
|
||||
"enclosure": "https://example.com/ok",
|
||||
"size": 0,
|
||||
"description": "",
|
||||
"link": "https://example.com/ok",
|
||||
"pubdate": "",
|
||||
}]
|
||||
|
||||
|
||||
def test_rss_helper_parse_uses_rust_parser(monkeypatch):
|
||||
"""
|
||||
RssHelper.parse 应在请求和编码处理后直接使用 Rust 解析结果。
|
||||
"""
|
||||
xml = """
|
||||
<rss>
|
||||
<channel>
|
||||
<item>
|
||||
<title>Helper Title</title>
|
||||
<description>Helper Description</description>
|
||||
<link>https://example.com/details/3</link>
|
||||
<pubDate>2026-05-19T10:30:00Z</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
"""
|
||||
|
||||
class FakeRequestUtils:
|
||||
"""
|
||||
测试用 RequestUtils,避免真实网络请求。
|
||||
"""
|
||||
|
||||
def __init__(self, **_kwargs):
|
||||
"""
|
||||
保存构造参数占位,兼容 RssHelper 的调用方式。
|
||||
"""
|
||||
|
||||
def get_res(self, _url):
|
||||
"""
|
||||
返回带 content/text/status_code 的最小响应对象。
|
||||
"""
|
||||
return SimpleNamespace(
|
||||
status_code=200,
|
||||
content=xml.encode("utf-8"),
|
||||
text=xml,
|
||||
apparent_encoding="utf-8",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(rss_module, "RequestUtils", FakeRequestUtils)
|
||||
|
||||
result = RssHelper().parse("https://example.com/rss")
|
||||
|
||||
assert len(result) == 1
|
||||
assert result[0]["title"] == "Helper Title"
|
||||
assert result[0]["enclosure"] == "https://example.com/details/3"
|
||||
assert int(result[0]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 10, 30, tzinfo=timezone.utc).timestamp())
|
||||
|
||||
|
||||
def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links():
|
||||
"""
|
||||
Rust indexer 解析应覆盖普通站点配置的 Jinja、PyQuery selector 和过滤器。
|
||||
|
||||
Reference in New Issue
Block a user