From ad38f51d6bd1e3ff0595ee220ccd45027ef7258d Mon Sep 17 00:00:00 2001 From: jxxghp Date: Sat, 23 May 2026 16:14:47 +0800 Subject: [PATCH] feat: accelerate rss parsing with rust --- app/helper/rss.py | 7 + app/utils/rust_accel.py | 14 ++ rust/moviepilot_rust/Cargo.lock | 10 + rust/moviepilot_rust/Cargo.toml | 1 + rust/moviepilot_rust/src/lib.rs | 2 + rust/moviepilot_rust/src/rss.rs | 384 ++++++++++++++++++++++++++++++++ scripts/benchmark_rss_rust.py | 156 +++++++++++++ tests/test_rust_accel.py | 126 +++++++++++ 8 files changed, 700 insertions(+) create mode 100644 rust/moviepilot_rust/src/rss.rs create mode 100644 scripts/benchmark_rss_rust.py diff --git a/app/helper/rss.py b/app/helper/rss.py index 5257ff0c..a5f5c3dc 100644 --- a/app/helper/rss.py +++ b/app/helper/rss.py @@ -9,6 +9,7 @@ from lxml import etree from app.core.config import settings from app.helper.browser import PlaywrightHelper from app.log import logger +from app.utils import rust_accel from app.utils.http import RequestUtils from app.utils.string import StringUtils @@ -298,6 +299,12 @@ class RssHelper: logger.error("RSS内容不是有效的XML格式") return False + rust_items = rust_accel.parse_rss_items(ret_xml, self.MAX_RSS_ITEMS + 1) + if rust_items is not None: + if len(rust_items) > self.MAX_RSS_ITEMS: + logger.warning(f"RSS条目过多: 超过{self.MAX_RSS_ITEMS},仅处理前{self.MAX_RSS_ITEMS}个") + return rust_items[:self.MAX_RSS_ITEMS] + # 使用lxml.etree解析XML parser = None try: diff --git a/app/utils/rust_accel.py b/app/utils/rust_accel.py index fb73a0ad..8b6c6cd8 100644 --- a/app/utils/rust_accel.py +++ b/app/utils/rust_accel.py @@ -67,6 +67,20 @@ def parse_indexer_torrents( return None +def parse_rss_items(xml_text: str, max_items: int = 1000) -> Optional[List[dict]]: + """ + 使用 Rust 解析 RSS/Atom 条目,不可用或异常时返回 None。 + """ + if not _moviepilot_rust: + return None + try: + return _moviepilot_rust.parse_rss_items_fast(xml_text, max_items) + except BaseException as err: + _raise_non_rust_panic(err) + logger.debug(f"Rust RSS解析失败,使用 Python 解析兜底:{err}") + return None + + def _raise_non_rust_panic(err: BaseException) -> None: """ 只吞掉 Rust 扩展 panic/异常,保留用户中断和进程退出语义。 diff --git a/rust/moviepilot_rust/Cargo.lock b/rust/moviepilot_rust/Cargo.lock index f2eaad29..333cddfa 100644 --- a/rust/moviepilot_rust/Cargo.lock +++ b/rust/moviepilot_rust/Cargo.lock @@ -487,6 +487,7 @@ dependencies = [ "minijinja", "once_cell", "pyo3", + "quick-xml", "regex", "scraper", "url", @@ -693,6 +694,15 @@ dependencies = [ "syn", ] +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.45" diff --git a/rust/moviepilot_rust/Cargo.toml b/rust/moviepilot_rust/Cargo.toml index 713ab289..8f9f1f3d 100644 --- a/rust/moviepilot_rust/Cargo.toml +++ b/rust/moviepilot_rust/Cargo.toml @@ -12,6 +12,7 @@ minijinja = "2.20" chrono = "0.4" once_cell = "1.20" pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] } +quick-xml = "0.38" regex = "1.11" scraper = "0.24" url = "2.5" diff --git a/rust/moviepilot_rust/src/lib.rs b/rust/moviepilot_rust/src/lib.rs index df536fdc..38464423 100644 --- a/rust/moviepilot_rust/src/lib.rs +++ b/rust/moviepilot_rust/src/lib.rs @@ -1,5 +1,6 @@ mod filter; mod indexer; +mod rss; mod utils; use pyo3::prelude::*; @@ -16,5 +17,6 @@ fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(is_available, m)?)?; m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?; m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?; + m.add_function(wrap_pyfunction!(rss::parse_rss_items_fast, m)?)?; Ok(()) } diff --git a/rust/moviepilot_rust/src/rss.rs b/rust/moviepilot_rust/src/rss.rs new file mode 100644 index 00000000..c8c823bc --- /dev/null +++ b/rust/moviepilot_rust/src/rss.rs @@ -0,0 +1,384 @@ +use chrono::{DateTime, Local, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; +use pyo3::prelude::*; +use pyo3::types::{PyAny, PyDict, PyList}; +use quick_xml::events::{BytesRef, BytesStart, Event}; +use quick_xml::name::QName; +use quick_xml::Reader; + +#[derive(Default)] +struct RssItem { + title: String, + description: String, + link: String, + enclosure: String, + size: i64, + pubdate: String, + nickname: String, +} + +#[derive(Clone, Copy)] +enum TextField { + Title, + Description, + Link, + Pubdate, + Nickname, +} + +/// 解析 RSS/Atom 文本并返回 MoviePilot 现有调用方兼容的条目字典。 +#[pyfunction] +#[pyo3(signature = (xml_text, max_items=1000))] +pub(crate) fn parse_rss_items_fast( + py: Python<'_>, + xml_text: &str, + max_items: usize, +) -> PyResult> { + let parsed = parse_rss_items(xml_text, max_items)?; + let result = PyList::empty(py); + for item in parsed { + result.append(item_to_py(py, &item)?)?; + } + Ok(Some(result.into())) +} + +/// 使用 quick-xml 流式读取 RSS/Atom,避免 lxml XPath 对每个 item 的重复遍历。 +fn parse_rss_items(xml_text: &str, max_items: usize) -> PyResult> { + let mut reader = Reader::from_str(xml_text); + + let mut results = Vec::new(); + let mut current_item: Option = None; + let mut item_depth = 0usize; + let mut current_field: Option<(TextField, usize)> = None; + + loop { + match reader.read_event() { + Ok(Event::Start(event)) => { + let local = local_name(event.name()); + if current_item.is_none() && is_item_node(&local) { + current_item = Some(RssItem::default()); + item_depth = 1; + current_field = None; + continue; + } + + if let Some(item) = current_item.as_mut() { + item_depth += 1; + handle_start_field(&event, &local, item, item_depth, &mut current_field)?; + } + } + Ok(Event::Empty(event)) => { + let local = local_name(event.name()); + if let Some(item) = current_item.as_mut() { + handle_empty_field(&event, &local, item)?; + } + } + Ok(Event::Text(event)) => { + if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) { + let text = event.decode().map_err(to_py_value_error)?.to_string(); + append_text_field(item, field, &text); + } + } + Ok(Event::CData(event)) => { + if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) { + let text = event.decode().map_err(to_py_value_error)?.to_string(); + append_text_field(item, field, &text); + } + } + Ok(Event::GeneralRef(event)) => { + if let (Some(item), Some((field, _))) = (current_item.as_mut(), current_field) { + let text = resolve_general_ref(&event)?; + append_text_field(item, field, &text); + } + } + Ok(Event::End(event)) => { + let local = local_name(event.name()); + if current_item.is_some() && item_depth == 1 && is_item_node(&local) { + if let Some(item) = current_item.take() { + if let Some(item) = finalize_item(item) { + results.push(item); + if results.len() >= max_items { + break; + } + } + } + item_depth = 0; + current_field = None; + continue; + } + + if current_item.is_some() && item_depth > 0 { + if current_field + .map(|(_, depth)| depth == item_depth) + .unwrap_or(false) + { + current_field = None; + } + item_depth = item_depth.saturating_sub(1); + } + } + Ok(Event::Eof) => break, + Err(err) => { + return Err(to_py_value_error(err)); + } + _ => {} + } + } + + Ok(results) +} + +/// 处理开始标签,记录当前需要采集文本的字段和链接属性。 +fn handle_start_field( + event: &BytesStart<'_>, + local: &str, + item: &mut RssItem, + depth: usize, + current_field: &mut Option<(TextField, usize)>, +) -> PyResult<()> { + if local == "enclosure" { + fill_enclosure(event, item)?; + return Ok(()); + } + + if local == "link" { + fill_link_from_href(event, item)?; + } + + if current_field.is_none() { + if let Some(field) = pick_text_field(local, item) { + *current_field = Some((field, depth)); + } + } + Ok(()) +} + +/// 处理空标签,覆盖 Atom 的 link href 和 RSS 的 enclosure。 +fn handle_empty_field(event: &BytesStart<'_>, local: &str, item: &mut RssItem) -> PyResult<()> { + match local { + "enclosure" => fill_enclosure(event, item)?, + "link" => fill_link_from_href(event, item)?, + _ => {} + } + Ok(()) +} + +/// 根据标签名和已采集状态选择当前文本字段。 +fn pick_text_field(local: &str, item: &RssItem) -> Option { + match local { + "title" if item.title.is_empty() => Some(TextField::Title), + "description" | "summary" if item.description.is_empty() => Some(TextField::Description), + "link" if item.link.is_empty() => Some(TextField::Link), + "pubDate" | "published" | "updated" if item.pubdate.is_empty() => Some(TextField::Pubdate), + "creator" if item.nickname.is_empty() => Some(TextField::Nickname), + _ => None, + } +} + +/// 追加文本字段内容,兼容 CDATA 和带内联标签的描述。 +fn append_text_field(item: &mut RssItem, field: TextField, text: &str) { + if text.is_empty() { + return; + } + match field { + TextField::Title => item.title.push_str(text), + TextField::Description => item.description.push_str(text), + TextField::Link => item.link.push_str(text), + TextField::Pubdate => item.pubdate.push_str(text), + TextField::Nickname => item.nickname.push_str(text), + } +} + +/// 解析 XML 通用实体,保留未识别实体的原始文本以便 Python 兜底时可复查。 +fn resolve_general_ref(event: &BytesRef<'_>) -> PyResult { + if let Some(value) = event.resolve_char_ref().map_err(to_py_value_error)? { + return Ok(value.to_string()); + } + let name = event.decode().map_err(to_py_value_error)?; + let resolved = match name.as_ref() { + "amp" => "&".to_string(), + "lt" => "<".to_string(), + "gt" => ">".to_string(), + "apos" => "'".to_string(), + "quot" => "\"".to_string(), + other => format!("&{other};"), + }; + Ok(resolved) +} + +/// 从 enclosure 标签读取下载链接和大小。 +fn fill_enclosure(event: &BytesStart<'_>, item: &mut RssItem) -> PyResult<()> { + if !item.enclosure.is_empty() { + return Ok(()); + } + if let Some(url) = attr_value(event, b"url")? { + item.enclosure = url; + } + if let Some(length) = attr_value(event, b"length")? { + item.size = length.trim().parse::().unwrap_or(0); + } + Ok(()) +} + +/// 从 Atom link 的 href 属性读取页面地址。 +fn fill_link_from_href(event: &BytesStart<'_>, item: &mut RssItem) -> PyResult<()> { + if !item.link.is_empty() { + return Ok(()); + } + if let Some(href) = attr_value(event, b"href")? { + item.link = href; + } + Ok(()) +} + +/// 读取并反转义指定属性值。 +fn attr_value(event: &BytesStart<'_>, name: &[u8]) -> PyResult> { + for attr in event.attributes().with_checks(false) { + let attr = attr.map_err(to_py_value_error)?; + if attr.key.as_ref().eq_ignore_ascii_case(name) { + let value = attr + .decode_and_unescape_value(event.decoder()) + .map_err(to_py_value_error)?; + return Ok(Some(value.trim().to_string())); + } + } + Ok(None) +} + +/// 完成单条 RSS item 的兼容性整理,保留原 Python 逻辑的跳过条件。 +fn finalize_item(mut item: RssItem) -> Option { + item.title = item.title.trim().to_string(); + item.description = item.description.trim().to_string(); + item.link = item.link.trim().to_string(); + item.enclosure = item.enclosure.trim().to_string(); + item.pubdate = item.pubdate.trim().to_string(); + item.nickname = item.nickname.trim().to_string(); + + if item.title.is_empty() { + return None; + } + if item.enclosure.is_empty() { + if item.link.is_empty() { + return None; + } + item.enclosure = item.link.clone(); + } + Some(item) +} + +/// 将 Rust 条目转换为 Python dict,字段名保持与 RssHelper.parse 原返回一致。 +fn item_to_py(py: Python<'_>, item: &RssItem) -> PyResult { + let dict = PyDict::new(py); + dict.set_item("title", &item.title)?; + dict.set_item("enclosure", &item.enclosure)?; + dict.set_item("size", item.size)?; + dict.set_item("description", &item.description)?; + dict.set_item("link", &item.link)?; + if let Some(timestamp) = parse_pubdate_timestamp(&item.pubdate) { + dict.set_item("pubdate", py_datetime_from_timestamp(py, timestamp)?)?; + } else { + dict.set_item("pubdate", "")?; + } + if !item.nickname.is_empty() { + dict.set_item("nickname", &item.nickname)?; + } + Ok(dict.into()) +} + +/// 将 Unix 时间戳转换为本地时区 Python datetime,匹配原 astimezone(tz=None) 语义。 +fn py_datetime_from_timestamp<'py>(py: Python<'py>, timestamp: i64) -> PyResult> { + let datetime_mod = py.import("datetime")?; + let datetime_cls = datetime_mod.getattr("datetime")?; + let timezone_cls = datetime_mod.getattr("timezone")?; + let utc = timezone_cls.getattr("utc")?; + let utc_dt = datetime_cls.call_method1("fromtimestamp", (timestamp, utc))?; + utc_dt.call_method0("astimezone") +} + +/// 解析 RSS/Atom 常见日期格式并返回时间戳。 +fn parse_pubdate_timestamp(value: &str) -> Option { + let trimmed = value.trim(); + if trimmed.is_empty() { + return None; + } + if let Ok(datetime) = DateTime::parse_from_rfc2822(trimmed) { + return Some(datetime.timestamp()); + } + if let Ok(datetime) = DateTime::parse_from_rfc3339(trimmed) { + return Some(datetime.timestamp()); + } + if let Some(timestamp) = parse_utc_suffix_datetime(trimmed) { + return Some(timestamp); + } + parse_local_naive_datetime(trimmed) +} + +/// 兼容部分站点输出的 UTC/GMT 文本后缀。 +fn parse_utc_suffix_datetime(value: &str) -> Option { + for suffix in [" UTC", " GMT"] { + let Some(stripped) = value.strip_suffix(suffix) else { + continue; + }; + for format in [ + "%a, %d %b %Y %H:%M:%S", + "%d %b %Y %H:%M:%S", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + ] { + if let Ok(naive) = NaiveDateTime::parse_from_str(stripped.trim(), format) { + return Some(Utc.from_utc_datetime(&naive).timestamp()); + } + } + } + None +} + +/// 解析不带时区的日期格式,并按系统本地时区解释。 +fn parse_local_naive_datetime(value: &str) -> Option { + for format in [ + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%d %H:%M", + "%Y/%m/%d %H:%M:%S", + "%Y/%m/%d %H:%M", + "%d %b %Y %H:%M:%S", + "%a, %d %b %Y %H:%M:%S", + ] { + if let Ok(naive) = NaiveDateTime::parse_from_str(value, format) { + return local_timestamp(naive); + } + } + for format in ["%Y-%m-%d", "%Y/%m/%d", "%d %b %Y"] { + if let Ok(date) = NaiveDate::parse_from_str(value, format) { + return local_timestamp(NaiveDateTime::new(date, NaiveTime::MIN)); + } + } + None +} + +/// 将本地无时区时间转换为时间戳,处理夏令时歧义时取较早值。 +fn local_timestamp(naive: NaiveDateTime) -> Option { + Local + .from_local_datetime(&naive) + .single() + .or_else(|| Local.from_local_datetime(&naive).earliest()) + .map(|datetime| datetime.timestamp()) +} + +/// 判断当前标签是否为 RSS item 或 Atom entry。 +fn is_item_node(local: &str) -> bool { + matches!(local, "item" | "entry") +} + +/// 提取 XML 名称的本地部分,用于兼容 dc:creator 这类命名空间字段。 +fn local_name(name: QName<'_>) -> String { + let raw = std::str::from_utf8(name.as_ref()).unwrap_or_default(); + raw.rsplit_once(':') + .map(|(_, local)| local) + .unwrap_or(raw) + .to_string() +} + +/// 将 quick-xml 错误转换为 Python ValueError 交给 Python 包装层判断是否兜底。 +fn to_py_value_error(err: E) -> PyErr { + pyo3::exceptions::PyValueError::new_err(err.to_string()) +} diff --git a/scripts/benchmark_rss_rust.py b/scripts/benchmark_rss_rust.py new file mode 100644 index 00000000..aa60d31b --- /dev/null +++ b/scripts/benchmark_rss_rust.py @@ -0,0 +1,156 @@ +import argparse +import statistics +import sys +import time +from contextlib import contextmanager +from pathlib import Path +from types import SimpleNamespace + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(PROJECT_ROOT)) + +from app.helper import rss as rss_module +from app.helper.rss import RssHelper +from app.utils import rust_accel + + +class FakeRequestUtils: + """ + 基准测试用 RequestUtils,固定返回内存中的 RSS 文本。 + """ + + xml_text = "" + + def __init__(self, **_kwargs): + """ + 保持与真实 RequestUtils 构造签名兼容。 + """ + + def get_res(self, _url): + """ + 返回 RssHelper.parse 所需的最小响应对象。 + """ + return SimpleNamespace( + status_code=200, + content=self.xml_text.encode("utf-8"), + text=self.xml_text, + apparent_encoding="utf-8", + encoding="utf-8", + ) + + +def build_rss_xml(items: int) -> str: + """ + 构造覆盖标题、描述、链接、enclosure、日期和 creator 的 RSS 文本。 + """ + rows = [] + for index in range(items): + rows.append(f""" + + MoviePilot Benchmark {index} + tag]]> + https://example.com/details/{index} + + Tue, 19 May 2026 08:30:00 GMT + bench-user-{index} + + """) + return f""" + + + {''.join(rows)} + + + """ + + +@contextmanager +def patched_request_utils(xml_text: str): + """ + 临时替换 RSS 请求层,让基准覆盖 RssHelper.parse 的实际解析链路。 + """ + original_request_utils = rss_module.RequestUtils + FakeRequestUtils.xml_text = xml_text + rss_module.RequestUtils = FakeRequestUtils + try: + yield + finally: + rss_module.RequestUtils = original_request_utils + + +def disabled_rust_parse(_xml_text: str, _max_items: int = 1000): + """ + 关闭 Rust 快路径,用同一条 RssHelper.parse 链路测量 Python lxml 兜底性能。 + """ + return None + + +@contextmanager +def selected_rss_parser(use_rust: bool): + """ + 在 Rust 快路径和 Python lxml 解析之间切换,保持请求与编码成本一致。 + """ + original_parse = rss_module.rust_accel.parse_rss_items + if not use_rust: + rss_module.rust_accel.parse_rss_items = disabled_rust_parse + try: + yield + finally: + rss_module.rust_accel.parse_rss_items = original_parse + + +def parse_chain(xml_text: str, use_rust: bool): + """ + 执行一次 RssHelper.parse,返回解析到的 RSS 条目。 + """ + with patched_request_utils(xml_text), selected_rss_parser(use_rust): + return RssHelper().parse("https://example.com/rss") + + +def measure_chain(xml_text: str, use_rust: bool, loops: int, repeats: int): + """ + 多轮测量 RssHelper.parse 平均耗时,并校验每轮解析数量稳定。 + """ + samples = [] + parsed_count = 0 + for _ in range(repeats): + start = time.perf_counter() + for _ in range(loops): + parsed = parse_chain(xml_text, use_rust) + parsed_count = len(parsed) + samples.append((time.perf_counter() - start) * 1000 / loops) + return statistics.median(samples), parsed_count + + +def parse_args(): + """ + 解析命令行参数。 + """ + parser = argparse.ArgumentParser(description="Benchmark RSS parsing through RssHelper.parse") + parser.add_argument("--items", type=int, default=200, help="RSS item count") + parser.add_argument("--loops", type=int, default=50, help="Loops per repeat") + parser.add_argument("--repeats", type=int, default=5, help="Repeat count") + return parser.parse_args() + + +def main() -> int: + """ + 运行 Rust 与 Python RSS 解析链路基准测试。 + """ + args = parse_args() + xml_text = build_rss_xml(args.items) + rust_ms, rust_count = measure_chain(xml_text, use_rust=True, loops=args.loops, repeats=args.repeats) + python_ms, python_count = measure_chain(xml_text, use_rust=False, loops=args.loops, repeats=args.repeats) + speedup = python_ms / rust_ms if rust_ms else 0 + + print(f"rust_available={rust_accel.is_available()}") + print(f"items={args.items} loops={args.loops} repeats={args.repeats}") + print(f"rust_items={rust_count} python_items={python_count}") + print(f"rust_chain_ms_per_loop={rust_ms:.3f}") + print(f"python_chain_ms_per_loop={python_ms:.3f}") + print(f"speedup={speedup:.2f}x") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_rust_accel.py b/tests/test_rust_accel.py index cc9b2b14..c8afb4e2 100644 --- a/tests/test_rust_accel.py +++ b/tests/test_rust_accel.py @@ -1,5 +1,10 @@ +from datetime import datetime, timezone +from types import SimpleNamespace + import pytest +from app.helper import rss as rss_module +from app.helper.rss import RssHelper from app.modules.indexer.spider import SiteSpider from app.schemas.types import MediaType from app.utils import rust_accel @@ -29,6 +34,127 @@ def test_rust_filter_rule_parser_handles_parentheses_and_or(): assert result == [[["CNSUB", "and", ["4K", "or", "1080P"]], "and", ["not", "BLU"]]] +def test_rust_rss_parser_extracts_rss_and_atom_items(): + """ + Rust RSS解析应覆盖 RSS item、Atom entry、命名空间和日期字段。 + """ + xml = """ + + + + + Movie & Show + bold]]> + https://example.com/details/1 + + Tue, 19 May 2026 08:30:00 GMT + 豆瓣用户 + + + + + + Atom Title + Atom Summary + + 2026-05-19T09:30:00Z + + + + """ + + result = rust_accel.parse_rss_items(xml, max_items=100) + + assert len(result) == 2 + assert result[0]["title"] == "Movie & Show" + assert result[0]["description"] == "Desc bold" + assert result[0]["link"] == "https://example.com/details/1" + assert result[0]["enclosure"] == "https://example.com/download/1.torrent" + assert result[0]["size"] == 123456 + assert result[0]["nickname"] == "豆瓣用户" + assert int(result[0]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 8, 30, tzinfo=timezone.utc).timestamp()) + assert result[1]["title"] == "Atom Title" + assert result[1]["description"] == "Atom Summary" + assert result[1]["link"] == "https://example.com/atom/2" + assert result[1]["enclosure"] == "https://example.com/atom/2" + assert int(result[1]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 9, 30, tzinfo=timezone.utc).timestamp()) + + +def test_rust_rss_parser_skips_incomplete_items(): + """ + Rust RSS解析应保持原逻辑,跳过无标题或无链接的条目。 + """ + xml = """ + + + https://example.com/a + No Link + OKhttps://example.com/ok + + + """ + + result = rust_accel.parse_rss_items(xml, max_items=100) + + assert result == [{ + "title": "OK", + "enclosure": "https://example.com/ok", + "size": 0, + "description": "", + "link": "https://example.com/ok", + "pubdate": "", + }] + + +def test_rss_helper_parse_uses_rust_parser(monkeypatch): + """ + RssHelper.parse 应在请求和编码处理后直接使用 Rust 解析结果。 + """ + xml = """ + + + + Helper Title + Helper Description + https://example.com/details/3 + 2026-05-19T10:30:00Z + + + + """ + + class FakeRequestUtils: + """ + 测试用 RequestUtils,避免真实网络请求。 + """ + + def __init__(self, **_kwargs): + """ + 保存构造参数占位,兼容 RssHelper 的调用方式。 + """ + + def get_res(self, _url): + """ + 返回带 content/text/status_code 的最小响应对象。 + """ + return SimpleNamespace( + status_code=200, + content=xml.encode("utf-8"), + text=xml, + apparent_encoding="utf-8", + encoding="utf-8", + ) + + monkeypatch.setattr(rss_module, "RequestUtils", FakeRequestUtils) + + result = RssHelper().parse("https://example.com/rss") + + assert len(result) == 1 + assert result[0]["title"] == "Helper Title" + assert result[0]["enclosure"] == "https://example.com/details/3" + assert int(result[0]["pubdate"].timestamp()) == int(datetime(2026, 5, 19, 10, 30, tzinfo=timezone.utc).timestamp()) + + def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links(): """ Rust indexer 解析应覆盖普通站点配置的 Jinja、PyQuery selector 和过滤器。