From 7cbfeb2377a701cf4c0828f50c69acff0a53bed4 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Sat, 23 May 2026 09:17:32 +0800 Subject: [PATCH] refactor: slim rust acceleration surface --- app/core/meta/metavideo.py | 161 +-- app/core/metainfo.py | 7 - app/helper/rss.py | 33 - app/modules/filter/__init__.py | 44 - app/modules/indexer/spider/__init__.py | 24 - app/utils/rust_accel.py | 147 +-- rust/moviepilot_rust/Cargo.lock | 795 --------------- rust/moviepilot_rust/Cargo.toml | 7 - rust/moviepilot_rust/src/filter.rs | 418 +------- rust/moviepilot_rust/src/indexer.rs | 1151 ---------------------- rust/moviepilot_rust/src/lib.rs | 11 - rust/moviepilot_rust/src/meta.rs | 1247 ------------------------ rust/moviepilot_rust/src/rss.rs | 204 ---- rust/moviepilot_rust/src/utils.rs | 103 -- tests/test_rust_accel.py | 446 +-------- 15 files changed, 61 insertions(+), 4737 deletions(-) delete mode 100644 rust/moviepilot_rust/src/indexer.rs delete mode 100644 rust/moviepilot_rust/src/meta.rs delete mode 100644 rust/moviepilot_rust/src/rss.rs delete mode 100644 rust/moviepilot_rust/src/utils.rs diff --git a/app/core/meta/metavideo.py b/app/core/meta/metavideo.py index ec875f49..bc1a77c0 100644 --- a/app/core/meta/metavideo.py +++ b/app/core/meta/metavideo.py @@ -11,7 +11,6 @@ from app.schemas.types import MediaType from app.utils.string import StringUtils from app.utils.tokens import Tokens from app.core.meta.streamingplatform import StreamingPlatforms -from app.utils import rust_accel class MetaVideo(MetaBase): @@ -104,60 +103,58 @@ class MetaVideo(MetaBase): # 把年月日去掉 title = re.sub(r'\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}', "", title) media_exts = settings.RMT_MEDIAEXT + settings.RMT_SUBEXT + settings.RMT_AUDIOEXT - rust_parse = rust_accel.parse_video_title(title, isfile=isfile, media_exts=media_exts) - if not self.__apply_rust_parse(rust_parse): - # 拆分tokens - tokens = Tokens(title) - # 实例化StreamingPlatforms对象 - streaming_platforms = StreamingPlatforms() - # 解析名称、年份、季、集、资源类型、分辨率等 + # 拆分tokens + tokens = Tokens(title) + # 实例化StreamingPlatforms对象 + streaming_platforms = StreamingPlatforms() + # 解析名称、年份、季、集、资源类型、分辨率等 + token = tokens.get_next() + while token: + self._index += 1 # 更新当前处理的token索引 + # Part + self.__init_part(token, tokens) + # 标题 + if self._continue_flag: + self.__init_name(token, media_exts) + # 年份 + if self._continue_flag: + self.__init_year(token) + # 分辨率 + if self._continue_flag: + self.__init_resource_pix(token) + # 季 + if self._continue_flag: + self.__init_season(token) + # 集 + if self._continue_flag: + self.__init_episode(token) + # 资源类型 + if self._continue_flag: + self.__init_resource_type(token) + # 流媒体平台 + if self._continue_flag: + self.__init_web_source(token, tokens, streaming_platforms) + # 视频编码 + if self._continue_flag: + self.__init_video_encode(token) + # 视频位深 + if self._continue_flag: + self.__init_video_bit(token) + # 音频编码 + if self._continue_flag: + self.__init_audio_encode(token) + # 帧率 + if self._continue_flag: + self.__init_fps(token) + # 取下一个,直到没有为卡 token = tokens.get_next() - while token: - self._index += 1 # 更新当前处理的token索引 - # Part - self.__init_part(token, tokens) - # 标题 - if self._continue_flag: - self.__init_name(token, media_exts) - # 年份 - if self._continue_flag: - self.__init_year(token) - # 分辨率 - if self._continue_flag: - self.__init_resource_pix(token) - # 季 - if self._continue_flag: - self.__init_season(token) - # 集 - if self._continue_flag: - self.__init_episode(token) - # 资源类型 - if self._continue_flag: - self.__init_resource_type(token) - # 流媒体平台 - if self._continue_flag: - self.__init_web_source(token, tokens, streaming_platforms) - # 视频编码 - if self._continue_flag: - self.__init_video_encode(token) - # 视频位深 - if self._continue_flag: - self.__init_video_bit(token) - # 音频编码 - if self._continue_flag: - self.__init_audio_encode(token) - # 帧率 - if self._continue_flag: - self.__init_fps(token) - # 取下一个,直到没有为卡 - token = tokens.get_next() - self._continue_flag = True - # 合成质量 - if self._effect: - self._effect.reverse() - self.resource_effect = " ".join(self._effect) - if self._source: - self.resource_type = self._source.strip() + self._continue_flag = True + # 合成质量 + if self._effect: + self._effect.reverse() + self.resource_effect = " ".join(self._effect) + if self._source: + self.resource_type = self._source.strip() # 提取原盘DIY if self.resource_type and "BluRay" in self.resource_type: if (self.subtitle and re.findall(r'D[Ii]Y', self.subtitle)) \ @@ -188,62 +185,6 @@ class MetaVideo(MetaBase): if not self.video_bit: self.video_bit = self.extract_video_bit(self.video_encode) - def __apply_rust_parse(self, rust_parse: Optional[dict]) -> bool: - """ - 应用 Rust 主识别结果;成功时跳过 Python token 主循环。 - """ - if not rust_parse or not rust_parse.get("complete"): - return False - self.cn_name = rust_parse.get("cn_name") - self.en_name = rust_parse.get("en_name") - if rust_parse.get("year"): - self.year = str(rust_parse.get("year")) - self.part = rust_parse.get("part") - self.__merge_rust_parse(rust_parse) - media_type = rust_parse.get("type") - if media_type == "tv": - self.type = MediaType.TV - elif media_type == "movie": - self.type = MediaType.MOVIE - return True - - def __merge_rust_parse(self, rust_parse: Optional[dict]) -> None: - """ - 合并 Rust 预解析结果,仅补齐 Python 识别未命中的资源字段。 - """ - if not rust_parse: - return - if not self.year and rust_parse.get("year"): - self.year = str(rust_parse.get("year")) - if self.begin_season is None and rust_parse.get("begin_season") is not None: - self.begin_season = int(rust_parse.get("begin_season")) - self.type = MediaType.TV - if self.end_season is None and rust_parse.get("end_season") is not None: - self.end_season = int(rust_parse.get("end_season")) - if not self.total_season and rust_parse.get("total_season"): - self.total_season = int(rust_parse.get("total_season")) - if self.begin_episode is None and rust_parse.get("begin_episode") is not None: - self.begin_episode = int(rust_parse.get("begin_episode")) - self.type = MediaType.TV - if self.end_episode is None and rust_parse.get("end_episode") is not None: - self.end_episode = int(rust_parse.get("end_episode")) - if not self.total_episode and rust_parse.get("total_episode"): - self.total_episode = int(rust_parse.get("total_episode")) - if not self.resource_pix and rust_parse.get("resource_pix"): - self.resource_pix = rust_parse.get("resource_pix") - if not self.resource_type and rust_parse.get("resource_type"): - self.resource_type = rust_parse.get("resource_type") - if not self.resource_effect and rust_parse.get("resource_effect"): - self.resource_effect = rust_parse.get("resource_effect") - if not self.video_encode and rust_parse.get("video_encode"): - self.video_encode = rust_parse.get("video_encode") - if not self.video_bit and rust_parse.get("video_bit"): - self.video_bit = rust_parse.get("video_bit") - if not self.audio_encode and rust_parse.get("audio_encode"): - self.audio_encode = rust_parse.get("audio_encode") - if self.fps is None and rust_parse.get("fps") is not None: - self.fps = int(rust_parse.get("fps")) - @staticmethod def __get_title_from_description(description: str) -> Optional[str]: """ diff --git a/app/core/metainfo.py b/app/core/metainfo.py index b3000219..10201648 100644 --- a/app/core/metainfo.py +++ b/app/core/metainfo.py @@ -12,7 +12,6 @@ from app.core.meta.infopath import ( from app.core.meta.words import WordsMatcher from app.log import logger from app.schemas.types import MediaType -from app.utils import rust_accel _ANIME_BRACKET_RE = re.compile(r'【[+0-9XVPI-]+】\s*【', re.IGNORECASE) @@ -169,9 +168,6 @@ def is_anime(name: str) -> bool: :param name: 名称 :return: 是否动漫 """ - rust_result = rust_accel.is_anime(name) - if rust_result is not None: - return rust_result if not name: return False if _ANIME_BRACKET_RE.search(name): @@ -189,9 +185,6 @@ def find_metainfo(title: str) -> Tuple[str, dict]: """ 从标题中提取媒体信息 """ - rust_result = rust_accel.find_metainfo(title) - if rust_result is not None: - return rust_result metainfo = _empty_metainfo() if not title: return title, metainfo diff --git a/app/helper/rss.py b/app/helper/rss.py index 2418a1c0..5257ff0c 100644 --- a/app/helper/rss.py +++ b/app/helper/rss.py @@ -9,7 +9,6 @@ from lxml import etree from app.core.config import settings from app.helper.browser import PlaywrightHelper from app.log import logger -from app.utils import rust_accel from app.utils.http import RequestUtils from app.utils.string import StringUtils @@ -228,32 +227,6 @@ class RssHelper: }, } - @staticmethod - def __format_rust_items(items: List[dict]) -> List[dict]: - """ - 将 Rust RSS 解析结果转换为原 Python XPath 解析返回结构。 - """ - ret_array = [] - for item in items: - pubdate = "" - pubdate_raw = item.get("pubdate_raw") - if pubdate_raw: - pubdate = StringUtils.get_time(pubdate_raw) - if pubdate is not None: - pubdate = pubdate.astimezone(tz=None) - tmp_dict = { - 'title': item.get("title") or "", - 'enclosure': item.get("enclosure") or "", - 'size': item.get("size") or 0, - 'description': item.get("description") or "", - 'link': item.get("link") or "", - 'pubdate': pubdate - } - if item.get("nickname"): - tmp_dict['nickname'] = item.get("nickname") - ret_array.append(tmp_dict) - return ret_array - def parse(self, url, proxy: bool = False, timeout: Optional[int] = 15, headers: dict = None, ua: str = None) -> Union[List[dict], None, bool]: """ @@ -325,12 +298,6 @@ class RssHelper: logger.error("RSS内容不是有效的XML格式") return False - rust_items = rust_accel.parse_rss_items(ret_xml, self.MAX_RSS_ITEMS) - if rust_items is not None: - if len(rust_items) >= self.MAX_RSS_ITEMS: - logger.warning(f"RSS条目过多,仅处理前{self.MAX_RSS_ITEMS}个") - return self.__format_rust_items(rust_items) - # 使用lxml.etree解析XML parser = None try: diff --git a/app/modules/filter/__init__.py b/app/modules/filter/__init__.py index 232d1c72..f6192835 100644 --- a/app/modules/filter/__init__.py +++ b/app/modules/filter/__init__.py @@ -11,7 +11,6 @@ from app.modules import _ModuleBase from app.modules.filter.RuleParser import RuleParser from app.modules.filter.builtin_rules import BUILTIN_RULE_SET from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey -from app.utils import rust_accel from app.utils.string import StringUtils @@ -139,9 +138,6 @@ class FilterModule(_ModuleBase): # 查询规则表详情 groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups) if groups: - rust_filtered = self.__filter_torrents_by_rust(groups, torrent_list, mediainfo) - if rust_filtered is not None: - return rust_filtered for group in groups: # 过滤种子 torrent_list = self.__filter_torrents( @@ -154,46 +150,6 @@ class FilterModule(_ModuleBase): ) return torrent_list - def __filter_torrents_by_rust(self, groups: list, torrent_list: List[TorrentInfo], - mediainfo: MediaInfo) -> Optional[List[TorrentInfo]]: - """ - 使用 Rust 批量过滤种子;遇到不可支持的规则时返回 None 交由 Python 逻辑处理。 - """ - if not torrent_list: - return [] - payloads = [self.__build_rust_torrent_payload(torrent) for torrent in torrent_list] - media_payload = mediainfo.to_dict() if mediainfo and hasattr(mediainfo, "to_dict") else ( - vars(mediainfo).copy() if mediainfo else None - ) - result = rust_accel.filter_torrents( - rule_set=self.rule_set, - rule_strings=[group.rule_string for group in groups], - torrents=payloads, - media_info=media_payload, - ) - if result is None: - return None - filtered_torrents = [] - for index, pri_order in result: - torrent = torrent_list[int(index)] - torrent.pri_order = int(pri_order) - filtered_torrents.append(torrent) - return filtered_torrents - - @staticmethod - def __build_rust_torrent_payload(torrent: TorrentInfo) -> dict: - """ - 组装 Rust 过滤器需要的纯数据载荷,避免 Rust 直接依赖 Python 业务对象。 - """ - payload = torrent.to_dict() if hasattr(torrent, "to_dict") else vars(torrent).copy() - payload["pub_minutes"] = torrent.pub_minutes() - if payload.get("size"): - meta = MetaInfo(title=torrent.title, subtitle=torrent.description) - payload["episode_count"] = meta.total_episode or 1 - else: - payload["episode_count"] = 1 - return payload - def __filter_torrents(self, rule_string: str, rule_name: str, torrent_list: List[TorrentInfo], mediainfo: MediaInfo, diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index 0310cd9b..c3059c30 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -12,7 +12,6 @@ from pyquery import PyQuery from app.core.config import settings from app.log import logger from app.schemas.types import MediaType -from app.utils import rust_accel from app.utils.http import RequestUtils, AsyncRequestUtils from app.utils.string import StringUtils from app.utils.url import UrlUtils @@ -96,19 +95,6 @@ class SiteSpider: """ 获取搜索URL """ - rust_url = rust_accel.build_indexer_search_url({ - "search": self.search, - "batch": self.batch, - "browse": self.browse, - "category": self.category, - "domain": self.domain, - "keyword": self.keyword, - "mtype": self.mtype.value if self.mtype else None, - "cat": self.cat, - "page": self.page, - }) - if rust_url: - return rust_url # 种子搜索相对路径 paths = self.search.get('paths', []) torrentspath = "" @@ -753,16 +739,6 @@ class SiteSpider: # 清空旧结果 self.torrents_info_array = [] - rust_torrents = rust_accel.parse_indexer_torrents( - html_text=html_text, - domain=self.domain, - list_config=self.list, - fields=self.fields, - category=self.category, - result_num=int(self.result_num), - ) - if rust_torrents is not None: - return rust_torrents html_doc = None try: # 解析站点文本对象 diff --git a/app/utils/rust_accel.py b/app/utils/rust_accel.py index d9e0f87e..4297ca2f 100644 --- a/app/utils/rust_accel.py +++ b/app/utils/rust_accel.py @@ -1,7 +1,6 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Optional from app.log import logger -from app.schemas.types import MediaType try: import moviepilot_rust as _moviepilot_rust @@ -26,64 +25,6 @@ def import_error() -> Optional[Exception]: return _import_error -def is_anime(name: str) -> Optional[bool]: - """ - 使用 Rust 快路径判断标题是否为动漫格式,不可用时返回 None。 - """ - if not _moviepilot_rust: - return None - try: - return bool(_moviepilot_rust.is_anime_fast(name or "")) - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust 动漫识别失败,回退 Python:{err}") - return None - - -def find_metainfo(title: str) -> Optional[Tuple[str, Dict[str, Any]]]: - """ - 使用 Rust 快路径提取标题中的内嵌媒体标签,不可用时返回 None。 - """ - if not _moviepilot_rust: - return None - try: - result = _moviepilot_rust.find_metainfo_fast(title or "") - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust 内嵌媒体标签识别失败,回退 Python:{err}") - return None - metainfo = { - "tmdbid": result.get("tmdbid"), - "doubanid": result.get("doubanid"), - "type": _coerce_media_type(result.get("type")), - "begin_season": result.get("begin_season"), - "end_season": result.get("end_season"), - "total_season": result.get("total_season"), - "begin_episode": result.get("begin_episode"), - "end_episode": result.get("end_episode"), - "total_episode": result.get("total_episode"), - } - return result.get("title"), metainfo - - -def parse_video_title( - title: str, - isfile: bool = False, - media_exts: Optional[List[str]] = None, -) -> Optional[Dict[str, Any]]: - """ - 使用 Rust 执行影视标题主识别流程,不可用时返回 None。 - """ - if not _moviepilot_rust: - return None - try: - return _moviepilot_rust.parse_video_title_fast(title or "", isfile, media_exts or []) - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust 影视标题主识别失败,回退 Python:{err}") - return None - - def parse_filter_rule(expression: str) -> Optional[list]: """ 使用 Rust 解析过滤规则表达式,不可用时返回 None。 @@ -98,92 +39,6 @@ def parse_filter_rule(expression: str) -> Optional[list]: return None -def filter_torrents( - rule_set: Dict[str, dict], - rule_strings: List[str], - torrents: List[dict], - media_info: Optional[dict] = None, -) -> Optional[list]: - """ - 使用 Rust 批量执行种子过滤,不可用或不兼容时返回 None。 - """ - if not _moviepilot_rust: - return None - try: - return _moviepilot_rust.filter_torrents_fast(rule_set, rule_strings, torrents, media_info) - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust 种子过滤失败,回退 Python:{err}") - return None - - -def build_indexer_search_url(config: dict) -> Optional[str]: - """ - 使用 Rust 根据普通 indexer 配置生成搜索 URL,不可用时返回 None。 - """ - if not _moviepilot_rust: - return None - try: - return _moviepilot_rust.build_indexer_search_url_fast(config) - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust 站点搜索 URL 生成失败,回退 Python:{err}") - return None - - -def parse_indexer_torrents( - html_text: str, - domain: str, - list_config: dict, - fields: dict, - category: Optional[dict], - result_num: int, -) -> Optional[List[dict]]: - """ - 使用 Rust 批量解析普通 indexer 页面,不支持的配置返回 None。 - """ - if not _moviepilot_rust: - return None - try: - return _moviepilot_rust.parse_indexer_torrents_fast( - html_text or "", - domain or "", - list_config or {}, - fields or {}, - category, - int(result_num or 0), - ) - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust 站点页面解析失败,回退 Python:{err}") - return None - - -def parse_rss_items(xml_text: str, max_items: int) -> Optional[List[dict]]: - """ - 使用 Rust 批量解析 RSS/Atom 条目,不可用或解析失败时返回 None。 - """ - if not _moviepilot_rust: - return None - try: - return _moviepilot_rust.parse_rss_items_fast(xml_text or "", int(max_items or 0)) - except BaseException as err: - _raise_non_rust_panic(err) - logger.debug(f"Rust RSS 条目解析失败,回退 Python:{err}") - return None - - -def _coerce_media_type(value: Optional[str]) -> Optional[MediaType]: - """ - 将 Rust 返回的媒体类型字符串转换为系统 MediaType。 - """ - if value == "movies": - return MediaType.MOVIE - if value == "tv": - return MediaType.TV - return None - - def _raise_non_rust_panic(err: BaseException) -> None: """ 只吞掉 Rust 扩展 panic/异常,保留用户中断和进程退出语义。 diff --git a/rust/moviepilot_rust/Cargo.lock b/rust/moviepilot_rust/Cargo.lock index 042157d0..8d32992b 100644 --- a/rust/moviepilot_rust/Cargo.lock +++ b/rust/moviepilot_rust/Cargo.lock @@ -2,272 +2,24 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "aho-corasick" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" -dependencies = [ - "memchr", -] - [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "bitflags" -version = "2.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" -[[package]] -name = "cssparser" -version = "0.35.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e901edd733a1472f944a45116df3f846f54d37e67e68640ac8bb69689aca2aa" -dependencies = [ - "cssparser-macros", - "dtoa-short", - "itoa", - "phf", - "smallvec", -] - -[[package]] -name = "cssparser-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" -dependencies = [ - "quote", - "syn", -] - -[[package]] -name = "derive_more" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" -dependencies = [ - "derive_more-impl", -] - -[[package]] -name = "derive_more-impl" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" -dependencies = [ - "proc-macro2", - "quote", - "rustc_version", - "syn", -] - -[[package]] -name = "displaydoc" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "dtoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" - -[[package]] -name = "dtoa-short" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" -dependencies = [ - "dtoa", -] - -[[package]] -name = "ego-tree" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" - -[[package]] -name = "form_urlencoded" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "getopts" -version = "0.2.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" -dependencies = [ - "unicode-width", -] - [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "html5ever" -version = "0.35.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" -dependencies = [ - "log", - "markup5ever", - "match_token", -] - -[[package]] -name = "icu_collections" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" -dependencies = [ - "displaydoc", - "potential_utf", - "utf8_iter", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locale_core" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" -dependencies = [ - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" - -[[package]] -name = "icu_properties" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" -dependencies = [ - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - "zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" - -[[package]] -name = "icu_provider" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" -dependencies = [ - "displaydoc", - "icu_locale_core", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "idna" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" -dependencies = [ - "idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - [[package]] name = "indoc" version = "2.0.7" @@ -277,79 +29,12 @@ dependencies = [ "rustversion", ] -[[package]] -name = "itoa" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" - [[package]] name = "libc" version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" -[[package]] -name = "litemap" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" - -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" - -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - -[[package]] -name = "markup5ever" -version = "0.35.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" -dependencies = [ - "log", - "tendril", - "web_atoms", -] - -[[package]] -name = "match_token" -version = "0.35.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "memchr" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" - -[[package]] -name = "memo-map" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" - [[package]] name = "memoffset" version = "0.9.1" @@ -359,144 +44,25 @@ dependencies = [ "autocfg", ] -[[package]] -name = "minijinja" -version = "2.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2929e494b2280e1e18959bb2e121da03347ae896896fdfaceaab43c88a02803f" -dependencies = [ - "memo-map", - "serde", -] - [[package]] name = "moviepilot-rust" version = "0.1.0" dependencies = [ - "minijinja", - "once_cell", - "percent-encoding", "pyo3", - "quick-xml", - "regex", - "scraper", - "url", ] -[[package]] -name = "new_debug_unreachable" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" - [[package]] name = "once_cell" version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - -[[package]] -name = "percent-encoding" -version = "2.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" - -[[package]] -name = "phf" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" -dependencies = [ - "phf_macros", - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" -dependencies = [ - "phf_shared", - "rand", -] - -[[package]] -name = "phf_macros" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" -dependencies = [ - "phf_generator", - "phf_shared", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "phf_shared" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" -dependencies = [ - "siphasher", -] - [[package]] name = "portable-atomic" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" -[[package]] -name = "potential_utf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" -dependencies = [ - "zerovec", -] - -[[package]] -name = "precomputed-hash" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" - [[package]] name = "proc-macro2" version = "1.0.106" @@ -569,15 +135,6 @@ dependencies = [ "syn", ] -[[package]] -name = "quick-xml" -version = "0.38.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" -dependencies = [ - "memchr", -] - [[package]] name = "quote" version = "1.0.45" @@ -587,201 +144,12 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" -dependencies = [ - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" - -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "scraper" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5f3a24d916e78954af99281a455168d4a9515d65eca99a18da1b813689c4ad9" -dependencies = [ - "cssparser", - "ego-tree", - "getopts", - "html5ever", - "precomputed-hash", - "selectors", - "tendril", -] - -[[package]] -name = "selectors" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5685b6ae43bfcf7d2e7dfcfb5d8e8f61b46442c902531e41a32a9a8bf0ee0fb6" -dependencies = [ - "bitflags", - "cssparser", - "derive_more", - "fxhash", - "log", - "new_debug_unreachable", - "phf", - "phf_codegen", - "precomputed-hash", - "servo_arc", - "smallvec", -] - -[[package]] -name = "semver" -version = "1.0.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "servo_arc" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" -dependencies = [ - "stable_deref_trait", -] - -[[package]] -name = "siphasher" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" - -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - -[[package]] -name = "stable_deref_trait" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" - -[[package]] -name = "string_cache" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" -dependencies = [ - "new_debug_unreachable", - "parking_lot", - "phf_shared", - "precomputed-hash", - "serde", -] - -[[package]] -name = "string_cache_codegen" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" -dependencies = [ - "phf_generator", - "phf_shared", - "proc-macro2", - "quote", -] - [[package]] name = "syn" version = "2.0.117" @@ -793,183 +161,20 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "target-lexicon" version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" -[[package]] -name = "tendril" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" -dependencies = [ - "futf", - "mac", - "utf-8", -] - -[[package]] -name = "tinystr" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" -dependencies = [ - "displaydoc", - "zerovec", -] - [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" -[[package]] -name = "unicode-width" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" - [[package]] name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - -[[package]] -name = "url" -version = "2.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", - "serde", -] - -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - -[[package]] -name = "web_atoms" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" -dependencies = [ - "phf", - "phf_codegen", - "string_cache", - "string_cache_codegen", -] - -[[package]] -name = "windows-link" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" - -[[package]] -name = "writeable" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" - -[[package]] -name = "yoke" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" -dependencies = [ - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "zerofrom" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "zerotrie" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/rust/moviepilot_rust/Cargo.toml b/rust/moviepilot_rust/Cargo.toml index 2b74ff47..cbb57cc8 100644 --- a/rust/moviepilot_rust/Cargo.toml +++ b/rust/moviepilot_rust/Cargo.toml @@ -8,11 +8,4 @@ name = "moviepilot_rust" crate-type = ["cdylib"] [dependencies] -minijinja = "2.20" -once_cell = "1.20" -percent-encoding = "2.3" pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] } -quick-xml = "0.38" -regex = "1.11" -scraper = "0.24" -url = "2.5" diff --git a/rust/moviepilot_rust/src/filter.rs b/rust/moviepilot_rust/src/filter.rs index 69c405e8..4391aa93 100644 --- a/rust/moviepilot_rust/src/filter.rs +++ b/rust/moviepilot_rust/src/filter.rs @@ -1,9 +1,6 @@ -use crate::utils::{get_optional_f64, get_optional_i64, get_optional_string}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList, PyString}; -use regex::{Regex, RegexBuilder}; -use std::collections::HashMap; +use pyo3::types::{PyList, PyString}; #[derive(Clone, Debug)] enum RuleExpr { @@ -22,27 +19,6 @@ enum Token { LParen, RParen, } - -#[derive(Clone, Debug)] -struct TorrentPayload { - index: usize, - title: String, - description: String, - labels: Vec, - size: f64, - seeders: i64, - downloadvolumefactor: Option, - pub_minutes: f64, - episode_count: f64, - fields: HashMap, -} - -#[derive(Clone, Debug)] -enum FieldValue { - Scalar(String), - List(Vec), -} - #[pyfunction] pub(crate) fn parse_filter_rule_fast(py: Python<'_>, expression: &str) -> PyResult { let tokens = tokenize_rule(expression)?; @@ -56,85 +32,6 @@ pub(crate) fn parse_filter_rule_fast(py: Python<'_>, expression: &str) -> PyResu Ok(outer.into()) } -/// 批量执行种子过滤规则,返回保留项的原始下标和优先级。 -#[pyfunction] -#[pyo3(signature = (rule_set, rule_strings, torrents, media_info=None))] -pub(crate) fn filter_torrents_fast( - py: Python<'_>, - rule_set: &Bound<'_, PyDict>, - rule_strings: Vec, - torrents: &Bound<'_, PyList>, - media_info: Option<&Bound<'_, PyDict>>, -) -> PyResult { - py.allow_threads(|| {}); - let mut payloads = Vec::with_capacity(torrents.len()); - for index in 0..torrents.len() { - let item = torrents.get_item(index)?; - let dict = item.downcast::()?; - payloads.push(TorrentPayload::from_py_dict(index, dict)?); - } - - let mut expr_cache: HashMap = HashMap::new(); - let mut regex_cache: HashMap = HashMap::new(); - let mut current_indices: Vec = (0..payloads.len()).collect(); - let mut priorities: HashMap = HashMap::new(); - - for rule_string in rule_strings { - if current_indices.is_empty() { - break; - } - let levels: Vec = rule_string - .split('>') - .map(|level| level.trim().to_string()) - .collect(); - let mut retained = Vec::new(); - for payload_index in ¤t_indices { - let payload = &payloads[*payload_index]; - let mut res_order = 100_i64; - let mut matched = false; - for level in &levels { - let expr = if let Some(cached) = expr_cache.get(level) { - cached.clone() - } else { - let parsed = parse_rule_expression(level)?; - expr_cache.insert(level.clone(), parsed.clone()); - parsed - }; - if match_expr(&expr, payload, rule_set, media_info, &mut regex_cache)? { - matched = true; - priorities.insert(payload.index, res_order); - break; - } - res_order -= 1; - } - if matched { - retained.push(*payload_index); - } - } - current_indices = retained; - } - - let result = PyList::empty(py); - for payload_index in current_indices { - let payload = &payloads[payload_index]; - result.append(( - payload.index, - priorities.get(&payload.index).copied().unwrap_or(0), - ))?; - } - Ok(result.into()) -} - -fn parse_rule_expression(expression: &str) -> PyResult { - let tokens = tokenize_rule(expression)?; - let mut parser = RuleParserState::new(tokens); - let expr = parser.parse_expression()?; - if parser.has_remaining() { - return Err(PyValueError::new_err("规则表达式包含无法解析的剩余内容")); - } - Ok(expr) -} - /// 将规则字符串切分为名称、逻辑符和括号。 fn tokenize_rule(expression: &str) -> PyResult> { let chars: Vec = expression.chars().collect(); @@ -325,316 +222,3 @@ fn expr_binary_to_py( list.append(expr_to_py(py, right)?)?; Ok(list.into()) } - -impl TorrentPayload { - /// 从 Python 字典构造 Rust 过滤载荷。 - fn from_py_dict(index: usize, dict: &Bound<'_, PyDict>) -> PyResult { - let title = get_optional_string(dict, "title")?.unwrap_or_default(); - let description = get_optional_string(dict, "description")?.unwrap_or_default(); - let labels = get_string_list(dict, "labels")?; - let size = get_optional_f64(dict, "size")?.unwrap_or(0.0); - let seeders = get_optional_i64(dict, "seeders")?.unwrap_or(0); - let downloadvolumefactor = get_optional_f64(dict, "downloadvolumefactor")?; - let pub_minutes = get_optional_f64(dict, "pub_minutes")?.unwrap_or(0.0); - let episode_count = get_optional_f64(dict, "episode_count")? - .unwrap_or(1.0) - .max(1.0); - let mut fields = HashMap::new(); - for (key, value) in dict.iter() { - let key = key.extract::()?; - if value.is_none() { - continue; - } - if let Ok(values) = value.extract::>() { - fields.insert(key, FieldValue::List(values)); - } else { - fields.insert(key, FieldValue::Scalar(value.str()?.to_str()?.to_string())); - } - } - Ok(Self { - index, - title, - description, - labels, - size, - seeders, - downloadvolumefactor, - pub_minutes, - episode_count, - fields, - }) - } - - /// 返回指定字段的匹配文本。 - fn content_for_matches(&self, match_fields: &[String]) -> String { - if match_fields.is_empty() { - return format!( - "{} {} {}", - self.title, - self.description, - self.labels.join(" ") - ); - } - let mut parts = Vec::new(); - for field in match_fields { - if let Some(value) = self.fields.get(field) { - match value { - FieldValue::Scalar(text) => { - if !text.is_empty() { - parts.push(text.clone()); - } - } - FieldValue::List(values) => { - parts.extend(values.iter().filter(|v| !v.is_empty()).cloned()) - } - } - } - } - parts.join(" ") - } -} - -/// 从 Python 字典读取字符串列表。 -fn get_string_list(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { - let Some(value) = dict.get_item(key)? else { - return Ok(Vec::new()); - }; - if value.is_none() { - return Ok(Vec::new()); - } - if let Ok(values) = value.extract::>() { - return Ok(values); - } - Ok(vec![value.str()?.to_str()?.to_string()]) -} - -/// 执行规则 AST 匹配。 -fn match_expr( - expr: &RuleExpr, - torrent: &TorrentPayload, - rule_set: &Bound<'_, PyDict>, - media_info: Option<&Bound<'_, PyDict>>, - regex_cache: &mut HashMap, -) -> PyResult { - match expr { - RuleExpr::Name(name) => match_rule(name, torrent, rule_set, media_info, regex_cache), - RuleExpr::Not(inner) => Ok(!match_expr( - inner, - torrent, - rule_set, - media_info, - regex_cache, - )?), - RuleExpr::And(left, right) => { - Ok( - match_expr(left, torrent, rule_set, media_info, regex_cache)? - && match_expr(right, torrent, rule_set, media_info, regex_cache)?, - ) - } - RuleExpr::Or(left, right) => { - Ok( - match_expr(left, torrent, rule_set, media_info, regex_cache)? - || match_expr(right, torrent, rule_set, media_info, regex_cache)?, - ) - } - } -} - -/// 执行单条规则匹配。 -fn match_rule( - rule_name: &str, - torrent: &TorrentPayload, - rule_set: &Bound<'_, PyDict>, - media_info: Option<&Bound<'_, PyDict>>, - regex_cache: &mut HashMap, -) -> PyResult { - let Some(rule_obj) = rule_set.get_item(rule_name)? else { - return Ok(false); - }; - let rule = rule_obj.downcast::()?; - if let Some(tmdb_obj) = rule.get_item("tmdb")? { - if !tmdb_obj.is_none() { - if let Ok(tmdb) = tmdb_obj.downcast::() { - if match_tmdb(tmdb, media_info)? { - return Ok(true); - } - } - } - } - - let match_fields = get_string_list(rule, "match")?; - let content = torrent.content_for_matches(&match_fields); - let includes = get_string_list(rule, "include")?; - let excludes = get_string_list(rule, "exclude")?; - - if !includes.is_empty() { - let mut included = false; - for pattern in &includes { - if regex_search(pattern, &content, regex_cache)? { - included = true; - break; - } - } - if !included { - return Ok(false); - } - } - for exclude in excludes { - if regex_search(&exclude, &content, regex_cache)? { - return Ok(false); - } - } - if let Some(size_range) = get_optional_string(rule, "size_range")? { - if !match_size(torrent, &size_range)? { - return Ok(false); - } - } - if let Some(seeders) = get_optional_i64(rule, "seeders")? { - if torrent.seeders < seeders { - return Ok(false); - } - } - if let Some(downloadvolumefactor) = get_optional_f64(rule, "downloadvolumefactor")? { - if torrent.downloadvolumefactor != Some(downloadvolumefactor) { - return Ok(false); - } - } - if let Some(pubdate) = get_optional_string(rule, "publish_time")? { - if !match_publish_time(torrent.pub_minutes, &pubdate) { - return Ok(false); - } - } - Ok(true) -} - -/// 使用带缓存的忽略大小写正则搜索。 -fn regex_search( - pattern: &str, - content: &str, - cache: &mut HashMap, -) -> PyResult { - if !cache.contains_key(pattern) { - let regex = RegexBuilder::new(pattern) - .case_insensitive(true) - .build() - .map_err(|err| PyValueError::new_err(err.to_string()))?; - cache.insert(pattern.to_string(), regex); - } - Ok(cache - .get(pattern) - .is_some_and(|regex| regex.is_match(content))) -} - -/// 匹配 TMDB 媒体属性规则。 -fn match_tmdb(tmdb: &Bound<'_, PyDict>, media_info: Option<&Bound<'_, PyDict>>) -> PyResult { - let Some(media) = media_info else { - return Ok(false); - }; - for (attr, value) in tmdb.iter() { - if value.is_none() { - continue; - } - let attr_name = attr.extract::()?; - let expected = value.str()?.to_str()?.to_string(); - if expected.is_empty() { - continue; - } - let info_values = media_values(media, &attr_name)?; - if info_values.is_empty() { - return Ok(false); - } - let expected_values: Vec = expected - .split(',') - .filter(|item| !item.is_empty()) - .map(|item| item.to_uppercase()) - .collect(); - if !expected_values.iter().any(|expected_item| { - info_values - .iter() - .any(|info_item| info_item == expected_item) - }) { - return Ok(false); - } - } - Ok(true) -} - -/// 获取媒体属性的可比较字符串集合。 -fn media_values(media: &Bound<'_, PyDict>, attr_name: &str) -> PyResult> { - let Some(value) = media.get_item(attr_name)? else { - return Ok(Vec::new()); - }; - if value.is_none() { - return Ok(Vec::new()); - } - if attr_name == "production_countries" { - let Ok(items) = value.downcast::() else { - return Ok(Vec::new()); - }; - let mut values = Vec::new(); - for item in items.iter() { - if let Ok(dict) = item.downcast::() { - if let Some(country) = dict.get_item("iso_3166_1")? { - values.push(country.str()?.to_str()?.to_uppercase()); - } - } - } - return Ok(values); - } - if let Ok(items) = value.extract::>() { - return Ok(items.into_iter().map(|item| item.to_uppercase()).collect()); - } - Ok(vec![value.str()?.to_str()?.to_uppercase()]) -} - -/// 按每集大小匹配大小范围规则。 -fn match_size(torrent: &TorrentPayload, size_range: &str) -> PyResult { - let torrent_size = torrent.size / torrent.episode_count; - let size_range = size_range.trim(); - let unit = 1024.0 * 1024.0; - if let Some((min, max)) = size_range.split_once('-') { - let min = min - .trim() - .parse::() - .map_err(|err| PyValueError::new_err(err.to_string()))? - * unit; - let max = max - .trim() - .parse::() - .map_err(|err| PyValueError::new_err(err.to_string()))? - * unit; - return Ok(min <= torrent_size && torrent_size <= max); - } - if let Some(min) = size_range.strip_prefix('>') { - let min = min - .trim() - .parse::() - .map_err(|err| PyValueError::new_err(err.to_string()))? - * unit; - return Ok(torrent_size >= min); - } - if let Some(max) = size_range.strip_prefix('<') { - let max = max - .trim() - .parse::() - .map_err(|err| PyValueError::new_err(err.to_string()))? - * unit; - return Ok(torrent_size <= max); - } - Ok(false) -} - -/// 匹配发布时间分钟数规则。 -fn match_publish_time(pub_minutes: f64, publish_time: &str) -> bool { - let values: Vec = publish_time - .split('-') - .filter_map(|item| item.parse::().ok()) - .collect(); - if values.len() == 1 { - return pub_minutes >= values[0]; - } - if values.len() >= 2 { - return values[0] <= pub_minutes && pub_minutes <= values[1]; - } - true -} diff --git a/rust/moviepilot_rust/src/indexer.rs b/rust/moviepilot_rust/src/indexer.rs deleted file mode 100644 index 3f4d0009..00000000 --- a/rust/moviepilot_rust/src/indexer.rs +++ /dev/null @@ -1,1151 +0,0 @@ -use crate::utils::{get_optional_i64, get_optional_string, py_i64_to_usize}; -use minijinja::{context, Environment, UndefinedBehavior}; -use once_cell::sync::Lazy; -use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; -use pyo3::exceptions::PyValueError; -use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList}; -use regex::{Regex, RegexBuilder}; -use scraper::{ElementRef, Html, Selector}; -use std::collections::BTreeMap; -use url::form_urlencoded; -use url::Url; - -const PATH_ENCODE_SET: &AsciiSet = &CONTROLS - .add(b' ') - .add(b'"') - .add(b'#') - .add(b'%') - .add(b'<') - .add(b'>') - .add(b'[') - .add(b'\\') - .add(b']') - .add(b'^') - .add(b'`') - .add(b'{') - .add(b'|') - .add(b'}'); - -static IMDB_ID_RE: Lazy = Lazy::new(|| Regex::new(r"^tt\d+$").unwrap()); - -static FILESIZE_UNIT_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"[KMGTPI]*B?") - .case_insensitive(true) - .build() - .unwrap() -}); -static NUMERIC_FACTOR_RE: Lazy = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap()); -static FIELD_REF_RE: Lazy = - Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap()); -static HAS_QUOTED_SELECTOR_RE: Lazy = - Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap()); -static TABLE_DIRECT_TR_RE: Lazy = - Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap()); - -enum RowParseResult { - Unsupported, - Empty, - Item(PyObject), -} - -/// 批量解析普通配置 indexer 页面,遇到不支持的选择器配置时返回 None 交给 Python 回退。 -#[pyfunction] -#[pyo3(signature = (html_text, domain, list_config, fields, category=None, result_num=100))] -pub(crate) fn parse_indexer_torrents_fast( - py: Python<'_>, - html_text: &str, - domain: &str, - list_config: &Bound<'_, PyDict>, - fields: &Bound<'_, PyDict>, - category: Option<&Bound<'_, PyDict>>, - result_num: usize, -) -> PyResult> { - let Some(list_selector_text) = get_optional_string(list_config, "selector")? else { - return Ok(None); - }; - if list_selector_text.is_empty() { - return Ok(None); - } - let Some(list_selector) = parse_site_selector(&list_selector_text) else { - return Ok(None); - }; - let document = Html::parse_document(html_text); - let result = PyList::empty(py); - for row in document.select(&list_selector).take(result_num) { - match parse_indexer_row(py, row, domain, fields, category)? { - RowParseResult::Unsupported => return Ok(None), - RowParseResult::Empty => {} - RowParseResult::Item(item) => result.append(item)?, - } - } - Ok(Some(result.into())) -} - -/// 执行 indexer 文本过滤器,遇到 Python 专属过滤器时返回 None。 -fn apply_text_filters(mut current: String, filters: &Bound<'_, PyAny>) -> PyResult> { - let Ok(filter_list) = filters.downcast::() else { - return Ok(None); - }; - for item in filter_list.iter() { - let filter = item.downcast::()?; - let method_name = get_optional_string(filter, "name")?; - if current.is_empty() { - break; - } - match method_name.as_deref() { - Some("re_search") => { - let Some(args) = filter.get_item("args")? else { - continue; - }; - let Ok(args_list) = args.downcast::() else { - continue; - }; - if args_list.len() < 2 { - continue; - } - let pattern = args_list.get_item(0)?.extract::()?; - let group_index = py_i64_to_usize(&args_list.get_item(args_list.len() - 1)?)?; - let regex = - Regex::new(&pattern).map_err(|err| PyValueError::new_err(err.to_string()))?; - if let Some(captures) = regex.captures(¤t) { - if let Some(value) = captures.get(group_index) { - current = value.as_str().to_string(); - } - } - } - Some("split") => { - let Some(args) = filter.get_item("args")? else { - continue; - }; - let Ok(args_list) = args.downcast::() else { - continue; - }; - if args_list.len() < 2 { - continue; - } - let delimiter = args_list.get_item(0)?.extract::()?; - let index = py_i64_to_usize(&args_list.get_item(args_list.len() - 1)?)?; - if let Some(value) = current.split(&delimiter).nth(index) { - current = value.to_string(); - } - } - Some("replace") => { - let Some(args) = filter.get_item("args")? else { - continue; - }; - let Ok(args_list) = args.downcast::() else { - continue; - }; - if args_list.len() < 2 { - continue; - } - let from = args_list.get_item(0)?.extract::()?; - let to = args_list - .get_item(args_list.len() - 1)? - .extract::()?; - current = current.replace(&from, &to); - } - Some("strip") => { - current = current.trim().to_string(); - } - Some("appendleft") => { - let Some(args) = filter.get_item("args")? else { - continue; - }; - current = format!("{}{}", args.str()?.to_str()?, current); - } - Some("querystring") => { - let Some(args) = filter.get_item("args")? else { - continue; - }; - current = query_param_value(¤t, args.str()?.to_str()?).unwrap_or_default(); - } - Some("dateparse") => return Ok(None), - _ => return Ok(None), - } - } - Ok(Some(current.trim().to_string())) -} - -/// 将文件大小文本转换为字节数,供 Rust HTML 解析内部共用。 -fn parse_filesize_text(text: &str) -> i64 { - let raw = text.trim().to_string(); - if raw.is_empty() { - return 0; - } - if raw.chars().all(|ch| ch.is_ascii_digit()) { - return raw.parse::().unwrap_or(0); - } - let normalized = raw.replace([',', ' '], "").to_uppercase(); - let size_text = FILESIZE_UNIT_RE.replace_all(&normalized, "").to_string(); - let Ok(mut size) = size_text.parse::() else { - return 0; - }; - if normalized.contains("PB") || normalized.contains("PIB") { - size *= 1024_f64.powi(5); - } else if normalized.contains("TB") || normalized.contains("TIB") { - size *= 1024_f64.powi(4); - } else if normalized.contains("GB") || normalized.contains("GIB") { - size *= 1024_f64.powi(3); - } else if normalized.contains("MB") || normalized.contains("MIB") { - size *= 1024_f64.powi(2); - } else if normalized.contains("KB") || normalized.contains("KIB") { - size *= 1024_f64; - } - size.round() as i64 -} - -/// 根据普通 indexer 配置构造搜索或浏览 URL。 -#[pyfunction] -pub(crate) fn build_indexer_search_url_fast( - config: &Bound<'_, PyDict>, -) -> PyResult> { - let Some(search_any) = config.get_item("search")? else { - return Ok(None); - }; - let search = search_any.downcast::()?; - let domain = get_optional_string(config, "domain")?.unwrap_or_default(); - if domain.is_empty() { - return Ok(None); - } - - let keyword_any = config.get_item("keyword")?; - let keyword_present = keyword_any.as_ref().is_some_and(|value| !value.is_none()); - let mut torrents_path = pick_torrents_path(search, config)?; - let page = get_optional_i64(config, "page")?.unwrap_or(0); - - if keyword_present { - let (mut search_word, search_mode) = - build_search_word(config, keyword_any.as_ref().unwrap())?; - let is_imdbid_search = IMDB_ID_RE.is_match(&search_word); - search_word = format_search_word(search, &search_word)?; - let params_any = search.get_item("params")?; - let Some(params_obj) = params_any else { - let encoded = utf8_percent_encode(&search_word, PATH_ENCODE_SET).to_string(); - return Ok(Some(format!( - "{}{}", - domain, - torrents_path - .replace("{keyword}", &encoded) - .replace("{page}", &page.to_string()) - ))); - }; - let params_dict = params_obj.downcast::()?; - if params_dict.is_empty() { - let encoded = utf8_percent_encode(&search_word, PATH_ENCODE_SET).to_string(); - return Ok(Some(format!( - "{}{}", - domain, - torrents_path - .replace("{keyword}", &encoded) - .replace("{page}", &page.to_string()) - ))); - } - - let mut query_params: Vec<(String, String)> = vec![ - ("search_mode".to_string(), search_mode.to_string()), - ("search_area".to_string(), "0".to_string()), - ("page".to_string(), page.to_string()), - ("notnewword".to_string(), "1".to_string()), - ]; - for (key, value) in params_dict.iter() { - let key = key.extract::()?; - if key == "search_area" && !is_imdbid_search { - continue; - } - let rendered = value.str()?.to_str()?.replace("{keyword}", &search_word); - upsert_query_param(&mut query_params, key, rendered); - } - apply_category_params(config, &mut query_params)?; - return Ok(Some(combine_url(&domain, &torrents_path, &query_params)?)); - } - - let browse_any = config.get_item("browse")?; - if let Some(browse_obj) = browse_any { - if !browse_obj.is_none() { - let browse = browse_obj.downcast::()?; - if let Some(path) = get_optional_string(browse, "path")? { - torrents_path = path; - } - if let Some(start) = get_optional_i64(browse, "start")? { - torrents_path = torrents_path.replace("{page}", &(start + page).to_string()); - } - } else if page > 0 { - torrents_path = format!("{torrents_path}?page={page}"); - } - } else if page > 0 { - torrents_path = format!("{torrents_path}?page={page}"); - } - torrents_path = torrents_path - .replace("{page}", &page.to_string()) - .replace("{keyword}", ""); - Ok(Some(format!("{domain}{torrents_path}"))) -} - -fn query_param_value(text: &str, key: &str) -> Option { - let query = if let Ok(url) = Url::parse(text) { - url.query().unwrap_or("").to_string() - } else { - text.split_once('?') - .map(|(_, query)| query.split('#').next().unwrap_or("").to_string()) - .unwrap_or_default() - }; - form_urlencoded::parse(query.as_bytes()) - .find(|(param_key, _)| param_key == key) - .map(|(_, value)| value.to_string()) -} - -/// 解析单行种子信息,覆盖普通配置站点的主字段抽取流程。 -fn parse_indexer_row( - py: Python<'_>, - row: ElementRef<'_>, - domain: &str, - fields: &Bound<'_, PyDict>, - category: Option<&Bound<'_, PyDict>>, -) -> PyResult { - let output = PyDict::new(py); - if !parse_title(py, row, fields, &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_description(py, row, fields, &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_link_field( - py, row, fields, domain, "details", "page_url", true, &output, - )? { - return Ok(RowParseResult::Unsupported); - } - if !parse_link_field( - py, - row, - fields, - domain, - "download", - "enclosure", - false, - &output, - )? { - return Ok(RowParseResult::Unsupported); - } - if !parse_plain_field(py, row, fields, "imdbid", "imdbid", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_size_field(py, row, fields, &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_int_field(py, row, fields, "leechers", "peers", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_int_field(py, row, fields, "seeders", "seeders", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_int_field(py, row, fields, "grabs", "grabs", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_factor_field(py, row, fields, "downloadvolumefactor", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_factor_field(py, row, fields, "uploadvolumefactor", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_plain_field(py, row, fields, "date_added", "pubdate", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_plain_field(py, row, fields, "date_elapsed", "date_elapsed", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_plain_field(py, row, fields, "freedate", "freedate", &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_labels_field(py, row, fields, &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_hr_field(py, row, fields, &output)? { - return Ok(RowParseResult::Unsupported); - } - if !parse_category_field(py, row, fields, category, &output)? { - return Ok(RowParseResult::Unsupported); - } - if output.is_empty() { - return Ok(RowParseResult::Empty); - } - Ok(RowParseResult::Item(output.into())) -} - -/// 解析标题字段,支持直接 selector 和按模板引用字段渲染 title.text。 -fn parse_title( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, "title")? else { - return Ok(true); - }; - let mut title = if selector.contains("selector")? { - safe_query(row, &selector)? - } else if let Some(template) = get_optional_string(&selector, "text")? { - let values = collect_template_field_values(row, fields, &template)?; - let Some(rendered) = render_jinja_template(&template, &values) else { - return Ok(false); - }; - Some(rendered) - } else { - None - }; - title = apply_selector_filters(py, title, &selector)?; - if let Some(value) = title { - output.set_item("title", value)?; - } - Ok(true) -} - -/// 解析描述字段,支持直接 selector 和按模板引用字段渲染 description.text。 -fn parse_description( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, "description")? else { - return Ok(true); - }; - let mut description = if selector.contains("selector")? || selector.contains("selectors")? { - safe_query(row, &selector)? - } else if let Some(template) = get_optional_string(&selector, "text")? { - let values = collect_template_field_values(row, fields, &template)?; - let Some(rendered) = render_jinja_template(&template, &values) else { - return Ok(false); - }; - Some(rendered) - } else { - None - }; - description = apply_selector_filters(py, description, &selector)?; - if let Some(value) = description { - output.set_item("description", value)?; - } - Ok(true) -} - -/// 按 Jinja 模板实际引用的 fields 字段提取当前行数据,避免把模板能力绑死在固定字段名上。 -fn collect_template_field_values( - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - template: &str, -) -> PyResult> { - let mut keys = Vec::new(); - for captures in FIELD_REF_RE.captures_iter(template) { - let Some(key) = captures.get(1).or_else(|| captures.get(2)) else { - continue; - }; - let key = key.as_str(); - if !keys.iter().any(|item: &String| item == key) { - keys.push(key.to_string()); - } - } - - let mut values = BTreeMap::new(); - for key in keys { - if let Some(field_selector) = get_field_dict(fields, &key)? { - let value = safe_query(row, &field_selector)?.unwrap_or_default(); - values.insert(key, value); - } - } - Ok(resolve_embedded_field_templates(values)) -} - -/// 解析普通文本字段。 -fn parse_plain_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - source_key: &str, - target_key: &str, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, source_key)? else { - return Ok(true); - }; - if selector.contains("text")? { - return Ok(false); - } - let value = apply_selector_filters(py, safe_query(row, &selector)?, &selector)?; - if let Some(value) = value { - output.set_item(target_key, value.replace('\n', " ").trim().to_string())?; - } - Ok(true) -} - -/// 解析详情和下载链接,并按 Python 逻辑拼接相对地址。 -fn parse_link_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - domain: &str, - source_key: &str, - target_key: &str, - protocol_relative: bool, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, source_key)? else { - return Ok(true); - }; - let link = apply_selector_filters(py, safe_query(row, &selector)?, &selector)?; - if let Some(link) = link { - if link.is_empty() { - return Ok(true); - } - output.set_item( - target_key, - normalize_site_link(domain, &link, protocol_relative), - )?; - } - Ok(true) -} - -/// 解析文件大小字段并转换为字节。 -fn parse_size_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, "size")? else { - return Ok(true); - }; - let value = apply_selector_filters(py, safe_query(row, &selector)?, &selector)?; - let size = value - .map(|item| parse_filesize_text(item.replace('\n', "").trim())) - .unwrap_or(0); - output.set_item("size", size)?; - Ok(true) -} - -/// 解析整数类字段,兼容 "12/34" 和千分位逗号。 -fn parse_int_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - source_key: &str, - target_key: &str, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, source_key)? else { - return Ok(true); - }; - let value = apply_selector_filters(py, safe_query(row, &selector)?, &selector)?; - let parsed = value - .as_deref() - .unwrap_or("") - .split('/') - .next() - .unwrap_or("") - .replace(',', "") - .trim() - .parse::() - .unwrap_or(0); - output.set_item(target_key, parsed)?; - Ok(true) -} - -/// 解析上传/下载优惠系数字段。 -fn parse_factor_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - key: &str, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, key)? else { - return Ok(true); - }; - output.set_item(key, 1)?; - if let Some(case_obj) = selector.get_item("case")? { - let case_dict = case_obj.downcast::()?; - for (case_selector_obj, value) in case_dict.iter() { - let case_selector = case_selector_obj.extract::()?; - if selector_exists(row, &case_selector)? { - output.set_item(key, value)?; - return Ok(true); - } - } - return Ok(true); - } - let value = apply_selector_filters(py, safe_query(row, &selector)?, &selector)?; - if let Some(value) = value { - if let Some(caps) = NUMERIC_FACTOR_RE.captures(&value) { - if let Some(number) = caps - .get(1) - .and_then(|item| item.as_str().parse::().ok()) - { - output.set_item(key, number)?; - } - } - } - Ok(true) -} - -/// 解析标签列表字段。 -fn parse_labels_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, "labels")? else { - return Ok(true); - }; - if !selector.contains("selector")? { - output.set_item("labels", PyList::empty(py))?; - return Ok(true); - } - let Some(values) = query_all_values(row, &selector)? else { - output.set_item("labels", PyList::empty(py))?; - return Ok(true); - }; - let labels = PyList::empty(py); - for value in values.into_iter().filter(|item| !item.is_empty()) { - labels.append(value)?; - } - output.set_item("labels", labels)?; - Ok(true) -} - -/// 解析 HR 标记字段。 -fn parse_hr_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, "hr")? else { - return Ok(true); - }; - let Some(selector_text) = get_selector_text(&selector)? else { - output.set_item("hit_and_run", false)?; - return Ok(true); - }; - output.set_item("hit_and_run", selector_exists(row, &selector_text)?)?; - let _ = py; - Ok(true) -} - -/// 解析分类字段并映射为 MoviePilot 媒体类型中文值。 -fn parse_category_field( - py: Python<'_>, - row: ElementRef<'_>, - fields: &Bound<'_, PyDict>, - category: Option<&Bound<'_, PyDict>>, - output: &Bound<'_, PyDict>, -) -> PyResult { - let Some(selector) = get_field_dict(fields, "category")? else { - return Ok(true); - }; - let value = apply_selector_filters(py, safe_query(row, &selector)?, &selector)?; - let media_type = if let (Some(value), Some(category)) = (value.as_deref(), category) { - let tv_cats = category_ids_for_field(category, "tv")?; - let movie_cats = category_ids_for_field(category, "movie")?; - if tv_cats.iter().any(|item| item == value) && !movie_cats.iter().any(|item| item == value) - { - "电视剧" - } else if movie_cats.iter().any(|item| item == value) { - "电影" - } else { - "未知" - } - } else { - "未知" - }; - output.set_item("category", media_type)?; - Ok(true) -} - -/// 获取字段配置字典。 -fn get_field_dict<'py>( - fields: &Bound<'py, PyDict>, - key: &str, -) -> PyResult>> { - let Some(value) = fields.get_item(key)? else { - return Ok(None); - }; - if value.is_none() { - return Ok(None); - } - Ok(Some(value.downcast_into::()?)) -} - -/// 解析站点配置选择器,并兼容 PyQuery 允许的 :has("selector") 写法。 -fn parse_site_selector(selector_text: &str) -> Option { - let normalized = normalize_pyquery_selector(selector_text); - let expanded = expand_table_direct_tr_selector(&normalized); - if let Ok(selector) = Selector::parse(&expanded) { - return Some(selector); - } - if expanded != normalized { - if let Ok(selector) = Selector::parse(&normalized) { - return Some(selector); - } - } - Selector::parse(selector_text).ok() -} - -/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。 -fn normalize_pyquery_selector(selector_text: &str) -> String { - HAS_QUOTED_SELECTOR_RE - .replace_all(selector_text, |captures: ®ex::Captures<'_>| { - let inner = captures - .get(1) - .or_else(|| captures.get(2)) - .map(|item| item.as_str()) - .unwrap_or_default(); - format!(":has({inner})") - }) - .into_owned() -} - -/// 为 table > tr 选择器追加 tbody 变体,适配 Rust HTML5 解析自动补 tbody 的行为。 -fn expand_table_direct_tr_selector(selector_text: &str) -> String { - let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2"); - if expanded == selector_text { - return selector_text.to_string(); - } - format!("{selector_text}, {expanded}") -} - -/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。 -fn safe_query( - row: ElementRef<'_>, - selector_config: &Bound<'_, PyDict>, -) -> PyResult> { - let Some(values) = query_all_values(row, selector_config)? else { - return Ok(None); - }; - Ok(select_indexed_value(values, selector_config)) -} - -/// 查询 selector 的全部文本或属性值。 -fn query_all_values( - row: ElementRef<'_>, - selector_config: &Bound<'_, PyDict>, -) -> PyResult>> { - let Some(selector_text) = get_selector_text(selector_config)? else { - return Ok(None); - }; - let Some(selector) = parse_site_selector(&selector_text) else { - return Ok(None); - }; - let attribute = get_optional_string(selector_config, "attribute")?; - let remove_selectors = parse_remove_selectors(selector_config)?; - let mut values = Vec::new(); - for element in row.select(&selector) { - if let Some(attribute) = attribute.as_deref() { - values.push(element.value().attr(attribute).unwrap_or("").to_string()); - } else { - values.push(normalize_element_text(element, &remove_selectors)); - } - } - Ok(Some(values)) -} - -/// 解析 remove 配置,支持逗号分隔的 CSS 选择器列表。 -fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult> { - let Some(remove_text) = get_optional_string(selector_config, "remove")? else { - return Ok(Vec::new()); - }; - let mut selectors = Vec::new(); - for item in remove_text.split(',') { - let item = item.trim(); - if item.is_empty() { - continue; - } - let Some(selector) = parse_site_selector(item) else { - return Ok(Vec::new()); - }; - selectors.push(selector); - } - Ok(selectors) -} - -/// 读取 selector 或 selectors 配置。 -fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult> { - if let Some(selector) = get_optional_string(selector_config, "selector")? { - if !selector.is_empty() { - return Ok(Some(selector)); - } - } - if let Some(selector) = get_optional_string(selector_config, "selectors")? { - if !selector.is_empty() { - return Ok(Some(selector)); - } - } - Ok(None) -} - -/// 对查询结果应用 contents/index 规则。 -fn select_indexed_value( - values: Vec, - selector_config: &Bound<'_, PyDict>, -) -> Option { - if values.is_empty() { - return None; - } - if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") { - if let Some(first) = values.first() { - let lines: Vec<&str> = first.split('\n').collect(); - return pick_indexed_item(&lines, contents).map(|item| item.to_string()); - } - } - if let Ok(Some(index)) = get_optional_i64(selector_config, "index") { - return pick_indexed_item(&values, index).cloned(); - } - values.first().cloned() -} - -/// 按 Python 列表语义读取正负索引。 -fn pick_indexed_item(items: &[T], index: i64) -> Option<&T> { - let len = items.len() as i64; - let resolved = if index < 0 { len + index } else { index }; - if resolved < 0 { - return None; - } - items.get(resolved as usize) -} - -/// 应用字段配置中的 filters。 -fn apply_selector_filters( - py: Python<'_>, - value: Option, - selector_config: &Bound<'_, PyDict>, -) -> PyResult> { - let Some(value) = value else { - return Ok(None); - }; - let Some(filters) = selector_config.get_item("filters")? else { - return Ok(Some(value)); - }; - if filters.is_none() { - return Ok(Some(value)); - } - let _ = py; - apply_text_filters(value, &filters).map(|filtered| filtered.or_else(|| Some(String::new()))) -} - -/// 规范化元素文本,尽量接近 PyQuery.text() 输出。 -fn normalize_element_text(element: ElementRef<'_>, remove_selectors: &[Selector]) -> String { - let mut rendered = String::new(); - for node in element.descendants() { - let Some(text_node) = node.value().as_text() else { - continue; - }; - if should_skip_text_node( - node.parent().and_then(ElementRef::wrap), - element, - remove_selectors, - ) { - continue; - } - rendered.push_str(text_node); - } - normalize_whitespace(&rendered) -} - -/// 折叠 PyQuery.text() 中的连续空白,保留元素相邻文本节点的直接拼接效果。 -fn normalize_whitespace(value: &str) -> String { - value.split_whitespace().collect::>().join(" ") -} - -/// 判断文本节点是否位于需要 remove 的元素子树中。 -fn should_skip_text_node( - mut parent: Option>, - root: ElementRef<'_>, - remove_selectors: &[Selector], -) -> bool { - while let Some(element) = parent { - if element == root { - return false; - } - if remove_selectors - .iter() - .any(|selector| selector.matches(&element)) - { - return true; - } - parent = element.parent().and_then(ElementRef::wrap); - } - false -} - -/// 判断 row 内是否存在指定 selector。 -fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult { - let Some(selector) = parse_site_selector(selector_text) else { - return Ok(false); - }; - Ok(row.select(&selector).next().is_some()) -} - -/// 拼接详情和下载链接。 -fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> String { - if link.starts_with("http") || link.starts_with("magnet") { - return link.to_string(); - } - if protocol_relative && link.starts_with("//") { - let scheme = domain.split(':').next().unwrap_or("http"); - return format!("{scheme}:{link}"); - } - if !protocol_relative { - if let Ok(base) = Url::parse(&standardize_base_url(domain)) { - if let Some(host) = base.host_str() { - if link.contains(host) { - if link.starts_with('/') { - return format!("{}:{link}", base.scheme()); - } - return format!("{}://{link}", base.scheme()); - } - } - } - } - if let Some(stripped) = link.strip_prefix('/') { - format!("{domain}{stripped}") - } else { - format!("{domain}{link}") - } -} - -/// 使用 MiniJinja 渲染站点字段模板,语义对齐 Python jinja2 的 Template.render(fields=...)。 -fn render_jinja_template(template: &str, fields: &BTreeMap) -> Option { - let mut env = Environment::new(); - env.set_undefined_behavior(UndefinedBehavior::Chainable); - env.render_str(template, context! { fields => fields }).ok() -} - -/// 渲染字段值中意外残留的 Jinja 模板,避免站点 title 属性里的模板文本继续进入识别链路。 -fn resolve_embedded_field_templates(values: BTreeMap) -> BTreeMap { - let mut resolved = values.clone(); - for (key, value) in &values { - if !contains_jinja_syntax(value) { - continue; - } - let mut context_values = resolved.clone(); - context_values.insert(key.clone(), String::new()); - if let Some(rendered) = render_jinja_template(value, &context_values) { - resolved.insert(key.clone(), rendered); - } - } - resolved -} - -/// 判断文本是否包含 Jinja 语法标记,作为字段内嵌模板的低成本预筛选。 -fn contains_jinja_syntax(value: &str) -> bool { - value.contains("{{") || value.contains("{%") || value.contains("{#") -} - -/// 读取分类配置中的 ID 列表。 -fn category_ids_for_field(category: &Bound<'_, PyDict>, key: &str) -> PyResult> { - let Some(list_obj) = category.get_item(key)? else { - return Ok(Vec::new()); - }; - let Ok(list) = list_obj.downcast::() else { - return Ok(Vec::new()); - }; - let mut values = Vec::new(); - for item in list.iter() { - let dict = item.downcast::()?; - if let Some(id) = get_optional_string(dict, "id")? { - values.push(id); - } - } - Ok(values) -} - -/// 从 indexer paths 配置中选择搜索路径。 -fn pick_torrents_path(search: &Bound<'_, PyDict>, config: &Bound<'_, PyDict>) -> PyResult { - let Some(paths_obj) = search.get_item("paths")? else { - return Ok(String::new()); - }; - let paths = paths_obj.downcast::()?; - if paths.len() == 1 { - let path_item = paths.get_item(0)?; - let path_dict = path_item.downcast::()?; - return Ok(get_optional_string(path_dict, "path")?.unwrap_or_default()); - } - let mtype = get_optional_string(config, "mtype")?; - for item in paths.iter() { - let path = item.downcast::()?; - let path_type = get_optional_string(path, "type")?; - if path_type.as_deref() == Some("all") && mtype.is_none() { - return Ok(get_optional_string(path, "path")?.unwrap_or_default()); - } - if path_type.as_deref() == Some("movie") && mtype.as_deref() == Some("电影") { - return Ok(get_optional_string(path, "path")?.unwrap_or_default()); - } - if path_type.as_deref() == Some("tv") && mtype.as_deref() == Some("电视剧") { - return Ok(get_optional_string(path, "path")?.unwrap_or_default()); - } - } - Ok(String::new()) -} - -/// 根据关键字、批量配置构造搜索词和搜索模式。 -fn build_search_word( - config: &Bound<'_, PyDict>, - keyword: &Bound<'_, PyAny>, -) -> PyResult<(String, i64)> { - if let Ok(values) = keyword.extract::>() { - let batch = config.get_item("batch")?; - let (delimiter, space_replace) = if let Some(batch_obj) = batch { - if batch_obj.is_none() { - (" ".to_string(), " ".to_string()) - } else { - let batch_dict = batch_obj.downcast::()?; - ( - get_optional_string(batch_dict, "delimiter")? - .unwrap_or_else(|| " ".to_string()), - get_optional_string(batch_dict, "space_replace")? - .unwrap_or_else(|| " ".to_string()), - ) - } - } else { - (" ".to_string(), " ".to_string()) - }; - let words: Vec = values - .into_iter() - .map(|value| value.replace(' ', &space_replace)) - .collect(); - return Ok((words.join(&delimiter), 1)); - } - Ok((keyword.str()?.to_str()?.to_string(), 0)) -} - -/// 按 imdbid_format 转换 IMDb ID 搜索词。 -fn format_search_word(search: &Bound<'_, PyDict>, search_word: &str) -> PyResult { - if !IMDB_ID_RE.is_match(search_word) { - return Ok(search_word.to_string()); - } - let Some(format) = get_optional_string(search, "imdbid_format")? else { - return Ok(search_word.to_string()); - }; - Ok(format - .replace("{keyword}", search_word) - .replace("{imdbid}", search_word) - .replace("{imdbid_num}", search_word.trim_start_matches("tt"))) -} - -/// 更新查询参数,保留 Python dict update 的覆盖语义。 -fn upsert_query_param(params: &mut Vec<(String, String)>, key: String, value: String) { - if let Some((_, existing_value)) = params - .iter_mut() - .find(|(existing_key, _)| existing_key == &key) - { - *existing_value = value; - return; - } - params.push((key, value)); -} - -/// 应用电影/电视剧分类查询参数。 -fn apply_category_params( - config: &Bound<'_, PyDict>, - params: &mut Vec<(String, String)>, -) -> PyResult<()> { - let Some(category_obj) = config.get_item("category")? else { - return Ok(()); - }; - if category_obj.is_none() { - return Ok(()); - } - let category = category_obj.downcast::()?; - let mtype = get_optional_string(config, "mtype")?; - let cat_ids = collect_category_ids(category, mtype.as_deref())?; - let allowed = get_optional_string(config, "cat")?.map(|value| { - value - .split(',') - .map(|item| item.to_string()) - .collect::>() - }); - for cat_id in cat_ids { - if cat_id.is_empty() { - continue; - } - if let Some(allowed_cats) = &allowed { - if !allowed_cats.iter().any(|item| item == &cat_id) { - continue; - } - } - if let Some(field) = get_optional_string(category, "field")? { - let delimiter = - get_optional_string(category, "delimiter")?.unwrap_or_else(|| " ".to_string()); - let current = params - .iter() - .find(|(key, _)| key == &field) - .map(|(_, value)| value.clone()) - .unwrap_or_default(); - upsert_query_param(params, field, format!("{current}{delimiter}{cat_id}")); - } else { - upsert_query_param(params, format!("cat{cat_id}"), "1".to_string()); - } - } - Ok(()) -} - -/// 收集当前媒体类型可用的分类 ID。 -fn collect_category_ids( - category: &Bound<'_, PyDict>, - mtype: Option<&str>, -) -> PyResult> { - let mut items = Vec::new(); - let keys = match mtype { - Some("电视剧") => vec!["tv"], - Some("电影") => vec!["movie"], - _ => vec!["movie", "tv"], - }; - for key in keys { - if let Some(list_obj) = category.get_item(key)? { - if let Ok(list) = list_obj.downcast::() { - for item in list.iter() { - let cat = item.downcast::()?; - if let Some(cat_id) = get_optional_string(cat, "id")? { - items.push(cat_id); - } - } - } - } - } - Ok(items) -} - -/// 合并 host、path 和查询参数。 -fn combine_url(host: &str, path: &str, query: &[(String, String)]) -> PyResult { - let base = standardize_base_url(host); - let mut url = Url::parse(&base) - .and_then(|base_url| base_url.join(path)) - .map_err(|err| PyValueError::new_err(err.to_string()))?; - let mut query_params: Vec<(String, String)> = url - .query_pairs() - .map(|(key, value)| (key.to_string(), value.to_string())) - .collect(); - for (key, value) in query { - upsert_query_param(&mut query_params, key.clone(), value.clone()); - } - { - let mut pairs = url.query_pairs_mut(); - pairs.clear(); - for (key, value) in query_params { - pairs.append_pair(&key, &value); - } - } - Ok(url.to_string()) -} - -/// 标准化基础 URL,与 Python UrlUtils.standardize_base_url 保持一致。 -fn standardize_base_url(host: &str) -> String { - let mut value = host.to_string(); - if !value.ends_with('/') { - value.push('/'); - } - if !value.starts_with("http://") && !value.starts_with("https://") { - value = format!("http://{value}"); - } - value -} diff --git a/rust/moviepilot_rust/src/lib.rs b/rust/moviepilot_rust/src/lib.rs index fa49a625..683f67ab 100644 --- a/rust/moviepilot_rust/src/lib.rs +++ b/rust/moviepilot_rust/src/lib.rs @@ -1,8 +1,4 @@ mod filter; -mod indexer; -mod meta; -mod rss; -mod utils; use pyo3::prelude::*; @@ -16,13 +12,6 @@ fn is_available() -> bool { #[pymodule] fn moviepilot_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(is_available, m)?)?; - m.add_function(wrap_pyfunction!(meta::is_anime_fast, m)?)?; - m.add_function(wrap_pyfunction!(meta::find_metainfo_fast, m)?)?; - m.add_function(wrap_pyfunction!(meta::parse_video_title_fast, m)?)?; m.add_function(wrap_pyfunction!(filter::parse_filter_rule_fast, m)?)?; - m.add_function(wrap_pyfunction!(filter::filter_torrents_fast, m)?)?; - m.add_function(wrap_pyfunction!(indexer::build_indexer_search_url_fast, m)?)?; - m.add_function(wrap_pyfunction!(indexer::parse_indexer_torrents_fast, m)?)?; - m.add_function(wrap_pyfunction!(rss::parse_rss_items_fast, m)?)?; Ok(()) } diff --git a/rust/moviepilot_rust/src/meta.rs b/rust/moviepilot_rust/src/meta.rs deleted file mode 100644 index a6e357a4..00000000 --- a/rust/moviepilot_rust/src/meta.rs +++ /dev/null @@ -1,1247 +0,0 @@ -use crate::utils::{apply_range_total, capture_all_i64, capture_i64}; -use once_cell::sync::Lazy; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use regex::{Regex, RegexBuilder}; - -static ANIME_BRACKET_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"【[+0-9XVPI-]+】\s*【") - .case_insensitive(true) - .build() - .unwrap() -}); -static ANIME_DASH_EPISODE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"\s+-\s+[\dv]{1,4}\s+") - .case_insensitive(true) - .build() - .unwrap() -}); -static VIDEO_SEASON_EPISODE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new( - r"S\d{2}\s*-\s*S\d{2}|S\d{2}|\s+S\d{1,2}|EP?\d{2,4}\s*-\s*EP?\d{2,4}|EP?\d{2,4}|\s+EP?\d{1,4}", - ) - .case_insensitive(true) - .build() - .unwrap() -}); -static ANIME_SQUARE_BRACKET_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"\[[+0-9XVPI-]+]\s*\[") - .case_insensitive(true) - .build() - .unwrap() -}); -static BRACED_METAINFO_RE: Lazy = Lazy::new(|| Regex::new(r"\{\[([\s\S]+?)]}").unwrap()); -static BRACED_TMDBID_RE: Lazy = Lazy::new(|| Regex::new(r"tmdbid=(\d+)").unwrap()); -static BRACED_DOUBANID_RE: Lazy = Lazy::new(|| Regex::new(r"doubanid=(\d+)").unwrap()); -static BRACED_TYPE_RE: Lazy = Lazy::new(|| Regex::new(r"type=(\w+)").unwrap()); -static BRACED_BEGIN_SEASON_RE: Lazy = Lazy::new(|| Regex::new(r"(?:^|;)s=(\d+)").unwrap()); -static BRACED_END_SEASON_RE: Lazy = Lazy::new(|| Regex::new(r"s=\d+-(\d+)").unwrap()); -static BRACED_BEGIN_EPISODE_RE: Lazy = Lazy::new(|| Regex::new(r"(?:^|;)e=(\d+)").unwrap()); -static BRACED_END_EPISODE_RE: Lazy = Lazy::new(|| Regex::new(r"e=\d+-(\d+)").unwrap()); -static EMBY_TMDB_RE_LIST: Lazy> = Lazy::new(|| { - vec![ - Regex::new(r"\[tmdbid[=\-](\d+)]").unwrap(), - Regex::new(r"\[tmdb[=\-](\d+)]").unwrap(), - Regex::new(r"\{tmdbid[=\-](\d+)}").unwrap(), - Regex::new(r"\{tmdb[=\-](\d+)}").unwrap(), - ] -}); - -static TITLE_SIZE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"[0-9.]+\s*[MGT]i?B") - .case_insensitive(true) - .build() - .unwrap() -}); -static TITLE_DATE_RE: Lazy = - Lazy::new(|| Regex::new(r"\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}").unwrap()); -static TITLE_YEAR_RANGE_RE: Lazy = - Lazy::new(|| Regex::new(r"([\s.]+)(\d{4})-(\d{4})").unwrap()); -static FIRST_BRACKET_RE: Lazy = Lazy::new(|| Regex::new(r"^[\[【](.+?)[\]】]").unwrap()); -static FIRST_BRACKET_RELEASE_RE: Lazy = - Lazy::new(|| Regex::new(r"[A-Za-z]+\..+(?:19|20)\d{2}").unwrap()); -static FIRST_BRACKET_RESOURCE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"(?:2160|1080|720|480)[PIpi]|4K|UHD|Blu[\-.]?ray|REMUX|WEB[\-.]?DL|HDTV") - .case_insensitive(true) - .build() - .unwrap() -}); -static TOKEN_SPLIT_RE: Lazy = - Lazy::new(|| Regex::new(r"[.\s()\[\]\-【】/~;&|#_「」~]+").unwrap()); -static FULL_SEASON_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"^(?:Season\s+|S)(\d{1,3})$") - .case_insensitive(true) - .build() - .unwrap() -}); -static SEASON_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"S(\d{3})|^S(\d{1,3})$|S(\d{1,3})E") - .case_insensitive(true) - .build() - .unwrap() -}); -static EPISODE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"EP?(\d{2,4})$|^EP?(\d{1,4})$|^S\d{1,2}EP?(\d{1,4})$|S\d{2}EP?(\d{2,4})") - .case_insensitive(true) - .build() - .unwrap() -}); -static RESOURCE_PIX_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"^[SBUHD]*(\d{3,4}[PI]+)|\d{3,4}X(\d{3,4})") - .case_insensitive(true) - .build() - .unwrap() -}); -static RESOURCE_PIX_RE2: Lazy = Lazy::new(|| { - RegexBuilder::new(r"(^[248]+K)") - .case_insensitive(true) - .build() - .unwrap() -}); -static SOURCE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"^BLURAY$|^HDTV$|^UHDTV$|^HDDVD$|^WEBRIP$|^DVDRIP$|^BDRIP$|^BLU$|^WEB$|^BD$|^HDRip$|^REMUX$|^UHD$") - .case_insensitive(true) - .build() - .unwrap() -}); -static EFFECT_RE: Lazy = Lazy::new(|| { - RegexBuilder::new( - r"^SDR$|^HDR\d*$|^DOLBY$|^DOVI$|^DV$|^3D$|^REPACK$|^HLG$|^HDR10(\+|Plus)$|^EDR$|^HQ$", - ) - .case_insensitive(true) - .build() - .unwrap() -}); -static VIDEO_ENCODE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"^(H26[45])$|^(x26[45])$|^AVC$|^HEVC$|^VC\d?$|^MPEG\d?$|^Xvid$|^DivX$|^AV1$|^HDR\d*$|^AVS(\+|[23])$") - .case_insensitive(true) - .build() - .unwrap() -}); -static AUDIO_ENCODE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"^DTS\d?$|^DTSHD$|^DTSHDMA$|^Atmos$|^TrueHD\d?$|^AC3$|^\dAudios?$|^DDP\d?$|^DD\+\d?$|^DD\d?$|^LPCM\d?$|^AAC\d?$|^FLAC\d?$|^HD\d?$|^MA\d?$|^HR\d?$|^Opus\d?$|^Vorbis\d?$|^AV[3S]A$") - .case_insensitive(true) - .build() - .unwrap() -}); -static FPS_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"(\d{2,3})FPS") - .case_insensitive(true) - .build() - .unwrap() -}); -static PART_RE: Lazy = Lazy::new(|| { - RegexBuilder::new( - r"(^PART[0-9ABI]{0,2}$|^CD[0-9]{0,2}$|^DVD[0-9]{0,2}$|^DISK[0-9]{0,2}$|^DISC[0-9]{0,2}$)", - ) - .case_insensitive(true) - .build() - .unwrap() -}); -static NAME_NO_CHINESE_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r".*版|.*字幕") - .case_insensitive(true) - .build() - .unwrap() -}); -static VIDEO_BIT_RE: Lazy = Lazy::new(|| { - RegexBuilder::new(r"(?i)(8|10|12)[\s._-]*bit") - .build() - .unwrap() -}); - -struct VideoParseState { - tokens: Vec, - media_exts: Vec, - isfile: bool, - cn_name: Option, - en_name: Option, - year: Option, - total_season: i64, - begin_season: Option, - end_season: Option, - total_episode: i64, - begin_episode: Option, - end_episode: Option, - part: Option, - source: String, - effects: Vec, - resource_pix: Option, - web_source: Option, - video_encode: Option, - video_bit: Option, - audio_encode: Option, - fps: Option, - media_type: Option, - stop_name_flag: bool, - stop_cnname_flag: bool, - last_token: String, - last_token_type: String, - continue_flag: bool, - unknown_name_str: String, - index: usize, -} - -#[pyfunction] -pub(crate) fn is_anime_fast(name: &str) -> bool { - if name.is_empty() { - return false; - } - if ANIME_BRACKET_RE.is_match(name) { - return true; - } - if ANIME_DASH_EPISODE_RE.is_match(name) { - return true; - } - if VIDEO_SEASON_EPISODE_RE.is_match(name) { - return false; - } - ANIME_SQUARE_BRACKET_RE.is_match(name) -} - -/// 从标题中的内嵌标签提取媒体 ID、类型和季集范围。 -#[pyfunction] -pub(crate) fn find_metainfo_fast(py: Python<'_>, title: &str) -> PyResult { - let result = PyDict::new(py); - let mut cleaned_title = title.to_string(); - let mut tmdbid: Option = None; - let mut doubanid: Option = None; - let mut media_type: Option = None; - let mut begin_season: Option = None; - let mut end_season: Option = None; - let mut begin_episode: Option = None; - let mut end_episode: Option = None; - - for captures in BRACED_METAINFO_RE.captures_iter(title) { - let Some(meta_match) = captures.get(1) else { - continue; - }; - let meta_text = meta_match.as_str(); - let found_tmdb = BRACED_TMDBID_RE - .captures(meta_text) - .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string())); - if found_tmdb.is_some() { - tmdbid = found_tmdb.clone(); - } - if let Some(value) = BRACED_DOUBANID_RE - .captures(meta_text) - .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string())) - { - doubanid = Some(value); - } - let found_type = BRACED_TYPE_RE - .captures(meta_text) - .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string())); - if let Some(value) = found_type.as_deref() { - if value == "movies" || value == "tv" { - media_type = Some(value.to_string()); - } - } - if let Some(value) = capture_i64(&BRACED_BEGIN_SEASON_RE, meta_text) { - begin_season = Some(value); - } - if let Some(value) = capture_i64(&BRACED_END_SEASON_RE, meta_text) { - end_season = Some(value); - } - if let Some(value) = capture_i64(&BRACED_BEGIN_EPISODE_RE, meta_text) { - begin_episode = Some(value); - } - if let Some(value) = capture_i64(&BRACED_END_EPISODE_RE, meta_text) { - end_episode = Some(value); - } - if found_tmdb.is_some() - || found_type.is_some() - || begin_season.is_some() - || end_season.is_some() - || begin_episode.is_some() - || end_episode.is_some() - { - cleaned_title = cleaned_title.replace(&format!("{{[{meta_text}]}}"), ""); - } - } - - if let Some(caps) = EMBY_TMDB_RE_LIST[0].captures(&cleaned_title) { - tmdbid = caps.get(1).map(|m| m.as_str().to_string()); - cleaned_title = EMBY_TMDB_RE_LIST[0] - .replace_all(&cleaned_title, "") - .trim() - .to_string(); - } else if tmdbid.is_none() { - for tmdb_re in EMBY_TMDB_RE_LIST.iter().skip(1) { - if let Some(caps) = tmdb_re.captures(&cleaned_title) { - tmdbid = caps.get(1).map(|m| m.as_str().to_string()); - cleaned_title = tmdb_re.replace_all(&cleaned_title, "").trim().to_string(); - break; - } - } - } - - let (begin_season, end_season, total_season) = apply_range_total(begin_season, end_season); - let (begin_episode, end_episode, total_episode) = apply_range_total(begin_episode, end_episode); - - result.set_item("title", cleaned_title)?; - result.set_item("tmdbid", tmdbid)?; - result.set_item("doubanid", doubanid)?; - result.set_item("type", media_type)?; - result.set_item("begin_season", begin_season)?; - result.set_item("end_season", end_season)?; - result.set_item("total_season", total_season)?; - result.set_item("begin_episode", begin_episode)?; - result.set_item("end_episode", end_episode)?; - result.set_item("total_episode", total_episode)?; - Ok(result.into()) -} - -/// 对标题执行影视主识别流程,返回名称、季集、资源和编码等完整主状态。 -#[pyfunction] -#[pyo3(signature = (title, isfile=false, media_exts=None))] -pub(crate) fn parse_video_title_fast( - py: Python<'_>, - title: &str, - isfile: bool, - media_exts: Option>, -) -> PyResult { - let result = PyDict::new(py); - if title.is_empty() { - result.set_item("complete", false)?; - return Ok(result.into()); - } - - if isfile && title.chars().all(|ch| ch.is_ascii_digit()) && title.len() < 5 { - result.set_item("complete", true)?; - result.set_item("type", "tv")?; - result.set_item("begin_episode", title.parse::().ok())?; - result.set_item("total_episode", 1)?; - return Ok(result.into()); - } - - if let Some(caps) = FULL_SEASON_RE.captures(title) { - result.set_item("complete", true)?; - result.set_item("type", "tv")?; - if let Some(season) = caps - .get(1) - .and_then(|value| value.as_str().parse::().ok()) - { - result.set_item("begin_season", season)?; - result.set_item("total_season", 1)?; - } - return Ok(result.into()); - } - - let normalized = normalize_video_title(title); - let tokens: Vec = TOKEN_SPLIT_RE - .split(&normalized) - .filter(|token| !token.is_empty()) - .map(|token| token.to_string()) - .collect(); - - let mut parser = VideoParseState::new(tokens, isfile, media_exts.unwrap_or_default()); - parser.parse(); - let mut effects = parser.effects.clone(); - if !effects.is_empty() { - effects.reverse(); - } - - result.set_item("complete", true)?; - result.set_item("cn_name", parser.cn_name)?; - result.set_item("en_name", parser.en_name)?; - result.set_item("year", parser.year)?; - result.set_item("type", parser.media_type)?; - result.set_item("begin_season", parser.begin_season)?; - result.set_item("end_season", parser.end_season)?; - result.set_item( - "total_season", - if parser.total_season > 0 { - Some(parser.total_season) - } else { - None - }, - )?; - result.set_item("begin_episode", parser.begin_episode)?; - result.set_item("end_episode", parser.end_episode)?; - result.set_item( - "total_episode", - if parser.total_episode > 0 { - Some(parser.total_episode) - } else { - None - }, - )?; - result.set_item("part", parser.part)?; - result.set_item( - "resource_type", - if parser.source.is_empty() { - None - } else { - Some(parser.source.trim().to_string()) - }, - )?; - result.set_item( - "resource_effect", - if effects.is_empty() { - None - } else { - Some(effects.join(" ")) - }, - )?; - result.set_item("resource_pix", parser.resource_pix)?; - result.set_item("web_source", parser.web_source)?; - result.set_item("video_encode", parser.video_encode)?; - result.set_item("video_bit", parser.video_bit)?; - result.set_item("audio_encode", parser.audio_encode)?; - result.set_item("fps", parser.fps)?; - Ok(result.into()) -} - -fn normalize_video_title(title: &str) -> String { - let mut value = title.to_string(); - if let Some(caps) = FIRST_BRACKET_RE.captures(&value) { - if let Some(content) = caps.get(1) { - if FIRST_BRACKET_RELEASE_RE.is_match(content.as_str()) - && FIRST_BRACKET_RESOURCE_RE.is_match(content.as_str()) - { - value = format!( - "{}{}", - content.as_str(), - value - .get(caps.get(0).map(|m| m.end()).unwrap_or(0)..) - .unwrap_or("") - ); - } else if let Some(full) = caps.get(0) { - value = value.get(full.end()..).unwrap_or("").to_string(); - } - } - } - value = TITLE_YEAR_RANGE_RE.replace_all(&value, "$1$2").to_string(); - value = remove_title_size_markers(&value); - TITLE_DATE_RE.replace_all(&value, "").to_string() -} - -/// 移除标题里的大小标记;单位后紧跟大写字母时保留,兼容 Python 负向前瞻语义。 -fn remove_title_size_markers(title: &str) -> String { - TITLE_SIZE_RE - .replace_all(title, |caps: ®ex::Captures<'_>| { - let Some(matched) = caps.get(0) else { - return String::new(); - }; - let next_char = title - .get(matched.end()..) - .and_then(|tail| tail.chars().next()); - if next_char.is_some_and(|ch| ch.is_ascii_uppercase()) { - matched.as_str().to_string() - } else { - String::new() - } - }) - .to_string() -} - -/// 识别分辨率字段。 -fn parse_resource_pix(token: &str) -> Option { - if let Some(caps) = RESOURCE_PIX_RE.captures(token) { - for item in caps.iter().skip(1).flatten() { - let mut pix = item.as_str().to_lowercase(); - if pix.chars().all(|ch| ch.is_ascii_digit()) && !pix.ends_with(['k', 'p', 'i']) { - pix.push('p'); - } - return Some(pix); - } - } - RESOURCE_PIX_RE2 - .captures(token) - .and_then(|caps| caps.get(1).map(|item| item.as_str().to_lowercase())) -} - -/// 判断文本是否包含中日韩统一表意文字,等价于 Python StringUtils.is_chinese。 -fn contains_chinese(text: &str) -> bool { - text.chars() - .any(|ch| ('\u{4e00}'..='\u{9fff}').contains(&ch)) -} - -/// 判断 token 是否属于季集描述词。 -fn is_name_se_word(token: &str) -> bool { - matches!(token, "共" | "第" | "季" | "集" | "话" | "話" | "期") -} - -/// 判断 token 是否包含季集描述词。 -fn contains_name_se_word(token: &str) -> bool { - ["共", "第", "季", "集", "话", "話", "期"] - .iter() - .any(|word| token.contains(word)) -} - -/// 判断中文 token 是否表示剧场版/电影版,保留在中文名中。 -fn is_name_movie_word(token: &str) -> bool { - ["剧场版", "劇場版", "电影版", "電影版"] - .iter() - .any(|word| token.contains(word)) -} - -/// 判断罗马数字,覆盖 Python 原正则的合法罗马数字范围。 -fn is_roman_numeral(token: &str) -> bool { - let upper = token.to_uppercase(); - if upper.is_empty() - || !upper - .chars() - .all(|ch| matches!(ch, 'M' | 'D' | 'C' | 'L' | 'X' | 'V' | 'I')) - { - return false; - } - let mut prev = 0; - let mut total = 0; - for ch in upper.chars().rev() { - let value = match ch { - 'I' => 1, - 'V' => 5, - 'X' => 10, - 'L' => 50, - 'C' => 100, - 'D' => 500, - 'M' => 1000, - _ => 0, - }; - if value < prev { - total -= value; - } else { - total += value; - prev = value; - } - } - total > 0 -} - -/// 判断字符串是否以指定 ASCII 后缀结尾,忽略大小写。 -fn ends_with_ignore_ascii(value: &str, suffix: &str) -> bool { - value - .get(value.len().saturating_sub(suffix.len())..) - .is_some_and(|tail| tail.eq_ignore_ascii_case(suffix)) -} - -/// 用空格向可选字符串追加片段。 -fn append_with_space(target: &mut Option, value: &str) { - if value.is_empty() { - return; - } - match target { - Some(existing) if !existing.is_empty() => { - existing.push(' '); - existing.push_str(value); - } - _ => *target = Some(value.to_string()), - } -} - -/// 原样追加片段,用于保留 Python 中 Season 标题后置空格的兼容行为。 -fn append_raw(target: &mut Option, value: &str) { - if let Some(existing) = target { - existing.push_str(value); - } -} - -/// 识别常见流媒体平台简称和全称。 -fn streaming_platform_name(token: &str) -> Option<&'static str> { - let normalized = token.to_uppercase(); - match normalized.as_str() { - "AMZN" | "AMAZON" => Some("Amazon"), - "NF" | "NETFLIX" => Some("Netflix"), - "ATVP" | "APPLE TV+" | "APPLE-TV+" => Some("Apple TV+"), - "DSNP" | "DISNEY+" | "DISNEY" => Some("Disney+"), - "HMAX" | "MAX" => Some("Max"), - "HULU" => Some("Hulu Networks"), - "PMTP" | "PARAMOUNT+" | "PARAMOUNT" => Some("Paramount+"), - "PCOK" | "PEACOCK" => Some("Peacock"), - "B-GLOBAL" | "BG" => Some("B-Global"), - "BAHA" => Some("Baha"), - "CR" | "CRUNCHYROLL" => Some("Crunchyroll"), - "VIU" => Some("Viu"), - "ITUNES" | "IT" => Some("iTunes"), - "YOUTUBE" | "YT" => Some("YouTube"), - "ROKU" => Some("Roku"), - "PLEX" => Some("Plex"), - "STAN" => Some("Stan"), - _ => None, - } -} - -/// 识别片源和效果字段,并保留 BluRay/WEB-DL/REMUX 等组合语义。 - -fn normalize_source(token: &str) -> String { - if token.eq_ignore_ascii_case("BLURAY") { - return "BluRay".to_string(); - } - if token.eq_ignore_ascii_case("WEBDL") { - return "WEB-DL".to_string(); - } - token.to_string() -} - -fn parse_video_bit(token: &str) -> Option { - VIDEO_BIT_RE - .captures(token) - .and_then(|caps| caps.get(1).map(|value| format!("{}bit", value.as_str()))) -} - -/// 标准化视频编码捕获结果,保持 Python MetaVideo 的大小写兼容行为。 -fn normalize_video_encode_capture(caps: ®ex::Captures<'_>) -> Option { - let value = caps.get(0)?.as_str(); - if value.starts_with('x') || value.starts_with('X') { - return Some(value.to_lowercase()); - } - Some(value.to_uppercase()) -} - -impl VideoParseState { - /// 创建影视标题主解析状态机,保持 Python MetaVideo 原有字段和中间状态语义。 - fn new(tokens: Vec, isfile: bool, media_exts: Vec) -> Self { - Self { - tokens, - media_exts: media_exts - .into_iter() - .map(|item| item.to_lowercase()) - .collect(), - isfile, - cn_name: None, - en_name: None, - year: None, - total_season: 0, - begin_season: None, - end_season: None, - total_episode: 0, - begin_episode: None, - end_episode: None, - part: None, - source: String::new(), - effects: Vec::new(), - resource_pix: None, - web_source: None, - video_encode: None, - video_bit: None, - audio_encode: None, - fps: None, - media_type: None, - stop_name_flag: false, - stop_cnname_flag: false, - last_token: String::new(), - last_token_type: String::new(), - continue_flag: true, - unknown_name_str: String::new(), - index: 0, - } - } - - /// 执行单轮 token 扫描,迁移 Python 侧 MetaVideo.__init__ 的主循环。 - fn parse(&mut self) { - while self.index < self.tokens.len() { - let token = self.tokens[self.index].clone(); - self.index += 1; - self.parse_part(&token); - if self.continue_flag { - self.parse_name(&token); - } - if self.continue_flag { - self.parse_year(&token); - } - if self.continue_flag { - self.parse_resource_pix(&token); - } - if self.continue_flag { - self.parse_season(&token); - } - if self.continue_flag { - self.parse_episode(&token); - } - if self.continue_flag { - self.parse_resource_type(&token); - } - if self.continue_flag { - self.parse_streaming_platform(&token); - } - if self.continue_flag { - self.parse_video_encode(&token); - } - if self.continue_flag { - self.parse_video_bit(&token); - } - if self.continue_flag { - self.parse_audio_encode(&token); - } - if self.continue_flag { - self.parse_fps(&token); - } - self.continue_flag = true; - } - } - - /// 返回当前是否已经识别到任意名称。 - fn has_name(&self) -> bool { - self.cn_name - .as_deref() - .is_some_and(|value| !value.is_empty()) - || self - .en_name - .as_deref() - .is_some_and(|value| !value.is_empty()) - } - - /// 识别标题中的 Part/CD/DVD/Disc 信息。 - fn parse_part(&mut self, token: &str) { - if !self.has_name() - || (self.year.is_none() - && self.begin_season.is_none() - && self.begin_episode.is_none() - && self.resource_pix.is_none() - && self.source.is_empty()) - { - return; - } - let Some(caps) = PART_RE.captures(token) else { - return; - }; - if self.part.is_none() { - self.part = caps.get(1).map(|value| value.as_str().to_string()); - } - if let Some(next_value) = self.tokens.get(self.index) { - let upper = next_value.to_uppercase(); - let next_is_part_suffix = (next_value.chars().all(|ch| ch.is_ascii_digit()) - && (next_value.len() == 1 - || (next_value.len() == 2 && next_value.starts_with('0')))) - || matches!(upper.as_str(), "A" | "B" | "C" | "I" | "II" | "III"); - if next_is_part_suffix { - if let Some(part) = &mut self.part { - part.push_str(next_value); - } - self.index += 1; - } - } - self.last_token_type = "part".to_string(); - self.continue_flag = false; - } - - /// 识别中文名和英文名,保持原有停止名称消费的规则。 - fn parse_name(&mut self, token: &str) { - if token.is_empty() { - return; - } - if !self.unknown_name_str.is_empty() { - if self.cn_name.as_deref().unwrap_or("").is_empty() { - if self.en_name.as_deref().unwrap_or("").is_empty() { - self.en_name = Some(self.unknown_name_str.clone()); - } else if Some(self.unknown_name_str.as_str()) != self.year.as_deref() { - append_with_space(&mut self.en_name, &self.unknown_name_str); - } - self.last_token_type = "enname".to_string(); - } - self.unknown_name_str.clear(); - } - if self.stop_name_flag { - return; - } - if token.eq_ignore_ascii_case("AKA") { - self.continue_flag = false; - self.stop_name_flag = true; - return; - } - if is_name_se_word(token) { - self.last_token_type = "name_se_words".to_string(); - return; - } - if contains_chinese(token) { - self.last_token_type = "cnname".to_string(); - if self.cn_name.as_deref().unwrap_or("").is_empty() { - self.cn_name = Some(token.to_string()); - } else if !self.stop_cnname_flag { - if is_name_movie_word(token) - || (!NAME_NO_CHINESE_RE.is_match(token) && !contains_name_se_word(token)) - { - append_with_space(&mut self.cn_name, token); - } - self.stop_cnname_flag = true; - } - return; - } - - let roman_digit = is_roman_numeral(token); - if token.chars().all(|ch| ch.is_ascii_digit()) || roman_digit { - if self.last_token_type == "name_se_words" { - return; - } - if self.has_name() { - if token.starts_with('0') { - return; - } - if token.chars().all(|ch| ch.is_ascii_digit()) - && self.last_token_type == "cnname" - && token.parse::().ok().is_some_and(|value| value < 1900) - { - return; - } - if (token.chars().all(|ch| ch.is_ascii_digit()) && token.len() < 4) || roman_digit { - if self.last_token_type == "cnname" { - append_with_space(&mut self.cn_name, token); - } else if self.last_token_type == "enname" { - append_with_space(&mut self.en_name, token); - } - self.continue_flag = false; - } else if token.chars().all(|ch| ch.is_ascii_digit()) - && token.len() == 4 - && self.unknown_name_str.is_empty() - { - self.unknown_name_str = token.to_string(); - } - } else if self.unknown_name_str.is_empty() { - self.unknown_name_str = token.to_string(); - } - } else if SEASON_RE.is_match(token) { - if self - .en_name - .as_deref() - .is_some_and(|name| ends_with_ignore_ascii(name, "SEASON")) - { - append_raw(&mut self.en_name, " "); - } - self.stop_name_flag = true; - } else if EPISODE_RE.is_match(token) - || SOURCE_RE.is_match(token) - || EFFECT_RE.is_match(token) - || RESOURCE_PIX_RE.is_match(token) - { - self.stop_name_flag = true; - } else { - if self.is_media_ext(token) { - return; - } - append_with_space(&mut self.en_name, token); - self.last_token_type = "enname".to_string(); - } - } - - /// 识别年份;识别到年份后停止后续名称消费。 - fn parse_year(&mut self, token: &str) { - if !self.has_name() - || !token.chars().all(|ch| ch.is_ascii_digit()) - || token.len() != 4 - || !token - .parse::() - .ok() - .is_some_and(|value| value > 1900 && value < 2050) - { - return; - } - if let Some(existing_year) = self.year.clone() { - if self.en_name.as_deref().is_some_and(|name| !name.is_empty()) { - append_with_space(&mut self.en_name, &existing_year); - } else if self.cn_name.as_deref().is_some_and(|name| !name.is_empty()) { - append_with_space(&mut self.cn_name, &existing_year); - } - } else if self - .en_name - .as_deref() - .is_some_and(|name| ends_with_ignore_ascii(name, "SEASON")) - { - append_raw(&mut self.en_name, " "); - } - self.year = Some(token.to_string()); - self.last_token_type = "year".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - } - - /// 识别分辨率。 - fn parse_resource_pix(&mut self, token: &str) { - if !self.has_name() { - return; - } - if let Some(pix) = parse_resource_pix(token) { - self.last_token_type = "pix".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - if self.resource_pix.is_none() { - self.resource_pix = Some(pix); - } - } - } - - /// 识别季信息并计算季总数。 - fn parse_season(&mut self, token: &str) { - let seasons = capture_all_i64(&SEASON_RE, token); - if !seasons.is_empty() { - self.last_token_type = "season".to_string(); - self.media_type = Some("tv".to_string()); - self.stop_name_flag = true; - self.continue_flag = true; - for season in seasons { - if self.begin_season.is_none() { - self.begin_season = Some(season); - self.total_season = 1; - } else if Some(season) > self.begin_season { - self.end_season = Some(season); - self.total_season = season - self.begin_season.unwrap_or(season) + 1; - if self.isfile && self.total_season > 1 { - self.end_season = None; - self.total_season = 1; - } - } - } - return; - } - if token.chars().all(|ch| ch.is_ascii_digit()) { - if self.last_token_type == "SEASON" && self.begin_season.is_none() && token.len() < 3 { - if let Ok(season) = token.parse::() { - self.begin_season = Some(season); - self.total_season = 1; - self.last_token_type = "season".to_string(); - self.stop_name_flag = true; - self.continue_flag = false; - self.media_type = Some("tv".to_string()); - } - } - } else if token.eq_ignore_ascii_case("SEASON") && self.begin_season.is_none() { - self.last_token_type = "SEASON".to_string(); - } else if self.media_type.as_deref() == Some("tv") && self.begin_season.is_none() { - self.begin_season = Some(1); - } - } - - /// 识别集信息并计算集总数。 - fn parse_episode(&mut self, token: &str) { - let episodes = capture_all_i64(&EPISODE_RE, token); - if !episodes.is_empty() { - self.last_token_type = "episode".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - self.media_type = Some("tv".to_string()); - for episode in episodes { - if self.begin_episode.is_none() { - self.begin_episode = Some(episode); - self.total_episode = 1; - } else if Some(episode) > self.begin_episode { - self.end_episode = Some(episode); - self.total_episode = episode - self.begin_episode.unwrap_or(episode) + 1; - if self.isfile && self.total_episode > 2 { - self.end_episode = None; - self.total_episode = 1; - } - } - } - return; - } - if token.chars().all(|ch| ch.is_ascii_digit()) { - let Ok(episode) = token.parse::() else { - return; - }; - if self.begin_episode.is_some() - && self.end_episode.is_none() - && token.len() < 5 - && Some(episode) > self.begin_episode - && self.last_token_type == "episode" - { - self.end_episode = Some(episode); - self.total_episode = episode - self.begin_episode.unwrap_or(episode) + 1; - if self.isfile && self.total_episode > 2 { - self.end_episode = None; - self.total_episode = 1; - } - self.continue_flag = false; - self.media_type = Some("tv".to_string()); - } else if self.begin_episode.is_none() - && token.len() > 1 - && token.len() < 4 - && self.last_token_type != "year" - && self.last_token_type != "videoencode" - && token != self.unknown_name_str - { - self.begin_episode = Some(episode); - self.total_episode = 1; - self.last_token_type = "episode".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - self.media_type = Some("tv".to_string()); - } else if self.last_token_type == "EPISODE" - && self.begin_episode.is_none() - && token.len() < 5 - { - self.begin_episode = Some(episode); - self.total_episode = 1; - self.last_token_type = "episode".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - self.media_type = Some("tv".to_string()); - } - } else if token.eq_ignore_ascii_case("EPISODE") { - self.last_token_type = "EPISODE".to_string(); - } - } - - /// 识别片源和效果字段。 - fn parse_resource_type(&mut self, token: &str) { - if !self.has_name() { - return; - } - let upper = token.to_uppercase(); - if upper == "DL" && self.last_token_type == "source" && self.last_token == "WEB" { - self.source = "WEB-DL".to_string(); - self.continue_flag = false; - return; - } - if upper == "RAY" && self.last_token_type == "source" && self.last_token == "BLU" { - self.source = if self.source == "UHD" { - "UHD BluRay".to_string() - } else { - "BluRay".to_string() - }; - self.continue_flag = false; - return; - } - if upper == "WEBDL" { - self.source = "WEB-DL".to_string(); - self.continue_flag = false; - return; - } - if upper == "REMUX" && self.source == "BluRay" { - self.source = "BluRay REMUX".to_string(); - self.continue_flag = false; - return; - } - if upper == "BLURAY" && self.source == "UHD" { - self.source = "UHD BluRay".to_string(); - self.continue_flag = false; - return; - } - if SOURCE_RE.is_match(token) { - self.last_token_type = "source".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - if self.source.is_empty() { - self.source = normalize_source(token); - self.last_token = self.source.to_uppercase(); - } - return; - } - if EFFECT_RE.is_match(token) { - self.last_token_type = "effect".to_string(); - self.continue_flag = false; - self.stop_name_flag = true; - if !self - .effects - .iter() - .any(|effect| effect.eq_ignore_ascii_case(token)) - { - self.effects.push(token.to_string()); - } - self.last_token = upper; - } - } - - /// 识别常见流媒体平台简称。 - fn parse_streaming_platform(&mut self, token: &str) { - if !self.has_name() { - return; - } - let mut platform_name = streaming_platform_name(token); - let mut query_range = 1usize; - if platform_name.is_none() { - let prev_token = if self.index >= 2 { - self.tokens.get(self.index - 2) - } else { - None - }; - let next_token = self.tokens.get(self.index); - for (adjacent_token, is_next) in [(prev_token, false), (next_token, true)] { - if adjacent_token.is_none() || platform_name.is_some() { - continue; - } - let adjacent_token = adjacent_token.unwrap(); - for separator in [" ", "-"] { - let combined = if is_next { - format!("{token}{separator}{adjacent_token}") - } else { - format!("{adjacent_token}{separator}{token}") - }; - if let Some(name) = streaming_platform_name(&combined) { - platform_name = Some(name); - query_range = 2; - if is_next { - self.index += 1; - } - break; - } - } - } - } - let Some(platform_name) = platform_name else { - return; - }; - let match_start = self.index.saturating_sub(query_range + 1); - let match_end = self.index.saturating_sub(1); - let start = match_start.saturating_sub(query_range); - let end = (match_end + 1 + query_range).min(self.tokens.len()); - if self.tokens[start..end].iter().any(|item| { - matches!( - item.to_uppercase().as_str(), - "WEB" | "DL" | "WEBDL" | "WEBRIP" - ) - }) { - self.web_source = Some(platform_name.to_string()); - self.continue_flag = false; - } - } - - /// 识别视频编码。 - fn parse_video_encode(&mut self, token: &str) { - if !self.has_name() - || (self.year.is_none() - && self.resource_pix.is_none() - && self.source.is_empty() - && self.begin_season.is_none() - && self.begin_episode.is_none()) - { - return; - } - if let Some(caps) = VIDEO_ENCODE_RE.captures(token) { - self.continue_flag = false; - self.stop_name_flag = true; - self.last_token_type = "videoencode".to_string(); - if self.video_encode.is_none() { - let encode = normalize_video_encode_capture(&caps); - if let Some(encode) = encode { - self.last_token = encode.clone(); - self.video_encode = Some(encode); - } - } else if self.video_encode.as_deref() == Some("10bit") { - if let Some(encode) = normalize_video_encode_capture(&caps) { - self.video_encode = Some(format!("{encode} 10bit")); - self.last_token = encode; - } - } - return; - } - let upper = token.to_uppercase(); - if upper == "H" || upper == "X" { - self.continue_flag = false; - self.stop_name_flag = true; - self.last_token_type = "videoencode".to_string(); - self.last_token = if upper == "H" { - upper - } else { - token.to_lowercase() - }; - } else if matches!(token, "264" | "265") - && self.last_token_type == "videoencode" - && matches!(self.last_token.as_str(), "H" | "x") - { - self.video_encode = Some(format!("{}{}", self.last_token, token)); - } else if token.chars().all(|ch| ch.is_ascii_digit()) - && self.last_token_type == "videoencode" - && matches!(self.last_token.as_str(), "VC" | "MPEG") - { - self.video_encode = Some(format!("{}{}", self.last_token, token)); - } else if upper == "10BIT" { - self.last_token_type = "videoencode".to_string(); - if let Some(existing) = &mut self.video_encode { - *existing = format!("{existing} 10bit"); - } else { - self.video_encode = Some("10bit".to_string()); - } - } - } - - /// 识别视频位深字段。 - fn parse_video_bit(&mut self, token: &str) { - if !self.has_name() - || (self.year.is_none() - && self.resource_pix.is_none() - && self.source.is_empty() - && self.begin_season.is_none() - && self.begin_episode.is_none()) - { - return; - } - let Some(video_bit) = parse_video_bit(token) else { - return; - }; - self.continue_flag = false; - self.stop_name_flag = true; - self.last_token_type = "videobit".to_string(); - if self.video_bit.is_none() { - self.video_bit = Some(video_bit); - } - } - - /// 识别音频编码并合并 5.1、DTS-HD MA 等组合。 - fn parse_audio_encode(&mut self, token: &str) { - if !self.has_name() - || (self.year.is_none() - && self.resource_pix.is_none() - && self.source.is_empty() - && self.begin_season.is_none() - && self.begin_episode.is_none()) - { - return; - } - if AUDIO_ENCODE_RE.is_match(token) { - self.continue_flag = false; - self.stop_name_flag = true; - self.last_token_type = "audioencode".to_string(); - self.last_token = token.to_uppercase(); - if let Some(existing) = &mut self.audio_encode { - if existing.eq_ignore_ascii_case("DTS") { - *existing = format!("{existing}-{token}"); - } else { - *existing = format!("{existing} {token}"); - } - } else { - self.audio_encode = Some(token.to_string()); - } - } else if is_digit_token(token) && self.last_token_type == "audioencode" { - if let Some(existing) = &mut self.audio_encode { - if is_digit_token(&self.last_token) { - *existing = format!("{existing}.{token}"); - } else if existing - .chars() - .last() - .is_some_and(|ch| ch.is_ascii_digit()) - { - let split_at = existing.len() - 1; - *existing = format!( - "{} {}.{token}", - &existing[..split_at], - &existing[split_at..] - ); - } else { - *existing = format!("{existing} {token}"); - } - } - self.last_token = token.to_string(); - } - } - - /// 识别 FPS 数值。 - fn parse_fps(&mut self, token: &str) { - if !self.has_name() { - return; - } - let Some(fps) = FPS_RE - .captures(token) - .and_then(|caps| caps.get(1)) - .and_then(|value| value.as_str().parse::().ok()) - else { - return; - }; - self.continue_flag = false; - self.stop_name_flag = true; - self.last_token_type = "fps".to_string(); - self.fps = Some(fps); - self.last_token = format!("{fps}FPS"); - } - - /// 判断 token 是否是配置里的媒体后缀,防止文件扩展名进入标题。 - fn is_media_ext(&self, token: &str) -> bool { - let suffix = format!(".{}", token.to_lowercase()); - self.media_exts.iter().any(|item| item == &suffix) - } -} - -/// 判断 token 是否全部为 Unicode 数字,兼容 Python str.isdigit 的行为。 -fn is_digit_token(token: &str) -> bool { - !token.is_empty() && token.chars().all(|ch| ch.is_numeric()) -} diff --git a/rust/moviepilot_rust/src/rss.rs b/rust/moviepilot_rust/src/rss.rs deleted file mode 100644 index f633fdbc..00000000 --- a/rust/moviepilot_rust/src/rss.rs +++ /dev/null @@ -1,204 +0,0 @@ -use pyo3::prelude::*; -use pyo3::types::PyDict; -use quick_xml::events::{BytesStart, Event}; -use quick_xml::name::QName; -use quick_xml::Reader; - -#[derive(Default)] -struct RssItem { - title: String, - description: String, - link: String, - enclosure: String, - size: i64, - pubdate: String, - nickname: String, -} - -impl RssItem { - fn has_output(&self) -> bool { - !self.title.is_empty() && (!self.enclosure.is_empty() || !self.link.is_empty()) - } -} - -#[derive(Clone, Copy, PartialEq, Eq)] -enum TextField { - Title, - Description, - Link, - Pubdate, - Nickname, -} - -/// 批量解析 RSS/Atom 条目,返回 Python 侧后续处理需要的核心字段。 -#[pyfunction] -pub(crate) fn parse_rss_items_fast( - py: Python<'_>, - xml_text: &str, - max_items: usize, -) -> PyResult>> { - let mut reader = Reader::from_str(xml_text); - reader.config_mut().trim_text(true); - - let mut items = Vec::new(); - let mut current_item: Option = None; - let mut current_field: Option = None; - let mut item_depth = 0usize; - let mut parse_failed = false; - - loop { - match reader.read_event() { - Ok(Event::Start(event)) => { - let local = local_name(event.name()); - if current_item.is_none() && (local == "item" || local == "entry") { - current_item = Some(RssItem::default()); - current_field = None; - item_depth = 1; - continue; - } - if let Some(item) = current_item.as_mut() { - item_depth += 1; - match local.as_str() { - "title" => current_field = Some(TextField::Title), - "description" | "summary" => current_field = Some(TextField::Description), - "pubDate" | "published" | "updated" => current_field = Some(TextField::Pubdate), - "creator" => current_field = Some(TextField::Nickname), - "link" => { - current_field = Some(TextField::Link); - if item.link.is_empty() { - if let Some(href) = attr_value(&event, QName(b"href")) { - item.link = href; - } - } - } - "enclosure" => { - if let Some(url) = attr_value(&event, QName(b"url")) { - item.enclosure = url; - } - if let Some(length) = attr_value(&event, QName(b"length")) { - item.size = length.parse::().unwrap_or(0); - } - } - _ => {} - } - } - } - Ok(Event::Empty(event)) => { - if let Some(item) = current_item.as_mut() { - match local_name(event.name()).as_str() { - "link" => { - if item.link.is_empty() { - if let Some(href) = attr_value(&event, QName(b"href")) { - item.link = href; - } - } - } - "enclosure" => { - if let Some(url) = attr_value(&event, QName(b"url")) { - item.enclosure = url; - } - if let Some(length) = attr_value(&event, QName(b"length")) { - item.size = length.parse::().unwrap_or(0); - } - } - _ => {} - } - } - } - Ok(Event::Text(event)) => { - if let (Some(item), Some(field)) = (current_item.as_mut(), current_field) { - if let Ok(value) = event.decode() { - append_field(item, field, value.as_ref()); - } - } - } - Ok(Event::CData(event)) => { - if let (Some(item), Some(field)) = (current_item.as_mut(), current_field) { - if let Ok(value) = event.decode() { - append_field(item, field, value.as_ref()); - } - } - } - Ok(Event::End(event)) => { - if current_item.is_some() { - let local = local_name(event.name()); - if local == "item" || local == "entry" { - let mut item = current_item.take().unwrap_or_default(); - if item.enclosure.is_empty() && !item.link.is_empty() { - item.enclosure = item.link.clone(); - } - if item.has_output() { - items.push(item_to_py(py, &item)?.into_any().unbind()); - if items.len() >= max_items { - break; - } - } - current_field = None; - item_depth = 0; - } else { - item_depth = item_depth.saturating_sub(1); - if item_depth <= 1 { - current_field = None; - } - } - } - } - Ok(Event::Eof) => break, - Err(_) => { - parse_failed = true; - break; - } - _ => {} - } - } - - if parse_failed && items.is_empty() { - Ok(None) - } else { - Ok(Some(items)) - } -} - -/// 将内部 RSS 条目结构转换为 Python 字典。 -fn item_to_py<'py>(py: Python<'py>, item: &RssItem) -> PyResult> { - let dict = PyDict::new(py); - dict.set_item("title", item.title.trim())?; - dict.set_item("enclosure", item.enclosure.trim())?; - dict.set_item("size", item.size)?; - dict.set_item("description", item.description.trim())?; - dict.set_item("link", item.link.trim())?; - dict.set_item("pubdate_raw", item.pubdate.trim())?; - if !item.nickname.trim().is_empty() { - dict.set_item("nickname", item.nickname.trim())?; - } - Ok(dict) -} - -/// 返回 XML 名称去掉命名空间前缀后的本地名称。 -fn local_name(name: QName<'_>) -> String { - let raw = name.as_ref(); - let local = raw.rsplit(|byte| *byte == b':').next().unwrap_or(raw); - std::str::from_utf8(local).unwrap_or("").to_string() -} - -/// 读取 XML 节点属性并完成实体反转义。 -fn attr_value(event: &BytesStart<'_>, name: QName<'_>) -> Option { - event - .try_get_attribute(name) - .ok() - .flatten() - .and_then(|attr| attr.decode_and_unescape_value(event.decoder()).ok()) - .map(|value| value.into_owned()) -} - -/// 追加当前文本节点到对应 RSS 字段。 -fn append_field(item: &mut RssItem, field: TextField, value: &str) { - let target = match field { - TextField::Title => &mut item.title, - TextField::Description => &mut item.description, - TextField::Link => &mut item.link, - TextField::Pubdate => &mut item.pubdate, - TextField::Nickname => &mut item.nickname, - }; - target.push_str(value); -} diff --git a/rust/moviepilot_rust/src/utils.rs b/rust/moviepilot_rust/src/utils.rs deleted file mode 100644 index 083d79d8..00000000 --- a/rust/moviepilot_rust/src/utils.rs +++ /dev/null @@ -1,103 +0,0 @@ -use pyo3::exceptions::PyValueError; -use pyo3::prelude::*; -use pyo3::types::{PyAny, PyDict}; -use regex::Regex; - -/// 捕获正则第一组并转换为整数。 -pub(crate) fn capture_i64(regex: &Regex, text: &str) -> Option { - regex - .captures(text) - .and_then(|caps| caps.get(1)) - .and_then(|value| value.as_str().parse::().ok()) -} - -/// 捕获正则所有分组中的整数,用于 S01E02 和范围类 token 的多值识别。 -pub(crate) fn capture_all_i64(regex: &Regex, text: &str) -> Vec { - let mut values = Vec::new(); - for caps in regex.captures_iter(text) { - for item in caps.iter().skip(1).flatten() { - if let Ok(value) = item.as_str().parse::() { - values.push(value); - break; - } - } - } - values -} - -/// 计算范围的开始、结束和总数,保持 Python 侧的倒序交换语义。 -pub(crate) fn apply_range_total( - mut begin: Option, - mut end: Option, -) -> (Option, Option, Option) { - let total = match (begin, end) { - (Some(begin_value), Some(end_value)) => { - if begin_value > end_value { - begin = Some(end_value); - end = Some(begin_value); - Some(begin_value - end_value + 1) - } else { - Some(end_value - begin_value + 1) - } - } - (Some(_), None) => Some(1), - _ => None, - }; - (begin, end, total) -} - -/// 将 Python 对象转换为 usize,用于过滤器下标。 -pub(crate) fn py_i64_to_usize(value: &Bound<'_, PyAny>) -> PyResult { - let index = value.extract::()?; - if index < 0 { - return Err(PyValueError::new_err("下标不能为负数")); - } - Ok(index as usize) -} - -/// 从 Python 字典读取可选字符串。 -pub(crate) fn get_optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { - let Some(value) = dict.get_item(key)? else { - return Ok(None); - }; - if value.is_none() { - return Ok(None); - } - Ok(Some(value.str()?.to_str()?.to_string())) -} - -/// 从 Python 字典读取可选整数。 -pub(crate) fn get_optional_i64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { - let Some(value) = dict.get_item(key)? else { - return Ok(None); - }; - if value.is_none() { - return Ok(None); - } - if let Ok(parsed) = value.extract::() { - return Ok(Some(parsed)); - } - let text = value.str()?.to_str()?.trim().to_string(); - if text.is_empty() { - return Ok(None); - } - Ok(text.parse::().ok()) -} - -/// 从 Python 字典读取可选浮点数。 -pub(crate) fn get_optional_f64(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { - let Some(value) = dict.get_item(key)? else { - return Ok(None); - }; - if value.is_none() { - return Ok(None); - } - if let Ok(parsed) = value.extract::() { - return Ok(Some(parsed)); - } - let text = value.str()?.to_str()?.trim().to_string(); - if text.is_empty() { - return Ok(None); - } - Ok(text.parse::().ok()) -} diff --git a/tests/test_rust_accel.py b/tests/test_rust_accel.py index 972967f4..dc45f0ec 100644 --- a/tests/test_rust_accel.py +++ b/tests/test_rust_accel.py @@ -1,9 +1,5 @@ import pytest -from app.core.context import TorrentInfo -from app.modules.filter import FilterModule -from app.modules.indexer.spider import SiteSpider -from app.schemas.types import MediaType from app.utils import rust_accel @@ -13,445 +9,19 @@ pytestmark = pytest.mark.skipif( ) -def test_rust_metainfo_fast_path_extracts_emby_override(): +def test_rust_filter_rule_parser_matches_boolean_semantics(): """ - Rust 内嵌媒体标签识别应保持 Emby tmdbid 标签优先级。 + Rust 过滤规则解析应保持 pyparsing 的布尔表达式结构。 """ - title, metainfo = rust_accel.find_metainfo("Movie {[tmdbid=111;type=movies]} [tmdbid=222]") + result = rust_accel.parse_filter_rule("HDR & !BLU") - assert title == "Movie" - assert metainfo["tmdbid"] == "222" - assert metainfo["type"] == MediaType.MOVIE + assert result == [["HDR", "and", ["not", "BLU"]]] -def test_rust_video_title_fast_path_extracts_common_resource_fields(): +def test_rust_filter_rule_parser_handles_parentheses_and_or(): """ - Rust 影视标题预解析应能提取常见资源字段。 + Rust 过滤规则解析应保持括号、与、或的优先级语义。 """ - result = rust_accel.parse_video_title( - "The 355 2022 BluRay 1080p DTS-HD MA5.1 X265.10bit 60FPS" - ) + result = rust_accel.parse_filter_rule("CNSUB & (4K | 1080P) & !BLU") - assert result["year"] == "2022" - assert result["resource_pix"] == "1080p" - assert result["resource_type"] == "BluRay" - assert result["video_encode"] == "x265 10bit" - assert result["video_bit"] == "10bit" - assert result["fps"] == 60 - - -def test_rust_filter_fast_path_matches_priority_semantics(): - """ - Rust 批量过滤应保持优先级和布尔表达式语义。 - """ - module = FilterModule() - module.rule_set = { - "HDR": {"include": "HDR"}, - "DV": {"include": "DOVI"}, - "BLU": {"include": "BluRay"}, - } - torrents = [ - TorrentInfo(title="Movie HDR WEB-DL", description=""), - TorrentInfo(title="Movie DOVI", description=""), - TorrentInfo(title="Movie HDR BluRay", description=""), - ] - - result = module._FilterModule__filter_torrents_by_rust( # noqa: SLF001 - groups=[type("RuleGroup", (), {"rule_string": "HDR & !BLU > DV"})()], - torrent_list=torrents, - mediainfo=None, - ) - - assert result == torrents[:2] - assert result[0].pri_order == 100 - assert result[1].pri_order == 99 - - -def test_rust_indexer_search_url_keeps_existing_query_and_category(): - """ - Rust URL 生成应保留路径原有查询参数并应用分类参数。 - """ - spider = SiteSpider( - indexer={ - "id": "ttg", - "name": "TTG", - "domain": "https://totheglory.im/", - "search": { - "paths": [{"path": "browse.php?c=M"}], - "params": {"search_field": "{keyword}", "c": "M"}, - "imdbid_format": "imdb{imdbid_num}", - }, - "category": { - "field": "search_field", - "delimiter": " 分类:", - "movie": [{"id": "电影DVDRip", "cat": "Movies/SD"}], - }, - "torrents": {"list": {}, "fields": {}}, - }, - keyword="tt0049406", - mtype=MediaType.MOVIE, - ) - - search_url = spider._SiteSpider__get_search_url() # noqa: SLF001 - - assert search_url.count("?") == 1 - assert "c=M" in search_url - assert "search_field=imdb0049406" in search_url - - -def test_rust_rss_parser_extracts_common_rss_and_atom_fields(): - """ - Rust RSS 解析应同时覆盖 RSS item 和 Atom entry 的核心字段。 - """ - xml_text = """ - - - Example Torrent - - https://example.org/details/1 - - Tue, 19 May 2026 08:30:00 GMT - 豆瓣用户 - - - Atom Torrent - Atom Desc - - 2026-05-19T09:30:00Z - - - """ - - items = rust_accel.parse_rss_items(xml_text, 100) - - assert items[0]["title"] == "Example Torrent" - assert items[0]["enclosure"] == "https://example.org/download/1.torrent" - assert items[0]["size"] == 1024 - assert items[0]["nickname"] == "豆瓣用户" - assert items[1]["title"] == "Atom Torrent" - assert items[1]["enclosure"] == "https://example.org/atom/2" - - -def test_rust_indexer_page_parser_handles_common_fields(): - """ - Rust 普通 indexer 页面解析应批量提取列表行核心字段。 - """ - spider = SiteSpider( - indexer={ - "id": "demo", - "name": "Demo", - "domain": "https://example.org/", - "search": {"paths": [{"path": "torrents.php"}]}, - "category": { - "movie": [{"id": "401"}], - "tv": [{"id": "402"}], - }, - "torrents": { - "list": {"selector": "tr.torrent"}, - "fields": { - "title": {"selector": "a.title"}, - "description": {"selector": ".desc"}, - "details": {"selector": "a.title", "attribute": "href"}, - "download": {"selector": "a.dl", "attribute": "href"}, - "size": {"selector": ".size"}, - "seeders": {"selector": ".seeders"}, - "leechers": {"selector": ".leechers"}, - "grabs": {"selector": ".grabs"}, - "downloadvolumefactor": {"case": {".free": 0}}, - "uploadvolumefactor": {"selector": ".up"}, - "labels": {"selector": ".label"}, - "hr": {"selector": ".hr"}, - "category": {"selector": ".cat"}, - }, - }, - }, - ) - html = """ - - - - - - - - - -
Movie 2024 1080pBluRayDL1.5 GB1,2345/1042Free2xDIYHDRH&R401
- """ - - torrents = spider.parse(html) - - assert torrents == [{ - "title": "Movie 2024 1080p", - "description": "BluRay", - "page_url": "https://example.org/details/1", - "enclosure": "https://example.org/download/1", - "size": 1610612736, - "seeders": 1234, - "peers": 5, - "grabs": 42, - "downloadvolumefactor": 0, - "uploadvolumefactor": 2, - "labels": ["DIY", "HDR"], - "hit_and_run": True, - "category": MediaType.MOVIE.value, - }] - - -def test_rust_indexer_page_parser_renders_common_title_template(): - """ - Rust 普通 indexer 页面解析应兼容站点构建项目里的 title_optional 模板。 - """ - spider = SiteSpider( - indexer={ - "id": "demo", - "name": "Demo", - "domain": "https://example.org/", - "search": {"paths": [{"path": "torrents.php"}]}, - "torrents": { - "list": {"selector": "tr.torrent"}, - "fields": { - "title_default": {"selector": "a.title"}, - "title_optional": { - "selector": "a.title", - "attribute": "title", - "optional": True, - }, - "title": { - "text": ( - "{% if fields['title_optional'] %}" - "{{ fields['title_optional'] }}" - "{% else %}" - "{{ fields['title_default'] }}" - "{% endif %}" - ) - }, - "download": {"selector": "a.dl", "attribute": "href"}, - }, - }, - }, - ) - html = """ - - - - - - - - - -
Default NameDL
Default FallbackDL
- """ - - torrents = spider.parse(html) - - assert [item["title"] for item in torrents] == ["Optional Name", "Default Fallback"] - - -def test_rust_indexer_page_parser_renders_literal_title_template_without_default_field(): - """ - Rust 普通 indexer 页面解析应在没有 title_default 时渲染 title_optional 的纯文本兜底模板。 - """ - spider = SiteSpider( - indexer={ - "id": "demo", - "name": "Demo", - "domain": "https://example.org/", - "search": {"paths": [{"path": "torrents.php"}]}, - "torrents": { - "list": {"selector": "tr.torrent"}, - "fields": { - "title_optional": { - "selector": "a.title", - "attribute": "title", - "optional": True, - }, - "title": { - "text": ( - "{% if fields['title_optional'] %}" - "{{ fields['title_optional'] }}" - "{% else %}" - "For All Mankind S05 2019 2160p ATVP WEB-DL " - "DDP5.1 Atmos DV H 265-HHWEB [新]" - "{% endif %}" - ) - }, - "download": {"selector": "a.dl", "attribute": "href"}, - }, - }, - }, - ) - html = """ - - - - - -
IgnoredDL
- """ - - torrents = spider.parse(html) - - assert torrents == [{ - "title": "For All Mankind S05 2019 2160p ATVP WEB-DL DDP5.1 Atmos DV H 265-HHWEB [新]", - "enclosure": "https://example.org/download/1", - }] - - -def test_rust_indexer_page_parser_supports_agsvpt_selector_and_embedded_title_template(): - """ - Rust 普通 indexer 页面解析应兼容 AGSVPT 的 PyQuery 选择器和字段内嵌 Jinja 模板。 - """ - spider = SiteSpider( - indexer={ - "id": "agsvpt", - "name": "AGSVPT", - "domain": "https://www.agsvpt.com/", - "search": {"paths": [{"path": "torrents.php"}]}, - "torrents": { - "list": {"selector": 'table.torrents > tr:has("table.torrentname")'}, - "fields": { - "title_default": {"selector": 'a[href*="details.php?id="]'}, - "title_optional": { - "selector": 'a[title][href*="details.php?id="]', - "attribute": "title", - "optional": True, - }, - "title": { - "text": ( - "{% if fields['title_optional'] %}" - "{{ fields['title_optional'] }}" - "{% else %}" - "{{ fields['title_default'] }}" - "{% endif %}" - ) - }, - "details": { - "selector": 'a[href*="details.php?id="]', - "attribute": "href", - }, - "download": { - "selector": 'a[href*="download.php?id="]', - "attribute": "href", - }, - }, - }, - }, - ) - html = """ - - - - - -
- Ignored -
DL
- """ - - torrents = spider.parse(html) - - assert torrents == [{ - "title": "Release that Witch S01 2026 1080p WEB-DL H264 AAC-HHWEB", - "page_url": "https://www.agsvpt.com/details.php?id=1", - "enclosure": "https://www.agsvpt.com/download.php?id=1", - }] - - -def test_rust_indexer_page_parser_renders_common_description_templates(): - """ - Rust 普通 indexer 页面解析应兼容站点构建项目里的 description 字段模板。 - """ - spider = SiteSpider( - indexer={ - "id": "demo", - "name": "Demo", - "domain": "https://example.org/", - "search": {"paths": [{"path": "torrents.php"}]}, - "torrents": { - "list": {"selector": "tr.torrent"}, - "fields": { - "title": {"selector": "a.title"}, - "subject": {"selector": ".subject"}, - "tags": {"selector": ".tags"}, - "description": { - "text": ( - "{% if fields['tags']%}" - "{{ fields['subject']+' '+fields['tags'] }}" - "{% else %}" - "{{ fields['subject'] }}" - "{% endif %}" - ) - }, - "download": {"selector": "a.dl", "attribute": "href"}, - }, - }, - }, - ) - html = """ - - - - - - - - - -
Movie 2024BluRayHDRDL
Show 2025WEB-DLDL
- """ - - torrents = spider.parse(html) - - assert [item["description"] for item in torrents] == ["BluRay HDR", "WEB-DL"] - - -def test_rust_indexer_page_parser_supports_remove_and_negative_index(): - """ - Rust 普通 indexer 页面解析应兼容站点配置常用的 remove 和负索引。 - """ - spider = SiteSpider( - indexer={ - "id": "demo", - "name": "Demo", - "domain": "https://example.org/", - "search": {"paths": [{"path": "torrents.php"}]}, - "torrents": { - "list": {"selector": "tr.torrent"}, - "fields": { - "title": {"selector": ".name", "remove": "a,b"}, - "description": { - "selector": ".desc", - "remove": "span,a,img,font,b", - "contents": -1, - }, - "labels": { - "selector": ".labels > span", - "remove": "span,a,img,font,b", - "contents": -1, - }, - "download": {"selector": "a.dl", "attribute": "href"}, - }, - }, - }, - ) - html = """ - - - - - - - -
Movie删掉也删2024第一行 - 标签链接 - 第二行 - DIYHDRDL
- """ - - torrents = spider.parse(html) - - assert torrents[0]["title"] == "Movie2024" - assert torrents[0]["description"] == "第一行 第二行" - assert torrents[0]["labels"] == ["DIY", "HDR"] + assert result == [[["CNSUB", "and", ["4K", "or", "1080P"]], "and", ["not", "BLU"]]]