From 51229204c9a97886ff6f8a2e83930840a7de8a34 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Fri, 15 May 2026 16:55:42 +0800 Subject: [PATCH] perf: optimize torrent filtering --- app/helper/torrent.py | 62 +++++++++++++---- app/modules/filter/__init__.py | 123 ++++++++++++++++++++++++--------- tests/test_torrent_filter.py | 104 ++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+), 46 deletions(-) create mode 100644 tests/test_torrent_filter.py diff --git a/app/helper/torrent.py b/app/helper/torrent.py index 91ac4712..7d43d295 100644 --- a/app/helper/torrent.py +++ b/app/helper/torrent.py @@ -1,5 +1,6 @@ import datetime import re +from functools import lru_cache from pathlib import Path from typing import Tuple, Optional, List, Union, Dict, Any from urllib.parse import unquote @@ -19,6 +20,40 @@ from app.utils.http import RequestUtils from app.utils.string import StringUtils +_SIZE_UNIT = 1024 * 1024 + + +@lru_cache(maxsize=512) +def _compile_filter_pattern(pattern: str) -> re.Pattern: + """ + 编译订阅/工作流附加过滤正则。 + 用户输入沿用原本的正则语义,缓存只减少同一规则反复匹配大量种子时的编译成本。 + """ + return re.compile(r"%s" % pattern, re.I) + + +def _filter_pattern_search(pattern: Union[str, int, float], content: str) -> bool: + """ + 按原有字符串插值语义执行过滤正则匹配。 + """ + return bool(_compile_filter_pattern(str(pattern)).search(content)) + + +@lru_cache(maxsize=256) +def _parse_filter_size_range(size_range: str) -> Tuple[str, float, Optional[float]]: + """ + 解析附加过滤的大小范围,单位为 MB。 + """ + if size_range.find("-") != -1: + size_min, size_max = size_range.split("-") + return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT + if size_range.startswith(">"): + return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None + if size_range.startswith("<"): + return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT + return "unknown", 0, None + + class TorrentHelper: """ 种子帮助类 @@ -460,52 +495,48 @@ class TorrentHelper: # 包含 include = filter_params.get("include") if include: - if not re.search(r"%s" % include, content, re.I): + if not _filter_pattern_search(include, content): logger.info(f"{content} 不匹配包含规则 {include}") return False # 排除 exclude = filter_params.get("exclude") if exclude: - if re.search(r"%s" % exclude, content, re.I): + if _filter_pattern_search(exclude, content): logger.info(f"{content} 匹配排除规则 {exclude}") return False # 质量 quality = filter_params.get("quality") if quality: - if not re.search(r"%s" % quality, torrent_info.title, re.I): + if not _filter_pattern_search(quality, torrent_info.title): logger.info(f"{torrent_info.title} 不匹配质量规则 {quality}") return False # 分辨率 resolution = filter_params.get("resolution") if resolution: - if not re.search(r"%s" % resolution, torrent_info.title, re.I): + if not _filter_pattern_search(resolution, torrent_info.title): logger.info(f"{torrent_info.title} 不匹配分辨率规则 {resolution}") return False # 特效 effect = filter_params.get("effect") if effect: - if not re.search(r"%s" % effect, torrent_info.title, re.I): + if not _filter_pattern_search(effect, torrent_info.title): logger.info(f"{torrent_info.title} 不匹配特效规则 {effect}") return False # 大小 size_range = filter_params.get("size") if size_range: - if size_range.find("-") != -1: + size_rule, size_min, size_max = _parse_filter_size_range(size_range) + if size_rule == "between": # 区间 - size_min, size_max = size_range.split("-") - size_min = float(size_min.strip()) * 1024 * 1024 - size_max = float(size_max.strip()) * 1024 * 1024 if torrent_info.size < size_min or torrent_info.size > size_max: return False - elif size_range.startswith(">"): + elif size_rule == "gte": # 大于 - size_min = float(size_range[1:].strip()) * 1024 * 1024 if torrent_info.size < size_min: return False - elif size_range.startswith("<"): + elif size_rule == "lte": # 小于 - size_max = float(size_range[1:].strip()) * 1024 * 1024 if torrent_info.size > size_max: return False @@ -521,6 +552,7 @@ class TorrentHelper: """ # 匹配季 seasons = season_episodes.keys() + seasons_set = set(seasons) # 种子季 torrent_seasons = meta.season_list if not torrent_seasons: @@ -528,7 +560,7 @@ class TorrentHelper: torrent_seasons = [1] # 种子集 torrent_episodes = meta.episode_list - if not set(torrent_seasons).issubset(set(seasons)): + if not set(torrent_seasons).issubset(seasons_set): # 种子季不在过滤季中 logger.debug( f"种子 {torrent.site_name} - {torrent.title} 包含季 {torrent_seasons} 不是需要的季 {list(seasons)}") @@ -539,7 +571,7 @@ class TorrentHelper: if len(torrent_seasons) == 1: need_episodes = season_episodes.get(torrent_seasons[0]) if need_episodes \ - and not set(torrent_episodes).intersection(set(need_episodes)): + and not set(torrent_episodes).intersection(need_episodes): # 单季集没有交集的不要 logger.debug(f"种子 {torrent.site_name} - {torrent.title} " f"集 {torrent_episodes} 没有需要的集:{need_episodes}") diff --git a/app/modules/filter/__init__.py b/app/modules/filter/__init__.py index 786863e7..f6192835 100644 --- a/app/modules/filter/__init__.py +++ b/app/modules/filter/__init__.py @@ -1,11 +1,12 @@ +import re from copy import deepcopy +from functools import lru_cache from typing import List, Tuple, Union, Dict, Optional from app.core.context import TorrentInfo, MediaInfo +from app.core.metainfo import MetaInfo from app.helper.rule import RuleHelper from app.log import logger -import re -from app.core.metainfo import MetaInfo from app.modules import _ModuleBase from app.modules.filter.RuleParser import RuleParser from app.modules.filter.builtin_rules import BUILTIN_RULE_SET @@ -13,6 +14,51 @@ from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey from app.utils.string import StringUtils +_SIZE_UNIT = 1024 * 1024 + + +@lru_cache(maxsize=1024) +def _compile_ignorecase(pattern: str) -> re.Pattern: + """ + 编译过滤规则正则。 + 过滤规则在搜索/订阅中会被大量种子重复匹配,缓存编译结果能减少热路径开销; + 这里仍保留原有的 IGNORECASE 语义,非法正则也会像原来一样在匹配时抛出异常。 + """ + return re.compile(r"%s" % pattern, re.IGNORECASE) + + +def _regex_search(pattern: Union[str, int, float], content: str) -> bool: + """ + 按原有字符串插值语义执行正则匹配,同时复用已编译表达式。 + """ + return bool(_compile_ignorecase(str(pattern)).search(content)) + + +@lru_cache(maxsize=256) +def _parse_size_range(size_range: str) -> Tuple[str, float, Optional[float]]: + """ + 解析大小范围,单位为 MB。 + 返回值中的操作符只供本模块内部使用,避免每个种子重复拆分同一个规则。 + """ + size_range = size_range.strip() + if size_range.find("-") != -1: + size_min, size_max = size_range.split("-") + return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT + if size_range.startswith(">"): + return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None + if size_range.startswith("<"): + return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT + return "unknown", 0, None + + +@lru_cache(maxsize=256) +def _parse_publish_time(publish_time: str) -> Tuple[float, ...]: + """ + 解析发布时间规则,避免同一规则对大量种子反复转换 float。 + """ + return tuple(float(t) for t in publish_time.split("-")) + + class FilterModule(_ModuleBase): CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value} @@ -86,6 +132,9 @@ class FilterModule(_ModuleBase): if not rule_groups: return torrent_list parser = RuleParser() + # 同一轮过滤里,相同的优先级层级会被多个种子反复使用;按需解析并缓存, + # 既减少 pyparsing 开销,也保留原来“命中高优先级后不解析低层级”的容错行为。 + parsed_rule_cache: Dict[str, Union[list, str]] = {} # 查询规则表详情 groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups) if groups: @@ -97,21 +146,27 @@ class FilterModule(_ModuleBase): torrent_list=torrent_list, mediainfo=mediainfo, parser=parser, + parsed_rule_cache=parsed_rule_cache, ) return torrent_list def __filter_torrents(self, rule_string: str, rule_name: str, torrent_list: List[TorrentInfo], mediainfo: MediaInfo, - parser: RuleParser) -> List[TorrentInfo]: + parser: RuleParser, + parsed_rule_cache: Dict[str, Union[list, str]]) -> List[TorrentInfo]: """ 过滤种子 """ + if not torrent_list: + return [] + # 只拆分一次规则层级;具体层级仍延迟到真正需要匹配时解析。 + rule_groups = [rule_group.strip() for rule_group in rule_string.split('>')] # 返回种子列表 ret_torrents = [] for torrent in torrent_list: # 能命中优先级的才返回 - if not self.__get_order(torrent, rule_string, mediainfo, parser): + if not self.__get_order(torrent, rule_groups, mediainfo, parser, parsed_rule_cache): logger.debug(f"种子 {torrent.site_name} - {torrent.title} {torrent.description or ''} " f"不匹配 {rule_name} 过滤规则") continue @@ -119,13 +174,12 @@ class FilterModule(_ModuleBase): return ret_torrents - def __get_order(self, torrent: TorrentInfo, rule_str: str, - mediainfo: MediaInfo, parser: RuleParser) -> Optional[TorrentInfo]: + def __get_order(self, torrent: TorrentInfo, rule_groups: List[str], + mediainfo: MediaInfo, parser: RuleParser, + parsed_rule_cache: Dict[str, Union[list, str]]) -> Optional[TorrentInfo]: """ 获取种子匹配的规则优先级,值越大越优先,未匹配时返回None """ - # 多级规则 - rule_groups = rule_str.split('>') # 优先级 res_order = 100 # 是否匹配 @@ -133,8 +187,8 @@ class FilterModule(_ModuleBase): for rule_group in rule_groups: # 解析规则组 - parsed_group = parser.parse(rule_group.strip()) - if self.__match_group(torrent, parsed_group.as_list()[0], mediainfo): + parsed_group = self.__parse_rule_group(rule_group, parser, parsed_rule_cache) + if self.__match_group(torrent, parsed_group, mediainfo): # 出现匹配时中断 matched = True logger.debug(f"种子 {torrent.site_name} - {torrent.title} 优先级为 {100 - res_order + 1}") @@ -145,6 +199,17 @@ class FilterModule(_ModuleBase): return None if not matched else torrent + @staticmethod + def __parse_rule_group(rule_group: str, parser: RuleParser, + parsed_rule_cache: Dict[str, Union[list, str]]) -> Union[list, str]: + """ + 解析单个优先级层级。 + 缓存粒度放在层级表达式上,兼容多个规则组复用相同表达式的情况。 + """ + if rule_group not in parsed_rule_cache: + parsed_rule_cache[rule_group] = parser.parse(rule_group).as_list()[0] + return parsed_rule_cache[rule_group] + def __match_group(self, torrent: TorrentInfo, rule_group: Union[list, str], mediainfo: MediaInfo) -> Optional[bool]: """ @@ -173,12 +238,13 @@ class FilterModule(_ModuleBase): """ 判断种子是否匹配规则项 """ - if not self.rule_set.get(rule_name): + rule = self.rule_set.get(rule_name) + if not rule: # 规则不存在 logger.debug(f"规则 {rule_name} 不存在") return False # TMDB规则 - tmdb = self.rule_set[rule_name].get("tmdb") + tmdb = rule.get("tmdb") # 符合TMDB规则的直接返回True,即不过滤 if tmdb and self.__match_tmdb(tmdb, mediainfo): logger.debug(f"种子 {torrent.site_name} - {torrent.title} 符合 {rule_name} 的TMDB规则,匹配成功") @@ -187,7 +253,7 @@ class FilterModule(_ModuleBase): content = f"{torrent.title} {torrent.description} {' '.join(torrent.labels or [])}" # 只匹配指定关键字 match_content = [] - matchs = self.rule_set[rule_name].get("match") or [] + matchs = rule.get("match") or [] if matchs: for match in matchs: if not hasattr(torrent, match): @@ -202,27 +268,27 @@ class FilterModule(_ModuleBase): if match_content: content = " ".join(match_content) # 包含规则项 - includes = self.rule_set[rule_name].get("include") or [] + includes = rule.get("include") or [] if not isinstance(includes, list): includes = [includes] # 排除规则项 - excludes = self.rule_set[rule_name].get("exclude") or [] + excludes = rule.get("exclude") or [] if not isinstance(excludes, list): excludes = [excludes] # 大小范围规则项 - size_range = self.rule_set[rule_name].get("size_range") + size_range = rule.get("size_range") # 做种人数规则项 - seeders = self.rule_set[rule_name].get("seeders") + seeders = rule.get("seeders") # FREE规则 - downloadvolumefactor = self.rule_set[rule_name].get("downloadvolumefactor") + downloadvolumefactor = rule.get("downloadvolumefactor") # 发布时间规则 - pubdate: str = self.rule_set[rule_name].get("publish_time") - if includes and not any(re.search(r"%s" % include, content, re.IGNORECASE) for include in includes): + pubdate: str = rule.get("publish_time") + if includes and not any(_regex_search(include, content) for include in includes): # 未发现任何包含项 logger.debug(f"种子 {torrent.site_name} - {torrent.title} 不包含任何项 {includes}") return False for exclude in excludes: - if re.search(r"%s" % exclude, content, re.IGNORECASE): + if _regex_search(exclude, content): # 发现排除项 logger.debug(f"种子 {torrent.site_name} - {torrent.title} 包含 {exclude}") return False @@ -247,7 +313,7 @@ class FilterModule(_ModuleBase): # 种子发布时间 pub_minutes = torrent.pub_minutes() # 发布时间规则 - pub_times = [float(t) for t in pubdate.split("-")] + pub_times = _parse_publish_time(pubdate) if len(pub_times) == 1: # 发布时间小于规则 if pub_minutes < pub_times[0]: @@ -319,22 +385,17 @@ class FilterModule(_ModuleBase): # 每集大小 torrent_size = torrent.size / episode_count # 大小范围 - size_range = size_range.strip() - if size_range.find("-") != -1: + size_rule, size_min, size_max = _parse_size_range(size_range) + if size_rule == "between": # 区间 - size_min, size_max = size_range.split("-") - size_min = float(size_min.strip()) * 1024 * 1024 - size_max = float(size_max.strip()) * 1024 * 1024 if size_min <= torrent_size <= size_max: return True - elif size_range.startswith(">"): + elif size_rule == "gte": # 大于 - size_min = float(size_range[1:].strip()) * 1024 * 1024 if torrent_size >= size_min: return True - elif size_range.startswith("<"): + elif size_rule == "lte": # 小于 - size_max = float(size_range[1:].strip()) * 1024 * 1024 if torrent_size <= size_max: return True return False diff --git a/tests/test_torrent_filter.py b/tests/test_torrent_filter.py new file mode 100644 index 00000000..a7926bc7 --- /dev/null +++ b/tests/test_torrent_filter.py @@ -0,0 +1,104 @@ +import unittest +from types import SimpleNamespace + +from app.core.context import TorrentInfo +from app.helper.torrent import TorrentHelper +from app.modules.filter import FilterModule + + +class _RuleHelper: + """ + 过滤模块测试用的轻量规则仓库,避免依赖真实系统配置。 + """ + + def __init__(self, groups): + self._groups = groups + + def get_rule_group_by_media(self, media=None, group_names=None): # noqa: ARG002 + if not group_names: + return self._groups + return [group for group in self._groups if group.name in group_names] + + +def _build_filter_module(rule_string: str, rule_set: dict) -> FilterModule: + module = FilterModule() + module.rulehelper = _RuleHelper( + [SimpleNamespace(name="test", rule_string=rule_string)] + ) + module.rule_set = rule_set + return module + + +class TorrentFilterTest(unittest.TestCase): + + def test_filter_torrents_keeps_priority_and_boolean_rule_semantics(self): + module = _build_filter_module( + rule_string="HDR & !BLU > DV", + rule_set={ + "HDR": {"include": "HDR"}, + "DV": {"include": "DOVI"}, + "BLU": {"include": "BluRay"}, + }, + ) + torrents = [ + TorrentInfo(title="Movie HDR WEB-DL", description=""), + TorrentInfo(title="Movie DOVI", description=""), + TorrentInfo(title="Movie HDR BluRay", description=""), + ] + + filtered = module.filter_torrents(rule_groups=["test"], torrent_list=torrents) + + self.assertEqual(torrents[:2], filtered) + self.assertEqual(100, filtered[0].pri_order) + self.assertEqual(99, filtered[1].pri_order) + + def test_filter_torrents_keeps_lazy_priority_level_parsing(self): + module = _build_filter_module( + rule_string="KEEP > (", + rule_set={"KEEP": {"include": "Movie"}}, + ) + torrent = TorrentInfo(title="Movie", description="") + + filtered = module.filter_torrents(rule_groups=["test"], torrent_list=[torrent]) + + self.assertEqual([torrent], filtered) + self.assertEqual(100, torrent.pri_order) + + def test_filter_torrent_keeps_extra_filter_semantics(self): + torrent = TorrentInfo( + title="Movie 1080p HDR", + description="中字", + labels=["free"], + size=3 * 1024 * 1024 * 1024, + uploadvolumefactor=1, + downloadvolumefactor=0, + ) + + self.assertTrue( + TorrentHelper.filter_torrent( + torrent_info=torrent, + filter_params={ + "include": "中字|free", + "exclude": "BluRay", + "resolution": "1080p", + "effect": "HDR", + "size": "1000-4000", + }, + ) + ) + self.assertFalse( + TorrentHelper.filter_torrent( + torrent_info=torrent, + filter_params={"exclude": "HDR"}, + ) + ) + self.assertFalse( + TorrentHelper.filter_torrent( + torrent_info=torrent, + filter_params={"size": "<1000"}, + ) + ) + + +if __name__ == "__main__": + unittest.main()