From 51229204c9a97886ff6f8a2e83930840a7de8a34 Mon Sep 17 00:00:00 2001
From: jxxghp <jxxghp@gmail.com>
Date: Fri, 15 May 2026 16:55:42 +0800
Subject: [PATCH] perf: optimize torrent filtering

---
 app/helper/torrent.py          |  62 +++++++++++++----
 app/modules/filter/__init__.py | 123 ++++++++++++++++++++++++---------
 tests/test_torrent_filter.py   | 104 ++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+), 46 deletions(-)
 create mode 100644 tests/test_torrent_filter.py

diff --git a/app/helper/torrent.py b/app/helper/torrent.py
index 91ac4712..7d43d295 100644
--- a/app/helper/torrent.py
+++ b/app/helper/torrent.py
@@ -1,5 +1,6 @@
 import datetime
 import re
+from functools import lru_cache
 from pathlib import Path
 from typing import Tuple, Optional, List, Union, Dict, Any
 from urllib.parse import unquote
@@ -19,6 +20,40 @@ from app.utils.http import RequestUtils
 from app.utils.string import StringUtils
 
 
+_SIZE_UNIT = 1024 * 1024
+
+
+@lru_cache(maxsize=512)
+def _compile_filter_pattern(pattern: str) -> re.Pattern:
+    """
+    Compile a subscription/workflow extra-filter regex (case-insensitive).
+    User input keeps its original regex semantics; the cache only cuts the cost of recompiling one rule while it is matched against many torrents.
+    """
+    return re.compile(r"%s" % pattern, re.I)
+
+
+def _filter_pattern_search(pattern: Union[str, int, float], content: str) -> bool:
+    """
+    Search content with a filter regex, keeping the original string-interpolation semantics: the pattern is stringified first, and True is returned on any match.
+    """
+    return bool(_compile_filter_pattern(str(pattern)).search(content))
+
+
+@lru_cache(maxsize=256)
+def _parse_filter_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
+    """
+    Parse the extra-filter size range (numbers in MB) into (rule kind, min, max), with the parsed bounds multiplied by _SIZE_UNIT.
+    """
+    if size_range.find("-") != -1:
+        size_min, size_max = size_range.split("-")  # "a-b" form; unpacking raises ValueError unless there is exactly one "-"
+        return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT
+    if size_range.startswith(">"):
+        return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None  # ">n" form: lower bound only
+    if size_range.startswith("<"):
+        return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT  # "<n" form: upper bound only
+    return "unknown", 0, None  # none of the three recognised forms
+
+
 class TorrentHelper:
     """
     种子帮助类
@@ -460,52 +495,48 @@ class TorrentHelper:
         # 包含
         include = filter_params.get("include")
         if include:
-            if not re.search(r"%s" % include, content, re.I):
+            if not _filter_pattern_search(include, content):
                 logger.info(f"{content} 不匹配包含规则 {include}")
                 return False
         # 排除
         exclude = filter_params.get("exclude")
         if exclude:
-            if re.search(r"%s" % exclude, content, re.I):
+            if _filter_pattern_search(exclude, content):
                 logger.info(f"{content} 匹配排除规则 {exclude}")
                 return False
         # 质量
         quality = filter_params.get("quality")
         if quality:
-            if not re.search(r"%s" % quality, torrent_info.title, re.I):
+            if not _filter_pattern_search(quality, torrent_info.title):
                 logger.info(f"{torrent_info.title} 不匹配质量规则 {quality}")
                 return False
         # 分辨率
         resolution = filter_params.get("resolution")
         if resolution:
-            if not re.search(r"%s" % resolution, torrent_info.title, re.I):
+            if not _filter_pattern_search(resolution, torrent_info.title):
                 logger.info(f"{torrent_info.title} 不匹配分辨率规则 {resolution}")
                 return False
         # 特效
         effect = filter_params.get("effect")
         if effect:
-            if not re.search(r"%s" % effect, torrent_info.title, re.I):
+            if not _filter_pattern_search(effect, torrent_info.title):
                 logger.info(f"{torrent_info.title} 不匹配特效规则 {effect}")
                 return False
 
         # 大小
         size_range = filter_params.get("size")
         if size_range:
-            if size_range.find("-") != -1:
+            size_rule, size_min, size_max = _parse_filter_size_range(size_range)
+            if size_rule == "between":
                 # 区间
-                size_min, size_max = size_range.split("-")
-                size_min = float(size_min.strip()) * 1024 * 1024
-                size_max = float(size_max.strip()) * 1024 * 1024
                 if torrent_info.size < size_min or torrent_info.size > size_max:
                     return False
-            elif size_range.startswith(">"):
+            elif size_rule == "gte":
                 # 大于
-                size_min = float(size_range[1:].strip()) * 1024 * 1024
                 if torrent_info.size < size_min:
                     return False
-            elif size_range.startswith("<"):
+            elif size_rule == "lte":
                 # 小于
-                size_max = float(size_range[1:].strip()) * 1024 * 1024
                 if torrent_info.size > size_max:
                     return False
 
@@ -521,6 +552,7 @@ class TorrentHelper:
         """
         # 匹配季
         seasons = season_episodes.keys()
+        seasons_set = set(seasons)
         # 种子季
         torrent_seasons = meta.season_list
         if not torrent_seasons:
@@ -528,7 +560,7 @@ class TorrentHelper:
             torrent_seasons = [1]
         # 种子集
         torrent_episodes = meta.episode_list
-        if not set(torrent_seasons).issubset(set(seasons)):
+        if not set(torrent_seasons).issubset(seasons_set):
             # 种子季不在过滤季中
             logger.debug(
                 f"种子 {torrent.site_name} - {torrent.title} 包含季 {torrent_seasons} 不是需要的季 {list(seasons)}")
@@ -539,7 +571,7 @@ class TorrentHelper:
         if len(torrent_seasons) == 1:
             need_episodes = season_episodes.get(torrent_seasons[0])
             if need_episodes \
-                    and not set(torrent_episodes).intersection(set(need_episodes)):
+                    and not set(torrent_episodes).intersection(need_episodes):
                 # 单季集没有交集的不要
                 logger.debug(f"种子 {torrent.site_name} - {torrent.title} "
                              f"集 {torrent_episodes} 没有需要的集：{need_episodes}")
diff --git a/app/modules/filter/__init__.py b/app/modules/filter/__init__.py
index 786863e7..f6192835 100644
--- a/app/modules/filter/__init__.py
+++ b/app/modules/filter/__init__.py
@@ -1,11 +1,12 @@
+import re
 from copy import deepcopy
+from functools import lru_cache
 from typing import List, Tuple, Union, Dict, Optional
 
 from app.core.context import TorrentInfo, MediaInfo
+from app.core.metainfo import MetaInfo
 from app.helper.rule import RuleHelper
 from app.log import logger
-import re
-from app.core.metainfo import MetaInfo
 from app.modules import _ModuleBase
 from app.modules.filter.RuleParser import RuleParser
 from app.modules.filter.builtin_rules import BUILTIN_RULE_SET
@@ -13,6 +14,51 @@ from app.schemas.types import ModuleType, OtherModulesType, SystemConfigKey
 from app.utils.string import StringUtils
 
 
+_SIZE_UNIT = 1024 * 1024
+
+
+@lru_cache(maxsize=1024)
+def _compile_ignorecase(pattern: str) -> re.Pattern:
+    """
+    Compile a filter-rule regex.
+    Filter rules are matched repeatedly against many torrents during search/subscription, so caching the compiled result reduces hot-path overhead;
+    the original IGNORECASE semantics are kept, and an invalid regex still raises at match time as it did before.
+    """
+    return re.compile(r"%s" % pattern, re.IGNORECASE)
+
+
+def _regex_search(pattern: Union[str, int, float], content: str) -> bool:
+    """
+    Run a regex search with the original string-interpolation semantics while reusing the already-compiled expression; returns True on any match.
+    """
+    return bool(_compile_ignorecase(str(pattern)).search(content))
+
+
+@lru_cache(maxsize=256)
+def _parse_size_range(size_range: str) -> Tuple[str, float, Optional[float]]:
+    """
+    Parse a size range whose numbers are in MB into (operator, min, max), with the parsed bounds multiplied by _SIZE_UNIT.
+    The operator in the return value is for use inside this module only; caching avoids re-splitting the same rule for every torrent.
+    """
+    size_range = size_range.strip()
+    if size_range.find("-") != -1:
+        size_min, size_max = size_range.split("-")  # "a-b" form; unpacking raises ValueError unless there is exactly one "-"
+        return "between", float(size_min.strip()) * _SIZE_UNIT, float(size_max.strip()) * _SIZE_UNIT
+    if size_range.startswith(">"):
+        return "gte", float(size_range[1:].strip()) * _SIZE_UNIT, None  # ">n" form: lower bound only
+    if size_range.startswith("<"):
+        return "lte", 0, float(size_range[1:].strip()) * _SIZE_UNIT  # "<n" form: upper bound only
+    return "unknown", 0, None  # none of the three recognised forms
+
+
+@lru_cache(maxsize=256)
+def _parse_publish_time(publish_time: str) -> Tuple[float, ...]:
+    """
+    Parse a publish-time rule ("-"-separated numbers) into a tuple of floats, so the same rule is not converted again for every torrent.
+    """
+    return tuple(float(t) for t in publish_time.split("-"))
+
+
 class FilterModule(_ModuleBase):
     CONFIG_WATCH = {SystemConfigKey.CustomFilterRules.value}
 
@@ -86,6 +132,9 @@ class FilterModule(_ModuleBase):
         if not rule_groups:
             return torrent_list
         parser = RuleParser()
+        # 同一轮过滤里，相同的优先级层级会被多个种子反复使用；按需解析并缓存，
+        # 既减少 pyparsing 开销，也保留原来“命中高优先级后不解析低层级”的容错行为。
+        parsed_rule_cache: Dict[str, Union[list, str]] = {}
         # 查询规则表详情
         groups = self.rulehelper.get_rule_group_by_media(media=mediainfo, group_names=rule_groups)
         if groups:
@@ -97,21 +146,27 @@ class FilterModule(_ModuleBase):
                     torrent_list=torrent_list,
                     mediainfo=mediainfo,
                     parser=parser,
+                    parsed_rule_cache=parsed_rule_cache,
                 )
         return torrent_list
 
     def __filter_torrents(self, rule_string: str, rule_name: str,
                           torrent_list: List[TorrentInfo],
                           mediainfo: MediaInfo,
-                          parser: RuleParser) -> List[TorrentInfo]:
+                          parser: RuleParser,
+                          parsed_rule_cache: Dict[str, Union[list, str]]) -> List[TorrentInfo]:
         """
         过滤种子
         """
+        if not torrent_list:
+            return []
+        # 只拆分一次规则层级；具体层级仍延迟到真正需要匹配时解析。
+        rule_groups = [rule_group.strip() for rule_group in rule_string.split('>')]
         # 返回种子列表
         ret_torrents = []
         for torrent in torrent_list:
             # 能命中优先级的才返回
-            if not self.__get_order(torrent, rule_string, mediainfo, parser):
+            if not self.__get_order(torrent, rule_groups, mediainfo, parser, parsed_rule_cache):
                 logger.debug(f"种子 {torrent.site_name} - {torrent.title} {torrent.description or ''} "
                              f"不匹配 {rule_name} 过滤规则")
                 continue
@@ -119,13 +174,12 @@ class FilterModule(_ModuleBase):
 
         return ret_torrents
 
-    def __get_order(self, torrent: TorrentInfo, rule_str: str,
-                    mediainfo: MediaInfo, parser: RuleParser) -> Optional[TorrentInfo]:
+    def __get_order(self, torrent: TorrentInfo, rule_groups: List[str],
+                    mediainfo: MediaInfo, parser: RuleParser,
+                    parsed_rule_cache: Dict[str, Union[list, str]]) -> Optional[TorrentInfo]:
         """
         获取种子匹配的规则优先级，值越大越优先，未匹配时返回None
         """
-        # 多级规则
-        rule_groups = rule_str.split('>')
         # 优先级
         res_order = 100
         # 是否匹配
@@ -133,8 +187,8 @@ class FilterModule(_ModuleBase):
 
         for rule_group in rule_groups:
             # 解析规则组
-            parsed_group = parser.parse(rule_group.strip())
-            if self.__match_group(torrent, parsed_group.as_list()[0], mediainfo):
+            parsed_group = self.__parse_rule_group(rule_group, parser, parsed_rule_cache)
+            if self.__match_group(torrent, parsed_group, mediainfo):
                 # 出现匹配时中断
                 matched = True
                 logger.debug(f"种子 {torrent.site_name} - {torrent.title} 优先级为 {100 - res_order + 1}")
@@ -145,6 +199,17 @@ class FilterModule(_ModuleBase):
 
         return None if not matched else torrent
 
+    @staticmethod
+    def __parse_rule_group(rule_group: str, parser: RuleParser,
+                           parsed_rule_cache: Dict[str, Union[list, str]]) -> Union[list, str]:
+        """
+        Parse a single priority level, storing the result in parsed_rule_cache on first use.
+        The cache is keyed by the level expression, so several rule groups that reuse the same expression share one parse result.
+        """
+        if rule_group not in parsed_rule_cache:
+            parsed_rule_cache[rule_group] = parser.parse(rule_group).as_list()[0]  # parsed only on a cache miss
+        return parsed_rule_cache[rule_group]
+
     def __match_group(self, torrent: TorrentInfo, rule_group: Union[list, str],
                       mediainfo: MediaInfo) -> Optional[bool]:
         """
@@ -173,12 +238,13 @@ class FilterModule(_ModuleBase):
         """
         判断种子是否匹配规则项
         """
-        if not self.rule_set.get(rule_name):
+        rule = self.rule_set.get(rule_name)
+        if not rule:
             # 规则不存在
             logger.debug(f"规则 {rule_name} 不存在")
             return False
         # TMDB规则
-        tmdb = self.rule_set[rule_name].get("tmdb")
+        tmdb = rule.get("tmdb")
         # 符合TMDB规则的直接返回True，即不过滤
         if tmdb and self.__match_tmdb(tmdb, mediainfo):
             logger.debug(f"种子 {torrent.site_name} - {torrent.title} 符合 {rule_name} 的TMDB规则，匹配成功")
@@ -187,7 +253,7 @@ class FilterModule(_ModuleBase):
         content = f"{torrent.title} {torrent.description} {' '.join(torrent.labels or [])}"
         # 只匹配指定关键字
         match_content = []
-        matchs = self.rule_set[rule_name].get("match") or []
+        matchs = rule.get("match") or []
         if matchs:
             for match in matchs:
                 if not hasattr(torrent, match):
@@ -202,27 +268,27 @@ class FilterModule(_ModuleBase):
         if match_content:
             content = " ".join(match_content)
         # 包含规则项
-        includes = self.rule_set[rule_name].get("include") or []
+        includes = rule.get("include") or []
         if not isinstance(includes, list):
             includes = [includes]
         # 排除规则项
-        excludes = self.rule_set[rule_name].get("exclude") or []
+        excludes = rule.get("exclude") or []
         if not isinstance(excludes, list):
             excludes = [excludes]
         # 大小范围规则项
-        size_range = self.rule_set[rule_name].get("size_range")
+        size_range = rule.get("size_range")
         # 做种人数规则项
-        seeders = self.rule_set[rule_name].get("seeders")
+        seeders = rule.get("seeders")
         # FREE规则
-        downloadvolumefactor = self.rule_set[rule_name].get("downloadvolumefactor")
+        downloadvolumefactor = rule.get("downloadvolumefactor")
         # 发布时间规则
-        pubdate: str = self.rule_set[rule_name].get("publish_time")
-        if includes and not any(re.search(r"%s" % include, content, re.IGNORECASE) for include in includes):
+        pubdate: str = rule.get("publish_time")
+        if includes and not any(_regex_search(include, content) for include in includes):
             # 未发现任何包含项
             logger.debug(f"种子 {torrent.site_name} - {torrent.title} 不包含任何项 {includes}")
             return False
         for exclude in excludes:
-            if re.search(r"%s" % exclude, content, re.IGNORECASE):
+            if _regex_search(exclude, content):
                 # 发现排除项
                 logger.debug(f"种子 {torrent.site_name} - {torrent.title} 包含 {exclude}")
                 return False
@@ -247,7 +313,7 @@ class FilterModule(_ModuleBase):
             # 种子发布时间
             pub_minutes = torrent.pub_minutes()
             # 发布时间规则
-            pub_times = [float(t) for t in pubdate.split("-")]
+            pub_times = _parse_publish_time(pubdate)
             if len(pub_times) == 1:
                 # 发布时间小于规则
                 if pub_minutes < pub_times[0]:
@@ -319,22 +385,17 @@ class FilterModule(_ModuleBase):
         # 每集大小
         torrent_size = torrent.size / episode_count
         # 大小范围
-        size_range = size_range.strip()
-        if size_range.find("-") != -1:
+        size_rule, size_min, size_max = _parse_size_range(size_range)
+        if size_rule == "between":
             # 区间
-            size_min, size_max = size_range.split("-")
-            size_min = float(size_min.strip()) * 1024 * 1024
-            size_max = float(size_max.strip()) * 1024 * 1024
             if size_min <= torrent_size <= size_max:
                 return True
-        elif size_range.startswith(">"):
+        elif size_rule == "gte":
             # 大于
-            size_min = float(size_range[1:].strip()) * 1024 * 1024
             if torrent_size >= size_min:
                 return True
-        elif size_range.startswith("<"):
+        elif size_rule == "lte":
             # 小于
-            size_max = float(size_range[1:].strip()) * 1024 * 1024
             if torrent_size <= size_max:
                 return True
         return False
diff --git a/tests/test_torrent_filter.py b/tests/test_torrent_filter.py
new file mode 100644
index 00000000..a7926bc7
--- /dev/null
+++ b/tests/test_torrent_filter.py
@@ -0,0 +1,104 @@
+import unittest
+from types import SimpleNamespace
+
+from app.core.context import TorrentInfo
+from app.helper.torrent import TorrentHelper
+from app.modules.filter import FilterModule
+
+
+class _RuleHelper:
+    """
+    Lightweight rule store for the filter-module tests, avoiding a dependency on real system configuration.
+    """
+
+    def __init__(self, groups):
+        self._groups = groups
+
+    def get_rule_group_by_media(self, media=None, group_names=None):  # noqa: ARG002
+        if not group_names:
+            return self._groups  # no name filter: return every stored group
+        return [group for group in self._groups if group.name in group_names]
+
+
+def _build_filter_module(rule_string: str, rule_set: dict) -> FilterModule:  # test helper: FilterModule wired to in-memory rules
+    module = FilterModule()
+    module.rulehelper = _RuleHelper(  # swap in the in-memory rule store
+        [SimpleNamespace(name="test", rule_string=rule_string)]  # a single rule group named "test"
+    )
+    module.rule_set = rule_set  # rule definitions, looked up by rule name
+    return module
+
+
+class TorrentFilterTest(unittest.TestCase):  # regression tests for FilterModule.filter_torrents and TorrentHelper.filter_torrent
+
+    def test_filter_torrents_keeps_priority_and_boolean_rule_semantics(self):
+        module = _build_filter_module(
+            rule_string="HDR & !BLU > DV",
+            rule_set={
+                "HDR": {"include": "HDR"},
+                "DV": {"include": "DOVI"},
+                "BLU": {"include": "BluRay"},
+            },
+        )
+        torrents = [
+            TorrentInfo(title="Movie HDR WEB-DL", description=""),
+            TorrentInfo(title="Movie DOVI", description=""),
+            TorrentInfo(title="Movie HDR BluRay", description=""),
+        ]
+
+        filtered = module.filter_torrents(rule_groups=["test"], torrent_list=torrents)
+
+        self.assertEqual(torrents[:2], filtered)  # the HDR BluRay torrent matches neither level and is dropped
+        self.assertEqual(100, filtered[0].pri_order)  # matched the first level "HDR & !BLU"
+        self.assertEqual(99, filtered[1].pri_order)  # matched the second level "DV"
+
+    def test_filter_torrents_keeps_lazy_priority_level_parsing(self):
+        module = _build_filter_module(
+            rule_string="KEEP > (",  # second level "(" is presumably unparsable; it should never be reached
+            rule_set={"KEEP": {"include": "Movie"}},
+        )
+        torrent = TorrentInfo(title="Movie", description="")
+
+        filtered = module.filter_torrents(rule_groups=["test"], torrent_list=[torrent])
+
+        self.assertEqual([torrent], filtered)
+        self.assertEqual(100, torrent.pri_order)
+
+    def test_filter_torrent_keeps_extra_filter_semantics(self):
+        torrent = TorrentInfo(
+            title="Movie 1080p HDR",
+            description="中字",
+            labels=["free"],
+            size=3 * 1024 * 1024 * 1024,
+            uploadvolumefactor=1,
+            downloadvolumefactor=0,
+        )
+
+        self.assertTrue(
+            TorrentHelper.filter_torrent(
+                torrent_info=torrent,
+                filter_params={
+                    "include": "中字|free",
+                    "exclude": "BluRay",
+                    "resolution": "1080p",
+                    "effect": "HDR",
+                    "size": "1000-4000",  # the torrent is 3072 MB, inside this range
+                },
+            )
+        )
+        self.assertFalse(
+            TorrentHelper.filter_torrent(
+                torrent_info=torrent,
+                filter_params={"exclude": "HDR"},  # the title contains HDR, so the exclude rule rejects it
+            )
+        )
+        self.assertFalse(
+            TorrentHelper.filter_torrent(
+                torrent_info=torrent,
+                filter_params={"size": "<1000"},  # 3072 MB exceeds the 1000 MB cap
+            )
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()