From fcf6e14ac9236ca5808a082d88705ab1db1d7beb Mon Sep 17 00:00:00 2001 From: jxxghp Date: Wed, 13 May 2026 17:57:25 +0800 Subject: [PATCH] fix: filter invalid subtitle action links --- app/modules/subtitle/__init__.py | 94 ++++++++++++++++++++++++-------- tests/test_subtitle_links.py | 45 +++++++++++++++ 2 files changed, 116 insertions(+), 23 deletions(-) create mode 100644 tests/test_subtitle_links.py diff --git a/app/modules/subtitle/__init__.py b/app/modules/subtitle/__init__.py index 10e05f9c..d305ccf3 100644 --- a/app/modules/subtitle/__init__.py +++ b/app/modules/subtitle/__init__.py @@ -1,7 +1,9 @@ +import re import shutil import time from pathlib import Path -from typing import Tuple, Union +from typing import List, Optional, Tuple, Union +from urllib.parse import urljoin, urlparse from lxml import etree @@ -18,7 +20,6 @@ from app.schemas import TorrentInfo from app.schemas.file import FileURI from app.schemas.types import ModuleType, OtherModulesType from app.utils.http import RequestUtils -from app.utils.string import StringUtils from app.utils.system import SystemUtils @@ -27,12 +28,24 @@ class SubtitleModule(_ModuleBase): 字幕下载模块 """ - # 站点详情页字幕下载链接识别XPATH + # 站点详情页字幕下载元素识别XPATH _SITE_SUBTITLE_XPATH = [ - '//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a[not(@class)]/@href', - '//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a/@href', - '//div[contains(@class, "font-bold")][text()="字幕"]/following-sibling::div[1]//a[not(@class)]/@href', # 憨憨 + '//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a[not(@class)]', + '//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a', + '//div[contains(@class, "font-bold")][text()="字幕"]/following-sibling::div[1]//a[not(@class)]', # 憨憨 ] + _SUBTITLE_URL_ATTRS = ( + "href", + "data-url", + "data-href", + "data-link", + "data-download", + "data-download-url", + ) + _SCRIPT_URL_RE = re.compile( + r"""["'](?P(?:https?:)?//[^"']+|/[^"']+|[^"']*(?:download|subtitle|subs?)[^"']*)["']""", + re.IGNORECASE, + ) def init_module(self) -> None: pass @@ -71,6 +84,57 @@ class SubtitleModule(_ModuleBase): def test(self): pass + @classmethod + def __normalize_subtitle_link(cls, page_url: str, sublink: str) -> Optional[str]: + """ + 转换并过滤真实字幕下载链接 + """ + if not sublink: + return None + sublink = sublink.strip() + if not sublink or sublink.startswith("#"): + return None + parsed = urlparse(sublink) + if parsed.scheme and parsed.scheme not in ("http", "https"): + return None + if sublink.startswith("//"): + page_scheme = urlparse(page_url).scheme or "https" + sublink = f"{page_scheme}:{sublink}" + else: + sublink = urljoin(page_url, sublink) + parsed = urlparse(sublink) + if parsed.scheme not in ("http", "https") or not parsed.netloc: + return None + return sublink + + @classmethod + def _parse_subtitle_links(cls, html, page_url: str) -> List[str]: + """ + 从站点详情页中解析字幕下载链接 + """ + sublink_list = [] + found_links = set() + for xpath in cls._SITE_SUBTITLE_XPATH: + sublink_count = len(sublink_list) + sublink_nodes = html.xpath(xpath) + if sublink_nodes: + for sublink_node in sublink_nodes: + sublinks = [sublink_node.get(attr) for attr in cls._SUBTITLE_URL_ATTRS] + sublinks.extend( + match.group("url") + for match in cls._SCRIPT_URL_RE.finditer(sublink_node.get("onclick") or "") + ) + for sublink in sublinks: + sublink = cls.__normalize_subtitle_link(page_url, sublink) + if not sublink or sublink in found_links: + continue + found_links.add(sublink) + sublink_list.append(sublink) + # 已成功匹配字幕区域,后续xpath可以忽略 + if len(sublink_list) > sublink_count: + break + return sublink_list + def _get_subtitle_links(self, torrent: TorrentInfo): """ 获取字幕链接 @@ -97,23 +161,7 @@ class SubtitleModule(_ModuleBase): return [] html = etree.HTML(res.text) try: - sublink_list = [] - for xpath in self._SITE_SUBTITLE_XPATH: - sublinks = html.xpath(xpath) - if sublinks: - for sublink in sublinks: - if not sublink: - continue - if not sublink.startswith("http"): - base_url = StringUtils.get_base_url(torrent.page_url) - if sublink.startswith("/"): - sublink = "%s%s" % (base_url, sublink) - else: - sublink = "%s/%s" % (base_url, sublink) - sublink_list.append(sublink) - # 已成功获取了链接,后续xpath可以忽略 - break - return sublink_list + return self._parse_subtitle_links(html, torrent.page_url) finally: if html is not None: del html diff --git a/tests/test_subtitle_links.py b/tests/test_subtitle_links.py new file mode 100644 index 00000000..745c3eac --- /dev/null +++ b/tests/test_subtitle_links.py @@ -0,0 +1,45 @@ +from lxml import etree + +from app.modules.subtitle import SubtitleModule + + +def test_parse_subtitle_links_filters_detail_page_action_links(): + html_text = """ + + 字幕 + + + + + + + + +
+ 简体中文 + + + [雪飘+白恋] 光之美少女 All Stars NewStage:未来的朋友 + +
+
+ 上传字幕 +
+
+ 搜索射手网 +
+
+ 搜索Opensubtitles +
+ + + """ + html = etree.HTML(html_text) + + links = SubtitleModule._parse_subtitle_links( + html, "https://audiences.me/details.php?id=599860&hit=1" + ) + + assert links == [ + "https://audiences.me/downloadsubs.php?torrentid=621182&subid=2148" + ]