archived-MoviePilot/tests/test_rust_accel.py

from datetime import datetime, timezone
from types import SimpleNamespace

import pytest

from app.helper import rss as rss_module
from app.helper.rss import RssHelper
from app.modules.indexer.spider import SiteSpider
from app.schemas.types import MediaType
from app.utils import rust_accel


# Module-wide marker: every test in this file is skipped when
# rust_accel.is_available() reports that the extension is not usable.
# (The reason string reads "moviepilot_rust extension is not installed".)
pytestmark = pytest.mark.skipif(not rust_accel.is_available(), reason="moviepilot_rust 扩展未安装")


def test_rust_filter_rule_parser_matches_boolean_semantics():
    """
    The Rust filter-rule parser should keep the boolean expression structure of pyparsing.
    """
    # "!" binds to BLU first, then "&" joins the two operands.
    expected_tree = [["HDR", "and", ["not", "BLU"]]]

    assert rust_accel.parse_filter_rule("HDR & !BLU") == expected_tree


def test_rust_filter_rule_parser_handles_parentheses_and_or():
    """
    The Rust filter-rule parser should keep the precedence semantics of parentheses, AND and OR.
    """
    rule = "CNSUB & (4K | 1080P) & !BLU"
    # The expected tree nests the first "and" inside the second one.
    left_operand = ["CNSUB", "and", ["4K", "or", "1080P"]]
    expected_tree = [[left_operand, "and", ["not", "BLU"]]]

    assert rust_accel.parse_filter_rule(rule) == expected_tree


def test_rust_rss_parser_extracts_rss_and_atom_items():
    """
    The Rust RSS parser should cover RSS items, Atom entries, namespaces and date fields.
    """
    feed_xml = """
    <root xmlns:dc="http://purl.org/dc/elements/1.1/">
      <rss>
        <channel>
          <item>
            <title>Movie &amp; Show</title>
            <description><![CDATA[Desc <b>bold</b>]]></description>
            <link>https://example.com/details/1</link>
            <enclosure url="https://example.com/download/1.torrent" length="123456" />
            <pubDate>Tue, 19 May 2026 08:30:00 GMT</pubDate>
            <dc:creator>豆瓣用户</dc:creator>
          </item>
        </channel>
      </rss>
      <feed>
        <entry>
          <title>Atom Title</title>
          <summary>Atom Summary</summary>
          <link href="https://example.com/atom/2" />
          <updated>2026-05-19T09:30:00Z</updated>
        </entry>
      </feed>
    </root>
    """
    # Publication times are compared as whole-second UTC timestamps.
    rss_epoch = int(datetime(2026, 5, 19, 8, 30, tzinfo=timezone.utc).timestamp())
    atom_epoch = int(datetime(2026, 5, 19, 9, 30, tzinfo=timezone.utc).timestamp())

    items = rust_accel.parse_rss_items(feed_xml, max_items=100)

    assert len(items) == 2

    # First result: the RSS <item>.
    rss_item = items[0]
    assert rss_item["title"] == "Movie & Show"
    assert rss_item["description"] == "Desc <b>bold</b>"
    assert rss_item["link"] == "https://example.com/details/1"
    assert rss_item["enclosure"] == "https://example.com/download/1.torrent"
    assert rss_item["size"] == 123456
    assert rss_item["nickname"] == "豆瓣用户"
    assert int(rss_item["pubdate"].timestamp()) == rss_epoch

    # Second result: the Atom <entry>; its enclosure is expected to equal the link href.
    atom_entry = items[1]
    assert atom_entry["title"] == "Atom Title"
    assert atom_entry["description"] == "Atom Summary"
    assert atom_entry["link"] == "https://example.com/atom/2"
    assert atom_entry["enclosure"] == "https://example.com/atom/2"
    assert int(atom_entry["pubdate"].timestamp()) == atom_epoch


def test_rust_rss_parser_skips_incomplete_items():
    """
    The Rust RSS parser should keep the original logic and skip items without a title or a link.
    """
    feed_xml = """
    <rss>
      <channel>
        <item><title></title><link>https://example.com/a</link></item>
        <item><title>No Link</title></item>
        <item><title>OK</title><link>https://example.com/ok</link></item>
      </channel>
    </rss>
    """
    # Only the third item has both a title and a link.
    surviving_item = {
        "title": "OK",
        "enclosure": "https://example.com/ok",
        "size": 0,
        "description": "",
        "link": "https://example.com/ok",
        "pubdate": "",
    }

    assert rust_accel.parse_rss_items(feed_xml, max_items=100) == [surviving_item]


def test_rss_helper_parse_uses_rust_parser(monkeypatch):
    """
    RssHelper.parse should use the Rust parsing result directly after request and encoding handling.
    """
    feed_xml = """
    <rss>
      <channel>
        <item>
          <title>Helper Title</title>
          <description>Helper Description</description>
          <link>https://example.com/details/3</link>
          <pubDate>2026-05-19T10:30:00Z</pubDate>
        </item>
      </channel>
    </rss>
    """
    expected_epoch = int(datetime(2026, 5, 19, 10, 30, tzinfo=timezone.utc).timestamp())

    class StubRequestUtils:
        """
        Stand-in for RequestUtils used by this test so no real network request is made.
        """

        def __init__(self, **_kwargs):
            """
            Accept and ignore any constructor keywords so RssHelper can instantiate it as usual.
            """

        def get_res(self, _url):
            """
            Return a minimal response object exposing content/text/status_code and encodings.
            """
            canned_response = SimpleNamespace(
                status_code=200,
                text=feed_xml,
                content=feed_xml.encode("utf-8"),
                encoding="utf-8",
                apparent_encoding="utf-8",
            )
            return canned_response

    # Swap the request helper referenced by the rss module for the stub above.
    monkeypatch.setattr(rss_module, "RequestUtils", StubRequestUtils)

    items = RssHelper().parse("https://example.com/rss")

    assert len(items) == 1
    only_item = items[0]
    assert only_item["title"] == "Helper Title"
    assert only_item["enclosure"] == "https://example.com/details/3"
    assert int(only_item["pubdate"].timestamp()) == expected_epoch


def test_rust_indexer_parser_handles_jinja_pyquery_filters_and_links():
    """
    The Rust indexer parser should cover the Jinja templates, PyQuery selectors and filters
    used by an ordinary site configuration.
    """
    page = """
    <table class="torrents">
      <tr>
        <td><a href="?cat=402">TV</a></td>
        <td>
          <table class="torrentname">
            <tr>
              <td class="embedded">
                <a href="details.php?id=100" title="Optional.Title">Default.Title</a>
                <a href="download.php?id=100">DL</a>
                <a href="https://www.imdb.com/title/tt1234567/">IMDb</a>
                <font class="subtitle">Main description <span>remove</span><a>link</a></font>
                <span class="label">FREE</span>
                <img class="hitandrun" />
              </td>
            </tr>
          </table>
        </td>
        <td></td>
        <td><span title="2025-05-01 12:13:14">1 hour ago</span></td>
        <td>1.5 GB</td>
        <td>1,234</td>
        <td>5/7</td>
        <td>9</td>
      </tr>
    </table>
    """

    # Jinja template: use the title attribute when present, else the link text.
    title_template = (
        "{% if fields['title_optional'] %}{{ fields['title_optional'] }}{% else %}"
        "{{ fields['title_default'] }}{% endif %}"
    )
    # Jinja template: prefer the absolute date over the elapsed text, "now" if neither exists.
    date_template = (
        "{% if fields['date_elapsed'] or fields['date_added'] %}"
        "{{ fields['date_added'] if fields['date_added'] else fields['date_elapsed'] }}"
        "{% else %}now{% endif %}"
    )

    field_rules = {
        "title_default": {"selector": 'a[href*="details.php?id="]'},
        "title_optional": {
            "selector": 'a[title][href*="details.php?id="]',
            "attribute": "title",
        },
        "title": {"text": title_template},
        "details": {"selector": 'a[href*="details.php?id="]', "attribute": "href"},
        "download": {"selector": 'a[href*="download.php?id="]', "attribute": "href"},
        "imdbid": {
            "selector": 'a[href*="imdb.com/title/tt"]',
            "attribute": "href",
            "filters": [{"name": "re_search", "args": ["tt\\d+", 0]}],
        },
        "date_elapsed": {"selector": "td:nth-child(4) > span"},
        "date_added": {"selector": "td:nth-child(4) > span", "attribute": "title"},
        "date": {
            "text": date_template,
            "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}],
        },
        "size": {"selector": "td:nth-child(5)"},
        "seeders": {"selector": "td:nth-child(6)"},
        "leechers": {"selector": "td:nth-child(7)"},
        "grabs": {"selector": "td:nth-child(8)"},
        "downloadvolumefactor": {"case": {"img.free": 0, "*": 1}},
        "uploadvolumefactor": {"case": {"*": 1}},
        "description": {
            "selector": "font.subtitle",
            "remove": "span,a",
        },
        "labels": {"selector": "span.label"},
        "hr": {"selector": "img.hitandrun"},
        "category": {
            "selector": 'a[href*="?cat="]',
            "attribute": "href",
            "filters": [{"name": "querystring", "args": "cat"}],
        },
    }

    site_config = {
        "id": "unit",
        "name": "Unit",
        "domain": "https://example.com/",
        "search": {"paths": [{"path": "torrents.php"}]},
        "category": {
            "movie": [{"id": "401"}],
            "tv": [{"id": "402"}],
        },
        "torrents": {
            "list": {"selector": 'table.torrents > tr:has("table.torrentname")'},
            "fields": field_rules,
        },
    }

    # The single outer row is expected to produce exactly this torrent record.
    expected_torrent = {
        "page_url": "https://example.com/details.php?id=100",
        "enclosure": "https://example.com/download.php?id=100",
        "downloadvolumefactor": 1.0,
        "uploadvolumefactor": 1.0,
        "pubdate": "2025-05-01 12:13:14",
        "title": "Optional.Title",
        "description": "Main description",
        "imdbid": "tt1234567",
        "size": 1610612736,
        "peers": 5,
        "seeders": 1234,
        "grabs": 9,
        "date_elapsed": "1 hour ago",
        "labels": ["FREE"],
        "hit_and_run": True,
        "category": "电视剧",
    }

    spider = SiteSpider(site_config, mtype=MediaType.TV)

    assert spider.parse(page) == [expected_torrent]


def test_rust_indexer_parser_handles_default_values_and_template_arithmetic():
    """
    The Rust indexer parser should support the `defualt_value` key (spelling kept exactly as the
    configuration under test uses it), the Jinja int filter and template arithmetic expressions.
    """
    page = """
    <table class="torrents">
      <tr>
        <td><a href="details.php?id=200">Default.Title</a></td>
      </tr>
    </table>
    """
    # "span.missing" matches nothing in the page, so "missing_days" relies on its default "2".
    field_rules = {
        "title_default": {"selector": 'a[href*="details.php?id="]'},
        "missing_days": {"defualt_value": "2", "selector": "span.missing"},
        "title": {"text": "{{ fields['title_default'] }} {{ (fields['missing_days']|int)*86400 }}"},
    }
    # 2 * 86400 == 172800
    expected_rows = [{"title": "Default.Title 172800"}]

    parsed_rows = rust_accel.parse_indexer_torrents(
        domain="https://example.com/",
        html_text=page,
        list_config={"selector": "table.torrents > tr"},
        fields=field_rules,
        category=None,
        result_num=100,
    )

    assert parsed_rows == expected_rows


def test_rust_indexer_parser_handles_lstrip_and_english_elapsed_date():
    """
    The Rust indexer parser should cover the lstrip and date_en_elapsed_parse filters
    used by the IPT configuration.
    """
    page = """
    <table id="torrents">
      <tr>
        <td><a href="/t/123">Title</a><a href="/download.php/123">download</a></td>
        <td><div>Uploaded | 2 hours ago</div></td>
      </tr>
    </table>
    """
    # Filter chain for the date cell: split on "|" with index 1, then parse the elapsed text.
    date_filters = [
        {"name": "split", "args": ["|", 1]},
        {"name": "date_en_elapsed_parse"},
    ]
    field_rules = {
        "title": {"selector": 'a[href*="/t/"]'},
        "download": {
            "selector": 'a[href*="/download.php/"]',
            "attribute": "href",
            "filters": [{"name": "lstrip", "args": ["/"]}],
        },
        "date": {
            "selector": "td:nth-child(2) > div",
            "filters": date_filters,
        },
    }

    parsed_rows = rust_accel.parse_indexer_torrents(
        domain="https://iptorrents.com/",
        html_text=page,
        list_config={"selector": 'table[id="torrents"] tr'},
        fields=field_rules,
        category=None,
        result_num=100,
    )

    assert len(parsed_rows) == 1
    torrent = parsed_rows[0]
    assert torrent["title"] == "Title"
    assert torrent["enclosure"] == "https://iptorrents.com/download.php/123"
    # The exact publication time depends on when the test runs, so only require a truthy value.
    assert torrent["pubdate"]


def test_rust_indexer_parser_prefers_date_added_when_date_template_returns_elapsed_text():
    """
    When the date template yields relative (elapsed) text, the Rust indexer parser should
    use the standard timestamp held in date_added instead.
    """
    html = """
    <table class="torrents">
      <tr>
        <td><span title="2025-06-02 03:04:05">1 hour ago</span></td>
      </tr>
    </table>
    """
    fields = {
        "date_elapsed": {"selector": "span"},
        "date_added": {"selector": "span", "attribute": "title"},
        "date": {
            # Unlike the usual site template, this one prefers date_elapsed, so it renders
            # "1 hour ago", which does not match the dateparse format below.
            "text": "{% if fields['date_elapsed'] or fields['date_added'] %}"
                    "{{ fields['date_elapsed'] if fields['date_elapsed'] else fields['date_added'] }}"
                    "{% else %}now{% endif %}",
            "filters": [{"name": "dateparse", "args": "%Y-%m-%d %H:%M:%S"}],
        },
    }

    result = rust_accel.parse_indexer_torrents(
        html_text=html,
        domain="https://example.com/",
        list_config={"selector": "table.torrents > tr"},
        fields=fields,
        category=None,
        result_num=100,
    )

    # Check the row count first so an empty parse fails as an assertion, not as an IndexError.
    assert len(result) == 1
    # The expected value is the span's title attribute, i.e. the date_added field.
    assert result[0]["pubdate"] == "2025-06-02 03:04:05"