feat: support indexer templates in Rust parser

2026-05-24 07:26:50 +00:00 · 2026-05-22 23:37:54 +08:00
parent f7b78721c3
commit cde267c55f
2 changed files with 517 additions and 33 deletions
--- a/rust/moviepilot_rust/src/indexer.rs
+++ b/rust/moviepilot_rust/src/indexer.rs
@@ -34,6 +34,12 @@ static FILESIZE_UNIT_RE: Lazy<Regex> = Lazy::new(|| {
        .unwrap()
 });
 static NUMERIC_FACTOR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap());
+// Matches an expression that is exactly `fields.name`, `fields['name']` or `fields["name"]`.
+// The field name is capture group 1 for the dotted form and group 2 for the bracket form.
+static FIELD_EXPR_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"^fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])$"#).unwrap()
+});
+// Matches one `{{ ... }}` output expression; a `-` next to either delimiter is accepted.
+// Capture group 1 is the expression without its surrounding whitespace.
+static JINJA_EXPR_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"\{\{-?\s*(.*?)\s*-?\}\}"#).unwrap());
+// Matches one `{% ... %}` statement tag; a `-` next to either delimiter is accepted.
+// Capture group 1 is the tag content without its surrounding whitespace.
+static JINJA_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{%-?\s*(.*?)\s*-?%\}"#).unwrap());

 enum RowParseResult {
    Unsupported,
@@ -388,13 +394,16 @@ fn parse_title(
            } else {
                String::new()
            };
-        Some(render_known_template(
+        let Some(rendered) = render_known_template(
            &template,
            &[
                ("title_default", title_default.as_str()),
                ("title_optional", title_optional.as_str()),
            ],
-        ))
+        ) else {
+            return Ok(false);
+        };
+        Some(rendered)
    } else {
        None
    };
@@ -434,7 +443,10 @@ fn parse_description(
            .iter()
            .map(|(key, value)| (key.as_str(), value.as_str()))
            .collect();
-        Some(render_known_template(&template, &refs))
+        let Some(rendered) = render_known_template(&template, &refs) else {
+            return Ok(false);
+        };
+        Some(rendered)
    } else {
        None
    };
@@ -590,9 +602,6 @@ fn parse_labels_field(
        output.set_item("labels", PyList::empty(py))?;
        return Ok(true);
    }
-    if selector.contains("remove")? {
-        return Ok(false);
-    }
    let Some(values) = query_all_values(row, &selector)? else {
        output.set_item("labels", PyList::empty(py))?;
        return Ok(true);
@@ -673,9 +682,6 @@ fn safe_query(
    row: ElementRef<'_>,
    selector_config: &Bound<'_, PyDict>,
 ) -> PyResult<Option<String>> {
-    if selector_config.contains("remove")? {
-        return Ok(None);
-    }
    let Some(values) = query_all_values(row, selector_config)? else {
        return Ok(None);
    };
@@ -694,17 +700,37 @@ fn query_all_values(
        return Ok(None);
    };
    let attribute = get_optional_string(selector_config, "attribute")?;
+    let remove_selectors = parse_remove_selectors(selector_config)?;
    let mut values = Vec::new();
    for element in row.select(&selector) {
        if let Some(attribute) = attribute.as_deref() {
            values.push(element.value().attr(attribute).unwrap_or("").to_string());
        } else {
-            values.push(normalize_element_text(element));
+            values.push(normalize_element_text(element, &remove_selectors));
        }
    }
    Ok(Some(values))
 }

+/// Parses the `remove` option, a comma-separated list of CSS selectors.
+///
+/// Returns an empty list when the option is absent. Blank entries are skipped.
+/// If any entry is not a valid selector, the whole list is discarded and an
+/// empty list is returned.
+fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<Selector>> {
+    let Some(remove_text) = get_optional_string(selector_config, "remove")? else {
+        return Ok(Vec::new());
+    };
+    // Collecting into `Option` stops at the first entry that fails to parse.
+    let parsed: Option<Vec<Selector>> = remove_text
+        .split(',')
+        .map(str::trim)
+        .filter(|entry| !entry.is_empty())
+        .map(|entry| Selector::parse(entry).ok())
+        .collect();
+    Ok(parsed.unwrap_or_default())
+}
+
+
 /// 读取 selector 或 selectors 配置。
 fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
    if let Some(selector) = get_optional_string(selector_config, "selector")? {
@@ -729,17 +755,27 @@ fn select_indexed_value(
        return None;
    }
    if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") {
-        let index = contents as usize;
        if let Some(first) = values.first() {
-            return first.split('\n').nth(index).map(|item| item.to_string());
+            let lines: Vec<&str> = first.split('\n').collect();
+            return pick_indexed_item(&lines, contents).map(|item| item.to_string());
        }
    }
    if let Ok(Some(index)) = get_optional_i64(selector_config, "index") {
-        return values.get(index as usize).cloned();
+        return pick_indexed_item(&values, index).cloned();
    }
    values.first().cloned()
 }

+/// Looks up `index` in `items` using Python list indexing rules.
+///
+/// A non-negative `index` counts from the start; a negative one counts back
+/// from the end, so `-1` is the last item. Returns `None` when the resolved
+/// position falls outside `items`.
+fn pick_indexed_item<T>(items: &[T], index: i64) -> Option<&T> {
+    let resolved = if index < 0 {
+        i64::try_from(items.len()).ok()?.checked_add(index)?
+    } else {
+        index
+    };
+    // `try_from` rejects positions that are still negative, and positions too
+    // large for `usize`, which an `as` cast would silently wrap.
+    let position = usize::try_from(resolved).ok()?;
+    items.get(position)
+}
+
+
 /// 应用字段配置中的 filters。
 fn apply_selector_filters(
    py: Python<'_>,
@@ -760,13 +796,48 @@ fn apply_selector_filters(
 }

 /// 规范化元素文本，尽量接近 PyQuery.text() 输出。
-fn normalize_element_text(element: ElementRef<'_>) -> String {
-    element
-        .text()
-        .map(str::trim)
-        .filter(|item| !item.is_empty())
-        .collect::<Vec<&str>>()
-        .join(" ")
+///
+/// Concatenates the text nodes below `element` in document order, leaving out
+/// any text that sits inside a descendant matched by `remove_selectors`, then
+/// collapses whitespace in the combined text.
+fn normalize_element_text(element: ElementRef<'_>, remove_selectors: &[Selector]) -> String {
+    // Pair every text node with the element that directly contains it.
+    let text_nodes = element.descendants().filter_map(|node| {
+        let text = node.value().as_text()?;
+        Some((node.parent().and_then(ElementRef::wrap), text))
+    });
+    let mut raw_text = String::new();
+    for (owner, text) in text_nodes {
+        if !should_skip_text_node(owner, element, remove_selectors) {
+            raw_text.push_str(text);
+        }
+    }
+    normalize_whitespace(&raw_text)
+}
+
+
+/// Collapses every run of whitespace into a single space and trims both ends.
+///
+/// Used when approximating `PyQuery.text()`: adjacent text nodes are joined
+/// directly by the caller, and only whitespace already in the text separates words.
+fn normalize_whitespace(value: &str) -> String {
+    let mut collapsed = String::with_capacity(value.len());
+    for word in value.split_whitespace() {
+        if !collapsed.is_empty() {
+            collapsed.push(' ');
+        }
+        collapsed.push_str(word);
+    }
+    collapsed
+}
+
+/// Reports whether a text node sits inside a subtree selected for removal.
+///
+/// `parent` is the element directly containing the text node, if any. The
+/// walk climbs through the ancestors and stops on reaching `root`, so `root`
+/// itself is never tested against `remove_selectors`.
+fn should_skip_text_node(
+    parent: Option<ElementRef<'_>>,
+    root: ElementRef<'_>,
+    remove_selectors: &[Selector],
+) -> bool {
+    let marked_for_removal = |candidate: &ElementRef<'_>| {
+        remove_selectors
+            .iter()
+            .any(|selector| selector.matches(candidate))
+    };
+    std::iter::successors(parent, |current| {
+        current.parent().and_then(ElementRef::wrap)
+    })
+    .take_while(|ancestor| *ancestor != root)
+    .any(|ancestor| marked_for_removal(&ancestor))
 }

 /// 判断 row 内是否存在指定 selector。
@@ -806,21 +877,285 @@ fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> Str
 }

 /// 渲染常见的 Jinja 字段模板，不支持复杂表达式时由调用方回退 Python。
-fn render_known_template(template: &str, values: &[(&str, &str)]) -> String {
-    let mut rendered = template.to_string();
-    for (key, value) in values {
-        for pattern in [
-            format!("{{{{ fields.{key} }}}}"),
-            format!("{{{{fields.{key}}}}}"),
-            format!("{{{{ fields['{key}'] }}}}"),
-            format!("{{{{fields['{key}']}}}}"),
-            format!("{{{{ fields[\"{key}\"] }}}}"),
-            format!("{{{{fields[\"{key}\"]}}}}"),
-        ] {
-            rendered = rendered.replace(&pattern, value);
+///
+/// Renders the supported subset of Jinja: `if`/`elif`/`else`/`endif` blocks
+/// and `{{ ... }}` field expressions.
+///
+/// * `template` - template text taken from the field configuration.
+/// * `values` - `(field name, field value)` pairs exposed as `fields.<name>`.
+///
+/// Returns `None` when the template uses anything outside that subset, so the
+/// caller can fall back to the Python renderer.
+fn render_known_template(template: &str, values: &[(&str, &str)]) -> Option<String> {
+    // `{# ... #}` comments are not implemented. Neither is the `-` whitespace-control
+    // modifier: Jinja strips the whitespace next to such a delimiter, and emitting
+    // that whitespace unchanged would produce different text than Python does.
+    const UNSUPPORTED_MARKERS: [&str; 5] = ["{#", "{%-", "-%}", "{{-", "-}}"];
+    if UNSUPPORTED_MARKERS
+        .iter()
+        .any(|marker| template.contains(marker))
+    {
+        return None;
+    }
+    let rendered = render_jinja_blocks(template, values)?;
+    render_field_vars(&rendered, values)
+}
+
+
+/// Expands every top-level `{% if %} ... {% endif %}` block in `template`.
+///
+/// Text outside the blocks is copied through unchanged. Returns `None` when a
+/// statement tag other than `if` is met at this level, when an `if` has no
+/// matching `endif`, or when the selected branch cannot be rendered.
+fn render_jinja_blocks(template: &str, values: &[(&str, &str)]) -> Option<String> {
+    let mut output = String::new();
+    let mut position = 0;
+    while let Some(opening_tag) = JINJA_TAG_RE.find_at(template, position) {
+        output.push_str(&template[position..opening_tag.start()]);
+        let tag_text = JINJA_TAG_RE
+            .captures(opening_tag.as_str())?
+            .get(1)?
+            .as_str()
+            .trim();
+        // Only `if` may open a block here; every other tag is unsupported.
+        let condition = tag_text.strip_prefix("if ")?;
+        let closing_tag = find_matching_endif(template, opening_tag.end())?;
+        let body = &template[opening_tag.end()..closing_tag.endif_start];
+        output.push_str(&render_if_body(body, condition, values)?);
+        // Resume scanning after the closing `endif`.
+        position = closing_tag.endif_end;
+    }
+    output.push_str(&template[position..]);
+    Some(output)
+}
+
+
+/// Finds the `endif` tag that closes the `if` block whose body starts at byte offset `from`.
+///
+/// Nested `if` ... `endif` pairs inside the body are skipped by tracking the nesting depth.
+/// Returns the byte offsets of the closing tag within `template`, or `None` when no
+/// matching `endif` exists.
+fn find_matching_endif(template: &str, from: usize) -> Option<JinjaBlockEnd> {
+    // The opening `if` was already consumed by the caller, so the depth starts at 1.
+    let mut depth = 1;
+    for tag_match in JINJA_TAG_RE.find_iter(&template[from..]) {
+        // Matches are relative to the `from` slice; convert them to offsets in `template`.
+        let absolute_start = from + tag_match.start();
+        let absolute_end = from + tag_match.end();
+        let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
+        let tag_content = captures.get(1)?.as_str().trim();
+        if tag_content.starts_with("if ") {
+            depth += 1;
+        } else if tag_content == "endif" {
+            depth -= 1;
+            if depth == 0 {
+                return Some(JinjaBlockEnd {
+                    endif_start: absolute_start,
+                    endif_end: absolute_end,
+                });
+            }
        }
    }
-    rendered
+    None
+}
+
+/// Renders the first branch of an `if` block whose condition holds.
+///
+/// `body` is the text between the opening `if` tag and its `endif`, and
+/// `first_condition` is the condition of that opening tag. Branches are tried
+/// in order; an `else` branch always matches. Returns an empty string when no
+/// branch matches, and `None` when a condition or the chosen branch is unsupported.
+fn render_if_body(body: &str, first_condition: &str, values: &[(&str, &str)]) -> Option<String> {
+    for JinjaBranch { condition, content } in split_if_branches(body, first_condition)? {
+        let branch_taken = match condition.as_deref() {
+            Some(text) => eval_field_condition(text, values)?,
+            None => true,
+        };
+        if branch_taken {
+            return render_known_template(&content, values);
+        }
+    }
+    Some(String::new())
+}
+
+
+/// Splits the body of an `if` block into its branches at same-level `elif`/`else` tags.
+///
+/// Tags that belong to a nested `if` are left inside the branch text. Returns
+/// `None` when the body contains an `endif` without a matching nested `if`.
+fn split_if_branches(body: &str, first_condition: &str) -> Option<Vec<JinjaBranch>> {
+    let mut branches = Vec::new();
+    let mut nesting = 0usize;
+    let mut pending_condition = Some(first_condition.trim().to_string());
+    let mut content_start = 0;
+    for tag in JINJA_TAG_RE.find_iter(body) {
+        let tag_text = JINJA_TAG_RE.captures(tag.as_str())?.get(1)?.as_str().trim();
+        if tag_text.starts_with("if ") {
+            nesting += 1;
+        } else if tag_text == "endif" {
+            // An `endif` at nesting 0 would close the block this body belongs to.
+            nesting = nesting.checked_sub(1)?;
+        } else if nesting == 0 {
+            let next_condition = if let Some(text) = tag_text.strip_prefix("elif ") {
+                Some(text.trim().to_string())
+            } else if tag_text == "else" {
+                None
+            } else {
+                // Any other tag stays part of the current branch text.
+                continue;
+            };
+            branches.push(JinjaBranch {
+                condition: pending_condition.take(),
+                content: body[content_start..tag.start()].to_string(),
+            });
+            pending_condition = next_condition;
+            content_start = tag.end();
+        }
+    }
+    // Whatever follows the last separator forms the final branch.
+    branches.push(JinjaBranch {
+        condition: pending_condition,
+        content: body[content_start..].to_string(),
+    });
+    Some(branches)
+}
+
+
+/// Evaluates a condition built from `fields.xxx`, `not`, `and` and `or`.
+///
+/// Returns `None` when an operand that has to be evaluated is not a supported
+/// field expression.
+fn eval_field_condition(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
+    let trimmed = condition.trim();
+    // `or` is split before `and`, so `and` binds tighter.
+    let (separator, deciding_value) = if trimmed.contains(" or ") {
+        (" or ", true)
+    } else if trimmed.contains(" and ") {
+        (" and ", false)
+    } else {
+        return eval_field_condition_atom(trimmed, values);
+    };
+    for operand in trimmed.split(separator) {
+        // The first operand that settles the result ends the evaluation early.
+        if eval_field_condition(operand, values)? == deciding_value {
+            return Some(deciding_value);
+        }
+    }
+    Some(!deciding_value)
+}
+
+
+/// Evaluates a single field condition, optionally prefixed with `not`.
+///
+/// A field is truthy when its value is non-empty; a missing field counts as
+/// empty. Returns `None` when the expression is not a plain field reference.
+fn eval_field_condition_atom(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
+    let trimmed = condition.trim();
+    let (negated, expression) = match trimmed.strip_prefix("not ") {
+        Some(rest) => (true, rest.trim()),
+        None => (false, trimmed),
+    };
+    let key = parse_field_key(expression)?;
+    let truthy = get_template_value(values, &key).map_or(false, |value| !value.is_empty());
+    Some(truthy != negated)
+}
+
+
+/// Substitutes every `{{ ... }}` expression in `template` with its rendered value.
+///
+/// Returns `None` when an expression is unsupported, or when the finished
+/// text still contains `{{` or `{%`.
+fn render_field_vars(template: &str, values: &[(&str, &str)]) -> Option<String> {
+    let mut output = String::with_capacity(template.len());
+    let mut copied_up_to = 0;
+    for expression_match in JINJA_EXPR_RE.captures_iter(template) {
+        let span = expression_match.get(0)?;
+        let expression = expression_match.get(1)?.as_str();
+        output.push_str(&template[copied_up_to..span.start()]);
+        output.push_str(&eval_field_output(expression, values)?);
+        copied_up_to = span.end();
+    }
+    output.push_str(&template[copied_up_to..]);
+    // Checked on the finished text, so this also trips when a substituted field
+    // value itself contains one of these delimiters.
+    let has_leftover_syntax = output.contains("{{") || output.contains("{%");
+    (!has_leftover_syntax).then_some(output)
+}
+
+
+/// Renders one output expression: a field, a string literal, a `+`
+/// concatenation of those, or an inline `a if cond else b`.
+///
+/// Returns `None` for anything else.
+fn eval_field_output(expression: &str, values: &[(&str, &str)]) -> Option<String> {
+    let expression = expression.trim();
+    if let Some((when_true, condition, when_false)) = split_inline_if(expression) {
+        let chosen = if eval_field_condition(condition, values)? {
+            when_true
+        } else {
+            when_false
+        };
+        return eval_field_output(chosen, values);
+    }
+    // A lone term is simply a concatenation of one; collecting into `Option`
+    // stops at the first unsupported term.
+    split_concat_terms(expression)?
+        .into_iter()
+        .map(|term| eval_field_atom(term, values))
+        .collect()
+}
+
+
+/// Splits a simple inline conditional `a if cond else b` into `(a, cond, b)`.
+///
+/// The split happens at the first ` if ` and at the first ` else ` after it;
+/// each part is trimmed. Returns `None` when either keyword is missing.
+fn split_inline_if(expression: &str) -> Option<(&str, &str, &str)> {
+    const IF_KEYWORD: &str = " if ";
+    const ELSE_KEYWORD: &str = " else ";
+    let if_at = expression.find(IF_KEYWORD)?;
+    let after_if = &expression[if_at + IF_KEYWORD.len()..];
+    let else_at = after_if.find(ELSE_KEYWORD)?;
+    let when_true = expression[..if_at].trim();
+    let condition = after_if[..else_at].trim();
+    let when_false = after_if[else_at + ELSE_KEYWORD.len()..].trim();
+    Some((when_true, condition, when_false))
+}
+
+
+/// Splits a `+` concatenation into its trimmed terms, ignoring any `+` inside
+/// a quoted string literal.
+///
+/// Returns `None` when a term is empty or a quote is left unterminated.
+fn split_concat_terms(expression: &str) -> Option<Vec<&str>> {
+    // A term has to contain something besides whitespace.
+    fn trimmed_term(raw: &str) -> Option<&str> {
+        let term = raw.trim();
+        (!term.is_empty()).then_some(term)
+    }
+
+    let mut terms = Vec::new();
+    let mut term_start = 0;
+    let mut open_quote: Option<char> = None;
+    for (offset, ch) in expression.char_indices() {
+        match (open_quote, ch) {
+            // Only the same quote character closes a literal.
+            (Some(opener), _) if ch == opener => open_quote = None,
+            (Some(_), _) => {}
+            (None, '\'' | '"') => open_quote = Some(ch),
+            (None, '+') => {
+                terms.push(trimmed_term(&expression[term_start..offset])?);
+                term_start = offset + ch.len_utf8();
+            }
+            (None, _) => {}
+        }
+    }
+    if open_quote.is_some() {
+        return None;
+    }
+    terms.push(trimmed_term(&expression[term_start..])?);
+    Some(terms)
+}
+
+
+/// Renders a single term: a quoted string literal or a field reference.
+///
+/// A missing field renders as an empty string. Returns `None` for any other
+/// expression so the caller can fall back to Python.
+fn eval_field_atom(expression: &str, values: &[(&str, &str)]) -> Option<String> {
+    let expression = expression.trim();
+    parse_string_literal(expression).or_else(|| {
+        let key = parse_field_key(expression)?;
+        let value = get_template_value(values, &key).unwrap_or_default();
+        Some(value.to_string())
+    })
+}
+
+
+/// Parses a single- or double-quoted string literal and returns its content.
+///
+/// Returns `None` when `expression` is not wrapped in a matching pair of
+/// quotes. It also returns `None` when the content holds a backslash or
+/// another quote of the same kind: Jinja decodes backslash escapes and treats
+/// an unescaped inner quote as the end of the literal, and neither is
+/// reproduced here, so such input is left for the Python fallback.
+fn parse_string_literal(expression: &str) -> Option<String> {
+    let quote = expression
+        .chars()
+        .next()
+        .filter(|ch| matches!(ch, '\'' | '"'))?;
+    // Stripping the prefix first means a lone quote character cannot count as
+    // both the opening and the closing quote.
+    let inner = expression.strip_prefix(quote)?.strip_suffix(quote)?;
+    if inner.contains(quote) || inner.contains('\\') {
+        return None;
+    }
+    Some(inner.to_string())
+}
+
+
+/// Extracts the field name from `fields.name`, `fields['name']` or `fields["name"]`.
+///
+/// Returns `None` for anything else, which rejects function calls,
+/// comparisons and the rest of the Jinja expression syntax.
+fn parse_field_key(expression: &str) -> Option<String> {
+    let matched = FIELD_EXPR_RE.captures(expression.trim())?;
+    // Group 1 holds the dotted form, group 2 the bracketed form.
+    let name = match matched.get(1) {
+        Some(dotted) => dotted,
+        None => matched.get(2)?,
+    };
+    Some(name.as_str().to_string())
+}
+
+
+/// Returns the value of the first entry in `values` whose name equals `template_key`.
+///
+/// Returns `None` when no entry has that name; callers treat that as an empty value.
+fn get_template_value<'a>(values: &'a [(&str, &str)], template_key: &str) -> Option<&'a str> {
+    values
+        .iter()
+        .find_map(|&(field_key, value)| (field_key == template_key).then_some(value))
+}
+
+
+/// Byte offsets, within the template, of the `endif` tag that closes an `if` block.
+struct JinjaBlockEnd {
+    // Offset of the first byte of the closing tag.
+    endif_start: usize,
+    // Offset just past the last byte of the closing tag.
+    endif_end: usize,
+}
+
+/// One branch of an `if`/`elif`/`else` block.
+struct JinjaBranch {
+    // Condition text of the branch; `None` for an `else` branch.
+    condition: Option<String>,
+    // Raw template text of the branch body.
+    content: String,
 }

 /// 读取分类配置中的 ID 列表。
--- a/tests/test_rust_accel.py
+++ b/tests/test_rust_accel.py
@@ -196,3 +196,152 @@ def test_rust_indexer_page_parser_handles_common_fields():
        "hit_and_run": True,
        "category": MediaType.MOVIE.value,
    }]
+
+
+def test_rust_indexer_page_parser_renders_common_title_template():
+    """
+    The Rust parser for regular indexer pages should support the title_optional template used by the site-builder project.
+    """
+    spider = SiteSpider(
+        indexer={
+            "id": "demo",
+            "name": "Demo",
+            "domain": "https://example.org/",
+            "search": {"paths": [{"path": "torrents.php"}]},
+            "torrents": {
+                "list": {"selector": "tr.torrent"},
+                "fields": {
+                    "title_default": {"selector": "a.title"},
+                    "title_optional": {
+                        "selector": "a.title",
+                        "attribute": "title",
+                        "optional": True,
+                    },
+                    "title": {
+                        "text": (
+                            "{% if fields['title_optional'] %}"
+                            "{{ fields['title_optional'] }}"
+                            "{% else %}"
+                            "{{ fields['title_default'] }}"
+                            "{% endif %}"
+                        )
+                    },
+                    "download": {"selector": "a.dl", "attribute": "href"},
+                },
+            },
+        },
+    )
+    html = """
+    <table>
+      <tr class="torrent">
+        <td><a class="title" title="Optional Name" href="/details/1">Default Name</a></td>
+        <td><a class="dl" href="/download/1">DL</a></td>
+      </tr>
+      <tr class="torrent">
+        <td><a class="title" title="" href="/details/2">Default Fallback</a></td>
+        <td><a class="dl" href="/download/2">DL</a></td>
+      </tr>
+    </table>
+    """
+
+    torrents = spider.parse(html)
+
+    assert [item["title"] for item in torrents] == ["Optional Name", "Default Fallback"]
+
+
+def test_rust_indexer_page_parser_renders_common_description_templates():
+    """
+    The Rust parser for regular indexer pages should support the description field templates used by the site-builder project.
+    """
+    spider = SiteSpider(
+        indexer={
+            "id": "demo",
+            "name": "Demo",
+            "domain": "https://example.org/",
+            "search": {"paths": [{"path": "torrents.php"}]},
+            "torrents": {
+                "list": {"selector": "tr.torrent"},
+                "fields": {
+                    "title": {"selector": "a.title"},
+                    "subject": {"selector": ".subject"},
+                    "tags": {"selector": ".tags"},
+                    "description": {
+                        "text": (
+                            "{% if fields['tags']%}"
+                            "{{ fields['subject']+' '+fields['tags'] }}"
+                            "{% else %}"
+                            "{{ fields['subject'] }}"
+                            "{% endif %}"
+                        )
+                    },
+                    "download": {"selector": "a.dl", "attribute": "href"},
+                },
+            },
+        },
+    )
+    html = """
+    <table>
+      <tr class="torrent">
+        <td><a class="title">Movie 2024</a><span class="subject">BluRay</span><span class="tags">HDR</span></td>
+        <td><a class="dl" href="/download/1">DL</a></td>
+      </tr>
+      <tr class="torrent">
+        <td><a class="title">Show 2025</a><span class="subject">WEB-DL</span><span class="tags"></span></td>
+        <td><a class="dl" href="/download/2">DL</a></td>
+      </tr>
+    </table>
+    """
+
+    torrents = spider.parse(html)
+
+    assert [item["description"] for item in torrents] == ["BluRay HDR", "WEB-DL"]
+
+
+def test_rust_indexer_page_parser_supports_remove_and_negative_index():
+    """
+    The Rust parser for regular indexer pages should support `remove` and negative indexes, both common in site configs.
+    """
+    spider = SiteSpider(
+        indexer={
+            "id": "demo",
+            "name": "Demo",
+            "domain": "https://example.org/",
+            "search": {"paths": [{"path": "torrents.php"}]},
+            "torrents": {
+                "list": {"selector": "tr.torrent"},
+                "fields": {
+                    "title": {"selector": ".name", "remove": "a,b"},
+                    "description": {
+                        "selector": ".desc",
+                        "remove": "span,a,img,font,b",
+                        "contents": -1,
+                    },
+                    "labels": {
+                        "selector": ".labels > span",
+                        "remove": "span,a,img,font,b",
+                        "contents": -1,
+                    },
+                    "download": {"selector": "a.dl", "attribute": "href"},
+                },
+            },
+        },
+    )
+    html = """
+    <table>
+      <tr class="torrent">
+        <td class="name">Movie<a>删掉</a><b>也删</b>2024</td>
+        <td class="desc">第一行
+          <span>标签</span><a>链接</a>
+          第二行
+        </td>
+        <td class="labels"><span><i>DIY</i></span><span><i>HDR</i></span></td>
+        <td><a class="dl" href="/download/1">DL</a></td>
+      </tr>
+    </table>
+    """
+
+    torrents = spider.parse(html)
+
+    assert torrents[0]["title"] == "Movie2024"
+    assert torrents[0]["description"] == "第一行 第二行"
+    assert torrents[0]["labels"] == ["DIY", "HDR"]