fix: support jinja indexer parsing in rust

This commit is contained in:
jxxghp
2026-05-23 08:07:43 +08:00
parent da54f3a302
commit fcbea077b7
5 changed files with 199 additions and 299 deletions

View File

@@ -24,6 +24,52 @@ function WARN() {
VENV_PATH="${VENV_PATH:-/opt/venv}"
export PATH="${VENV_PATH}/bin:$PATH"
# 按需准备 Rust 构建环境,避免把工具链常驻打进 Docker runtime 镜像。
function ensure_rust_build_env() {
export PATH="/root/.cargo/bin:$PATH"
if command -v cargo > /dev/null 2>&1; then
return 0
fi
INFO "→ 当前镜像未包含 cargo正在按需准备 Rust 构建环境..."
if command -v apt-get > /dev/null 2>&1; then
if ! apt-get update; then
ERROR "更新 apt 索引失败,无法安装 Rust 构建依赖"
return 1
fi
if ! apt-get install -y --no-install-recommends build-essential curl ca-certificates; then
ERROR "安装 Rust 构建依赖失败"
return 1
fi
apt-get clean
rm -rf /var/lib/apt/lists/*
fi
if ! curl ${CURL_OPTIONS} https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal; then
ERROR "安装 Rust 工具链失败"
return 1
fi
export PATH="/root/.cargo/bin:$PATH"
command -v cargo > /dev/null 2>&1
}
# 更新 Rust 加速扩展,确保 Docker dev/release 更新源码后不会继续加载旧 wheel。
function install_rust_accel() {
local manifest="/app/rust/moviepilot_rust/Cargo.toml"
if [ ! -f "${manifest}" ]; then
WARN "未找到 Rust 扩展源码,跳过 Rust 加速扩展更新"
return 0
fi
if ! ensure_rust_build_env; then
ERROR "Rust 构建环境不可用,无法更新 Rust 加速扩展"
return 1
fi
INFO "→ 正在更新 Rust 加速扩展..."
if ! "${VENV_PATH}/bin/python" -m maturin develop --release --manifest-path "${manifest}"; then
ERROR "Rust 加速扩展更新失败"
return 1
fi
INFO "Rust 加速扩展更新成功"
}
# 下载及解压
function download_and_unzip() {
local retries=0
@@ -166,6 +212,9 @@ function install_backend_and_download_resources() {
WARN "${sites_file} 下载失败,继续使用旧的资源来启动..."
fi
INFO "站点资源更新成功"
if ! install_rust_accel; then
return 1
fi
# 清理临时目录
rm -rf "${TMP_PATH}"
return 0

View File

@@ -344,6 +344,12 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "memo-map"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b"
[[package]]
name = "memoffset"
version = "0.9.1"
@@ -353,10 +359,21 @@ dependencies = [
"autocfg",
]
[[package]]
name = "minijinja"
version = "2.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2929e494b2280e1e18959bb2e121da03347ae896896fdfaceaab43c88a02803f"
dependencies = [
"memo-map",
"serde",
]
[[package]]
name = "moviepilot-rust"
version = "0.1.0"
dependencies = [
"minijinja",
"once_cell",
"percent-encoding",
"pyo3",

View File

@@ -8,6 +8,7 @@ name = "moviepilot_rust"
crate-type = ["cdylib"]
[dependencies]
minijinja = "2.20"
once_cell = "1.20"
percent-encoding = "2.3"
pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }

View File

@@ -1,4 +1,5 @@
use crate::utils::{get_optional_i64, get_optional_string, py_i64_to_usize};
use minijinja::{context, Environment, UndefinedBehavior};
use once_cell::sync::Lazy;
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
use pyo3::exceptions::PyValueError;
@@ -6,6 +7,7 @@ use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList};
use regex::{Regex, RegexBuilder};
use scraper::{ElementRef, Html, Selector};
use std::collections::BTreeMap;
use url::form_urlencoded;
use url::Url;
@@ -34,13 +36,12 @@ static FILESIZE_UNIT_RE: Lazy<Regex> = Lazy::new(|| {
.unwrap()
});
static NUMERIC_FACTOR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap());
static FIELD_EXPR_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"^fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])$"#).unwrap()
});
static FIELD_REF_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap());
static JINJA_EXPR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{\{-?\s*(.*?)\s*-?\}\}"#).unwrap());
static JINJA_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{%-?\s*(.*?)\s*-?%\}"#).unwrap());
static HAS_QUOTED_SELECTOR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap());
static TABLE_DIRECT_TR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap());
enum RowParseResult {
Unsupported,
@@ -66,7 +67,7 @@ pub(crate) fn parse_indexer_torrents_fast(
if list_selector_text.is_empty() {
return Ok(None);
}
let Ok(list_selector) = Selector::parse(&list_selector_text) else {
let Some(list_selector) = parse_site_selector(&list_selector_text) else {
return Ok(None);
};
let document = Html::parse_document(html_text);
@@ -386,11 +387,7 @@ fn parse_title(
safe_query(row, &selector)?
} else if let Some(template) = get_optional_string(&selector, "text")? {
let values = collect_template_field_values(row, fields, &template)?;
let refs: Vec<(&str, &str)> = values
.iter()
.map(|(key, value)| (key.as_str(), value.as_str()))
.collect();
let Some(rendered) = render_known_template(&template, &refs) else {
let Some(rendered) = render_jinja_template(&template, &values) else {
return Ok(false);
};
Some(rendered)
@@ -418,11 +415,7 @@ fn parse_description(
safe_query(row, &selector)?
} else if let Some(template) = get_optional_string(&selector, "text")? {
let values = collect_template_field_values(row, fields, &template)?;
let refs: Vec<(&str, &str)> = values
.iter()
.map(|(key, value)| (key.as_str(), value.as_str()))
.collect();
let Some(rendered) = render_known_template(&template, &refs) else {
let Some(rendered) = render_jinja_template(&template, &values) else {
return Ok(false);
};
Some(rendered)
@@ -441,7 +434,7 @@ fn collect_template_field_values(
row: ElementRef<'_>,
fields: &Bound<'_, PyDict>,
template: &str,
) -> PyResult<Vec<(String, String)>> {
) -> PyResult<BTreeMap<String, String>> {
let mut keys = Vec::new();
for captures in FIELD_REF_RE.captures_iter(template) {
let Some(key) = captures.get(1).or_else(|| captures.get(2)) else {
@@ -453,14 +446,14 @@ fn collect_template_field_values(
}
}
let mut values = Vec::new();
let mut values = BTreeMap::new();
for key in keys {
if let Some(field_selector) = get_field_dict(fields, &key)? {
let value = safe_query(row, &field_selector)?.unwrap_or_default();
values.push((key, value));
values.insert(key, value);
}
}
Ok(values)
Ok(resolve_embedded_field_templates(values))
}
/// 解析普通文本字段。
@@ -683,6 +676,44 @@ fn get_field_dict<'py>(
Ok(Some(value.downcast_into::<PyDict>()?))
}
/// 解析站点配置选择器,并兼容 PyQuery 允许的 :has("selector") 写法。
fn parse_site_selector(selector_text: &str) -> Option<Selector> {
let normalized = normalize_pyquery_selector(selector_text);
let expanded = expand_table_direct_tr_selector(&normalized);
if let Ok(selector) = Selector::parse(&expanded) {
return Some(selector);
}
if expanded != normalized {
if let Ok(selector) = Selector::parse(&normalized) {
return Some(selector);
}
}
Selector::parse(selector_text).ok()
}
/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。
fn normalize_pyquery_selector(selector_text: &str) -> String {
HAS_QUOTED_SELECTOR_RE
.replace_all(selector_text, |captures: &regex::Captures<'_>| {
let inner = captures
.get(1)
.or_else(|| captures.get(2))
.map(|item| item.as_str())
.unwrap_or_default();
format!(":has({inner})")
})
.into_owned()
}
/// 为 table > tr 选择器追加 tbody 变体,适配 Rust HTML5 解析自动补 tbody 的行为。
fn expand_table_direct_tr_selector(selector_text: &str) -> String {
let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2");
if expanded == selector_text {
return selector_text.to_string();
}
format!("{selector_text}, {expanded}")
}
/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。
fn safe_query(
row: ElementRef<'_>,
@@ -702,7 +733,7 @@ fn query_all_values(
let Some(selector_text) = get_selector_text(selector_config)? else {
return Ok(None);
};
let Ok(selector) = Selector::parse(&selector_text) else {
let Some(selector) = parse_site_selector(&selector_text) else {
return Ok(None);
};
let attribute = get_optional_string(selector_config, "attribute")?;
@@ -729,7 +760,7 @@ fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<S
if item.is_empty() {
continue;
}
let Ok(selector) = Selector::parse(item) else {
let Some(selector) = parse_site_selector(item) else {
return Ok(Vec::new());
};
selectors.push(selector);
@@ -848,7 +879,7 @@ fn should_skip_text_node(
/// 判断 row 内是否存在指定 selector。
fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult<bool> {
let Ok(selector) = Selector::parse(selector_text) else {
let Some(selector) = parse_site_selector(selector_text) else {
return Ok(false);
};
Ok(row.select(&selector).next().is_some())
@@ -882,290 +913,32 @@ fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> Str
}
}
/// 渲染常见的 Jinja 字段模板,不支持复杂表达式时由调用方回退 Python
fn render_known_template(template: &str, values: &[(&str, &str)]) -> Option<String> {
if template.contains("{#") {
return None;
}
let rendered = render_jinja_blocks(template, values)?;
render_field_vars(&rendered, values)
/// 使用 MiniJinja 渲染站点字段模板,语义对齐 Python jinja2 的 Template.render(fields=...)
fn render_jinja_template(template: &str, fields: &BTreeMap<String, String>) -> Option<String> {
let mut env = Environment::new();
env.set_undefined_behavior(UndefinedBehavior::Chainable);
env.render_str(template, context! { fields => fields }).ok()
}
/// 渲染站点解析配置里常见的 if/elif/else/endif 字段模板
fn render_jinja_blocks(template: &str, values: &[(&str, &str)]) -> Option<String> {
let mut result = String::new();
let mut cursor = 0;
while let Some(tag_match) = JINJA_TAG_RE.find_at(template, cursor) {
result.push_str(&template[cursor..tag_match.start()]);
let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
let tag_content = captures.get(1)?.as_str().trim();
let Some(condition) = tag_content.strip_prefix("if ") else {
return None;
};
let block_end = find_matching_endif(template, tag_match.end())?;
let body = &template[tag_match.end()..block_end.endif_start];
let rendered_branch = render_if_body(body, condition, values)?;
result.push_str(&rendered_branch);
cursor = block_end.endif_end;
}
result.push_str(&template[cursor..]);
Some(result)
}
/// 查找当前 if 块对应的 endif允许内部再嵌套一层字段模板。
fn find_matching_endif(template: &str, from: usize) -> Option<JinjaBlockEnd> {
let mut depth = 1;
for tag_match in JINJA_TAG_RE.find_iter(&template[from..]) {
let absolute_start = from + tag_match.start();
let absolute_end = from + tag_match.end();
let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
let tag_content = captures.get(1)?.as_str().trim();
if tag_content.starts_with("if ") {
depth += 1;
} else if tag_content == "endif" {
depth -= 1;
if depth == 0 {
return Some(JinjaBlockEnd {
endif_start: absolute_start,
endif_end: absolute_end,
});
}
}
}
None
}
/// 从 if 块中选出第一个满足条件的分支并继续渲染。
fn render_if_body(body: &str, first_condition: &str, values: &[(&str, &str)]) -> Option<String> {
let branches = split_if_branches(body, first_condition)?;
for branch in branches {
let selected = match branch.condition {
Some(condition) => eval_field_condition(&condition, values)?,
None => true,
};
if selected {
return render_known_template(&branch.content, values);
}
}
Some(String::new())
}
/// 按同层级 elif/else 拆分 if 块,嵌套 if 内部的分支不会被误拆。
fn split_if_branches(body: &str, first_condition: &str) -> Option<Vec<JinjaBranch>> {
let mut branches = Vec::new();
let mut depth = 0;
let mut current_condition = Some(first_condition.trim().to_string());
let mut branch_start = 0;
for tag_match in JINJA_TAG_RE.find_iter(body) {
let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
let tag_content = captures.get(1)?.as_str().trim();
if tag_content.starts_with("if ") {
depth += 1;
/// 渲染字段值中意外残留的 Jinja 模板,避免站点 title 属性里的模板文本继续进入识别链路
fn resolve_embedded_field_templates(values: BTreeMap<String, String>) -> BTreeMap<String, String> {
let mut resolved = values.clone();
for (key, value) in &values {
if !contains_jinja_syntax(value) {
continue;
}
if tag_content == "endif" {
if depth == 0 {
return None;
}
depth -= 1;
continue;
}
if depth == 0 {
if let Some(condition) = tag_content.strip_prefix("elif ") {
branches.push(JinjaBranch {
condition: current_condition.take(),
content: body[branch_start..tag_match.start()].to_string(),
});
current_condition = Some(condition.trim().to_string());
branch_start = tag_match.end();
} else if tag_content == "else" {
branches.push(JinjaBranch {
condition: current_condition.take(),
content: body[branch_start..tag_match.start()].to_string(),
});
current_condition = None;
branch_start = tag_match.end();
}
let mut context_values = resolved.clone();
context_values.insert(key.clone(), String::new());
if let Some(rendered) = render_jinja_template(value, &context_values) {
resolved.insert(key.clone(), rendered);
}
}
branches.push(JinjaBranch {
condition: current_condition,
content: body[branch_start..].to_string(),
});
Some(branches)
resolved
}
/// 计算字段真值条件,覆盖站点模板里的 fields.xxx、not、and、or
fn eval_field_condition(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
let trimmed = condition.trim();
if trimmed.contains(" or ") {
for part in trimmed.split(" or ") {
if eval_field_condition(part, values)? {
return Some(true);
}
}
return Some(false);
}
if trimmed.contains(" and ") {
for part in trimmed.split(" and ") {
if !eval_field_condition(part, values)? {
return Some(false);
}
}
return Some(true);
}
eval_field_condition_atom(trimmed, values)
}
/// 计算单个字段条件,缺失字段按 Jinja Undefined 的假值处理。
fn eval_field_condition_atom(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
let (negated, expression) = if let Some(rest) = condition.trim().strip_prefix("not ") {
(true, rest.trim())
} else {
(false, condition.trim())
};
let key = parse_field_key(expression)?;
let value = get_template_value(values, &key).unwrap_or_default();
let truthy = !value.is_empty();
Some(if negated { !truthy } else { truthy })
}
/// 替换模板中的 fields 变量,存在未知变量语法时回退 Python。
fn render_field_vars(template: &str, values: &[(&str, &str)]) -> Option<String> {
let mut rendered = String::new();
let mut cursor = 0;
for captures in JINJA_EXPR_RE.captures_iter(template) {
let whole = captures.get(0)?;
rendered.push_str(&template[cursor..whole.start()]);
let expression = captures.get(1)?.as_str();
rendered.push_str(&eval_field_output(expression, values)?);
cursor = whole.end();
}
rendered.push_str(&template[cursor..]);
if rendered.contains("{{") || rendered.contains("{%") {
return None;
}
Some(rendered)
}
/// 渲染输出表达式,覆盖字段变量、字段三元表达式和字符串拼接。
fn eval_field_output(expression: &str, values: &[(&str, &str)]) -> Option<String> {
let expression = expression.trim();
if let Some((true_expr, condition, false_expr)) = split_inline_if(expression) {
if eval_field_condition(condition, values)? {
return eval_field_output(true_expr, values);
}
return eval_field_output(false_expr, values);
}
let terms = split_concat_terms(expression)?;
if terms.len() > 1 {
let mut rendered = String::new();
for term in terms {
rendered.push_str(&eval_field_atom(term, values)?);
}
return Some(rendered);
}
eval_field_atom(expression, values)
}
/// 拆分 Jinja 的简单三元表达式a if cond else b。
fn split_inline_if(expression: &str) -> Option<(&str, &str, &str)> {
let (true_expr, right) = expression.split_once(" if ")?;
let (condition, false_expr) = right.split_once(" else ")?;
Some((true_expr.trim(), condition.trim(), false_expr.trim()))
}
/// 按字符串字面量边界拆分加号拼接表达式。
fn split_concat_terms(expression: &str) -> Option<Vec<&str>> {
let mut terms = Vec::new();
let mut start = 0;
let mut quote: Option<char> = None;
for (index, ch) in expression.char_indices() {
if let Some(current_quote) = quote {
if ch == current_quote {
quote = None;
}
continue;
}
if ch == '\'' || ch == '"' {
quote = Some(ch);
continue;
}
if ch == '+' {
let term = expression[start..index].trim();
if term.is_empty() {
return None;
}
terms.push(term);
start = index + ch.len_utf8();
}
}
if quote.is_some() {
return None;
}
let term = expression[start..].trim();
if term.is_empty() {
return None;
}
terms.push(term);
Some(terms)
}
/// 渲染字段或字符串字面量,其他表达式交给 Python 回退。
fn eval_field_atom(expression: &str, values: &[(&str, &str)]) -> Option<String> {
let expression = expression.trim();
if let Some(value) = parse_string_literal(expression) {
return Some(value);
}
let key = parse_field_key(expression)?;
Some(
get_template_value(values, &key)
.unwrap_or_default()
.to_string(),
)
}
/// 解析单引号或双引号字符串字面量。
fn parse_string_literal(expression: &str) -> Option<String> {
let mut chars = expression.chars();
let quote = chars.next()?;
if quote != '\'' && quote != '"' {
return None;
}
if !expression.ends_with(quote) || expression.len() < 2 {
return None;
}
let inner = &expression[quote.len_utf8()..expression.len() - quote.len_utf8()];
Some(inner.to_string())
}
/// 解析 fields 变量名,拒绝函数调用和比较表达式等完整 Jinja 能力。
fn parse_field_key(expression: &str) -> Option<String> {
let captures = FIELD_EXPR_RE.captures(expression.trim())?;
captures
.get(1)
.or_else(|| captures.get(2))
.map(|item| item.as_str().to_string())
}
/// 从模板上下文中获取字段值,缺失字段按 Jinja 的空值处理。
fn get_template_value<'a>(values: &'a [(&str, &str)], template_key: &str) -> Option<&'a str> {
for (field_key, value) in values {
if *field_key == template_key {
return Some(*value);
}
}
None
}
struct JinjaBlockEnd {
endif_start: usize,
endif_end: usize,
}
struct JinjaBranch {
condition: Option<String>,
content: String,
/// 判断文本是否包含 Jinja 语法标记,作为字段内嵌模板的低成本预筛选
fn contains_jinja_syntax(value: &str) -> bool {
value.contains("{{") || value.contains("{%") || value.contains("{#")
}
/// 读取分类配置中的 ID 列表。

View File

@@ -299,6 +299,66 @@ def test_rust_indexer_page_parser_renders_literal_title_template_without_default
}]
def test_rust_indexer_page_parser_supports_agsvpt_selector_and_embedded_title_template():
"""
Rust 普通 indexer 页面解析应兼容 AGSVPT 的 PyQuery 选择器和字段内嵌 Jinja 模板。
"""
spider = SiteSpider(
indexer={
"id": "agsvpt",
"name": "AGSVPT",
"domain": "https://www.agsvpt.com/",
"search": {"paths": [{"path": "torrents.php"}]},
"torrents": {
"list": {"selector": 'table.torrents > tr:has("table.torrentname")'},
"fields": {
"title_default": {"selector": 'a[href*="details.php?id="]'},
"title_optional": {
"selector": 'a[title][href*="details.php?id="]',
"attribute": "title",
"optional": True,
},
"title": {
"text": (
"{% if fields['title_optional'] %}"
"{{ fields['title_optional'] }}"
"{% else %}"
"{{ fields['title_default'] }}"
"{% endif %}"
)
},
"details": {
"selector": 'a[href*="details.php?id="]',
"attribute": "href",
},
"download": {
"selector": 'a[href*="download.php?id="]',
"attribute": "href",
},
},
},
},
)
html = """
<table class="torrents">
<tr>
<td><table class="torrentname"><tr><td>
<a href="details.php?id=1" title="{% if fields['title_optional'] %}{% else %}Release that Witch S01 2026 1080p WEB-DL H264 AAC-HHWEB{% endif %}">Ignored</a>
</td></tr></table></td>
<td><a href="download.php?id=1">DL</a></td>
</tr>
</table>
"""
torrents = spider.parse(html)
assert torrents == [{
"title": "Release that Witch S01 2026 1080p WEB-DL H264 AAC-HHWEB",
"page_url": "https://www.agsvpt.com/details.php?id=1",
"enclosure": "https://www.agsvpt.com/download.php?id=1",
}]
def test_rust_indexer_page_parser_renders_common_description_templates():
"""
Rust 普通 indexer 页面解析应兼容站点构建项目里的 description 字段模板。