mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-02 07:26:46 +00:00
fix: support jinja indexer parsing in rust
This commit is contained in:
@@ -24,6 +24,52 @@ function WARN() {
|
||||
VENV_PATH="${VENV_PATH:-/opt/venv}"
|
||||
export PATH="${VENV_PATH}/bin:$PATH"
|
||||
|
||||
# 按需准备 Rust 构建环境,避免把工具链常驻打进 Docker runtime 镜像。
|
||||
function ensure_rust_build_env() {
|
||||
export PATH="/root/.cargo/bin:$PATH"
|
||||
if command -v cargo > /dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
INFO "→ 当前镜像未包含 cargo,正在按需准备 Rust 构建环境..."
|
||||
if command -v apt-get > /dev/null 2>&1; then
|
||||
if ! apt-get update; then
|
||||
ERROR "更新 apt 索引失败,无法安装 Rust 构建依赖"
|
||||
return 1
|
||||
fi
|
||||
if ! apt-get install -y --no-install-recommends build-essential curl ca-certificates; then
|
||||
ERROR "安装 Rust 构建依赖失败"
|
||||
return 1
|
||||
fi
|
||||
apt-get clean
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
if ! curl ${CURL_OPTIONS} https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal; then
|
||||
ERROR "安装 Rust 工具链失败"
|
||||
return 1
|
||||
fi
|
||||
export PATH="/root/.cargo/bin:$PATH"
|
||||
command -v cargo > /dev/null 2>&1
|
||||
}
|
||||
|
||||
# 更新 Rust 加速扩展,确保 Docker dev/release 更新源码后不会继续加载旧 wheel。
|
||||
function install_rust_accel() {
|
||||
local manifest="/app/rust/moviepilot_rust/Cargo.toml"
|
||||
if [ ! -f "${manifest}" ]; then
|
||||
WARN "未找到 Rust 扩展源码,跳过 Rust 加速扩展更新"
|
||||
return 0
|
||||
fi
|
||||
if ! ensure_rust_build_env; then
|
||||
ERROR "Rust 构建环境不可用,无法更新 Rust 加速扩展"
|
||||
return 1
|
||||
fi
|
||||
INFO "→ 正在更新 Rust 加速扩展..."
|
||||
if ! "${VENV_PATH}/bin/python" -m maturin develop --release --manifest-path "${manifest}"; then
|
||||
ERROR "Rust 加速扩展更新失败"
|
||||
return 1
|
||||
fi
|
||||
INFO "Rust 加速扩展更新成功"
|
||||
}
|
||||
|
||||
# 下载及解压
|
||||
function download_and_unzip() {
|
||||
local retries=0
|
||||
@@ -166,6 +212,9 @@ function install_backend_and_download_resources() {
|
||||
WARN "${sites_file} 下载失败,继续使用旧的资源来启动..."
|
||||
fi
|
||||
INFO "站点资源更新成功"
|
||||
if ! install_rust_accel; then
|
||||
return 1
|
||||
fi
|
||||
# 清理临时目录
|
||||
rm -rf "${TMP_PATH}"
|
||||
return 0
|
||||
|
||||
17
rust/moviepilot_rust/Cargo.lock
generated
17
rust/moviepilot_rust/Cargo.lock
generated
@@ -344,6 +344,12 @@ version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
||||
|
||||
[[package]]
|
||||
name = "memo-map"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b"
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.9.1"
|
||||
@@ -353,10 +359,21 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "minijinja"
|
||||
version = "2.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2929e494b2280e1e18959bb2e121da03347ae896896fdfaceaab43c88a02803f"
|
||||
dependencies = [
|
||||
"memo-map",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "moviepilot-rust"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"minijinja",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pyo3",
|
||||
|
||||
@@ -8,6 +8,7 @@ name = "moviepilot_rust"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
minijinja = "2.20"
|
||||
once_cell = "1.20"
|
||||
percent-encoding = "2.3"
|
||||
pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] }
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::utils::{get_optional_i64, get_optional_string, py_i64_to_usize};
|
||||
use minijinja::{context, Environment, UndefinedBehavior};
|
||||
use once_cell::sync::Lazy;
|
||||
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
|
||||
use pyo3::exceptions::PyValueError;
|
||||
@@ -6,6 +7,7 @@ use pyo3::prelude::*;
|
||||
use pyo3::types::{PyDict, PyList};
|
||||
use regex::{Regex, RegexBuilder};
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use std::collections::BTreeMap;
|
||||
use url::form_urlencoded;
|
||||
use url::Url;
|
||||
|
||||
@@ -34,13 +36,12 @@ static FILESIZE_UNIT_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
static NUMERIC_FACTOR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap());
|
||||
static FIELD_EXPR_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r#"^fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])$"#).unwrap()
|
||||
});
|
||||
static FIELD_REF_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap());
|
||||
static JINJA_EXPR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{\{-?\s*(.*?)\s*-?\}\}"#).unwrap());
|
||||
static JINJA_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{%-?\s*(.*?)\s*-?%\}"#).unwrap());
|
||||
static HAS_QUOTED_SELECTOR_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap());
|
||||
static TABLE_DIRECT_TR_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap());
|
||||
|
||||
enum RowParseResult {
|
||||
Unsupported,
|
||||
@@ -66,7 +67,7 @@ pub(crate) fn parse_indexer_torrents_fast(
|
||||
if list_selector_text.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let Ok(list_selector) = Selector::parse(&list_selector_text) else {
|
||||
let Some(list_selector) = parse_site_selector(&list_selector_text) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let document = Html::parse_document(html_text);
|
||||
@@ -386,11 +387,7 @@ fn parse_title(
|
||||
safe_query(row, &selector)?
|
||||
} else if let Some(template) = get_optional_string(&selector, "text")? {
|
||||
let values = collect_template_field_values(row, fields, &template)?;
|
||||
let refs: Vec<(&str, &str)> = values
|
||||
.iter()
|
||||
.map(|(key, value)| (key.as_str(), value.as_str()))
|
||||
.collect();
|
||||
let Some(rendered) = render_known_template(&template, &refs) else {
|
||||
let Some(rendered) = render_jinja_template(&template, &values) else {
|
||||
return Ok(false);
|
||||
};
|
||||
Some(rendered)
|
||||
@@ -418,11 +415,7 @@ fn parse_description(
|
||||
safe_query(row, &selector)?
|
||||
} else if let Some(template) = get_optional_string(&selector, "text")? {
|
||||
let values = collect_template_field_values(row, fields, &template)?;
|
||||
let refs: Vec<(&str, &str)> = values
|
||||
.iter()
|
||||
.map(|(key, value)| (key.as_str(), value.as_str()))
|
||||
.collect();
|
||||
let Some(rendered) = render_known_template(&template, &refs) else {
|
||||
let Some(rendered) = render_jinja_template(&template, &values) else {
|
||||
return Ok(false);
|
||||
};
|
||||
Some(rendered)
|
||||
@@ -441,7 +434,7 @@ fn collect_template_field_values(
|
||||
row: ElementRef<'_>,
|
||||
fields: &Bound<'_, PyDict>,
|
||||
template: &str,
|
||||
) -> PyResult<Vec<(String, String)>> {
|
||||
) -> PyResult<BTreeMap<String, String>> {
|
||||
let mut keys = Vec::new();
|
||||
for captures in FIELD_REF_RE.captures_iter(template) {
|
||||
let Some(key) = captures.get(1).or_else(|| captures.get(2)) else {
|
||||
@@ -453,14 +446,14 @@ fn collect_template_field_values(
|
||||
}
|
||||
}
|
||||
|
||||
let mut values = Vec::new();
|
||||
let mut values = BTreeMap::new();
|
||||
for key in keys {
|
||||
if let Some(field_selector) = get_field_dict(fields, &key)? {
|
||||
let value = safe_query(row, &field_selector)?.unwrap_or_default();
|
||||
values.push((key, value));
|
||||
values.insert(key, value);
|
||||
}
|
||||
}
|
||||
Ok(values)
|
||||
Ok(resolve_embedded_field_templates(values))
|
||||
}
|
||||
|
||||
/// 解析普通文本字段。
|
||||
@@ -683,6 +676,44 @@ fn get_field_dict<'py>(
|
||||
Ok(Some(value.downcast_into::<PyDict>()?))
|
||||
}
|
||||
|
||||
/// 解析站点配置选择器,并兼容 PyQuery 允许的 :has("selector") 写法。
|
||||
fn parse_site_selector(selector_text: &str) -> Option<Selector> {
|
||||
let normalized = normalize_pyquery_selector(selector_text);
|
||||
let expanded = expand_table_direct_tr_selector(&normalized);
|
||||
if let Ok(selector) = Selector::parse(&expanded) {
|
||||
return Some(selector);
|
||||
}
|
||||
if expanded != normalized {
|
||||
if let Ok(selector) = Selector::parse(&normalized) {
|
||||
return Some(selector);
|
||||
}
|
||||
}
|
||||
Selector::parse(selector_text).ok()
|
||||
}
|
||||
|
||||
/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。
|
||||
fn normalize_pyquery_selector(selector_text: &str) -> String {
|
||||
HAS_QUOTED_SELECTOR_RE
|
||||
.replace_all(selector_text, |captures: ®ex::Captures<'_>| {
|
||||
let inner = captures
|
||||
.get(1)
|
||||
.or_else(|| captures.get(2))
|
||||
.map(|item| item.as_str())
|
||||
.unwrap_or_default();
|
||||
format!(":has({inner})")
|
||||
})
|
||||
.into_owned()
|
||||
}
|
||||
|
||||
/// 为 table > tr 选择器追加 tbody 变体,适配 Rust HTML5 解析自动补 tbody 的行为。
|
||||
fn expand_table_direct_tr_selector(selector_text: &str) -> String {
|
||||
let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2");
|
||||
if expanded == selector_text {
|
||||
return selector_text.to_string();
|
||||
}
|
||||
format!("{selector_text}, {expanded}")
|
||||
}
|
||||
|
||||
/// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。
|
||||
fn safe_query(
|
||||
row: ElementRef<'_>,
|
||||
@@ -702,7 +733,7 @@ fn query_all_values(
|
||||
let Some(selector_text) = get_selector_text(selector_config)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Ok(selector) = Selector::parse(&selector_text) else {
|
||||
let Some(selector) = parse_site_selector(&selector_text) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let attribute = get_optional_string(selector_config, "attribute")?;
|
||||
@@ -729,7 +760,7 @@ fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<S
|
||||
if item.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let Ok(selector) = Selector::parse(item) else {
|
||||
let Some(selector) = parse_site_selector(item) else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
selectors.push(selector);
|
||||
@@ -848,7 +879,7 @@ fn should_skip_text_node(
|
||||
|
||||
/// 判断 row 内是否存在指定 selector。
|
||||
fn selector_exists(row: ElementRef<'_>, selector_text: &str) -> PyResult<bool> {
|
||||
let Ok(selector) = Selector::parse(selector_text) else {
|
||||
let Some(selector) = parse_site_selector(selector_text) else {
|
||||
return Ok(false);
|
||||
};
|
||||
Ok(row.select(&selector).next().is_some())
|
||||
@@ -882,290 +913,32 @@ fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> Str
|
||||
}
|
||||
}
|
||||
|
||||
/// 渲染常见的 Jinja 字段模板,不支持复杂表达式时由调用方回退 Python。
|
||||
fn render_known_template(template: &str, values: &[(&str, &str)]) -> Option<String> {
|
||||
if template.contains("{#") {
|
||||
return None;
|
||||
}
|
||||
let rendered = render_jinja_blocks(template, values)?;
|
||||
render_field_vars(&rendered, values)
|
||||
/// 使用 MiniJinja 渲染站点字段模板,语义对齐 Python jinja2 的 Template.render(fields=...)。
|
||||
fn render_jinja_template(template: &str, fields: &BTreeMap<String, String>) -> Option<String> {
|
||||
let mut env = Environment::new();
|
||||
env.set_undefined_behavior(UndefinedBehavior::Chainable);
|
||||
env.render_str(template, context! { fields => fields }).ok()
|
||||
}
|
||||
|
||||
/// 渲染站点解析配置里常见的 if/elif/else/endif 字段模板。
|
||||
fn render_jinja_blocks(template: &str, values: &[(&str, &str)]) -> Option<String> {
|
||||
let mut result = String::new();
|
||||
let mut cursor = 0;
|
||||
while let Some(tag_match) = JINJA_TAG_RE.find_at(template, cursor) {
|
||||
result.push_str(&template[cursor..tag_match.start()]);
|
||||
let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
|
||||
let tag_content = captures.get(1)?.as_str().trim();
|
||||
let Some(condition) = tag_content.strip_prefix("if ") else {
|
||||
return None;
|
||||
};
|
||||
let block_end = find_matching_endif(template, tag_match.end())?;
|
||||
let body = &template[tag_match.end()..block_end.endif_start];
|
||||
let rendered_branch = render_if_body(body, condition, values)?;
|
||||
result.push_str(&rendered_branch);
|
||||
cursor = block_end.endif_end;
|
||||
}
|
||||
result.push_str(&template[cursor..]);
|
||||
Some(result)
|
||||
}
|
||||
|
||||
/// 查找当前 if 块对应的 endif,允许内部再嵌套一层字段模板。
|
||||
fn find_matching_endif(template: &str, from: usize) -> Option<JinjaBlockEnd> {
|
||||
let mut depth = 1;
|
||||
for tag_match in JINJA_TAG_RE.find_iter(&template[from..]) {
|
||||
let absolute_start = from + tag_match.start();
|
||||
let absolute_end = from + tag_match.end();
|
||||
let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
|
||||
let tag_content = captures.get(1)?.as_str().trim();
|
||||
if tag_content.starts_with("if ") {
|
||||
depth += 1;
|
||||
} else if tag_content == "endif" {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
return Some(JinjaBlockEnd {
|
||||
endif_start: absolute_start,
|
||||
endif_end: absolute_end,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// 从 if 块中选出第一个满足条件的分支并继续渲染。
|
||||
fn render_if_body(body: &str, first_condition: &str, values: &[(&str, &str)]) -> Option<String> {
|
||||
let branches = split_if_branches(body, first_condition)?;
|
||||
for branch in branches {
|
||||
let selected = match branch.condition {
|
||||
Some(condition) => eval_field_condition(&condition, values)?,
|
||||
None => true,
|
||||
};
|
||||
if selected {
|
||||
return render_known_template(&branch.content, values);
|
||||
}
|
||||
}
|
||||
Some(String::new())
|
||||
}
|
||||
|
||||
/// 按同层级 elif/else 拆分 if 块,嵌套 if 内部的分支不会被误拆。
|
||||
fn split_if_branches(body: &str, first_condition: &str) -> Option<Vec<JinjaBranch>> {
|
||||
let mut branches = Vec::new();
|
||||
let mut depth = 0;
|
||||
let mut current_condition = Some(first_condition.trim().to_string());
|
||||
let mut branch_start = 0;
|
||||
for tag_match in JINJA_TAG_RE.find_iter(body) {
|
||||
let captures = JINJA_TAG_RE.captures(tag_match.as_str())?;
|
||||
let tag_content = captures.get(1)?.as_str().trim();
|
||||
if tag_content.starts_with("if ") {
|
||||
depth += 1;
|
||||
/// 渲染字段值中意外残留的 Jinja 模板,避免站点 title 属性里的模板文本继续进入识别链路。
|
||||
fn resolve_embedded_field_templates(values: BTreeMap<String, String>) -> BTreeMap<String, String> {
|
||||
let mut resolved = values.clone();
|
||||
for (key, value) in &values {
|
||||
if !contains_jinja_syntax(value) {
|
||||
continue;
|
||||
}
|
||||
if tag_content == "endif" {
|
||||
if depth == 0 {
|
||||
return None;
|
||||
}
|
||||
depth -= 1;
|
||||
continue;
|
||||
}
|
||||
if depth == 0 {
|
||||
if let Some(condition) = tag_content.strip_prefix("elif ") {
|
||||
branches.push(JinjaBranch {
|
||||
condition: current_condition.take(),
|
||||
content: body[branch_start..tag_match.start()].to_string(),
|
||||
});
|
||||
current_condition = Some(condition.trim().to_string());
|
||||
branch_start = tag_match.end();
|
||||
} else if tag_content == "else" {
|
||||
branches.push(JinjaBranch {
|
||||
condition: current_condition.take(),
|
||||
content: body[branch_start..tag_match.start()].to_string(),
|
||||
});
|
||||
current_condition = None;
|
||||
branch_start = tag_match.end();
|
||||
}
|
||||
let mut context_values = resolved.clone();
|
||||
context_values.insert(key.clone(), String::new());
|
||||
if let Some(rendered) = render_jinja_template(value, &context_values) {
|
||||
resolved.insert(key.clone(), rendered);
|
||||
}
|
||||
}
|
||||
branches.push(JinjaBranch {
|
||||
condition: current_condition,
|
||||
content: body[branch_start..].to_string(),
|
||||
});
|
||||
Some(branches)
|
||||
resolved
|
||||
}
|
||||
|
||||
/// 计算字段真值条件,覆盖站点模板里的 fields.xxx、not、and、or。
|
||||
fn eval_field_condition(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
|
||||
let trimmed = condition.trim();
|
||||
if trimmed.contains(" or ") {
|
||||
for part in trimmed.split(" or ") {
|
||||
if eval_field_condition(part, values)? {
|
||||
return Some(true);
|
||||
}
|
||||
}
|
||||
return Some(false);
|
||||
}
|
||||
if trimmed.contains(" and ") {
|
||||
for part in trimmed.split(" and ") {
|
||||
if !eval_field_condition(part, values)? {
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
return Some(true);
|
||||
}
|
||||
eval_field_condition_atom(trimmed, values)
|
||||
}
|
||||
|
||||
/// 计算单个字段条件,缺失字段按 Jinja Undefined 的假值处理。
|
||||
fn eval_field_condition_atom(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
|
||||
let (negated, expression) = if let Some(rest) = condition.trim().strip_prefix("not ") {
|
||||
(true, rest.trim())
|
||||
} else {
|
||||
(false, condition.trim())
|
||||
};
|
||||
let key = parse_field_key(expression)?;
|
||||
let value = get_template_value(values, &key).unwrap_or_default();
|
||||
let truthy = !value.is_empty();
|
||||
Some(if negated { !truthy } else { truthy })
|
||||
}
|
||||
|
||||
/// 替换模板中的 fields 变量,存在未知变量语法时回退 Python。
|
||||
fn render_field_vars(template: &str, values: &[(&str, &str)]) -> Option<String> {
|
||||
let mut rendered = String::new();
|
||||
let mut cursor = 0;
|
||||
for captures in JINJA_EXPR_RE.captures_iter(template) {
|
||||
let whole = captures.get(0)?;
|
||||
rendered.push_str(&template[cursor..whole.start()]);
|
||||
let expression = captures.get(1)?.as_str();
|
||||
rendered.push_str(&eval_field_output(expression, values)?);
|
||||
cursor = whole.end();
|
||||
}
|
||||
rendered.push_str(&template[cursor..]);
|
||||
if rendered.contains("{{") || rendered.contains("{%") {
|
||||
return None;
|
||||
}
|
||||
Some(rendered)
|
||||
}
|
||||
|
||||
/// 渲染输出表达式,覆盖字段变量、字段三元表达式和字符串拼接。
|
||||
fn eval_field_output(expression: &str, values: &[(&str, &str)]) -> Option<String> {
|
||||
let expression = expression.trim();
|
||||
if let Some((true_expr, condition, false_expr)) = split_inline_if(expression) {
|
||||
if eval_field_condition(condition, values)? {
|
||||
return eval_field_output(true_expr, values);
|
||||
}
|
||||
return eval_field_output(false_expr, values);
|
||||
}
|
||||
let terms = split_concat_terms(expression)?;
|
||||
if terms.len() > 1 {
|
||||
let mut rendered = String::new();
|
||||
for term in terms {
|
||||
rendered.push_str(&eval_field_atom(term, values)?);
|
||||
}
|
||||
return Some(rendered);
|
||||
}
|
||||
eval_field_atom(expression, values)
|
||||
}
|
||||
|
||||
/// 拆分 Jinja 的简单三元表达式:a if cond else b。
|
||||
fn split_inline_if(expression: &str) -> Option<(&str, &str, &str)> {
|
||||
let (true_expr, right) = expression.split_once(" if ")?;
|
||||
let (condition, false_expr) = right.split_once(" else ")?;
|
||||
Some((true_expr.trim(), condition.trim(), false_expr.trim()))
|
||||
}
|
||||
|
||||
/// 按字符串字面量边界拆分加号拼接表达式。
|
||||
fn split_concat_terms(expression: &str) -> Option<Vec<&str>> {
|
||||
let mut terms = Vec::new();
|
||||
let mut start = 0;
|
||||
let mut quote: Option<char> = None;
|
||||
for (index, ch) in expression.char_indices() {
|
||||
if let Some(current_quote) = quote {
|
||||
if ch == current_quote {
|
||||
quote = None;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if ch == '\'' || ch == '"' {
|
||||
quote = Some(ch);
|
||||
continue;
|
||||
}
|
||||
if ch == '+' {
|
||||
let term = expression[start..index].trim();
|
||||
if term.is_empty() {
|
||||
return None;
|
||||
}
|
||||
terms.push(term);
|
||||
start = index + ch.len_utf8();
|
||||
}
|
||||
}
|
||||
if quote.is_some() {
|
||||
return None;
|
||||
}
|
||||
let term = expression[start..].trim();
|
||||
if term.is_empty() {
|
||||
return None;
|
||||
}
|
||||
terms.push(term);
|
||||
Some(terms)
|
||||
}
|
||||
|
||||
/// 渲染字段或字符串字面量,其他表达式交给 Python 回退。
|
||||
fn eval_field_atom(expression: &str, values: &[(&str, &str)]) -> Option<String> {
|
||||
let expression = expression.trim();
|
||||
if let Some(value) = parse_string_literal(expression) {
|
||||
return Some(value);
|
||||
}
|
||||
let key = parse_field_key(expression)?;
|
||||
Some(
|
||||
get_template_value(values, &key)
|
||||
.unwrap_or_default()
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
/// 解析单引号或双引号字符串字面量。
|
||||
fn parse_string_literal(expression: &str) -> Option<String> {
|
||||
let mut chars = expression.chars();
|
||||
let quote = chars.next()?;
|
||||
if quote != '\'' && quote != '"' {
|
||||
return None;
|
||||
}
|
||||
if !expression.ends_with(quote) || expression.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
let inner = &expression[quote.len_utf8()..expression.len() - quote.len_utf8()];
|
||||
Some(inner.to_string())
|
||||
}
|
||||
|
||||
/// 解析 fields 变量名,拒绝函数调用和比较表达式等完整 Jinja 能力。
|
||||
fn parse_field_key(expression: &str) -> Option<String> {
|
||||
let captures = FIELD_EXPR_RE.captures(expression.trim())?;
|
||||
captures
|
||||
.get(1)
|
||||
.or_else(|| captures.get(2))
|
||||
.map(|item| item.as_str().to_string())
|
||||
}
|
||||
|
||||
/// 从模板上下文中获取字段值,缺失字段按 Jinja 的空值处理。
|
||||
fn get_template_value<'a>(values: &'a [(&str, &str)], template_key: &str) -> Option<&'a str> {
|
||||
for (field_key, value) in values {
|
||||
if *field_key == template_key {
|
||||
return Some(*value);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
struct JinjaBlockEnd {
|
||||
endif_start: usize,
|
||||
endif_end: usize,
|
||||
}
|
||||
|
||||
struct JinjaBranch {
|
||||
condition: Option<String>,
|
||||
content: String,
|
||||
/// 判断文本是否包含 Jinja 语法标记,作为字段内嵌模板的低成本预筛选。
|
||||
fn contains_jinja_syntax(value: &str) -> bool {
|
||||
value.contains("{{") || value.contains("{%") || value.contains("{#")
|
||||
}
|
||||
|
||||
/// 读取分类配置中的 ID 列表。
|
||||
|
||||
@@ -299,6 +299,66 @@ def test_rust_indexer_page_parser_renders_literal_title_template_without_default
|
||||
}]
|
||||
|
||||
|
||||
def test_rust_indexer_page_parser_supports_agsvpt_selector_and_embedded_title_template():
|
||||
"""
|
||||
Rust 普通 indexer 页面解析应兼容 AGSVPT 的 PyQuery 选择器和字段内嵌 Jinja 模板。
|
||||
"""
|
||||
spider = SiteSpider(
|
||||
indexer={
|
||||
"id": "agsvpt",
|
||||
"name": "AGSVPT",
|
||||
"domain": "https://www.agsvpt.com/",
|
||||
"search": {"paths": [{"path": "torrents.php"}]},
|
||||
"torrents": {
|
||||
"list": {"selector": 'table.torrents > tr:has("table.torrentname")'},
|
||||
"fields": {
|
||||
"title_default": {"selector": 'a[href*="details.php?id="]'},
|
||||
"title_optional": {
|
||||
"selector": 'a[title][href*="details.php?id="]',
|
||||
"attribute": "title",
|
||||
"optional": True,
|
||||
},
|
||||
"title": {
|
||||
"text": (
|
||||
"{% if fields['title_optional'] %}"
|
||||
"{{ fields['title_optional'] }}"
|
||||
"{% else %}"
|
||||
"{{ fields['title_default'] }}"
|
||||
"{% endif %}"
|
||||
)
|
||||
},
|
||||
"details": {
|
||||
"selector": 'a[href*="details.php?id="]',
|
||||
"attribute": "href",
|
||||
},
|
||||
"download": {
|
||||
"selector": 'a[href*="download.php?id="]',
|
||||
"attribute": "href",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
html = """
|
||||
<table class="torrents">
|
||||
<tr>
|
||||
<td><table class="torrentname"><tr><td>
|
||||
<a href="details.php?id=1" title="{% if fields['title_optional'] %}{% else %}Release that Witch S01 2026 1080p WEB-DL H264 AAC-HHWEB{% endif %}">Ignored</a>
|
||||
</td></tr></table></td>
|
||||
<td><a href="download.php?id=1">DL</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
"""
|
||||
|
||||
torrents = spider.parse(html)
|
||||
|
||||
assert torrents == [{
|
||||
"title": "Release that Witch S01 2026 1080p WEB-DL H264 AAC-HHWEB",
|
||||
"page_url": "https://www.agsvpt.com/details.php?id=1",
|
||||
"enclosure": "https://www.agsvpt.com/download.php?id=1",
|
||||
}]
|
||||
|
||||
|
||||
def test_rust_indexer_page_parser_renders_common_description_templates():
|
||||
"""
|
||||
Rust 普通 indexer 页面解析应兼容站点构建项目里的 description 字段模板。
|
||||
|
||||
Reference in New Issue
Block a user