diff --git a/docker/update.sh b/docker/update.sh index 359163be..0113dc4e 100644 --- a/docker/update.sh +++ b/docker/update.sh @@ -24,6 +24,52 @@ function WARN() { VENV_PATH="${VENV_PATH:-/opt/venv}" export PATH="${VENV_PATH}/bin:$PATH" +# 按需准备 Rust 构建环境,避免把工具链常驻打进 Docker runtime 镜像。 +function ensure_rust_build_env() { + export PATH="/root/.cargo/bin:$PATH" + if command -v cargo > /dev/null 2>&1; then + return 0 + fi + INFO "→ 当前镜像未包含 cargo,正在按需准备 Rust 构建环境..." + if command -v apt-get > /dev/null 2>&1; then + if ! apt-get update; then + ERROR "更新 apt 索引失败,无法安装 Rust 构建依赖" + return 1 + fi + if ! apt-get install -y --no-install-recommends build-essential curl ca-certificates; then + ERROR "安装 Rust 构建依赖失败" + return 1 + fi + apt-get clean + rm -rf /var/lib/apt/lists/* + fi + if ! curl ${CURL_OPTIONS} https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal; then + ERROR "安装 Rust 工具链失败" + return 1 + fi + export PATH="/root/.cargo/bin:$PATH" + command -v cargo > /dev/null 2>&1 +} + +# 更新 Rust 加速扩展,确保 Docker dev/release 更新源码后不会继续加载旧 wheel。 +function install_rust_accel() { + local manifest="/app/rust/moviepilot_rust/Cargo.toml" + if [ ! -f "${manifest}" ]; then + WARN "未找到 Rust 扩展源码,跳过 Rust 加速扩展更新" + return 0 + fi + if ! ensure_rust_build_env; then + ERROR "Rust 构建环境不可用,无法更新 Rust 加速扩展" + return 1 + fi + INFO "→ 正在更新 Rust 加速扩展..." + if ! "${VENV_PATH}/bin/python" -m maturin develop --release --manifest-path "${manifest}"; then + ERROR "Rust 加速扩展更新失败" + return 1 + fi + INFO "Rust 加速扩展更新成功" +} + # 下载及解压 function download_and_unzip() { local retries=0 @@ -166,6 +212,9 @@ function install_backend_and_download_resources() { WARN "${sites_file} 下载失败,继续使用旧的资源来启动..." fi INFO "站点资源更新成功" + if ! install_rust_accel; then + return 1 + fi # 清理临时目录 rm -rf "${TMP_PATH}" return 0 diff --git a/rust/moviepilot_rust/Cargo.lock b/rust/moviepilot_rust/Cargo.lock index e060060a..042157d0 100644 --- a/rust/moviepilot_rust/Cargo.lock +++ b/rust/moviepilot_rust/Cargo.lock @@ -344,6 +344,12 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memo-map" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" + [[package]] name = "memoffset" version = "0.9.1" @@ -353,10 +359,21 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minijinja" +version = "2.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2929e494b2280e1e18959bb2e121da03347ae896896fdfaceaab43c88a02803f" +dependencies = [ + "memo-map", + "serde", +] + [[package]] name = "moviepilot-rust" version = "0.1.0" dependencies = [ + "minijinja", "once_cell", "percent-encoding", "pyo3", diff --git a/rust/moviepilot_rust/Cargo.toml b/rust/moviepilot_rust/Cargo.toml index c9a8d95c..2b74ff47 100644 --- a/rust/moviepilot_rust/Cargo.toml +++ b/rust/moviepilot_rust/Cargo.toml @@ -8,6 +8,7 @@ name = "moviepilot_rust" crate-type = ["cdylib"] [dependencies] +minijinja = "2.20" once_cell = "1.20" percent-encoding = "2.3" pyo3 = { version = "0.23", features = ["abi3-py311", "extension-module"] } diff --git a/rust/moviepilot_rust/src/indexer.rs b/rust/moviepilot_rust/src/indexer.rs index 4dd8a5ce..3f4d0009 100644 --- a/rust/moviepilot_rust/src/indexer.rs +++ b/rust/moviepilot_rust/src/indexer.rs @@ -1,4 +1,5 @@ use crate::utils::{get_optional_i64, get_optional_string, py_i64_to_usize}; +use minijinja::{context, Environment, UndefinedBehavior}; use once_cell::sync::Lazy; use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; use pyo3::exceptions::PyValueError; @@ -6,6 +7,7 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; use regex::{Regex, RegexBuilder}; use scraper::{ElementRef, Html, Selector}; +use std::collections::BTreeMap; use url::form_urlencoded; use url::Url; @@ -34,13 +36,12 @@ static FILESIZE_UNIT_RE: Lazy = Lazy::new(|| { .unwrap() }); static NUMERIC_FACTOR_RE: Lazy = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap()); -static FIELD_EXPR_RE: Lazy = Lazy::new(|| { - Regex::new(r#"^fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])$"#).unwrap() -}); static FIELD_REF_RE: Lazy = Lazy::new(|| Regex::new(r#"fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])"#).unwrap()); -static JINJA_EXPR_RE: Lazy = Lazy::new(|| Regex::new(r#"\{\{-?\s*(.*?)\s*-?\}\}"#).unwrap()); -static JINJA_TAG_RE: Lazy = Lazy::new(|| Regex::new(r#"\{%-?\s*(.*?)\s*-?%\}"#).unwrap()); +static HAS_QUOTED_SELECTOR_RE: Lazy = + Lazy::new(|| Regex::new(r#":has\(\s*"([^"]+)"\s*\)|:has\(\s*'([^']+)'\s*\)"#).unwrap()); +static TABLE_DIRECT_TR_RE: Lazy = + Lazy::new(|| Regex::new(r#"\b(table[^>,]*?)\s*>\s*(tr(?:[^\s>,]*)?)"#).unwrap()); enum RowParseResult { Unsupported, @@ -66,7 +67,7 @@ pub(crate) fn parse_indexer_torrents_fast( if list_selector_text.is_empty() { return Ok(None); } - let Ok(list_selector) = Selector::parse(&list_selector_text) else { + let Some(list_selector) = parse_site_selector(&list_selector_text) else { return Ok(None); }; let document = Html::parse_document(html_text); @@ -386,11 +387,7 @@ fn parse_title( safe_query(row, &selector)? } else if let Some(template) = get_optional_string(&selector, "text")? { let values = collect_template_field_values(row, fields, &template)?; - let refs: Vec<(&str, &str)> = values - .iter() - .map(|(key, value)| (key.as_str(), value.as_str())) - .collect(); - let Some(rendered) = render_known_template(&template, &refs) else { + let Some(rendered) = render_jinja_template(&template, &values) else { return Ok(false); }; Some(rendered) @@ -418,11 +415,7 @@ fn parse_description( safe_query(row, &selector)? } else if let Some(template) = get_optional_string(&selector, "text")? { let values = collect_template_field_values(row, fields, &template)?; - let refs: Vec<(&str, &str)> = values - .iter() - .map(|(key, value)| (key.as_str(), value.as_str())) - .collect(); - let Some(rendered) = render_known_template(&template, &refs) else { + let Some(rendered) = render_jinja_template(&template, &values) else { return Ok(false); }; Some(rendered) @@ -441,7 +434,7 @@ fn collect_template_field_values( row: ElementRef<'_>, fields: &Bound<'_, PyDict>, template: &str, -) -> PyResult> { +) -> PyResult> { let mut keys = Vec::new(); for captures in FIELD_REF_RE.captures_iter(template) { let Some(key) = captures.get(1).or_else(|| captures.get(2)) else { @@ -453,14 +446,14 @@ fn collect_template_field_values( } } - let mut values = Vec::new(); + let mut values = BTreeMap::new(); for key in keys { if let Some(field_selector) = get_field_dict(fields, &key)? { let value = safe_query(row, &field_selector)?.unwrap_or_default(); - values.push((key, value)); + values.insert(key, value); } } - Ok(values) + Ok(resolve_embedded_field_templates(values)) } /// 解析普通文本字段。 @@ -683,6 +676,44 @@ fn get_field_dict<'py>( Ok(Some(value.downcast_into::()?)) } +/// 解析站点配置选择器,并兼容 PyQuery 允许的 :has("selector") 写法。 +fn parse_site_selector(selector_text: &str) -> Option { + let normalized = normalize_pyquery_selector(selector_text); + let expanded = expand_table_direct_tr_selector(&normalized); + if let Ok(selector) = Selector::parse(&expanded) { + return Some(selector); + } + if expanded != normalized { + if let Ok(selector) = Selector::parse(&normalized) { + return Some(selector); + } + } + Selector::parse(selector_text).ok() +} + +/// 将 PyQuery 扩展选择器转换为 scraper 可识别的 CSS selector 形式。 +fn normalize_pyquery_selector(selector_text: &str) -> String { + HAS_QUOTED_SELECTOR_RE + .replace_all(selector_text, |captures: ®ex::Captures<'_>| { + let inner = captures + .get(1) + .or_else(|| captures.get(2)) + .map(|item| item.as_str()) + .unwrap_or_default(); + format!(":has({inner})") + }) + .into_owned() +} + +/// 为 table > tr 选择器追加 tbody 变体,适配 Rust HTML5 解析自动补 tbody 的行为。 +fn expand_table_direct_tr_selector(selector_text: &str) -> String { + let expanded = TABLE_DIRECT_TR_RE.replace_all(selector_text, "$1 > tbody > $2"); + if expanded == selector_text { + return selector_text.to_string(); + } + format!("{selector_text}, {expanded}") +} + /// 执行 selector 查询并返回第一个符合 index/contents 规则的文本。 fn safe_query( row: ElementRef<'_>, @@ -702,7 +733,7 @@ fn query_all_values( let Some(selector_text) = get_selector_text(selector_config)? else { return Ok(None); }; - let Ok(selector) = Selector::parse(&selector_text) else { + let Some(selector) = parse_site_selector(&selector_text) else { return Ok(None); }; let attribute = get_optional_string(selector_config, "attribute")?; @@ -729,7 +760,7 @@ fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult, selector_text: &str) -> PyResult { - let Ok(selector) = Selector::parse(selector_text) else { + let Some(selector) = parse_site_selector(selector_text) else { return Ok(false); }; Ok(row.select(&selector).next().is_some()) @@ -882,290 +913,32 @@ fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> Str } } -/// 渲染常见的 Jinja 字段模板,不支持复杂表达式时由调用方回退 Python。 -fn render_known_template(template: &str, values: &[(&str, &str)]) -> Option { - if template.contains("{#") { - return None; - } - let rendered = render_jinja_blocks(template, values)?; - render_field_vars(&rendered, values) +/// 使用 MiniJinja 渲染站点字段模板,语义对齐 Python jinja2 的 Template.render(fields=...)。 +fn render_jinja_template(template: &str, fields: &BTreeMap) -> Option { + let mut env = Environment::new(); + env.set_undefined_behavior(UndefinedBehavior::Chainable); + env.render_str(template, context! { fields => fields }).ok() } -/// 渲染站点解析配置里常见的 if/elif/else/endif 字段模板。 -fn render_jinja_blocks(template: &str, values: &[(&str, &str)]) -> Option { - let mut result = String::new(); - let mut cursor = 0; - while let Some(tag_match) = JINJA_TAG_RE.find_at(template, cursor) { - result.push_str(&template[cursor..tag_match.start()]); - let captures = JINJA_TAG_RE.captures(tag_match.as_str())?; - let tag_content = captures.get(1)?.as_str().trim(); - let Some(condition) = tag_content.strip_prefix("if ") else { - return None; - }; - let block_end = find_matching_endif(template, tag_match.end())?; - let body = &template[tag_match.end()..block_end.endif_start]; - let rendered_branch = render_if_body(body, condition, values)?; - result.push_str(&rendered_branch); - cursor = block_end.endif_end; - } - result.push_str(&template[cursor..]); - Some(result) -} - -/// 查找当前 if 块对应的 endif,允许内部再嵌套一层字段模板。 -fn find_matching_endif(template: &str, from: usize) -> Option { - let mut depth = 1; - for tag_match in JINJA_TAG_RE.find_iter(&template[from..]) { - let absolute_start = from + tag_match.start(); - let absolute_end = from + tag_match.end(); - let captures = JINJA_TAG_RE.captures(tag_match.as_str())?; - let tag_content = captures.get(1)?.as_str().trim(); - if tag_content.starts_with("if ") { - depth += 1; - } else if tag_content == "endif" { - depth -= 1; - if depth == 0 { - return Some(JinjaBlockEnd { - endif_start: absolute_start, - endif_end: absolute_end, - }); - } - } - } - None -} - -/// 从 if 块中选出第一个满足条件的分支并继续渲染。 -fn render_if_body(body: &str, first_condition: &str, values: &[(&str, &str)]) -> Option { - let branches = split_if_branches(body, first_condition)?; - for branch in branches { - let selected = match branch.condition { - Some(condition) => eval_field_condition(&condition, values)?, - None => true, - }; - if selected { - return render_known_template(&branch.content, values); - } - } - Some(String::new()) -} - -/// 按同层级 elif/else 拆分 if 块,嵌套 if 内部的分支不会被误拆。 -fn split_if_branches(body: &str, first_condition: &str) -> Option> { - let mut branches = Vec::new(); - let mut depth = 0; - let mut current_condition = Some(first_condition.trim().to_string()); - let mut branch_start = 0; - for tag_match in JINJA_TAG_RE.find_iter(body) { - let captures = JINJA_TAG_RE.captures(tag_match.as_str())?; - let tag_content = captures.get(1)?.as_str().trim(); - if tag_content.starts_with("if ") { - depth += 1; +/// 渲染字段值中意外残留的 Jinja 模板,避免站点 title 属性里的模板文本继续进入识别链路。 +fn resolve_embedded_field_templates(values: BTreeMap) -> BTreeMap { + let mut resolved = values.clone(); + for (key, value) in &values { + if !contains_jinja_syntax(value) { continue; } - if tag_content == "endif" { - if depth == 0 { - return None; - } - depth -= 1; - continue; - } - if depth == 0 { - if let Some(condition) = tag_content.strip_prefix("elif ") { - branches.push(JinjaBranch { - condition: current_condition.take(), - content: body[branch_start..tag_match.start()].to_string(), - }); - current_condition = Some(condition.trim().to_string()); - branch_start = tag_match.end(); - } else if tag_content == "else" { - branches.push(JinjaBranch { - condition: current_condition.take(), - content: body[branch_start..tag_match.start()].to_string(), - }); - current_condition = None; - branch_start = tag_match.end(); - } + let mut context_values = resolved.clone(); + context_values.insert(key.clone(), String::new()); + if let Some(rendered) = render_jinja_template(value, &context_values) { + resolved.insert(key.clone(), rendered); } } - branches.push(JinjaBranch { - condition: current_condition, - content: body[branch_start..].to_string(), - }); - Some(branches) + resolved } -/// 计算字段真值条件,覆盖站点模板里的 fields.xxx、not、and、or。 -fn eval_field_condition(condition: &str, values: &[(&str, &str)]) -> Option { - let trimmed = condition.trim(); - if trimmed.contains(" or ") { - for part in trimmed.split(" or ") { - if eval_field_condition(part, values)? { - return Some(true); - } - } - return Some(false); - } - if trimmed.contains(" and ") { - for part in trimmed.split(" and ") { - if !eval_field_condition(part, values)? { - return Some(false); - } - } - return Some(true); - } - eval_field_condition_atom(trimmed, values) -} - -/// 计算单个字段条件,缺失字段按 Jinja Undefined 的假值处理。 -fn eval_field_condition_atom(condition: &str, values: &[(&str, &str)]) -> Option { - let (negated, expression) = if let Some(rest) = condition.trim().strip_prefix("not ") { - (true, rest.trim()) - } else { - (false, condition.trim()) - }; - let key = parse_field_key(expression)?; - let value = get_template_value(values, &key).unwrap_or_default(); - let truthy = !value.is_empty(); - Some(if negated { !truthy } else { truthy }) -} - -/// 替换模板中的 fields 变量,存在未知变量语法时回退 Python。 -fn render_field_vars(template: &str, values: &[(&str, &str)]) -> Option { - let mut rendered = String::new(); - let mut cursor = 0; - for captures in JINJA_EXPR_RE.captures_iter(template) { - let whole = captures.get(0)?; - rendered.push_str(&template[cursor..whole.start()]); - let expression = captures.get(1)?.as_str(); - rendered.push_str(&eval_field_output(expression, values)?); - cursor = whole.end(); - } - rendered.push_str(&template[cursor..]); - if rendered.contains("{{") || rendered.contains("{%") { - return None; - } - Some(rendered) -} - -/// 渲染输出表达式,覆盖字段变量、字段三元表达式和字符串拼接。 -fn eval_field_output(expression: &str, values: &[(&str, &str)]) -> Option { - let expression = expression.trim(); - if let Some((true_expr, condition, false_expr)) = split_inline_if(expression) { - if eval_field_condition(condition, values)? { - return eval_field_output(true_expr, values); - } - return eval_field_output(false_expr, values); - } - let terms = split_concat_terms(expression)?; - if terms.len() > 1 { - let mut rendered = String::new(); - for term in terms { - rendered.push_str(&eval_field_atom(term, values)?); - } - return Some(rendered); - } - eval_field_atom(expression, values) -} - -/// 拆分 Jinja 的简单三元表达式:a if cond else b。 -fn split_inline_if(expression: &str) -> Option<(&str, &str, &str)> { - let (true_expr, right) = expression.split_once(" if ")?; - let (condition, false_expr) = right.split_once(" else ")?; - Some((true_expr.trim(), condition.trim(), false_expr.trim())) -} - -/// 按字符串字面量边界拆分加号拼接表达式。 -fn split_concat_terms(expression: &str) -> Option> { - let mut terms = Vec::new(); - let mut start = 0; - let mut quote: Option = None; - for (index, ch) in expression.char_indices() { - if let Some(current_quote) = quote { - if ch == current_quote { - quote = None; - } - continue; - } - if ch == '\'' || ch == '"' { - quote = Some(ch); - continue; - } - if ch == '+' { - let term = expression[start..index].trim(); - if term.is_empty() { - return None; - } - terms.push(term); - start = index + ch.len_utf8(); - } - } - if quote.is_some() { - return None; - } - let term = expression[start..].trim(); - if term.is_empty() { - return None; - } - terms.push(term); - Some(terms) -} - -/// 渲染字段或字符串字面量,其他表达式交给 Python 回退。 -fn eval_field_atom(expression: &str, values: &[(&str, &str)]) -> Option { - let expression = expression.trim(); - if let Some(value) = parse_string_literal(expression) { - return Some(value); - } - let key = parse_field_key(expression)?; - Some( - get_template_value(values, &key) - .unwrap_or_default() - .to_string(), - ) -} - -/// 解析单引号或双引号字符串字面量。 -fn parse_string_literal(expression: &str) -> Option { - let mut chars = expression.chars(); - let quote = chars.next()?; - if quote != '\'' && quote != '"' { - return None; - } - if !expression.ends_with(quote) || expression.len() < 2 { - return None; - } - let inner = &expression[quote.len_utf8()..expression.len() - quote.len_utf8()]; - Some(inner.to_string()) -} - -/// 解析 fields 变量名,拒绝函数调用和比较表达式等完整 Jinja 能力。 -fn parse_field_key(expression: &str) -> Option { - let captures = FIELD_EXPR_RE.captures(expression.trim())?; - captures - .get(1) - .or_else(|| captures.get(2)) - .map(|item| item.as_str().to_string()) -} - -/// 从模板上下文中获取字段值,缺失字段按 Jinja 的空值处理。 -fn get_template_value<'a>(values: &'a [(&str, &str)], template_key: &str) -> Option<&'a str> { - for (field_key, value) in values { - if *field_key == template_key { - return Some(*value); - } - } - None -} - -struct JinjaBlockEnd { - endif_start: usize, - endif_end: usize, -} - -struct JinjaBranch { - condition: Option, - content: String, +/// 判断文本是否包含 Jinja 语法标记,作为字段内嵌模板的低成本预筛选。 +fn contains_jinja_syntax(value: &str) -> bool { + value.contains("{{") || value.contains("{%") || value.contains("{#") } /// 读取分类配置中的 ID 列表。 diff --git a/tests/test_rust_accel.py b/tests/test_rust_accel.py index 87a3be78..972967f4 100644 --- a/tests/test_rust_accel.py +++ b/tests/test_rust_accel.py @@ -299,6 +299,66 @@ def test_rust_indexer_page_parser_renders_literal_title_template_without_default }] +def test_rust_indexer_page_parser_supports_agsvpt_selector_and_embedded_title_template(): + """ + Rust 普通 indexer 页面解析应兼容 AGSVPT 的 PyQuery 选择器和字段内嵌 Jinja 模板。 + """ + spider = SiteSpider( + indexer={ + "id": "agsvpt", + "name": "AGSVPT", + "domain": "https://www.agsvpt.com/", + "search": {"paths": [{"path": "torrents.php"}]}, + "torrents": { + "list": {"selector": 'table.torrents > tr:has("table.torrentname")'}, + "fields": { + "title_default": {"selector": 'a[href*="details.php?id="]'}, + "title_optional": { + "selector": 'a[title][href*="details.php?id="]', + "attribute": "title", + "optional": True, + }, + "title": { + "text": ( + "{% if fields['title_optional'] %}" + "{{ fields['title_optional'] }}" + "{% else %}" + "{{ fields['title_default'] }}" + "{% endif %}" + ) + }, + "details": { + "selector": 'a[href*="details.php?id="]', + "attribute": "href", + }, + "download": { + "selector": 'a[href*="download.php?id="]', + "attribute": "href", + }, + }, + }, + }, + ) + html = """ + + + + + +
+ Ignored +
DL
+ """ + + torrents = spider.parse(html) + + assert torrents == [{ + "title": "Release that Witch S01 2026 1080p WEB-DL H264 AAC-HHWEB", + "page_url": "https://www.agsvpt.com/details.php?id=1", + "enclosure": "https://www.agsvpt.com/download.php?id=1", + }] + + def test_rust_indexer_page_parser_renders_common_description_templates(): """ Rust 普通 indexer 页面解析应兼容站点构建项目里的 description 字段模板。