feat: support indexer templates in Rust parser

This commit is contained in:
jxxghp
2026-05-22 23:37:54 +08:00
parent f7b78721c3
commit cde267c55f
2 changed files with 517 additions and 33 deletions

View File

@@ -34,6 +34,12 @@ static FILESIZE_UNIT_RE: Lazy<Regex> = Lazy::new(|| {
.unwrap()
});
// Matches a run of digits optionally followed by a dot and more digits
// (e.g. `12`, `12.`, `12.5`); the number is capture group 1.
static NUMERIC_FACTOR_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\d+\.?\d*)").unwrap());
// Matches a whole expression of the form `fields.name` (group 1; ASCII letters,
// digits and underscores) or `fields['name']` / `fields["name"]` (group 2).
// The pattern is anchored at both ends, so anything beyond a bare field
// reference (calls, comparisons, filters) does not match.
static FIELD_EXPR_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"^fields(?:\.([A-Za-z0-9_]+)|\[\s*['"]([^'"]+)['"]\s*\])$"#).unwrap()
});
// Matches one `{{ ... }}` output expression, tolerating `-` markers next to the
// braces; group 1 is the lazily matched inner expression without surrounding
// whitespace. No dot-matches-newline flag is set, so an expression spanning
// several lines is not matched.
static JINJA_EXPR_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"\{\{-?\s*(.*?)\s*-?\}\}"#).unwrap());
// Matches one `{% ... %}` statement tag, tolerating `-` markers next to the
// delimiters; group 1 is the tag content (e.g. `if fields.x`, `else`, `endif`).
static JINJA_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\{%-?\s*(.*?)\s*-?%\}"#).unwrap());
enum RowParseResult {
Unsupported,
@@ -388,13 +394,16 @@ fn parse_title(
} else {
String::new()
};
Some(render_known_template(
let Some(rendered) = render_known_template(
&template,
&[
("title_default", title_default.as_str()),
("title_optional", title_optional.as_str()),
],
))
) else {
return Ok(false);
};
Some(rendered)
} else {
None
};
@@ -434,7 +443,10 @@ fn parse_description(
.iter()
.map(|(key, value)| (key.as_str(), value.as_str()))
.collect();
Some(render_known_template(&template, &refs))
let Some(rendered) = render_known_template(&template, &refs) else {
return Ok(false);
};
Some(rendered)
} else {
None
};
@@ -590,9 +602,6 @@ fn parse_labels_field(
output.set_item("labels", PyList::empty(py))?;
return Ok(true);
}
if selector.contains("remove")? {
return Ok(false);
}
let Some(values) = query_all_values(row, &selector)? else {
output.set_item("labels", PyList::empty(py))?;
return Ok(true);
@@ -673,9 +682,6 @@ fn safe_query(
row: ElementRef<'_>,
selector_config: &Bound<'_, PyDict>,
) -> PyResult<Option<String>> {
if selector_config.contains("remove")? {
return Ok(None);
}
let Some(values) = query_all_values(row, selector_config)? else {
return Ok(None);
};
@@ -694,17 +700,37 @@ fn query_all_values(
return Ok(None);
};
let attribute = get_optional_string(selector_config, "attribute")?;
let remove_selectors = parse_remove_selectors(selector_config)?;
let mut values = Vec::new();
for element in row.select(&selector) {
if let Some(attribute) = attribute.as_deref() {
values.push(element.value().attr(attribute).unwrap_or("").to_string());
} else {
values.push(normalize_element_text(element));
values.push(normalize_element_text(element, &remove_selectors));
}
}
Ok(Some(values))
}
/// Builds the CSS selectors listed in the `remove` entry of a field config.
///
/// The entry is read with `get_optional_string` and treated as a comma-separated
/// list; items are trimmed and blank items are skipped. When the entry yields no
/// value the result is an empty list. If any item fails to parse as a selector,
/// the whole list is discarded and an empty list is returned.
fn parse_remove_selectors(selector_config: &Bound<'_, PyDict>) -> PyResult<Vec<Selector>> {
    let remove_text = match get_optional_string(selector_config, "remove")? {
        Some(text) => text,
        None => return Ok(Vec::new()),
    };
    // Collecting into a `Result` stops at the first item that does not parse.
    let parsed: Result<Vec<Selector>, _> = remove_text
        .split(',')
        .map(str::trim)
        .filter(|item| !item.is_empty())
        .map(Selector::parse)
        .collect();
    Ok(parsed.unwrap_or_default())
}
/// 读取 selector 或 selectors 配置。
fn get_selector_text(selector_config: &Bound<'_, PyDict>) -> PyResult<Option<String>> {
if let Some(selector) = get_optional_string(selector_config, "selector")? {
@@ -729,17 +755,27 @@ fn select_indexed_value(
return None;
}
if let Ok(Some(contents)) = get_optional_i64(selector_config, "contents") {
let index = contents as usize;
if let Some(first) = values.first() {
return first.split('\n').nth(index).map(|item| item.to_string());
let lines: Vec<&str> = first.split('\n').collect();
return pick_indexed_item(&lines, contents).map(|item| item.to_string());
}
}
if let Ok(Some(index)) = get_optional_i64(selector_config, "index") {
return values.get(index as usize).cloned();
return pick_indexed_item(&values, index).cloned();
}
values.first().cloned()
}
/// Looks up `index` in `items` following Python list indexing rules.
///
/// A non-negative `index` counts from the front; a negative one counts from the
/// back (`-1` is the last item). Returns `None` whenever the resolved position
/// lies outside the slice.
///
/// All integer conversions are checked, so an index that does not fit in `usize`
/// (possible on 32-bit targets) yields `None` instead of being silently
/// truncated to some unrelated in-range position.
fn pick_indexed_item<T>(items: &[T], index: i64) -> Option<&T> {
    let position = if index < 0 {
        // Distance from the end; `unsigned_abs` also copes with `i64::MIN`.
        let from_end = <usize as std::convert::TryFrom<u64>>::try_from(index.unsigned_abs()).ok()?;
        items.len().checked_sub(from_end)?
    } else {
        <usize as std::convert::TryFrom<i64>>::try_from(index).ok()?
    };
    items.get(position)
}
/// 应用字段配置中的 filters。
fn apply_selector_filters(
py: Python<'_>,
@@ -760,13 +796,48 @@ fn apply_selector_filters(
}
/// 规范化元素文本,尽量接近 PyQuery.text() 输出。
fn normalize_element_text(element: ElementRef<'_>) -> String {
element
.text()
.map(str::trim)
.filter(|item| !item.is_empty())
.collect::<Vec<&str>>()
.join(" ")
/// Produces the visible text of `element`, ignoring anything inside sub-elements
/// that match one of `remove_selectors`.
///
/// Text nodes are taken in the order `descendants()` yields them and are
/// concatenated without any separator; whitespace is collapsed only afterwards
/// by `normalize_whitespace`.
fn normalize_element_text(element: ElementRef<'_>, remove_selectors: &[Selector]) -> String {
    let mut buffer = String::new();
    element
        .descendants()
        .filter_map(|node| node.value().as_text().map(|text| (node, text)))
        .filter(|(node, _)| {
            let parent = node.parent().and_then(ElementRef::wrap);
            !should_skip_text_node(parent, element, remove_selectors)
        })
        .for_each(|(_, text)| buffer.push_str(text));
    normalize_whitespace(&buffer)
}
/// Collapses every run of whitespace in `value` into a single space and drops
/// leading and trailing whitespace.
fn normalize_whitespace(value: &str) -> String {
    let mut collapsed = String::with_capacity(value.len());
    for word in value.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Reports whether a text node lies inside an element that must be removed.
///
/// Starting at the text node's `parent`, the ancestor chain is walked upwards.
/// The walk stops as soon as `root` is reached (`root` itself is never tested
/// against the selectors) or when the chain runs out of elements. The answer is
/// `true` if any ancestor visited before that point matches one of
/// `remove_selectors`.
fn should_skip_text_node(
    parent: Option<ElementRef<'_>>,
    root: ElementRef<'_>,
    remove_selectors: &[Selector],
) -> bool {
    std::iter::successors(parent, |current| {
        current.parent().and_then(ElementRef::wrap)
    })
    .take_while(|ancestor| *ancestor != root)
    .any(|ancestor| {
        remove_selectors
            .iter()
            .any(|selector| selector.matches(&ancestor))
    })
}
/// 判断 row 内是否存在指定 selector。
@@ -806,21 +877,285 @@ fn normalize_site_link(domain: &str, link: &str, protocol_relative: bool) -> Str
}
/// 渲染常见的 Jinja 字段模板,不支持复杂表达式时由调用方回退 Python。
fn render_known_template(template: &str, values: &[(&str, &str)]) -> String {
let mut rendered = template.to_string();
for (key, value) in values {
for pattern in [
format!("{{{{ fields.{key} }}}}"),
format!("{{{{fields.{key}}}}}"),
format!("{{{{ fields['{key}'] }}}}"),
format!("{{{{fields['{key}']}}}}"),
format!("{{{{ fields[\"{key}\"] }}}}"),
format!("{{{{fields[\"{key}\"]}}}}"),
] {
rendered = rendered.replace(&pattern, value);
/// Renders the small subset of Jinja used by field templates.
///
/// `values` maps field names to their already extracted text. `{% if %}` blocks
/// are resolved first, then `{{ ... }}` expressions are substituted.
///
/// Returns `None` when the template uses anything this renderer does not
/// reproduce faithfully, so that the caller can fall back to the Python
/// implementation:
/// * Jinja comments (`{#`);
/// * whitespace-control markers (`{%-`, `-%}`, `{{-`, `-}}`). The tag patterns
///   tolerate the `-`, but the whitespace next to the tag is never stripped
///   here, so the result would differ from what Jinja produces;
/// * any construct rejected by the block or expression renderers.
fn render_known_template(template: &str, values: &[(&str, &str)]) -> Option<String> {
    const UNSUPPORTED_MARKERS: [&str; 5] = ["{#", "{%-", "-%}", "{{-", "-}}"];
    if UNSUPPORTED_MARKERS
        .iter()
        .any(|marker| template.contains(*marker))
    {
        return None;
    }
    let rendered = render_jinja_blocks(template, values)?;
    render_field_vars(&rendered, values)
}
/// Resolves every top-level `{% if %} ... {% endif %}` block in `template`.
///
/// Text outside the blocks is copied through unchanged; each block is replaced
/// by the rendering of its selected branch. Returns `None` when a top-level tag
/// is not an `if`, when an `if` has no matching `endif`, or when the branch
/// cannot be rendered.
fn render_jinja_blocks(template: &str, values: &[(&str, &str)]) -> Option<String> {
    let mut output = String::new();
    let mut position = 0;
    loop {
        let Some(opening) = JINJA_TAG_RE.find_at(template, position) else {
            break;
        };
        // Literal text between the previous block and this tag.
        output.push_str(&template[position..opening.start()]);
        let inner = JINJA_TAG_RE
            .captures(opening.as_str())?
            .get(1)?
            .as_str()
            .trim();
        let condition = inner.strip_prefix("if ")?;
        let closing = find_matching_endif(template, opening.end())?;
        let block_body = &template[opening.end()..closing.endif_start];
        output.push_str(&render_if_body(block_body, condition, values)?);
        position = closing.endif_end;
    }
    output.push_str(&template[position..]);
    Some(output)
}
/// Locates the `endif` that closes an `if` block whose body starts at byte
/// offset `from`, taking nested `if` blocks into account.
///
/// The returned offsets are relative to the start of `template`. Returns `None`
/// when no closing `endif` exists.
fn find_matching_endif(template: &str, from: usize) -> Option<JinjaBlockEnd> {
    let tail = &template[from..];
    // Number of nested `if` blocks that are still waiting for their own `endif`.
    let mut open_nested = 0usize;
    for tag in JINJA_TAG_RE.find_iter(tail) {
        let inner = JINJA_TAG_RE
            .captures(tag.as_str())?
            .get(1)?
            .as_str()
            .trim();
        if inner.starts_with("if ") {
            open_nested += 1;
            continue;
        }
        if inner != "endif" {
            continue;
        }
        if open_nested == 0 {
            return Some(JinjaBlockEnd {
                endif_start: from + tag.start(),
                endif_end: from + tag.end(),
            });
        }
        open_nested -= 1;
    }
    None
}
/// 从 if 块中选出第一个满足条件的分支并继续渲染。
fn render_if_body(body: &str, first_condition: &str, values: &[(&str, &str)]) -> Option<String> {
let branches = split_if_branches(body, first_condition)?;
for branch in branches {
let selected = match branch.condition {
Some(condition) => eval_field_condition(&condition, values)?,
None => true,
};
if selected {
return render_known_template(&branch.content, values);
}
}
Some(String::new())
}
/// Splits the body of an `if` block into its branches at `elif` / `else` tags
/// that belong to this block; tags inside nested `if` blocks are left alone.
///
/// The first branch carries `first_condition`, `elif` branches carry their own
/// condition and an `else` branch carries `None`. Returns `None` when the body
/// contains an `endif` without a matching nested `if`.
fn split_if_branches(body: &str, first_condition: &str) -> Option<Vec<JinjaBranch>> {
    let mut collected = Vec::new();
    let mut nesting = 0usize;
    let mut pending = Some(first_condition.trim().to_string());
    let mut segment_start = 0;
    for tag in JINJA_TAG_RE.find_iter(body) {
        let inner = JINJA_TAG_RE
            .captures(tag.as_str())?
            .get(1)?
            .as_str()
            .trim();
        if inner.starts_with("if ") {
            nesting += 1;
            continue;
        }
        if inner == "endif" {
            // An `endif` at nesting level zero has no `if` to close.
            nesting = nesting.checked_sub(1)?;
            continue;
        }
        if nesting != 0 {
            continue;
        }
        let upcoming = match inner.strip_prefix("elif ") {
            Some(expr) => Some(expr.trim().to_string()),
            None if inner == "else" => None,
            None => continue,
        };
        collected.push(JinjaBranch {
            condition: pending.take(),
            content: body[segment_start..tag.start()].to_string(),
        });
        pending = upcoming;
        segment_start = tag.end();
    }
    collected.push(JinjaBranch {
        condition: pending,
        content: body[segment_start..].to_string(),
    });
    Some(collected)
}
/// Evaluates a truthiness condition built from field references combined with
/// `not`, `and` and `or`.
///
/// The condition is split on ` or ` first and on ` and ` second, so `and` binds
/// tighter than `or`. Parts are evaluated left to right and evaluation stops as
/// soon as the outcome is decided. Returns `None` if a part that had to be
/// evaluated is not a supported field condition.
fn eval_field_condition(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
    let expr = condition.trim();

    let alternatives: Vec<&str> = expr.split(" or ").collect();
    if alternatives.len() > 1 {
        for alternative in alternatives {
            if eval_field_condition(alternative, values)? {
                return Some(true);
            }
        }
        return Some(false);
    }

    let requirements: Vec<&str> = expr.split(" and ").collect();
    if requirements.len() > 1 {
        for requirement in requirements {
            if !eval_field_condition(requirement, values)? {
                return Some(false);
            }
        }
        return Some(true);
    }

    eval_field_condition_atom(expr, values)
}
/// Evaluates a single field condition, optionally prefixed with `not`.
///
/// A field counts as true when it is present in `values` with a non-empty
/// value; a missing field counts as false. Returns `None` when the expression
/// is not a plain field reference.
fn eval_field_condition_atom(condition: &str, values: &[(&str, &str)]) -> Option<bool> {
    let stripped = condition.trim();
    let (negated, expression) = match stripped.strip_prefix("not ") {
        Some(rest) => (true, rest.trim()),
        None => (false, stripped),
    };
    let key = parse_field_key(expression)?;
    let is_set = get_template_value(values, &key).map_or(false, |value| !value.is_empty());
    // `not` flips the outcome.
    Some(is_set != negated)
}
/// Substitutes every `{{ ... }}` expression in `template` with its value.
///
/// Returns `None` when an expression cannot be evaluated, or when the finished
/// output still contains `{{` or `{%`. That final check runs on the complete
/// output, so it also triggers when a substituted field value contains one of
/// those sequences.
fn render_field_vars(template: &str, values: &[(&str, &str)]) -> Option<String> {
    let mut output = String::new();
    let mut consumed = 0;
    for caps in JINJA_EXPR_RE.captures_iter(template) {
        let span = caps.get(0)?;
        output.push_str(&template[consumed..span.start()]);
        let inner = caps.get(1)?;
        let replacement = eval_field_output(inner.as_str(), values)?;
        output.push_str(&replacement);
        consumed = span.end();
    }
    output.push_str(&template[consumed..]);

    let has_leftover = ["{{", "{%"].iter().any(|marker| output.contains(*marker));
    if has_leftover {
        None
    } else {
        Some(output)
    }
}
/// Evaluates the inside of a `{{ ... }}` expression.
///
/// Supported forms are a field reference, a string literal, a `+` concatenation
/// of those, and the inline conditional `a if cond else b` whose operands may
/// again be any supported form. Returns `None` for everything else.
fn eval_field_output(expression: &str, values: &[(&str, &str)]) -> Option<String> {
    let expr = expression.trim();
    if let Some((when_true, condition, when_false)) = split_inline_if(expr) {
        let chosen = if eval_field_condition(condition, values)? {
            when_true
        } else {
            when_false
        };
        return eval_field_output(chosen, values);
    }
    let terms = split_concat_terms(expr)?;
    if terms.len() <= 1 {
        return eval_field_atom(expr, values);
    }
    // Collecting into `Option<String>` stops at the first unsupported term.
    terms
        .into_iter()
        .map(|term| eval_field_atom(term, values))
        .collect()
}
/// Splits an inline conditional `a if cond else b` into `(a, cond, b)`.
///
/// The split happens at the first ` if ` and at the first ` else ` after it; all
/// three parts are trimmed. Returns `None` when either keyword is missing.
fn split_inline_if(expression: &str) -> Option<(&str, &str, &str)> {
    const IF_KEYWORD: &str = " if ";
    const ELSE_KEYWORD: &str = " else ";
    let if_at = expression.find(IF_KEYWORD)?;
    let after_if = &expression[if_at + IF_KEYWORD.len()..];
    let else_at = after_if.find(ELSE_KEYWORD)?;
    let when_true = expression[..if_at].trim();
    let condition = after_if[..else_at].trim();
    let when_false = after_if[else_at + ELSE_KEYWORD.len()..].trim();
    Some((when_true, condition, when_false))
}
/// Splits a `+` concatenation into its trimmed operands, leaving `+` characters
/// inside single- or double-quoted literals untouched.
///
/// Returns `None` when an operand is empty (including an empty expression) or
/// when a quote is left unterminated.
fn split_concat_terms(expression: &str) -> Option<Vec<&str>> {
    let mut pieces = Vec::new();
    let mut piece_start = 0;
    let mut open_quote: Option<char> = None;
    for (pos, current) in expression.char_indices() {
        match (open_quote, current) {
            // Inside a literal only the matching quote is significant.
            (Some(opened_with), _) => {
                if current == opened_with {
                    open_quote = None;
                }
            }
            (None, '\'') | (None, '"') => open_quote = Some(current),
            (None, '+') => {
                let piece = expression[piece_start..pos].trim();
                if piece.is_empty() {
                    return None;
                }
                pieces.push(piece);
                piece_start = pos + current.len_utf8();
            }
            (None, _) => {}
        }
    }
    if open_quote.is_some() {
        return None;
    }
    let tail = expression[piece_start..].trim();
    if tail.is_empty() {
        return None;
    }
    pieces.push(tail);
    Some(pieces)
}
/// Evaluates a single operand: either a quoted string literal or a field
/// reference.
///
/// A field that is missing from `values` evaluates to an empty string. Returns
/// `None` when the operand is neither a literal nor a field reference.
fn eval_field_atom(expression: &str, values: &[(&str, &str)]) -> Option<String> {
    let expr = expression.trim();
    parse_string_literal(expr).or_else(|| {
        let key = parse_field_key(expr)?;
        let value = get_template_value(values, &key).unwrap_or_default();
        Some(value.to_string())
    })
}
/// Parses a plain single- or double-quoted string literal and returns its
/// content.
///
/// Returns `None` when `expression` is not exactly one literal:
/// * it does not both start and end with the same quote character;
/// * the content contains that quote character again. Something like
///   `'a' ~ fields.x ~ 'b'` starts and ends with a quote but is an operator
///   expression, not a literal;
/// * the content contains a backslash. Escape sequences are not interpreted
///   here, so emitting them verbatim would differ from the template engine.
///
/// In all those cases the caller is expected to fall back to the full template
/// engine.
fn parse_string_literal(expression: &str) -> Option<String> {
    let quote = expression
        .chars()
        .next()
        .filter(|&first| first == '\'' || first == '"')?;
    // Stripping prefix and suffix in sequence also rejects a lone quote.
    let inner = expression.strip_prefix(quote)?.strip_suffix(quote)?;
    if inner.contains(quote) || inner.contains('\\') {
        return None;
    }
    Some(inner.to_string())
}
/// Extracts the field name from `fields.name`, `fields['name']` or
/// `fields["name"]`.
///
/// Surrounding whitespace is ignored. Returns `None` for anything that is not a
/// bare field reference, such as function calls or comparisons.
fn parse_field_key(expression: &str) -> Option<String> {
    let caps = FIELD_EXPR_RE.captures(expression.trim())?;
    // Group 1 holds the dotted form, group 2 the bracketed form.
    let name = match caps.get(1) {
        Some(dotted) => dotted,
        None => caps.get(2)?,
    };
    Some(name.as_str().to_string())
}
/// Looks up `template_key` in the template context and returns the value of the
/// first entry with that name, or `None` when no entry matches.
fn get_template_value<'a>(values: &'a [(&str, &str)], template_key: &str) -> Option<&'a str> {
    values
        .iter()
        .find(|(name, _)| *name == template_key)
        .map(|(_, value)| *value)
}
// Position of the `endif` tag that closes an `if` block, expressed as byte
// offsets into the template that was searched.
struct JinjaBlockEnd {
// Offset of the first byte of the `endif` tag; the block body ends here.
endif_start: usize,
// Offset just past the `endif` tag; rendering resumes here.
endif_end: usize,
}
// One branch of an `if` / `elif` / `else` block.
struct JinjaBranch {
// Condition text of an `if` or `elif` branch; `None` for an `else` branch.
condition: Option<String>,
// Unrendered template text belonging to the branch.
content: String,
}
/// 读取分类配置中的 ID 列表。

View File

@@ -196,3 +196,152 @@ def test_rust_indexer_page_parser_handles_common_fields():
"hit_and_run": True,
"category": MediaType.MOVIE.value,
}]
def test_rust_indexer_page_parser_renders_common_title_template():
    """
    The Rust parser for regular indexer pages should handle the title_optional
    template used by the site-building project.
    """
    title_template = (
        "{% if fields['title_optional'] %}"
        "{{ fields['title_optional'] }}"
        "{% else %}"
        "{{ fields['title_default'] }}"
        "{% endif %}"
    )
    field_rules = {
        "title_default": {"selector": "a.title"},
        "title_optional": {
            "selector": "a.title",
            "attribute": "title",
            "optional": True,
        },
        "title": {"text": title_template},
        "download": {"selector": "a.dl", "attribute": "href"},
    }
    site_config = {
        "id": "demo",
        "name": "Demo",
        "domain": "https://example.org/",
        "search": {"paths": [{"path": "torrents.php"}]},
        "torrents": {
            "list": {"selector": "tr.torrent"},
            "fields": field_rules,
        },
    }
    spider = SiteSpider(indexer=site_config)
    # The second row has an empty title attribute, so the template must fall
    # back to the link text.
    page = """
<table>
<tr class="torrent">
<td><a class="title" title="Optional Name" href="/details/1">Default Name</a></td>
<td><a class="dl" href="/download/1">DL</a></td>
</tr>
<tr class="torrent">
<td><a class="title" title="" href="/details/2">Default Fallback</a></td>
<td><a class="dl" href="/download/2">DL</a></td>
</tr>
</table>
"""
    results = spider.parse(page)
    titles = [entry["title"] for entry in results]
    assert titles == ["Optional Name", "Default Fallback"]
def test_rust_indexer_page_parser_renders_common_description_templates():
    """
    The Rust parser for regular indexer pages should handle the description
    field templates used by the site-building project.
    """
    description_template = (
        "{% if fields['tags']%}"
        "{{ fields['subject']+' '+fields['tags'] }}"
        "{% else %}"
        "{{ fields['subject'] }}"
        "{% endif %}"
    )
    field_rules = {
        "title": {"selector": "a.title"},
        "subject": {"selector": ".subject"},
        "tags": {"selector": ".tags"},
        "description": {"text": description_template},
        "download": {"selector": "a.dl", "attribute": "href"},
    }
    site_config = {
        "id": "demo",
        "name": "Demo",
        "domain": "https://example.org/",
        "search": {"paths": [{"path": "torrents.php"}]},
        "torrents": {
            "list": {"selector": "tr.torrent"},
            "fields": field_rules,
        },
    }
    spider = SiteSpider(indexer=site_config)
    # The second row has empty tags, so only the subject should be rendered.
    page = """
<table>
<tr class="torrent">
<td><a class="title">Movie 2024</a><span class="subject">BluRay</span><span class="tags">HDR</span></td>
<td><a class="dl" href="/download/1">DL</a></td>
</tr>
<tr class="torrent">
<td><a class="title">Show 2025</a><span class="subject">WEB-DL</span><span class="tags"></span></td>
<td><a class="dl" href="/download/2">DL</a></td>
</tr>
</table>
"""
    results = spider.parse(page)
    descriptions = [entry["description"] for entry in results]
    assert descriptions == ["BluRay HDR", "WEB-DL"]
def test_rust_indexer_page_parser_supports_remove_and_negative_index():
    """
    The Rust parser for regular indexer pages should handle the `remove` option
    and negative indexes commonly used in site configs.
    """
    field_rules = {
        "title": {"selector": ".name", "remove": "a,b"},
        "description": {
            "selector": ".desc",
            "remove": "span,a,img,font,b",
            "contents": -1,
        },
        "labels": {
            "selector": ".labels > span",
            "remove": "span,a,img,font,b",
            "contents": -1,
        },
        "download": {"selector": "a.dl", "attribute": "href"},
    }
    site_config = {
        "id": "demo",
        "name": "Demo",
        "domain": "https://example.org/",
        "search": {"paths": [{"path": "torrents.php"}]},
        "torrents": {
            "list": {"selector": "tr.torrent"},
            "fields": field_rules,
        },
    }
    spider = SiteSpider(indexer=site_config)
    page = """
<table>
<tr class="torrent">
<td class="name">Movie<a>删掉</a><b>也删</b>2024</td>
<td class="desc">第一行
<span>标签</span><a>链接</a>
第二行
</td>
<td class="labels"><span><i>DIY</i></span><span><i>HDR</i></span></td>
<td><a class="dl" href="/download/1">DL</a></td>
</tr>
</table>
"""
    results = spider.parse(page)
    first_row = results[0]
    assert first_row["title"] == "Movie2024"
    assert first_row["description"] == "第一行 第二行"
    assert first_row["labels"] == ["DIY", "HDR"]