mirror of
https://github.com/d0zingcat/RSSHub-python.git
synced 2026-06-10 07:26:47 +00:00
58 lines
1.7 KiB
Python
58 lines
1.7 KiB
Python
import re
|
||
import json
|
||
from bs4 import BeautifulSoup
|
||
from rsshub.utils import DEFAULT_HEADERS
|
||
from rsshub.utils import fetch
|
||
|
||
# Base URL of the Economist website; the page URL and item links in this module are built from it.
domain = 'https://www.economist.com'
|
||
|
||
def extract_text(node):
    """Return the text contained in a nested dict/list node structure.

    Rules, applied recursively:
      * a list yields the concatenation of the text of its elements;
      * a dict with a ``'data'`` key yields that value unchanged
        (``'data'`` takes precedence over ``'children'``);
      * a dict with a ``'children'`` key yields the concatenation of the
        text of those children;
      * anything else (including a dict with neither key) yields ``''``.
    """
    if isinstance(node, list):
        return ''.join(map(extract_text, node))

    # Only dicts can carry text from here on.
    if not isinstance(node, dict):
        return ''

    if 'data' in node:
        return node['data']

    if 'children' in node:
        return ''.join(map(extract_text, node['children']))

    return ''
|
||
|
||
def parse_news(gobbet):
    """Build one feed item from a single "world in brief" gobbet.

    Args:
        gobbet: The news entry as a string of HTML (``.strip()`` and a
            tag-stripping regex are applied to it).

    Returns:
        A dict with three keys:
            ``title``: the gobbet with surrounding whitespace and HTML
                tags removed.
            ``description``: the gobbet exactly as received.
            ``link``: the "world in brief" page URL carrying the first
                100 characters of the title, percent-encoded, in a
                ``from`` query parameter.
    """
    # Imported here so the block is self-contained; the module's import
    # header does not provide `quote`.
    from urllib.parse import quote

    # Drop every HTML tag so the title is plain text.
    title = re.sub(r'<[^>]+>', '', gobbet.strip())

    # The title prefix is placed in the query string (presumably to give
    # each item a distinct link). Titles contain spaces, quotes, '&', '#'
    # and non-ASCII characters, none of which may appear raw in a URL, so
    # the value is percent-encoded; safe='' also encodes '/'.
    link = f"{domain}/the-world-in-brief?from={quote(title[:100], safe='')}"

    return {
        'title': title,
        'description': gobbet,  # the body is simply reused as the description
        'link': link,
    }
|
||
|
||
def ctx(category=''):
    """Build the feed context for The Economist's "The world in brief" page.

    Fetches the page, parses the JSON embedded in its ``__NEXT_DATA__``
    script tag, and converts every entry found under
    ``props -> pageProps -> content -> gobbets`` into a feed item with
    ``parse_news``.

    Args:
        category: Not referenced by this function.

    Returns:
        A dict with the feed's ``title``, ``link``, ``description``,
        ``author`` and ``items`` (a list of ``parse_news`` results; empty
        when no gobbets are present in the page data).

    Raises:
        ValueError: If the ``__NEXT_DATA__`` script tag is missing or has
            no text content, or if that content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    url = f"{domain}/the-world-in-brief"
    html = fetch(url, headers=DEFAULT_HEADERS).get()
    soup = BeautifulSoup(html, 'html.parser')
    script_tag = soup.find('script', id="__NEXT_DATA__", type="application/json")

    if script_tag is None:
        raise ValueError("Could not find __NEXT_DATA__ script tag.")

    # `.string` is None when the tag does not hold exactly one text child
    # (e.g. an empty tag); fail with the same exception type as above
    # instead of letting json.loads raise an unrelated TypeError.
    payload = script_tag.string
    if not payload:
        raise ValueError("__NEXT_DATA__ script tag is empty.")

    # Load JSON content
    data = json.loads(payload)

    # Walk the nested path defensively: a key that is absent, null, or not
    # a dict at any level results in an empty news list rather than an
    # AttributeError.
    node = data
    for key in ('props', 'pageProps', 'content', 'gobbets'):
        node = node.get(key) if isinstance(node, dict) else None
    news_list = node or []

    # Convert each gobbet into a feed item with parse_news.
    items = [parse_news(news) for news in news_list]

    return {
        'title': 'World Brief - Economist',
        'link': url,
        'description': 'The world in brief: Catch up quickly on the global stories that matter',
        'author': 'hillerliao',
        'items': items,
    }
|