mirror of
https://github.com/d0zingcat/RSSHub-python.git
synced 2026-06-12 15:10:49 +00:00
get economist's world in brief from homepage
This commit is contained in:
@@ -1,27 +1,62 @@
|
|||||||
import json
import re
from urllib.parse import quote

from bs4 import BeautifulSoup

from rsshub.utils import DEFAULT_HEADERS
from rsshub.utils import fetch
|
||||||
|
|
||||||
domain = 'https://www.economist.com'
|
domain = 'https://www.economist.com'
|
||||||
|
|
||||||
def extract_text(node):
    """Return the concatenated text contained in a parsed-HTML JSON node.

    The node is walked recursively:

    * a dict carrying a ``data`` key is a leaf and contributes that value;
    * any other dict contributes the text of its ``children`` value;
    * a list contributes the text of each element, in order;
    * anything else (``None``, numbers, bare strings, ...) contributes
      nothing.

    Args:
        node: A dict, a list, or any other value taken from the decoded JSON.

    Returns:
        The collected text as a ``str``; ``''`` when nothing is found.
    """
    if isinstance(node, dict):
        if 'data' in node:
            data = node['data']
            # A JSON ``null`` (or a non-string value) would make the
            # ``''.join`` in the caller raise, so always hand back a str.
            return '' if data is None else str(data)
        # A missing or null ``children`` falls through to the final
        # ``return ''`` of the recursive call.
        return extract_text(node.get('children'))
    if isinstance(node, list):
        return ''.join(extract_text(child) for child in node)
    return ''
|
||||||
|
|
||||||
|
def parse_news(gobbet):
    """Build one feed item from the text of a single news paragraph.

    Args:
        gobbet: Plain text of one paragraph (a ``str``).

    Returns:
        A dict with three keys: ``title`` (the stripped text),
        ``description`` (the same text) and ``link`` (the
        ``/the-world-in-brief`` page URL with a ``from`` query value made of
        the first 30 characters of the title).
    """
    title = gobbet.strip()
    # The title prefix goes into the query string, so it must be
    # percent-encoded: raw spaces, '&', '#' or non-ASCII characters would
    # produce an invalid or truncated URL.
    link_key = quote(title[:30], safe='')
    item = {
        'title': title,
        'description': title,  # the body is simply the paragraph text
        'link': f"{domain}/the-world-in-brief?from={link_key}",
    }
    return item
|
||||||
|
|
||||||
def ctx(category=''):
    """Build the "World in Brief" feed from the Economist homepage.

    The homepage HTML is fetched and the JSON inside its
    ``<script id="__NEXT_DATA__">`` tag is decoded. The node list found at
    ``props.pageProps.worldInBrief.text`` is scanned, and every ``p`` tag
    node with non-blank text becomes one feed item via ``parse_news``.

    Args:
        category: Not used by this function.

    Returns:
        A dict with the feed ``title``, ``link``, ``description``,
        ``author`` and the list of ``items``.

    Raises:
        ValueError: If the ``__NEXT_DATA__`` script tag is missing or empty,
            or if its content is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
    """
    url = f"{domain}/"
    html = fetch(url, headers=DEFAULT_HEADERS).get()
    soup = BeautifulSoup(html, 'html.parser')
    script_tag = soup.find('script', id="__NEXT_DATA__", type="application/json")

    if not script_tag:
        raise ValueError("Could not find __NEXT_DATA__ script tag.")

    # ``.string`` is None when the tag has no single text child; fail with a
    # clear message instead of a TypeError from json.loads(None).
    payload = script_tag.string
    if not payload:
        raise ValueError("__NEXT_DATA__ script tag is empty.")
    data = json.loads(payload)

    # Walk the key path defensively: any level may be absent or JSON null.
    node = data
    for key in ('props', 'pageProps', 'worldInBrief', 'text'):
        node = node.get(key) if isinstance(node, dict) else None
    nodes = node if isinstance(node, list) else []

    # NOTE(review): the last two entries are dropped, as before; presumably
    # they are trailing non-news blocks -- confirm against the live payload.
    nodes = nodes[:-2]

    paragraphs = []
    for entry in nodes:
        if not isinstance(entry, dict):
            continue
        # Only paragraph tags carry a news entry.
        if entry.get('type') == 'tag' and entry.get('name') == 'p':
            text = extract_text(entry.get('children', []))
            if text.strip():  # skip paragraphs with no text
                paragraphs.append(text)

    # Turn every paragraph into a feed item.
    items = [parse_news(text) for text in paragraphs]

    return {
        'title': 'World Brief - Economist',
        'link': url,
        'description': 'The world in brief: Catch up quickly on the global stories that matter',
        'author': 'hillerliao',
        'items': items
    }
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user