From 13de40c246719fca4b5872c9f2f0feda032eb597 Mon Sep 17 00:00:00 2001
From: Hiller Liao
Date: Fri, 13 Dec 2024 11:50:00 +0800
Subject: [PATCH] get economist's world in brief from homepage

---
 rsshub/spiders/economist/worldbrief.py | 61 ++++++++++++++++++++------
 1 file changed, 48 insertions(+), 13 deletions(-)

NOTE(review): the line breaks, the indentation and the positions of the four
blank context lines below were reconstructed from a whitespace-collapsed copy
of this patch; the line counts match the hunk header, but confirm the blank
lines against the target file before applying. Comments and docstrings in the
added lines were translated to English, so the post-image blob id on the
"index" line is stale.

diff --git a/rsshub/spiders/economist/worldbrief.py b/rsshub/spiders/economist/worldbrief.py
index d1803a8..85bb704 100644
--- a/rsshub/spiders/economist/worldbrief.py
+++ b/rsshub/spiders/economist/worldbrief.py
@@ -1,27 +1,62 @@
 import re
+import json
+from bs4 import BeautifulSoup
 from rsshub.utils import DEFAULT_HEADERS
 from rsshub.utils import fetch
 
 domain = 'https://www.economist.com'
 
-def parse(post):
-    item = {}
-    item['title'] = post.css('div').css('p').get()
-    item['description'] = item['title']
-    item['title'] = re.sub(r'<[^>]*>', '', item['title']).strip()
-    item['link'] = f"{domain}/the-world-in-brief" + '?from=' + item['title'][:30]
+def extract_text(node):
+    if isinstance(node, dict):
+        if 'data' in node:
+            return node['data']
+        elif 'children' in node:
+            return ''.join(extract_text(child) for child in node['children'])
+    elif isinstance(node, list):
+        return ''.join(extract_text(child) for child in node)
+    return ''
+
+def parse_news(gobbet):
+    """
+    Build the feed item (title, description, link) for a single news entry.
+    """
+    title = gobbet.strip()
+    item = {
+        'title': title,
+        'description': title,  # simply reuse the text as the description
+        'link': f"{domain}/the-world-in-brief?from={title[:30]}"  # build the link
+    }
     return item
 
 
 def ctx(category=''):
-    url = f"{domain}/the-world-in-brief"
-    tree = fetch(url,headers=DEFAULT_HEADERS)
-    posts = tree.css('._gobbet')
+    """
+    Parse the JSON data and extract the content of every brief news entry.
+    """
+    url = f"{domain}/"
+    html = fetch(url, headers=DEFAULT_HEADERS).get()
+    soup = BeautifulSoup(html, 'html.parser')
+    script_tag = soup.find('script', id="__NEXT_DATA__", type="application/json")
+
+    if not script_tag:
+        raise ValueError("Could not find __NEXT_DATA__ script tag.")
+
+    # Load JSON content
+    data = json.loads(script_tag.string)
+
+    news_list = data.get('props', {}).get('pageProps', {}).get('worldInBrief', {}).get('text', [])[:-2]
+    news_list_new = []
+    for item in news_list:
+        if item['type'] == 'tag' and item['name'] == 'p':  # make sure it is a paragraph
+            news_list_new.append(extract_text(item['children']))
+
+    # Parse each news entry with parse_news
+    items = [parse_news(news) for news in news_list_new]
+
     return {
-        'title': f'World Brief - Economist',
+        'title': 'World Brief - Economist',
         'link': url,
-        'description': f'The world in brief: Catch up quickly on the global stories that matter',
+        'description': 'The world in brief: Catch up quickly on the global stories that matter',
         'author': 'hillerliao',
-        'items': list(map(parse, posts))
+        'items': items
     }
-