From d63ccd114b92325e76f6f3ca443ac931067b7c01 Mon Sep 17 00:00:00 2001 From: Zhihai Liao Date: Tue, 10 Dec 2024 00:09:11 +0800 Subject: [PATCH] Update worldbrief.py --- rsshub/spiders/economist/worldbrief.py | 36 ++++++++++++++++++-------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/rsshub/spiders/economist/worldbrief.py b/rsshub/spiders/economist/worldbrief.py index d1803a8..0e84885 100644 --- a/rsshub/spiders/economist/worldbrief.py +++ b/rsshub/spiders/economist/worldbrief.py @@ -1,27 +1,41 @@ import re +import json +from bs4 import BeautifulSoup from rsshub.utils import DEFAULT_HEADERS from rsshub.utils import fetch domain = 'https://www.economist.com' -def parse(post): +def parse_gobbet(gobbet): item = {} - item['title'] = post.css('div').css('p').get() - item['description'] = item['title'] - item['title'] = re.sub(r'<[^>]*>', '', item['title']).strip() - item['link'] = f"{domain}/the-world-in-brief" + '?from=' + item['title'][:30] + # Remove HTML tags but keep the text + item['title'] = BeautifulSoup(gobbet, 'html.parser').get_text() + item['description'] = gobbet # Keep HTML formatting for description + item['link'] = f"{domain}/the-world-in-brief?from={item['title'][:30]}" return item def ctx(category=''): url = f"{domain}/the-world-in-brief" - tree = fetch(url,headers=DEFAULT_HEADERS) - posts = tree.css('._gobbet') + html = fetch(url, headers=DEFAULT_HEADERS).get() + + # Find the __NEXT_DATA__ script + match = re.search(r'', html) + if not match: + return { + 'title': 'World Brief - Economist', + 'link': url, + 'description': 'The world in brief: Catch up quickly on the global stories that matter', + 'author': 'hillerliao', + 'items': [] + } + + data = json.loads(match.group(1)) + gobbets = data.get('props', {}).get('pageProps', {}).get('content', {}).get('gobbets', []) return { - 'title': f'World Brief - Economist', + 'title': 'World Brief - Economist', 'link': url, - 'description': f'The world in brief: Catch up quickly on the global stories that matter', + 'description': 'The world in brief: Catch up quickly on the global stories that matter', 'author': 'hillerliao', - 'items': list(map(parse, posts)) + 'items': list(map(parse_gobbet, gobbets)) } -