Update worldbrief.py

This commit is contained in:
Zhihai Liao
2024-12-10 00:09:11 +08:00
committed by GitHub
parent a617c6991a
commit d63ccd114b

View File

@@ -1,27 +1,41 @@
import json
import re
from urllib.parse import quote

from bs4 import BeautifulSoup

from rsshub.utils import DEFAULT_HEADERS
from rsshub.utils import fetch
# Base URL of The Economist; the feed link and every item link are built from it.
domain = 'https://www.economist.com'
def _brief_link(title):
    """Return the per-item link for a brief entry.

    All entries live on the same page, so the first 30 characters of the
    title are appended as a ``from`` query value to make each link distinct.
    The value is percent-encoded so spaces, ``#``, ``&`` and ``?`` in a title
    cannot break or truncate the URL.
    """
    return f"{domain}/the-world-in-brief?from={quote(title[:30], safe='')}"


def parse(post):
    """Build a feed item from a scraped gobbet node (selector-based variant).

    This is the pre-commit side of the diff; ``parse_gobbet`` is its
    counterpart that works on raw HTML strings.

    Args:
        post: Node exposing ``.css(...)`` and ``.get()`` (presumably a parsel
            selector -- TODO confirm against ``rsshub.utils.fetch``).

    Returns:
        dict with ``title`` (text with tags removed), ``description`` (the
        markup of the first matching ``<p>``) and ``link``.
    """
    # .get() yields None when nothing matches; fall back to an empty string
    # so the tag-stripping below does not raise.
    markup = post.css('div').css('p').get() or ''
    title = re.sub(r'<[^>]*>', '', markup).strip()
    return {
        'title': title,
        'description': markup,
        'link': _brief_link(title),
    }


def parse_gobbet(gobbet):
    """Build a feed item from one gobbet HTML fragment.

    Args:
        gobbet: HTML markup of a single brief entry.

    Returns:
        dict with ``title`` (plain text), ``description`` (the original
        markup, so formatting survives in the feed) and ``link``.
    """
    # Tags are removed for the title only; the description keeps the HTML.
    title = BeautifulSoup(gobbet, 'html.parser').get_text().strip()
    return {
        'title': title,
        'description': gobbet,
        'link': _brief_link(title),
    }
# Matches the Next.js data script regardless of attribute order or line breaks
# inside the JSON payload.
_NEXT_DATA_RE = re.compile(
    r'<script\b[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def _extract_gobbets(html):
    """Return the gobbet list embedded in the page's ``__NEXT_DATA__`` JSON.

    An empty list is returned when the script tag is absent, the payload is
    not valid JSON, or the ``props.pageProps.content.gobbets`` path is
    missing, null, or not a list.
    """
    match = _NEXT_DATA_RE.search(html)
    if not match:
        return []
    try:
        node = json.loads(match.group(1))
    except json.JSONDecodeError:
        return []
    # Walk the path one level at a time so a missing or null level yields []
    # instead of an AttributeError.
    for key in ('props', 'pageProps', 'content', 'gobbets'):
        if not isinstance(node, dict):
            return []
        node = node.get(key)
    return node if isinstance(node, list) else []


def ctx(category=''):
    """Return the feed context for The Economist's "The world in brief" page.

    Entries are read from the JSON embedded in the page's ``__NEXT_DATA__``
    script tag rather than from the ``._gobbet`` CSS class the earlier
    version scraped. When no entries can be extracted the feed is still
    returned, with an empty ``items`` list.

    Args:
        category: Not used by this function.

    Returns:
        dict with ``title``, ``link``, ``description``, ``author`` and
        ``items`` (one dict per gobbet, built by ``parse_gobbet``).
    """
    url = f"{domain}/the-world-in-brief"
    # NOTE(review): fetch() appears to return a selector-like object whose
    # .get() yields the page markup -- confirm against rsshub.utils.
    html = fetch(url, headers=DEFAULT_HEADERS).get()
    gobbets = _extract_gobbets(html or '')
    return {
        'title': 'World Brief - Economist',
        'link': url,
        'description': 'The world in brief: Catch up quickly on the global stories that matter',
        'author': 'hillerliao',
        'items': [parse_gobbet(gobbet) for gobbet in gobbets],
    }