mirror of
https://github.com/d0zingcat/RSSHub-python.git
synced 2026-05-14 23:16:50 +00:00
58 lines
1.9 KiB
Python
58 lines
1.9 KiB
Python
import re
|
|
from flask import Response
|
|
import requests
|
|
from parsel import Selector
|
|
|
|
# Default HTTP request headers; used as the default value of the `headers`
# parameter of `fetch`. The User-Agent string imitates desktop Chrome 69 on
# Windows 10.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
|
|
|
|
|
|
class XMLResponse(Response):
    """Response that labels XML bodies as ``application/xml``.

    When the caller does not specify a content type and the body starts with
    an XML declaration (``<?xml``), the mimetype is set to
    ``application/xml``. In every other case the arguments are forwarded to
    ``Response`` unchanged.
    """

    def __init__(self, response, **kwargs):
        """Build the response, sniffing the body for an XML declaration.

        Args:
            response: The response body. Only ``str`` and bytes-like bodies
                are inspected; any other value (``None``, an iterable, ...)
                is passed through without sniffing instead of raising.
            **kwargs: Forwarded to ``Response.__init__``.
        """
        # `content_type` is the keyword the underlying Response accepts;
        # the legacy 'contenttype' spelling is still honoured so existing
        # callers keep the behaviour they had before.
        type_given = any(
            key in kwargs
            for key in ('mimetype', 'content_type', 'contenttype')
        )

        if not type_given:
            if isinstance(response, str):
                is_xml = response.startswith('<?xml')
            elif isinstance(response, (bytes, bytearray)):
                # Bytes bodies need a bytes prefix; a str prefix would raise.
                is_xml = response.startswith(b'<?xml')
            else:
                is_xml = False

            if is_xml:
                kwargs['mimetype'] = 'application/xml'

        super().__init__(response, **kwargs)
|
|
|
|
|
|
def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None, timeout: float=30):
    """Download *url* and return its body wrapped in a ``Selector``.

    Args:
        url: Address to request with HTTP GET.
        headers: Request headers; defaults to ``DEFAULT_HEADERS``.
        proxies: Optional proxy mapping handed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A ``Selector`` built from the response text, or ``None`` when the
        request fails or the server answers with an error status (the error
        is printed, not raised).
    """
    try:
        res = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        res.raise_for_status()
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # deal with a missing result instead of propagating the exception.
        print(f'[Err] {e}')
        return None

    return Selector(text=res.text)
|
|
|
|
|
|
async def fetch_by_puppeteer(url):
    """Load *url* in a headless browser and return the page as a ``Selector``.

    ``pyppeteer`` is imported lazily; if the import fails the error is
    printed and ``None`` is returned.

    Args:
        url: Address to open in a new browser page.

    Returns:
        A ``Selector`` built from the page content, or ``None`` when
        ``pyppeteer`` cannot be imported.
    """
    try:
        from pyppeteer import launch
    except Exception as e:
        print(f'[Err] {e}')
        return None

    # Start the browser. The signal handlers are switched off -- presumably
    # so that launch() also works outside the main thread; confirm with callers.
    browser = await launch(
        {'args': ['--no-sandbox']},
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False
    )
    try:
        page = await browser.newPage()  # open a new page
        await page.goto(url)            # navigate to the address
        html = await page.content()     # grab the page content
    finally:
        # Always shut the browser down, even when navigation fails, so the
        # browser process is not leaked.
        await browser.close()

    return Selector(text=html)
|
|
|
|
|
|
def filter_content(items):
    """Return the items whose ``'title'`` matches one of three title patterns.

    The patterns are case-insensitive and anchored at the start of the title
    (``re.match``). Matching items are returned in their original order.

    Args:
        items: Iterable of mappings, each with a ``'title'`` string.

    Returns:
        A new list holding the matching items.
    """
    patterns = (
        re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE),
        re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE),
        re.compile(r'(.*)release (.*)date', re.IGNORECASE),
    )

    return [
        entry
        for entry in items
        if any(pattern.match(entry['title']) for pattern in patterns)
    ]