mirror of
https://github.com/d0zingcat/RSSHub-python.git
synced 2026-05-14 15:09:23 +00:00
Add puppeteer support to enable crawling of pages protected by anti-crawl measures; introduce a cache module to avoid re-fetching pages too frequently
This commit is contained in:
@@ -25,8 +25,28 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
|
||||
tree = Selector(text=html)
|
||||
return tree
|
||||
|
||||
|
||||
async def fetch_by_puppeteer(url):
    """Render *url* in a headless Chromium via pyppeteer and return a Selector.

    Used for pages whose content is built by JavaScript or protected by
    anti-crawl measures, where a plain HTTP fetch is not enough.

    :param url: the page URL to load.
    :return: a ``Selector`` over the rendered HTML, or ``None`` when
        pyppeteer is not installed (best-effort fallback, as before).
    """
    try:
        # Imported lazily so the rest of the module works without pyppeteer.
        from pyppeteer import launch
    except ImportError as e:  # narrowed from Exception: only a missing dependency belongs here
        print(f'[Err] {e}')
    else:
        # Launch a headless browser; --no-sandbox is required in most containers.
        browser = await launch(
            {'args': ['--no-sandbox']},
            # Signal handlers are disabled: this coroutine may run outside
            # the main thread, where installing handlers would raise.
            handleSIGINT=False,
            handleSIGTERM=False,
            handleSIGHUP=False,
        )
        try:
            page = await browser.newPage()  # open a new tab
            await page.goto(url)            # navigate to the target URL
            html = await page.content()     # rendered page HTML
        finally:
            # Always release the browser process, even if navigation fails,
            # so repeated failures do not leak Chromium instances.
            await browser.close()
        return Selector(text=html)
|
||||
|
||||
|
||||
def filter_content(items):
    """Keep only items whose title looks like an earnings-call/results notice.

    :param items: iterable of dicts, each with a ``'title'`` key.
    :return: list of the items whose title matches one of the patterns.
    """
    # Patterns for earnings announcements, e.g. "X to report Q3 results",
    # "X schedules earnings call", "X release ... date".
    patterns = (
        re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE),
        re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE),
        re.compile(r'(.*)release (.*)date', re.IGNORECASE),
    )
    # Single accumulator and single return (the diff had a duplicated
    # `content = []` and `return content` — a merge artifact).
    content = []
    for item in items:
        title = item['title']
        if any(p.match(title) for p in patterns):
            content.append(item)
    return content
|
||||
Reference in New Issue
Block a user