Add puppeteer support so pages protected by anti-crawl measures can be crawled; introduce a cache module to avoid re-crawling the same pages frequently

This commit is contained in:
airchaoz
2023-04-01 15:47:35 +08:00
committed by 王志强
parent 64096acd2e
commit af216b7f7e
7 changed files with 593 additions and 327 deletions

View File

@@ -25,8 +25,28 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
tree = Selector(text=html)
return tree
async def fetch_by_puppeteer(url):
    """Render *url* in a headless Chromium via pyppeteer and return a Selector.

    Intended for pages behind anti-crawl protection that a plain HTTP
    request cannot render.

    Args:
        url: The page URL to load.

    Returns:
        A parsel ``Selector`` over the rendered HTML, or ``None`` when
        pyppeteer is not installed (the error is printed, not raised).
    """
    try:
        from pyppeteer import launch
    except Exception as e:
        # pyppeteer is an optional dependency; degrade gracefully.
        print(f'[Err] {e}')
        return None
    browser = await launch(  # start the headless browser
        {'args': ['--no-sandbox']},
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False
    )
    try:
        page = await browser.newPage()  # open a fresh tab
        await page.goto(url)            # navigate to the target URL
        html = await page.content()     # grab the rendered HTML
    finally:
        # Always release the Chromium process, even when navigation or
        # rendering raises — otherwise each failure leaks a browser.
        await browser.close()
    return Selector(text=html)
def filter_content(items):
    """Keep only items whose title looks like an earnings-call/results notice.

    Args:
        items: Iterable of dicts, each with a ``'title'`` key.

    Returns:
        A list of the items whose title matches one of the
        earnings-announcement patterns (matched from the start of the
        title, case-insensitively).
    """
    # Patterns for typical earnings-announcement headlines, e.g.
    # "X to report Q1 results", "X schedules earnings call",
    # "X announces release date".
    p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE)
    p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE)
    p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE)
    content = []
    for item in items:
        title = item['title']
        if p1.match(title) or p2.match(title) or p3.match(title):
            content.append(item)
    return content