Add puppeteer support so pages protected by anti-crawl measures can be crawled; introduce a cache module to avoid re-crawling the same pages frequently

This commit is contained in:
airchaoz
2023-04-01 15:47:35 +08:00
committed by 王志强
parent 64096acd2e
commit af216b7f7e
7 changed files with 593 additions and 327 deletions

View File

@@ -25,8 +25,28 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
tree = Selector(text=html)
return tree
async def fetch_by_puppeteer(url):
    """Render *url* in a headless Chromium via pyppeteer and return a Selector.

    Intended for pages behind anti-crawl protection that a plain HTTP
    request cannot render.

    Args:
        url: The page URL to load.

    Returns:
        A parsel ``Selector`` over the rendered HTML, or ``None`` when
        pyppeteer is not installed (the error is printed, not raised).
    """
    try:
        from pyppeteer import launch
    except Exception as e:
        # pyppeteer is an optional dependency; degrade gracefully.
        print(f'[Err] {e}')
        return None
    browser = await launch(  # start the headless browser
        {'args': ['--no-sandbox']},
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False
    )
    try:
        page = await browser.newPage()  # open a fresh tab
        await page.goto(url)            # navigate to the target URL
        html = await page.content()     # grab the rendered HTML
    finally:
        # Always release the Chromium process, even when navigation or
        # rendering raises — otherwise each failure leaks a browser.
        await browser.close()
    return Selector(text=html)
def filter_content(items):
    """Keep only items whose title looks like an earnings-call/results notice.

    Args:
        items: Iterable of dicts, each with a ``'title'`` key.

    Returns:
        A list of the items whose title matches one of the
        earnings-announcement patterns (matched from the start of the
        title, case-insensitively).
    """
    # Patterns for typical earnings-announcement headlines, e.g.
    # "X to report Q1 results", "X schedules earnings call",
    # "X announces release date".
    p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE)
    p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE)
    p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE)
    content = []
    for item in items:
        title = item['title']
        if p1.match(title) or p2.match(title) or p3.match(title):
            content.append(item)
    return content