Add puppeteer support to enable crawling pages protected by anti-crawl measures; introduce a cache module to avoid re-crawling pages too frequently

This commit is contained in:
airchaoz
2023-04-01 15:47:35 +08:00
committed by 王志强
parent 64096acd2e
commit af216b7f7e
7 changed files with 593 additions and 327 deletions

View File

@@ -7,6 +7,7 @@ from rsshub.config import config
from rsshub.extensions import *
from rsshub.blueprints.main import bp as main_bp
from rsshub.utils import XMLResponse
from rsshub.extensions import cache
def create_app(config_name=None):
@@ -17,8 +18,9 @@ def create_app(config_name=None):
app = Flask(__name__)
app.config.from_object(config[config_name])
app.response_class = XMLResponse
cache.init_app(app)
# Add analytics
# Add analytics
from flask_analytics import Analytics
from rsshub.google_analytics import ga_account

View File

@@ -1,4 +1,5 @@
from flask import Blueprint, render_template, request
from rsshub.extensions import cache
bp = Blueprint('main', __name__)
@@ -55,7 +56,7 @@ def chuansongme_articles(category=''):
def ctolib_topics(category=''):
from rsshub.spiders.ctolib.topics import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/bbwc/realtime/<string:category>')
def bbwc_realtime(category=''):
from rsshub.spiders.bbwc.realtime import ctx
@@ -81,7 +82,7 @@ def infoq_profile(category=''):
@bp.route('/infoq/search/<string:category>/<int:type>')
def infoq_search(category='', type=''):
from rsshub.spiders.infoq.search import ctx
return render_template('main/atom.xml', **filter_content(ctx(category, type)))
return render_template('main/atom.xml', **filter_content(ctx(category, type)))
@bp.route('/dxzg/notice')
def dxzg_notice():
@@ -117,32 +118,32 @@ def csrc_audit(category=''):
@bp.route('/caixin/scroll/<string:category>')
def caixin_scroll(category=''):
from rsshub.spiders.caixin.scroll import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/eastmoney/report/<string:type>/<string:category>')
def eastmoney_report(category='', type=''):
from rsshub.spiders.eastmoney.report import ctx
return render_template('main/atom.xml', **filter_content(ctx(type,category)))
return render_template('main/atom.xml', **filter_content(ctx(type,category)))
@bp.route('/xuangubao/<string:type>/<string:category>')
def xuangubao_xuangubao(type='', category=''):
from rsshub.spiders.xuangubao.xuangubao import ctx
return render_template('main/atom.xml', **filter_content(ctx(type, category)))
return render_template('main/atom.xml', **filter_content(ctx(type, category)))
@bp.route('/cls/subject/<string:category>')
def cls_subject(category=''):
from rsshub.spiders.cls.subject import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/cls/telegraph/')
def cls_telegraph():
from rsshub.spiders.cls.telegraph import ctx
return render_template('main/atom.xml', **filter_content(ctx()))
return render_template('main/atom.xml', **filter_content(ctx()))
@bp.route('/chaindd/column/<string:category>')
def chaindd_column(category=''):
from rsshub.spiders.chaindd.column import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/techcrunch/tag/<string:category>')
def techcrunch_tag(category=''):
@@ -167,29 +168,29 @@ def weiyangx_tag(category=''):
@bp.route('/jintiankansha/column/<string:category>')
def jintiankansha_column(category=''):
from rsshub.spiders.jintiankansha.column import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/interotc/cpgg/<string:category>')
def interotc_cpgg(category=''):
from rsshub.spiders.interotc.cpgg import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/benzinga/ratings/<string:category>')
def benzinga_ratings(category=''):
from rsshub.spiders.benzinga.ratings import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/chouti/section/<string:category>')
@bp.route('/chouti/section/<string:category>')
def chouti_section(category=''):
from rsshub.spiders.chouti.section import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/chouti/search/<string:category>')
@bp.route('/chouti/search/<string:category>')
def chouti_search(category=''):
from rsshub.spiders.chouti.search import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/chouti/user/<string:category>')
@bp.route('/chouti/user/<string:category>')
def chouti_user(category=''):
from rsshub.spiders.chouti.user import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@@ -227,22 +228,22 @@ def economist_wordlbrief(category=''):
@bp.route('/baidu/suggest/<string:category>')
def baidu_suggest(category=''):
from rsshub.spiders.baidu.suggest import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/mp/gh/<string:gh>')
def mp_gh(gh=''):
from rsshub.spiders.mp.gh import ctx
return render_template('main/atom.xml', **filter_content(ctx(gh)))
return render_template('main/atom.xml', **filter_content(ctx(gh)))
@bp.route('/mp/youwuqiong/<string:author>')
def mp_youwuqiong(author=''):
from rsshub.spiders.mp.youwuqiong import ctx
return render_template('main/atom.xml', **filter_content(ctx(author)))
return render_template('main/atom.xml', **filter_content(ctx(author)))
@bp.route('/yfchuhai/express/')
def yfchuhai_express():
from rsshub.spiders.yfchuhai.express import ctx
return render_template('main/atom.xml', **filter_content(ctx()))
return render_template('main/atom.xml', **filter_content(ctx()))
@bp.route('/bjnews/<string:category>')
def bjnews_channel(category=''):
@@ -264,10 +265,16 @@ def aisixiang_search(category='', keywords=''):
from rsshub.spiders.aisixiang.search import ctx
return render_template('main/atom.xml', **filter_content(ctx(category, keywords)))
@bp.route('/sysu/ifcen')
@cache.cached(timeout=3600)  # page is rendered via puppeteer (slow, anti-crawl) — serve a cached copy
def sysu_ifcen(category='', keywords=''):
    # NOTE(review): `category` and `keywords` are never supplied by the route
    # (no URL parameters) and `keywords` is unused; kept for signature parity
    # with sibling routes — confirm before removing.
    from rsshub.spiders.sysu.ifcen import ctx
    return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/filter/')
def rss_filter():
from rsshub.spiders.rssfilter.filter import ctx
feed_url = request.args.get("feed")
feed_url = request.args.get("feed")
return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
'''
@bp.route('/test')

View File

@@ -1,8 +1,15 @@
from flask_bootstrap import Bootstrap
from flask_debugtoolbar import DebugToolbarExtension
from flask_moment import Moment
from flask_caching import Cache
bootstrap = Bootstrap()
debugtoolbar = DebugToolbarExtension()
moment = Moment()
# Shared Flask-Caching instance; bound to the app in create_app() via cache.init_app(app).
cache = Cache(config={
    "DEBUG": True,                  # Flask-specific config passed through to the app
    "CACHE_TYPE": "simple",         # in-process dict cache (per worker, not shared)
    "CACHE_DEFAULT_TIMEOUT": 3600   # cache entries for one hour (3600 s)
})

View File

@@ -0,0 +1,152 @@
from rsshub.utils import fetch_by_puppeteer
import asyncio
domain = 'https://ifcen.sysu.edu.cn/'

# The site is heavily anti-crawl, so article bodies cannot be fetched;
# every feed item carries this fixed pointer back to the site.
_DESCRIPTION = "网站严格反爬,请进入网站查看具体内容"

# (section label or None, xpath of the section's <a> nodes).
# A label is prefixed to each title as "<label> | "; 学院新闻 titles are used as-is.
_SECTIONS = [
    ('公告通知', '//div[@id="news-2"]/ul//a'),
    (None, '//*[@id="news-1"]/ul/li/a'),          # 学院新闻
    ('人才工作', '//*[@id="notice-1"]/div//a'),
    ('本科生教育', '//*[@id="notice-2"]/div//a'),
    ('研究生教育', '//*[@id="notice-3"]/div//a'),
    ('科研信息', '//*[@id="notice-4"]/div//a'),
    ('学工信息', '//*[@id="notice-5"]/div//a'),
    ('党建通知', '//*[@id="notice-6"]/div//a'),
    ('工会工作', '//*[@id="notice-7"]/div//a'),
]


def _make_items(titles, hrefs):
    """Pair titles with relative hrefs into feed-item dicts.

    Uses zip so mismatched list lengths cannot raise IndexError (the
    original per-index loops would crash on a partial page render).
    """
    return [
        {
            'title': title,
            'description': _DESCRIPTION,
            'link': domain + href,  # hrefs on the page are site-relative
        }
        for title, href in zip(titles, hrefs)
    ]


def parse(selector):
    """Extract feed items from the rendered ifcen.sysu.edu.cn homepage.

    :param selector: a parsel-style Selector over the puppeteer-rendered HTML.
    :returns: list of dicts with ``title``, ``description`` and ``link``.
    """
    items = []

    # Regular list sections: title text + href per anchor.
    for label, xpath in _SECTIONS:
        titles = selector.xpath(xpath + '/text()').getall()
        hrefs = selector.xpath(xpath + '/@href').getall()
        if label:
            titles = [f'{label} | {t}' for t in titles]
        items.extend(_make_items(titles, hrefs))

    # 学术报告 (academic talks): the speaker (span.content) is appended
    # to each talk title before the section prefix is added.
    xpath = '//*[@id="event-1"]/li//a'
    talks = selector.xpath(xpath + '/text()').getall()
    speakers = selector.xpath('//*[@id="event-1"]/li//span[@class="content"]/text()').getall()
    hrefs = selector.xpath(xpath + '/@href').getall()
    titles = [f'学术报告 | {talk}{speaker}' for talk, speaker in zip(talks, speakers)]
    items.extend(_make_items(titles, hrefs))

    return items
def ctx(category=''):
    """Build the feed context for the SYSU Sino-French nuclear institute site."""
    # The page needs a real browser render; fetch_by_puppeteer is async.
    selector = asyncio.run(fetch_by_puppeteer(domain))
    feed = {
        'title': '中山大学中法核官网信息',
        'link': domain,
        'description': '中山大学中法核官网通知公告',
        'author': 'echo',
    }
    feed['items'] = parse(selector)
    return feed

View File

@@ -25,8 +25,28 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
tree = Selector(text=html)
return tree
async def fetch_by_puppeteer(url):
    """Render *url* in headless Chromium via pyppeteer and return a Selector.

    :param url: page to load in the browser.
    :returns: a Selector over the rendered HTML, or ``None`` when pyppeteer
        is not installed (the original best-effort behaviour is preserved).
    """
    try:
        from pyppeteer import launch
    except Exception as e:
        # pyppeteer is an optional dependency; degrade to a no-op.
        print(f'[Err] {e}')
        return None
    browser = await launch(  # start headless Chromium
        {'args': ['--no-sandbox']},
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False
    )
    try:
        page = await browser.newPage()
        await page.goto(url)
        html = await page.content()
    finally:
        # Always release the Chromium process, even if navigation fails
        # (the original leaked the browser on any exception after launch).
        await browser.close()
    return Selector(text=html)
def filter_content(items):
content = []
content = []
p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE)
p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE)
p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE)
@@ -35,4 +55,4 @@ def filter_content(items):
title = item['title']
if p1.match(title) or p2.match(title) or p3.match(title):
content.append(item)
return content
return content