Add puppeteer support to enable crawling pages protected by anti-crawl measures; introduce a cache module to avoid re-crawling pages too frequently

This commit is contained in:
airchaoz
2023-04-01 15:47:35 +08:00
committed by 王志强
parent 64096acd2e
commit af216b7f7e
7 changed files with 593 additions and 327 deletions

View File

@@ -7,6 +7,7 @@ from rsshub.config import config
from rsshub.extensions import *
from rsshub.blueprints.main import bp as main_bp
from rsshub.utils import XMLResponse
from rsshub.extensions import cache
def create_app(config_name=None):
@@ -17,8 +18,9 @@ def create_app(config_name=None):
app = Flask(__name__)
app.config.from_object(config[config_name])
app.response_class = XMLResponse
cache.init_app(app)
# Add analytics
# Add analytics
from flask_analytics import Analytics
from rsshub.google_analytics import ga_account

View File

@@ -1,4 +1,5 @@
from flask import Blueprint, render_template, request
from rsshub.extensions import cache
bp = Blueprint('main', __name__)
@@ -55,7 +56,7 @@ def chuansongme_articles(category=''):
def ctolib_topics(category=''):
from rsshub.spiders.ctolib.topics import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/bbwc/realtime/<string:category>')
def bbwc_realtime(category=''):
from rsshub.spiders.bbwc.realtime import ctx
@@ -81,7 +82,7 @@ def infoq_profile(category=''):
@bp.route('/infoq/search/<string:category>/<int:type>')
def infoq_search(category='', type=''):
from rsshub.spiders.infoq.search import ctx
return render_template('main/atom.xml', **filter_content(ctx(category, type)))
return render_template('main/atom.xml', **filter_content(ctx(category, type)))
@bp.route('/dxzg/notice')
def dxzg_notice():
@@ -117,32 +118,32 @@ def csrc_audit(category=''):
@bp.route('/caixin/scroll/<string:category>')
def caixin_scroll(category=''):
from rsshub.spiders.caixin.scroll import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/eastmoney/report/<string:type>/<string:category>')
def eastmoney_report(category='', type=''):
from rsshub.spiders.eastmoney.report import ctx
return render_template('main/atom.xml', **filter_content(ctx(type,category)))
return render_template('main/atom.xml', **filter_content(ctx(type,category)))
@bp.route('/xuangubao/<string:type>/<string:category>')
def xuangubao_xuangubao(type='', category=''):
from rsshub.spiders.xuangubao.xuangubao import ctx
return render_template('main/atom.xml', **filter_content(ctx(type, category)))
return render_template('main/atom.xml', **filter_content(ctx(type, category)))
@bp.route('/cls/subject/<string:category>')
def cls_subject(category=''):
from rsshub.spiders.cls.subject import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/cls/telegraph/')
def cls_telegraph():
from rsshub.spiders.cls.telegraph import ctx
return render_template('main/atom.xml', **filter_content(ctx()))
return render_template('main/atom.xml', **filter_content(ctx()))
@bp.route('/chaindd/column/<string:category>')
def chaindd_column(category=''):
from rsshub.spiders.chaindd.column import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/techcrunch/tag/<string:category>')
def techcrunch_tag(category=''):
@@ -167,29 +168,29 @@ def weiyangx_tag(category=''):
@bp.route('/jintiankansha/column/<string:category>')
def jintiankansha_column(category=''):
from rsshub.spiders.jintiankansha.column import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/interotc/cpgg/<string:category>')
def interotc_cpgg(category=''):
from rsshub.spiders.interotc.cpgg import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/benzinga/ratings/<string:category>')
def benzinga_ratings(category=''):
from rsshub.spiders.benzinga.ratings import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/chouti/section/<string:category>')
@bp.route('/chouti/section/<string:category>')
def chouti_section(category=''):
from rsshub.spiders.chouti.section import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/chouti/search/<string:category>')
@bp.route('/chouti/search/<string:category>')
def chouti_search(category=''):
from rsshub.spiders.chouti.search import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/chouti/user/<string:category>')
@bp.route('/chouti/user/<string:category>')
def chouti_user(category=''):
from rsshub.spiders.chouti.user import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
@@ -227,22 +228,22 @@ def economist_wordlbrief(category=''):
@bp.route('/baidu/suggest/<string:category>')
def baidu_suggest(category=''):
from rsshub.spiders.baidu.suggest import ctx
return render_template('main/atom.xml', **filter_content(ctx(category)))
return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/mp/gh/<string:gh>')
def mp_gh(gh=''):
from rsshub.spiders.mp.gh import ctx
return render_template('main/atom.xml', **filter_content(ctx(gh)))
return render_template('main/atom.xml', **filter_content(ctx(gh)))
@bp.route('/mp/youwuqiong/<string:author>')
def mp_youwuqiong(author=''):
from rsshub.spiders.mp.youwuqiong import ctx
return render_template('main/atom.xml', **filter_content(ctx(author)))
return render_template('main/atom.xml', **filter_content(ctx(author)))
@bp.route('/yfchuhai/express/')
def yfchuhai_express():
from rsshub.spiders.yfchuhai.express import ctx
return render_template('main/atom.xml', **filter_content(ctx()))
return render_template('main/atom.xml', **filter_content(ctx()))
@bp.route('/bjnews/<string:category>')
def bjnews_channel(category=''):
@@ -264,10 +265,16 @@ def aisixiang_search(category='', keywords=''):
from rsshub.spiders.aisixiang.search import ctx
return render_template('main/atom.xml', **filter_content(ctx(category, keywords)))
@bp.route('/sysu/ifcen')
@cache.cached(timeout=3600)  # page is rendered via puppeteer (slow, anti-crawl) — serve a cached copy
def sysu_ifcen(category='', keywords=''):
    # NOTE(review): `category` and `keywords` are never supplied by the route
    # (no URL parameters) and `keywords` is unused; kept for signature parity
    # with sibling routes — confirm before removing.
    from rsshub.spiders.sysu.ifcen import ctx
    return render_template('main/atom.xml', **filter_content(ctx(category)))
@bp.route('/filter/')
def rss_filter():
from rsshub.spiders.rssfilter.filter import ctx
feed_url = request.args.get("feed")
feed_url = request.args.get("feed")
return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
'''
@bp.route('/test')

View File

@@ -1,8 +1,15 @@
from flask_bootstrap import Bootstrap
from flask_debugtoolbar import DebugToolbarExtension
from flask_moment import Moment
from flask_caching import Cache
bootstrap = Bootstrap()
debugtoolbar = DebugToolbarExtension()
moment = Moment()
# Shared Flask-Caching instance; bound to the app in create_app() via cache.init_app(app).
cache = Cache(config={
    "DEBUG": True,                  # Flask-specific config passed through to the app
    "CACHE_TYPE": "simple",         # in-process dict cache (per worker, not shared)
    "CACHE_DEFAULT_TIMEOUT": 3600   # cache entries for one hour (3600 s)
})

View File

@@ -0,0 +1,152 @@
from rsshub.utils import fetch_by_puppeteer
import asyncio
domain = 'https://ifcen.sysu.edu.cn/'

# The site is heavily anti-crawl, so article bodies cannot be fetched;
# every feed item carries this fixed pointer back to the site.
_DESCRIPTION = "网站严格反爬,请进入网站查看具体内容"

# (section label or None, xpath of the section's <a> nodes).
# A label is prefixed to each title as "<label> | "; 学院新闻 titles are used as-is.
_SECTIONS = [
    ('公告通知', '//div[@id="news-2"]/ul//a'),
    (None, '//*[@id="news-1"]/ul/li/a'),          # 学院新闻
    ('人才工作', '//*[@id="notice-1"]/div//a'),
    ('本科生教育', '//*[@id="notice-2"]/div//a'),
    ('研究生教育', '//*[@id="notice-3"]/div//a'),
    ('科研信息', '//*[@id="notice-4"]/div//a'),
    ('学工信息', '//*[@id="notice-5"]/div//a'),
    ('党建通知', '//*[@id="notice-6"]/div//a'),
    ('工会工作', '//*[@id="notice-7"]/div//a'),
]


def _make_items(titles, hrefs):
    """Pair titles with relative hrefs into feed-item dicts.

    Uses zip so mismatched list lengths cannot raise IndexError (the
    original per-index loops would crash on a partial page render).
    """
    return [
        {
            'title': title,
            'description': _DESCRIPTION,
            'link': domain + href,  # hrefs on the page are site-relative
        }
        for title, href in zip(titles, hrefs)
    ]


def parse(selector):
    """Extract feed items from the rendered ifcen.sysu.edu.cn homepage.

    :param selector: a parsel-style Selector over the puppeteer-rendered HTML.
    :returns: list of dicts with ``title``, ``description`` and ``link``.
    """
    items = []

    # Regular list sections: title text + href per anchor.
    for label, xpath in _SECTIONS:
        titles = selector.xpath(xpath + '/text()').getall()
        hrefs = selector.xpath(xpath + '/@href').getall()
        if label:
            titles = [f'{label} | {t}' for t in titles]
        items.extend(_make_items(titles, hrefs))

    # 学术报告 (academic talks): the speaker (span.content) is appended
    # to each talk title before the section prefix is added.
    xpath = '//*[@id="event-1"]/li//a'
    talks = selector.xpath(xpath + '/text()').getall()
    speakers = selector.xpath('//*[@id="event-1"]/li//span[@class="content"]/text()').getall()
    hrefs = selector.xpath(xpath + '/@href').getall()
    titles = [f'学术报告 | {talk}{speaker}' for talk, speaker in zip(talks, speakers)]
    items.extend(_make_items(titles, hrefs))

    return items
def ctx(category=''):
    """Build the feed context for the SYSU Sino-French nuclear institute site."""
    # The page needs a real browser render; fetch_by_puppeteer is async.
    selector = asyncio.run(fetch_by_puppeteer(domain))
    feed = {
        'title': '中山大学中法核官网信息',
        'link': domain,
        'description': '中山大学中法核官网通知公告',
        'author': 'echo',
    }
    feed['items'] = parse(selector)
    return feed

View File

@@ -25,8 +25,28 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
tree = Selector(text=html)
return tree
async def fetch_by_puppeteer(url):
    """Render *url* in headless Chromium via pyppeteer and return a Selector.

    :param url: page to load in the browser.
    :returns: a Selector over the rendered HTML, or ``None`` when pyppeteer
        is not installed (the original best-effort behaviour is preserved).
    """
    try:
        from pyppeteer import launch
    except Exception as e:
        # pyppeteer is an optional dependency; degrade to a no-op.
        print(f'[Err] {e}')
        return None
    browser = await launch(  # start headless Chromium
        {'args': ['--no-sandbox']},
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False
    )
    try:
        page = await browser.newPage()
        await page.goto(url)
        html = await page.content()
    finally:
        # Always release the Chromium process, even if navigation fails
        # (the original leaked the browser on any exception after launch).
        await browser.close()
    return Selector(text=html)
def filter_content(items):
content = []
content = []
p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE)
p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE)
p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE)
@@ -35,4 +55,4 @@ def filter_content(items):
title = item['title']
if p1.match(title) or p2.match(title) or p3.match(title):
content.append(item)
return content
return content