mirror of
https://github.com/d0zingcat/RSSHub-python.git
synced 2026-05-14 23:16:50 +00:00
Add puppeteer support to facilitate crawling information from anti-crawl pages; introduce a cache module to avoid crawling pages too frequently
This commit is contained in:
@@ -7,6 +7,7 @@ from rsshub.config import config
|
||||
from rsshub.extensions import *
|
||||
from rsshub.blueprints.main import bp as main_bp
|
||||
from rsshub.utils import XMLResponse
|
||||
from rsshub.extensions import cache
|
||||
|
||||
|
||||
def create_app(config_name=None):
|
||||
@@ -17,8 +18,9 @@ def create_app(config_name=None):
|
||||
app = Flask(__name__)
|
||||
app.config.from_object(config[config_name])
|
||||
app.response_class = XMLResponse
|
||||
cache.init_app(app)
|
||||
|
||||
# Add analytics
|
||||
# Add analytics
|
||||
from flask_analytics import Analytics
|
||||
from rsshub.google_analytics import ga_account
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from flask import Blueprint, render_template, request
|
||||
from rsshub.extensions import cache
|
||||
|
||||
bp = Blueprint('main', __name__)
|
||||
|
||||
@@ -55,7 +56,7 @@ def chuansongme_articles(category=''):
|
||||
def ctolib_topics(category=''):
|
||||
from rsshub.spiders.ctolib.topics import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
|
||||
@bp.route('/bbwc/realtime/<string:category>')
|
||||
def bbwc_realtime(category=''):
|
||||
from rsshub.spiders.bbwc.realtime import ctx
|
||||
@@ -81,7 +82,7 @@ def infoq_profile(category=''):
|
||||
@bp.route('/infoq/search/<string:category>/<int:type>')
|
||||
def infoq_search(category='', type=''):
|
||||
from rsshub.spiders.infoq.search import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category, type)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category, type)))
|
||||
|
||||
@bp.route('/dxzg/notice')
|
||||
def dxzg_notice():
|
||||
@@ -117,32 +118,32 @@ def csrc_audit(category=''):
|
||||
@bp.route('/caixin/scroll/<string:category>')
|
||||
def caixin_scroll(category=''):
|
||||
from rsshub.spiders.caixin.scroll import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/eastmoney/report/<string:type>/<string:category>')
|
||||
def eastmoney_report(category='', type=''):
|
||||
from rsshub.spiders.eastmoney.report import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(type,category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(type,category)))
|
||||
|
||||
@bp.route('/xuangubao/<string:type>/<string:category>')
|
||||
def xuangubao_xuangubao(type='', category=''):
|
||||
from rsshub.spiders.xuangubao.xuangubao import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(type, category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(type, category)))
|
||||
|
||||
@bp.route('/cls/subject/<string:category>')
|
||||
def cls_subject(category=''):
|
||||
from rsshub.spiders.cls.subject import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/cls/telegraph/')
|
||||
def cls_telegraph():
|
||||
from rsshub.spiders.cls.telegraph import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx()))
|
||||
return render_template('main/atom.xml', **filter_content(ctx()))
|
||||
|
||||
@bp.route('/chaindd/column/<string:category>')
|
||||
def chaindd_column(category=''):
|
||||
from rsshub.spiders.chaindd.column import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/techcrunch/tag/<string:category>')
|
||||
def techcrunch_tag(category=''):
|
||||
@@ -167,29 +168,29 @@ def weiyangx_tag(category=''):
|
||||
@bp.route('/jintiankansha/column/<string:category>')
|
||||
def jintiankansha_column(category=''):
|
||||
from rsshub.spiders.jintiankansha.column import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/interotc/cpgg/<string:category>')
|
||||
def interotc_cpgg(category=''):
|
||||
from rsshub.spiders.interotc.cpgg import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/benzinga/ratings/<string:category>')
|
||||
def benzinga_ratings(category=''):
|
||||
from rsshub.spiders.benzinga.ratings import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/chouti/section/<string:category>')
|
||||
@bp.route('/chouti/section/<string:category>')
|
||||
def chouti_section(category=''):
|
||||
from rsshub.spiders.chouti.section import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/chouti/search/<string:category>')
|
||||
@bp.route('/chouti/search/<string:category>')
|
||||
def chouti_search(category=''):
|
||||
from rsshub.spiders.chouti.search import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/chouti/user/<string:category>')
|
||||
@bp.route('/chouti/user/<string:category>')
|
||||
def chouti_user(category=''):
|
||||
from rsshub.spiders.chouti.user import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
@@ -227,22 +228,22 @@ def economist_wordlbrief(category=''):
|
||||
@bp.route('/baidu/suggest/<string:category>')
|
||||
def baidu_suggest(category=''):
|
||||
from rsshub.spiders.baidu.suggest import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/mp/gh/<string:gh>')
|
||||
def mp_gh(gh=''):
|
||||
from rsshub.spiders.mp.gh import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(gh)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(gh)))
|
||||
|
||||
@bp.route('/mp/youwuqiong/<string:author>')
|
||||
def mp_youwuqiong(author=''):
|
||||
from rsshub.spiders.mp.youwuqiong import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(author)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(author)))
|
||||
|
||||
@bp.route('/yfchuhai/express/')
|
||||
def yfchuhai_express():
|
||||
from rsshub.spiders.yfchuhai.express import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx()))
|
||||
return render_template('main/atom.xml', **filter_content(ctx()))
|
||||
|
||||
@bp.route('/bjnews/<string:category>')
|
||||
def bjnews_channel(category=''):
|
||||
@@ -264,10 +265,16 @@ def aisixiang_search(category='', keywords=''):
|
||||
from rsshub.spiders.aisixiang.search import ctx
|
||||
return render_template('main/atom.xml', **filter_content(ctx(category, keywords)))
|
||||
|
||||
@bp.route('/sysu/ifcen')
@cache.cached(timeout=3600)  # serve the cached response for up to an hour
def sysu_ifcen(category='', keywords=''):
    """Atom feed for the SYSU IFCEN homepage (scraped via a headless browser)."""
    # NOTE(review): `keywords` is never used and `category` is always '' — the
    # route declares no URL parameters, so both look like copy-paste leftovers
    # from the neighbouring search routes; confirm before removing.
    from rsshub.spiders.sysu.ifcen import ctx
    return render_template('main/atom.xml', **filter_content(ctx(category)))
|
||||
|
||||
@bp.route('/filter/')
def rss_filter():
    """Re-emit an external feed after filtering; `?feed=<url>` selects the source."""
    from rsshub.spiders.rssfilter.filter import ctx
    # `feed` query parameter carries the upstream feed URL; None when absent.
    feed_url = request.args.get("feed")
    return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
|
||||
'''
|
||||
@bp.route('/test')
|
||||
|
||||
@@ -1,8 +1,15 @@
|
||||
from flask_bootstrap import Bootstrap
|
||||
from flask_debugtoolbar import DebugToolbarExtension
|
||||
from flask_moment import Moment
|
||||
from flask_caching import Cache
|
||||
|
||||
|
||||
# Flask extension singletons; each is bound to the app in create_app() via
# its init_app() hook.
bootstrap = Bootstrap()
debugtoolbar = DebugToolbarExtension()
moment = Moment()

# Process-local in-memory cache (Flask-Caching "simple" backend), used to
# avoid re-crawling anti-crawl pages on every request.
cache = Cache(config={
    "DEBUG": True,  # Flask-specific config
    "CACHE_TYPE": "simple",  # Flask-Caching backend: per-process dict
    "CACHE_DEFAULT_TIMEOUT": 3600  # entries expire after one hour (3600 s)
})
|
||||
152
rsshub/spiders/sysu/ifcen.py
Normal file
152
rsshub/spiders/sysu/ifcen.py
Normal file
@@ -0,0 +1,152 @@
|
||||
from rsshub.utils import fetch_by_puppeteer
|
||||
import asyncio
|
||||
|
||||
domain = 'https://ifcen.sysu.edu.cn/'

# The site is aggressively anti-crawl, so item bodies are not fetched;
# every item carries this placeholder pointing readers back to the site.
_DESCRIPTION = "网站严格反爬,请进入网站查看具体内容"

# Homepage sections scraped in order: (xpath selecting the <a> nodes,
# title prefix).  The college-news section keeps its bare anchor text.
_SECTIONS = [
    ('//div[@id="news-2"]/ul//a', '公告通知 | '),
    ('//*[@id="news-1"]/ul/li/a', ''),
    ('//*[@id="notice-1"]/div//a', '人才工作 | '),
    ('//*[@id="notice-2"]/div//a', '本科生教育 | '),
    ('//*[@id="notice-3"]/div//a', '研究生教育 | '),
    ('//*[@id="notice-4"]/div//a', '科研信息 | '),
    ('//*[@id="notice-5"]/div//a', '学工信息 | '),
    ('//*[@id="notice-6"]/div//a', '党建通知 | '),
    ('//*[@id="notice-7"]/div//a', '工会工作 | '),
]


def _section_items(selector, link_xpath, prefix='', titles=None):
    """Scrape one homepage section into a list of feed-item dicts.

    ``link_xpath`` selects the section's <a> nodes; ``titles`` overrides the
    anchor text (used by the lecture section, whose titles combine anchor
    text with a speaker/time span).  ``zip`` pairs titles with hrefs so a
    mismatch in list lengths truncates instead of raising IndexError.
    """
    if titles is None:
        titles = selector.xpath(link_xpath + '/text()').getall()
    # hrefs on this site are relative; prepend the domain to make them absolute.
    urls = [domain + href for href in selector.xpath(link_xpath + '/@href').getall()]
    return [
        {'title': prefix + title, 'description': _DESCRIPTION, 'link': url}
        for title, url in zip(titles, urls)
    ]


def parse(selector):
    """Parse the IFCEN homepage into feed items.

    ``selector`` is an XPath-capable selector over the rendered page (as
    returned by ``fetch_by_puppeteer``).  Returns a list of dicts with
    ``title``, ``description`` and ``link`` keys, in on-page section order.
    """
    items = []
    for link_xpath, prefix in _SECTIONS:
        items.extend(_section_items(selector, link_xpath, prefix))

    # 学术报告: title is the anchor text with the speaker/time span appended.
    xpath = '//*[@id="event-1"]/li//a'
    talks = selector.xpath(xpath + '/text()').getall()
    speakers = selector.xpath(
        '//*[@id="event-1"]/li//span[@class="content"]/text()').getall()
    titles = [talk + speaker for talk, speaker in zip(talks, speakers)]
    items.extend(_section_items(selector, xpath, '学术报告 | ', titles=titles))

    return items
||||
|
||||
def ctx(category=''):
    """Build the feed context for the SYSU IFCEN homepage.

    ``category`` is accepted for route-signature compatibility but unused.
    The page is fetched with a headless browser because the site blocks
    plain HTTP crawlers.
    """
    selector = asyncio.run(fetch_by_puppeteer(domain))
    feed = dict(
        title='中山大学中法核官网信息',
        link=domain,
        description='中山大学中法核官网通知公告',
        author='echo',
    )
    feed['items'] = parse(selector)
    return feed
|
||||
@@ -25,8 +25,28 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
|
||||
tree = Selector(text=html)
|
||||
return tree
|
||||
|
||||
|
||||
async def fetch_by_puppeteer(url):
    """Fetch *url* with a headless browser and return a parsed ``Selector``.

    Used for pages whose content is JavaScript-rendered or guarded by
    anti-crawl checks that defeat a plain HTTP fetch.  Returns ``None``
    when pyppeteer is not installed (same best-effort contract as before,
    now made explicit instead of falling off the end of the function).
    """
    try:
        from pyppeteer import launch
    except ImportError as e:
        # pyppeteer is an optional dependency; degrade to a no-op.
        print(f'[Err] {e}')
        return None
    browser = await launch(  # start the headless browser
        {'args': ['--no-sandbox']},
        # Signal handlers can only be installed from the main thread, and
        # Flask request handling may run elsewhere, so disable them.
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False
    )
    try:
        page = await browser.newPage()   # open a new tab
        await page.goto(url)             # navigate to the target URL
        html = await page.content()      # rendered page HTML
    finally:
        # Always close the browser, even when navigation fails, so failed
        # fetches do not leak headless Chromium processes.
        await browser.close()
    return Selector(text=html)
|
||||
|
||||
|
||||
def filter_content(items):
|
||||
content = []
|
||||
content = []
|
||||
p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE)
|
||||
p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE)
|
||||
p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE)
|
||||
@@ -35,4 +55,4 @@ def filter_content(items):
|
||||
title = item['title']
|
||||
if p1.match(title) or p2.match(title) or p3.match(title):
|
||||
content.append(item)
|
||||
return content
|
||||
return content
|
||||
Reference in New Issue
Block a user