mirror of
https://github.com/d0zingcat/RSSHub-python.git
synced 2026-06-10 23:16:52 +00:00
添加对知乎 (www.zhihu.com) 的支持,包括explore和根据问题id返回内容两个频道
This commit is contained in:
@@ -320,4 +320,14 @@ def tadoku_books(category=''):
|
||||
def rss_filter():
|
||||
from rsshub.spiders.rssfilter.filter import ctx
|
||||
feed_url = request.args.get("feed")
|
||||
return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
|
||||
return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
|
||||
|
||||
@bp.route('/zhihu/explore')
def zhihu_explore():
    """Render the Zhihu "explore" feed with the shared Atom template."""
    # Function-scope import: the spider module is loaded only when this
    # route is actually requested.
    from rsshub.spiders.zhihu.explore import ctx as explore_ctx

    feed = filter_content(explore_ctx())
    return render_template('main/atom.xml', **feed)
|
||||
|
||||
@bp.route('/zhihu/question/<string:qid>')
def zhihu_question(qid):
    """Render the answers of one Zhihu question with the shared Atom template.

    ``qid`` is the question id captured from the request path.
    """
    # Function-scope import: the spider module is loaded only when this
    # route is actually requested.
    from rsshub.spiders.zhihu.article import ctx_question as build_context

    feed = filter_content(build_context(qid))
    return render_template('main/atom.xml', **feed)
|
||||
|
||||
111
rsshub/spiders/zhihu/article.py
Normal file
111
rsshub/spiders/zhihu/article.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import re
|
||||
import json
|
||||
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from rsshub.utils import fetch
|
||||
|
||||
|
||||
def get_value(d):
    """Return the first value of mapping ``d`` in iteration order.

    Raises ``IndexError`` when ``d`` is empty.
    """
    values = [*d.values()]
    return values[0]
|
||||
|
||||
|
||||
@dataclass
class Feed:
    """Feed-level data: a link plus title, author, description and items."""

    # The only required field; everything else has a default.
    link: str
    title: str = field(default="")
    author: str = field(default="未知作者")
    description: str = field(default="")
    # default_factory gives every Feed its own list rather than a shared one.
    items: list = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class AtomEntry:
    """One feed entry: link, title, author, timestamps and body text."""

    # The only required field; everything else has a default.
    link: str
    title: str = ''
    author: str = '未知作者'
    # default_factory so the timestamp is taken when the entry is created.
    # A plain ``datetime.now()`` default is evaluated only once, when the
    # class is defined, and would then be shared by every later entry.
    pubDate: datetime = field(default_factory=datetime.now)
    updated_time: datetime = field(default_factory=datetime.now)

    description: str = ''
    content: str = ''
|
||||
|
||||
|
||||
class ZhihuAnswer(AtomEntry):
    """Feed entry for a single Zhihu answer page."""

    def get(self):
        """Fetch ``self.link`` and fill title, content, description and dates."""
        page = fetch(self.link)

        self.title = page.css('h1::text').get()
        body = zhihu_figure_transfer(page.css('.RichText').get())
        self.content = body
        self.description = body

        # The page carries its state as JSON inside the #js-initialData element.
        state = json.loads(page.css("#js-initialData::text").get())
        questions = state['initialState']['entities']['questions']
        # NOTE(review): the timestamps are read from the first *question*
        # entity, not from an answer entity — confirm this is intended.
        meta: dict = get_value(questions)

        created, updated = meta['created'], meta['updatedTime']
        self.pubDate = datetime.fromtimestamp(created)
        self.updated_time = datetime.fromtimestamp(updated)
|
||||
|
||||
|
||||
class ZhihuZhuanlanArticle(AtomEntry):
    """Feed entry for a single Zhihu column (zhuanlan) article page."""

    def get(self):
        """Fetch ``self.link`` and fill title, content and description."""
        page = fetch(self.link)
        self.title = page.css('h1::text').get()

        # Only the rich-text node inside <article> is used as the body.
        rich_text = page.css('article').css('.RichText').get()
        self.content = self.description = zhihu_figure_transfer(rich_text)
|
||||
|
||||
|
||||
class ZhihuQuestion(Feed):
    """Feed of the answers to one Zhihu question located at ``self.link``."""

    # Seconds to wait for the paging API; without a timeout ``requests.get``
    # can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 10

    def get_description(self):
        """Load the question page: title, description and embedded answers.

        Every answer listed in the page's embedded state is fetched through
        ``ZhihuAnswer.get`` and appended to ``self.items``. The URL of the
        next page of answers is stored in ``self.next``.
        """
        tree = fetch(self.link)
        self.title = tree.css('title::text').get()
        # <meta> has no text node; the description is in its ``content``
        # attribute. Fall back to '' so the field stays a string.
        self.description = tree.xpath(
            '//meta[@name="description"]/@content').get() or ''

        data = json.loads(tree.css("#js-initialData::text").get())
        # Only the first entry of the answers mapping is used.
        answers = list(data['initialState']['question']['answers'].values())[0]

        for entry in answers['ids']:
            # Skip anything that is not an answer instead of asserting:
            # asserts are stripped under ``python -O`` and one unexpected
            # entry would otherwise abort the whole feed.
            if entry['targetType'] != 'answer':
                continue
            item = ZhihuAnswer(f'{self.link}/answer/{entry["target"]}')
            item.get()
            self.items.append(item)

        self.next = answers['next']

    def get_all(self):
        """Append all remaining answers of the question to ``self.items``.

        Runs ``get_description`` first when no next-page URL is known yet,
        then follows the paging links until a response reports ``is_end``.
        ``self.next`` is removed once the last page has been read.
        """
        if 'next' not in self.__dict__:
            self.get_description()

        while True:
            response = requests.get(self.next, timeout=self.REQUEST_TIMEOUT)
            data = json.loads(response.text)

            for d in data['data']:
                target = d['target']
                author = target['author']['name']
                content = zhihu_figure_transfer(target['content'])

                self.items.append(ZhihuAnswer(
                    title=f'{author}的回答',
                    author=author,
                    link=f'{self.link}/answer/{target["id"]}',
                    pubDate=datetime.fromtimestamp(target['created_time']),
                    updated_time=datetime.fromtimestamp(target['updated_time']),
                    # Fill both fields, as ZhihuAnswer.get does for
                    # page-sourced answers.
                    description=content,
                    content=content,
                ))

            if data['paging']['is_end']:
                del self.next
                break

            self.next = data['paging']['next']
|
||||
|
||||
|
||||
def zhihu_figure_transfer(content):
    """Replace each ``<figure>`` block with the markup inside its ``<noscript>``.

    Args:
        content: HTML string, or ``None`` when a selector matched nothing.

    Returns:
        The HTML with every ``<figure ...><noscript>X</noscript>...</figure>``
        span reduced to ``X``; an empty string when ``content`` is ``None``
        or empty.
    """
    # Selectors return None when the node is missing; re.sub would raise
    # TypeError on that, so treat "nothing" as empty markup.
    if not content:
        return ''

    # Group 2 is the <noscript> payload (presumably the plain, non-lazy
    # <img> tag — confirm against real pages).
    pattern = r'<figure(.*?)<noscript>(.*?)</noscript>(.*?)</figure>'
    return re.sub(pattern, lambda match: match.group(2), content)
|
||||
|
||||
|
||||
def ctx_question(qid):
    """Build the feed context dict for the Zhihu question with id ``qid``.

    Collects all answers through ``ZhihuQuestion.get_all`` and returns the
    question converted with ``dataclasses.asdict``.
    """
    question = ZhihuQuestion(f'https://www.zhihu.com/question/{qid}')
    question.get_all()
    return asdict(question)
|
||||
53
rsshub/spiders/zhihu/explore.py
Normal file
53
rsshub/spiders/zhihu/explore.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from itertools import chain
|
||||
|
||||
from .article import *
|
||||
|
||||
|
||||
def ctx():
    """Build the feed context for the Zhihu explore page.

    Scans four card sections of the page, turns every linked post into a
    feed item, and expands linked questions into their answers.

    Returns:
        A dict with ``title``, ``link`` and ``items`` (a list holding entry
        objects and, for unrecognised link types, plain dicts).
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    r_url = 'https://www.zhihu.com/explore'
    tree = fetch(r_url)
    items = {}    # link -> entry; keyed by link so repeated links collapse
    channel = {}  # link -> ZhihuQuestion, expanded after the scan

    hot_question = tree.css('.css-1nd7dqm')
    newest_topic = tree.css('.ExploreSpecialCard-contentTitle')
    discussion = tree.css('.ExploreRoundtableCard-questionTitle')
    collection_card = tree.css('.ExploreCollectionCard-contentTitle')

    for post in chain(hot_question, collection_card, discussion, newest_topic):
        title = post.css('a::text').extract_first()
        link = post.css('a::attr(href)').extract_first()
        if not link:
            continue

        # urljoin resolves relative ("/question/1"), slash-less and
        # protocol-relative ("//zhuanlan.zhihu.com/p/1") hrefs and leaves
        # absolute URLs untouched; plain prefixing mangled the latter cases.
        link = urljoin(r_url, link)

        if link.startswith('https://www.zhihu.com/question/'):
            channel[link] = ZhihuQuestion(link, title=title)
        elif link.startswith(('https://zhuanlan.zhihu.com/p',
                              'https://www.zhihu.com/answer/')):
            if link.startswith('https://zhuanlan.zhihu.com/p'):
                item = ZhihuZhuanlanArticle(link)
            else:
                item = ZhihuAnswer(link)
            item.get()
            items[link] = item
        else:
            # Unknown link type: emit a minimal item without fetching it.
            items[link] = {
                'title': title,
                'link': link,
                'description': title
            }

    # Expand each question into the answers embedded in its page.
    for c in channel.values():
        c.get_description()
        for i in c.items:
            items[i.link] = i

    return {
        'title': '发现 - 知乎',
        'link': r_url,
        'items': list(items.values())
    }
|
||||
@@ -624,5 +624,22 @@
|
||||
<br>
|
||||
<!--item info end-->
|
||||
|
||||
<!--item info start-->
|
||||
<div class="card text-left">
|
||||
<div class="card-body">
|
||||
<h4 class="card-title">知乎</h4>
|
||||
|
||||
<h6 class="text-muted">Explore <a href="https://github.com/JeffersonYoung" target="_blank" class="badge badge-secondary">by https://github.com/JeffersonYoung</a></h6>
|
||||
<p class="card-text">举例:<a href="https://pyrsshub.vercel.app/zhihu/explore" target="_blank">https://pyrsshub.vercel.app/zhihu/explore</a></p>
|
||||
<p class="card-text">路由:<code>/zhihu/explore</code></p>
|
||||
|
||||
<h6 class="text-muted">问题 <a href="https://github.com/JeffersonYoung" target="_blank" class="badge badge-secondary">by https://github.com/JeffersonYoung</a></h6>
|
||||
<p class="card-text">举例:<a href="https://pyrsshub.vercel.app/zhihu/question/509768617" target="_blank">https://pyrsshub.vercel.app/zhihu/question/509768617</a></p>
|
||||
<p class="card-text">路由:<code>/zhihu/question/:qid</code></p>
|
||||
<p class="card-text">qid [必填, 问题id,如例所示,和网站url上的id相同] </p>
|
||||
</div>
|
||||
</div>
|
||||
<br>
|
||||
<!--item info end-->
|
||||
|
||||
{% endblock content %}
|
||||
Reference in New Issue
Block a user