添加对知乎 (www.zhihu.com) 的支持,包括explore和根据问题id返回内容两个频道

This commit is contained in:
jefferson
2023-08-04 14:11:16 +08:00
parent 58cee18bda
commit fe04ece37d
4 changed files with 192 additions and 1 deletions

View File

@@ -320,4 +320,14 @@ def tadoku_books(category=''):
def rss_filter():
from rsshub.spiders.rssfilter.filter import ctx
feed_url = request.args.get("feed")
return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
@bp.route('/zhihu/explore')
def zhihu_explore():
    """Render the Zhihu explore feed through the shared Atom template."""
    # Function-scope import: the spider module is imported when the view runs.
    from rsshub.spiders.zhihu.explore import ctx as explore_ctx
    feed = filter_content(explore_ctx())
    return render_template('main/atom.xml', **feed)
@bp.route('/zhihu/question/<string:qid>')
def zhihu_question(qid):
    """Render the feed for the Zhihu question identified by *qid*."""
    # Function-scope import: the spider module is imported when the view runs.
    from rsshub.spiders.zhihu.article import ctx_question as question_ctx
    feed = filter_content(question_ctx(qid))
    return render_template('main/atom.xml', **feed)

View File

@@ -0,0 +1,111 @@
import re
import json
from dataclasses import dataclass, field, asdict
from datetime import datetime
import requests
from rsshub.utils import fetch
def get_value(d):
    """Return the first value of mapping *d* (insertion order).

    Raises:
        IndexError: if *d* is empty.
    """
    values = [*d.values()]
    return values[0]
@dataclass
class Feed:
    """Channel-level feed data: a link, optional metadata and a list of items."""

    link: str
    title: str = ""
    author: str = "未知作者"
    description: str = ""
    # default_factory gives every instance its own list.
    items: list = field(default_factory=list)
@dataclass
class AtomEntry:
    """A single feed entry.

    ``pubDate`` and ``updated_time`` default to the moment the entry is
    created; pass explicit values to override them.
    """

    link: str
    title: str = ''
    author: str = '未知作者'
    # default_factory is called for each new instance.  A plain
    # ``datetime.now()`` default is evaluated once, when the class body runs,
    # and every entry would then share that single import-time timestamp.
    pubDate: datetime = field(default_factory=datetime.now)
    updated_time: datetime = field(default_factory=datetime.now)
    description: str = ''
    content: str = ''
class ZhihuAnswer(AtomEntry):
    """Feed entry for a single Zhihu answer page."""

    def get(self):
        """Fetch ``self.link`` and fill in title, content and timestamps.

        The timestamps come from the first entry under
        ``initialState.entities.questions`` in the JSON embedded in the
        page's ``#js-initialData`` element.
        """
        page = fetch(self.link)
        self.title = page.css('h1::text').get()
        # Each <figure> is replaced by the contents of its <noscript> child.
        body = zhihu_figure_transfer(page.css('.RichText').get())
        self.content = body
        self.description = body
        initial_data = json.loads(page.css("#js-initialData::text").get())
        questions = initial_data['initialState']['entities']['questions']
        meta: dict = get_value(questions)
        self.pubDate = datetime.fromtimestamp(meta['created'])
        self.updated_time = datetime.fromtimestamp(meta['updatedTime'])
class ZhihuZhuanlanArticle(AtomEntry):
    """Feed entry for a Zhihu Zhuanlan article page."""

    def get(self):
        """Fetch ``self.link`` and fill in title, content and description."""
        page = fetch(self.link)
        self.title = page.css('h1::text').get()
        rich_text = page.css('article').css('.RichText').get()
        # Content and description carry the same transformed HTML.
        self.content = self.description = zhihu_figure_transfer(rich_text)
class ZhihuQuestion(Feed):
    """Feed built from one Zhihu question page and its answers."""

    def get_description(self):
        """Load the question page: title, description and first answers.

        Every answer listed in the page's embedded ``#js-initialData`` JSON
        is fetched and appended to ``self.items``.  The URL of the following
        page of answers is stored in ``self.next``.
        """
        tree = fetch(self.link)
        self.title = tree.css('title::text').get()
        # <meta> is a void element: its value is carried by the ``content``
        # attribute, so selecting text() can never match anything.
        self.description = tree.xpath('//meta[@name="description"]/@content').get()
        data = json.loads(tree.css("#js-initialData::text").get())
        first_page = get_value(data['initialState']['question']['answers'])
        for answer_ref in first_page['ids']:
            # Skip entries that are not answers rather than aborting the
            # whole feed (an ``assert`` here would also vanish under -O).
            if answer_ref['targetType'] != 'answer':
                continue
            item = ZhihuAnswer(f'{self.link}/answer/{answer_ref["target"]}')
            item.get()
            self.items.append(item)
        self.next = first_page['next']

    def get_all(self):
        """Append every remaining answer by following the paging links.

        Calls :meth:`get_description` first when no next-page URL is stored.
        ``self.next`` is deleted once the last page has been read.
        """
        if 'next' not in self.__dict__:
            self.get_description()
        while True:
            # NOTE(review): no timeout is passed to requests.get here.
            data = json.loads(requests.get(self.next).text)
            for d in data['data']:
                target = d['target']
                author = target['author']['name']
                # Transform once and reuse for both fields, matching what
                # ZhihuAnswer.get() stores in content and description.
                content = zhihu_figure_transfer(target['content'])
                self.items.append(ZhihuAnswer(
                    title=f'{author}的回答',
                    author=author,
                    link=f'{self.link}/answer/{target["id"]}',
                    pubDate=datetime.fromtimestamp(target['created_time']),
                    updated_time=datetime.fromtimestamp(target['updated_time']),
                    description=content,
                    content=content,
                ))
            if data['paging']['is_end']:
                del self.next
                break
            self.next = data['paging']['next']
def zhihu_figure_transfer(content):
    """Replace each ``<figure>`` element with the contents of its ``<noscript>``.

    Only figures that sit on a single line are rewritten, because ``.`` in
    the pattern does not match newlines.  Text without such a figure is
    returned unchanged.
    """
    figure_with_noscript = re.compile(
        r'<figure(.*?)<noscript>(.*?)</noscript>(.*?)</figure>'
    )
    # \g<2> inserts the <noscript> body verbatim.
    return figure_with_noscript.sub(r'\g<2>', content)
def ctx_question(qid):
    """Build the feed context for the Zhihu question with id *qid*.

    Returns:
        The question, with all of its answers loaded, converted to a dict.
    """
    question = ZhihuQuestion(f'https://www.zhihu.com/question/{qid}')
    question.get_all()
    return asdict(question)

View File

@@ -0,0 +1,53 @@
from itertools import chain
from .article import *
def ctx():
    """Build the feed context for the Zhihu explore page.

    Links found on the page are handled by kind: question links are expanded
    into their first answers, Zhuanlan articles and answers are fetched
    individually, and anything else becomes a title-only entry.

    Returns:
        A dict with ``title``, ``link`` and ``items`` for the Atom template.
    """
    from urllib.parse import urljoin

    r_url = 'https://www.zhihu.com/explore'
    tree = fetch(r_url)
    items = {}    # link -> entry; keyed by link so duplicates collapse
    channel = {}  # link -> ZhihuQuestion, expanded after the scan

    hot_question = tree.css('.css-1nd7dqm')
    newest_topic = tree.css('.ExploreSpecialCard-contentTitle')
    discussion = tree.css('.ExploreRoundtableCard-questionTitle')
    collection_card = tree.css('.ExploreCollectionCard-contentTitle')

    for post in chain(hot_question, collection_card, discussion, newest_topic):
        title = post.css('a::text').extract_first()
        link = post.css('a::attr(href)').extract_first()
        if not link:
            continue
        # urljoin resolves root-relative, slash-less relative and
        # protocol-relative hrefs against the page URL and leaves absolute
        # URLs alone; blind prefixing would corrupt all but the first form.
        link = urljoin(r_url, link)
        if link.startswith('https://www.zhihu.com/question/'):
            channel[link] = ZhihuQuestion(link, title=title)
        elif link.startswith('https://zhuanlan.zhihu.com/p'):
            item = ZhihuZhuanlanArticle(link)
            item.get()
            items[link] = item
        elif link.startswith('https://www.zhihu.com/answer/'):
            item = ZhihuAnswer(link)
            item.get()
            items[link] = item
        else:
            items[link] = {
                'title': title,
                'link': link,
                'description': title
            }

    # Expand each question into the answers listed on its page.
    for c in channel.values():
        c.get_description()
        for i in c.items:
            items[i.link] = i

    return {
        'title': '发现 - 知乎',
        'link': r_url,
        'items': list(items.values())
    }

View File

@@ -624,5 +624,22 @@
<br>
<!--item info end-->
<!--item info start-->
<div class="card text-left">
<div class="card-body">
<h4 class="card-title">知乎</h4>
<h6 class="text-muted">Explore <a href="https://github.com/JeffersonYoung" target="_blank" class="badge badge-secondary">by https://github.com/JeffersonYoung</a></h6>
<p class="card-text">举例:<a href="https://pyrsshub.vercel.app/zhihu/explore" target="_blank">https://pyrsshub.vercel.app/zhihu/explore</a></p>
<p class="card-text">路由:<code>/zhihu/explore</code></p>
<h6 class="text-muted">问题 <a href="https://github.com/JeffersonYoung" target="_blank" class="badge badge-secondary">by https://github.com/JeffersonYoung</a></h6>
<p class="card-text">举例:<a href="https://pyrsshub.vercel.app/zhihu/question/509768617" target="_blank">https://pyrsshub.vercel.app/zhihu/question/509768617</a></p>
<p class="card-text">路由:<code>/zhihu/question/:qid</code></p>
<p class="card-text">qid [必填, 问题id如例所示和网站url上的id相同] </p>
</div>
</div>
<br>
<!--item info end-->
{% endblock content %}