添加对知乎 (www.zhihu.com) 的支持，包括explore和根据问题id返回内容两个频道

2026-06-10 23:16:52 +00:00 · 2023-08-04 14:11:16 +08:00
parent 58cee18bda
commit fe04ece37d
4 changed files with 192 additions and 1 deletions
--- a/rsshub/blueprints/main.py
+++ b/rsshub/blueprints/main.py
@@ -320,4 +320,14 @@ def tadoku_books(category=''):
 def rss_filter():
    from rsshub.spiders.rssfilter.filter import ctx
    feed_url = request.args.get("feed")
-    return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
+    return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
+
+@bp.route('/zhihu/explore')
+def zhihu_explore():  # Atom feed built from the Zhihu explore page
+    from rsshub.spiders.zhihu.explore import ctx  # spider is imported only when the route runs
+    return render_template('main/atom.xml', **filter_content(ctx()))
+
+@bp.route('/zhihu/question/<string:qid>')
+def zhihu_question(qid):  # Atom feed of the answers to the question identified by `qid`
+    from rsshub.spiders.zhihu.article import ctx_question  # spider is imported only when the route runs
+    return render_template('main/atom.xml', **filter_content(ctx_question(qid)))
--- a/rsshub/spiders/zhihu/article.py
+++ b/rsshub/spiders/zhihu/article.py
@@ -0,0 +1,111 @@
+import re
+import json
+
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+
+import requests
+from rsshub.utils import fetch
+
+
+def get_value(d):  # first value of mapping `d` in insertion order; IndexError when `d` is empty
+    return [*d.values()][0]
+
+
+@dataclass
+class Feed:  # channel-level feed data; `link` is the only required field
+    link: str  # URL of the page the feed is built from
+    title: str = ''
+    author: str = '未知作者'  # default author label
+    description: str = ''
+    items: list = field(default_factory=list)  # feed entries; a fresh list per instance
+
+
+@dataclass
+class AtomEntry:  # one feed entry; `link` is the only required field
+    link: str
+    title: str = ''
+    author: str = '未知作者'
+    pubDate: datetime = field(default_factory=datetime.now)  # evaluated per instance, not once at import
+    updated_time: datetime = field(default_factory=datetime.now)  # same: fresh timestamp per entry
+
+    description: str = ''
+    content: str = ''
+
+
+class ZhihuAnswer(AtomEntry):
+    def get(self):  # fill title, body and timestamps from the page at self.link
+        page = fetch(self.link)
+        self.title = page.css('h1::text').get()
+        body = zhihu_figure_transfer(page.css('.RichText').get())
+        self.content = self.description = body
+        state = json.loads(page.css("#js-initialData::text").get())
+        # NOTE(review): timestamps come from the first entities.questions entry, not an answer entity - confirm.
+        question_meta: dict = get_value(state['initialState']['entities']['questions'])
+
+        self.pubDate = datetime.fromtimestamp(question_meta['created'])
+        self.updated_time = datetime.fromtimestamp(question_meta['updatedTime'])
+
+
+class ZhihuZhuanlanArticle(AtomEntry):
+    def get(self):  # fill title and body from the column article at self.link
+        page = fetch(self.link)
+        self.title = page.css('h1::text').get()
+        body_html = page.css('article').css('.RichText').get()
+        self.content = self.description = zhihu_figure_transfer(body_html)
+
+
+class ZhihuQuestion(Feed):
+    """Feed for one Zhihu question; `items` collects its answers as ZhihuAnswer entries."""
+    def get_description(self):
+        """Load the question page: title, description and the answers listed in its initial state."""
+        tree = fetch(self.link)
+        self.title = tree.css('title::text').get()
+        # <meta> has no text node; the description lives in its `content` attribute.
+        self.description = tree.xpath('//meta[@name="description"]/@content').get()
+        data = json.loads(tree.css("#js-initialData::text").get())
+        answers = get_value(data['initialState']['question']['answers'])
+        for answer_id in answers['ids']:
+            if answer_id['targetType'] != 'answer':  # skip anything that is not an answer
+                continue
+            item = ZhihuAnswer(f'{self.link}/answer/{answer_id["target"]}')
+            item.get()
+            self.items.append(item)
+        self.next = answers['next']  # URL of the next answers page, consumed by get_all()
+
+    def get_all(self):
+        """Follow the `next` paging URLs until `is_end`, appending every answer to `items`."""
+        if 'next' not in self.__dict__:
+            self.get_description()
+
+        while True:
+            # Bounded wait so a stalled upstream cannot block the request forever.
+            data = json.loads(requests.get(self.next, timeout=30).text)
+            for d in data['data']:
+                target = d['target']
+                author = target['author']['name']
+                self.items.append(ZhihuAnswer(
+                    title=f'{author}的回答',
+                    author=author,
+                    link=f'{self.link}/answer/{target["id"]}',
+                    pubDate=datetime.fromtimestamp(target['created_time']),
+                    updated_time=datetime.fromtimestamp(target['updated_time']),
+                    description=zhihu_figure_transfer(target['content'])  # converted exactly once
+                ))
+
+            if data['paging']['is_end']:
+                del self.next
+                break
+            self.next = data['paging']['next']
+
+
+def zhihu_figure_transfer(content):  # swap each <figure> for the markup in its <noscript>; None -> ''
+    pattern = r'<figure(.*?)<noscript>(.*?)</noscript>(.*?)</figure>'
+    return re.sub(pattern, lambda match: match.group(2), content or '', flags=re.DOTALL)
+
+
+def ctx_question(qid):  # feed context (plain dict) for the question with id `qid`
+    question = ZhihuQuestion(f'https://www.zhihu.com/question/{qid}')
+    question.get_all()
+    # asdict() also converts the nested answer entries into dicts.
+    return asdict(question)
--- a/rsshub/spiders/zhihu/explore.py
+++ b/rsshub/spiders/zhihu/explore.py
@@ -0,0 +1,53 @@
+from itertools import chain
+
+from .article import *
+
+
+def ctx():
+    """Build the feed context for the Zhihu explore page.
+
+    Returns a dict with 'title', 'link' and 'items'; entries are de-duplicated by link.
+    """
+    from urllib.parse import urljoin  # local import: the file's import block stays untouched
+    r_url = 'https://www.zhihu.com/explore'
+    tree = fetch(r_url)
+    items = {}     # link -> entry; a later entry with the same link replaces the earlier one
+    channel = {}   # link -> ZhihuQuestion, expanded into its answers below
+
+    hot_question = tree.css('.css-1nd7dqm')
+    newest_topic = tree.css('.ExploreSpecialCard-contentTitle')
+    discussion = tree.css('.ExploreRoundtableCard-questionTitle')
+    collection_card = tree.css('.ExploreCollectionCard-contentTitle')
+
+    for post in chain(hot_question, collection_card, discussion, newest_topic):
+        title = post.css('a::text').extract_first()
+        link: str = post.css('a::attr(href)').extract_first()
+        if not link:
+            continue
+        # urljoin resolves relative and protocol-relative hrefs and leaves absolute URLs alone.
+        link = urljoin(r_url, link)
+
+        if link.startswith('https://www.zhihu.com/question/'):
+            channel[link] = ZhihuQuestion(link, title=title)
+        elif link.startswith('https://zhuanlan.zhihu.com/p'):
+            item = ZhihuZhuanlanArticle(link)
+            item.get()
+            items[link] = item
+        elif link.startswith('https://www.zhihu.com/answer/'):
+            item = ZhihuAnswer(link)
+            item.get()
+            items[link] = item
+        else:  # any other link: emit a bare entry without fetching the page
+            items[link] = {'title': title, 'link': link, 'description': title}
+
+    # Each collected question contributes its answers as separate entries.
+    for c in channel.values():
+        c.get_description()
+        for i in c.items:
+            items[i.link] = i
+
+    return {
+        'title': '发现 - 知乎',
+        'link': r_url,
+        'items': list(items.values())
+    }
--- a/rsshub/templates/main/feeds.html
+++ b/rsshub/templates/main/feeds.html
@@ -624,5 +624,22 @@
 <br>
 <!--item info end-->

+<!--item info start-->
+<div class="card text-left">
+    <div class="card-body">
+        <h4 class="card-title">知乎</h4>
+
+        <h6 class="text-muted">Explore <a href="https://github.com/JeffersonYoung" target="_blank" class="badge badge-secondary">by https://github.com/JeffersonYoung</a></h6>
+        <p class="card-text">举例：<a href="https://pyrsshub.vercel.app/zhihu/explore" target="_blank">https://pyrsshub.vercel.app/zhihu/explore</a></p>
+        <p class="card-text">路由：<code>/zhihu/explore</code></p>
+
+        <h6 class="text-muted">问题 <a href="https://github.com/JeffersonYoung" target="_blank" class="badge badge-secondary">by https://github.com/JeffersonYoung</a></h6>
+        <p class="card-text">举例：<a href="https://pyrsshub.vercel.app/zhihu/question/509768617" target="_blank">https://pyrsshub.vercel.app/zhihu/question/509768617</a></p>
+        <p class="card-text">路由：<code>/zhihu/question/:qid</code></p>
+        <p class="card-text">qid [必填， 问题id，如例所示，和网站url上的id相同] </p>
+    </div>
+</div>
+<br>
+<!--item info end-->

 {% endblock content %}