diff --git a/rsshub/blueprints/main.py b/rsshub/blueprints/main.py
index cdcc2f8..b8529cc 100644
--- a/rsshub/blueprints/main.py
+++ b/rsshub/blueprints/main.py
@@ -320,4 +320,17 @@ def tadoku_books(category=''):
 def rss_filter():
     from rsshub.spiders.rssfilter.filter import ctx
     feed_url = request.args.get("feed")
-    return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
\ No newline at end of file
+    return render_template('main/atom.xml', **filter_content(ctx(feed_url)))
+
+@bp.route('/zhihu/explore')
+def zhihu_explore():
+    """Render the Zhihu explore page as an Atom feed."""
+    from rsshub.spiders.zhihu.explore import ctx
+    return render_template('main/atom.xml', **filter_content(ctx()))
+
+# The <qid> variable is required: Flask passes it to the view as ``qid``.
+@bp.route('/zhihu/question/<qid>')
+def zhihu_question(qid):
+    """Render the answers to Zhihu question ``qid`` as an Atom feed."""
+    from rsshub.spiders.zhihu.article import ctx_question
+    return render_template('main/atom.xml', **filter_content(ctx_question(qid)))
diff --git a/rsshub/spiders/zhihu/article.py b/rsshub/spiders/zhihu/article.py
new file mode 100644
index 0000000..2d766da
--- /dev/null
+++ b/rsshub/spiders/zhihu/article.py
@@ -0,0 +1,171 @@
+"""Scrape Zhihu questions, answers and column articles into feed data."""
+import re
+import json
+
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+
+import requests
+from rsshub.utils import fetch
+
+# Seconds to wait for a response from Zhihu's answer paging API.
+_REQUEST_TIMEOUT = 10
+
+# NOTE(review): the HTML tags of this pattern were missing from the reviewed
+# copy of this file (only the two lazy groups survived). It is reconstructed
+# on the assumption that Zhihu wraps the real <img> of a lazily loaded picture
+# in <figure ...><noscript>...</noscript>...</figure> -- confirm upstream.
+_FIGURE_PATTERN = re.compile(
+    r'<figure([^>]*)>\s*<noscript>(.*?)</noscript>.*?</figure>',
+    re.DOTALL,
+)
+
+
+def get_value(d):
+    """Return the first value of mapping ``d``.
+
+    Raises:
+        IndexError: if ``d`` is empty.
+    """
+    for value in d.values():
+        return value
+    raise IndexError('get_value() called on an empty mapping')
+
+
+@dataclass
+class Feed:
+    """A feed: its own metadata plus the list of entries in ``items``."""
+
+    link: str
+    title: str = ''
+    author: str = '未知作者'
+    description: str = ''
+    items: list = field(default_factory=list)
+
+
+@dataclass
+class AtomEntry:
+    """A single feed entry."""
+
+    link: str
+    title: str = ''
+    author: str = '未知作者'
+    # default_factory gives every entry its own timestamp; a plain
+    # ``datetime.now()`` default is evaluated only once, at import time.
+    pubDate: datetime = field(default_factory=datetime.now)
+    updated_time: datetime = field(default_factory=datetime.now)
+
+    description: str = ''
+    content: str = ''
+
+
+class ZhihuAnswer(AtomEntry):
+    """An answer whose fields are filled in from its page by ``get()``."""
+
+    def get(self):
+        """Fetch ``self.link`` and populate title, content and timestamps."""
+        tree = fetch(self.link)
+        self.title = tree.css('h1::text').get()
+        self.content = zhihu_figure_transfer(tree.css('.RichText').get())
+        self.description = self.content
+
+        state = json.loads(tree.css('#js-initialData::text').get())
+        # NOTE(review): the timestamps come from the first entry under
+        # ``entities.questions``, not from an answer entity -- confirm.
+        meta: dict = get_value(state['initialState']['entities']['questions'])
+
+        self.pubDate = datetime.fromtimestamp(meta['created'])
+        self.updated_time = datetime.fromtimestamp(meta['updatedTime'])
+
+
+class ZhihuZhuanlanArticle(AtomEntry):
+    """A column article whose fields are filled in by ``get()``."""
+
+    def get(self):
+        """Fetch ``self.link`` and populate title, content and description."""
+        tree = fetch(self.link)
+        self.title = tree.css('h1::text').get()
+        self.content = zhihu_figure_transfer(
+            tree.css('article').css('.RichText').get())
+        self.description = self.content
+
+
+class ZhihuQuestion(Feed):
+    """Feed made of the answers to the question at ``link``.
+
+    While more answer pages remain, the URL of the next one is kept in
+    ``self.next``; ``get_all()`` removes the attribute after the last page.
+    """
+
+    def get_description(self):
+        """Load the question page: title, description and embedded answers."""
+        tree = fetch(self.link)
+        self.title = tree.css('title::text').get()
+        # <meta> has no text node; its value lives in the content attribute.
+        self.description = tree.xpath(
+            '//meta[@name="description"]/@content').get() or ''
+
+        data = json.loads(tree.css('#js-initialData::text').get())
+        answers = get_value(data['initialState']['question']['answers'])
+        for answer_id in answers['ids']:
+            # Skip other target types instead of asserting: ``assert`` is
+            # stripped under ``python -O`` and would otherwise abort the feed.
+            if answer_id['targetType'] != 'answer':
+                continue
+            item = ZhihuAnswer(f'{self.link}/answer/{answer_id["target"]}')
+            item.get()
+            self.items.append(item)
+
+        self.next = answers['next']
+
+    def get_all(self):
+        """Collect every answer by following the paging API to its last page."""
+        if 'next' not in self.__dict__:
+            self.get_description()
+
+        while True:
+            response = requests.get(self.next, timeout=_REQUEST_TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+
+            for d in data['data']:
+                target = d['target']
+                author = target['author']['name']
+                content = zhihu_figure_transfer(target['content'])
+
+                self.items.append(ZhihuAnswer(
+                    title=f'{author}的回答',
+                    author=author,
+                    link=f'{self.link}/answer/{target["id"]}',
+                    pubDate=datetime.fromtimestamp(target['created_time']),
+                    updated_time=datetime.fromtimestamp(target['updated_time']),
+                    # Already converted above; mirror ZhihuAnswer.get() by
+                    # filling both fields with the same markup.
+                    description=content,
+                    content=content,
+                ))
+
+            if data['paging']['is_end']:
+                del self.next
+                break
+
+            self.next = data['paging']['next']
+
+
+def zhihu_figure_transfer(content):
+    """Replace each ``<figure>`` with the markup inside its ``<noscript>``.
+
+    Returns an empty string when ``content`` is ``None`` or empty, e.g. when
+    a CSS selector found nothing on the page.
+    """
+    if not content:
+        return ''
+    return _FIGURE_PATTERN.sub(lambda match: match.group(2), content)
+
+
+def ctx_question(qid):
+    """Return the template context (a dict) for the feed of question ``qid``."""
+    url = f'https://www.zhihu.com/question/{qid}'
+    question = ZhihuQuestion(url)
+    question.get_all()
+    return asdict(question)
diff --git a/rsshub/spiders/zhihu/explore.py
b/rsshub/spiders/zhihu/explore.py
new file mode 100644
index 0000000..542a81c
--- /dev/null
+++ b/rsshub/spiders/zhihu/explore.py
@@ -0,0 +1,66 @@
+"""Build the feed for the Zhihu "explore" page."""
+from itertools import chain
+from urllib.parse import urljoin
+
+from rsshub.utils import fetch
+
+from .article import ZhihuAnswer, ZhihuQuestion, ZhihuZhuanlanArticle
+
+
+def ctx():
+    """Return the template context for the Zhihu explore feed.
+
+    Links found in four sections of the explore page are collected. Column
+    articles and answers are fetched directly, questions are expanded into
+    the answers embedded in their page, and any other link becomes a bare
+    title/link entry. Entries are de-duplicated by link.
+    """
+    r_url = 'https://www.zhihu.com/explore'
+    tree = fetch(r_url)
+    items = {}
+    channel = {}
+
+    hot_question = tree.css('.css-1nd7dqm')
+    newest_topic = tree.css('.ExploreSpecialCard-contentTitle')
+    discussion = tree.css('.ExploreRoundtableCard-questionTitle')
+    collection_card = tree.css('.ExploreCollectionCard-contentTitle')
+
+    for post in chain(hot_question, collection_card, discussion, newest_topic):
+        title = post.css('a::text').extract_first()
+        link = post.css('a::attr(href)').extract_first()
+        if not link:
+            continue
+
+        # urljoin keeps absolute URLs as they are and also resolves
+        # root-relative ("/question/1") and protocol-relative
+        # ("//zhuanlan.zhihu.com/p/1") hrefs against the explore page.
+        link = urljoin(r_url, link)
+
+        if link.startswith('https://www.zhihu.com/question/'):
+            # Questions are expanded after the loop, once per distinct link.
+            channel[link] = ZhihuQuestion(link, title=title)
+        elif link.startswith('https://zhuanlan.zhihu.com/p'):
+            item = ZhihuZhuanlanArticle(link)
+            item.get()
+            items[link] = item
+        elif link.startswith('https://www.zhihu.com/answer/'):
+            item = ZhihuAnswer(link)
+            item.get()
+            items[link] = item
+        else:
+            items[link] = {
+                'title': title,
+                'link': link,
+                'description': title,
+            }
+
+    for c in channel.values():
+        c.get_description()
+        for i in c.items:
+            items[i.link] = i
+
+    return {
+        'title': '发现 - 知乎',
+        'link': r_url,
+        'items': list(items.values()),
+    }
diff --git a/rsshub/templates/main/feeds.html b/rsshub/templates/main/feeds.html
index 6980caf..38cd66e 100644
--- a/rsshub/templates/main/feeds.html
+++ b/rsshub/templates/main/feeds.html
@@ -624,5 +624,22 @@
+ +
+
+

知乎

+ +
Explore by https://github.com/JeffersonYoung
+

举例:https://pyrsshub.vercel.app/zhihu/explore

+

路由:/zhihu/explore

+ +
问题 by https://github.com/JeffersonYoung
+

举例:https://pyrsshub.vercel.app/zhihu/question/509768617

+

路由:/zhihu/question/:qid

+

qid [必填, 问题id,如例所示,和网站url上的id相同]

+
+
+
+ {% endblock content %} \ No newline at end of file