From 2fff755756f188d6a0a52e6ddb3abbf1224ef4ba Mon Sep 17 00:00:00 2001
From: jefferson
Date: Tue, 8 Aug 2023 14:20:56 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=9F=A5=E4=B9=8E=20(www.zhi?=
 =?UTF-8?q?hu.com)=20=E7=9A=84=E4=B8=A4=E4=B8=AA=E9=A2=91=E9=81=93?=
 =?UTF-8?q?=EF=BC=9A=20=E5=9C=86=E6=A1=8C=E5=92=8C=E6=94=B6=E8=97=8F?=
 =?UTF-8?q?=E5=A4=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Notes (text between the three-dash line and the first diff is not part of
the commit message and is ignored when the patch is applied):

Subject, decoded and translated: "Add two Zhihu (www.zhihu.com) channels:
roundtable and collection (favorites folder)".

The line structure of this patch was rebuilt from a copy whose newlines and
indentation had been collapsed. Hunk line counts match the @@ headers, but
the indentation of context lines is inferred; if a hunk is rejected, retry
with `git apply --ignore-whitespace`.

Review points on the change itself (code below is left as committed):
 * collection.py, ctx(): `assert False` guards the unknown-item-type branch.
   Under `python -O` the assert is removed, so `item` is either unbound or
   still refers to the previous loop iteration's entry. An explicit `raise`
   (or skipping the entry) would be safer.
 * collection.py / roundtable.py: `requests.get(...)` is called without a
   `timeout`, so a stalled connection blocks the feed indefinitely.
 * collection.py: only the first page is fetched (`limit=20&offset=0`).
 * article.py: `datetime.fromtimestamp(...)` yields naive local-time values;
   the import of `datetime` is not visible in these hunks -- confirm it
   exists in the file.
 * explore.py: `newest_topic` is dropped from `chain(...)` by commenting it
   out inline rather than deleting it.
 * roundtable.py: the feed title is the fixed string 'roundtable' and no
   'link' key is returned, unlike collection.py.

 rsshub/spiders/zhihu/article.py    | 13 +++++++++
 rsshub/spiders/zhihu/collection.py | 44 ++++++++++++++++++++++++++++++
 rsshub/spiders/zhihu/explore.py    |  2 +-
 rsshub/spiders/zhihu/roundtable.py | 23 ++++++++++++++++
 4 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 rsshub/spiders/zhihu/collection.py
 create mode 100644 rsshub/spiders/zhihu/roundtable.py

diff --git a/rsshub/spiders/zhihu/article.py b/rsshub/spiders/zhihu/article.py
index 2d766da..3527012 100644
--- a/rsshub/spiders/zhihu/article.py
+++ b/rsshub/spiders/zhihu/article.py
@@ -40,6 +40,10 @@ class ZhihuAnswer(AtomEntry):
         self.content = zhihu_figure_transfer(tree.css('.RichText').get())
         self.description = self.content
 
+        # author
+
+        self.author = json.loads(tree.xpath('//div[@class="ContentItem AnswerItem"]/@data-zop').get())['authorName']
+
         meta: dict = get_value(json.loads(tree.css("#js-initialData::text").get())
                                ['initialState']['entities']['questions'])
 
@@ -51,9 +55,18 @@ class ZhihuZhuanlanArticle(AtomEntry):
     def get(self):
         tree = fetch(self.link)
         self.title = tree.css('h1::text').get()
+        author = tree.xpath('//meta[@itemProp="name"]/@content').get()
+        if author:
+            self.author = author
         self.content = zhihu_figure_transfer(tree.css('article').css('.RichText').get())
         self.description = self.content
 
+        #
+        data = json.loads(tree.css("#js-initialData::text").get())
+        metadata = list(data['initialState']['entities']['articles'].values())[0]
+        self.pubDate = datetime.fromtimestamp(metadata['created'])
+        self.updated_time = datetime.fromtimestamp(metadata['updated'])
+
 
 class ZhihuQuestion(Feed):
 
diff --git a/rsshub/spiders/zhihu/collection.py b/rsshub/spiders/zhihu/collection.py
new file mode 100644
index 0000000..f6e5bad
--- /dev/null
+++ b/rsshub/spiders/zhihu/collection.py
@@ -0,0 +1,44 @@
+import json
+import requests
+
+from .article import ZhihuAnswer, ZhihuZhuanlanArticle
+
+
+
+def get_metadata(collection_id):
+    response = requests.get(f'https://api.zhihu.com/collections/{collection_id}')
+    response.raise_for_status()
+    data = json.loads(response.text)['collection']
+
+    metadata = dict()
+    metadata['link'] = data['url']
+    metadata['title'] = data['title']
+    # metadata['created_time'] = data['created_time']
+    # metadata['updated_time'] = data['updated_time']
+    return metadata
+
+def ctx(collection_id):
+
+    # meta
+    metadata = get_metadata(collection_id)
+
+    # content
+
+    response = requests.get(f'https://www.zhihu.com/api/v4/collections/{collection_id}/items?limit=20&offset=0')
+    response.raise_for_status()
+    data = json.loads(response.text)
+    items = []
+
+    for d in data['data']:
+        if d['content']['type'] == 'answer':
+            item = ZhihuAnswer(d['content']['url'])
+        elif d['content']['type'] == 'article':
+            item = ZhihuZhuanlanArticle(d['content']['url'])
+        else:
+            assert False
+        item.get()
+        items.append(item)
+
+    metadata['items'] = items
+
+    return metadata
diff --git a/rsshub/spiders/zhihu/explore.py b/rsshub/spiders/zhihu/explore.py
index 542a81c..9242e82 100644
--- a/rsshub/spiders/zhihu/explore.py
+++ b/rsshub/spiders/zhihu/explore.py
@@ -14,7 +14,7 @@ def ctx():
     discussion = tree.css('.ExploreRoundtableCard-questionTitle')
     collection_card = tree.css('.ExploreCollectionCard-contentTitle')
 
-    for post in chain(hot_question, collection_card, discussion, newest_topic):
+    for post in chain(hot_question, collection_card, discussion): #, newest_topic):
         title = post.css('a::text').extract_first()
         link: str = post.css('a::attr(href)').extract_first()
 
diff --git a/rsshub/spiders/zhihu/roundtable.py b/rsshub/spiders/zhihu/roundtable.py
new file mode 100644
index 0000000..8e22282
--- /dev/null
+++ b/rsshub/spiders/zhihu/roundtable.py
@@ -0,0 +1,23 @@
+import json
+import requests
+
+from .article import ZhihuQuestion
+
+def ctx(name):
+    url = f'https://www.zhihu.com/api/v4/roundtables/{name}/hot-questions?include=data[*].question.relationship'
+    response = requests.get(url)
+    response.raise_for_status()
+
+    data = json.loads(response.text)
+    items = []
+
+    for d in data['data']:
+        item = ZhihuQuestion(f'https://www.zhihu.com/question/{d["question"]["id"]}')
+        item.get_description()
+
+        items.append(item)
+
+    return {
+        'title': 'roundtable',
+        'items': items
+    }