From 8f129e9c00465a82ff09ab3f63774ebf88ff2b23 Mon Sep 17 00:00:00 2001 From: hillerliao Date: Sat, 14 Mar 2020 14:58:38 +0800 Subject: [PATCH] fix none to businesswire item link --- rsshub/spiders/earningsdate/businesswire.py | 4 ++-- rsshub/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rsshub/spiders/earningsdate/businesswire.py b/rsshub/spiders/earningsdate/businesswire.py index 9ab24d5..26657a6 100644 --- a/rsshub/spiders/earningsdate/businesswire.py +++ b/rsshub/spiders/earningsdate/businesswire.py @@ -7,8 +7,8 @@ domain = 'businesswire.com' def parse(post): item = {} item['title'] = post.css('title::text').extract_first().strip() - item['description'] = post.css('description::text').extract_first().strip(']]>') - item['link'] = post.css('link::text').extract_first() + item['description'] = post.css('description::text').extract_first() + item['link'] = post.extract().split(' ')[-2].split('>')[-1].strip() item['pubDate'] = post.css('pubDate::text').extract_first() return item diff --git a/rsshub/utils.py b/rsshub/utils.py index 614393e..3f6803b 100644 --- a/rsshub/utils.py +++ b/rsshub/utils.py @@ -28,7 +28,7 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None): def filter_content(items): content = [] p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE) - p2 = re.compile(r'(.*)(schedule|announce|to) (.*)call', re.IGNORECASE) + p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE) p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE) for item in items: