Fix None value for businesswire item link

This commit is contained in:
hillerliao
2020-03-14 14:58:38 +08:00
parent e4734cb1d2
commit 8f129e9c00
2 changed files with 3 additions and 3 deletions

View File

@@ -7,8 +7,8 @@ domain = 'businesswire.com'
def parse(post):
    """Build a feed-item dict from one businesswire feed entry.

    Args:
        post: Object exposing ``css(query)`` (whose result has
            ``extract_first()``) and ``extract()`` returning the entry's
            serialized markup as a string.
            NOTE(review): presumably a scrapy/parsel selector for one RSS
            ``<item>`` -- confirm against the caller.

    Returns:
        dict with the keys ``title``, ``description``, ``link`` and
        ``pubDate``. ``description`` and ``pubDate`` are passed through
        unchanged and are ``None`` when the lookup finds nothing.

    Raises:
        AttributeError: if the entry has no title text.
        IndexError: if the serialized markup contains no space character.
    """
    item = {}
    item['title'] = post.css('title::text').extract_first().strip()
    # The description is deliberately left unprocessed: the earlier
    # ``.strip(']]>')`` call raised AttributeError whenever the lookup
    # returned None.
    item['description'] = post.css('description::text').extract_first()
    # The link is cut out of the serialized markup: take the second-to-last
    # space-separated chunk and keep whatever follows its last '>'.
    # NOTE(review): presumably needed because the 'link::text' lookup
    # yielded None for this feed; this depends on the exact layout of the
    # feed markup -- verify against a live entry.
    item['link'] = post.extract().split(' ')[-2].split('>')[-1].strip()
    item['pubDate'] = post.css('pubDate::text').extract_first()
    return item

View File

@@ -28,7 +28,7 @@ def fetch(url: str, headers: dict=DEFAULT_HEADERS, proxies: dict=None):
def filter_content(items):
content = []
p1 = re.compile(r'(.*)(to|will|date|schedule) (.*)results', re.IGNORECASE)
p2 = re.compile(r'(.*)(schedule|announce|to) (.*)call', re.IGNORECASE)
p2 = re.compile(r'(.*)(schedule|schedules|announce|to) (.*)call', re.IGNORECASE)
p3 = re.compile(r'(.*)release (.*)date', re.IGNORECASE)
for item in items: