Try to bypass Cloudflare on Product Hunt (ph)

2026-05-14 23:16:50 +00:00 · 2022-12-14 21:44:10 +08:00
parent ff2940092e
commit 3064d5a18e
3 changed files with 277 additions and 177 deletions
--- a/rsshub/spiders/producthunt/search.py
+++ b/rsshub/spiders/producthunt/search.py
@@ -1,6 +1,9 @@
 import re
-import requests
-from rsshub.utils import fetch
+import json
+
+from bs4 import BeautifulSoup    
+import undetected_chromedriver as uc
+
 from rsshub.utils import DEFAULT_HEADERS

 domain = 'https://www.producthunt.com'
@@ -8,24 +11,33 @@ domain = 'https://www.producthunt.com'

 def parse(post):
    item = {}
-    item['title'] = post.css('a[class*="styles_title__"]::text').extract_first()
-    item['description'] = post.css('a[class*="styles_tagline__"]::text').extract_first()
-    item['link'] = domain + post.css('a::attr(href)').extract_first()
+    item['title'] = post['name']
+    item['description'] = post['tagline']
+    item['link'] = post['url']
    return item 


 def ctx(keyword='', period=''):
    DEFAULT_HEADERS.update({'Referer': domain}) 
-    r_url = f'{domain}' + f'/search?q={keyword}&postedAfter={period}:days'
-    tree = fetch(r_url,headers=DEFAULT_HEADERS)
-    posts = tree.css('.style_layoutMain___pXHk').css('.style_px-mobile-1__DSM5j.style_px-tablet-1__R5dkv.style_pt-mobile-0__lBXpV.style_pt-desktop-6__eNi8V.style_pt-tablet-6__BJU9d.style_pt-widescreen-6__SWPD_.style_pb-mobile-7__OX0Sz.style_pb-desktop-6__EZ3zm.style_pb-tablet-6__F61Qx.style_pb-widescreen-6__UB2pW')
-    print(posts)
+    r_url = f'{domain}' + f'/search?q={keyword}&postedAfter={period}:days'    
+    browser = uc.Chrome()
+    browser.get(r_url)
+    import time
+    time.sleep(3)
+    html = browser.page_source
+
+    soup = BeautifulSoup(html, 'html.parser')
+    script = soup.find('script', id='__NEXT_DATA__')
+    data = json.loads(script.text)['props']['apolloState']
+    browser.quit()
+    posts = [ v for k, v in data.items() if k.startswith('Product')] 
+    
    items = list(map(parse, posts))
-    items = [item for item in items if item['title']!=None]
+    
    return {
        'title': f'{keyword} - Producthunt',
        'link': r_url,
-        'description': f'Producthunt - {r_url}',
+        'description': f'{keyword} - Producthunt',
        'author': 'hillerliao',
        'items': items
-    }
+    }