-rw-r--r--  parse-deals.py | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)
diff --git a/parse-deals.py b/parse-deals.py
index f545d90..8a3db43 100644
--- a/parse-deals.py
+++ b/parse-deals.py
@@ -4,13 +4,13 @@
 import markdown
 import requests
 from urllib.parse import urlparse
 
-def get_article_text(url):
+def scrap_url(url):
     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36'}
     return requests.get(url, headers=headers)
 
 def parse_article(url):
-    article = get_article_text(url)
+    article = scrap_url(url)
     soup = BeautifulSoup(article.text, "lxml")
     for div in soup.find_all("div", class_="article__body"):
         for a in div.find_all('a'):
@@ -22,14 +22,13 @@ def parse_article(url):
             w_price = w_price.strip()
             w_price = re.sub("[^\d\.]", "", w_price)
             amzn_url = a['href']
-            amzn_r = requests.get(amzn_url)
-            print(amzn_r.status_code)
+            amzn_r = scrap_url(amzn_url)
             if amzn_r.status_code == 404:
                 print(a.get_text(), "is a 404")
             else:
-                amzn_soup = BeautifulSoup(amzn_r.text, "lxml")
-                amzn_price = amzn_soup.find("span", {"id": "newBuyBoxPrice"})
-                print(w_price, amzn_price)
+                amzn_soup = BeautifulSoup(amzn_r.content, "lxml")
+                price = soup.find(id="priceblock_ourprice")
+                print(price)
         except:
             print("wrong")
     except:
@@ -67,7 +66,20 @@ class Deal(models.Model):
 
 def get_article_text(url):
     return requests.get(url)
 
+def getAmazonPrice(productUrl):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
+    }
+    res = requests.get(productUrl, headers=headers)
+    res.raise_for_status()
+
+
+    soup = BeautifulSoup(res.text, 'html.parser')
+    amzn_price = soup.find("span", {"id": "newBuyBoxPrice"})
+    return soup
+
+price = getAmazonPrice('http://www.amazon.com/Automate-Boring-Stuff-Python-Programming/dp/1593275994/ref=tmm_pap_swatch_0?_encoding=UTF8&qid=&sr=')
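
Note: as committed, the new getAmazonPrice() looks up amzn_price but returns the whole soup, and the parse_article() branch reads the price from the article's soup rather than the Amazon page's amzn_soup. Below is a minimal sketch of what the helper appears to be aiming for, assuming Amazon still serves the priceblock_ourprice / newBuyBoxPrice elements referenced in this diff; the name get_amazon_price and the fallback loop are illustrative additions, not part of the commit.

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'),
}

def get_amazon_price(product_url):
    """Return the listed price string for an Amazon product page, or None."""
    res = requests.get(product_url, headers=HEADERS)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')
    # Try both price containers referenced in the diff; Amazon's markup
    # changes often, so treat these ids as assumptions.
    for element_id in ('priceblock_ourprice', 'newBuyBoxPrice'):
        tag = soup.find(id=element_id)
        if tag is not None:
            return tag.get_text(strip=True)
    return None

if __name__ == '__main__':
    url = ('http://www.amazon.com/Automate-Boring-Stuff-Python-Programming/'
           'dp/1593275994/')
    print(get_amazon_price(url))

Returning the extracted price text (instead of the soup) would also let parse_article() compare it against w_price directly, which seems to be the intent of the surrounding code.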