-rw-r--r-- | parse-deals.py | 93
1 file changed, 93 insertions, 0 deletions
diff --git a/parse-deals.py b/parse-deals.py
new file mode 100644
index 0000000..f545d90
--- /dev/null
+++ b/parse-deals.py
@@ -0,0 +1,93 @@
+import re
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
+from django.db import models
+
+
+def get_article_text(url):
+    # Spoof a desktop browser so the request is less likely to be blocked.
+    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
+                             '(KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36'}
+    return requests.get(url, headers=headers)
+
+
+def parse_article(url):
+    # Walk every Amazon link in the article body and compare the price
+    # quoted in the link text against Amazon's current buy-box price.
+    article = get_article_text(url)
+    soup = BeautifulSoup(article.text, "lxml")
+    for div in soup.find_all("div", class_="article__body"):
+        for a in div.find_all('a', href=True):
+            domain = urlparse(a['href'])
+            if domain.netloc not in ('www.amazon.com', 'amzn.to'):
+                continue
+            link_text = a.get_text()
+            if "$" not in link_text:
+                print(link_text, "- link has no price")
+                continue
+            # Keep only digits and the decimal point, e.g. "$1,299.99" -> "1299.99".
+            w_price = re.sub(r"[^\d.]", "", link_text.split("$")[1].strip())
+            try:
+                amzn_r = get_article_text(a['href'])
+            except requests.RequestException as e:
+                print("request failed:", e)
+                continue
+            print(amzn_r.status_code)
+            if amzn_r.status_code == 404:
+                print(link_text, "is a 404")
+            else:
+                amzn_soup = BeautifulSoup(amzn_r.text, "lxml")
+                amzn_price = amzn_soup.find("span", {"id": "newBuyBoxPrice"})
+                print(w_price, amzn_price)
+
+
+# Django models for scraped articles and deals (these would normally live in an app's models.py).
+class Article(models.Model):
+    title = models.CharField(max_length=200, blank=True)
+    url = models.CharField(max_length=200)
+    pub_date = models.DateTimeField('Date published', blank=True)
+
+    class Meta:
+        ordering = ('-pub_date',)
+        get_latest_by = 'pub_date'
+
+    def __str__(self):
+        return self.title
+
+    def save(self, *args, **kwargs):
+        # Scrape the article for deals before persisting it.
+        parse_article(self.url)
+        super().save(*args, **kwargs)
+
+
+class Deal(models.Model):
+    deal_title = models.CharField(max_length=200)
+    deal_price = models.FloatField()  # max_length has no effect on FloatField
+    deal_url = models.CharField(max_length=200)
+    store_price = models.FloatField()
+    # Gives the Meta ordering/get_latest_by below a real field to point at.
+    pub_date = models.DateTimeField('Date published', auto_now_add=True)
+
+    class Meta:
+        ordering = ('-pub_date',)
+        get_latest_by = 'pub_date'
+
+    def __str__(self):
+        return self.deal_title
+
+
+subid = "weekenddeals06132020"
+page_url = "https://www.wired.com/story/weekend-deals-june-12-2020"
+# Collect the page's affiliate links into one CSV row per link for Impact.
+# (The output filename is a placeholder.)
+soup = BeautifulSoup(get_article_text(page_url).text, "lxml")
+with open("impact-links.csv", "w") as result:
+    for a in soup.find_all('a', href=True):
+        if '//' not in a['href']:
+            continue
+        # Affiliate redirect hosts start with "best" or "goto" after the scheme.
+        start = a['href'].split('//')[1][:4]
+        if start in ('best', 'goto'):
+            result.write("%s,%s,%s,Impact,%s\n" % (page_url, subid, a.contents[0], a['href']))