import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from django.db import models

# A desktop User-Agent keeps Wired and Amazon from rejecting the requests.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36'}


def get_article_text(url):
    return requests.get(url, headers=HEADERS)


def parse_article(url):
    article = get_article_text(url)
    soup = BeautifulSoup(article.text, "lxml")
    for div in soup.find_all("div", class_="article__body"):
        for a in div.find_all('a'):
            domain = urlparse(a.get('href', ''))
            if domain.netloc in ('www.amazon.com', 'amzn.to'):
                try:
                    # The price quoted in the article, e.g. "Echo Dot ($25)".
                    w_price = a.get_text().split("$")[1]
                except IndexError:
                    w_price = "link has no price"
                    continue
                try:
                    w_price = re.sub(r"[^\d.]", "", w_price.strip())
                    amzn_url = a['href']
                    amzn_r = requests.get(amzn_url, headers=HEADERS)
                    print(amzn_r.status_code)
                    if amzn_r.status_code == 404:
                        print(a.get_text(), "is a 404")
                    else:
                        amzn_soup = BeautifulSoup(amzn_r.text, "lxml")
                        # Amazon's current buy-box price lives in this span.
                        amzn_price = amzn_soup.find("span", {"id": "newBuyBoxPrice"})
                        print(w_price, amzn_price)
                except Exception as exc:
                    print("wrong", exc)


class Article(models.Model):
    title = models.CharField(max_length=200, blank=True)
    url = models.CharField(max_length=200)
    # null=True is needed alongside blank=True, otherwise saving an
    # Article without a date raises an IntegrityError.
    pub_date = models.DateTimeField('Date published', blank=True, null=True)

    class Meta:
        ordering = ('-pub_date',)
        get_latest_by = 'pub_date'

    def __str__(self):
        return self.title

    def save(self, *args, **kwargs):
        # Scrape the article for Amazon deals before persisting it.
        parse_article(self.url)
        super().save(*args, **kwargs)


class Deal(models.Model):
    deal_title = models.CharField(max_length=200)
    deal_price = models.FloatField()
    deal_url = models.CharField(max_length=200)
    store_price = models.FloatField()
    # Assumed field: Meta below orders and selects by pub_date, so the
    # model needs one.
    pub_date = models.DateTimeField('Date published', auto_now_add=True)

    class Meta:
        ordering = ('-pub_date',)
        get_latest_by = 'pub_date'

    def __str__(self):
        return self.deal_title


# Build a CSV of the article's affiliate links for the Impact network.
subid = "weekenddeals06132020"
page_url = "https://www.wired.com/story/weekend-deals-june-12-2020"

soup = BeautifulSoup(get_article_text(page_url).text, "lxml")
# Output filename is an assumption; the script needs the file opened
# before it can write to it.
with open("impact_links.csv", "a") as result:
    for a in soup.find_all('a'):
        href = a.get('href', '')
        if '//' not in href:
            continue  # skip relative links, which have no scheme separator
        # First four characters of the host, e.g. "best..." or "goto...".
        start = href.split('//')[1][:4]
        if start in ('best', 'goto'):
            l = "%s,%s,%s,Impact,%s\n" % (page_url, subid, a.contents[0], href)
            result.write(l)
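
# parse_article() above only prints the Wired price next to Amazon's
# buy-box span, and the Deal model is never populated. A minimal sketch
# of how the two could be wired together, assuming parse_article() were
# refactored to return (title, wired_price, amazon_price_tag, url)
# tuples instead of printing; record_deals() and that return shape are
# illustrative, not part of the original code.
def record_deals(article):
    for title, w_price, amzn_tag, url in parse_article(article.url):
        if amzn_tag is None:
            continue  # no buy-box price found on the Amazon page
        # amzn_tag is a bs4 Tag; strip it down to a number the same way
        # the article price was cleaned.
        store = float(re.sub(r"[^\d.]", "", amzn_tag.get_text()))
        Deal.objects.create(
            deal_title=title,
            deal_price=float(w_price),  # price quoted in the article
            deal_url=url,
            store_price=store,          # current Amazon buy-box price
        )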