Diffstat (limited to 'parse-deals.py')
-rw-r--r-- | parse-deals.py | 94 |
1 file changed, 0 insertions, 94 deletions
diff --git a/parse-deals.py b/parse-deals.py
deleted file mode 100644
index 8a3db43..0000000
--- a/parse-deals.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import re
-from bs4 import BeautifulSoup
-import markdown
-import requests
-from urllib.parse import urlparse
-
-def scrap_url(url):
-    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36'}
-    return requests.get(url, headers=headers)
-
-
-def parse_article(url):
-    article = scrap_url(url)
-    soup = BeautifulSoup(article.text, "lxml")
-    for div in soup.find_all("div", class_="article__body"):
-        for a in div.find_all('a'):
-            domain = urlparse(a['href'])
-            if domain.netloc in ('www.amazon.com', 'amzn.to'):
-                try:
-                    w_price = a.get_text().split("$")[1]
-                    try:
-                        w_price = w_price.strip()
-                        w_price = re.sub("[^\d\.]", "", w_price)
-                        amzn_url = a['href']
-                        amzn_r = scrap_url(amzn_url)
-                        if amzn_r.status_code == 404:
-                            print(a.get_text(), "is a 404")
-                        else:
-                            amzn_soup = BeautifulSoup(amzn_r.content, "lxml")
-                            price = soup.find(id="priceblock_ourprice")
-                            print(price)
-                    except:
-                        print("wrong")
-                except:
-                    w_price = "link has no price"
-
-class Article(models.Model):
-    title = models.CharField(max_length=200, blank=True)
-    url = models.CharField(max_length=200)
-    pub_date = models.DateTimeField('Date published', blank=True)
-
-    class Meta:
-        ordering = ('-pub_date',)
-        get_latest_by = 'pub_date'
-
-    def __str__(self):
-        return self.title
-
-    def save(self, *args, **kwargs):
-        parse_article(self.url)
-        super(Article, self).save(*args, **kwargs)
-
-class Deal(models.Model):
-    deal_title = models.CharField(max_length=200)
-    deal_price = models.FloatField(max_length=200)
-    deal_url = models.CharField(max_length=200)
-    store_price = models.FloatField(max_length=200)
-
-    class Meta:
-        ordering = ('-pub_date',)
-        get_latest_by = 'pub_date'
-
-    def __str__(self):
-        return self.item_title
-
-def get_article_text(url):
-    return requests.get(url)
-
-def getAmazonPrice(productUrl):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
-    }
-    res = requests.get(productUrl, headers=headers)
-    res.raise_for_status()
-
-
-    soup = BeautifulSoup(res.text, 'html.parser')
-    amzn_price = soup.find("span", {"id": "newBuyBoxPrice"})
-    return soup
-
-
-price = getAmazonPrice('http://www.amazon.com/Automate-Boring-Stuff-Python-Programming/dp/1593275994/ref=tmm_pap_swatch_0?_encoding=UTF8&qid=&sr=')
-
-
-
-subid = "weekenddeals06132020"
-page_url = "https://www.wired.com/story/weekend-deals-june-12-2020"
-for a in soup.find_all('a'):
-    start = a['href'].split('//')[1][:4]
-    if str(start) == 'best' or start == 'goto':
-        l = "%s,%s,%s,Impact,%s\n" % (page_url, subid, a.contents[0], a['href'])
-        result.write(l)
-result.close()
-
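For reference, here is a minimal sketch of the price-lookup approach the deleted script took: fetch the product page with a browser-like User-Agent via requests, parse it with BeautifulSoup, and read a price element by id. The element ids and the clean-up regex are taken from the deleted file, not from a documented Amazon API; Amazon's markup changes frequently, so they are illustrative assumptions rather than guaranteed selectors.

import re

import requests
from bs4 import BeautifulSoup

HEADERS = {
    # Browser-like User-Agent, as in the deleted script; without it Amazon
    # tends to return a captcha or error page instead of the product page.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36',
}

def get_amazon_price(product_url):
    """Return the listed price as a float, or None when no price element is found."""
    res = requests.get(product_url, headers=HEADERS)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Ids the deleted file looked for; both are assumptions about Amazon's current markup.
    for element_id in ('priceblock_ourprice', 'newBuyBoxPrice'):
        tag = soup.find(id=element_id)
        if tag is not None:
            # Strip currency symbols and separators, e.g. "$1,234.56" -> 1234.56.
            text = re.sub(r"[^\d.]", "", tag.get_text())
            return float(text) if text else None
    return None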