summaryrefslogtreecommitdiff
path: root/parse-deals.py
diff options
context:
space:
mode:
Diffstat (limited to 'parse-deals.py')
-rw-r--r--parse-deals.py94
1 file changed, 0 insertions, 94 deletions
diff --git a/parse-deals.py b/parse-deals.py
deleted file mode 100644
index 8a3db43..0000000
--- a/parse-deals.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import re
-from bs4 import BeautifulSoup
-import markdown
-import requests
-from urllib.parse import urlparse
-
def scrap_url(url):
    """GET *url* pretending to be a desktop browser and return the Response.

    The browser User-Agent avoids naive bot blocking on the scraped sites.
    The caller is responsible for checking status_code / content.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36'}
    # BUG FIX: without a timeout a stalled server hangs the scraper forever;
    # requests never times out by default.
    return requests.get(url, headers=headers, timeout=10)
-
-
def parse_article(url):
    """Fetch *url* and inspect the Amazon links inside its article body.

    For every <a> in a div.article__body whose host is www.amazon.com or
    amzn.to, extract the "$..." price shown in the link text, fetch the
    Amazon page, and print its current price element. Results are only
    printed; nothing is returned.
    """
    article = scrap_url(url)
    soup = BeautifulSoup(article.text, "lxml")
    for div in soup.find_all("div", class_="article__body"):
        for a in div.find_all('a'):
            domain = urlparse(a['href'])
            if domain.netloc not in ('www.amazon.com', 'amzn.to'):
                continue
            try:
                w_price = a.get_text().split("$")[1]
            except IndexError:
                # link text carries no "$<price>" — nothing to compare
                w_price = "link has no price"
                continue
            try:
                # keep only digits and the decimal point, e.g. "$1,299.99 " -> "1299.99"
                w_price = re.sub(r"[^\d.]", "", w_price.strip())
                amzn_url = a['href']
                amzn_r = scrap_url(amzn_url)
                if amzn_r.status_code == 404:
                    print(a.get_text(), "is a 404")
                else:
                    amzn_soup = BeautifulSoup(amzn_r.content, "lxml")
                    # BUG FIX: the original searched the *article* soup here,
                    # so the freshly parsed Amazon page was never used.
                    price = amzn_soup.find(id="priceblock_ourprice")
                    print(price)
            except requests.RequestException:
                # narrow the original bare `except:` to network failures only
                print("wrong")
-
class Article(models.Model):
    """A scraped article; its linked deals are re-parsed every save."""

    title = models.CharField(max_length=200, blank=True)
    url = models.CharField(max_length=200)
    pub_date = models.DateTimeField('Date published', blank=True)

    class Meta:
        # newest articles first; `latest()` keys on publication date
        ordering = ('-pub_date',)
        get_latest_by = 'pub_date'

    def __str__(self):
        return self.title

    def save(self, *args, **kwargs):
        # Extract deal links from the article page before persisting.
        parse_article(self.url)
        super().save(*args, **kwargs)
-
class Deal(models.Model):
    """An Amazon deal found in an article, with the quoted vs. store price."""

    deal_title = models.CharField(max_length=200)
    deal_price = models.FloatField(max_length=200)
    deal_url = models.CharField(max_length=200)
    store_price = models.FloatField(max_length=200)

    class Meta:
        # BUG FIX: the original ordered/keyed on 'pub_date', a field that
        # exists on Article but not on Deal — Django's system checks reject
        # Meta.ordering referencing a nonexistent field.
        ordering = ('deal_title',)

    def __str__(self):
        # BUG FIX: there is no `item_title` field on this model; the title
        # field is `deal_title`, so the original raised AttributeError.
        return self.deal_title
-
def get_article_text(url):
    """Plain GET of *url* (no custom headers); returns the raw Response."""
    response = requests.get(url)
    return response
-
def getAmazonPrice(productUrl):
    """Fetch an Amazon product page and return its buy-box price element.

    Returns the <span id="newBuyBoxPrice"> tag, or None when the page has
    no such element (layout change, captcha interstitial, ...).
    Raises requests.HTTPError on a non-2xx response.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    # timeout so a stalled server cannot hang the caller (requests has none by default)
    res = requests.get(productUrl, headers=headers, timeout=10)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')
    # BUG FIX: the original located the price span, discarded it, and
    # returned the whole soup — return the price element the name promises.
    return soup.find("span", {"id": "newBuyBoxPrice"})
-
-
# Sample run: fetch one Amazon product page at import time.
price = getAmazonPrice('http://www.amazon.com/Automate-Boring-Stuff-Python-Programming/dp/1593275994/ref=tmm_pap_swatch_0?_encoding=UTF8&qid=&sr=')



# NOTE(review): the loop below references `soup` and `result`, neither of
# which is defined at module level in this file (`soup` is local to
# parse_article, `result` appears nowhere). As written this raises
# NameError — confirm whether a page fetch and an open() for `result`
# were lost in an earlier edit.
subid = "weekenddeals06132020"
page_url = "https://www.wired.com/story/weekend-deals-june-12-2020"
for a in soup.find_all('a'):
    # first 4 chars of the part after "//" — matches hosts starting "best"/"goto"
    start = a['href'].split('//')[1][:4]
    if str(start) == 'best' or start == 'goto':
        # CSV row: page URL, sub-ID, link text, network name, affiliate URL
        l = "%s,%s,%s,Impact,%s\n" % (page_url, subid, a.contents[0], a['href'])
        result.write(l)
result.close()
-