authorluxagraf <sng@luxagraf.net>2020-10-08 15:44:44 -0400
committerluxagraf <sng@luxagraf.net>2020-10-08 15:44:44 -0400
commit96d5b52372d39be7d6588ec44f32260d43c38648 (patch)
treea2fe16de5ae07b7f2cb535d5b574d749338bc993
parente97f4a37022a5745dd0cbb58f93bf09f10b22889 (diff)
added parse deals so I can test from server
-rw-r--r--  parse-deals.py | 82
1 file changed, 82 insertions(+), 0 deletions(-)
diff --git a/parse-deals.py b/parse-deals.py
new file mode 100644
index 0000000..f545d90
--- /dev/null
+++ b/parse-deals.py
@@ -0,0 +1,82 @@
+import re
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+# The Article/Deal models below assume this file is imported inside a
+# configured Django project (or after django.setup() has been called).
+from django.db import models
+
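+# Fetch the raw article HTML. A desktop Chrome User-Agent is sent because
+# the default python-requests UA appears to get blocked; the exact string
+# doesn't matter, only that it looks like a browser.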
+def get_article_text(url):
+ headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.57 Safari/537.36'}
+ return requests.get(url, headers=headers)
+
+
+def parse_article(url):
+    """Find Amazon links in an article body and compare the price quoted
+    in the link text against the current Amazon buy box price."""
+    article = get_article_text(url)
+    soup = BeautifulSoup(article.text, "lxml")
+    for div in soup.find_all("div", class_="article__body"):
+        for a in div.find_all('a', href=True):
+            domain = urlparse(a['href'])
+            if domain.netloc not in ('www.amazon.com', 'amzn.to'):
+                continue
+            try:
+                # Link text looks like "Product Name ($99)"; grab what
+                # follows the dollar sign.
+                w_price = a.get_text().split("$")[1]
+            except IndexError:
+                print(a.get_text(), "link has no price")
+                continue
+            # Keep only digits and the decimal point.
+            w_price = re.sub(r"[^\d.]", "", w_price.strip())
+            amzn_url = a['href']
+            try:
+                amzn_r = requests.get(amzn_url)
+            except requests.RequestException as e:
+                print("request for", amzn_url, "failed:", e)
+                continue
+            print(amzn_r.status_code)
+            if amzn_r.status_code == 404:
+                print(a.get_text(), "is a 404")
+            else:
+                amzn_soup = BeautifulSoup(amzn_r.text, "lxml")
+                amzn_price = amzn_soup.find("span", {"id": "newBuyBoxPrice"})
+                print(w_price, amzn_price)
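+
+# Example of running the parser directly (prints each scraped price next
+# to Amazon's current buy box price; nothing is persisted yet):
+#
+#   parse_article("https://www.wired.com/story/weekend-deals-june-12-2020")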
+
+class Article(models.Model):
+ title = models.CharField(max_length=200, blank=True)
+ url = models.CharField(max_length=200)
+ pub_date = models.DateTimeField('Date published', blank=True)
+
+ class Meta:
+ ordering = ('-pub_date',)
+ get_latest_by = 'pub_date'
+
+ def __str__(self):
+ return self.title
+
+    def save(self, *args, **kwargs):
+        # Re-parse the article for deal links every time it is saved.
+        parse_article(self.url)
+        super().save(*args, **kwargs)
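+
+# Creating an Article triggers the parse, e.g. (sketch; assumes a working
+# Django setup and `from django.utils import timezone`):
+#
+#   Article.objects.create(title="Weekend Deals",
+#                          url="https://www.wired.com/story/weekend-deals-june-12-2020",
+#                          pub_date=timezone.now())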
+
+class Deal(models.Model):
+    deal_title = models.CharField(max_length=200)
+    # Price quoted in the article's link text.
+    deal_price = models.FloatField()
+    deal_url = models.CharField(max_length=200)
+    # Current price scraped from the Amazon buy box.
+    store_price = models.FloatField()
+    # Meta orders on pub_date, so the model needs the field.
+    pub_date = models.DateTimeField('Date published', auto_now_add=True)
+
+    class Meta:
+        ordering = ('-pub_date',)
+        get_latest_by = 'pub_date'
+
+    def __str__(self):
+        return self.deal_title
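+# Note: parse_article() only prints prices for now; presumably each match
+# will eventually become a Deal row (deal_price from the article text,
+# store_price from Amazon).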
+
+
+if __name__ == "__main__":
+    # Build an Impact affiliate CSV for one article's outbound links.
+    subid = "weekenddeals06132020"
+    page_url = "https://www.wired.com/story/weekend-deals-june-12-2020"
+    soup = BeautifulSoup(get_article_text(page_url).text, "lxml")
+    # "result" was never opened in the original; the CSV filename is assumed.
+    with open("impact-links.csv", "w") as result:
+        for a in soup.find_all('a', href=True):
+            # Impact tracking links start with "best..." or "goto..."
+            # right after the scheme.
+            start = a['href'].split('//')[-1][:4]
+            if start in ('best', 'goto'):
+                row = "%s,%s,%s,Impact,%s\n" % (page_url, subid, a.contents[0], a['href'])
+                result.write(row)
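+# To test from the server (per the commit message), something like:
+#
+#   python parse-deals.py
+#
+# should fetch the page and write impact-links.csv next to the script.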
+