from bs4 import BeautifulSoup def get_agent(source_feed): if source_feed.is_cloudflare: agent = random_user_agent() logging.error("using agent: {}".format(agent)) else: agent = "{user_agent} (+{server}; Updater; {subs} subscribers)".format(user_agent=settings.FEEDS_USER_AGENT, server=settings.FEEDS_SERVER, subs=source_feed.num_subs) return agent def random_user_agent(): return choice([ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)", "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", "Mozilla/5.0 (Linux; Android 5.0; SAMSUNG SM-N900 Build/LRX21V) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/2.1 Chrome/34.0.1847.76 Mobile Safari/537.36", "Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-G570Y Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/4.0 Chrome/44.0.2403.133 Mobile Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0" ]) def update_feeds(max_feeds=3, output=NullOutput()): todo = Source.objects.filter(Q(due_poll__lt = timezone.now()) & Q(live = True)) output.write("Queue size is {}".format(todo.count())) sources = todo.order_by("due_poll")[:max_feeds] output.write("\nProcessing %d\n\n" % sources.count()) for src in sources: read_feed(src, output) # kill shit proxies WebProxy.objects.filter(address='X').delete() def read_feed(source_feed, output=NullOutput()): old_interval = source_feed.interval was302 = False output.write("\n------------------------------\n") source_feed.last_polled = timezone.now() agent = get_agent(source_feed) headers = { "User-Agent": agent } #identify ourselves proxies = {} proxy = None feed_url = source_feed.feed_url if source_feed.is_cloudflare : # Fuck you ! if settings.FEEDS_CLOUDFLARE_WORKER: feed_url = "{}/read/?target={}".format(settings.FEEDS_CLOUDFLARE_WORKER, feed_url) else: try: proxy = get_proxy(output) if proxy.address != "X": proxies = { 'http': proxy.address, 'https': proxy.address, } except: pass if source_feed.etag: headers["If-None-Match"] = str(source_feed.etag) if source_feed.last_modified: headers["If-Modified-Since"] = str(source_feed.last_modified) output.write("\nFetching %s" % feed_url) ret = None try: ret = requests.get(feed_url, headers=headers, verify=False, allow_redirects=False, timeout=20, proxies=proxies) source_feed.status_code = ret.status_code source_feed.last_result = "Unhandled Case" output.write(str(ret)) except Exception as ex: source_feed.last_result = ("Fetch error:" + str(ex))[:255] source_feed.status_code = 0 output.write("\nFetch error: " + str(ex)) if proxy: source_feed.last_result = "Proxy failed. Next retry will use new proxy" source_feed.status_code = 1 # this will stop us increasing the interval output.write("\nBurning the proxy.") proxy.delete() source_feed.interval /= 2 if ret is None and source_feed.status_code == 1: # er ?? pass elif ret == None or source_feed.status_code == 0: source_feed.interval += 120 elif ret.status_code < 200 or ret.status_code >= 500: #errors, impossible return codes source_feed.interval += 120 source_feed.last_result = "Server error fetching feed (%d)" % ret.status_code elif ret.status_code == 404: #not found source_feed.interval += 120 source_feed.last_result = "The feed could not be found" elif ret.status_code == 403 or ret.status_code == 410: #Forbidden or gone if "Cloudflare" in ret.text or ("Server" in ret.headers and "cloudflare" in ret.headers["Server"]): if source_feed.is_cloudflare and proxy is not None: # we are already proxied - this proxy on cloudflare's shit list too? proxy.delete() output.write("\nProxy seemed to also be blocked, burning") source_feed.interval /= 2 source_feed.last_result = "Proxy kind of worked but still got cloudflared." else: source_feed.is_cloudflare = True source_feed.last_result = "Blocked by Cloudflare (grr)" else: source_feed.last_result = "Feed is no longer accessible." source_feed.live = False elif ret.status_code >= 400 and ret.status_code < 500: #treat as bad request source_feed.live = False source_feed.last_result = "Bad request (%d)" % ret.status_code elif ret.status_code == 304: #not modified source_feed.interval += 10 source_feed.last_result = "Not modified" source_feed.last_success = timezone.now() if source_feed.last_success and (timezone.now() - source_feed.last_success).days > 7: source_feed.last_result = "Clearing etag/last modified due to lack of changes" source_feed.etag = None source_feed.last_modified = None elif ret.status_code == 301 or ret.status_code == 308: #permenant redirect new_url = "" try: if "Location" in ret.headers: new_url = ret.headers["Location"] if new_url[0] == "/": #find the domain from the feed base = "/".join(source_feed.feed_url.split("/")[:3]) new_url = base + new_url source_feed.feed_url = new_url source_feed.last_result = "Moved" source_feed.save(update_fields=["feed_url", "last_result"]) else: source_feed.last_result = "Feed has moved but no location provided" except exception as Ex: output.write("\nError redirecting.") source_feed.last_result = ("Error redirecting feed to " + new_url)[:255] pass elif ret.status_code == 302 or ret.status_code == 303 or ret.status_code == 307: #Temporary redirect new_url = "" was302 = True try: new_url = ret.headers["Location"] if new_url[0] == "/": #find the domain from the feed start = source_feed.feed_url[:8] end = source_feed.feed_url[8:] if end.find("/") >= 0: end = end[:end.find("/")] new_url = start + end + new_url ret = requests.get(new_url, headers=headers, allow_redirects=True, timeout=20, verify=False) source_feed.status_code = ret.status_code source_feed.last_result = ("Temporary Redirect to " + new_url)[:255] if source_feed.last_302_url == new_url: #this is where we 302'd to last time td = timezone.now() - source_feed.last_302_start if td.days > 60: source_feed.feed_url = new_url source_feed.last_302_url = " " source_feed.last_302_start = None source_feed.last_result = ("Permanent Redirect to " + new_url)[:255] source_feed.save(update_fields=["feed_url", "last_result", "last_302_url", "last_302_start"]) else: source_feed.last_result = ("Temporary Redirect to " + new_url + " since " + source_feed.last_302_start.strftime("%d %B"))[:255] else: source_feed.last_302_url = new_url source_feed.last_302_start = timezone.now() source_feed.last_result = ("Temporary Redirect to " + new_url + " since " + source_feed.last_302_start.strftime("%d %B"))[:255] except Exception as ex: source_feed.last_result = ("Failed Redirection to " + new_url + " " + str(ex))[:255] source_feed.interval += 60 #NOT ELIF, WE HAVE TO START THE IF AGAIN TO COPE WTIH 302 if ret and ret.status_code >= 200 and ret.status_code < 300: #now we are not following redirects 302,303 and so forth are going to fail here, but what the hell :) # great! ok = True changed = False if was302: source_feed.etag = None source_feed.last_modified = None else: try: source_feed.etag = ret.headers["etag"] except Exception as ex: source_feed.etag = None try: source_feed.last_modified = ret.headers["Last-Modified"] except Exception as ex: source_feed.last_modified = None output.write("\netag:%s\nLast Mod:%s\n\n" % (source_feed.etag,source_feed.last_modified)) content_type = "Not Set" if "Content-Type" in ret.headers: content_type = ret.headers["Content-Type"] (ok,changed) = import_feed(source_feed=source_feed, feed_body=ret.content, content_type=content_type, output=output) if ok and changed: source_feed.interval /= 2 source_feed.last_result = " OK (updated)" #and temporary redirects source_feed.last_change = timezone.now() elif ok: source_feed.last_result = " OK" source_feed.interval += 20 # we slow down feeds a little more that don't send headers we can use else: #not OK source_feed.interval += 120 if source_feed.interval < 60: source_feed.interval = 60 # no less than 1 hour if source_feed.interval > (60 * 24): source_feed.interval = (60 * 24) # no more than a day output.write("\nUpdating source_feed.interval from %d to %d\n" % (old_interval, source_feed.interval)) td = datetime.timedelta(minutes=source_feed.interval) source_feed.due_poll = timezone.now() + td source_feed.save(update_fields=[ "due_poll", "interval", "last_result", "last_modified", "etag", "last_302_start", "last_302_url", "last_success", "live", "status_code", "max_index", "is_cloudflare", "last_change", ]) def parse_feed_xml(feed_content): #r = requests.get('https://news.ycombinator.com/rss') soup = BeautifulSoup(r.content, features='xml') posts = soup.findAll('item') for post in posts: title = post.find('title').text link = post.find('link').text date = post.find('pubDate').text for category in post.findAll('category'): if category.text in text_list: #assign post type print(title, link, date, "-----------------\r\r") #create a new post if it doesn't exist: p, created = Post.objects.get_or_create ( title = post.find('title').text url = post.find('link').text pub_date = post.find('pubDate').text author = post.find('dc:creator').text last_update = timezone.now() post_type = # need to parse categories to get posttype ) #crawl here to get links from post # then get or create products for each thing in the post #product, created = Post.objects.get_or_create ( # title = post.find('title').text # url = post.find('link').text # pub_date = post.find('pubDate').text # author = post.find('dc:creator').text # last_update = timezone.now() # post_type = # need to parse categories to get posttype #) def parse_feed_xml(source_feed, feed_content, output): ok = True changed = False if source_feed.posts.all().count() == 0: is_first = True else: is_first = False #output.write(ret.content) try: _customize_sanitizer(parser) f = parser.parse(feed_content) #need to start checking feed parser errors here entries = f['entries'] if len(entries): source_feed.last_success = timezone.now() #in case we start auto unsubscribing long dead feeds else: source_feed.last_result = "Feed is empty" ok = False except Exception as ex: source_feed.last_result = "Feed Parse Error" entries = [] ok = False source_feed.save(update_fields=["last_success", "last_result"]) if ok: try: source_feed.name = f.feed.title source_feed.save(update_fields=["name"]) except Exception as ex: output.write("\nUpdate name error:" + str(ex)) pass try: source_feed.site_url = f.feed.link source_feed.save(update_fields=["site_url"]) except Exception as ex: pass try: source_feed.image_url = f.feed.image.href source_feed.save(update_fields=["image_url"]) except: pass # either of these is fine, prefer description over summary # also feedparser will give us itunes:summary etc if there try: source_feed.description = f.feed.summary except: pass try: source_feed.description = f.feed.description except: pass try: source_feed.save(update_fields=["description"]) except: pass #output.write(entries) entries.reverse() # Entries are typically in reverse chronological order - put them in right order for e in entries: # we are going to take the longest body = "" if hasattr(e, "content"): for c in e.content: if len(c.value) > len(body): body = c.value if hasattr(e, "summary"): if len(e.summary) > len(body): body = e.summary if hasattr(e, "summary_detail"): if len(e.summary_detail.value) > len(body): body = e.summary_detail.value if hasattr(e, "description"): if len(e.description) > len(body): body = e.description body = fix_relative(body, source_feed.site_url) try: guid = e.guid except Exception as ex: try: guid = e.link except Exception as ex: m = hashlib.md5() m.update(body.encode("utf-8")) guid = m.hexdigest() try: p = Post.objects.filter(source=source_feed).filter(guid=guid)[0] output.write("EXISTING " + guid + "\n") except Exception as ex: output.write("NEW " + guid + "\n") p = Post(index=0, body=" ", title="", guid=guid) p.found = timezone.now() changed = True try: p.created = datetime.datetime.fromtimestamp(time.mktime(e.published_parsed)).replace(tzinfo=timezone.utc) except Exception as ex2: try: p.created = datetime.datetime.fromtimestamp(time.mktime(e.updated_parsed)).replace(tzinfo=timezone.utc) except Exception as ex3: output.write("CREATED ERROR:" + str(ex3)) p.created = timezone.now() p.source = source_feed p.save() try: p.title = e.title p.save(update_fields=["title"]) except Exception as ex: output.write("Title error:" + str(ex)) try: p.link = e.link p.save(update_fields=["link"]) except Exception as ex: output.write("Link error:" + str(ex)) try: p.image_url = e.image.href p.save(update_fields=["image_url"]) except: pass try: p.author = e.author p.save(update_fields=["author"]) except Exception as ex: p.author = "" try: p.body = body p.save(update_fields=["body"]) # output.write(p.body) except Exception as ex: output.write(str(ex)) output.write(p.body) try: seen_files = [] post_files = e["enclosures"] non_dupes = [] # find any files in media_content that aren't already declared as enclosures if "media_content" in e: for ee in e["media_content"]: found = False for ff in post_files: if ff["href"] == ee["url"]: found = True break if not found: non_dupes.append(ee) post_files += non_dupes for ee in list(p.enclosures.all()): # check existing enclosure is still there found_enclosure = False for pe in post_files: href = "href" if href not in pe: href = "url" length = "length" if length not in pe: length = "filesize" if pe["href"] == ee.href and ee.href not in seen_files: found_enclosure = True try: ee.length = int(pe[length]) except: ee.length = 0 try: type = pe["type"] except: type = "audio/mpeg" # we are assuming podcasts here but that's probably not safe ee.type = type ee.save() break if not found_enclosure: ee.delete() seen_files.append(ee.href) for pe in post_files: href = "href" if href not in pe: href = "url" length = "length" if length not in pe: length = "filesize" try: if pe[href] not in seen_files: try: length = int(pe[length]) except: length = 0 try: type = pe["type"] except: type = "audio/mpeg" ee = Enclosure(post=p, href=pe[href], length=length, type=type) ee.save() except Exception as ex: pass except Exception as ex: if output: output.write("No enclosures - " + str(ex)) if is_first and source_feed.posts.all().count() > 0: # If this is the first time we have parsed this # then see if it's paginated and go back through its history agent = get_agent(source_feed) headers = { "User-Agent": agent } #identify ourselves keep_going = True while keep_going: keep_going = False # assume were stopping unless we find a next link if hasattr(f.feed, 'links'): for link in f.feed.links: if 'rel' in link and link['rel'] == "next": ret = requests.get(link['href'], headers=headers, verify=False, allow_redirects=True, timeout=20) (pok, pchanged) = parse_feed_xml(source_feed, ret.content, output) # print(link['href']) # print((pok, pchanged)) f = parser.parse(ret.content) # rebase the loop on this feed version keep_going = True return (ok,changed) def parse_feed_json(source_feed, feed_content, output): ok = True changed = False try: f = json.loads(feed_content) entries = f['items'] if len(entries): source_feed.last_success = timezone.now() #in case we start auto unsubscribing long dead feeds else: source_feed.last_result = "Feed is empty" source_feed.interval += 120 ok = False source_feed.save(update_fields=["last_success", "last_result"]) except Exception as ex: source_feed.last_result = "Feed Parse Error" entries = [] source_feed.interval += 120 ok = False if ok: if "expired" in f and f["expired"]: # This feed says it is done # TODO: permanently disable # for now source_feed.interval to max source_feed.interval = (24*3*60) source_feed.last_result = "This feed has expired" return (False, False, source_feed.interval) try: source_feed.site_url = f["home_page_url"] source_feed.name = f["title"] source_feed.save(update_fields=["site_url", "title"]) except Exception as ex: pass try: if "description" in f: _customize_sanitizer(parser) source_feed.description = parser.sanitizer._sanitize_html(f["description"], "utf-8", 'text/html') source_feed.save(update_fields=["description"]) except Exception as ex: pass try: _customize_sanitizer(parser) source_feed.name = parser.sanitizer._sanitize_html(source_feed.name, "utf-8", 'text/html') source_feed.save(update_fields=["name"]) except Exception as ex: pass try: if "icon" in f: source_feed.image_url = f["icon"] source_feed.save(update_fields=["icon"]) except Exception as ex: pass #output.write(entries) entries.reverse() # Entries are typically in reverse chronological order - put them in right order for e in entries: body = " " if "content_text" in e: body = e["content_text"] if "content_html" in e: body = e["content_html"] # prefer html over text body = fix_relative(body,source_feed.site_url) try: guid = e["id"] except Exception as ex: try: guid = e["url"] except Exception as ex: m = hashlib.md5() m.update(body.encode("utf-8")) guid = m.hexdigest() try: p = Post.objects.filter(source=source_feed).filter(guid=guid)[0] output.write("EXISTING " + guid + "\n") except Exception as ex: output.write("NEW " + guid + "\n") p = Post(index=0, body=' ') p.found = timezone.now() changed = True p.source = source_feed try: title = e["title"] except Exception as ex: title = "" # borrow the RSS parser's sanitizer _customize_sanitizer(parser) body = parser.sanitizer._sanitize_html(body, "utf-8", 'text/html') # TODO: validate charset ?? _customize_sanitizer(parser) title = parser.sanitizer._sanitize_html(title, "utf-8", 'text/html') # TODO: validate charset ?? # no other fields are ever marked as |safe in the templates if "banner_image" in e: p.image_url = e["banner_image"] if "image" in e: p.image_url = e["image"] try: p.link = e["url"] except Exception as ex: p.link = '' p.title = title try: p.created = pyrfc3339.parse(e["date_published"]) except Exception as ex: output.write("CREATED ERROR") p.created = timezone.now() p.guid = guid try: p.author = e["author"] except Exception as ex: p.author = "" p.save() try: seen_files = [] for ee in list(p.enclosures.all()): # check existing enclosure is still there found_enclosure = False if "attachments" in e: for pe in e["attachments"]: if pe["url"] == ee.href and ee.href not in seen_files: found_enclosure = True try: ee.length = int(pe["size_in_bytes"]) except: ee.length = 0 try: type = pe["mime_type"] except: type = "audio/mpeg" # we are assuming podcasts here but that's probably not safe ee.type = type ee.save() break if not found_enclosure: ee.delete() seen_files.append(ee.href) if "attachments" in e: for pe in e["attachments"]: try: if pe["url"] not in seen_files: try: length = int(pe["size_in_bytes"]) except: length = 0 try: type = pe["mime_type"] except: type = "audio/mpeg" ee = Enclosure(post=p , href=pe["url"], length=length, type=type) ee.save() except Exception as ex: pass except Exception as ex: if output: output.write("No enclosures - " + str(ex)) try: p.body = body p.save() # output.write(p.body) except Exception as ex: output.write(str(ex)) output.write(p.body) return (ok,changed) def test_feed(source, cache=False, output=NullOutput()): headers = { "User-Agent": get_agent(source) } #identify ourselves and also stop our requests getting picked up by any cache if cache: if source.etag: headers["If-None-Match"] = str(source.etag) if source.last_modified: headers["If-Modified-Since"] = str(source.last_modified) else: headers["Cache-Control"] = "no-cache,max-age=0" headers["Pragma"] = "no-cache" output.write("\n" + str(headers)) ret = requests.get(source.feed_url, headers=headers, allow_redirects=False, verify=False, timeout=20) output.write("\n\n") output.write(str(ret)) output.write("\n\n") output.write(ret.text) def get_proxy(out=NullOutput()): p = WebProxy.objects.first() if p is None: find_proxies(out) p = WebProxy.objects.first() out.write("Proxy: {}".format(str(p))) return p def find_proxies(out=NullOutput()): out.write("\nLooking for proxies\n") try: req = requests.get("https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list.txt", timeout=30) if req.status_code == 200: list = req.text list = list.split("\n") # remove header list = list[4:] for item in list: if ":" in item: item = item.split(" ")[0] WebProxy(address=item).save() except Exception as ex: logging.error("Proxy scrape error: {}".format(str(ex))) out.write("Proxy scrape error: {}\n".format(str(ex))) if WebProxy.objects.count() == 0: # something went wrong. # to stop infinite loops we will insert duff proxys now for i in range(20): WebProxy(address="X").save() out.write("No proxies found.\n") import csv from datetime import datetime def import_master_guides(path): """ Takes a CSV dump of Jeff's sheet and puts it in the database row[0] #url row[1] #title row[2] #post_type row[3] #author row[7] #date_last_pub row[10] #update_frequency """ with open(path) as f: reader = csv.reader(f) count = 0 for row in reader: if count > 1: if row[2] == "Deals": continue elif row[2] == "Buying Guide": gtype = PostType.GUIDE else: gtype = PostType.HOWTO if row[10] == "Retired": continue else: up = int(row[10])*30 print(row[10]) d = datetime.strptime(row[7], '%m/%d/%Y') post, created = Post.objects.get_or_create( title = str(row[1]).strip(), url = str(row[0]).strip(), date_last_pub = d, author = str(row[3]).strip(), post_type = gtype, update_frequency = up ) count = count+1