Diffstat (limited to 'app/posts')
-rw-r--r--   app/posts/admin.py                                                    |  34
-rw-r--r--   app/posts/migrations/0001_initial.py                                  |  33
-rw-r--r--   app/posts/migrations/0002_alter_post_body.py                          |  18
-rw-r--r--   app/posts/migrations/0003_alter_post_date_last_pub.py                 |  18
-rw-r--r--   app/posts/migrations/0004_alter_post_update_frequency.py              |  18
-rw-r--r--   app/posts/migrations/0005_post_template_type.py                       |  18
-rw-r--r--   app/posts/migrations/0006_alter_post_post_type.py                     |  18
-rw-r--r--   app/posts/migrations/0007_alter_post_title.py                         |  18
-rw-r--r--   app/posts/migrations/0008_post_needs_update_alter_post_products.py    |  24
-rw-r--r--   app/posts/migrations/0009_note.py                                     |  29
-rw-r--r--   app/posts/migrations/__init__.py                                      |   0
-rw-r--r--   app/posts/models.py                                                   | 141
-rw-r--r--   app/posts/utils.py                                                    | 928
13 files changed, 1297 insertions(+), 0 deletions(-)
diff --git a/app/posts/admin.py b/app/posts/admin.py
new file mode 100644
index 0000000..a4e29b8
--- /dev/null
+++ b/app/posts/admin.py
@@ -0,0 +1,34 @@
+from django.contrib import admin
+
+from .models import Post, Note
+from utils.widgets import AdminImageWidget, LGEntryForm
+
+from django.contrib.admin import SimpleListFilter
+
+
+@admin.register(Post)
+class PostAdmin(admin.ModelAdmin):
+    form = LGEntryForm
+    list_display = ('title', 'admin_url', 'author', 'date_last_pub', 'post_type', 'update_frequency', 'needs_update', 'days_overdue')
+    search_fields = ['title']
+    list_filter = ['needs_update', 'author', 'post_type']
+
+    class Media:
+        js = ('image-loader.js', 'next-prev-links.js')
+        css = {
+            "all": ("my_styles.css",)
+        }
+
+
+@admin.register(Note)
+class NoteAdmin(admin.ModelAdmin):
+    form = LGEntryForm
+    list_display = ('date_created', 'title', 'post')
+    search_fields = ['title']
+    list_filter = ['date_created']
+
+    class Media:
+        js = ('image-loader.js', 'next-prev-links.js')
+        css = {
+            "all": ("my_styles.css",)
+        }
diff --git a/app/posts/migrations/0001_initial.py b/app/posts/migrations/0001_initial.py
new file mode 100644
index 0000000..87a9c95
--- /dev/null
+++ b/app/posts/migrations/0001_initial.py
@@ -0,0 +1,33 @@
+# Generated by Django 4.2.2 on 2023-07-10 18:02
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ('products', '__first__'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Post',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('title', models.TextField(blank=True)),
+                ('body', models.TextField()),
+                ('url', models.CharField(blank=True, max_length=512, null=True)),
+                ('date_last_pub', models.DateTimeField()),
+                ('guid', models.CharField(blank=True, db_index=True, max_length=512, null=True)),
+                ('author', models.CharField(blank=True, max_length=255, null=True)),
+                ('post_type', models.IntegerField(choices=[(0, 'review'), (1, 'guide'), (2, 'gallery'), (3, 'how-to')], default=1)),
+                ('update_frequency', models.IntegerField(help_text='In days')),
+                ('products', models.ManyToManyField(to='products.productlink')),
+            ],
+            options={
+                'ordering': ('date_last_pub',),
+            },
+        ),
+    ]
diff --git a/app/posts/migrations/0002_alter_post_body.py b/app/posts/migrations/0002_alter_post_body.py
new file mode 100644
index 0000000..2a33e58
--- /dev/null
+++ b/app/posts/migrations/0002_alter_post_body.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.2 on 2023-07-10 18:42
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='post',
+            name='body',
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]
diff --git a/app/posts/migrations/0003_alter_post_date_last_pub.py b/app/posts/migrations/0003_alter_post_date_last_pub.py
new file mode 100644
index 0000000..f19c142
--- /dev/null
+++ b/app/posts/migrations/0003_alter_post_date_last_pub.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.2 on 2023-07-10 18:59
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0002_alter_post_body'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='post',
+            name='date_last_pub',
+            field=models.DateField(),
+        ),
+    ]
diff --git a/app/posts/migrations/0004_alter_post_update_frequency.py b/app/posts/migrations/0004_alter_post_update_frequency.py
new file mode 100644
index 0000000..ec176b7
--- /dev/null
+++ b/app/posts/migrations/0004_alter_post_update_frequency.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.2 on 2023-07-10 19:11
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0003_alter_post_date_last_pub'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='post',
+            name='update_frequency',
+            field=models.BigIntegerField(help_text='In days'),
+        ),
+    ]
diff --git a/app/posts/migrations/0005_post_template_type.py b/app/posts/migrations/0005_post_template_type.py
new file mode 100644
index 0000000..2eef54b
--- /dev/null
+++ b/app/posts/migrations/0005_post_template_type.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.2 on 2023-07-10 19:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0004_alter_post_update_frequency'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='post',
+            name='template_type',
+            field=models.IntegerField(choices=[(0, 'story'), (1, 'gallery')], default=0),
+        ),
+    ]
diff --git a/app/posts/migrations/0006_alter_post_post_type.py b/app/posts/migrations/0006_alter_post_post_type.py
new file mode 100644
index 0000000..93985c7
--- /dev/null
+++ b/app/posts/migrations/0006_alter_post_post_type.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.2 on 2023-07-10 19:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0005_post_template_type'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='post',
+            name='post_type',
+            field=models.IntegerField(choices=[(0, 'review'), (1, 'guide'), (2, 'how-to')], default=1),
+        ),
+    ]
diff --git a/app/posts/migrations/0007_alter_post_title.py b/app/posts/migrations/0007_alter_post_title.py
new file mode 100644
index 0000000..a838347
--- /dev/null
+++ b/app/posts/migrations/0007_alter_post_title.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.2 on 2023-07-10 19:25
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0006_alter_post_post_type'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='post',
+            name='title',
+            field=models.CharField(blank=True, max_length=512, null=True),
+        ),
+    ]
diff --git a/app/posts/migrations/0008_post_needs_update_alter_post_products.py b/app/posts/migrations/0008_post_needs_update_alter_post_products.py
new file mode 100644
index 0000000..f40f62e
--- /dev/null
+++ b/app/posts/migrations/0008_post_needs_update_alter_post_products.py
@@ -0,0 +1,24 @@
+# Generated by Django 4.2.2 on 2023-07-12 21:44
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('products', '0001_initial'),
+        ('posts', '0007_alter_post_title'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='post',
+            name='needs_update',
+            field=models.BooleanField(default=False),
+        ),
+        migrations.AlterField(
+            model_name='post',
+            name='products',
+            field=models.ManyToManyField(blank=True, null=True, to='products.productlink'),
+        ),
+    ]
diff --git a/app/posts/migrations/0009_note.py b/app/posts/migrations/0009_note.py
new file mode 100644
index 0000000..ecd6473
--- /dev/null
+++ b/app/posts/migrations/0009_note.py
@@ -0,0 +1,29 @@
+# Generated by Django 4.2.2 on 2023-07-14 19:38
+
+from django.db import migrations, models
+import django.db.models.deletion
+import django.utils.timezone
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('posts', '0008_post_needs_update_alter_post_products'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Note',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('title', models.CharField(max_length=400)),
+                ('url', models.CharField(max_length=400)),
+                ('body_markdown', models.TextField(blank=True, null=True)),
+                ('date_created', models.DateTimeField(default=django.utils.timezone.now)),
+                ('post', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='posts.post')),
+            ],
+            options={
+                'ordering': ('date_created',),
+            },
+        ),
+    ]
diff --git a/app/posts/migrations/__init__.py b/app/posts/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/app/posts/migrations/__init__.py
diff --git a/app/posts/models.py b/app/posts/models.py
new file mode 100644
index 0000000..a19a50f
--- /dev/null
+++ b/app/posts/models.py
@@ -0,0 +1,141 @@
+from django.db import models
+from django.utils.html import format_html, format_html_join
+from django.utils import timezone
+
+from products.models import ProductLink
+
+"""
+class Feed(models.Model):
+    name = models.CharField(max_length=255)
+    feed_url = models.CharField(max_length=512)
+    slug = models.CharField(max_length=50)
+    last_polled = models.DateTimeField(blank=True, null=True)
+    due_poll = models.DateTimeField(default=datetime.datetime(1900, 1, 1))  # default to distant past to put new sources to front of queue
+    etag = models.CharField(max_length=255, blank=True, null=True)
+    last_modified = models.CharField(max_length=255, blank=True, null=True)  # just pass this back and forward between server and me, no need to parse
+    last_result = models.CharField(max_length=255, blank=True, null=True)
+    interval = models.PositiveIntegerField(default=400)
+    last_success = models.DateTimeField(blank=True, null=True)
+    last_change = models.DateTimeField(blank=True, null=True)
+    live = models.BooleanField(default=True)
+    status_code = models.PositiveIntegerField(default=0)
+    last_302_url = models.CharField(max_length=512, null=True, blank=True)
+    last_302_start = models.DateTimeField(null=True, blank=True)
+
+    def __str__(self):
+        return self.name
+"""
+
+
+class PostType(models.IntegerChoices):
+    REVIEW = 0, ('review')
+    GUIDE = 1, ('guide')
+    HOWTO = 2, ('how-to')
+
+
+class TemplateType(models.IntegerChoices):
+    STORY = 0, ('story')
+    GALLERY = 1, ('gallery')
+
+
+class Post(models.Model):
+    # an entry in a feed
+    title = models.CharField(max_length=512, blank=True, null=True)
+    body = models.TextField(blank=True, null=True)
+    url = models.CharField(max_length=512, blank=True, null=True)
+    date_last_pub = models.DateField()
+    guid = models.CharField(max_length=512, blank=True, null=True, db_index=True)
+    author = models.CharField(max_length=255, blank=True, null=True)
+    post_type = models.IntegerField(choices=PostType.choices, default=PostType.GUIDE)
+    template_type = models.IntegerField(choices=TemplateType.choices, default=TemplateType.STORY)
+    update_frequency = models.BigIntegerField(help_text="In days")
+    products = models.ManyToManyField(ProductLink, blank=True, null=True)
+    needs_update = models.BooleanField(default=False)
+
+    class Meta:
+        ordering = ('date_last_pub',)
+
+    def __str__(self):
+        return self.title
+
+    def time_since_update(self):
+        td = timezone.localdate() - self.date_last_pub
+        return td.days
+
+    # def get_needs_update(self):
+    #     if self.time_since_update() > self.update_frequency:
+    #         return True
+    #     else:
+    #         return False
+
+    def days_overdue(self):
+        if self.needs_update:
+            return self.time_since_update() - self.update_frequency
+        else:
+            return ''
+
+    def admin_url(self):
+        # let format_html do the interpolation so the URL is escaped properly
+        return format_html('<a target="_blank" href="{}">{}</a>', self.url, self.url)
+    admin_url.short_description = 'Link'
+
+    def save(self, *args, **kwargs):
+        td = timezone.localdate() - self.date_last_pub
+        if td.days > self.update_frequency:
+            self.needs_update = True
+        else:
+            self.needs_update = False
+        super().save(*args, **kwargs)
+
+
+class Note(models.Model):
+    title = models.CharField(max_length=400)
+    url = models.CharField(max_length=400)
+    body_markdown = models.TextField(blank=True, null=True)
+    date_created = models.DateTimeField(default=timezone.now)
+    post = models.ForeignKey(Post, on_delete=models.CASCADE, null=True)
+
+    class Meta:
+        ordering = ('date_created',)
+
+    def __str__(self):
+        return self.title
+
+
+#URL,This Article,Type,Lead,Previous Leads,Other Testers,Notes/Docs,Last Pub Date,Update Next,Months Since Update,Update Frequency (Months),Updates per year,Prev. Updates,"18 Mo Traffic Trend
+'''
+row[0]  #url
+row[1]  #title
+row[2]  #post_type
+row[3]  #author
+row[7]  #date_last_pub
+row[10] #update_frequency
+
+
+with open(path) as f:
+    reader = csv.reader(f)
+    count = 0
+    for row in reader:
+        if count > 1:
+            if row[2] == "Deals":
+                # don't care about deals posts
+                continue
+            elif row[2] == "Buying Guide":
+                gtype = PostType.GUIDE
+            else:
+                gtype = PostType.HOWTO
+            if row[10] == "Retired":
+                continue
+            else:
+                print(int(row[10]))
+                print(gtype)
+                d = datetime.strptime(row[7], '%m/%d/%Y')
+                post, created = Post.objects.get_or_create(
+                    title = str(row[1]).strip(),
+                    url = str(row[0]).strip(),
+                    date_last_pub = d,
+                    author = str(row[3]).strip(),
+                    post_type = gtype,
+                    update_frequency = int(row[10]) * 30,
+                )
+
+'''
diff --git a/app/posts/utils.py b/app/posts/utils.py
new file mode 100644
index 0000000..915721c
--- /dev/null
+++ b/app/posts/utils.py
@@ -0,0 +1,928 @@
+from bs4 import BeautifulSoup
+
+# NOTE: assumed imports added for the names used below; local helpers referenced in
+# this module (Source, Post, Enclosure, WebProxy, NullOutput, get_proxy, import_feed,
+# fix_relative, _customize_sanitizer) are expected to be provided by the app's own
+# models/helper modules.
+import datetime
+import hashlib
+import json
+import logging
+import time
+from random import choice
+
+import feedparser as parser
+import pyrfc3339
+import requests
+
+from django.conf import settings
+from django.db.models import Q
+from django.utils import timezone
+
+
+def get_agent(source_feed):
+
+    if source_feed.is_cloudflare:
+        agent = random_user_agent()
+        logging.error("using agent: {}".format(agent))
+    else:
+        agent = "{user_agent} (+{server}; Updater; {subs} subscribers)".format(user_agent=settings.FEEDS_USER_AGENT, server=settings.FEEDS_SERVER, subs=source_feed.num_subs)
+
+    return agent
+
+
+def random_user_agent():
+
+    return choice([
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
+        "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1",
+        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+        "Mozilla/5.0 (Linux; Android 5.0; SAMSUNG SM-N900 Build/LRX21V) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/2.1 Chrome/34.0.1847.76 Mobile Safari/537.36",
Mobile Safari/537.36", + "Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-G570Y Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/4.0 Chrome/44.0.2403.133 Mobile Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0" + ]) + + +def update_feeds(max_feeds=3, output=NullOutput()): + + + todo = Source.objects.filter(Q(due_poll__lt = timezone.now()) & Q(live = True)) + + + output.write("Queue size is {}".format(todo.count())) + + sources = todo.order_by("due_poll")[:max_feeds] + + output.write("\nProcessing %d\n\n" % sources.count()) + + + for src in sources: + read_feed(src, output) + + # kill shit proxies + + WebProxy.objects.filter(address='X').delete() + + +def read_feed(source_feed, output=NullOutput()): + + old_interval = source_feed.interval + + + was302 = False + + output.write("\n------------------------------\n") + + source_feed.last_polled = timezone.now() + + agent = get_agent(source_feed) + + headers = { "User-Agent": agent } #identify ourselves + + + + + proxies = {} + proxy = None + + feed_url = source_feed.feed_url + if source_feed.is_cloudflare : # Fuck you ! + + + if settings.FEEDS_CLOUDFLARE_WORKER: + feed_url = "{}/read/?target={}".format(settings.FEEDS_CLOUDFLARE_WORKER, feed_url) + else: + try: + proxy = get_proxy(output) + + if proxy.address != "X": + + proxies = { + 'http': proxy.address, + 'https': proxy.address, + } + except: + pass + + + if source_feed.etag: + headers["If-None-Match"] = str(source_feed.etag) + if source_feed.last_modified: + headers["If-Modified-Since"] = str(source_feed.last_modified) + + output.write("\nFetching %s" % feed_url) + + ret = None + try: + ret = requests.get(feed_url, headers=headers, verify=False, allow_redirects=False, timeout=20, proxies=proxies) + source_feed.status_code = ret.status_code + source_feed.last_result = "Unhandled Case" + output.write(str(ret)) + except Exception as ex: + source_feed.last_result = ("Fetch error:" + str(ex))[:255] + source_feed.status_code = 0 + output.write("\nFetch error: " + str(ex)) + + + if proxy: + source_feed.last_result = "Proxy failed. Next retry will use new proxy" + source_feed.status_code = 1 # this will stop us increasing the interval + + output.write("\nBurning the proxy.") + proxy.delete() + source_feed.interval /= 2 + + + + if ret is None and source_feed.status_code == 1: # er ?? + pass + elif ret == None or source_feed.status_code == 0: + source_feed.interval += 120 + elif ret.status_code < 200 or ret.status_code >= 500: + #errors, impossible return codes + source_feed.interval += 120 + source_feed.last_result = "Server error fetching feed (%d)" % ret.status_code + elif ret.status_code == 404: + #not found + source_feed.interval += 120 + source_feed.last_result = "The feed could not be found" + elif ret.status_code == 403 or ret.status_code == 410: #Forbidden or gone + + if "Cloudflare" in ret.text or ("Server" in ret.headers and "cloudflare" in ret.headers["Server"]): + + if source_feed.is_cloudflare and proxy is not None: + # we are already proxied - this proxy on cloudflare's shit list too? + proxy.delete() + output.write("\nProxy seemed to also be blocked, burning") + source_feed.interval /= 2 + source_feed.last_result = "Proxy kind of worked but still got cloudflared." + else: + source_feed.is_cloudflare = True + source_feed.last_result = "Blocked by Cloudflare (grr)" + else: + source_feed.last_result = "Feed is no longer accessible." 
+            source_feed.live = False
+
+    elif ret.status_code >= 400 and ret.status_code < 500:
+        # treat as bad request
+        source_feed.live = False
+        source_feed.last_result = "Bad request (%d)" % ret.status_code
+    elif ret.status_code == 304:
+        # not modified
+        source_feed.interval += 10
+        source_feed.last_result = "Not modified"
+        source_feed.last_success = timezone.now()
+
+        if source_feed.last_success and (timezone.now() - source_feed.last_success).days > 7:
+            source_feed.last_result = "Clearing etag/last modified due to lack of changes"
+            source_feed.etag = None
+            source_feed.last_modified = None
+
+    elif ret.status_code == 301 or ret.status_code == 308:  # permanent redirect
+        new_url = ""
+        try:
+            if "Location" in ret.headers:
+                new_url = ret.headers["Location"]
+
+                if new_url[0] == "/":
+                    # find the domain from the feed
+                    base = "/".join(source_feed.feed_url.split("/")[:3])
+                    new_url = base + new_url
+
+                source_feed.feed_url = new_url
+                source_feed.last_result = "Moved"
+                source_feed.save(update_fields=["feed_url", "last_result"])
+
+            else:
+                source_feed.last_result = "Feed has moved but no location provided"
+        except Exception as ex:
+            output.write("\nError redirecting.")
+            source_feed.last_result = ("Error redirecting feed to " + new_url)[:255]
+            pass
+    elif ret.status_code == 302 or ret.status_code == 303 or ret.status_code == 307:  # Temporary redirect
+        new_url = ""
+        was302 = True
+        try:
+            new_url = ret.headers["Location"]
+
+            if new_url[0] == "/":
+                # find the domain from the feed
+                start = source_feed.feed_url[:8]
+                end = source_feed.feed_url[8:]
+                if end.find("/") >= 0:
+                    end = end[:end.find("/")]
+
+                new_url = start + end + new_url
+
+            ret = requests.get(new_url, headers=headers, allow_redirects=True, timeout=20, verify=False)
+            source_feed.status_code = ret.status_code
+            source_feed.last_result = ("Temporary Redirect to " + new_url)[:255]
+
+            if source_feed.last_302_url == new_url:
+                # this is where we 302'd to last time
+                td = timezone.now() - source_feed.last_302_start
+                if td.days > 60:
+                    source_feed.feed_url = new_url
+                    source_feed.last_302_url = " "
+                    source_feed.last_302_start = None
+                    source_feed.last_result = ("Permanent Redirect to " + new_url)[:255]
+
+                    source_feed.save(update_fields=["feed_url", "last_result", "last_302_url", "last_302_start"])
+
+                else:
+                    source_feed.last_result = ("Temporary Redirect to " + new_url + " since " + source_feed.last_302_start.strftime("%d %B"))[:255]
+
+            else:
+                source_feed.last_302_url = new_url
+                source_feed.last_302_start = timezone.now()
+
+                source_feed.last_result = ("Temporary Redirect to " + new_url + " since " + source_feed.last_302_start.strftime("%d %B"))[:255]
+
+        except Exception as ex:
+            source_feed.last_result = ("Failed Redirection to " + new_url + " " + str(ex))[:255]
+            source_feed.interval += 60
+
+    # NOT ELIF, WE HAVE TO START THE IF AGAIN TO COPE WITH 302
+    if ret and ret.status_code >= 200 and ret.status_code < 300:  # now we are not following redirects 302, 303 and so forth are going to fail here, but what the hell :)
+
+        # great!
+        ok = True
+        changed = False
+
+        if was302:
+            source_feed.etag = None
+            source_feed.last_modified = None
+        else:
+            try:
+                source_feed.etag = ret.headers["etag"]
+            except Exception as ex:
+                source_feed.etag = None
+            try:
+                source_feed.last_modified = ret.headers["Last-Modified"]
+            except Exception as ex:
+                source_feed.last_modified = None
+
+        output.write("\netag:%s\nLast Mod:%s\n\n" % (source_feed.etag, source_feed.last_modified))
+
+        content_type = "Not Set"
+        if "Content-Type" in ret.headers:
+            content_type = ret.headers["Content-Type"]
+
+        (ok, changed) = import_feed(source_feed=source_feed, feed_body=ret.content, content_type=content_type, output=output)
+
+        if ok and changed:
+            source_feed.interval /= 2
+            source_feed.last_result = " OK (updated)"  # and temporary redirects
+            source_feed.last_change = timezone.now()
+
+        elif ok:
+            source_feed.last_result = " OK"
+            source_feed.interval += 20  # we slow down feeds a little more that don't send headers we can use
+        else:  # not OK
+            source_feed.interval += 120
+
+    if source_feed.interval < 60:
+        source_feed.interval = 60  # no less than 1 hour
+    if source_feed.interval > (60 * 24):
+        source_feed.interval = (60 * 24)  # no more than a day
+
+    output.write("\nUpdating source_feed.interval from %d to %d\n" % (old_interval, source_feed.interval))
+    td = datetime.timedelta(minutes=source_feed.interval)
+    source_feed.due_poll = timezone.now() + td
+    source_feed.save(update_fields=[
+        "due_poll", "interval", "last_result",
+        "last_modified", "etag", "last_302_start",
+        "last_302_url", "last_success", "live",
+        "status_code", "max_index", "is_cloudflare",
+        "last_change",
+    ])
+
+
+# NOTE: draft BeautifulSoup-based RSS parser; superseded by the feedparser-based
+# parse_feed_xml(source_feed, feed_content, output) defined below.
+def parse_feed_xml(feed_content):
+    # r = requests.get('https://news.ycombinator.com/rss')
+    soup = BeautifulSoup(feed_content, features='xml')
+    posts = soup.findAll('item')
+    for post in posts:
+        title = post.find('title').text
+        link = post.find('link').text
+        date = post.find('pubDate').text
+        for category in post.findAll('category'):
+            if category.text in text_list:
+                # assign post type
+                print(title, link, date, "-----------------\r\r")
+        # create a new post if it doesn't exist:
+        p, created = Post.objects.get_or_create(
+            title=post.find('title').text,
+            url=post.find('link').text,
+            pub_date=post.find('pubDate').text,
+            author=post.find('dc:creator').text,
+            last_update=timezone.now(),
+            post_type=PostType.GUIDE,  # placeholder - need to parse categories to get post type
+        )
+        # crawl here to get links from post
+        # then get or create products for each thing in the post
+        # product, created = Post.objects.get_or_create(
+        #     title = post.find('title').text
+        #     url = post.find('link').text
+        #     pub_date = post.find('pubDate').text
+        #     author = post.find('dc:creator').text
+        #     last_update = timezone.now()
+        #     post_type =  # need to parse categories to get posttype
+        # )
+
+
+def parse_feed_xml(source_feed, feed_content, output):
+
+    ok = True
+    changed = False
+
+    if source_feed.posts.all().count() == 0:
+        is_first = True
+    else:
+        is_first = False
+
+    # output.write(ret.content)
+    try:
+
+        _customize_sanitizer(parser)
+        f = parser.parse(feed_content)  # need to start checking feed parser errors here
+        entries = f['entries']
+        if len(entries):
+            source_feed.last_success = timezone.now()  # in case we start auto unsubscribing long dead feeds
+        else:
+            source_feed.last_result = "Feed is empty"
+            ok = False
+
+    except Exception as ex:
+        source_feed.last_result = "Feed Parse Error"
+        entries = []
+        ok = False
+
+    source_feed.save(update_fields=["last_success", "last_result"])
+
+    if ok:
+        try:
+            source_feed.name = f.feed.title
+            source_feed.save(update_fields=["name"])
+        except Exception as ex:
+            output.write("\nUpdate name error:" + str(ex))
+            pass
+
+        try:
+            source_feed.site_url = f.feed.link
+            source_feed.save(update_fields=["site_url"])
+        except Exception as ex:
+            pass
+
+        try:
+            source_feed.image_url = f.feed.image.href
+            source_feed.save(update_fields=["image_url"])
+        except:
+            pass
+
+        # either of these is fine, prefer description over summary
+        # also feedparser will give us itunes:summary etc if there
+        try:
+            source_feed.description = f.feed.summary
+        except:
+            pass
+
+        try:
+            source_feed.description = f.feed.description
+        except:
+            pass
+
+        try:
+            source_feed.save(update_fields=["description"])
+        except:
+            pass
+
+        # output.write(entries)
+        entries.reverse()  # Entries are typically in reverse chronological order - put them in right order
+        for e in entries:
+
+            # we are going to take the longest
+            body = ""
+
+            if hasattr(e, "content"):
+                for c in e.content:
+                    if len(c.value) > len(body):
+                        body = c.value
+
+            if hasattr(e, "summary"):
+                if len(e.summary) > len(body):
+                    body = e.summary
+
+            if hasattr(e, "summary_detail"):
+                if len(e.summary_detail.value) > len(body):
+                    body = e.summary_detail.value
+
+            if hasattr(e, "description"):
+                if len(e.description) > len(body):
+                    body = e.description
+
+            body = fix_relative(body, source_feed.site_url)
+
+            try:
+                guid = e.guid
+            except Exception as ex:
+                try:
+                    guid = e.link
+                except Exception as ex:
+                    m = hashlib.md5()
+                    m.update(body.encode("utf-8"))
+                    guid = m.hexdigest()
+
+            try:
+                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
+                output.write("EXISTING " + guid + "\n")
+
+            except Exception as ex:
+                output.write("NEW " + guid + "\n")
+                p = Post(index=0, body=" ", title="", guid=guid)
+                p.found = timezone.now()
+                changed = True
+
+            try:
+                p.created = datetime.datetime.fromtimestamp(time.mktime(e.published_parsed)).replace(tzinfo=timezone.utc)
+            except Exception as ex2:
+                try:
+                    p.created = datetime.datetime.fromtimestamp(time.mktime(e.updated_parsed)).replace(tzinfo=timezone.utc)
+                except Exception as ex3:
+                    output.write("CREATED ERROR:" + str(ex3))
+                    p.created = timezone.now()
+
+            p.source = source_feed
+            p.save()
+
+            try:
+                p.title = e.title
+                p.save(update_fields=["title"])
+            except Exception as ex:
+                output.write("Title error:" + str(ex))
+
+            try:
+                p.link = e.link
+                p.save(update_fields=["link"])
+            except Exception as ex:
+                output.write("Link error:" + str(ex))
+
+            try:
+                p.image_url = e.image.href
+                p.save(update_fields=["image_url"])
+            except:
+                pass
+
+            try:
+                p.author = e.author
+                p.save(update_fields=["author"])
+            except Exception as ex:
+                p.author = ""
+
+            try:
+                p.body = body
+                p.save(update_fields=["body"])
+                # output.write(p.body)
+            except Exception as ex:
+                output.write(str(ex))
+                output.write(p.body)
+
+            try:
+                seen_files = []
+
+                post_files = e["enclosures"]
+                non_dupes = []
+
+                # find any files in media_content that aren't already declared as enclosures
+                if "media_content" in e:
+                    for ee in e["media_content"]:
+                        found = False
+                        for ff in post_files:
+                            if ff["href"] == ee["url"]:
+                                found = True
+                                break
+                        if not found:
+                            non_dupes.append(ee)
+
+                    post_files += non_dupes
+
+                for ee in list(p.enclosures.all()):
+                    # check existing enclosure is still there
+                    found_enclosure = False
+                    for pe in post_files:
+
+                        href = "href"
+                        if href not in pe:
+                            href = "url"
+
+                        length = "length"
+                        if length not in pe:
+                            length = "filesize"
+
+                        if pe["href"] == ee.href and ee.href not in seen_files:
+                            found_enclosure = True
+
+                            try:
+                                ee.length = int(pe[length])
+                            except:
+                                ee.length = 0
+
+                            try:
+                                type = pe["type"]
+                            except:
+                                type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe
+
+                            ee.type = type
+                            ee.save()
+                            break
+                    if not found_enclosure:
+                        ee.delete()
+                    seen_files.append(ee.href)
+
+                for pe in post_files:
+
+                    href = "href"
+                    if href not in pe:
+                        href = "url"
+
+                    length = "length"
+                    if length not in pe:
+                        length = "filesize"
+
+                    try:
+                        if pe[href] not in seen_files:
+
+                            try:
+                                length = int(pe[length])
+                            except:
+                                length = 0
+
+                            try:
+                                type = pe["type"]
+                            except:
+                                type = "audio/mpeg"
+
+                            ee = Enclosure(post=p, href=pe[href], length=length, type=type)
+                            ee.save()
+                    except Exception as ex:
+                        pass
+            except Exception as ex:
+                if output:
+                    output.write("No enclosures - " + str(ex))
+
+        if is_first and source_feed.posts.all().count() > 0:
+            # If this is the first time we have parsed this
+            # then see if it's paginated and go back through its history
+            agent = get_agent(source_feed)
+            headers = {"User-Agent": agent}  # identify ourselves
+            keep_going = True
+            while keep_going:
+                keep_going = False  # assume we're stopping unless we find a next link
+                if hasattr(f.feed, 'links'):
+                    for link in f.feed.links:
+                        if 'rel' in link and link['rel'] == "next":
+                            ret = requests.get(link['href'], headers=headers, verify=False, allow_redirects=True, timeout=20)
+                            (pok, pchanged) = parse_feed_xml(source_feed, ret.content, output)
+                            # print(link['href'])
+                            # print((pok, pchanged))
+                            f = parser.parse(ret.content)  # rebase the loop on this feed version
+                            keep_going = True
+
+    return (ok, changed)
+
+
+def parse_feed_json(source_feed, feed_content, output):
+
+    ok = True
+    changed = False
+
+    try:
+        f = json.loads(feed_content)
+        entries = f['items']
+        if len(entries):
+            source_feed.last_success = timezone.now()  # in case we start auto unsubscribing long dead feeds
+        else:
+            source_feed.last_result = "Feed is empty"
+            source_feed.interval += 120
+            ok = False
+
+        source_feed.save(update_fields=["last_success", "last_result"])
+
+    except Exception as ex:
+        source_feed.last_result = "Feed Parse Error"
+        entries = []
+        source_feed.interval += 120
+        ok = False
+
+    if ok:
+
+        if "expired" in f and f["expired"]:
+            # This feed says it is done
+            # TODO: permanently disable
+            # for now push source_feed.interval to max
+            source_feed.interval = (24 * 3 * 60)
+            source_feed.last_result = "This feed has expired"
+            return (False, False)
+
+        try:
+            source_feed.site_url = f["home_page_url"]
+            source_feed.name = f["title"]
+
+            source_feed.save(update_fields=["site_url", "name"])
+
+        except Exception as ex:
+            pass
+
+        try:
+            if "description" in f:
+                _customize_sanitizer(parser)
+                source_feed.description = parser.sanitizer._sanitize_html(f["description"], "utf-8", 'text/html')
+                source_feed.save(update_fields=["description"])
+        except Exception as ex:
+            pass
+
+        try:
+            _customize_sanitizer(parser)
+            source_feed.name = parser.sanitizer._sanitize_html(source_feed.name, "utf-8", 'text/html')
+            source_feed.save(update_fields=["name"])
+
+        except Exception as ex:
+            pass
+
+        try:
+            if "icon" in f:
+                source_feed.image_url = f["icon"]
+                source_feed.save(update_fields=["image_url"])
+        except Exception as ex:
+            pass
+
+        # output.write(entries)
+        entries.reverse()  # Entries are typically in reverse chronological order - put them in right order
+        for e in entries:
+            body = " "
+            if "content_text" in e:
+                body = e["content_text"]
+            if "content_html" in e:
+                body = e["content_html"]  # prefer html over text
+
+            body = fix_relative(body, source_feed.site_url)
+
+            try:
+                guid = e["id"]
+            except Exception as ex:
+                try:
+                    guid = e["url"]
+                except Exception as ex:
+                    m = hashlib.md5()
+                    m.update(body.encode("utf-8"))
+                    guid = m.hexdigest()
+
+            try:
+                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
+                output.write("EXISTING " + guid + "\n")
+
+            except Exception as ex:
+                output.write("NEW " + guid + "\n")
+                p = Post(index=0, body=' ')
+                p.found = timezone.now()
+                changed = True
+                p.source = source_feed
+
+            try:
+                title = e["title"]
+            except Exception as ex:
+                title = ""
+
+            # borrow the RSS parser's sanitizer
+            _customize_sanitizer(parser)
+            body = parser.sanitizer._sanitize_html(body, "utf-8", 'text/html')  # TODO: validate charset ??
+            _customize_sanitizer(parser)
+            title = parser.sanitizer._sanitize_html(title, "utf-8", 'text/html')  # TODO: validate charset ??
+            # no other fields are ever marked as |safe in the templates
+
+            if "banner_image" in e:
+                p.image_url = e["banner_image"]
+
+            if "image" in e:
+                p.image_url = e["image"]
+
+            try:
+                p.link = e["url"]
+            except Exception as ex:
+                p.link = ''
+
+            p.title = title
+
+            try:
+                p.created = pyrfc3339.parse(e["date_published"])
+            except Exception as ex:
+                output.write("CREATED ERROR")
+                p.created = timezone.now()
+
+            p.guid = guid
+            try:
+                p.author = e["author"]
+            except Exception as ex:
+                p.author = ""
+
+            p.save()
+
+            try:
+                seen_files = []
+                for ee in list(p.enclosures.all()):
+                    # check existing enclosure is still there
+                    found_enclosure = False
+                    if "attachments" in e:
+                        for pe in e["attachments"]:
+
+                            if pe["url"] == ee.href and ee.href not in seen_files:
+                                found_enclosure = True
+
+                                try:
+                                    ee.length = int(pe["size_in_bytes"])
+                                except:
+                                    ee.length = 0
+
+                                try:
+                                    type = pe["mime_type"]
+                                except:
+                                    type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe
+
+                                ee.type = type
+                                ee.save()
+                                break
+                    if not found_enclosure:
+                        ee.delete()
+                    seen_files.append(ee.href)
+
+                if "attachments" in e:
+                    for pe in e["attachments"]:
+
+                        try:
+                            if pe["url"] not in seen_files:
+
+                                try:
+                                    length = int(pe["size_in_bytes"])
+                                except:
+                                    length = 0
+
+                                try:
+                                    type = pe["mime_type"]
+                                except:
+                                    type = "audio/mpeg"
+
+                                ee = Enclosure(post=p, href=pe["url"], length=length, type=type)
+                                ee.save()
+                        except Exception as ex:
+                            pass
+            except Exception as ex:
+                if output:
+                    output.write("No enclosures - " + str(ex))
+
+            try:
+                p.body = body
+                p.save()
+                # output.write(p.body)
+            except Exception as ex:
+                output.write(str(ex))
+                output.write(p.body)
+
+    return (ok, changed)
+
+
+def test_feed(source, cache=False, output=NullOutput()):
+
+    headers = {"User-Agent": get_agent(source)}  # identify ourselves and also stop our requests getting picked up by any cache
+
+    if cache:
+        if source.etag:
+            headers["If-None-Match"] = str(source.etag)
+        if source.last_modified:
+            headers["If-Modified-Since"] = str(source.last_modified)
+    else:
+        headers["Cache-Control"] = "no-cache,max-age=0"
+        headers["Pragma"] = "no-cache"
+
+    output.write("\n" + str(headers))
+
+    ret = requests.get(source.feed_url, headers=headers, allow_redirects=False, verify=False, timeout=20)
+
+    output.write("\n\n")
+
+    output.write(str(ret))
+
+    output.write("\n\n")
+
+    output.write(ret.text)
+
+
+def get_proxy(out=NullOutput()):
+
+    p = WebProxy.objects.first()
+
+    if p is None:
+        find_proxies(out)
+        p = WebProxy.objects.first()
+
+    out.write("Proxy: {}".format(str(p)))
+
+    return p
+
+
+def find_proxies(out=NullOutput()):
+
+    out.write("\nLooking for proxies\n")
+
+    try:
+        req = requests.get("https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list.txt", timeout=30)
+        if req.status_code == 200:
+            list = req.text
+
+            list = list.split("\n")
+
+            # remove header
+            list = list[4:]
+
+            for item in list:
+                if ":" in item:
+                    item = item.split(" ")[0]
+                    WebProxy(address=item).save()
+
+    except Exception as ex:
+        logging.error("Proxy scrape error: {}".format(str(ex)))
+        out.write("Proxy scrape error: {}\n".format(str(ex)))
+
+    if WebProxy.objects.count() == 0:
+        # something went wrong.
+        # to stop infinite loops we will insert duff proxys now
+        for i in range(20):
+            WebProxy(address="X").save()
+        out.write("No proxies found.\n")
+
+
+import csv
+from datetime import datetime
+
+
+def import_master_guides(path):
+    """
+    Takes a CSV dump of Jeff's sheet and puts it in the database
+    row[0]  #url
+    row[1]  #title
+    row[2]  #post_type
+    row[3]  #author
+    row[7]  #date_last_pub
+    row[10] #update_frequency
+    """
+    with open(path) as f:
+        reader = csv.reader(f)
+        count = 0
+        for row in reader:
+            if count > 1:
+                if row[2] == "Deals":
+                    continue
+                elif row[2] == "Buying Guide":
+                    gtype = PostType.GUIDE
+                else:
+                    gtype = PostType.HOWTO
+                if row[10] == "Retired":
+                    continue
+                else:
+                    up = int(row[10]) * 30
+                    print(row[10])
+                    d = datetime.strptime(row[7], '%m/%d/%Y')
+                    post, created = Post.objects.get_or_create(
+                        title=str(row[1]).strip(),
+                        url=str(row[0]).strip(),
+                        date_last_pub=d,
+                        author=str(row[3]).strip(),
+                        post_type=gtype,
+                        update_frequency=up,
+                    )
+            count = count + 1