diff options
Diffstat (limited to 'app/builder')
-rw-r--r-- | app/builder/__init__.py | 0 | ||||
-rw-r--r-- | app/builder/base.py | 152 | ||||
-rw-r--r-- | app/builder/sanitizer.py | 60 | ||||
-rw-r--r-- | app/builder/views.py | 13 |
4 files changed, 225 insertions, 0 deletions
# app/builder/__init__.py is added by this change as an empty file.

# ===== app/builder/base.py =====
import os
from math import ceil
from decimal import Decimal
from django.test.client import Client
from django.template.loader import render_to_string
from django.template import Context
from django.urls import reverse
from django.apps import apps
from django.conf import settings
from jsmin import jsmin


class _FileWriter(object):
    """
    Given a path and a bytes object, write the page to disk.

    The write happens in the constructor: the target directory is created
    if it does not exist and ``text_object`` is written to
    ``<base_path><path><filename>.<ext>``.
    """

    def __init__(self, path, text_object, ext='html', filename='index', base_path=settings.FLATFILES_ROOT):
        # Plain concatenation, not os.path.join: ``path`` must carry its own
        # separators (callers in this module pass values ending in '/').
        self.path = '%s%s' % (base_path, path)
        if not os.path.isdir(self.path):
            os.makedirs(self.path)
        fpath = '%s%s.%s' % (self.path, filename, ext)
        self.write(fpath, text_object)

    def write(self, fpath, text_object):
        """Write ``text_object`` (bytes) to ``fpath``, replacing any existing file."""
        # Context manager so the handle is closed even if the write raises.
        with open(fpath, 'wb') as f:
            f.write(text_object)

    def compress_js(self, filename, text_object):
        """Minify UTF-8 encoded JavaScript and write it as ``<filename>.min.js``."""
        path = '%s%s.min.js' % (self.path, filename)
        compressed = jsmin(text_object.decode('utf-8')).encode('utf-8')
        self.write(path, compressed)


class BuildNew(object):
    """
    Bake list, archive, detail and feed pages for one model to flat files
    by requesting the URLs through Django's test client.
    """

    def __init__(self, model, app):
        # The two arguments are forwarded positionally to apps.get_model(),
        # whose signature is (app_label, model_name); so despite the names,
        # ``model`` is the app label and ``app`` is the model name.
        self.model = apps.get_model(model, app)
        # NOTE(review): the return value is discarded; the call is kept only
        # in case a subclass override relies on being invoked here.
        self.get_model_queryset()
        self.client = Client()

    def build(self):
        """Build the list view followed by every detail page."""
        self.build_list_view()
        self.build_detail_view()

    def get_model_queryset(self):
        """Return the model's objects whose ``status`` equals 1."""
        return self.model.objects.filter(status__exact=1)

    def write_file(self, path, text_object, ext='html', filename='index'):
        """Write ``text_object`` below the flat-file root via ``_FileWriter``."""
        self.writer = _FileWriter(path, text_object, ext=ext, filename=filename)

    def get_pages(self, qs, paginate_by):
        """Return the number of pages needed to show ``qs``, rounded up."""
        return int(ceil(Decimal(qs.count()) / Decimal(paginate_by)))

    def build_list_view(self, base_path='', qs=None, paginate_by=10):
        """
        Archive Page builder that actually crawls the urls
        because we need to be able to pass a request object to the template

        ``base_path`` is the URL prefix of the list, ``qs`` the queryset used
        only to compute the page count (defaults to get_model_queryset()),
        and ``paginate_by`` the number of objects per page.
        """
        # ``is None`` rather than truthiness: ``not qs`` would evaluate the
        # whole queryset, and an explicitly passed empty queryset must produce
        # zero pages instead of silently falling back to the full one.
        if qs is None:
            qs = self.get_model_queryset()
        pages = self.get_pages(qs, paginate_by)
        for page in range(pages):
            if pages > 1:
                path = '%s%s/' % (base_path, str(page + 1))
            else:
                path = base_path
            url = path
            print(path)
            response = self.client.get(url, HTTP_HOST='127.0.0.1', follow=True)
            # The first page is also written at the un-numbered base path;
            # skip that when it is the same file as ``path`` (single page).
            if page == 0 and path != base_path:
                self.write_file(base_path, response.content)
            self.write_file(path, response.content)

    def build_year_view(self, url, paginate_by=99999):
        """Build one list view per year found in ``pub_date``.

        ``url`` is a URL pattern name reversed with a ``year`` kwarg.
        """
        years = self.model.objects.dates('pub_date', 'year')
        for year in years:
            year = year.strftime('%Y')
            qs = self.model.objects.filter(
                status__exact=1,
                pub_date__year=year
            )
            self.build_list_view(
                base_path=reverse(url, kwargs={'year': year, }),
                qs=qs,
                paginate_by=paginate_by
            )

    def build_month_view(self, url, paginate_by=99999):
        """Build one list view per month that has at least one status-1 object.

        ``url`` is a URL pattern name reversed with ``year`` and ``month`` kwargs.
        """
        months = self.model.objects.dates('pub_date', 'month')
        for m in months:
            year = m.strftime('%Y')
            month = m.strftime('%m')
            qs = self.model.objects.filter(
                status__exact=1,
                pub_date__year=year,
                pub_date__month=month
            )
            if qs.exists():
                self.build_list_view(
                    base_path=reverse(url, kwargs={'year': year, 'month': month}),
                    qs=qs,
                    paginate_by=paginate_by
                )

    def build_detail_view(self):
        '''
        Request every object's detail URL (and its ``.txt`` variant) and
        write both responses out to the filesystem
        '''
        for entry in self.get_model_queryset():
            url = entry.get_absolute_url()
            path, slug = os.path.split(url)
            path = '%s/' % path
            # NOTE(review): unlike the list/feed requests, these do not pass
            # HTTP_HOST='127.0.0.1' - confirm whether that is intentional.
            # write html
            response = self.client.get(url)
            self.write_file(path, response.content, filename=slug)
            # write txt
            response = self.client.get('%s.txt' % url)
            self.write_file(path, response.content, ext='txt', filename=slug)

    def build_feed(self, url_name):
        """
        Not called, but available for subclassing

        Reverses ``url_name``, requests it and writes the response using the
        file name and extension taken from the URL's last path component.
        """
        url = reverse(url_name,)
        path, slug = os.path.split(url)
        slug, ext = os.path.splitext(slug)
        response = self.client.get(url, HTTP_HOST='127.0.0.1')
        self.write_file('%s/' % path, response.content, ext=ext.split(".")[-1], filename=slug)


class BuildSitemap(BuildNew):
    """Bake ``/sitemap.xml`` to ``sitemap.xml`` at the flat-file root."""

    def build(self):
        response = self.client.get('/sitemap.xml', HTTP_HOST='127.0.0.1')
        self.write_file('', response.content, 'xml', 'sitemap')


class BuildPages(BuildNew):
    """Render every ``pages.page`` object to an HTML and a plain-text file."""

    def build(self):
        model = apps.get_model('pages', 'page')
        for page in model.objects.all():
            # render_to_string() takes a plain dict as context; newer Django
            # versions reject a Context instance here.
            context = {
                'object': page,
                'SITE_URL': settings.SITE_URL,
                'MEDIA_URL': settings.BAKED_MEDIA_URL,
            }
            html = render_to_string(
                ["details/%s.html" % page.slug, 'details/page.html'], context
            ).encode('utf-8')
            text = render_to_string('details/page.txt', context).encode('utf-8')
            self.write_file('', html, 'html', page.slug)
            # The .txt file gets the text rendering (the HTML rendering was
            # previously written here by mistake).
            self.write_file('', text, 'txt', page.slug)


# ===== app/builder/sanitizer.py =====
from bs4 import BeautifulSoup


class Sanitizer(object):
    """Remove blacklisted tags, attributes and href protocols from HTML."""

    # Class-level defaults; __init__ rebinds them per instance when given.
    blacklisted_tags = []
    blacklisted_attributes = []
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.
        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)

        Returns the sanitized markup of the parsed body. When ``content`` is
        empty, ``self.content`` is used if the instance defines it; when
        there is still nothing to sanitize the empty value is returned as is.
        """
        if not content:
            # ``self.content`` is not set anywhere in this class, so look it
            # up defensively instead of raising AttributeError.
            content = getattr(self, 'content', content)
            if not content:
                return content

        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)

        # The parser may produce no <body> (e.g. whitespace-only input), and
        # a blacklisted 'body' tag would have been removed above.
        if soup.body is None:
            return ''
        return soup.body.decode_contents()

    def strip_tags(self, soup):
        """Remove every blacklisted tag (with its contents) from ``soup``."""
        if self.blacklisted_tags:
            for tag in soup.find_all(self.blacklisted_tags):
                tag.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses; called for each node that had attributes."""
        pass

    def strip_attributes(self, soup):
        """Remove blacklisted attributes and blacklisted-protocol hrefs."""
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return
        if soup.body is None:
            return

        # Compare schemes case-insensitively so 'JavaScript:' cannot slip
        # past a 'javascript' blacklist entry.
        blocked = set(p.strip().lower() for p in self.blacklisted_protocols)

        for node in soup.body.find_all(True):
            if not node.attrs:
                continue

            for attr in self.blacklisted_attributes:
                node.attrs.pop(attr, None)

            self.strip_attributes_extra(node)

            href = node.attrs.get('href')
            if href is None:
                continue
            scheme, colon, _rest = href.partition(':')
            # Only a value that actually contains ':' has a protocol; embedded
            # whitespace is dropped before comparing. A blacklist remains a
            # best-effort defence, not a complete URL sanitizer.
            if colon and ''.join(scheme.split()).lower() in blocked:
                del node['href']
# ===== app/builder/views.py =====
from django.shortcuts import render
from django.template import RequestContext
#from src.build import builder as src_builder
from pages.build import builder as page_builder


def do_build(request):
    """Trigger a flat-file build for the section named in ``?id=``.

    Only ``id=pages`` is handled: it runs ``page_builder()`` and reports a
    message. Any other value renders the message template with an empty
    context.
    """
    section = request.GET.get('id', '')
    context = {}
    if section == 'pages':
        context = {'message': 'Writing Pages to Disk'}
        page_builder()
    # render() replaces the deprecated render_to_response() shortcut and
    # passes the request through to the template.
    return render(request, 'admin/message.html', context)