diff options
author | luxagraf <sng@luxagraf.net> | 2016-02-18 09:35:45 -0500 |
---|---|---|
committer | luxagraf <sng@luxagraf.net> | 2016-02-18 09:35:45 -0500 |
commit | f520b80d69a9c51478a26f7c7e98a860e4e64c3d (patch) | |
tree | a30d4da7224e6b1b1b1ea2bc2ce21e5d8c3396f7 /app | |
parent | 3b6588ec9bdcc9a2a0099b5568923485079a4dec (diff) |
Added an AMP template filter to replace img tags and strip out the
disallowed HTML, because somewhere, to someone, it makes sense to speed up
pages by requiring JavaScript.
Diffstat (limited to 'app')
-rw-r--r-- | app/builder/sanitizer.py | 60 | ||||
-rw-r--r-- | app/lib/templatetags/templatetags/amp.py | 39 |
2 files changed, 99 insertions, 0 deletions
# app/builder/sanitizer.py (new file)


class Sanitizer(object):
    """Remove blacklisted tags, attributes and link protocols from HTML.

    The class-level lists act as defaults. Subclasses may override them, and
    callers may replace any of them per instance through the constructor.
    """

    # Tag names whose elements are removed from the parsed document.
    blacklisted_tags = []
    # Attribute names deleted from every element under <body>.
    blacklisted_attributes = []
    # URL schemes (e.g. "javascript"); an href using one of them is deleted.
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Override the class-level blacklists for this instance.

        Keyword arguments:
        tags -- tag names to remove; falsy keeps the class default.
        attributes -- attribute names to remove; falsy keeps the default.
        protocols -- href schemes to remove; falsy keeps the default.
        """
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.
        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)

        Returns the sanitized markup found inside <body>. When no content
        is given, the instance's ``content`` attribute is returned untouched
        if one exists; otherwise the (empty) argument is returned as-is.
        """
        if not content:
            # The class never defines ``content`` itself, so a plain
            # attribute access here raised AttributeError for empty input.
            return getattr(self, "content", content)

        # Imported here because this is the only place bs4 itself is needed;
        # the other methods only operate on an already-parsed soup.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)

        body = soup.body
        if body is None:
            # lxml builds no <body> for input without any markup or text
            # (e.g. whitespace only), so there is nothing to return.
            return ""
        return body.decode_contents()

    def strip_tags(self, soup):
        """Detach every element whose tag name is blacklisted."""
        if not self.blacklisted_tags:
            return
        for node in soup.find_all(self.blacklisted_tags):
            node.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses: extra per-element attribute clean-up."""
        pass

    def strip_attributes(self, soup):
        """Delete blacklisted attributes and blacklisted-protocol hrefs.

        Only elements below <body> are visited. Does nothing when both
        blacklists are empty or when the document has no <body>.
        """
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return

        body = soup.body
        if body is None:
            return

        # Schemes are compared case-insensitively and with whitespace
        # removed, so "JavaScript:" or " javascript:" cannot slip past a
        # "javascript" entry.
        blocked = set(
            "".join(protocol.split()).lower()
            for protocol in self.blacklisted_protocols
        )

        for node in body.find_all(True):
            if not node.attrs:
                continue

            for attr in self.blacklisted_attributes:
                node.attrs.pop(attr, None)

            self.strip_attributes_extra(node)

            href = node.attrs.get('href')
            if not href or not blocked:
                continue
            # Only a value containing ":" carries a scheme; a relative link
            # that merely looks like a protocol name must be left alone.
            scheme, colon, _rest = href.partition(':')
            if colon and "".join(scheme.split()).lower() in blocked:
                del node['href']
# app/lib/templatetags/templatetags/amp.py (new file)
from django import template
from PIL import Image
from io import BytesIO
try:
    import Image
    import ImageFile
except ImportError:
    try:
        from PIL import Image
        from PIL import ImageFile
    except ImportError:
        raise ImportError("Could not import the Python Imaging Library.")

import requests
from bs4 import BeautifulSoup
from builder.sanitizer import Sanitizer

register = template.Library()


def remove_img_tags(text, timeout=10):
    """Replace each <img> in *text* with an equivalent <amp-img> element.

    Every image is downloaded so its pixel dimensions can be written into
    the ``width`` and ``height`` attributes of the new tag.

    Keyword arguments:
    text -- markup to convert.
    timeout -- seconds to wait for each image download.

    Returns the prettified markup as a string. Errors from ``requests``
    (including non-2xx responses) and from PIL (data that is not an image)
    propagate to the caller.
    """
    soup = BeautifulSoup(text, 'xml')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # Nothing to download or measure; leave the tag as it is
            # instead of failing on the missing attribute.
            continue

        # A timeout keeps one unresponsive image host from hanging the
        # whole template render.
        response = requests.get(src, timeout=timeout)
        response.raise_for_status()
        width, height = Image.open(BytesIO(response.content)).size

        attrs = {
            'alt': img.get('alt', ''),
            'width': str(width),
            'height': str(height),
            'src': src,
        }
        # srcset is optional on the source tag; copy it only when present.
        srcset = img.get('srcset')
        if srcset is not None:
            attrs['srcset'] = srcset

        img.replace_with(soup.new_tag("amp-img", **attrs))
    return soup.prettify()


def do_amp(text):
    """Template filter: swap <img> for <amp-img>, then run the Sanitizer."""
    bs = remove_img_tags(text)
    return Sanitizer().strip(bs)


register.filter('amp', do_amp)