Added AMP template filter to replace img tags and strip out the disallowed HTML.

Because somewhere, to someone, it makes sense to speed up pages by requiring JavaScript.
author: luxagraf <sng@luxagraf.net> 2016-02-18 09:35:45 -0500
committer: luxagraf <sng@luxagraf.net> 2016-02-18 09:35:45 -0500
commit: f520b80d69a9c51478a26f7c7e98a860e4e64c3d (patch)
tree: a30d4da7224e6b1b1b1ea2bc2ce21e5d8c3396f7 /app
parent: 3b6588ec9bdcc9a2a0099b5568923485079a4dec (diff)
2 files changed, 99 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py
new file mode 100644
index 0000000..8512f4f
--- /dev/null
+++ b/app/builder/sanitizer.py
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+
+
+class Sanitizer(object):
+    """Blacklist-based HTML sanitizer; subclass it for each converter."""
+    blacklisted_tags = []
+    blacklisted_attributes = []
+    blacklisted_protocols = []
+
+    def __init__(self, tags=None, attributes=None, protocols=None):
+        """Override the class-level blacklists with any non-empty argument."""
+        if tags:
+            self.blacklisted_tags = tags
+        if attributes:
+            self.blacklisted_attributes = attributes
+        if protocols:
+            self.blacklisted_protocols = protocols
+
+    def strip(self, content=None):
+        """Return ``content`` with blacklisted tags and attributes removed.
+
+        content -- subset of an HTML document. (ie. contents of a body tag)
+        Empty content falls back, unprocessed, to ``self.content`` when a
+        subclass defines it; otherwise the empty value is returned as-is.
+        """
+        if not content:
+            # The base class defines no ``content`` attribute; don't crash.
+            return getattr(self, "content", content)
+
+        soup = BeautifulSoup(content, "lxml")
+        if soup.body is None:  # parser found no markup to wrap in a body
+            return ""
+        self.strip_tags(soup)
+        self.strip_attributes(soup)
+        return soup.body.decode_contents()
+
+    def strip_tags(self, soup):
+        """Remove every blacklisted tag, with its contents, from ``soup``."""
+        if self.blacklisted_tags:
+            for tag in soup.find_all(self.blacklisted_tags):
+                tag.extract()
+
+    def strip_attributes_extra(self, node):
+        """Hook for subclasses: extra cleanup of one tag's attributes."""
+
+    def strip_attributes(self, soup):
+        """Drop blacklisted attributes and blacklisted-protocol hrefs."""
+        if not (self.blacklisted_attributes or self.blacklisted_protocols):
+            return
+        # URL schemes are case-insensitive, so compare them lowercased.
+        banned = {proto.lower() for proto in self.blacklisted_protocols}
+        for node in soup.body.find_all(True):
+            if not node.attrs:
+                continue
+            for attr in self.blacklisted_attributes:
+                node.attrs.pop(attr, None)
+            self.strip_attributes_extra(node)
+            href = node.attrs.get('href')
+            if href and href.split(':')[0].strip().lower() in banned:
+                del node['href']
+\ No newline at end of file
diff --git a/app/lib/templatetags/templatetags/amp.py b/app/lib/templatetags/templatetags/amp.py
new file mode 100644
index 0000000..9c6f118
--- /dev/null
+++ b/app/lib/templatetags/templatetags/amp.py
@@ -0,0 +1,39 @@
+from django import template
+from PIL import Image
+from io import BytesIO
+try:
+    import Image
+    import ImageFile
+except ImportError:
+    try:
+        from PIL import Image
+        from PIL import ImageFile
+    except ImportError:
+        raise ImportError("Could not import the Python Imaging Library.")
+
+import requests
+from bs4 import BeautifulSoup
+from builder.sanitizer import Sanitizer
+
+register = template.Library()  # registry this module's template filters are added to
+
+
+def remove_img_tags(text):
+    """Swap each <img> for <amp-img>, downloading src to measure its size."""
+    soup = BeautifulSoup(text, 'xml')
+    for img in soup.find_all('img'):
+        # Bounded wait: an unreachable image host must not hang the render.
+        r = requests.get(img['src'], timeout=10)
+        width, height = Image.open(BytesIO(r.content)).size
+        attrs = dict(alt=img.get("alt", ""), width=width, height=height)
+        # srcset is optional on the source tag; copy it only when present.
+        attrs.update((k, img[k]) for k in ("src", "srcset") if k in img.attrs)
+        img.replace_with(soup.new_tag("amp-img", **attrs))
+    return soup.prettify()
+
+
+def do_amp(text):
+    """Template filter: convert images in ``text`` for AMP, then sanitize."""
+    return Sanitizer().strip(remove_img_tags(text))
+
+register.filter('amp', do_amp)  # register do_amp under the filter name ``amp``
author	luxagraf <sng@luxagraf.net>	2016-02-18 09:35:45 -0500
committer	luxagraf <sng@luxagraf.net>	2016-02-18 09:35:45 -0500
commit	f520b80d69a9c51478a26f7c7e98a860e4e64c3d (patch)
tree	a30d4da7224e6b1b1b1ea2bc2ce21e5d8c3396f7 /app
parent	3b6588ec9bdcc9a2a0099b5568923485079a4dec (diff)