From f520b80d69a9c51478a26f7c7e98a860e4e64c3d Mon Sep 17 00:00:00 2001
From: luxagraf <sng@luxagraf.net>
Date: Thu, 18 Feb 2016 09:35:45 -0500
Subject: Added AMP template filter to replace img tags and strip out the
 disallowed HTML, because somewhere to someone it makes sense to speed up
 pages by requiring JavaScript.

---
 app/builder/sanitizer.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 app/builder/sanitizer.py

(limited to 'app/builder')

diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py
new file mode 100644
index 0000000..8512f4f
--- /dev/null
+++ b/app/builder/sanitizer.py
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+
+
+class Sanitizer(object):
+    """Strip blacklisted tags, attributes and href protocols from HTML."""
+    blacklisted_tags = []
+    blacklisted_attributes = []
+    blacklisted_protocols = []
+
+    def __init__(self, tags=None, attributes=None, protocols=None):
+        """Replace a class-level blacklist with each non-empty argument."""
+        if tags:
+            self.blacklisted_tags = tags
+        if attributes:
+            self.blacklisted_attributes = attributes
+        if protocols:
+            self.blacklisted_protocols = protocols
+
+    def strip(self, content=None):
+        """Strip HTML content to meet standards of output type.
+        Meant to be subclassed for each converter; returns the cleaned markup.
+
+        Keyword arguments:
+        content -- subset of an HTML document. (ie. contents of a body tag)
+        """
+        if not content:
+            # Use a subclass-provided ``content`` if any; this class sets none.
+            return getattr(self, 'content', content)
+        soup = BeautifulSoup(content, "lxml")
+        self.strip_tags(soup)
+        if soup.body is None:
+            return ""  # no <body> left to serialise (e.g. blank input)
+        self.strip_attributes(soup)
+        return soup.body.decode_contents()
+
+    def strip_tags(self, soup):
+        """Remove every blacklisted tag, with its contents, from soup."""
+        if self.blacklisted_tags:
+            for tag in soup.find_all(self.blacklisted_tags):
+                tag.extract()
+
+    def strip_attributes_extra(self, node):
+        """Hook for subclasses: extra per-node attribute clean-up."""
+
+    def strip_attributes(self, soup):
+        """Drop blacklisted attributes and hrefs using a blacklisted protocol."""
+        if not (self.blacklisted_attributes or self.blacklisted_protocols):
+            return
+        # URL schemes are case-insensitive, so compare in lower case.
+        protocols = {p.lower() for p in self.blacklisted_protocols}
+        for node in soup.body.find_all(True):
+            if not node.attrs:
+                continue
+            for attr in self.blacklisted_attributes:
+                node.attrs.pop(attr, None)
+            self.strip_attributes_extra(node)
+            if 'href' in node.attrs:
+                protocol = node['href'].split(':')[0].strip().lower()
+                if protocol in protocols:
+                    del node['href']
\ No newline at end of file
-- 
cgit v1.2.3