summaryrefslogtreecommitdiff
path: root/app/builder
diff options
context:
space:
mode:
author: luxagraf <sng@luxagraf.net> 2016-02-18 09:35:45 -0500
committer: luxagraf <sng@luxagraf.net> 2016-02-18 09:35:45 -0500
commit: f520b80d69a9c51478a26f7c7e98a860e4e64c3d (patch)
tree: a30d4da7224e6b1b1b1ea2bc2ce21e5d8c3396f7 /app/builder
parent: 3b6588ec9bdcc9a2a0099b5568923485079a4dec (diff)
Added an AMP template filter to replace img tags and strip out the
disallowed HTML, because somewhere, to someone, it makes sense to speed up pages by requiring JavaScript.
Diffstat (limited to 'app/builder')
-rw-r--r--app/builder/sanitizer.py60
1 files changed, 60 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py
new file mode 100644
index 0000000..8512f4f
--- /dev/null
+++ b/app/builder/sanitizer.py
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+
+
class Sanitizer(object):
    """Remove blacklisted tags, attributes and link protocols from HTML.

    The three blacklists can be supplied per instance through the
    constructor or overridden as class attributes by a subclass.
    Subclasses can also override ``strip_attributes_extra`` to apply
    additional per-node attribute rules.
    """

    # Class-level defaults. They are only ever rebound (never mutated in
    # place) by this class, so sharing the empty lists is safe.
    blacklisted_tags = []
    blacklisted_attributes = []
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Optionally override the class-level blacklists for this instance.

        Keyword arguments:
        tags -- tag names whose elements are removed entirely.
        attributes -- attribute names removed from every element.
        protocols -- href protocols (text before the first ':') whose
                     href attribute is removed.
        """
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.
        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)

        Returns the sanitized markup as a string. When no content is
        given, the instance's ``content`` attribute is returned as-is if
        one has been set; otherwise the (empty) argument itself is returned.
        """
        if not content:
            # This class never sets ``self.content``, so look it up
            # defensively instead of raising AttributeError.
            return getattr(self, "content", content)

        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)

        body = soup.body
        if body is None:
            # Nothing was parsed into a <body> (e.g. whitespace-only
            # input); serialize whatever the parser did produce.
            return soup.decode_contents()
        return body.decode_contents()

    def strip_tags(self, soup):
        """Remove every element whose tag name is blacklisted from soup."""
        if not self.blacklisted_tags:
            return
        for element in soup.find_all(self.blacklisted_tags):
            element.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses: apply extra attribute rules to node.

        Called once for every element that has at least one attribute,
        after blacklisted attributes are removed and before the href
        protocol check. The default implementation does nothing.
        """
        pass

    def strip_attributes(self, soup):
        """Remove blacklisted attributes and blacklisted-protocol hrefs.

        Walks every element under the document body (or the whole soup
        when there is no body) and edits the elements in place.
        """
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return

        # URL schemes are case-insensitive, so compare in lower case.
        banned_protocols = set(
            protocol.lower() for protocol in self.blacklisted_protocols
        )

        root = soup.body if soup.body is not None else soup
        for node in root.find_all(True):
            if not node.attrs:
                continue

            for attr in self.blacklisted_attributes:
                node.attrs.pop(attr, None)

            self.strip_attributes_extra(node)

            # Re-read the live attributes: href may already have been
            # removed by the blacklist or by the subclass hook.
            href = node.attrs.get('href')
            if href is None or ':' not in href:
                # No protocol part at all (e.g. a relative link).
                continue

            protocol = href.split(':', 1)[0]
            # Drop embedded/surrounding whitespace and normalize case so
            # values like " JavaScript:..." cannot slip past the list.
            protocol = ''.join(protocol.split()).lower()
            if protocol in banned_protocols:
                del node.attrs['href']