diff options
author | luxagraf <sng@luxagraf.net> | 2016-02-18 09:35:45 -0500 |
---|---|---|
committer | luxagraf <sng@luxagraf.net> | 2016-02-18 09:35:45 -0500 |
commit | f520b80d69a9c51478a26f7c7e98a860e4e64c3d (patch) | |
tree | a30d4da7224e6b1b1b1ea2bc2ce21e5d8c3396f7 /app/builder | |
parent | 3b6588ec9bdcc9a2a0099b5568923485079a4dec (diff) |
added AMP template filter to replace img tags and strip out the
disallowed HTML. Because somewhere to someone it makes sense to speed up
pages by requiring javascript.
Diffstat (limited to 'app/builder')
-rw-r--r-- | app/builder/sanitizer.py | 60 |
1 files changed, 60 insertions, 0 deletions
class Sanitizer(object):
    """Remove blacklisted tags, attributes and link protocols from HTML.

    The three ``blacklisted_*`` lists are class-level defaults. A subclass
    may override them, and ``__init__`` may replace them per instance.
    """

    # Tag names whose elements are removed from the document.
    blacklisted_tags = []
    # Attribute names deleted from every element that carries them.
    blacklisted_attributes = []
    # URL schemes, written without the colon, whose ``href`` is dropped.
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Optionally override the class-level blacklists for this instance.

        Keyword arguments:
        tags -- tag names to remove
        attributes -- attribute names to remove
        protocols -- href schemes to remove (compared case-insensitively)

        An argument that is None or empty leaves the class default in place.
        """
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.
        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)

        Returns the inner HTML of the parsed <body> once the blacklists
        have been applied. When ``content`` is empty, ``self.content`` is
        returned untouched if the instance defines it; otherwise the empty
        value is returned as given.
        """
        if not content:
            # The base class never sets ``content``; fall back to the
            # argument instead of raising AttributeError.
            return getattr(self, 'content', content)

        # Imported on first use so the class itself can be imported, and
        # its parser-independent paths used, without bs4 being installed.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)

        body = soup.body
        if body is None:
            # No <body> element is left (the parser built none, or it was
            # removed as a blacklisted tag), so there is nothing to emit.
            return ""
        return body.decode_contents()

    def strip_tags(self, soup):
        """Detach every element whose tag name is blacklisted.

        Does nothing when ``blacklisted_tags`` is empty.
        """
        if not self.blacklisted_tags:
            return
        for element in soup.find_all(self.blacklisted_tags):
            element.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses; does nothing by default.

        Called by ``strip_attributes`` for each element that has
        attributes, after blacklisted attributes have been deleted and
        before the ``href`` protocol check.
        """
        pass

    def strip_attributes(self, soup):
        """Delete blacklisted attributes and blacklisted-protocol hrefs.

        Returns immediately, without calling ``strip_attributes_extra``,
        when both the attribute and the protocol blacklist are empty, or
        when the document has no <body>.
        """
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return

        body = soup.body
        if body is None:
            return

        # Built once per call; schemes are matched case-insensitively.
        blocked_schemes = set(
            protocol.lower() for protocol in self.blacklisted_protocols
        )

        for node in body.find_all(True):
            if not node.attrs:
                continue

            for name in self.blacklisted_attributes:
                node.attrs.pop(name, None)

            self.strip_attributes_extra(node)

            # Read the live attribute dict: ``href`` may have been removed
            # above or by the subclass hook.
            href = node.attrs.get('href')
            if href is None:
                continue

            # Browsers ignore case and tab/newline characters when reading
            # a URL scheme, so normalise before comparing. This is a
            # heuristic, not a full URL parser.
            scheme = ''.join(href.split(':', 1)[0].split()).lower()
            if scheme in blocked_schemes:
                del node.attrs['href']
\ No newline at end of file |