diff options
Diffstat (limited to 'app/builder/sanitizer.py')
-rw-r--r-- | app/builder/sanitizer.py | 60 |
1 files changed, 60 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py new file mode 100644 index 0000000..8512f4f --- /dev/null +++ b/app/builder/sanitizer.py @@ -0,0 +1,60 @@ +from bs4 import BeautifulSoup + + +class Sanitizer(object): + blacklisted_tags = [] + blacklisted_attributes = [] + blacklisted_protocols = [] + + def __init__(self, tags=None, attributes=None, protocols=None): + if tags: + self.blacklisted_tags = tags + if attributes: + self.blacklisted_attributes = attributes + if protocols: + self.blacklisted_protocols = protocols + + def strip(self, content=None): + """Strip HTML content to meet standards of output type. + Meant to be subclassed for each converter. + + Keyword arguments: + content -- subset of an HTML document. (ie. contents of a body tag) + """ + if not content: + content = self.content + return content + + soup = BeautifulSoup(content, "lxml") + self.strip_tags(soup) + self.strip_attributes(soup) + + output = soup.body.decode_contents() + return output + + def strip_tags(self, soup): + if self.blacklisted_tags: + [x.extract() for x in soup.find_all(self.blacklisted_tags)] + + def strip_attributes_extra(self, node): + pass + + def strip_attributes(self, soup): + if not (self.blacklisted_attributes or self.blacklisted_protocols): + return + + for node in soup.body.find_all(True): + attributes = node.attrs.keys() + if not attributes: + continue + + for attr in self.blacklisted_attributes: + if attr in attributes: + del node.attrs[attr] + + self.strip_attributes_extra(node) + + if 'href' in attributes: + protocol = node['href'].split(':')[0] + if protocol in self.blacklisted_protocols: + del node['href']
\ No newline at end of file |