summaryrefslogtreecommitdiff
path: root/app/builder/sanitizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'app/builder/sanitizer.py')
-rw-r--r--app/builder/sanitizer.py60
1 files changed, 60 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py
new file mode 100644
index 0000000..8512f4f
--- /dev/null
+++ b/app/builder/sanitizer.py
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+
+
class Sanitizer(object):
    """Remove blacklisted tags, attributes and link protocols from HTML.

    Meant to be subclassed for each converter: a subclass may override the
    class-level blacklists below and/or ``strip_attributes_extra``.
    """

    # Class-level defaults. They are never mutated in place; ``__init__``
    # rebinds them on the instance when explicit values are supplied.
    blacklisted_tags = []
    blacklisted_attributes = []
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Create a sanitizer, optionally overriding the class blacklists.

        Keyword arguments:
        tags -- tag names whose elements are removed from the document.
        attributes -- attribute names removed from every element.
        protocols -- URL schemes (e.g. "javascript") not allowed in ``href``.

        A falsy argument (``None`` or an empty list) keeps the class default.
        """
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.
        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)

        Returns the sanitized markup as a string. When ``content`` is falsy
        the instance's ``content`` attribute is returned if one exists,
        otherwise ``content`` itself is returned unchanged.
        """
        if not content:
            # This class never sets ``self.content``; only a subclass might.
            # Use getattr so an empty input does not raise AttributeError.
            # NOTE(review): the fallback value is returned without being
            # sanitized, exactly as before -- confirm this is intended.
            return getattr(self, 'content', content)

        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)

        return self._root(soup).decode_contents()

    def strip_tags(self, soup):
        """Remove every element whose tag name is blacklisted (in place)."""
        if not self.blacklisted_tags:
            return

        for element in soup.find_all(self.blacklisted_tags):
            element.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses: extra per-node attribute clean-up.

        Called for every element that has at least one attribute, after the
        blacklisted attributes were removed and before the ``href`` protocol
        check. The default implementation does nothing.
        """
        pass

    def strip_attributes(self, soup):
        """Remove blacklisted attributes and blacklisted-protocol hrefs."""
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return

        # Schemes are case-insensitive, so compare in lower case.
        banned_protocols = set(p.lower() for p in self.blacklisted_protocols)

        for node in self._root(soup).find_all(True):
            if not node.attrs:
                continue

            for attr in self.blacklisted_attributes:
                node.attrs.pop(attr, None)

            self.strip_attributes_extra(node)

            # Re-read the attributes: 'href' may have been removed above or
            # by the subclass hook.
            href = node.attrs.get('href')
            if href is None:
                continue

            if self._href_protocol(href) in banned_protocols:
                del node['href']

    @staticmethod
    def _root(soup):
        """Return ``soup.body``, or ``soup`` itself when there is no body."""
        body = soup.body
        return soup if body is None else body

    @staticmethod
    def _href_protocol(href):
        """Return the lower-cased URL scheme of ``href``, or None if absent.

        Whitespace (including tabs and newlines) is dropped before the scheme
        is read so that values such as " JavaScript:" or "java\\tscript:"
        cannot slip past the blacklist. A value without ':' is a relative
        URL and has no scheme.
        """
        compact = ''.join(href.split())
        scheme, colon, _ = compact.partition(':')
        if not colon:
            return None
        return scheme.lower()