initial commit

author: lxf <sng@luxagraf.net> 2022-05-14 16:38:07 -0400
committer: lxf <sng@luxagraf.net> 2022-05-14 16:38:07 -0400
commit: bb3973ffb714c932e9ec6dd6a751228dc71fe1d3 (patch)
tree: 6fa32f9392ad2ec32271349b86a4c1388fd6ba95 /app/builder/sanitizer.py
1 files changed, 60 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py
new file mode 100644
index 0000000..8512f4f
--- /dev/null
+++ b/app/builder/sanitizer.py
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+
+
+class Sanitizer(object):
+    """Remove blacklisted tags, attributes and link protocols from HTML.
+
+    The class-level lists are the defaults and are meant to be overridden
+    by subclasses; non-empty constructor arguments replace them for one
+    instance.
+    """
+
+    # Defaults used when the constructor receives no (or empty) values.
+    blacklisted_tags = []
+    blacklisted_attributes = []
+    blacklisted_protocols = []
+
+    # Characters that may precede a URL scheme without changing how the
+    # URL is interpreted (C0 control characters and space).
+    _URL_LEADING_JUNK = "".join(chr(code) for code in range(0x21))
+
+    def __init__(self, tags=None, attributes=None, protocols=None):
+        """Override the class-level blacklists for this instance.
+
+        Keyword arguments:
+        tags -- tag names whose elements are removed entirely.
+        attributes -- attribute names removed from every element.
+        protocols -- URL schemes (e.g. "javascript") not allowed in href.
+
+        An empty or missing argument leaves the class default in place.
+        """
+        if tags:
+            self.blacklisted_tags = tags
+        if attributes:
+            self.blacklisted_attributes = attributes
+        if protocols:
+            self.blacklisted_protocols = protocols
+
+    def strip(self, content=None):
+        """Strip HTML content to meet standards of output type.
+        Meant to be subclassed for each converter.
+
+        Keyword arguments:
+        content -- subset of an HTML document. (ie. contents of a body tag)
+
+        When content is empty, the instance's ``content`` attribute is
+        used instead if a subclass provides one. The fallback is sanitized
+        like any other input. If nothing is available, the empty value is
+        returned unchanged.
+
+        Returns the sanitized inner HTML of the parsed body as a string.
+        """
+        if not content:
+            # This class never sets ``content`` itself, so look it up
+            # defensively instead of raising AttributeError.
+            content = getattr(self, "content", None)
+            if not content:
+                return content
+
+        soup = BeautifulSoup(content, "lxml")
+        self.strip_tags(soup)
+        self.strip_attributes(soup)
+
+        # There is no body if "body" itself was blacklisted and extracted,
+        # or if the parser did not build one for this input.
+        body = soup.body
+        if body is None:
+            return ""
+        return body.decode_contents()
+
+    def strip_tags(self, soup):
+        """Remove every element whose tag name is blacklisted, in place."""
+        if not self.blacklisted_tags:
+            return
+        for element in soup.find_all(self.blacklisted_tags):
+            element.extract()
+
+    def strip_attributes_extra(self, node):
+        """Hook for subclasses: extra per-node attribute clean-up.
+
+        Called for each node that has attributes, after the blacklisted
+        attributes were removed and before the href protocol check.
+        """
+        pass
+
+    @classmethod
+    def _href_protocol(cls, href):
+        """Return the lower-cased scheme of href, or None without one."""
+        if not isinstance(href, str):
+            return None
+        # Per the URL parsing rules, tabs and newlines anywhere in a URL
+        # and leading control characters/spaces are ignored, so they must
+        # not be allowed to hide a blacklisted scheme.
+        cleaned = "".join(ch for ch in href if ch not in "\t\n\r")
+        cleaned = cleaned.lstrip(cls._URL_LEADING_JUNK)
+        scheme, separator, _rest = cleaned.partition(":")
+        if not separator:
+            # No colon: a relative reference, not a protocol.
+            return None
+        return scheme.lower()
+
+    def strip_attributes(self, soup):
+        """Remove blacklisted attributes and blacklisted-protocol hrefs.
+
+        Works in place on every element below ``soup.body``. Does nothing
+        when both blacklists are empty or the document has no body.
+        """
+        if not (self.blacklisted_attributes or self.blacklisted_protocols):
+            return
+
+        body = soup.body
+        if body is None:
+            return
+
+        # Normalised once so the comparison below is case-insensitive.
+        blocked = {str(item).lower() for item in self.blacklisted_protocols}
+
+        for node in body.find_all(True):
+            if not node.attrs:
+                continue
+
+            for attr in self.blacklisted_attributes:
+                node.attrs.pop(attr, None)
+
+            self.strip_attributes_extra(node)
+
+            # Re-read attrs: the hook above may have changed them.
+            if self._href_protocol(node.attrs.get("href")) in blocked:
+                del node.attrs["href"]
+\ No newline at end of file
author	lxf <sng@luxagraf.net>	2022-05-14 16:38:07 -0400
committer	lxf <sng@luxagraf.net>	2022-05-14 16:38:07 -0400
commit	bb3973ffb714c932e9ec6dd6a751228dc71fe1d3 (patch)
tree	6fa32f9392ad2ec32271349b86a4c1388fd6ba95 /app/builder/sanitizer.py