summary | refs | log | tree | commit | diff
path: root/app
diff options
context:
space:
mode:
Diffstat (limited to 'app')
-rw-r--r-- app/builder/sanitizer.py 60
-rw-r--r-- app/lib/templatetags/templatetags/amp.py 39
2 files changed, 99 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py
new file mode 100644
index 0000000..8512f4f
--- /dev/null
+++ b/app/builder/sanitizer.py
@@ -0,0 +1,60 @@
+from bs4 import BeautifulSoup
+
+
class Sanitizer(object):
    """Remove blacklisted tags, attributes and link protocols from HTML.

    The three blacklists default to empty, in which case ``strip`` only
    re-serialises the markup.  Subclasses (one per output format) can
    override the class-level lists and the ``strip_attributes_extra`` hook.
    """

    # Class-level defaults.  They are only ever replaced, never mutated in
    # place, so sharing them between instances is safe.
    blacklisted_tags = []
    blacklisted_attributes = []
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Override the class-level blacklists for this instance.

        Keyword arguments:
        tags -- tag names whose elements are removed entirely
        attributes -- attribute names removed from every element
        protocols -- URL schemes (e.g. "javascript") not allowed in href

        A falsy argument leaves the corresponding class default in place.
        """
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.
        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)

        Returns the sanitised inner HTML of the parsed body, or an empty
        string when there is nothing to sanitise.
        """
        if not content:
            # Subclasses may carry their own ``content``; the base class
            # does not define one, so look it up defensively.
            content = getattr(self, "content", None)
        if not content:
            return ""

        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)

        body = soup.body
        if body is None:
            # The parser produced no <body> (e.g. nothing but a comment).
            return ""
        return body.decode_contents()

    def strip_tags(self, soup):
        """Detach every element whose tag name is blacklisted."""
        if not self.blacklisted_tags:
            return
        for element in soup.find_all(self.blacklisted_tags):
            element.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses: extra per-element attribute clean-up."""
        pass

    @staticmethod
    def _href_protocol(href):
        """Return the normalised URL scheme of *href*, or None if it has none.

        Whitespace and control characters are dropped and the result is
        lower-cased so that variants such as " JavaScript:" are still
        recognised.
        """
        scheme, colon, _rest = href.partition(':')
        if not colon:
            # No colon means a relative URL, not a protocol.
            return None
        return ''.join(ch for ch in scheme if ch > ' ').lower()

    def strip_attributes(self, soup):
        """Remove blacklisted attributes and hrefs with blacklisted schemes.

        Every element below ``soup.body`` that has attributes is visited;
        ``strip_attributes_extra`` is called on each of them after the
        blacklisted attributes have been removed.
        """
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return

        body = soup.body
        if body is None:
            return

        blocked_protocols = set(
            protocol.lower() for protocol in self.blacklisted_protocols
        )

        for node in body.find_all(True):
            if not node.attrs:
                continue

            for name in self.blacklisted_attributes:
                node.attrs.pop(name, None)

            self.strip_attributes_extra(node)

            # Re-read attrs: the hook above may have changed them.
            href = node.attrs.get('href')
            if not href:
                continue
            if self._href_protocol(href) in blocked_protocols:
                del node.attrs['href']
diff --git a/app/lib/templatetags/templatetags/amp.py b/app/lib/templatetags/templatetags/amp.py
new file mode 100644
index 0000000..9c6f118
--- /dev/null
+++ b/app/lib/templatetags/templatetags/amp.py
@@ -0,0 +1,39 @@
+from django import template
+from PIL import Image
+from io import BytesIO
+try:
+ import Image
+ import ImageFile
+except ImportError:
+ try:
+ from PIL import Image
+ from PIL import ImageFile
+ except ImportError:
+ raise ImportError("Could not import the Python Imaging Library.")
+
+import requests
+from bs4 import BeautifulSoup
+from builder.sanitizer import Sanitizer
+
+register = template.Library()
+
+
def remove_img_tags(text, timeout=10):
    """Replace each ``<img>`` in *text* with an equivalent ``<amp-img>``.

    The image behind every ``src`` is downloaded to read its pixel size,
    which is written to the ``width`` and ``height`` attributes of the new
    tag.  ``alt`` and ``src`` are copied over, and ``srcset`` too when the
    original tag has one.

    Keyword arguments:
    text -- markup to convert
    timeout -- seconds to wait for each image download

    Returns the prettified markup.  Download and image-decoding errors
    propagate to the caller.
    """
    # NOTE(review): ``src`` is fetched as-is; if *text* can come from
    # untrusted authors this lets them trigger server-side requests to
    # arbitrary URLs -- confirm the input is trusted.
    soup = BeautifulSoup(text, 'xml')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # Without a source there is nothing to measure; leave the tag.
            continue

        # A timeout keeps one unresponsive host from hanging the render.
        response = requests.get(src, timeout=timeout)
        width, height = Image.open(BytesIO(response.content)).size

        attrs = {
            'alt': img.get('alt', ''),
            'width': width,
            'height': height,
            'src': src,
        }
        if 'srcset' in img.attrs:
            attrs['srcset'] = img['srcset']

        img.replace_with(soup.new_tag('amp-img', **attrs))
    return soup.prettify()
+
+
@register.filter(name='amp')
def do_amp(text):
    """Template filter ``amp``: convert images, then sanitise the markup.

    ``<img>`` tags are first rewritten by ``remove_img_tags``; the result
    is then passed through a default ``Sanitizer``.
    """
    converted = remove_img_tags(text)
    sanitizer = Sanitizer()
    return sanitizer.strip(converted)