diff options
Diffstat (limited to 'app')
-rw-r--r-- | app/builder/sanitizer.py | 60 | ||||
-rw-r--r-- | app/lib/templatetags/templatetags/amp.py | 39 |
2 files changed, 99 insertions, 0 deletions
diff --git a/app/builder/sanitizer.py b/app/builder/sanitizer.py new file mode 100644 index 0000000..8512f4f --- /dev/null +++ b/app/builder/sanitizer.py @@ -0,0 +1,60 @@ +from bs4 import BeautifulSoup + + +class Sanitizer(object): + blacklisted_tags = [] + blacklisted_attributes = [] + blacklisted_protocols = [] + + def __init__(self, tags=None, attributes=None, protocols=None): + if tags: + self.blacklisted_tags = tags + if attributes: + self.blacklisted_attributes = attributes + if protocols: + self.blacklisted_protocols = protocols + + def strip(self, content=None): + """Strip HTML content to meet standards of output type. + Meant to be subclassed for each converter. + + Keyword arguments: + content -- subset of an HTML document. (ie. contents of a body tag) + """ + if not content: + content = self.content + return content + + soup = BeautifulSoup(content, "lxml") + self.strip_tags(soup) + self.strip_attributes(soup) + + output = soup.body.decode_contents() + return output + + def strip_tags(self, soup): + if self.blacklisted_tags: + [x.extract() for x in soup.find_all(self.blacklisted_tags)] + + def strip_attributes_extra(self, node): + pass + + def strip_attributes(self, soup): + if not (self.blacklisted_attributes or self.blacklisted_protocols): + return + + for node in soup.body.find_all(True): + attributes = node.attrs.keys() + if not attributes: + continue + + for attr in self.blacklisted_attributes: + if attr in attributes: + del node.attrs[attr] + + self.strip_attributes_extra(node) + + if 'href' in attributes: + protocol = node['href'].split(':')[0] + if protocol in self.blacklisted_protocols: + del node['href']
\ No newline at end of file diff --git a/app/lib/templatetags/templatetags/amp.py b/app/lib/templatetags/templatetags/amp.py new file mode 100644 index 0000000..9c6f118 --- /dev/null +++ b/app/lib/templatetags/templatetags/amp.py @@ -0,0 +1,39 @@ +from django import template +from PIL import Image +from io import BytesIO +try: + import Image + import ImageFile +except ImportError: + try: + from PIL import Image + from PIL import ImageFile + except ImportError: + raise ImportError("Could not import the Python Imaging Library.") + +import requests +from bs4 import BeautifulSoup +from builder.sanitizer import Sanitizer + +register = template.Library() + + +def remove_img_tags(text): + soup = BeautifulSoup(text, 'xml') + for img in soup.find_all('img'): + r = requests.get(img['src']) + i = Image.open(BytesIO(r.content)) + width, height = i.size + try: + new_tag = soup.new_tag("amp-img", alt=img["alt"], width=width, height=height, src=img['src'], srcset=img['srcset']) + except: + new_tag = soup.new_tag("amp-img", alt=img["alt"], width=width, height=height, src=img['src']) + img.replace_with(new_tag) + return soup.prettify() + + +def do_amp(text): + bs = remove_img_tags(text) + return Sanitizer().strip(bs) + +register.filter('amp', do_amp) |