from bs4 import BeautifulSoup
class Sanitizer(object):
    """Strip blacklisted tags, attributes, and link protocols from HTML.

    Meant to be subclassed for each output converter; subclasses may
    override ``strip_attributes_extra`` for format-specific handling.
    """

    # Class-level defaults; instance values may be supplied via __init__.
    blacklisted_tags = []
    blacklisted_attributes = []
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Optionally override the blacklists.

        Keyword arguments:
        tags -- tag names whose elements (and contents) are removed
        attributes -- attribute names removed from every element
        protocols -- URL schemes that cause an href to be dropped
        """
        # Only truthy values override the class-level defaults.
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.

        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)
        """
        if not content:
            # NOTE(review): self.content is expected to be set by a
            # subclass or caller before strip() is invoked -- confirm.
            content = self.content
        if not content:
            # Nothing to sanitize; also avoids parsing empty input,
            # where soup.body would not exist.
            return content
        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)
        # lxml wraps the fragment in <html><body>; return only the body's
        # inner markup so the output is still a fragment.
        return soup.body.decode_contents()

    def strip_tags(self, soup):
        """Remove every blacklisted tag (and its contents) from the tree."""
        if self.blacklisted_tags:
            # Plain loop instead of a side-effect list comprehension.
            for tag in soup.find_all(self.blacklisted_tags):
                tag.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses: converter-specific attribute stripping."""
        pass

    def strip_attributes(self, soup):
        """Drop blacklisted attributes and hrefs using blacklisted protocols."""
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return
        for node in soup.body.find_all(True):
            # Test node.attrs directly rather than holding a keys() view
            # while mutating the underlying dict.
            if not node.attrs:
                continue
            for attr in self.blacklisted_attributes:
                if attr in node.attrs:
                    del node.attrs[attr]
            self.strip_attributes_extra(node)
            if 'href' in node.attrs:
                # Scheme is everything before the first ':' (e.g. "javascript").
                protocol = node['href'].split(':')[0]
                if protocol in self.blacklisted_protocols:
                    del node['href']