from bs4 import BeautifulSoup
class Sanitizer(object):
    """Strip blacklisted tags, attributes, and link protocols from HTML.

    Meant to be subclassed for each output converter; subclasses may
    override ``strip_attributes_extra`` for format-specific handling.
    """

    # Class-level defaults; instance values may be supplied via __init__.
    blacklisted_tags = []
    blacklisted_attributes = []
    blacklisted_protocols = []

    def __init__(self, tags=None, attributes=None, protocols=None):
        """Optionally override the blacklists.

        Keyword arguments:
        tags -- tag names whose elements (and contents) are removed
        attributes -- attribute names removed from every element
        protocols -- URL schemes that cause an href to be dropped
        """
        # Only truthy values override the class-level defaults.
        if tags:
            self.blacklisted_tags = tags
        if attributes:
            self.blacklisted_attributes = attributes
        if protocols:
            self.blacklisted_protocols = protocols

    def strip(self, content=None):
        """Strip HTML content to meet standards of output type.

        Meant to be subclassed for each converter.

        Keyword arguments:
        content -- subset of an HTML document. (ie. contents of a body tag)
        """
        if not content:
            # NOTE(review): self.content is expected to be set by a
            # subclass or caller before strip() is invoked -- confirm.
            content = self.content
        if not content:
            # Nothing to sanitize; also avoids parsing empty input,
            # where soup.body would not exist.
            return content
        soup = BeautifulSoup(content, "lxml")
        self.strip_tags(soup)
        self.strip_attributes(soup)
        # lxml wraps the fragment in <html><body>; return only the body's
        # inner markup so the output is still a fragment.
        return soup.body.decode_contents()

    def strip_tags(self, soup):
        """Remove every blacklisted tag (and its contents) from the tree."""
        if self.blacklisted_tags:
            # Plain loop instead of a side-effect list comprehension.
            for tag in soup.find_all(self.blacklisted_tags):
                tag.extract()

    def strip_attributes_extra(self, node):
        """Hook for subclasses: converter-specific attribute stripping."""
        pass

    def strip_attributes(self, soup):
        """Drop blacklisted attributes and hrefs using blacklisted protocols."""
        if not (self.blacklisted_attributes or self.blacklisted_protocols):
            return
        for node in soup.body.find_all(True):
            # Test node.attrs directly rather than holding a keys() view
            # while mutating the underlying dict.
            if not node.attrs:
                continue
            for attr in self.blacklisted_attributes:
                if attr in node.attrs:
                    del node.attrs[attr]
            self.strip_attributes_extra(node)
            if 'href' in node.attrs:
                # Scheme is everything before the first ':' (e.g. "javascript").
                protocol = node['href'].split(':')[0]
                if protocol in self.blacklisted_protocols:
                    del node['href']