123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- """
- A generic HTML whitelisting engine, designed to accommodate subclassing to override
- specific rules.
- """
- import re
- from bs4 import BeautifulSoup, Comment, NavigableString, Tag
- from django.utils.html import escape
# URL schemes that check_url() lets through; a URL carrying any other
# explicit scheme is rejected.
ALLOWED_URL_SCHEMES = [
    'http',
    'https',
    'ftp',
    'mailto',
    'tel',
]

# Matches a leading "scheme:" prefix. Only lower-case letters are accepted,
# so the candidate string must be lower-cased before matching.
PROTOCOL_RE = re.compile(r"^[a-z0-9][-+.a-z0-9]*:")
def check_url(url_string):
    """
    Return url_string unchanged if it is acceptable as a link target, or None
    if it carries an explicit scheme that is not in ALLOWED_URL_SCHEMES.

    The scheme test runs on a normalised copy of the URL (lower-cased, basic
    HTML entities unescaped, control characters / whitespace / backticks
    stripped); the value handed back to the caller is always the original,
    unmodified string. URLs with no scheme prefix (e.g. relative paths) are
    passed through.
    """
    normalised = url_string.lower()

    # Undo the basic HTML escapes so the scheme test sees the literal
    # characters. '&amp;' is handled last, matching the order used before.
    for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
        normalised = normalised.replace(entity, char)

    # Remove control characters and other disallowed characters.
    # Browsers sometimes ignore these, so that 'jav\tascript:alert("XSS")'
    # is treated as a valid javascript: link
    normalised = re.sub(r'[`\000-\040\177-\240\s]+', '', normalised)
    # Drop the Unicode replacement character as well.
    normalised = normalised.replace("\ufffd", "")

    if PROTOCOL_RE.match(normalised):
        protocol = normalised.split(':', 1)[0]
        if protocol not in ALLOWED_URL_SCHEMES:
            return None
    return url_string
def attribute_rule(allowed_attrs):
    """
    Build a function suitable for use as a value in Whitelister.element_rules.

    The returned function takes a tag and filters its attributes in place,
    looking each attribute name up in the 'allowed_attrs' dict:
    * no entry, or a falsy entry: the attribute is removed
    * a callable entry: the attribute value is replaced by the result of
      calling it with the current value - e.g. {'title': uppercase} stores the
      uppercased title. If the callable returns None, the attribute is removed
    * any other truthy entry: the attribute is kept as it is
    """
    def fn(tag):
        # Iterate over a snapshot, since attributes are deleted / reassigned
        # on the tag while we go.
        for name, current in list(tag.attrs.items()):
            handler = allowed_attrs.get(name)

            if not handler:
                # absent or falsy rule - remove the attribute
                del tag[name]
                continue

            if not callable(handler):
                # plain truthy rule - leave the attribute untouched
                continue

            replacement = handler(current)
            if replacement is None:
                del tag[name]
            else:
                tag[name] = replacement

    return fn
# Rule for elements that are permitted, but only with every attribute removed.
allow_without_attributes = attribute_rule({})

# Elements that are allowed to keep some of their attributes.
_ELEMENT_RULES_WITH_ATTRIBUTES = {
    'a': attribute_rule({'href': check_url}),
    'img': attribute_rule({
        'src': check_url,
        'width': True,
        'height': True,
        'alt': True,
    }),
}

# Every element name the default whitelist accepts, in listing order.
_DEFAULT_ALLOWED_ELEMENTS = (
    '[document]',
    'a', 'b', 'br', 'div', 'em',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'i', 'img', 'li', 'ol', 'p',
    'strong', 'sub', 'sup', 'ul',
)

# Maps element name -> rule function; names without a specific rule above
# share the attribute-stripping rule.
DEFAULT_ELEMENT_RULES = {
    element: _ELEMENT_RULES_WITH_ATTRIBUTES.get(element, allow_without_attributes)
    for element in _DEFAULT_ALLOWED_ELEMENTS
}
class Whitelister:
    """
    Cleans HTML down to the elements and attributes permitted by
    'element_rules', a dict mapping element names to rule functions.
    Subclasses can replace 'element_rules' or override the clean_* hooks.
    """
    element_rules = DEFAULT_ELEMENT_RULES

    def clean(self, html):
        """Return the given HTML string reduced to just the allowed elements /
        attributes"""
        soup = BeautifulSoup(html, 'html5lib')
        self.clean_node(soup, soup)

        # Render strings with django.utils.html.escape rather than
        # BeautifulSoup's default EntitySubstitution.substitute_html formatter.
        # escape also turns " into &quot; (on top of < > &); without that,
        # BeautifulSoup will try to be clever and wrap attribute values in
        # single quotes, which confuses our regexp-based db-HTML-to-real-HTML
        # conversion.
        return soup.decode(formatter=escape)

    def clean_node(self, doc, node):
        """Clean one node of a BeautifulSoup document in-place, dispatching on
        the node's type"""
        if isinstance(node, NavigableString):
            handler = self.clean_string_node
        elif isinstance(node, Tag):
            handler = self.clean_tag_node
        # Fallback for a BeautifulSoup object that inherits from neither
        # NavigableString nor Tag. No example of such an object is known at
        # the moment, so this branch is untested.
        else:  # pragma: no cover
            handler = self.clean_unknown_node
        handler(doc, node)

    def clean_string_node(self, doc, node):
        """Strip comments out of the document; any other string node is left
        alone"""
        if isinstance(node, Comment):
            node.extract()

    def clean_tag_node(self, doc, tag):
        """Clean the children of a tag, then apply the rule registered for the
        tag's name - or unwrap the tag if there is no such rule"""
        # Children go first. tag.contents changes while the children are being
        # cleaned, so walk a static copy to avoid losing our place.
        for child in list(tag.contents):
            self.clean_node(doc, child)

        try:
            rule = self.element_rules[tag.name]
        except KeyError:
            # unrecognised tag name: remove the tag itself via unwrap()
            tag.unwrap()
        else:
            rule(tag)

    def clean_unknown_node(self, doc, node):
        """Remove a node of unrecognised type from the document entirely"""
        node.decompose()
|