whitelist.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. """
  2. A generic HTML whitelisting engine, designed to accommodate subclassing to override
  3. specific rules.
  4. """
  5. import re
  6. from bs4 import BeautifulSoup, Comment, NavigableString, Tag
  7. from django.utils.html import escape
  8. ALLOWED_URL_SCHEMES = ['http', 'https', 'ftp', 'mailto', 'tel']
  9. PROTOCOL_RE = re.compile("^[a-z0-9][-+.a-z0-9]*:")
  10. def check_url(url_string):
  11. # Remove control characters and other disallowed characters
  12. # Browsers sometimes ignore these, so that 'jav\tascript:alert("XSS")'
  13. # is treated as a valid javascript: link
  14. unescaped = url_string.lower()
  15. unescaped = unescaped.replace("&lt;", "<")
  16. unescaped = unescaped.replace("&gt;", ">")
  17. unescaped = unescaped.replace("&amp;", "&")
  18. unescaped = re.sub(r'[`\000-\040\177-\240\s]+', '', unescaped)
  19. unescaped = unescaped.replace("\ufffd", "")
  20. if PROTOCOL_RE.match(unescaped):
  21. protocol = unescaped.split(':', 1)[0]
  22. if protocol not in ALLOWED_URL_SCHEMES:
  23. return None
  24. return url_string
  25. def attribute_rule(allowed_attrs):
  26. """
  27. Generator for functions that can be used as entries in Whitelister.element_rules.
  28. These functions accept a tag, and modify its attributes by looking each attribute
  29. up in the 'allowed_attrs' dict defined here:
  30. * if the lookup fails, drop the attribute
  31. * if the lookup returns a callable, replace the attribute with the result of calling
  32. it - e.g. {'title': uppercase} will replace 'title' with the result of uppercasing
  33. the title. If the callable returns None, the attribute is dropped
  34. * if the lookup returns a truthy value, keep the attribute; if falsy, drop it
  35. """
  36. def fn(tag):
  37. for attr, val in list(tag.attrs.items()):
  38. rule = allowed_attrs.get(attr)
  39. if rule:
  40. if callable(rule):
  41. new_val = rule(val)
  42. if new_val is None:
  43. del tag[attr]
  44. else:
  45. tag[attr] = new_val
  46. else:
  47. # rule is not callable, just truthy - keep the attribute
  48. pass
  49. else:
  50. # rule is falsy or absent - remove the attribute
  51. del tag[attr]
  52. return fn
  53. allow_without_attributes = attribute_rule({})
  54. DEFAULT_ELEMENT_RULES = {
  55. '[document]': allow_without_attributes,
  56. 'a': attribute_rule({'href': check_url}),
  57. 'b': allow_without_attributes,
  58. 'br': allow_without_attributes,
  59. 'div': allow_without_attributes,
  60. 'em': allow_without_attributes,
  61. 'h1': allow_without_attributes,
  62. 'h2': allow_without_attributes,
  63. 'h3': allow_without_attributes,
  64. 'h4': allow_without_attributes,
  65. 'h5': allow_without_attributes,
  66. 'h6': allow_without_attributes,
  67. 'hr': allow_without_attributes,
  68. 'i': allow_without_attributes,
  69. 'img': attribute_rule({'src': check_url, 'width': True, 'height': True,
  70. 'alt': True}),
  71. 'li': allow_without_attributes,
  72. 'ol': allow_without_attributes,
  73. 'p': allow_without_attributes,
  74. 'strong': allow_without_attributes,
  75. 'sub': allow_without_attributes,
  76. 'sup': allow_without_attributes,
  77. 'ul': allow_without_attributes,
  78. }
  79. class Whitelister:
  80. element_rules = DEFAULT_ELEMENT_RULES
  81. def clean(self, html):
  82. """Clean up an HTML string to contain just the allowed elements /
  83. attributes"""
  84. doc = BeautifulSoup(html, 'html5lib')
  85. self.clean_node(doc, doc)
  86. # Pass strings through django.utils.html.escape when generating the final HTML.
  87. # This differs from BeautifulSoup's default EntitySubstitution.substitute_html formatter
  88. # in that it escapes " to &quot; as well as escaping < > & - if we don't do this, then
  89. # BeautifulSoup will try to be clever and use single-quotes to wrap attribute values,
  90. # which confuses our regexp-based db-HTML-to-real-HTML conversion.
  91. return doc.decode(formatter=escape)
  92. def clean_node(self, doc, node):
  93. """Clean a BeautifulSoup document in-place"""
  94. if isinstance(node, NavigableString):
  95. self.clean_string_node(doc, node)
  96. elif isinstance(node, Tag):
  97. self.clean_tag_node(doc, node)
  98. # This branch is here in case node is a BeautifulSoup object that does
  99. # not inherit from NavigableString or Tag. I can't find any examples
  100. # of such a thing at the moment, so this branch is untested.
  101. else: # pragma: no cover
  102. self.clean_unknown_node(doc, node)
  103. def clean_string_node(self, doc, node):
  104. # Remove comments
  105. if isinstance(node, Comment):
  106. node.extract()
  107. return
  108. # by default, nothing needs to be done to whitelist string nodes
  109. pass
  110. def clean_tag_node(self, doc, tag):
  111. # first, whitelist the contents of this tag
  112. # NB tag.contents will change while this iteration is running, so we need
  113. # to capture the initial state into a static list() and iterate over that
  114. # to avoid losing our place in the sequence.
  115. for child in list(tag.contents):
  116. self.clean_node(doc, child)
  117. # see if there is a rule in element_rules for this tag type
  118. try:
  119. rule = self.element_rules[tag.name]
  120. except KeyError:
  121. # don't recognise this tag name, so KILL IT WITH FIRE
  122. tag.unwrap()
  123. return
  124. # apply the rule
  125. rule(tag)
  126. def clean_unknown_node(self, doc, node):
  127. # don't know what type of object this is, so KILL IT WITH FIRE
  128. node.decompose()