html_parser.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. from django.utils.six.moves import html_parser as _html_parser
  2. import re
  3. import sys
  4. current_version = sys.version_info
  5. use_workaround = (
  6. (current_version < (2, 7, 3)) or
  7. (current_version >= (3, 0) and current_version < (3, 2, 3))
  8. )
  9. HTMLParseError = _html_parser.HTMLParseError
  10. if not use_workaround:
  11. HTMLParser = _html_parser.HTMLParser
  12. else:
  13. tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
  14. class HTMLParser(_html_parser.HTMLParser):
  15. """
  16. Patched version of stdlib's HTMLParser with patch from:
  17. http://bugs.python.org/issue670664
  18. """
  19. def __init__(self):
  20. _html_parser.HTMLParser.__init__(self)
  21. self.cdata_tag = None
  22. def set_cdata_mode(self, tag):
  23. try:
  24. self.interesting = _html_parser.interesting_cdata
  25. except AttributeError:
  26. self.interesting = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
  27. self.cdata_tag = tag.lower()
  28. def clear_cdata_mode(self):
  29. self.interesting = _html_parser.interesting_normal
  30. self.cdata_tag = None
  31. # Internal -- handle starttag, return end or -1 if not terminated
  32. def parse_starttag(self, i):
  33. self.__starttag_text = None
  34. endpos = self.check_for_whole_start_tag(i)
  35. if endpos < 0:
  36. return endpos
  37. rawdata = self.rawdata
  38. self.__starttag_text = rawdata[i:endpos]
  39. # Now parse the data between i+1 and j into a tag and attrs
  40. attrs = []
  41. match = tagfind.match(rawdata, i + 1)
  42. assert match, 'unexpected call to parse_starttag()'
  43. k = match.end()
  44. self.lasttag = tag = match.group(1).lower()
  45. while k < endpos:
  46. m = _html_parser.attrfind.match(rawdata, k)
  47. if not m:
  48. break
  49. attrname, rest, attrvalue = m.group(1, 2, 3)
  50. if not rest:
  51. attrvalue = None
  52. elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
  53. attrvalue[:1] == '"' == attrvalue[-1:]):
  54. attrvalue = attrvalue[1:-1]
  55. if attrvalue:
  56. attrvalue = self.unescape(attrvalue)
  57. attrs.append((attrname.lower(), attrvalue))
  58. k = m.end()
  59. end = rawdata[k:endpos].strip()
  60. if end not in (">", "/>"):
  61. lineno, offset = self.getpos()
  62. if "\n" in self.__starttag_text:
  63. lineno = lineno + self.__starttag_text.count("\n")
  64. offset = (len(self.__starttag_text)
  65. - self.__starttag_text.rfind("\n"))
  66. else:
  67. offset = offset + len(self.__starttag_text)
  68. self.error("junk characters in start tag: %r"
  69. % (rawdata[k:endpos][:20],))
  70. if end.endswith('/>'):
  71. # XHTML-style empty tag: <span attr="value" />
  72. self.handle_startendtag(tag, attrs)
  73. else:
  74. self.handle_starttag(tag, attrs)
  75. if tag in self.CDATA_CONTENT_ELEMENTS:
  76. self.set_cdata_mode(tag) # <--------------------------- Changed
  77. return endpos
  78. # Internal -- parse endtag, return end or -1 if incomplete
  79. def parse_endtag(self, i):
  80. rawdata = self.rawdata
  81. assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
  82. match = _html_parser.endendtag.search(rawdata, i + 1) # >
  83. if not match:
  84. return -1
  85. j = match.end()
  86. match = _html_parser.endtagfind.match(rawdata, i) # </ + tag + >
  87. if not match:
  88. if self.cdata_tag is not None: # *** add ***
  89. self.handle_data(rawdata[i:j]) # *** add ***
  90. return j # *** add ***
  91. self.error("bad end tag: %r" % (rawdata[i:j],))
  92. # --- changed start ---------------------------------------------------
  93. tag = match.group(1).strip()
  94. if self.cdata_tag is not None:
  95. if tag.lower() != self.cdata_tag:
  96. self.handle_data(rawdata[i:j])
  97. return j
  98. # --- changed end -----------------------------------------------------
  99. self.handle_endtag(tag.lower())
  100. self.clear_cdata_mode()
  101. return j