|
@@ -3,11 +3,14 @@
|
|
|
import re
|
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
-WHITESPACE = re.compile(r'\s+')
|
|
|
+# ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020
|
|
|
+# SPACE.
|
|
|
+# https://infra.spec.whatwg.org/#ascii-whitespace
|
|
|
+ASCII_WHITESPACE = re.compile(r'[\t\n\f\r ]+')
|
|
|
|
|
|
|
|
|
def normalize_whitespace(string):
|
|
|
- return WHITESPACE.sub(' ', string)
|
|
|
+ return ASCII_WHITESPACE.sub(' ', string)
|
|
|
|
|
|
|
|
|
class Element:
|
|
@@ -144,7 +147,7 @@ class Parser(HTMLParser):
|
|
|
)
|
|
|
|
|
|
def __init__(self):
|
|
|
- super().__init__(convert_charrefs=False)
|
|
|
+ super().__init__()
|
|
|
self.root = RootElement()
|
|
|
self.open_tags = []
|
|
|
self.element_positions = {}
|
|
@@ -202,12 +205,6 @@ class Parser(HTMLParser):
|
|
|
def handle_data(self, data):
|
|
|
self.current.append(data)
|
|
|
|
|
|
- def handle_charref(self, name):
|
|
|
- self.current.append('&%s;' % name)
|
|
|
-
|
|
|
- def handle_entityref(self, name):
|
|
|
- self.current.append('&%s;' % name)
|
|
|
-
|
|
|
|
|
|
def parse_html(html):
|
|
|
"""
|