فهرست منبع

Update tests & html whitelist to always use `html.parser` & remove `html5lib`

- It's faster, built in, and saves the complexities of swappable backends and needing to test all combinations.
- Remove additional dependency on `html5lib`
- See https://beautiful-soup-4.readthedocs.io/en/latest/index.html#installing-a-parser
Jake Howard 1 سال پیش
والد
کامیت
e1e3c8efde

+ 2 - 0
CHANGELOG.txt

@@ -6,8 +6,10 @@ Changelog
 
  * Refine wording of page & collection privacy using password is a shared password and should not be used for secure content (Rohit Sharma, Jake Howard)
  * Add RelatedObjectsColumn to the table UI framework (Matt Westcott)
+ * Move RichText HTML whitelist parser to use the faster, built in `html.parser` (Jake Howard)
  * Docs: Add contributing development documentation on how to work with a fork of Wagtail (Nix Asteri, Dan Braghis)
  * Maintenance: Remove duplicate 'path' in default_exclude_fields_in_copy (ramchandra-st)
+ * Maintenance: Update unit tests to always use the faster, built in `html.parser` & remove `html5lib` dependency (Jake Howard)
 
 
 6.0 (xx.xx.xxxx) - IN DEVELOPMENT

+ 0 - 1
setup.py

@@ -29,7 +29,6 @@ install_requires = [
     "draftjs_exporter>=2.1.5,<6.0",
     "Pillow>=9.1.0,<11.0.0",
     "beautifulsoup4>=4.8,<4.13",
-    "html5lib>=0.999,<2",
     "Willow[heif]>=1.6.2,<2",
     "requests>=2.11.1,<3.0",
     "l18n>=2018.5",

+ 5 - 5
wagtail/admin/tests/test_compare.py

@@ -153,13 +153,13 @@ class TestRichTextFieldComparison(TestFieldComparison):
             SimplePage._meta.get_field("content"),
             SimplePage(content="Original content"),
             SimplePage(
-                content='<script type="text/javascript">doSomethingBad();</script>'
+                content='Do something good. <script type="text/javascript">doSomethingBad();</script>'
             ),
         )
 
         self.assertEqual(
             comparison.htmldiff(),
-            '<span class="deletion">Original content</span><span class="addition">doSomethingBad();</span>',
+            '<span class="deletion">Original content</span><span class="addition">Do something good.</span>',
         )
         self.assertIsInstance(comparison.htmldiff(), SafeString)
 
@@ -491,7 +491,7 @@ class TestStreamFieldComparison(TestCase):
 
         self.assertEqual(
             comparison.htmldiff(),
-            '<div class="comparison__child-object">I really like <span class="deletion">Wagtail &lt;3</span><span class="addition">evil code &gt;_&lt; doSomethingBad();</span></div>',
+            '<div class="comparison__child-object">I really like <span class="deletion">Wagtail &lt;3</span><span class="addition">evil code &gt;_&lt;</span></div>',
         )
         self.assertIsInstance(comparison.htmldiff(), SafeString)
 
@@ -525,7 +525,7 @@ class TestStreamFieldComparison(TestCase):
 
         self.assertEqual(
             comparison.htmldiff(),
-            '<div class="comparison__child-object">Original and unchanged content</div>\n<div class="comparison__child-object addition">I really like evil code &gt;_&lt; doSomethingBad();</div>',
+            '<div class="comparison__child-object">Original and unchanged content</div>\n<div class="comparison__child-object addition">I really like evil code &gt;_&lt;</div>',
         )
         self.assertIsInstance(comparison.htmldiff(), SafeString)
 
@@ -559,7 +559,7 @@ class TestStreamFieldComparison(TestCase):
 
         self.assertEqual(
             comparison.htmldiff(),
-            '<div class="comparison__child-object">Original and unchanged content</div>\n<div class="comparison__child-object deletion">I really like evil code &gt;_&lt; doSomethingBad();</div>',
+            '<div class="comparison__child-object">Original and unchanged content</div>\n<div class="comparison__child-object deletion">I really like evil code &gt;_&lt;</div>',
         )
         self.assertIsInstance(comparison.htmldiff(), SafeString)
 

+ 4 - 5
wagtail/admin/tests/test_dbwhitelister.py

@@ -9,7 +9,7 @@ class TestDbWhitelisterMethods(WagtailTestUtils, TestCase):
         self.whitelister = EditorHTMLConverter().whitelister
 
     def test_clean_tag_node_div(self):
-        soup = self.get_soup("<div>foo</div>", "html5lib")
+        soup = self.get_soup("<div>foo</div>")
         tag = soup.div
         self.assertEqual(tag.name, "div")
         self.whitelister.clean_tag_node(soup, tag)
@@ -18,7 +18,6 @@ class TestDbWhitelisterMethods(WagtailTestUtils, TestCase):
     def test_clean_tag_node_with_data_embedtype(self):
         soup = self.get_soup(
             '<p><a data-embedtype="image" data-id=1 data-format="left" data-alt="bar" irrelevant="baz">foo</a></p>',
-            "html5lib",
         )
         tag = soup.p
         self.whitelister.clean_tag_node(soup, tag)
@@ -29,14 +28,13 @@ class TestDbWhitelisterMethods(WagtailTestUtils, TestCase):
     def test_clean_tag_node_with_data_linktype(self):
         soup = self.get_soup(
             '<a data-linktype="document" data-id="1" irrelevant="baz">foo</a>',
-            "html5lib",
         )
         tag = soup.a
         self.whitelister.clean_tag_node(soup, tag)
         self.assertEqual(str(tag), '<a id="1" linktype="document">foo</a>')
 
     def test_clean_tag_node(self):
-        soup = self.get_soup('<a irrelevant="baz">foo</a>', "html5lib")
+        soup = self.get_soup('<a irrelevant="baz">foo</a>')
         tag = soup.a
         self.whitelister.clean_tag_node(soup, tag)
         self.assertEqual(str(tag), "<a>foo</a>")
@@ -52,7 +50,8 @@ class TestDbWhitelister(WagtailTestUtils, TestCase):
         (necessary because we can't guarantee the order that attributes are output in)
         """
         self.assertEqual(
-            self.get_soup(str1, "html5lib"), self.get_soup(str2, "html5lib")
+            self.get_soup(str1),
+            self.get_soup(str2),
         )
 
     def test_page_link_is_rewritten(self):

+ 2 - 2
wagtail/test/utils/wagtail_tests.py

@@ -37,8 +37,8 @@ def override_settings(**kwargs):
 
 class WagtailTestUtils:
     @staticmethod
-    def get_soup(markup: Union[str, bytes], parser="html.parser") -> BeautifulSoup:
-        return BeautifulSoup(markup, parser)
+    def get_soup(markup: Union[str, bytes]) -> BeautifulSoup:
+        return BeautifulSoup(markup, "html.parser")
 
     @staticmethod
     def create_test_user():

+ 7 - 7
wagtail/tests/test_whitelist.py

@@ -28,7 +28,7 @@ class TestCheckUrl(TestCase):
 
 class TestAttributeRule(WagtailTestUtils, TestCase):
     def setUp(self):
-        self.soup = self.get_soup('<b foo="bar">baz</b>', "html5lib")
+        self.soup = self.get_soup('<b foo="bar">baz</b>')
 
     def test_no_rule_for_attr(self):
         """
@@ -88,7 +88,7 @@ class TestAttributeRule(WagtailTestUtils, TestCase):
         attributes.
         """
         soup = self.get_soup(
-            '<b foo="bar" baz="quux" snowman="barbecue"></b>', "html5lib"
+            '<b foo="bar" baz="quux" snowman="barbecue"></b>',
         )
         tag = soup.b
         allow_without_attributes(tag)
@@ -103,7 +103,7 @@ class TestWhitelister(WagtailTestUtils, TestCase):
         """
         Unknown node should remove a node from the parent document
         """
-        soup = self.get_soup("<foo><bar>baz</bar>quux</foo>", "html5lib")
+        soup = self.get_soup("<foo><bar>baz</bar>quux</foo>")
         tag = soup.foo
         self.whitelister.clean_unknown_node("", soup.bar)
         self.assertEqual(str(tag), "<foo>quux</foo>")
@@ -113,7 +113,7 @@ class TestWhitelister(WagtailTestUtils, TestCase):
         <b> tags are allowed without attributes. This remains true
         when tags are nested.
         """
-        soup = self.get_soup('<b><b class="delete me">foo</b></b>', "html5lib")
+        soup = self.get_soup('<b><b class="delete me">foo</b></b>')
         tag = soup.b
         self.whitelister.clean_tag_node(tag, tag)
         self.assertEqual(str(tag), "<b><b>foo</b></b>")
@@ -122,19 +122,19 @@ class TestWhitelister(WagtailTestUtils, TestCase):
         """
         <foo> tags should be removed, even when nested.
         """
-        soup = self.get_soup("<b><foo>bar</foo></b>", "html5lib")
+        soup = self.get_soup("<b><foo>bar</foo></b>")
         tag = soup.b
         self.whitelister.clean_tag_node(tag, tag)
         self.assertEqual(str(tag), "<b>bar</b>")
 
     def test_clean_string_node_does_nothing(self):
-        soup = self.get_soup("<b>bar</b>", "html5lib")
+        soup = self.get_soup("<b>bar</b>")
         string = soup.b.string
         self.whitelister.clean_string_node(string, string)
         self.assertEqual(str(string), "bar")
 
     def test_clean_node_does_not_change_navigable_strings(self):
-        soup = self.get_soup("<b>bar</b>", "html5lib")
+        soup = self.get_soup("<b>bar</b>")
         string = soup.b.string
         self.whitelister.clean_node(string, string)
         self.assertEqual(str(string), "bar")

+ 1 - 1
wagtail/utils/text.py

@@ -4,4 +4,4 @@ from django.utils.encoding import force_str
 
 def text_from_html(val):
     # Return the unescaped text content of an HTML string
-    return BeautifulSoup(force_str(val), "html5lib").getText()
+    return BeautifulSoup(force_str(val), "html.parser").getText().strip()

+ 1 - 1
wagtail/whitelist.py

@@ -99,7 +99,7 @@ class Whitelister:
     def clean(self, html):
         """Clean up an HTML string to contain just the allowed elements /
         attributes"""
-        doc = BeautifulSoup(html, "html5lib")
+        doc = BeautifulSoup(html, "html.parser")
         self.clean_node(doc, doc)
 
         # Pass strings through django.utils.html.escape when generating the final HTML.