test_html.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. import os
  2. from datetime import datetime
  3. from django.core.exceptions import SuspiciousOperation
  4. from django.core.serializers.json import DjangoJSONEncoder
  5. from django.test import SimpleTestCase
  6. from django.test.utils import override_settings
  7. from django.utils.deprecation import RemovedInDjango70Warning
  8. from django.utils.functional import lazystr
  9. from django.utils.html import (
  10. conditional_escape,
  11. escape,
  12. escapejs,
  13. format_html,
  14. format_html_join,
  15. html_safe,
  16. json_script,
  17. linebreaks,
  18. smart_urlquote,
  19. strip_spaces_between_tags,
  20. strip_tags,
  21. urlize,
  22. )
  23. from django.utils.safestring import mark_safe
  24. @override_settings(URLIZE_ASSUME_HTTPS=True)
  25. class TestUtilsHtml(SimpleTestCase):
  26. def check_output(self, function, value, output=None):
  27. """
  28. function(value) equals output. If output is None, function(value)
  29. equals value.
  30. """
  31. if output is None:
  32. output = value
  33. self.assertEqual(function(value), output)
  34. def test_escape(self):
  35. items = (
  36. ("&", "&"),
  37. ("<", "&lt;"),
  38. (">", "&gt;"),
  39. ('"', "&quot;"),
  40. ("'", "&#x27;"),
  41. )
  42. # Substitution patterns for testing the above items.
  43. patterns = ("%s", "asdf%sfdsa", "%s1", "1%sb")
  44. for value, output in items:
  45. with self.subTest(value=value, output=output):
  46. for pattern in patterns:
  47. with self.subTest(value=value, output=output, pattern=pattern):
  48. self.check_output(escape, pattern % value, pattern % output)
  49. self.check_output(
  50. escape, lazystr(pattern % value), pattern % output
  51. )
  52. # Check repeated values.
  53. self.check_output(escape, value * 2, output * 2)
  54. # Verify it doesn't double replace &.
  55. self.check_output(escape, "<&", "&lt;&amp;")
  56. def test_format_html(self):
  57. self.assertEqual(
  58. format_html(
  59. "{} {} {third} {fourth}",
  60. "< Dangerous >",
  61. mark_safe("<b>safe</b>"),
  62. third="< dangerous again",
  63. fourth=mark_safe("<i>safe again</i>"),
  64. ),
  65. "&lt; Dangerous &gt; <b>safe</b> &lt; dangerous again <i>safe again</i>",
  66. )
  67. def test_format_html_no_params(self):
  68. msg = "args or kwargs must be provided."
  69. with self.assertRaisesMessage(TypeError, msg):
  70. name = "Adam"
  71. self.assertEqual(format_html(f"<i>{name}</i>"), "<i>Adam</i>")
  72. def test_format_html_join_with_positional_arguments(self):
  73. self.assertEqual(
  74. format_html_join(
  75. "\n",
  76. "<li>{}) {}</li>",
  77. [(1, "Emma"), (2, "Matilda")],
  78. ),
  79. "<li>1) Emma</li>\n<li>2) Matilda</li>",
  80. )
  81. def test_format_html_join_with_keyword_arguments(self):
  82. self.assertEqual(
  83. format_html_join(
  84. "\n",
  85. "<li>{id}) {text}</li>",
  86. [{"id": 1, "text": "Emma"}, {"id": 2, "text": "Matilda"}],
  87. ),
  88. "<li>1) Emma</li>\n<li>2) Matilda</li>",
  89. )
  90. def test_linebreaks(self):
  91. items = (
  92. ("para1\n\npara2\r\rpara3", "<p>para1</p>\n\n<p>para2</p>\n\n<p>para3</p>"),
  93. (
  94. "para1\nsub1\rsub2\n\npara2",
  95. "<p>para1<br>sub1<br>sub2</p>\n\n<p>para2</p>",
  96. ),
  97. (
  98. "para1\r\n\r\npara2\rsub1\r\rpara4",
  99. "<p>para1</p>\n\n<p>para2<br>sub1</p>\n\n<p>para4</p>",
  100. ),
  101. ("para1\tmore\n\npara2", "<p>para1\tmore</p>\n\n<p>para2</p>"),
  102. )
  103. for value, output in items:
  104. with self.subTest(value=value, output=output):
  105. self.check_output(linebreaks, value, output)
  106. self.check_output(linebreaks, lazystr(value), output)
  107. def test_strip_tags(self):
  108. items = (
  109. (
  110. "<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>",
  111. "See: &#39;&eacute; is an apostrophe followed by e acute",
  112. ),
  113. (
  114. "<p>See: &#x27;&eacute; is an apostrophe followed by e acute</p>",
  115. "See: &#x27;&eacute; is an apostrophe followed by e acute",
  116. ),
  117. ("<adf>a", "a"),
  118. ("</adf>a", "a"),
  119. ("<asdf><asdf>e", "e"),
  120. ("hi, <f x", "hi, <f x"),
  121. ("234<235, right?", "234<235, right?"),
  122. ("a4<a5 right?", "a4<a5 right?"),
  123. ("b7>b2!", "b7>b2!"),
  124. ("</fe", "</fe"),
  125. ("<x>b<y>", "b"),
  126. ("a<p onclick=\"alert('<test>')\">b</p>c", "abc"),
  127. ("a<p a >b</p>c", "abc"),
  128. ("d<a:b c:d>e</p>f", "def"),
  129. ('<strong>foo</strong><a href="http://example.com">bar</a>', "foobar"),
  130. # caused infinite loop on Pythons not patched with
  131. # https://bugs.python.org/issue20288
  132. ("&gotcha&#;<>", "&gotcha&#;<>"),
  133. ("<sc<!-- -->ript>test<<!-- -->/script>", "ript>test"),
  134. ("<script>alert()</script>&h", "alert()h"),
  135. ("><!" + ("&" * 16000) + "D", "><!" + ("&" * 16000) + "D"),
  136. ("X<<<<br>br>br>br>X", "XX"),
  137. ("<" * 50 + "a>" * 50, ""),
  138. )
  139. for value, output in items:
  140. with self.subTest(value=value, output=output):
  141. self.check_output(strip_tags, value, output)
  142. self.check_output(strip_tags, lazystr(value), output)
  143. def test_strip_tags_suspicious_operation(self):
  144. value = "<" * 51 + "a>" * 51, "<a>"
  145. with self.assertRaises(SuspiciousOperation):
  146. strip_tags(value)
  147. def test_strip_tags_files(self):
  148. # Test with more lengthy content (also catching performance regressions)
  149. for filename in ("strip_tags1.html", "strip_tags2.txt"):
  150. with self.subTest(filename=filename):
  151. path = os.path.join(os.path.dirname(__file__), "files", filename)
  152. with open(path) as fp:
  153. content = fp.read()
  154. start = datetime.now()
  155. stripped = strip_tags(content)
  156. elapsed = datetime.now() - start
  157. self.assertEqual(elapsed.seconds, 0)
  158. self.assertIn("Test string that has not been stripped.", stripped)
  159. self.assertNotIn("<", stripped)
  160. def test_strip_spaces_between_tags(self):
  161. # Strings that should come out untouched.
  162. items = (" <adf>", "<adf> ", " </adf> ", " <f> x</f>")
  163. for value in items:
  164. with self.subTest(value=value):
  165. self.check_output(strip_spaces_between_tags, value)
  166. self.check_output(strip_spaces_between_tags, lazystr(value))
  167. # Strings that have spaces to strip.
  168. items = (
  169. ("<d> </d>", "<d></d>"),
  170. ("<p>hello </p>\n<p> world</p>", "<p>hello </p><p> world</p>"),
  171. ("\n<p>\t</p>\n<p> </p>\n", "\n<p></p><p></p>\n"),
  172. )
  173. for value, output in items:
  174. with self.subTest(value=value, output=output):
  175. self.check_output(strip_spaces_between_tags, value, output)
  176. self.check_output(strip_spaces_between_tags, lazystr(value), output)
  177. def test_escapejs(self):
  178. items = (
  179. (
  180. "\"double quotes\" and 'single quotes'",
  181. "\\u0022double quotes\\u0022 and \\u0027single quotes\\u0027",
  182. ),
  183. (r"\ : backslashes, too", "\\u005C : backslashes, too"),
  184. (
  185. "and lots of whitespace: \r\n\t\v\f\b",
  186. "and lots of whitespace: \\u000D\\u000A\\u0009\\u000B\\u000C\\u0008",
  187. ),
  188. (
  189. r"<script>and this</script>",
  190. "\\u003Cscript\\u003Eand this\\u003C/script\\u003E",
  191. ),
  192. (
  193. "paragraph separator:\u2029and line separator:\u2028",
  194. "paragraph separator:\\u2029and line separator:\\u2028",
  195. ),
  196. ("`", "\\u0060"),
  197. )
  198. for value, output in items:
  199. with self.subTest(value=value, output=output):
  200. self.check_output(escapejs, value, output)
  201. self.check_output(escapejs, lazystr(value), output)
  202. def test_json_script(self):
  203. tests = (
  204. # "<", ">" and "&" are quoted inside JSON strings
  205. (
  206. (
  207. "&<>",
  208. '<script id="test_id" type="application/json">'
  209. '"\\u0026\\u003C\\u003E"</script>',
  210. )
  211. ),
  212. # "<", ">" and "&" are quoted inside JSON objects
  213. (
  214. {"a": "<script>test&ing</script>"},
  215. '<script id="test_id" type="application/json">'
  216. '{"a": "\\u003Cscript\\u003Etest\\u0026ing\\u003C/script\\u003E"}'
  217. "</script>",
  218. ),
  219. # Lazy strings are quoted
  220. (
  221. lazystr("&<>"),
  222. '<script id="test_id" type="application/json">"\\u0026\\u003C\\u003E"'
  223. "</script>",
  224. ),
  225. (
  226. {"a": lazystr("<script>test&ing</script>")},
  227. '<script id="test_id" type="application/json">'
  228. '{"a": "\\u003Cscript\\u003Etest\\u0026ing\\u003C/script\\u003E"}'
  229. "</script>",
  230. ),
  231. )
  232. for arg, expected in tests:
  233. with self.subTest(arg=arg):
  234. self.assertEqual(json_script(arg, "test_id"), expected)
  235. def test_json_script_custom_encoder(self):
  236. class CustomDjangoJSONEncoder(DjangoJSONEncoder):
  237. def encode(self, o):
  238. return '{"hello": "world"}'
  239. self.assertHTMLEqual(
  240. json_script({}, encoder=CustomDjangoJSONEncoder),
  241. '<script type="application/json">{"hello": "world"}</script>',
  242. )
  243. def test_json_script_without_id(self):
  244. self.assertHTMLEqual(
  245. json_script({"key": "value"}),
  246. '<script type="application/json">{"key": "value"}</script>',
  247. )
  248. def test_smart_urlquote(self):
  249. items = (
  250. # IDN is encoded as percent-encoded ("quoted") UTF-8 (#36013).
  251. ("http://öäü.com/", "http://%C3%B6%C3%A4%C3%BC.com/"),
  252. ("https://faß.example.com", "https://fa%C3%9F.example.com"),
  253. (
  254. "http://öäü.com/öäü/",
  255. "http://%C3%B6%C3%A4%C3%BC.com/%C3%B6%C3%A4%C3%BC/",
  256. ),
  257. (
  258. # Valid under IDNA 2008, but was invalid in IDNA 2003.
  259. "https://މިހާރު.com",
  260. "https://%DE%89%DE%A8%DE%80%DE%A7%DE%83%DE%AA.com",
  261. ),
  262. (
  263. # Valid under WHATWG URL Specification but not IDNA 2008.
  264. "http://👓.ws",
  265. "http://%F0%9F%91%93.ws",
  266. ),
  267. # Pre-encoded IDNA is left unchanged.
  268. ("http://xn--iny-zx5a.com/idna2003", "http://xn--iny-zx5a.com/idna2003"),
  269. ("http://xn--fa-hia.com/idna2008", "http://xn--fa-hia.com/idna2008"),
  270. # Everything unsafe is quoted, !*'();:@&=+$,/?#[]~ is considered
  271. # safe as per RFC.
  272. (
  273. "http://example.com/path/öäü/",
  274. "http://example.com/path/%C3%B6%C3%A4%C3%BC/",
  275. ),
  276. ("http://example.com/%C3%B6/ä/", "http://example.com/%C3%B6/%C3%A4/"),
  277. ("http://example.com/?x=1&y=2+3&z=", "http://example.com/?x=1&y=2+3&z="),
  278. ("http://example.com/?x=<>\"'", "http://example.com/?x=%3C%3E%22%27"),
  279. (
  280. "http://example.com/?q=http://example.com/?x=1%26q=django",
  281. "http://example.com/?q=http%3A%2F%2Fexample.com%2F%3Fx%3D1%26q%3D"
  282. "django",
  283. ),
  284. (
  285. "http://example.com/?q=http%3A%2F%2Fexample.com%2F%3Fx%3D1%26q%3D"
  286. "django",
  287. "http://example.com/?q=http%3A%2F%2Fexample.com%2F%3Fx%3D1%26q%3D"
  288. "django",
  289. ),
  290. ("http://.www.f oo.bar/", "http://.www.f%20oo.bar/"),
  291. ('http://example.com">', "http://example.com%22%3E"),
  292. ("http://10.22.1.1/", "http://10.22.1.1/"),
  293. ("http://[fd00::1]/", "http://[fd00::1]/"),
  294. )
  295. for value, output in items:
  296. with self.subTest(value=value, output=output):
  297. self.assertEqual(smart_urlquote(value), output)
  298. def test_conditional_escape(self):
  299. s = "<h1>interop</h1>"
  300. self.assertEqual(conditional_escape(s), "&lt;h1&gt;interop&lt;/h1&gt;")
  301. self.assertEqual(conditional_escape(mark_safe(s)), s)
  302. self.assertEqual(conditional_escape(lazystr(mark_safe(s))), s)
  303. def test_html_safe(self):
  304. @html_safe
  305. class HtmlClass:
  306. def __str__(self):
  307. return "<h1>I'm a html class!</h1>"
  308. html_obj = HtmlClass()
  309. self.assertTrue(hasattr(HtmlClass, "__html__"))
  310. self.assertTrue(hasattr(html_obj, "__html__"))
  311. self.assertEqual(str(html_obj), html_obj.__html__())
  312. def test_html_safe_subclass(self):
  313. class BaseClass:
  314. def __html__(self):
  315. # defines __html__ on its own
  316. return "some html content"
  317. def __str__(self):
  318. return "some non html content"
  319. @html_safe
  320. class Subclass(BaseClass):
  321. def __str__(self):
  322. # overrides __str__ and is marked as html_safe
  323. return "some html safe content"
  324. subclass_obj = Subclass()
  325. self.assertEqual(str(subclass_obj), subclass_obj.__html__())
  326. def test_html_safe_defines_html_error(self):
  327. msg = "can't apply @html_safe to HtmlClass because it defines __html__()."
  328. with self.assertRaisesMessage(ValueError, msg):
  329. @html_safe
  330. class HtmlClass:
  331. def __html__(self):
  332. return "<h1>I'm a html class!</h1>"
  333. def test_html_safe_doesnt_define_str(self):
  334. msg = "can't apply @html_safe to HtmlClass because it doesn't define __str__()."
  335. with self.assertRaisesMessage(ValueError, msg):
  336. @html_safe
  337. class HtmlClass:
  338. pass
  339. def test_urlize(self):
  340. tests = (
  341. (
  342. "Search for google.com/?q=! and see.",
  343. 'Search for <a href="https://google.com/?q=">google.com/?q=</a>! and '
  344. "see.",
  345. ),
  346. (
  347. "Search for google.com/?q=1&lt! and see.",
  348. 'Search for <a href="https://google.com/?q=1%3C">google.com/?q=1&lt'
  349. "</a>! and see.",
  350. ),
  351. (
  352. lazystr("Search for google.com/?q=!"),
  353. 'Search for <a href="https://google.com/?q=">google.com/?q=</a>!',
  354. ),
  355. (
  356. "http://www.foo.bar/",
  357. '<a href="http://www.foo.bar/">http://www.foo.bar/</a>',
  358. ),
  359. (
  360. "Look on www.نامه‌ای.com.",
  361. "Look on <a "
  362. 'href="https://www.%D9%86%D8%A7%D9%85%D9%87%E2%80%8C%D8%A7%DB%8C.com"'
  363. ">www.نامه‌ای.com</a>.",
  364. ),
  365. ("foo@example.com", '<a href="mailto:foo@example.com">foo@example.com</a>'),
  366. (
  367. "test@" + "한.글." * 15 + "aaa",
  368. '<a href="mailto:test@'
  369. + "%ED%95%9C.%EA%B8%80." * 15
  370. + 'aaa">'
  371. + "test@"
  372. + "한.글." * 15
  373. + "aaa</a>",
  374. ),
  375. (
  376. # RFC 6068 requires a mailto URI to percent-encode a number of
  377. # characters that can appear in <addr-spec>.
  378. "yes+this=is&a%valid!email@example.com",
  379. '<a href="mailto:yes%2Bthis%3Dis%26a%25valid%21email@example.com"'
  380. ">yes+this=is&a%valid!email@example.com</a>",
  381. ),
  382. (
  383. "foo@faß.example.com",
  384. '<a href="mailto:foo@fa%C3%9F.example.com">foo@faß.example.com</a>',
  385. ),
  386. (
  387. "idna-2008@މިހާރު.example.mv",
  388. '<a href="mailto:idna-2008@%DE%89%DE%A8%DE%80%DE%A7%DE%83%DE%AA.ex'
  389. 'ample.mv">idna-2008@މިހާރު.example.mv</a>',
  390. ),
  391. )
  392. for value, output in tests:
  393. with self.subTest(value=value):
  394. self.assertEqual(urlize(value), output)
  395. @override_settings(URLIZE_ASSUME_HTTPS=False)
  396. def test_urlize_http_default_warning(self):
  397. msg = (
  398. "The default protocol will be changed from HTTP to HTTPS in Django 7.0. "
  399. "Set the URLIZE_ASSUME_HTTPS transitional setting to True to opt into "
  400. "using HTTPS as the new default protocol."
  401. )
  402. with self.assertWarnsMessage(RemovedInDjango70Warning, msg):
  403. self.assertEqual(
  404. urlize("Visit example.com"),
  405. 'Visit <a href="http://example.com">example.com</a>',
  406. )
  407. def test_urlize_unchanged_inputs(self):
  408. tests = (
  409. ("a" + "@a" * 50000) + "a", # simple_email_re catastrophic test
  410. # Unicode domain catastrophic tests.
  411. "a@" + "한.글." * 1_000_000 + "a",
  412. "http://" + "한.글." * 1_000_000 + "com",
  413. "www." + "한.글." * 1_000_000 + "com",
  414. ("a" + "." * 1000000) + "a", # trailing_punctuation catastrophic test
  415. "foo@",
  416. "@foo.com",
  417. "foo@.example.com",
  418. "foo@localhost",
  419. "foo@localhost.",
  420. "test@example?;+!.com",
  421. "email me@example.com,then I'll respond",
  422. # trim_punctuation catastrophic tests
  423. "(" * 100_000 + ":" + ")" * 100_000,
  424. "(" * 100_000 + "&:" + ")" * 100_000,
  425. "([" * 100_000 + ":" + "])" * 100_000,
  426. "[(" * 100_000 + ":" + ")]" * 100_000,
  427. "([[" * 100_000 + ":" + "]])" * 100_000,
  428. "&:" + ";" * 100_000,
  429. "&.;" * 100_000,
  430. ".;" * 100_000,
  431. "&" + ";:" * 100_000,
  432. )
  433. for value in tests:
  434. with self.subTest(value=value):
  435. self.assertEqual(urlize(value), value)