8 years ago · 03281d8fe7
--- a/django/test/client.py
+++ b/django/test/client.py
@@ -6,7 +6,7 @@ import sys
 
				 from copy import copy
			
 
				 from importlib import import_module
			
 
				 from io import BytesIO
			
 
				-from urllib.parse import urljoin, urlparse, urlsplit
			
 
				+from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
			
 
				 
			
 
				 from django.conf import settings
			
 
				 from django.core.handlers.base import BaseHandler
			
@@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
 
				 from django.test import signals
			
 
				 from django.test.utils import ContextList
			
 
				 from django.urls import resolve
			
 
				-from django.utils.encoding import force_bytes, uri_to_iri
			
 
				+from django.utils.encoding import force_bytes
			
 
				 from django.utils.functional import SimpleLazyObject, curry
			
 
				 from django.utils.http import urlencode
			
 
				 from django.utils.itercompat import is_iterable
			
@@ -320,7 +320,7 @@ class RequestFactory:
 
				         # If there are parameters, add them
			
 
				         if parsed.params:
			
 
				             path += ";" + parsed.params
			
 
				-        path = uri_to_iri(path).encode()
			
 
				+        path = unquote_to_bytes(path)
			
 
				         # Replace the behavior where non-ASCII values in the WSGI environ are
			
 
				         # arbitrarily decoded with ISO-8859-1.
			
 
				         # Refs comment in `get_bytes_from_wsgi()`.
			
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -2,7 +2,7 @@ import codecs
 
				 import datetime
			
 
				 import locale
			
 
				 from decimal import Decimal
			
 
				-from urllib.parse import quote, unquote_to_bytes
			
 
				+from urllib.parse import quote
			
 
				 
			
 
				 from django.utils import six
			
 
				 from django.utils.functional import Promise
			
@@ -151,20 +151,57 @@ def iri_to_uri(iri):
 
				     return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
			
 
				 
			
 
				 
			
 
				+# List of byte values that uri_to_iri() decodes from percent encoding.
			
 
				+# First, the unreserved characters from RFC 3986:
			
 
				+_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
			
 
				+_hextobyte = {
			
 
				+    (fmt % char).encode(): bytes((char,))
			
 
				+    for ascii_range in _ascii_ranges
			
 
				+    for char in ascii_range
			
 
				+    for fmt in ['%02x', '%02X']
			
 
				+}
			
 
				+# And then every value ≥ 128, because those bytes are part of multibyte
			
 
				+# unicode characters.
			
 
				+_hexdig = '0123456789ABCDEFabcdef'
			
 
				+_hextobyte.update({
			
 
				+    (a + b).encode(): bytes.fromhex(a + b)
			
 
				+    for a in _hexdig[8:] for b in _hexdig
			
 
				+})
			
 
				+
			
 
				+
			
 
				 def uri_to_iri(uri):
			
 
				     """
			
 
				     Converts a Uniform Resource Identifier(URI) into an Internationalized
			
 
				     Resource Identifier(IRI).
			
 
				 
			
 
				-    This is the algorithm from section 3.2 of RFC 3987.
			
 
				+    This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
			
 
				 
			
 
				     Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
			
 
				-    a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
			
 
				+    a string containing the encoded result (e.g. '/I%20♥%20Django/').
			
 
				     """
			
 
				     if uri is None:
			
 
				         return uri
			
 
				     uri = force_bytes(uri)
			
 
				-    iri = unquote_to_bytes(uri)
			
 
				+    # Fast selective unquote: First, split on '%' and then starting with the
			
 
				+    # second block, decode the first 2 bytes if they represent a hex code to
			
 
				+    # decode. The rest of the block is the part after '%AB', not containing
			
 
				+    # any '%'. Add that to the output without further processing.
			
 
				+    bits = uri.split(b'%')
			
 
				+    if len(bits) == 1:
			
 
				+        iri = uri
			
 
				+    else:
			
 
				+        parts = [bits[0]]
			
 
				+        append = parts.append
			
 
				+        hextobyte = _hextobyte
			
 
				+        for item in bits[1:]:
			
 
				+            hex = item[:2]
			
 
				+            if hex in hextobyte:
			
 
				+                append(hextobyte[item[:2]])
			
 
				+                append(item[2:])
			
 
				+            else:
			
 
				+                append(b'%')
			
 
				+                append(item)
			
 
				+        iri = b''.join(parts)
			
 
				     return repercent_broken_unicode(iri).decode()
			
 
				 
			
 
				 
			
--- a/docs/ref/unicode.txt
+++ b/docs/ref/unicode.txt
@@ -195,19 +195,17 @@ result.
 
				 
			
 
				 Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
			
 
				 implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
			
 
				-It decodes all percent-encodings except those that don't represent a valid
			
 
				-UTF-8 sequence.
			
 
				 
			
 
				 An example to demonstrate::
			
 
				 
			
 
				     >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
			
 
				     '/♥♥/?utf8=✓'
			
 
				-    >>> uri_to_iri('%A9helloworld')
			
 
				-    '%A9helloworld'
			
 
				+    >>> uri_to_iri('%A9hello%3Fworld')
			
 
				+    '%A9hello%3Fworld'
			
 
				 
			
 
				-In the first example, the UTF-8 characters and reserved characters are
			
 
				-unquoted. In the second, the percent-encoding remains unchanged because it
			
 
				-lies outside the valid UTF-8 range.
			
 
				+In the first example, the UTF-8 characters are unquoted. In the second, the
			
 
				+percent-encodings remain unchanged because they lie outside the valid UTF-8
			
 
				+range or represent a reserved character.
			
 
				 
			
 
				 Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
			
 
				 following is always true::
			
--- a/tests/utils_tests/test_encoding.py
+++ b/tests/utils_tests/test_encoding.py
@@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
 
				     def test_uri_to_iri(self):
			
 
				         cases = [
			
 
				             # Valid UTF-8 sequences are decoded.
			
 
				-            ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
			
 
				+            ('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
			
 
				             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
			
 
				-
			
 
				+            ('/%41%5a%6B/', '/AZk/'),
			
 
				+            # Reserved and non-URL-valid ASCII chars are not decoded.
			
 
				+            ('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
			
 
				             # Broken UTF-8 sequences remain escaped.
			
 
				             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
			
 
				             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
			
@@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
 
				 
			
 
				     def test_complementarity(self):
			
 
				         cases = [
			
 
				-            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
			
 
				+            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
			
 
				             ('%&', '%&'),
			
 
				             ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
			
 
				             ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
			
 
				             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
			
 
				+            ('/%25%20%02%7b/', '/%25%20%02%7b/'),
			
 
				             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
			
 
				             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
			
 
				             ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),