Browse Source

Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().

Chronial 8 years ago
parent
commit
03281d8fe7
4 changed files with 55 additions and 17 deletions
  1. 3 3
      django/test/client.py
  2. 41 4
      django/utils/encoding.py
  3. 5 7
      docs/ref/unicode.txt
  4. 6 3
      tests/utils_tests/test_encoding.py

+ 3 - 3
django/test/client.py

@@ -6,7 +6,7 @@ import sys
 from copy import copy
 from importlib import import_module
 from io import BytesIO
-from urllib.parse import urljoin, urlparse, urlsplit
+from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
 
 from django.conf import settings
 from django.core.handlers.base import BaseHandler
@@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist
 from django.test import signals
 from django.test.utils import ContextList
 from django.urls import resolve
-from django.utils.encoding import force_bytes, uri_to_iri
+from django.utils.encoding import force_bytes
 from django.utils.functional import SimpleLazyObject, curry
 from django.utils.http import urlencode
 from django.utils.itercompat import is_iterable
@@ -320,7 +320,7 @@ class RequestFactory:
         # If there are parameters, add them
         if parsed.params:
             path += ";" + parsed.params
-        path = uri_to_iri(path).encode()
+        path = unquote_to_bytes(path)
         # Replace the behavior where non-ASCII values in the WSGI environ are
         # arbitrarily decoded with ISO-8859-1.
         # Refs comment in `get_bytes_from_wsgi()`.

+ 41 - 4
django/utils/encoding.py

@@ -2,7 +2,7 @@ import codecs
 import datetime
 import locale
 from decimal import Decimal
-from urllib.parse import quote, unquote_to_bytes
+from urllib.parse import quote
 
 from django.utils import six
 from django.utils.functional import Promise
@@ -151,20 +151,57 @@ def iri_to_uri(iri):
     return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
 
 
+# List of byte values that uri_to_iri() decodes from percent encoding.
+# First, the unreserved characters from RFC 3986:
+_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
+_hextobyte = {
+    (fmt % char).encode(): bytes((char,))
+    for ascii_range in _ascii_ranges
+    for char in ascii_range
+    for fmt in ['%02x', '%02X']
+}
+# And then every byte from 128 up, because bytes ≥ 128 are part of multibyte
+# Unicode characters.
+_hexdig = '0123456789ABCDEFabcdef'
+_hextobyte.update({
+    (a + b).encode(): bytes.fromhex(a + b)
+    for a in _hexdig[8:] for b in _hexdig
+})
+
+
 def uri_to_iri(uri):
     """
     Converts a Uniform Resource Identifier(URI) into an Internationalized
     Resource Identifier(IRI).
 
-    This is the algorithm from section 3.2 of RFC 3987.
+    This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
 
     Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
-    a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
+    a string containing the encoded result (e.g. '/I%20♥%20Django/').
     """
     if uri is None:
         return uri
     uri = force_bytes(uri)
-    iri = unquote_to_bytes(uri)
+    # Fast selective unquote: First, split on '%' and then starting with the
+    # second block, decode the first 2 bytes if they represent a hex code to
+    # decode. The rest of the block is the part after '%AB', not containing
+    # any '%'. Add that to the output without further processing.
+    bits = uri.split(b'%')
+    if len(bits) == 1:
+        iri = uri
+    else:
+        parts = [bits[0]]
+        append = parts.append
+        hextobyte = _hextobyte
+        for item in bits[1:]:
+            hex = item[:2]
+            if hex in hextobyte:
+                append(hextobyte[item[:2]])
+                append(item[2:])
+            else:
+                append(b'%')
+                append(item)
+        iri = b''.join(parts)
     return repercent_broken_unicode(iri).decode()
 
 

+ 5 - 7
docs/ref/unicode.txt

@@ -195,19 +195,17 @@ result.
 
 Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
 implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
-It decodes all percent-encodings except those that don't represent a valid
-UTF-8 sequence.
 
 An example to demonstrate::
 
     >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
     '/♥♥/?utf8=✓'
-    >>> uri_to_iri('%A9helloworld')
-    '%A9helloworld'
+    >>> uri_to_iri('%A9hello%3Fworld')
+    '%A9hello%3Fworld'
 
-In the first example, the UTF-8 characters and reserved characters are
-unquoted. In the second, the percent-encoding remains unchanged because it
-lies outside the valid UTF-8 range.
+In the first example, the UTF-8 characters are unquoted. In the second, the
+percent-encodings remain unchanged because they lie outside the valid UTF-8
+range or represent a reserved character.
 
 Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
 following is always true::

+ 6 - 3
tests/utils_tests/test_encoding.py

@@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
     def test_uri_to_iri(self):
         cases = [
             # Valid UTF-8 sequences are decoded.
-            ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+            ('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'),
             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
-
+            ('/%41%5a%6B/', '/AZk/'),
+            # Reserved and URL-invalid ASCII chars are not decoded.
+            ('/%25%20%02%41%7b/', '/%25%20%02A%7b/'),
             # Broken UTF-8 sequences remain escaped.
             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
@@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase):
 
     def test_complementarity(self):
         cases = [
-            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
+            ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'),
             ('%&', '%&'),
             ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
             ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+            ('/%25%20%02%7b/', '/%25%20%02%7b/'),
             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
             ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),