encoding.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. # -*- encoding: utf-8 -*-
  2. from __future__ import unicode_literals
  3. import codecs
  4. import datetime
  5. from decimal import Decimal
  6. import locale
  7. from django.utils.functional import Promise
  8. from django.utils import six
  9. from django.utils.six.moves.urllib.parse import quote, unquote
  10. if six.PY3:
  11. from urllib.parse import unquote_to_bytes
  12. class DjangoUnicodeDecodeError(UnicodeDecodeError):
  13. def __init__(self, obj, *args):
  14. self.obj = obj
  15. UnicodeDecodeError.__init__(self, *args)
  16. def __str__(self):
  17. original = UnicodeDecodeError.__str__(self)
  18. return '%s. You passed in %r (%s)' % (original, self.obj,
  19. type(self.obj))
  20. def python_2_unicode_compatible(klass):
  21. """
  22. A decorator that defines __unicode__ and __str__ methods under Python 2.
  23. Under Python 3 it does nothing.
  24. To support Python 2 and 3 with a single code base, define a __str__ method
  25. returning text and apply this decorator to the class.
  26. """
  27. if six.PY2:
  28. if '__str__' not in klass.__dict__:
  29. raise ValueError("@python_2_unicode_compatible cannot be applied "
  30. "to %s because it doesn't define __str__()." %
  31. klass.__name__)
  32. klass.__unicode__ = klass.__str__
  33. klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
  34. return klass
  35. def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  36. """
  37. Returns a text object representing 's' -- unicode on Python 2 and str on
  38. Python 3. Treats bytestrings using the 'encoding' codec.
  39. If strings_only is True, don't convert (some) non-string-like objects.
  40. """
  41. if isinstance(s, Promise):
  42. # The input is the result of a gettext_lazy() call.
  43. return s
  44. return force_text(s, encoding, strings_only, errors)
  45. _PROTECTED_TYPES = six.integer_types + (type(None), float, Decimal,
  46. datetime.datetime, datetime.date, datetime.time)
  47. def is_protected_type(obj):
  48. """Determine if the object instance is of a protected type.
  49. Objects of protected types are preserved as-is when passed to
  50. force_text(strings_only=True).
  51. """
  52. return isinstance(obj, _PROTECTED_TYPES)
  53. def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  54. """
  55. Similar to smart_text, except that lazy instances are resolved to
  56. strings, rather than kept as lazy objects.
  57. If strings_only is True, don't convert (some) non-string-like objects.
  58. """
  59. # Handle the common case first for performance reasons.
  60. if isinstance(s, six.text_type):
  61. return s
  62. if strings_only and is_protected_type(s):
  63. return s
  64. try:
  65. if not isinstance(s, six.string_types):
  66. if six.PY3:
  67. if isinstance(s, bytes):
  68. s = six.text_type(s, encoding, errors)
  69. else:
  70. s = six.text_type(s)
  71. elif hasattr(s, '__unicode__'):
  72. s = six.text_type(s)
  73. else:
  74. s = six.text_type(bytes(s), encoding, errors)
  75. else:
  76. # Note: We use .decode() here, instead of six.text_type(s, encoding,
  77. # errors), so that if s is a SafeBytes, it ends up being a
  78. # SafeText at the end.
  79. s = s.decode(encoding, errors)
  80. except UnicodeDecodeError as e:
  81. if not isinstance(s, Exception):
  82. raise DjangoUnicodeDecodeError(s, *e.args)
  83. else:
  84. # If we get to here, the caller has passed in an Exception
  85. # subclass populated with non-ASCII bytestring data without a
  86. # working unicode method. Try to handle this without raising a
  87. # further exception by individually forcing the exception args
  88. # to unicode.
  89. s = ' '.join([force_text(arg, encoding, strings_only,
  90. errors) for arg in s])
  91. return s
  92. def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
  93. """
  94. Returns a bytestring version of 's', encoded as specified in 'encoding'.
  95. If strings_only is True, don't convert (some) non-string-like objects.
  96. """
  97. if isinstance(s, Promise):
  98. # The input is the result of a gettext_lazy() call.
  99. return s
  100. return force_bytes(s, encoding, strings_only, errors)
  101. def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
  102. """
  103. Similar to smart_bytes, except that lazy instances are resolved to
  104. strings, rather than kept as lazy objects.
  105. If strings_only is True, don't convert (some) non-string-like objects.
  106. """
  107. # Handle the common case first for performance reasons.
  108. if isinstance(s, bytes):
  109. if encoding == 'utf-8':
  110. return s
  111. else:
  112. return s.decode('utf-8', errors).encode(encoding, errors)
  113. if strings_only and is_protected_type(s):
  114. return s
  115. if isinstance(s, six.memoryview):
  116. return bytes(s)
  117. if isinstance(s, Promise):
  118. return six.text_type(s).encode(encoding, errors)
  119. if not isinstance(s, six.string_types):
  120. try:
  121. if six.PY3:
  122. return six.text_type(s).encode(encoding)
  123. else:
  124. return bytes(s)
  125. except UnicodeEncodeError:
  126. if isinstance(s, Exception):
  127. # An Exception subclass containing non-ASCII data that doesn't
  128. # know how to print itself properly. We shouldn't raise a
  129. # further exception.
  130. return b' '.join([force_bytes(arg, encoding, strings_only,
  131. errors) for arg in s])
  132. return six.text_type(s).encode(encoding, errors)
  133. else:
  134. return s.encode(encoding, errors)
  135. if six.PY3:
  136. smart_str = smart_text
  137. force_str = force_text
  138. else:
  139. smart_str = smart_bytes
  140. force_str = force_bytes
  141. # backwards compatibility for Python 2
  142. smart_unicode = smart_text
  143. force_unicode = force_text
  144. smart_str.__doc__ = """
  145. Apply smart_text in Python 3 and smart_bytes in Python 2.
  146. This is suitable for writing to sys.stdout (for instance).
  147. """
  148. force_str.__doc__ = """
  149. Apply force_text in Python 3 and force_bytes in Python 2.
  150. """
  151. def iri_to_uri(iri):
  152. """
  153. Convert an Internationalized Resource Identifier (IRI) portion to a URI
  154. portion that is suitable for inclusion in a URL.
  155. This is the algorithm from section 3.1 of RFC 3987. However, since we are
  156. assuming input is either UTF-8 or unicode already, we can simplify things a
  157. little from the full method.
  158. Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
  159. (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
  160. (e.g. '/I%20%E2%99%A5%20Django/').
  161. """
  162. # The list of safe characters here is constructed from the "reserved" and
  163. # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
  164. # reserved = gen-delims / sub-delims
  165. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  166. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  167. # / "*" / "+" / "," / ";" / "="
  168. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  169. # Of the unreserved characters, urllib.quote already considers all but
  170. # the ~ safe.
  171. # The % character is also added to the list of safe characters here, as the
  172. # end of section 3.1 of RFC 3987 specifically mentions that % must not be
  173. # converted.
  174. if iri is None:
  175. return iri
  176. return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
  177. def uri_to_iri(uri):
  178. """
  179. Converts a Uniform Resource Identifier(URI) into an Internationalized
  180. Resource Identifier(IRI).
  181. This is the algorithm from section 3.2 of RFC 3987.
  182. Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
  183. unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
  184. """
  185. if uri is None:
  186. return uri
  187. uri = force_bytes(uri)
  188. iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
  189. return repercent_broken_unicode(iri).decode('utf-8')
  190. def escape_uri_path(path):
  191. """
  192. Escape the unsafe characters from the path portion of a Uniform Resource
  193. Identifier (URI).
  194. """
  195. # These are the "reserved" and "unreserved" characters specified in
  196. # sections 2.2 and 2.3 of RFC 2396:
  197. # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
  198. # unreserved = alphanum | mark
  199. # mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
  200. # The list of safe characters here is constructed substracting ";", "=",
  201. # and "?" according to section 3.3 of RFC 2396.
  202. # The reason for not subtracting and escaping "/" is that we are escaping
  203. # the entire path, not a path segment.
  204. return quote(force_bytes(path), safe=b"/:@&+$,-_.!~*'()")
  205. def repercent_broken_unicode(path):
  206. """
  207. As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
  208. we need to re-percent-encode any octet produced that is not part of a
  209. strictly legal UTF-8 octet sequence.
  210. """
  211. try:
  212. path.decode('utf-8')
  213. except UnicodeDecodeError as e:
  214. repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
  215. path = repercent_broken_unicode(
  216. path[:e.start] + force_bytes(repercent) + path[e.end:])
  217. return path
  218. def filepath_to_uri(path):
  219. """Convert a file system path to a URI portion that is suitable for
  220. inclusion in a URL.
  221. We are assuming input is either UTF-8 or unicode already.
  222. This method will encode certain chars that would normally be recognized as
  223. special chars for URIs. Note that this method does not encode the '
  224. character, as it is a valid character within URIs. See
  225. encodeURIComponent() JavaScript function for more details.
  226. Returns an ASCII string containing the encoded result.
  227. """
  228. if path is None:
  229. return path
  230. # I know about `os.sep` and `os.altsep` but I want to leave
  231. # some flexibility for hardcoding separators.
  232. return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'")
  233. def get_system_encoding():
  234. """
  235. The encoding of the default system locale but falls back to the given
  236. fallback encoding if the encoding is unsupported by python or could
  237. not be determined. See tickets #10335 and #5846
  238. """
  239. try:
  240. encoding = locale.getdefaultlocale()[1] or 'ascii'
  241. codecs.lookup(encoding)
  242. except Exception:
  243. encoding = 'ascii'
  244. return encoding
  245. DEFAULT_LOCALE_ENCODING = get_system_encoding()