You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

encoding.py 9.9 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. # -*- encoding: utf-8 -*-
  2. from __future__ import unicode_literals
  3. import codecs
  4. import datetime
  5. import locale
  6. from decimal import Decimal
  7. from django.utils import six
  8. from django.utils.functional import Promise
  9. from django.utils.six.moves.urllib.parse import quote, unquote
  10. if six.PY3:
  11. from urllib.parse import unquote_to_bytes
  12. class DjangoUnicodeDecodeError(UnicodeDecodeError):
  13. def __init__(self, obj, *args):
  14. self.obj = obj
  15. UnicodeDecodeError.__init__(self, *args)
  16. def __str__(self):
  17. original = UnicodeDecodeError.__str__(self)
  18. return '%s. You passed in %r (%s)' % (original, self.obj,
  19. type(self.obj))
# Backwards-compatibility alias: this decorator originally lived in Django and
# was then added to six 1.9, so it is simply re-exported from six here.
python_2_unicode_compatible = six.python_2_unicode_compatible
  22. def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  23. """
  24. Returns a text object representing 's' -- unicode on Python 2 and str on
  25. Python 3. Treats bytestrings using the 'encoding' codec.
  26. If strings_only is True, don't convert (some) non-string-like objects.
  27. """
  28. if isinstance(s, Promise):
  29. # The input is the result of a gettext_lazy() call.
  30. return s
  31. return force_text(s, encoding, strings_only, errors)
  32. _PROTECTED_TYPES = six.integer_types + (type(None), float, Decimal,
  33. datetime.datetime, datetime.date, datetime.time)
  34. def is_protected_type(obj):
  35. """Determine if the object instance is of a protected type.
  36. Objects of protected types are preserved as-is when passed to
  37. force_text(strings_only=True).
  38. """
  39. return isinstance(obj, _PROTECTED_TYPES)
  40. def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  41. """
  42. Similar to smart_text, except that lazy instances are resolved to
  43. strings, rather than kept as lazy objects.
  44. If strings_only is True, don't convert (some) non-string-like objects.
  45. """
  46. # Handle the common case first for performance reasons.
  47. if issubclass(type(s), six.text_type):
  48. return s
  49. if strings_only and is_protected_type(s):
  50. return s
  51. try:
  52. if not issubclass(type(s), six.string_types):
  53. if six.PY3:
  54. if isinstance(s, bytes):
  55. s = six.text_type(s, encoding, errors)
  56. else:
  57. s = six.text_type(s)
  58. elif hasattr(s, '__unicode__'):
  59. s = six.text_type(s)
  60. else:
  61. s = six.text_type(bytes(s), encoding, errors)
  62. else:
  63. # Note: We use .decode() here, instead of six.text_type(s, encoding,
  64. # errors), so that if s is a SafeBytes, it ends up being a
  65. # SafeText at the end.
  66. s = s.decode(encoding, errors)
  67. except UnicodeDecodeError as e:
  68. if not isinstance(s, Exception):
  69. raise DjangoUnicodeDecodeError(s, *e.args)
  70. else:
  71. # If we get to here, the caller has passed in an Exception
  72. # subclass populated with non-ASCII bytestring data without a
  73. # working unicode method. Try to handle this without raising a
  74. # further exception by individually forcing the exception args
  75. # to unicode.
  76. s = ' '.join(force_text(arg, encoding, strings_only, errors)
  77. for arg in s)
  78. return s
  79. def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
  80. """
  81. Returns a bytestring version of 's', encoded as specified in 'encoding'.
  82. If strings_only is True, don't convert (some) non-string-like objects.
  83. """
  84. if isinstance(s, Promise):
  85. # The input is the result of a gettext_lazy() call.
  86. return s
  87. return force_bytes(s, encoding, strings_only, errors)
  88. def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
  89. """
  90. Similar to smart_bytes, except that lazy instances are resolved to
  91. strings, rather than kept as lazy objects.
  92. If strings_only is True, don't convert (some) non-string-like objects.
  93. """
  94. # Handle the common case first for performance reasons.
  95. if isinstance(s, bytes):
  96. if encoding == 'utf-8':
  97. return s
  98. else:
  99. return s.decode('utf-8', errors).encode(encoding, errors)
  100. if strings_only and is_protected_type(s):
  101. return s
  102. if isinstance(s, six.memoryview):
  103. return bytes(s)
  104. if isinstance(s, Promise):
  105. return six.text_type(s).encode(encoding, errors)
  106. if not isinstance(s, six.string_types):
  107. try:
  108. if six.PY3:
  109. return six.text_type(s).encode(encoding)
  110. else:
  111. return bytes(s)
  112. except UnicodeEncodeError:
  113. if isinstance(s, Exception):
  114. # An Exception subclass containing non-ASCII data that doesn't
  115. # know how to print itself properly. We shouldn't raise a
  116. # further exception.
  117. return b' '.join(force_bytes(arg, encoding, strings_only, errors)
  118. for arg in s)
  119. return six.text_type(s).encode(encoding, errors)
  120. else:
  121. return s.encode(encoding, errors)
# smart_str / force_str produce the native "str" type of the running
# interpreter: text on Python 3, bytestrings on Python 2.
if six.PY3:
    smart_str = smart_text
    force_str = force_text
else:
    smart_str = smart_bytes
    force_str = force_bytes
    # backwards compatibility for Python 2
    # NOTE(review): these two aliases are placed in the Python 2 branch, as
    # the comment above indicates -- confirm they are not meant to be
    # defined on Python 3 as well.
    smart_unicode = smart_text
    force_unicode = force_text

# The assignments below replace the docstring of whichever function object
# the alias points to (the alias and the original are the same object).
smart_str.__doc__ = """
Apply smart_text in Python 3 and smart_bytes in Python 2.
This is suitable for writing to sys.stdout (for instance).
"""

force_str.__doc__ = """
Apply force_text in Python 3 and force_bytes in Python 2.
"""
  138. def iri_to_uri(iri):
  139. """
  140. Convert an Internationalized Resource Identifier (IRI) portion to a URI
  141. portion that is suitable for inclusion in a URL.
  142. This is the algorithm from section 3.1 of RFC 3987. However, since we are
  143. assuming input is either UTF-8 or unicode already, we can simplify things a
  144. little from the full method.
  145. Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
  146. (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
  147. (e.g. '/I%20%E2%99%A5%20Django/').
  148. """
  149. # The list of safe characters here is constructed from the "reserved" and
  150. # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
  151. # reserved = gen-delims / sub-delims
  152. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  153. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  154. # / "*" / "+" / "," / ";" / "="
  155. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  156. # Of the unreserved characters, urllib.quote already considers all but
  157. # the ~ safe.
  158. # The % character is also added to the list of safe characters here, as the
  159. # end of section 3.1 of RFC 3987 specifically mentions that % must not be
  160. # converted.
  161. if iri is None:
  162. return iri
  163. return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
  164. def uri_to_iri(uri):
  165. """
  166. Converts a Uniform Resource Identifier(URI) into an Internationalized
  167. Resource Identifier(IRI).
  168. This is the algorithm from section 3.2 of RFC 3987.
  169. Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
  170. unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
  171. """
  172. if uri is None:
  173. return uri
  174. uri = force_bytes(uri)
  175. iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
  176. return repercent_broken_unicode(iri).decode('utf-8')
  177. def escape_uri_path(path):
  178. """
  179. Escape the unsafe characters from the path portion of a Uniform Resource
  180. Identifier (URI).
  181. """
  182. # These are the "reserved" and "unreserved" characters specified in
  183. # sections 2.2 and 2.3 of RFC 2396:
  184. # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
  185. # unreserved = alphanum | mark
  186. # mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
  187. # The list of safe characters here is constructed subtracting ";", "=",
  188. # and "?" according to section 3.3 of RFC 2396.
  189. # The reason for not subtracting and escaping "/" is that we are escaping
  190. # the entire path, not a path segment.
  191. return quote(force_bytes(path), safe=b"/:@&+$,-_.!~*'()")
  192. def repercent_broken_unicode(path):
  193. """
  194. As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
  195. we need to re-percent-encode any octet produced that is not part of a
  196. strictly legal UTF-8 octet sequence.
  197. """
  198. try:
  199. path.decode('utf-8')
  200. except UnicodeDecodeError as e:
  201. repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
  202. path = repercent_broken_unicode(
  203. path[:e.start] + force_bytes(repercent) + path[e.end:])
  204. return path
  205. def filepath_to_uri(path):
  206. """Convert a file system path to a URI portion that is suitable for
  207. inclusion in a URL.
  208. We are assuming input is either UTF-8 or unicode already.
  209. This method will encode certain chars that would normally be recognized as
  210. special chars for URIs. Note that this method does not encode the '
  211. character, as it is a valid character within URIs. See
  212. encodeURIComponent() JavaScript function for more details.
  213. Returns an ASCII string containing the encoded result.
  214. """
  215. if path is None:
  216. return path
  217. # I know about `os.sep` and `os.altsep` but I want to leave
  218. # some flexibility for hardcoding separators.
  219. return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'")
  220. def get_system_encoding():
  221. """
  222. The encoding of the default system locale but falls back to the given
  223. fallback encoding if the encoding is unsupported by python or could
  224. not be determined. See tickets #10335 and #5846
  225. """
  226. try:
  227. encoding = locale.getdefaultlocale()[1] or 'ascii'
  228. codecs.lookup(encoding)
  229. except Exception:
  230. encoding = 'ascii'
  231. return encoding
  232. DEFAULT_LOCALE_ENCODING = get_system_encoding()