123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
- import base64
- import datetime
- import re
- import unicodedata
- from binascii import Error as BinasciiError
- from email.utils import formatdate
- from urllib.parse import quote, unquote
- from urllib.parse import urlencode as original_urlencode
- from urllib.parse import urlparse
- from django.utils.datastructures import MultiValueDict
- from django.utils.regex_helper import _lazy_re_compile
# Based on RFC 9110 Appendix A.
# Matches a single, complete entity tag: an optional weak marker ("W/")
# followed by a double-quoted string that contains no double quotes. Group 1
# captures the whole tag, including the weak marker and the quotes.
# NOTE(review): _lazy_re_compile presumably defers compilation until first
# use -- confirm in django.utils.regex_helper.
ETAG_MATCH = _lazy_re_compile(
    r"""
    \A(      # start of string and capture group
    (?:W/)?  # optional weak indicator
    "        # opening quote
    [^"]*    # any sequence of non-quote characters
    "        # end quote
    )\Z      # end of string and capture group
""",
    re.X,
)

# Lowercase three-letter month abbreviations in calendar order, so that
# MONTHS.index(name) + 1 is the month number.
MONTHS = "jan feb mar apr may jun jul aug sep oct nov dec".split()

# Named-group fragments that are combined into the three date patterns below.
__D = r"(?P<day>[0-9]{2})"  # two-digit day
__D2 = r"(?P<day>[ 0-9][0-9])"  # day whose first character may be a space
__M = r"(?P<mon>\w{3})"  # three word characters; validated against MONTHS later
__Y = r"(?P<year>[0-9]{4})"  # four-digit year
__Y2 = r"(?P<year>[0-9]{2})"  # two-digit year
__T = r"(?P<hour>[0-9]{2}):(?P<min>[0-9]{2}):(?P<sec>[0-9]{2})"  # HH:MM:SS

# "Wdy, DD Mon YYYY HH:MM:SS GMT"
RFC1123_DATE = _lazy_re_compile(r"^\w{3}, %s %s %s %s GMT$" % (__D, __M, __Y, __T))
# "Weekday, DD-Mon-YY HH:MM:SS GMT" (weekday name of 6 to 9 word characters)
RFC850_DATE = _lazy_re_compile(r"^\w{6,9}, %s-%s-%s %s GMT$" % (__D, __M, __Y2, __T))
# "Wdy Mon DD HH:MM:SS YYYY" (no timezone token in the pattern)
ASCTIME_DATE = _lazy_re_compile(r"^\w{3} %s %s %s %s$" % (__M, __D2, __T, __Y))

# The "gen-delims" and "sub-delims" character sets named in RFC 3986.
RFC3986_GENDELIMS = ":/?#[]@"
RFC3986_SUBDELIMS = "!$&'()*+,;="
def urlencode(query, doseq=False):
    """
    Encode `query` as a URL query string.

    Work like urllib.parse.urlencode(), but additionally accept a
    MultiValueDict and values that aren't strings. `query` may be a
    MultiValueDict, any object with an items() method, or an iterable of
    (key, value) pairs.

    Raise TypeError if a value is None or, when doseq is true, if any item of
    an iterable value is None.
    """
    none_message = (
        "Cannot encode None for key '%s' in a query string. Did you "
        "mean to pass an empty string or omit the value?"
    )

    if isinstance(query, MultiValueDict):
        pairs = query.lists()
    elif hasattr(query, "items"):
        pairs = query.items()
    else:
        pairs = query

    encoded_pairs = []
    for key, value in pairs:
        if value is None:
            raise TypeError(none_message % key)
        if doseq and not isinstance(value, (str, bytes)):
            try:
                items = iter(value)
            except TypeError:
                # Not iterable: hand the scalar over unchanged.
                pass
            else:
                # Consume generators and iterators, when doseq=True, to
                # work around https://bugs.python.org/issue31706.
                collected = []
                for item in items:
                    if item is None:
                        raise TypeError(none_message % key)
                    collected.append(item if isinstance(item, bytes) else str(item))
                value = collected
        encoded_pairs.append((key, value))
    return original_urlencode(encoded_pairs, doseq)
def http_date(epoch_seconds=None):
    """
    Return an HTTP date string of the form 'Wdy, DD Mon YYYY HH:MM:SS GMT'.

    This is the RFC 5322 date format referenced by RFC 9110 Section 5.6.7.

    `epoch_seconds` is the number of seconds since the epoch, in UTC (for
    example the value returned by time.time()). When it is None, the current
    time is used.
    """
    return formatdate(timeval=epoch_seconds, localtime=False, usegmt=True)
def parse_http_date(date):
    """
    Convert an HTTP date string to integer seconds since the epoch, in UTC.

    All three formats permitted by RFC 9110 Section 5.6.7 (RFC 1123, RFC 850
    and asctime) are accepted, even though only the first remains in
    widespread use.

    Raise ValueError if `date` matches none of the formats or doesn't describe
    a real date.
    """
    # email.utils.parsedate() does the job for RFC 1123 dates; unfortunately
    # RFC 9110 makes it mandatory to support RFC 850 dates too. So we roll
    # our own RFC-compliant parsing.
    match = None
    for pattern in (RFC1123_DATE, RFC850_DATE, ASCTIME_DATE):
        match = pattern.match(date)
        if match is not None:
            break
    if match is None:
        raise ValueError("%r is not in a valid HTTP date format" % date)

    try:
        utc = datetime.timezone.utc
        year = int(match["year"])
        if year < 100:
            # Two-digit year: place it in a century relative to today.
            this_year = datetime.datetime.now(tz=utc).year
            years_into_century = this_year % 100
            century = this_year - years_into_century
            if year - years_into_century > 50:
                # A year that appears to be more than 50 years in the future
                # is interpreted as representing the past.
                year += century - 100
            else:
                year += century
        parsed = datetime.datetime(
            year,
            MONTHS.index(match["mon"].lower()) + 1,
            int(match["day"]),
            int(match["hour"]),
            int(match["min"]),
            int(match["sec"]),
            tzinfo=utc,
        )
        return int(parsed.timestamp())
    except Exception as exc:
        raise ValueError("%r is not a valid date" % date) from exc
def parse_http_date_safe(date):
    """
    Parse `date` like parse_http_date(), returning None instead of raising
    when the input is invalid.
    """
    try:
        seconds = parse_http_date(date)
    except Exception:
        # Deliberately best-effort: any failure means "no usable date".
        return None
    return seconds
- # Base 36 functions: useful for generating compact URLs
def base36_to_int(s):
    """
    Return the integer value of the base 36 string `s`.

    Raise ValueError if `s` is longer than 13 characters or isn't valid
    base 36.
    """
    # To prevent overconsumption of server resources, reject any base36
    # string that is longer than 13 base36 digits (13 digits is sufficient
    # to base36-encode any 64-bit integer).
    max_digits = 13
    if len(s) > max_digits:
        raise ValueError("Base36 input too large")
    return int(s, base=36)
def int_to_base36(i):
    """
    Return the base 36 representation of the non-negative integer `i`,
    using the digits 0-9 followed by lowercase a-z.

    Raise ValueError if `i` is negative.
    """
    if i < 0:
        raise ValueError("Negative base36 conversion input.")
    digits = "0123456789abcdefghijklmnopqrstuvwxyz"
    if i < 36:
        # Single digit (this also covers zero).
        return digits[i]
    # Collect digits least-significant first, then reverse.
    collected = []
    remaining = i
    while remaining != 0:
        remaining, digit = divmod(remaining, 36)
        collected.append(digits[digit])
    return "".join(reversed(collected))
def urlsafe_base64_encode(s):
    """
    Return the URL-safe base64 encoding of the bytestring `s` as a str, with
    trailing equal signs (and newlines) removed.
    """
    encoded = base64.urlsafe_b64encode(s)
    unpadded = encoded.rstrip(b"\n=")
    return unpadded.decode("ascii")
def urlsafe_base64_decode(s):
    """
    Decode the URL-safe base64 string `s` and return the resulting bytes.

    Trailing equal signs that may have been stripped are added back before
    decoding. Raise ValueError if the input can't be decoded.
    """
    raw = s.encode()
    # Append len % 4 pad characters to restore stripped padding.
    padded_length = len(raw) + len(raw) % 4
    padded = raw.ljust(padded_length, b"=")
    try:
        return base64.urlsafe_b64decode(padded)
    except (LookupError, BinasciiError) as err:
        raise ValueError(err)
def parse_etags(etag_str):
    """
    Split the value of an If-None-Match or If-Match header (RFC 9110) into a
    list of quoted ETags. Return ['*'] if the header asks to match all ETags.

    Comma-separated entries that aren't valid ETags are dropped.
    """
    if etag_str.strip() == "*":
        return ["*"]
    # Parse each ETag individually, and keep any that are valid.
    etags = []
    for candidate in etag_str.split(","):
        match = ETAG_MATCH.match(candidate.strip())
        if match:
            etags.append(match[1])
    return etags
def quote_etag(etag_str):
    """
    Return `etag_str` unchanged if it is already a quoted ETag. Otherwise,
    return it wrapped in double quotes, which makes it a strong ETag.
    """
    already_quoted = ETAG_MATCH.match(etag_str)
    return etag_str if already_quoted else '"%s"' % etag_str
def is_same_domain(host, pattern):
    """
    Return ``True`` if ``host`` matches ``pattern``, either exactly or via a
    leading-period wildcard.

    A pattern that begins with a period matches the domain itself and all of
    its subdomains (e.g. ``.example.com`` matches ``example.com`` and
    ``foo.example.com``). Any other pattern is compared as an exact string.
    The pattern is lowercased before comparing; the host is used as given.
    An empty pattern never matches.
    """
    if not pattern:
        return False

    lowered = pattern.lower()
    if lowered[0] == "." and (host.endswith(lowered) or host == lowered[1:]):
        return True
    return lowered == host
def url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
    """
    Return ``True`` if ``url`` points to an allowed host and uses a safe
    scheme. An empty (or ``None``) url always yields ``False``.

    ``allowed_hosts`` may be ``None``, a single host string, or a collection
    of hosts. With ``require_https=True`` only 'https' is an acceptable
    scheme; by default both 'http' and 'https' are.

    Note: "True" doesn't entail that a URL is "safe". It may still be e.g.
    quoted incorrectly. Ensure to also use django.utils.encoding.iri_to_uri()
    on the path component of untrusted URLs.
    """
    if url is not None:
        url = url.strip()
    if not url:
        return False

    if allowed_hosts is None:
        allowed_hosts = set()
    elif isinstance(allowed_hosts, str):
        allowed_hosts = {allowed_hosts}

    # Chrome treats \ completely as / in paths but it could be part of some
    # basic auth credentials so we need to check both URLs.
    candidates = (url, url.replace("\\", "/"))
    return all(
        _url_has_allowed_host_and_scheme(
            candidate, allowed_hosts, require_https=require_https
        )
        for candidate in candidates
    )
- def _url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
- # Chrome considers any URL with more than two slashes to be absolute, but
- # urlparse is not so flexible. Treat any url with three slashes as unsafe.
- if url.startswith("///"):
- return False
- try:
- url_info = urlparse(url)
- except ValueError: # e.g. invalid IPv6 addresses
- return False
- # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
- # In that URL, example.com is not the hostname but, a path component. However,
- # Chrome will still consider example.com to be the hostname, so we must not
- # allow this syntax.
- if not url_info.netloc and url_info.scheme:
- return False
- # Forbid URLs that start with control characters. Some browsers (like
- # Chrome) ignore quite a few control characters at the start of a
- # URL and might consider the URL as scheme relative.
- if unicodedata.category(url[0])[0] == "C":
- return False
- scheme = url_info.scheme
- # Consider URLs without a scheme (e.g. //example.com/p) to be http.
- if not url_info.scheme and url_info.netloc:
- scheme = "http"
- valid_schemes = ["https"] if require_https else ["http", "https"]
- return (not url_info.netloc or url_info.netloc in allowed_hosts) and (
- not scheme or scheme in valid_schemes
- )
def escape_leading_slashes(url):
    """
    Return `url` with its second leading slash percent-encoded if it starts
    with two slashes; otherwise return it unchanged.

    When redirecting to an absolute path, a doubled leading slash must be
    escaped to prevent browsers from handling the path as schemaless and
    redirecting to another host.
    """
    if not url.startswith("//"):
        return url
    return "/%2F" + url[2:]
- def _parseparam(s):
- while s[:1] == ";":
- s = s[1:]
- end = s.find(";")
- while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
- end = s.find(";", end + 1)
- if end < 0:
- end = len(s)
- f = s[:end]
- yield f.strip()
- s = s[end:]
def parse_header_parameters(line):
    """
    Parse a Content-Type-like header value.

    Return a 2-tuple of the lowercased main value and a dictionary mapping
    lowercased parameter names to their values. Segments without "=" are
    ignored. Surrounding double quotes are removed from values and
    backslash-escaped backslashes and quotes inside them are unescaped.
    """
    segments = _parseparam(";" + line)
    main_value = next(segments).lower()
    params = {}
    for segment in segments:
        name, separator, raw_value = segment.partition("=")
        if not separator:
            continue
        name = name.strip().lower()
        has_encoding = False
        if name.endswith("*"):
            # Lang/encoding embedded in the value (like "filename*=UTF-8''file.ext")
            # https://tools.ietf.org/html/rfc2231#section-4
            name = name[:-1]
            has_encoding = segment.count("'") == 2
        value = raw_value.strip()
        if len(value) >= 2 and value[0] == value[-1] == '"':
            value = value[1:-1].replace("\\\\", "\\").replace('\\"', '"')
        if has_encoding:
            charset, _lang, encoded = value.split("'")
            value = unquote(encoded, encoding=charset)
        params[name] = value
    return main_value, params
def content_disposition_header(as_attachment, filename):
    """
    Construct a Content-Disposition HTTP header value from the given filename
    as specified by RFC 6266.

    `as_attachment` selects the "attachment" disposition when true and
    "inline" otherwise. `filename` is the name to advertise; when it is empty
    or None no filename parameter is emitted.

    Return the header value as a string, or None when there is nothing to
    send (no filename and not an attachment).
    """
    if not filename:
        return "attachment" if as_attachment else None

    disposition = "attachment" if as_attachment else "inline"
    try:
        filename.encode("ascii")
    except UnicodeEncodeError:
        is_ascii = False
    else:
        is_ascii = True
    # Quoted strings can contain horizontal tabs, space characters, and
    # characters from 0x21 to 0x7e, except 0x22 (`"`) and 0x5C (`\`) which
    # can still be expressed but must be escaped with their own `\`.
    # Anything else (control characters such as CR/LF/NUL, DEL, or non-ASCII
    # text) must not appear raw in the header, so it is percent-encoded
    # through the extended `filename*` parameter instead.
    # https://datatracker.ietf.org/doc/html/rfc9110#name-quoted-strings
    if is_ascii and all(ch == "\t" or " " <= ch <= "~" for ch in filename):
        file_expr = 'filename="{}"'.format(
            filename.replace("\\", "\\\\").replace('"', r"\"")
        )
    else:
        file_expr = "filename*=utf-8''{}".format(quote(filename))
    return f"{disposition}; {file_expr}"
|