Forráskód Böngészése

Fixed #33195 -- Refactored urlize() based on a class.

This allows easier customization.
Claude Paroz 3 éve
szülő
commit
e567670b1a
1 módosított fájl, 95 hozzáadás és 72 törlés
  1. 95 72
      django/utils/html.py

+ 95 - 72
django/utils/html.py

@@ -15,17 +15,6 @@ from django.utils.regex_helper import _lazy_re_compile
 from django.utils.safestring import SafeData, SafeString, mark_safe
 from django.utils.text import normalize_newlines
 
-# Configuration for urlize() function.
-TRAILING_PUNCTUATION_CHARS = '.,:;!'
-WRAPPING_PUNCTUATION = [('(', ')'), ('[', ']')]
-
-word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
-simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
-simple_url_2_re = _lazy_re_compile(
-    r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
-    re.IGNORECASE
-)
-
 
 @keep_lazy(str, SafeString)
 def escape(text):
@@ -229,48 +218,118 @@ def smart_urlquote(url):
     return urlunsplit((scheme, netloc, path, query, fragment))
 
 
-@keep_lazy_text
-def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
+class Urlizer:
     """
     Convert any URLs in text into clickable links.
 
-    Works on http://, https://, www. links, and also on links ending in one of
+    Work on http://, https://, www. links, and also on links ending in one of
     the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
     Links can have trailing punctuation (periods, commas, close-parens) and
     leading punctuation (opening parens) and it'll still do the right thing.
+    """
+    trailing_punctuation_chars = '.,:;!'
+    wrapping_punctuation = [('(', ')'), ('[', ']')]
+
+    simple_url_re = _lazy_re_compile(r'^https?://\[?\w', re.IGNORECASE)
+    simple_url_2_re = _lazy_re_compile(
+        r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$',
+        re.IGNORECASE
+    )
+    word_split_re = _lazy_re_compile(r'''([\s<>"']+)''')
 
-    If trim_url_limit is not None, truncate the URLs in the link text longer
-    than this limit to trim_url_limit - 1 characters and append an ellipsis.
+    mailto_template = 'mailto:{local}@{domain}'
+    url_template = '<a href="{href}"{attrs}>{url}</a>'
 
-    If nofollow is True, give the links a rel="nofollow" attribute.
+    def __call__(self, text, trim_url_limit=None, nofollow=False, autoescape=False):
+        """
+        If trim_url_limit is not None, truncate the URLs in the link text
+        longer than this limit to trim_url_limit - 1 characters and append an
+        ellipsis.
 
-    If autoescape is True, autoescape the link text and URLs.
-    """
-    safe_input = isinstance(text, SafeData)
+        If nofollow is True, give the links a rel="nofollow" attribute.
+
+        If autoescape is True, autoescape the link text and URLs.
+        """
+        self.trim_url_limit = trim_url_limit
+        self.nofollow = nofollow
+        self.autoescape = autoescape
+        self.safe_input = isinstance(text, SafeData)
+
+        words = self.word_split_re.split(str(text))
+        return ''.join([
+            self.handle_word(word) for word in words
+        ])
 
-    def trim_url(x, limit=trim_url_limit):
-        if limit is None or len(x) <= limit:
+    def handle_word(self, word):
+        if '.' in word or '@' in word or ':' in word:
+            # lead: Punctuation trimmed from the beginning of the word.
+            # middle: State of the word.
+            # trail: Punctuation trimmed from the end of the word.
+            lead, middle, trail = self.trim_punctuation(word)
+            # Make URL we want to point to.
+            url = None
+            nofollow_attr = ' rel="nofollow"' if self.nofollow else ''
+            if self.simple_url_re.match(middle):
+                url = smart_urlquote(html.unescape(middle))
+            elif self.simple_url_2_re.match(middle):
+                url = smart_urlquote('http://%s' % html.unescape(middle))
+            elif ':' not in middle and self.is_email_simple(middle):
+                local, domain = middle.rsplit('@', 1)
+                try:
+                    domain = punycode(domain)
+                except UnicodeError:
+                    return word
+                url = self.mailto_template.format(local=local, domain=domain)
+                nofollow_attr = ''
+            # Make link.
+            if url:
+                trimmed = self.trim_url(middle)
+                if self.autoescape and not self.safe_input:
+                    lead, trail = escape(lead), escape(trail)
+                    trimmed = escape(trimmed)
+                middle = self.url_template.format(
+                    href=escape(url),
+                    attrs=nofollow_attr,
+                    url=trimmed,
+                )
+                return mark_safe(f'{lead}{middle}{trail}')
+            else:
+                if self.safe_input:
+                    return mark_safe(word)
+                elif self.autoescape:
+                    return escape(word)
+        elif self.safe_input:
+            return mark_safe(word)
+        elif self.autoescape:
+            return escape(word)
+        return word
+
+    def trim_url(self, x):
+        if self.trim_url_limit is None or len(x) <= self.trim_url_limit:
             return x
-        return '%s…' % x[:max(0, limit - 1)]
+        return '%s…' % x[:max(0, self.trim_url_limit - 1)]
 
-    def trim_punctuation(lead, middle, trail):
+    def trim_punctuation(self, word):
         """
-        Trim trailing and wrapping punctuation from `middle`. Return the items
-        of the new state.
+        Trim trailing and wrapping punctuation from `word`. Return the items of
+        the new state.
         """
+        lead, middle, trail = '', word, ''
         # Continue trimming until middle remains unchanged.
         trimmed_something = True
         while trimmed_something:
             trimmed_something = False
             # Trim wrapping punctuation.
-            for opening, closing in WRAPPING_PUNCTUATION:
+            for opening, closing in self.wrapping_punctuation:
                 if middle.startswith(opening):
                     middle = middle[len(opening):]
                     lead += opening
                     trimmed_something = True
                 # Keep parentheses at the end only if they're balanced.
-                if (middle.endswith(closing) and
-                        middle.count(closing) == middle.count(opening) + 1):
+                if (
+                    middle.endswith(closing) and
+                    middle.count(closing) == middle.count(opening) + 1
+                ):
                     middle = middle[:-len(closing)]
                     trail = closing + trail
                     trimmed_something = True
@@ -278,7 +337,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
             # as encoded entities contain ';'). Unescape entities to avoid
             # breaking them by removing ';'.
             middle_unescaped = html.unescape(middle)
-            stripped = middle_unescaped.rstrip(TRAILING_PUNCTUATION_CHARS)
+            stripped = middle_unescaped.rstrip(self.trailing_punctuation_chars)
             if middle_unescaped != stripped:
                 punctuation_count = len(middle_unescaped) - len(stripped)
                 trail = middle[-punctuation_count:] + trail
@@ -286,6 +345,7 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
                 trimmed_something = True
         return lead, middle, trail
 
+    @staticmethod
     def is_email_simple(value):
         """Return True if value looks like an email address."""
         # An @ must be in the middle of the value.
@@ -301,50 +361,13 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
             return False
         return True
 
-    words = word_split_re.split(str(text))
-    for i, word in enumerate(words):
-        if '.' in word or '@' in word or ':' in word:
-            # lead: Current punctuation trimmed from the beginning of the word.
-            # middle: Current state of the word.
-            # trail: Current punctuation trimmed from the end of the word.
-            lead, middle, trail = '', word, ''
-            # Deal with punctuation.
-            lead, middle, trail = trim_punctuation(lead, middle, trail)
 
-            # Make URL we want to point to.
-            url = None
-            nofollow_attr = ' rel="nofollow"' if nofollow else ''
-            if simple_url_re.match(middle):
-                url = smart_urlquote(html.unescape(middle))
-            elif simple_url_2_re.match(middle):
-                url = smart_urlquote('http://%s' % html.unescape(middle))
-            elif ':' not in middle and is_email_simple(middle):
-                local, domain = middle.rsplit('@', 1)
-                try:
-                    domain = punycode(domain)
-                except UnicodeError:
-                    continue
-                url = 'mailto:%s@%s' % (local, domain)
-                nofollow_attr = ''
+urlizer = Urlizer()
 
-            # Make link.
-            if url:
-                trimmed = trim_url(middle)
-                if autoescape and not safe_input:
-                    lead, trail = escape(lead), escape(trail)
-                    trimmed = escape(trimmed)
-                middle = '<a href="%s"%s>%s</a>' % (escape(url), nofollow_attr, trimmed)
-                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
-            else:
-                if safe_input:
-                    words[i] = mark_safe(word)
-                elif autoescape:
-                    words[i] = escape(word)
-        elif safe_input:
-            words[i] = mark_safe(word)
-        elif autoescape:
-            words[i] = escape(word)
-    return ''.join(words)
+
+@keep_lazy_text
+def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
+    return urlizer(text, trim_url_limit=trim_url_limit, nofollow=nofollow, autoescape=autoescape)
 
 
 def avoid_wrapping(value):