Improve replacing of entities with links in html_for_tweet()

I've rewritten parts of `html_for_tweet()` so that it handles all the replacements of URLs, mentions, symbols and hashtags better. This is mainly to fix #447, but it should be a little more robust generally. The approach is shamelessly cribbed from https://stackoverflow.com/a/25514650/250962 . It passes all tests, but I haven't checked it beyond that. Fixes #447
2017-08-22 13:49:40 +01:00 · 2017-08-22 13:49:40 +01:00 · b366ab55c3
commit b366ab55c3
parent 5a87fc7d84
3 changed files with 80 additions and 48 deletions
--- a/twython/api.py
+++ b/twython/api.py
@ -556,62 +556,78 @@ class Twython(EndpointsMixin, object):
        suffix_text = orig_tweet_text[display_text_end:len(orig_tweet_text)]

        if 'entities' in tweet:
-            entities = tweet['entities']
+            # We'll put all the bits of replacement HTML and their starts/ends
+            # in this list:
+            entities = []

            # Mentions
-            for entity in sorted(entities['user_mentions'],
-                                 key=lambda mention: len(mention['screen_name']), reverse=True):
-                start, end = entity['indices'][0], entity['indices'][1]
+            if 'user_mentions' in tweet['entities']:
+                for entity in tweet['entities']['user_mentions']:
+                    temp = {}
+                    temp['start'] = entity['indices'][0]
+                    temp['end'] = entity['indices'][1]

-                mention_html = '<a href="https://twitter.com/%(screen_name)s" ' \
-                               'class="twython-mention">@%(screen_name)s</a>' % {'screen_name': entity['screen_name']}
-                sub_expr = r'(?<!>)' + orig_tweet_text[start:end] + '(?!</a>)'
-                if display_text_start <= start <= display_text_end:
-                    display_text = re.sub(sub_expr, mention_html, display_text)
-                else:
-                    prefix_text = re.sub(sub_expr, mention_html, prefix_text)
+                    mention_html = '<a href="https://twitter.com/%(screen_name)s" class="twython-mention">@%(screen_name)s</a>' % {'screen_name': entity['screen_name']}
+
+                    if display_text_start <= temp['start'] <= display_text_end:
+                        temp['replacement'] = mention_html
+                        entities.append(temp)
+                    else:
+                        prefix_text = re.sub(sub_expr, mention_html, prefix_text)

            # Hashtags
-            for entity in sorted(entities['hashtags'],
-                                 key=lambda hashtag: len(hashtag['text']), reverse=True):
-                start, end = entity['indices'][0], entity['indices'][1]
+            if 'hashtags' in tweet['entities']:
+                for entity in tweet['entities']['hashtags']:
+                    temp = {}
+                    temp['start'] = entity['indices'][0]
+                    temp['end'] = entity['indices'][1]

-                hashtag_html = '<a href="https://twitter.com/search?q=%%23%(hashtag)s" class="twython-hashtag">#%(hashtag)s</a>'
-                display_text = re.sub(r'(?<!>)' + orig_tweet_text[start:end] + '(?!</a>)',
-                                      hashtag_html % {'hashtag': entity['text']}, display_text)
+                    url_html = '<a href="https://twitter.com/search?q=%%23%(hashtag)s" class="twython-hashtag">#%(hashtag)s</a>' % {'hashtag': entity['text']}
+
+                    temp['replacement'] = url_html
+                    entities.append(temp)

            # Symbols
-            for entity in sorted(entities['symbols'],
-                                 key=lambda symbol: len(symbol['text']), reverse=True):
-                start, end = entity['indices'][0], entity['indices'][1]
+            if 'symbols' in tweet['entities']:
+                for entity in tweet['entities']['symbols']:
+                    temp = {}
+                    temp['start'] = entity['indices'][0]
+                    temp['end'] = entity['indices'][1]

-                symbol_html = '<a href="https://twitter.com/search?q=%%24%(symbol)s" class="twython-symbol">$%(symbol)s</a>'
-                display_text = re.sub(r'(?<!>)' + re.escape(orig_tweet_text[start:end]) + r'\b(?!</a>)',
-                                      symbol_html % {'symbol': entity['text']}, display_text)
+                    url_html = '<a href="https://twitter.com/search?q=%%24%(symbol)s" class="twython-symbol">$%(symbol)s</a>' % {'symbol': entity['text']}

-            # Urls
-            for entity in entities['urls']:
-                start, end = entity['indices'][0], entity['indices'][1]
-                if use_display_url and entity.get('display_url') \
-                   and not use_expanded_url:
-                    shown_url = entity['display_url']
-                elif use_expanded_url and entity.get('expanded_url'):
-                    shown_url = entity['expanded_url']
-                else:
-                    shown_url = entity['url']
+                    temp['replacement'] = url_html
+                    entities.append(temp)

-                url_html = '<a href="%s" class="twython-url">%s</a>' % (entity['url'], shown_url)
-                if display_text_start <= start <= display_text_end:
-                    display_text = display_text.replace(orig_tweet_text[start:end], url_html)
-                else:
-                    suffix_text = suffix_text.replace(orig_tweet_text[start:end], url_html)
+            # URLs
+            if 'urls' in tweet['entities']:
+                for entity in tweet['entities']['urls']:
+                    temp = {}
+                    temp['start'] = entity['indices'][0]
+                    temp['end'] = entity['indices'][1]

-             # Media
-            if 'media' in entities:
-                for entity in entities['media']:
-                    start, end = entity['indices'][0], entity['indices'][1]
-                    if use_display_url and entity.get('display_url') \
-                       and not use_expanded_url:
+                    if use_display_url and entity.get('display_url') and not use_expanded_url:
+                        shown_url = entity['display_url']
+                    elif use_expanded_url and entity.get('expanded_url'):
+                        shown_url = entity['expanded_url']
+                    else:
+                        shown_url = entity['url']
+
+                    url_html = '<a href="%s" class="twython-url">%s</a>' % (entity['url'], shown_url)
+
+                    if display_text_start <= temp['start'] <= display_text_end:
+                        temp['replacement'] = url_html
+                        entities.append(temp)
+                    else:
+                        suffix_text = suffix_text.replace(orig_tweet_text[temp['start']:temp['end']], url_html)
+
+            if 'media' in tweet['entities']:
+                for entity in tweet['entities']['media']:
+                    temp = {}
+                    temp['start'] = entity['indices'][0]
+                    temp['end'] = entity['indices'][1]
+
+                    if use_display_url and entity.get('display_url') and not use_expanded_url:
                        shown_url = entity['display_url']
                    elif use_expanded_url and entity.get('expanded_url'):
                        shown_url = entity['expanded_url']
@ -619,11 +635,17 @@ class Twython(EndpointsMixin, object):
                        shown_url = entity['url']

                    url_html = '<a href="%s" class="twython-media">%s</a>' % (entity['url'], shown_url)
-                    if display_text_start <= start <= display_text_end:
-                        # for compatibility with pre-extended tweets
-                        display_text = display_text.replace(orig_tweet_text[start:end], url_html)
+
+                    if display_text_start <= temp['start'] <= display_text_end:
+                        temp['replacement'] = url_html
+                        entities.append(temp)
                    else:
-                        suffix_text = suffix_text.replace(orig_tweet_text[start:end], url_html)
+                        suffix_text = suffix_text.replace(orig_tweet_text[temp['start']:temp['end']], url_html)
+
+            # Now do all the replacements, starting from the end, so that the
+            # start/end indices still work:
+            for entity in sorted(entities, key=lambda e: e['start'], reverse=True):
+                display_text = display_text[0:entity['start']] + entity['replacement'] + display_text[entity['end']:]

        quote_text = ''
        if expand_quoted_status and tweet.get('is_quote_status') and tweet.get('quoted_status'):