mirror of
https://github.com/MCV-Software/TWBlue.git
synced 2025-10-24 17:42:02 +00:00
Properly display HTML entities in tweets
This commit is contained in:
@@ -3,7 +3,6 @@ import platform
|
||||
system = platform.system()
|
||||
from . import utils
|
||||
import re
|
||||
import html.entities
|
||||
import time
|
||||
import output
|
||||
import languageHandler
|
||||
@@ -11,21 +10,9 @@ import arrow
|
||||
import logging
|
||||
import config
|
||||
from .long_tweets import twishort, tweets
|
||||
from .utils import StripChars
|
||||
log = logging.getLogger("compose")
|
||||
|
||||
def StripChars(s):
    """Decode HTML entities in s and return the resulting string.

    Handles named entities ("&amp;"), decimal character references
    ("&#39;") and hexadecimal character references ("&#x27;").

    Anything that cannot be decoded (an unknown name, or a numeric
    reference outside the valid Unicode range) is left in the text exactly
    as it was written instead of raising.
    """
    # The hexadecimal alternative must come first: "#x27" matches neither the
    # decimal alternative nor the name alternative.
    entity_re = re.compile(r"&(#[xX][0-9a-fA-F]+|#\d+|\w+);")

    def matchFunc(match):
        """Return the replacement text for a single entity match."""
        name = match.group(1)
        if not name.startswith("#"):
            # Named entity: fall back to the original text when unknown.
            return html.entities.entitydefs.get(name, match.group(0))
        digits = name[1:]
        try:
            if digits[0] in "xX":
                return chr(int(digits[1:], 16))
            return chr(int(digits))
        except (ValueError, OverflowError):
            # chr() rejects code points above 0x10FFFF; keep the raw text so
            # one malformed reference cannot break rendering of the tweet.
            return match.group(0)

    return str(entity_re.sub(matchFunc, s))
|
||||
|
||||
chars = "abcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
def compose_tweet(tweet, db, relative_times, show_screen_names=False, session=None):
|
||||
|
@@ -32,7 +32,7 @@ def process_text(tweet):
|
||||
elif hasattr(tweet, "text"):
|
||||
text = tweet.text
|
||||
# Cleanup mentions, so we'll remove more than 2 mentions to make the tweet easier to read.
|
||||
text = utils.clean_mentions(text)
|
||||
text = utils.clean_mentions(utils.StripChars(text))
|
||||
# Replace URLS for extended version of those.
|
||||
if hasattr(tweet, "entities"):
|
||||
text = utils.expand_urls(text, tweet.entities)
|
||||
|
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import html.entities
|
||||
import output
|
||||
import logging
|
||||
import requests
|
||||
@@ -16,6 +17,19 @@ url_re = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4
|
||||
# Loose URL matcher: a scheme ("word://") or a leading "www.", then one
# character that is not a space or common punctuation, then everything up to
# the next space, newline or tab. Written as a raw string so the regex
# escapes are not treated as (invalid) Python string escapes.
url_re2 = re.compile(r"(?:\w+://|www\.)[^ ,.?!#%=+][^ \n\t]*")
# Quote, backslash, newline, punctuation and bracket characters.
# NOTE(review): presumably trimmed from around URLs; the use site is not visible here.
bad_chars = '\'\\\n.,[](){}:;"'
|
||||
|
||||
def StripChars(s):
    """Decode HTML entities in s and return the resulting string.

    Handles named entities ("&amp;"), decimal character references
    ("&#39;") and hexadecimal character references ("&#x27;").

    Anything that cannot be decoded (an unknown name, or a numeric
    reference outside the valid Unicode range) is left in the text exactly
    as it was written instead of raising.
    """
    # The hexadecimal alternative must come first: "#x27" matches neither the
    # decimal alternative nor the name alternative.
    entity_re = re.compile(r"&(#[xX][0-9a-fA-F]+|#\d+|\w+);")

    def matchFunc(match):
        """Return the replacement text for a single entity match."""
        name = match.group(1)
        if not name.startswith("#"):
            # Named entity: fall back to the original text when unknown.
            return html.entities.entitydefs.get(name, match.group(0))
        digits = name[1:]
        try:
            if digits[0] in "xX":
                return chr(int(digits[1:], 16))
            return chr(int(digits))
        except (ValueError, OverflowError):
            # chr() rejects code points above 0x10FFFF; keep the raw text so
            # one malformed reference cannot break rendering of the tweet.
            return match.group(0)

    return str(entity_re.sub(matchFunc, s))
|
||||
|
||||
def find_urls_in_text(text):
    """Return a list of every substring of text matched by the url_re2 pattern."""
    matches = url_re2.findall(text)
    return matches
|
||||
|
||||
|
Reference in New Issue
Block a user