Text should be cleaned better in posts, comments and topic comments. Cleaned texts will properly render usernames, group names and certain unicode characters

This commit is contained in:
Manuel Cortez 2019-02-03 20:56:32 -06:00
parent 2496f19bee
commit 38b0eec741
3 changed files with 23 additions and 24 deletions

View File

@ -1,6 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
import os import os
import six import six
import threading import threading
@ -25,7 +23,7 @@ log = logging.getLogger(__file__)
def get_message(status): def get_message(status):
message = "" message = ""
if "text" in status: if "text" in status:
message = renderers.clean_text(status["text"]) message = utils.clean_text(status["text"])
return message return message
class displayPostPresenter(base.basePresenter): class displayPostPresenter(base.basePresenter):
@ -83,7 +81,7 @@ class displayPostPresenter(base.basePresenter):
extra_info = self.session.get_user(i["reply_to_user"])["user1_nom"] extra_info = self.session.get_user(i["reply_to_user"])["user1_nom"]
from_ = _("{0} > {1}").format(from_, extra_info) from_ = _("{0} > {1}").format(from_, extra_info)
# As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists. # As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists.
fixed_text = re.sub("^\[id\d+\|\D+\], ", "", i["text"]) fixed_text = utils.clean_text(i["text"])
if len(fixed_text) > 140: if len(fixed_text) > 140:
text = fixed_text[:141] text = fixed_text[:141]
else: else:
@ -297,7 +295,7 @@ class displayPostPresenter(base.basePresenter):
else: else:
from_ = from_["user1_nom"] from_ = from_["user1_nom"]
# As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists. # As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists.
fixed_text = re.sub("^\[id\d+\|\D+\], ", "", comment_object["text"]) fixed_text = utils.clean_text(comment_object["text"])
if len(fixed_text) > 140: if len(fixed_text) > 140:
text = fixed_text[:141] text = fixed_text[:141]
else: else:
@ -484,7 +482,7 @@ class displayCommentPresenter(displayPostPresenter):
else: else:
from_ = from_["user1_nom"] from_ = from_["user1_nom"]
# As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists. # As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists.
fixed_text = re.sub("^\[id\d+\|\D+\], ", "", i["text"]) fixed_text = utils.clean_text(i["text"])
if len(fixed_text) > 140: if len(fixed_text) > 140:
text = fixed_text[:141] text = fixed_text[:141]
else: else:
@ -548,17 +546,10 @@ class displayTopicPresenter(displayPostPresenter):
continue continue
from_ = self.session.get_user(i["from_id"])["user1_nom"] from_ = self.session.get_user(i["from_id"])["user1_nom"]
# match user mentions inside text comment. # match user mentions inside text comment.
matched_data = re.match(".*(\[)(id|club)(\d+:bp-\d+_\d+\|)(\D+)(\])", i["text"])
# If matched data exists we should modify the title.
# if len(matched_data.groups()) > 2:
# from_ = "{from_} > {to_}".format(from_=from_, to_=matched_data.groups()[1])
original_date = arrow.get(i["date"]) original_date = arrow.get(i["date"])
created_at = original_date.humanize(locale=languageHandler.curLang[:2]) created_at = original_date.humanize(locale=languageHandler.curLang[:2])
likes = str(i["likes"]["count"]) likes = str(i["likes"]["count"])
if matched_data != None: text = utils.clean_text(text=i["text"])
text = re.sub("\[(id|club)\d+:bp-\d+_\d+\|\D+\]", matched_data.groups()[3]+", ", i["text"])
else:
text = i["text"]
comments_.append((from_, text, created_at, likes)) comments_.append((from_, text, created_at, likes))
self.send_message("add_items", control="comments", items=comments_) self.send_message("add_items", control="comments", items=comments_)

View File

@ -1,12 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" this module contains everything used to render different kind of posts (posts in the home buffer, """ this module contains everything used to render different kind of posts (posts in the home buffer,
Chat messages, audios, videos, photos, comments in posts, etc)""" Chat messages, audios, videos, photos, comments in posts, etc)"""
from __future__ import unicode_literals
from builtins import range
import arrow import arrow
import languageHandler import languageHandler
import logging import logging
from . utils import seconds_to_string from . utils import seconds_to_string, clean_text
log = logging.getLogger(__file__) log = logging.getLogger(__file__)
@ -50,12 +48,6 @@ def clean_audio(audio):
audio["count"] = audio["count"] -1 audio["count"] = audio["count"] -1
return audio return audio
def clean_text(text):
    """ Turn ``<br>`` tags and literal backslash-n sequences into real newlines and return the result."""
    # Order matters only for parity with the previous implementation: tags first, then escaped newlines.
    for marker in ("<br>", "\\n"):
        text = text.replace(marker, "\n")
    return text
def add_attachment(attachment): def add_attachment(attachment):
msg = "" msg = ""
tpe = "" tpe = ""

View File

@ -1,9 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" Some utilities. I no have idea how I should put these, so...""" """ Some utilities. I no have idea how I should put these, so..."""
import os import os
import requests
import re import re
import html
import logging import logging
import requests
log = logging.getLogger("utils") log = logging.getLogger("utils")
url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ ]*") url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ ]*")
@ -57,3 +58,18 @@ def download_file(url, local_filename, window):
window.change_status(_("Ready")) window.change_status(_("Ready"))
return local_filename return local_filename
def detect_users(text):
    """ Replace every user or community mention in a VK text with its display name.

    Two mention forms are recognised:

    * ``[id123:bp-45_67|Name]`` / ``[club123:bp-45_67|Name]``, used in topic comments.
    * ``[id123|Name]`` / ``[club123|Name]``, used in wall posts and comments.

    Each mention is replaced by ``Name`` followed by ``", "`` (the suffix is kept
    for compatibility with existing callers). Text without mentions is returned unchanged.
    """
    def _display_name(match):
        # A callable replacement keeps backslashes in the name literal;
        # a plain string would be parsed as a replacement template.
        return match.group(1) + ", "

    # One pass handles both forms: the ":bp-<n>_<n>" part is optional.
    # The name stops at the first "]", so it may contain digits and can never
    # swallow unrelated bracketed text later in the string.
    return re.sub(r"\[(?:id|club)\d+(?::bp-\d+_\d+)?\|([^\]]+)\]", _display_name, text)
def clean_text(text):
    """ Return text with VK mentions resolved and HTML character references decoded.

    Mention markup is rewritten by detect_users first; the result is then passed through html.unescape.
    """
    return html.unescape(detect_users(text))