Text should be cleaned better in posts, comments and topic comments. Cleaned text will properly render usernames, group names and certain Unicode characters.

2019-02-03 20:56:32 -06:00 · 2019-02-03 20:56:32 -06:00 · 38b0eec741
commit 38b0eec741
parent 2496f19bee
3 changed files with 23 additions and 24 deletions
--- a/src/presenters/postDisplayer.py
+++ b/src/presenters/postDisplayer.py
@ -1,6 +1,4 @@
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-import re
 import os
 import six
 import threading
@ -25,7 +23,7 @@ log = logging.getLogger(__file__)
 def get_message(status):
 	message = ""
 	if "text" in status:
-		message = renderers.clean_text(status["text"])
+		message = utils.clean_text(status["text"])
 	return message

 class displayPostPresenter(base.basePresenter):
@ -83,7 +81,7 @@ class displayPostPresenter(base.basePresenter):
 				extra_info = self.session.get_user(i["reply_to_user"])["user1_nom"]
 				from_ = _("{0} > {1}").format(from_, extra_info)
 			# As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists.
-			fixed_text = re.sub("^\[id\d+\|\D+\], ", "", i["text"])
+			fixed_text = utils.clean_text(i["text"])
 			if len(fixed_text) > 140:
 				text = fixed_text[:141]
 			else:
@ -297,7 +295,7 @@ class displayPostPresenter(base.basePresenter):
 			else:
 				from_ = from_["user1_nom"]
 			# As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists.
-			fixed_text = re.sub("^\[id\d+\|\D+\], ", "", comment_object["text"])
+			fixed_text = utils.clean_text(comment_object["text"])
 			if len(fixed_text) > 140:
 				text = fixed_text[:141]
 			else:
@ -484,7 +482,7 @@ class displayCommentPresenter(displayPostPresenter):
 			else:
 				from_ = from_["user1_nom"]
 			# As we set the comment reply properly in the from_ field, let's remove the first username from here if it exists.
-			fixed_text = re.sub("^\[id\d+\|\D+\], ", "", i["text"])
+			fixed_text = utils.clean_text(i["text"])
 			if len(fixed_text) > 140:
 				text = fixed_text[:141]
 			else:
@ -548,17 +546,10 @@ class displayTopicPresenter(displayPostPresenter):
 				continue
 			from_ = self.session.get_user(i["from_id"])["user1_nom"]
 			# match user mentions inside text comment.
-			matched_data = re.match(".*(\[)(id|club)(\d+:bp-\d+_\d+\|)(\D+)(\])", i["text"])
-			# If matched data exists we should modify the title.
-#			if len(matched_data.groups()) > 2:
-#				from_ = "{from_} > {to_}".format(from_=from_, to_=matched_data.groups()[1])
 			original_date = arrow.get(i["date"])
 			created_at = original_date.humanize(locale=languageHandler.curLang[:2])
 			likes = str(i["likes"]["count"])
-			if matched_data != None:
-				text = re.sub("\[(id|club)\d+:bp-\d+_\d+\|\D+\]", matched_data.groups()[3]+", ", i["text"])
-			else:
-				text = i["text"]
+			text = utils.clean_text(text=i["text"])
 			comments_.append((from_, text, created_at, likes))
 		self.send_message("add_items", control="comments", items=comments_)

--- a/src/sessionmanager/renderers.py
+++ b/src/sessionmanager/renderers.py
@ -1,12 +1,10 @@
 # -*- coding: utf-8 -*-
 """ this module contains everything used to render different kind of posts (posts in the home buffer,
 Chat messages, audios, videos, photos, comments in posts, etc)"""
-from __future__ import unicode_literals
-from builtins import range
 import arrow
 import languageHandler
 import logging
-from . utils import seconds_to_string
+from . utils import seconds_to_string, clean_text

 log = logging.getLogger(__file__)

@ -50,12 +48,6 @@ def clean_audio(audio):
 			audio["count"] = audio["count"] -1
 	return audio

-def clean_text(text):
-	""" Replaces all HTML entities and put the plain text equivalent if it's possible."""
-	text = text.replace("<br>", "\n")
-	text = text.replace("\\n", "\n")
-	return text 
-
 def add_attachment(attachment):
 	msg = ""
 	tpe = ""
--- a/src/sessionmanager/utils.py
+++ b/src/sessionmanager/utils.py
@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
 """ Some utilities. I no have idea how I should put these, so..."""
 import os
-import requests
 import re
+import html
 import logging
+import requests

 log = logging.getLogger("utils")
 url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ ]*")
@ -57,3 +58,18 @@ def download_file(url, local_filename, window):
 	window.change_status(_("Ready"))
 	return local_filename

+def detect_users(text):
+	""" Detect all users and communities mentioned in any text posted in VK."""
+	# This regexp gets groups and users mentioned in topic comments.
+	for matched_data in re.finditer("(\[)(id|club)(\d+:bp-\d+_\d+\|)(\D+)(\])", text):
+		text = re.sub("\[(id|club)\d+:bp-\d+_\d+\|\D+\]", matched_data.groups()[3]+", ", text, count=1)
+	# This is for users and communities mentioned in wall comments or posts.
+	for matched_data in  re.finditer("(\[)(id|club)(\d+\|)(\D+)(\])", text):
+		text = re.sub("\[(id|club)\d+\|\D+\]", matched_data.groups()[3]+", ", text, count=1)
+	return text
+
+def clean_text(text):
+	""" Clean text, replacing user and community mentions with their names and converting HTML-encoded characters into their Unicode counterparts."""
+	text = detect_users(text)
+	text = html.unescape(text)
+	return text