From 0aad2f0ab396e0a4ea1c3f0afc62d7c89c5f00e8 Mon Sep 17 00:00:00 2001 From: Manuel Cortez Date: Thu, 10 Nov 2022 10:01:39 -0600 Subject: [PATCH] Improved html parsing for toots. Remove Tags from URLList --- src/controller/buffers/mastodon/base.py | 11 ++++++----- src/sessions/mastodon/utils.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/controller/buffers/mastodon/base.py b/src/controller/buffers/mastodon/base.py index 7b2f76f4..d42d4a13 100644 --- a/src/controller/buffers/mastodon/base.py +++ b/src/controller/buffers/mastodon/base.py @@ -17,7 +17,8 @@ from mysc.thread_utils import call_threaded from pubsub import pub from extra import ocr from wxUI import buffers, dialogs, commonMessageDialogs -from wxUI.dialogs.mastodon import dialogs, menus +from wxUI.dialogs.mastodon import menus +from wxUI.dialogs.mastodon import dialogs as mastodon_dialogs log = logging.getLogger("controller.buffers.mastodon.base") @@ -330,7 +331,7 @@ class BaseBuffer(base.Buffer): toot = self.get_item() id = toot.id if self.session.settings["general"]["boost_mode"] == "ask": - answer = dialogs.boost_question() + answer = mastodon_dialogs.boost_question() if answer == True: self._direct_boost(id) else: @@ -378,9 +379,9 @@ class BaseBuffer(base.Buffer): if url == '': toot = self.get_item() if toot.reblog != None: - urls = utils.find_urls(toot.reblog.content) + urls = utils.find_urls(toot.reblog) else: - urls = utils.find_urls(toot.reblog.content) + urls = utils.find_urls(toot) if len(urls) == 1: url=urls[0] elif len(urls) > 1: @@ -406,7 +407,7 @@ class BaseBuffer(base.Buffer): if item.account.id != self.session.db["user_id"] or item.reblog != None: output.speak(_("You can delete only your own toots.")) return - answer = dialogs.delete_toot_dialog() + answer = mastodon_dialogs.delete_toot_dialog() if answer == True: items = self.session.db[self.name] try: diff --git a/src/sessions/mastodon/utils.py b/src/sessions/mastodon/utils.py index 
71738249..a824496d 100644 --- a/src/sessions/mastodon/utils.py +++ b/src/sessions/mastodon/utils.py @@ -1,13 +1,19 @@ import re from html.parser import HTMLParser -url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ \\n\\t]*") +url_re = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>') class HTMLFilter(HTMLParser): text = "" def handle_data(self, data): self.text += data + def handle_starttag(self, tag, attrs): + if tag == "br": + self.text = self.text+"\n" + elif tag == "p": + self.text = self.text+"\n\n" + def html_filter(data): f = HTMLFilter() f.feed(data) @@ -45,5 +51,11 @@ def get_media_urls(toot): urls.append(media.get("url")) return urls -def find_urls(text): - return url_re.findall(html_filter(text)) \ No newline at end of file +def find_urls(toot, include_tags=False): + urls = url_re.findall(toot.content) + if include_tags == False: + for tag in toot.tags: + for url in urls[::]: + if url.lower().endswith("/tags/"+tag["name"]): + urls.remove(url) + return urls \ No newline at end of file