Improved HTML parsing for toots. Removed hashtag links from the URL list.

2025-07-21 23:36:08 -04:00 · 2022-11-10 10:01:39 -06:00
parent f151d6554d
commit 0aad2f0ab3
2 changed files with 21 additions and 8 deletions
--- a/src/controller/buffers/mastodon/base.py
+++ b/src/controller/buffers/mastodon/base.py
@@ -17,7 +17,8 @@ from mysc.thread_utils import call_threaded
 from pubsub import pub
 from extra import ocr
 from wxUI import buffers, dialogs, commonMessageDialogs
-from wxUI.dialogs.mastodon import dialogs, menus
+from wxUI.dialogs.mastodon import menus
+from wxUI.dialogs.mastodon import dialogs as mastodon_dialogs

 log = logging.getLogger("controller.buffers.mastodon.base")

@@ -330,7 +331,7 @@ class BaseBuffer(base.Buffer):
        toot = self.get_item()
        id = toot.id
        if self.session.settings["general"]["boost_mode"] == "ask":
-            answer = dialogs.boost_question()
+            answer = mastodon_dialogs.boost_question()
            if answer == True:
                self._direct_boost(id)
        else:
@@ -378,9 +379,9 @@ class BaseBuffer(base.Buffer):
        if url == '':
            toot = self.get_item()
            if toot.reblog != None:
-                urls = utils.find_urls(toot.reblog.content)
+                urls = utils.find_urls(toot.reblog)  # same attribute the guard above tests; names are case-sensitive
            else:
-                urls = utils.find_urls(toot.reblog.content)
+                urls = utils.find_urls(toot)
            if len(urls) == 1:
                url=urls[0]
            elif len(urls) > 1:
@@ -406,7 +407,7 @@ class BaseBuffer(base.Buffer):
        if item.account.id != self.session.db["user_id"] or item.reblog != None:
            output.speak(_("You can delete only your own toots."))
            return
-        answer = dialogs.delete_toot_dialog()
+        answer = mastodon_dialogs.delete_toot_dialog()
        if answer == True:
            items = self.session.db[self.name]
            try:
--- a/src/sessions/mastodon/utils.py
+++ b/src/sessions/mastodon/utils.py
@@ -1,13 +1,19 @@
 import re
 from html.parser import HTMLParser

-url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ \\n\\t]*")
+url_re = re.compile(r'<a\s+href=[\'"](.*?)[\'"].*?>')  # group 1 is the quoted href value of each <a> tag

 class HTMLFilter(HTMLParser):
    text = ""
    def handle_data(self, data):
        self.text += data

+    def handle_starttag(self, tag, attrs):
+        """Append a newline for <br> and a blank line for <p>; other tags add nothing."""
+        # Opening tags never reach handle_data, so the breaks are inserted here.
+        line_breaks = {"br": "\n", "p": "\n\n"}
+        self.text += line_breaks.get(tag, "")
+
 def html_filter(data):
    f = HTMLFilter()
    f.feed(data)
@@ -45,5 +51,11 @@ def get_media_urls(toot):
            urls.append(media.get("url"))
    return urls

-def find_urls(text):
-    return  url_re.findall(html_filter(text))
+def find_urls(toot, include_tags=False):
+    """Return the href values found in toot.content; hashtag links are dropped unless include_tags is true."""
+    urls = url_re.findall(toot.content)
+    if include_tags:
+        return urls
+    # A link counts as a hashtag when it ends in /tags/<name>; both sides are lowercased before comparing.
+    tag_suffixes = tuple("/tags/" + tag["name"].lower() for tag in toot.tags)
+    return [url for url in urls if not url.lower().endswith(tag_suffixes)]