From 0aad2f0ab396e0a4ea1c3f0afc62d7c89c5f00e8 Mon Sep 17 00:00:00 2001
From: Manuel Cortez <manuel@manuelcortez.net>
Date: Thu, 10 Nov 2022 10:01:39 -0600
Subject: [PATCH] Improved html parsing for toots. Remove Tags from URLList

---
 src/controller/buffers/mastodon/base.py | 11 ++++++-----
 src/sessions/mastodon/utils.py          | 18 +++++++++++++++---
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/controller/buffers/mastodon/base.py b/src/controller/buffers/mastodon/base.py
index 7b2f76f4..d42d4a13 100644
--- a/src/controller/buffers/mastodon/base.py
+++ b/src/controller/buffers/mastodon/base.py
@@ -17,7 +17,8 @@ from mysc.thread_utils import call_threaded
 from pubsub import pub
 from extra import ocr
 from wxUI import buffers, dialogs, commonMessageDialogs
-from wxUI.dialogs.mastodon import dialogs, menus
+from wxUI.dialogs.mastodon import menus
+from wxUI.dialogs.mastodon import dialogs as mastodon_dialogs
 
 log = logging.getLogger("controller.buffers.mastodon.base")
 
@@ -330,7 +331,7 @@ class BaseBuffer(base.Buffer):
         toot = self.get_item()
         id = toot.id
         if self.session.settings["general"]["boost_mode"] == "ask":
-            answer = dialogs.boost_question()
+            answer = mastodon_dialogs.boost_question()
             if answer == True:
                 self._direct_boost(id)
         else:
@@ -378,9 +379,9 @@ class BaseBuffer(base.Buffer):
         if url == '':
             toot = self.get_item()
             if toot.reblog != None:
-                urls = utils.find_urls(toot.reblog.content)
+                urls = utils.find_urls(toot.reblog)
             else:
-                urls = utils.find_urls(toot.reblog.content)
+                urls = utils.find_urls(toot)
             if len(urls) == 1:
                 url=urls[0]
             elif len(urls) > 1:
@@ -406,7 +407,7 @@ class BaseBuffer(base.Buffer):
         if item.account.id != self.session.db["user_id"] or item.reblog != None:
             output.speak(_("You can delete only your own toots."))
             return
-        answer = dialogs.delete_toot_dialog()
+        answer = mastodon_dialogs.delete_toot_dialog()
         if answer == True:
             items = self.session.db[self.name]
             try:
diff --git a/src/sessions/mastodon/utils.py b/src/sessions/mastodon/utils.py
index 71738249..a824496d 100644
--- a/src/sessions/mastodon/utils.py
+++ b/src/sessions/mastodon/utils.py
@@ -1,13 +1,19 @@
 import re
 from html.parser import HTMLParser
 
-url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ \\n\\t]*")
+url_re = re.compile(r'<a\s+href=[\'"](.*?)[\'"].*?>')  # group 1 = href value of each <a> tag
 
 class HTMLFilter(HTMLParser):
     text = ""
     def handle_data(self, data):
         self.text += data
 
+    def handle_starttag(self, tag, attrs):  # turn <br>/<p> into line breaks in the plain text
+        if tag == "br":
+            self.text += "\n"
+        elif tag == "p" and self.text:  # skip the separator before the first paragraph
+            self.text += "\n\n"
+
 def html_filter(data):
     f = HTMLFilter()
     f.feed(data)
@@ -45,5 +51,11 @@ def get_media_urls(toot):
             urls.append(media.get("url"))
     return urls
 
-def find_urls(text):
-    return  url_re.findall(html_filter(text))
\ No newline at end of file
+def find_urls(toot, include_tags=False):
+    """Return the URLs url_re finds in toot.content; hashtag links are dropped unless include_tags is true."""
+    urls = url_re.findall(toot.content)
+    if include_tags:
+        return urls
+    # The URL is lowercased for the comparison, so the tag name must be lowercased too.
+    tag_suffixes = tuple("/tags/" + tag["name"].lower() for tag in toot.tags)
+    return [url for url in urls if not url.lower().endswith(tag_suffixes)]
\ No newline at end of file