Improved HTML parsing for toots. Removed hashtag links from the URL list

This commit is contained in:
Manuel Cortez 2022-11-10 10:01:39 -06:00
parent f151d6554d
commit 0aad2f0ab3
No known key found for this signature in database
GPG Key ID: 9E0735CA15EFE790
2 changed files with 21 additions and 8 deletions

View File

@ -17,7 +17,8 @@ from mysc.thread_utils import call_threaded
from pubsub import pub from pubsub import pub
from extra import ocr from extra import ocr
from wxUI import buffers, dialogs, commonMessageDialogs from wxUI import buffers, dialogs, commonMessageDialogs
from wxUI.dialogs.mastodon import dialogs, menus from wxUI.dialogs.mastodon import menus
from wxUI.dialogs.mastodon import dialogs as mastodon_dialogs
log = logging.getLogger("controller.buffers.mastodon.base") log = logging.getLogger("controller.buffers.mastodon.base")
@ -330,7 +331,7 @@ class BaseBuffer(base.Buffer):
toot = self.get_item() toot = self.get_item()
id = toot.id id = toot.id
if self.session.settings["general"]["boost_mode"] == "ask": if self.session.settings["general"]["boost_mode"] == "ask":
answer = dialogs.boost_question() answer = mastodon_dialogs.boost_question()
if answer == True: if answer == True:
self._direct_boost(id) self._direct_boost(id)
else: else:
@ -378,9 +379,9 @@ class BaseBuffer(base.Buffer):
if url == '': if url == '':
toot = self.get_item() toot = self.get_item()
if toot.reblog != None: if toot.reblog != None:
urls = utils.find_urls(toot.reblog.content) urls = utils.find_urls(toot.REBLOG)
else: else:
urls = utils.find_urls(toot.reblog.content) urls = utils.find_urls(toot)
if len(urls) == 1: if len(urls) == 1:
url=urls[0] url=urls[0]
elif len(urls) > 1: elif len(urls) > 1:
@ -406,7 +407,7 @@ class BaseBuffer(base.Buffer):
if item.account.id != self.session.db["user_id"] or item.reblog != None: if item.account.id != self.session.db["user_id"] or item.reblog != None:
output.speak(_("You can delete only your own toots.")) output.speak(_("You can delete only your own toots."))
return return
answer = dialogs.delete_toot_dialog() answer = mastodon_dialogs.delete_toot_dialog()
if answer == True: if answer == True:
items = self.session.db[self.name] items = self.session.db[self.name]
try: try:

View File

@ -1,13 +1,19 @@
import re import re
from html.parser import HTMLParser from html.parser import HTMLParser
# Matches an HTML anchor tag and captures the value of its ``href`` attribute.
# - Raw string, so ``\s`` reaches the regex engine without an invalid-escape warning.
# - ``\s+`` after ``<a`` requires real whitespace, so ``<ahref=`` is not accepted.
# - The lazy optional group lets ``href`` follow other attributes
#   (e.g. ``<a class="hashtag" href="...">``) while still preferring the
#   first ``href`` in the tag.
# - Only ``'`` or ``"`` may open the attribute value (a literal ``|`` was
#   previously accepted by mistake).
url_re = re.compile(
    r"""<a\s+(?:[^>]*?\s)??href=['"](.*?)['"].*?>""",
    re.IGNORECASE,
)
class HTMLFilter(HTMLParser):
    """Collect the text content of an HTML fragment, keeping line breaks.

    Character data is appended to ``text`` as it is parsed. A ``<br>`` start
    tag appends one newline and a ``<p>`` start tag appends two, so paragraph
    and line structure survives once the markup is dropped. All other tags
    are ignored.
    """

    # Class-level default kept so ``HTMLFilter.text`` still resolves for any
    # code that reads it before an instance has been fed.
    text = ""

    def reset(self):
        """Reset the parser and discard any text collected so far.

        ``HTMLParser.__init__`` calls ``reset()``, so every new instance
        starts with its own empty ``text``; calling ``reset()`` by hand makes
        the instance reusable for another document.
        """
        super().reset()
        self.text = ""

    def handle_data(self, data):
        """Append a run of character data to the collected text."""
        self.text += data

    def handle_starttag(self, tag, attrs):
        """Turn ``<br>`` and ``<p>`` start tags into newlines."""
        if tag == "br":
            self.text += "\n"
        elif tag == "p":
            self.text += "\n\n"
def html_filter(data): def html_filter(data):
f = HTMLFilter() f = HTMLFilter()
f.feed(data) f.feed(data)
@ -45,5 +51,11 @@ def get_media_urls(toot):
urls.append(media.get("url")) urls.append(media.get("url"))
return urls return urls
def find_urls(toot, include_tags=False):
    """Return the link targets found in a toot's HTML content.

    Every ``href`` matched by the module-level ``url_re`` pattern in
    ``toot.content`` is returned, in document order.

    :param toot: object exposing ``content`` (an HTML string) and ``tags``
        (an iterable of mappings that each have a ``"name"`` key).
    :param include_tags: when true, return every link. When false (the
        default), drop links whose lower-cased form ends with
        ``/tags/<name>`` for any tag listed on the toot.
    :returns: list of URL strings.
    """
    urls = url_re.findall(toot.content)
    if include_tags:
        return urls
    # Lower-case both sides: the URL was already compared in lower case, so
    # the tag name has to be normalised too or mixed-case names never match.
    tag_suffixes = tuple("/tags/" + tag["name"].lower() for tag in toot.tags)
    # str.endswith accepts a tuple (an empty tuple never matches), so one
    # pass over the URLs replaces the nested loops and in-place removals.
    return [url for url in urls if not url.lower().endswith(tag_suffixes)]