Improved HTML parsing for toots. Removed tags from the URL list

This commit is contained in:
Manuel Cortez 2022-11-10 10:01:39 -06:00
parent f151d6554d
commit 0aad2f0ab3
No known key found for this signature in database
GPG Key ID: 9E0735CA15EFE790
2 changed files with 21 additions and 8 deletions

View File

@ -17,7 +17,8 @@ from mysc.thread_utils import call_threaded
from pubsub import pub
from extra import ocr
from wxUI import buffers, dialogs, commonMessageDialogs
from wxUI.dialogs.mastodon import dialogs, menus
from wxUI.dialogs.mastodon import menus
from wxUI.dialogs.mastodon import dialogs as mastodon_dialogs
log = logging.getLogger("controller.buffers.mastodon.base")
@ -330,7 +331,7 @@ class BaseBuffer(base.Buffer):
toot = self.get_item()
id = toot.id
if self.session.settings["general"]["boost_mode"] == "ask":
answer = dialogs.boost_question()
answer = mastodon_dialogs.boost_question()
if answer == True:
self._direct_boost(id)
else:
@ -378,9 +379,9 @@ class BaseBuffer(base.Buffer):
if url == '':
toot = self.get_item()
if toot.reblog != None:
urls = utils.find_urls(toot.reblog.content)
# The guard above tests toot.reblog; pass that same attribute (the name is case-sensitive).
urls = utils.find_urls(toot.reblog)
else:
urls = utils.find_urls(toot.reblog.content)
urls = utils.find_urls(toot)
if len(urls) == 1:
url=urls[0]
elif len(urls) > 1:
@ -406,7 +407,7 @@ class BaseBuffer(base.Buffer):
if item.account.id != self.session.db["user_id"] or item.reblog != None:
output.speak(_("You can delete only your own toots."))
return
answer = dialogs.delete_toot_dialog()
answer = mastodon_dialogs.delete_toot_dialog()
if answer == True:
items = self.session.db[self.name]
try:

View File

@ -1,13 +1,19 @@
import re
from html.parser import HTMLParser
url_re = re.compile("(?:\w+://|www\.)[^ ,.?!#%=+][^ \\n\\t]*")
# Captures the href value of anchor tags written as <a href="...">.
# Raw string so \s reaches the regex engine unchanged. The quote class holds
# only ' and ": a "|" inside a character class is matched literally, it does
# not mean alternation there.
url_re = re.compile(r"""<a\s*href=['"](.*?)['"].*?>""")
class HTMLFilter(HTMLParser):
    """Collect the text content of an HTML fragment as plain text.

    Character data is appended to ``text`` as it is parsed. A ``<br>`` start
    tag adds one newline and a ``<p>`` start tag adds two, so line and
    paragraph breaks survive the removal of the markup.
    """

    # Class-level default kept so ``HTMLFilter.text`` still resolves for any
    # code that reads it without creating an instance.
    text = ""

    def reset(self):
        """Reset parser state and discard any text collected so far.

        ``HTMLParser.__init__`` calls ``reset()``, so every new instance
        starts with its own empty ``text``.
        """
        super().reset()
        self.text = ""

    def handle_data(self, data):
        """Append a run of character data to the collected text."""
        self.text += data

    def handle_starttag(self, tag, attrs):
        """Translate line-breaking start tags into newlines.

        Tags other than ``br`` and ``p`` contribute nothing to the output.
        """
        if tag == "br":
            self.text += "\n"
        elif tag == "p":
            self.text += "\n\n"
def html_filter(data):
f = HTMLFilter()
f.feed(data)
@ -45,5 +51,11 @@ def get_media_urls(toot):
urls.append(media.get("url"))
return urls
def find_urls(text):
return url_re.findall(html_filter(text))
def find_urls(toot, include_tags=False):
    """Return the link targets matched by ``url_re`` in a toot's content.

    Args:
        toot: Object exposing ``content`` (the string that is searched) and
            ``tags`` (an iterable of mappings with a ``"name"`` key).
        include_tags: When false (the default), links whose address ends in
            ``/tags/<name>`` for one of the toot's tags are left out.

    Returns:
        A list of matches in the order they appear in the content.
    """
    urls = url_re.findall(toot.content)
    if include_tags:
        return urls
    # The URL is lowercased for the comparison, so the tag name must be
    # lowercased too; otherwise a mixed-case tag name can never match.
    tag_suffixes = tuple("/tags/" + tag["name"].lower() for tag in toot.tags)
    # str.endswith accepts a tuple, so one pass over the URLs covers every tag.
    return [url for url in urls if not url.lower().endswith(tag_suffixes)]