import re
from html.parser import HTMLParser
# Matches the href value of <a> tags in a post's HTML content.
url_re = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')


class HTMLFilter(HTMLParser):
    """Flattens HTML into plain text, keeping <br> and <p> as line breaks."""

    text = ""
    first_paragraph = True

    def handle_data(self, data):
        self.text += data

    def handle_starttag(self, tag, attrs):
        if tag == "br":
            self.text += "\n"
        elif tag == "p":
            # Separate paragraphs with a blank line, but don't lead with one.
            if self.first_paragraph:
                self.first_paragraph = False
            else:
                self.text += "\n\n"


def html_filter(data):
    f = HTMLFilter()
    f.feed(data)
    return f.text
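
# Illustrative sketch, not from the original source: html_filter flattens the HTML
# body of a status into readable plain text. The sample markup below is an assumed
# example of typical Mastodon API output, not a captured response.
#
#   html_filter('<p>Hello</p><p>world<br>again</p>')
#   => 'Hello\n\nworld\nagain'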


def find_item(item, listItems):
    # Return the index of item in listItems, matching either the item itself or,
    # for boosts, the post it reblogs; return None when there is no match.
    for i in range(len(listItems)):
        if listItems[i].id == item.id:
            return i
        if hasattr(item, "reblog") and item.reblog is not None and item.reblog.id == listItems[i].id:
            return i
    return None
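
# Illustrative sketch, not from the original source: a boost of an already-listed
# post resolves to the index of that post. The objects here are stand-ins built
# with types.SimpleNamespace, an assumption for the example.
#
#   from types import SimpleNamespace
#   post = SimpleNamespace(id=1, reblog=None)
#   boost = SimpleNamespace(id=2, reblog=SimpleNamespace(id=1))
#   find_item(boost, [post])
#   => 0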


def is_audio_or_video(post):
    if post.reblog is not None:
        return is_audio_or_video(post.reblog)
    # Check the Mastodon-native attachments for video or audio.
    for media in post.media_attachments:
        if media["type"] == "video" or media["type"] == "audio":
            return True
    return False


def is_image(post):
    if post.reblog is not None:
        return is_image(post.reblog)
    # Check the Mastodon-native attachments for images or gifs.
    for media in post.media_attachments:
        if media["type"] == "gifv" or media["type"] == "image":
            return True
    return False


def get_media_urls(post):
    # Collect the audio/video attachment URLs of a post (or of the post it reblogs).
    if hasattr(post, "reblog") and post.reblog is not None:
        return get_media_urls(post.reblog)
    urls = []
    for media in post.media_attachments:
        if media.get("type") == "audio" or media.get("type") == "video":
            # Prefer the remote URL, falling back to the local "url" field.
            url_keys = ["remote_url", "url"]
            for url_key in url_keys:
                if media.get(url_key) is not None:
                    urls.append(media.get(url_key))
                    break
    return urls
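
# Illustrative sketch, not from the original source: media_attachments entries are
# dict-like, so a minimal fake post (built with types.SimpleNamespace, an assumption
# for the example) is enough to exercise get_media_urls.
#
#   from types import SimpleNamespace
#   media = {"type": "video", "remote_url": None, "url": "https://example.com/clip.mp4"}
#   get_media_urls(SimpleNamespace(reblog=None, media_attachments=[media]))
#   => ['https://example.com/clip.mp4']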


def find_urls(post, include_tags=False):
    # Extract link targets from a post's HTML content, optionally dropping the
    # hashtag links Mastodon generates for each tag.
    urls = url_re.findall(post.content)
    if not include_tags:
        for tag in post.tags:
            # Iterate over a copy so entries can be removed safely.
            for url in urls[:]:
                if url.lower().endswith("/tags/" + tag["name"]):
                    urls.remove(url)
    return urls
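

if __name__ == "__main__":
    # Minimal self-check sketch, not part of the original module: the fake status
    # below only mimics the attributes these helpers touch (content, tags, reblog,
    # media_attachments); real Mastodon.py entities carry many more fields.
    from types import SimpleNamespace

    sample = SimpleNamespace(
        reblog=None,
        content='<p>Read <a href="https://example.com/post">this</a> '
                '<a href="https://example.social/tags/python">#python</a></p>',
        tags=[{"name": "python"}],
        media_attachments=[],
    )

    print(html_filter(sample.content))           # "Read this #python"
    print(find_urls(sample))                     # hashtag link filtered out
    print(find_urls(sample, include_tags=True))  # both links kept
    print(is_image(sample), is_audio_or_video(sample))  # False False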