mirror of
https://github.com/MCV-Software/TWBlue.git
synced 2025-11-08 15:17:04 +00:00
Added a better HTML filter to remove elements with certain classes
This commit is contained in:
@@ -3,16 +3,32 @@ import demoji
|
|||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
# Matches an opening <a> tag whose href attribute directly follows the tag
# name and captures the quoted URL (group 1).
# Written as a raw string so that "\s" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
# NOTE(review): the "|" inside the first character class is a literal pipe,
# not an alternation, so a "|" is also accepted as the opening delimiter.
# That looks unintended, but the pattern is kept as-is -- confirm before
# tightening it.
url_re = re.compile(r'<a\s*href=[\'|"](.*?)[\'"].*?>')
|
||||||
|
|
||||||
class HTMLFilter(HTMLParser):
    """Convert an HTML fragment to plain text, dropping ignored elements.

    Feed markup with ``feed()`` and read the accumulated result from
    ``text``.  A ``<br>`` becomes a newline, every processed ``<p>`` after
    the first one is preceded by a blank line, and any element carrying one
    of ``IGNORED_CLASSES`` is dropped together with everything nested in it.
    """

    # Classes to ignore when parsing HTML
    IGNORED_CLASSES = ["quote-inline"]

    # Elements that never have an end tag.  They must not take part in the
    # nesting count: "<br>" only triggers handle_starttag, so counting it
    # would leave skip_depth above zero forever and swallow all later text.
    VOID_ELEMENTS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    text = ""
    first_paragraph = True
    skip_depth = 0  # Track nesting depth of ignored elements

    def handle_data(self, data):
        """Append text content unless it sits inside an ignored element."""
        if self.skip_depth == 0:
            self.text += data

    def _has_ignored_class(self, attrs):
        """Return True if the ``class`` attribute lists an ignored class.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; the value is ``None`` for a valueless attribute.
        """
        class_value = dict(attrs).get("class") or ""
        # The class attribute is a whitespace-separated token list, so compare
        # whole tokens instead of searching for a substring.
        tokens = class_value.split()
        return any(name in tokens for name in self.IGNORED_CLASSES)

    def handle_starttag(self, tag, attrs):
        """Track ignored elements and emit line breaks for <br> and <p>."""
        if self.skip_depth > 0 or self._has_ignored_class(attrs):
            # Entering an ignored element, or a tag nested inside one: only
            # elements that will be closed later increase the depth.
            if tag not in self.VOID_ELEMENTS:
                self.skip_depth += 1
            return

        if tag == "br":
            self.text = self.text + "\n"
        elif tag == "p":
            if self.first_paragraph:
                # No separator before the very first processed paragraph.
                self.first_paragraph = False
            else:
                self.text = self.text + "\n\n"

    def handle_endtag(self, tag):
        """Leave one nesting level when a tag closes inside an ignored element."""
        # Void elements were never counted on the way in ("<br/>" reaches this
        # method through handle_startendtag), so they are not counted here.
        if self.skip_depth > 0 and tag not in self.VOID_ELEMENTS:
            self.skip_depth -= 1
|
||||||
|
|
||||||
def html_filter(data):
|
def html_filter(data):
|
||||||
f = HTMLFilter()
|
f = HTMLFilter()
|
||||||
|
|||||||
Reference in New Issue
Block a user