import sys
from HTMLParser import HTMLParser
class TextExtractor(HTMLParser):
    """Collect the visible text fragments of an HTML document.

    Feed markup with ``feed()``; every non-blank run of character data
    found outside the skipped elements is stripped of surrounding
    whitespace and appended, in document order, to ``output``.

    Attributes:
        recording: True while character data is being collected, False
            while the parser is inside at least one skipped element.
        output: list of the text fragments collected so far.
    """

    # Elements whose content is not page text. ``meta`` is deliberately
    # absent: it is a void element (no end tag, no content), so treating it
    # as a container would switch recording off with nothing to switch it
    # back on.
    _SKIPPED_TAGS = frozenset(['script', 'style', 'head', 'title', 'noscript'])

    def __init__(self):
        # Explicit base-class call rather than super(): HTMLParser is an
        # old-style class on Python 2, where super() does not work.
        HTMLParser.__init__(self)
        # Number of skipped elements currently open. A counter (not a flag)
        # is needed because they nest, e.g. <title> inside <head>.
        self._skip_depth = 0
        self.recording = True
        self.output = []

    def handle_starttag(self, tag, attrs):
        """Stop recording when a skipped element opens."""
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1
            self.recording = False
        elif tag == 'body':
            # </head> is optional in HTML; a <body> start tag implies that
            # any skipped element still open has ended.
            self._skip_depth = 0
            self.recording = True

    def handle_endtag(self, tag):
        """Resume recording once the outermost skipped element closes."""
        # The depth guard ignores stray end tags that were never opened.
        if tag in self._SKIPPED_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1
            self.recording = self._skip_depth == 0

    def handle_data(self, data):
        """Store *data*, whitespace-stripped, if recording and non-blank."""
        if not self.recording:
            return
        text = data.strip()
        if text:
            self.output.append(text)
if __name__ == "__main__":
    # Filter mode: read an HTML document from stdin and write its visible
    # text to stdout as a single space-separated line.
    parser = TextExtractor()
    parser.feed(sys.stdin.read())
    # close() makes the parser process anything still buffered as if it
    # were followed by end-of-file, so trailing text is not lost.
    parser.close()
    # Parenthesised single-argument form prints the same on Python 2 and
    # is also valid on Python 3.
    print(" ".join(parser.output))
For further actions, you may consider blocking this person and/or reporting abuse
Top comments (0)