42 lines
1 KiB
Python
Executable file
42 lines
1 KiB
Python
Executable file
import errno
import re
from htmlentitydefs import name2codepoint
from HTMLParser import HTMLParseError
from HTMLParser import HTMLParser
|
|
|
|
class HTMLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text.

    The document is parsed in the constructor; the resulting plain text is
    exposed through the read-only ``stripped`` property.  ``<br>`` tags are
    turned into newlines, character and entity references are decoded, and
    every other tag is dropped.
    """

    def __init__(self, data):
        """Parse *data* right away and collect its text fragments.

        Args:
            data: The HTML markup to strip.
        """
        HTMLParser.__init__(self)
        self._stripped = []
        self.feed(data)
        # feed() alone may keep the end of the input buffered (for example a
        # trailing "&" that could still turn into an entity), which silently
        # dropped that text.  close() tells the parser the input is complete
        # so the remainder is flushed through the handlers.
        try:
            self.close()
        except HTMLParseError:
            # The parser can reject a document that ends in the middle of a
            # construct.  Stay best-effort: keep what was collected so far
            # instead of failing the whole strip.
            pass

    def handle_starttag(self, tag, attrs):
        """Emit a newline for ``<br>``; ignore every other opening tag."""
        if tag.lower() == 'br':
            self._stripped.append('\n')

    def handle_charref(self, name):
        """Decode a numeric character reference such as ``&#65;`` or ``&#x41;``.

        Args:
            name: The reference without ``&#`` and ``;``; a leading ``x`` or
                ``X`` marks a hexadecimal value.

        References that are not valid numbers, or whose code point cannot be
        represented, are dropped from the output.
        """
        try:
            if name.lower().startswith('x'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            self._stripped.append(unichr(codepoint))
        except (ValueError, OverflowError):
            # int() rejects malformed digits; unichr() rejects code points
            # outside the supported range.
            return

    def handle_entityref(self, name):
        """Decode a named entity such as ``&amp;``.

        Args:
            name: The entity name without ``&`` and ``;``.

        Names missing from ``name2codepoint`` are written back literally as
        ``&name;``.
        """
        try:
            codepoint = name2codepoint[name]
        except KeyError:
            char = u'&%s;' % name
        else:
            char = unichr(codepoint)
        self._stripped.append(char)

    def handle_data(self, data):
        """Keep plain text found between tags unchanged."""
        self._stripped.append(data)

    @property
    def stripped(self):
        """The collected text fragments joined into a single string."""
        return ''.join(self._stripped)
|
|
|
|
def strip_html(data):
    """Return the text content of the HTML markup *data*.

    Convenience wrapper: builds an ``HTMLStripper`` for *data* and hands
    back its ``stripped`` property.
    """
    stripper = HTMLStripper(data)
    return stripper.stripped
|