# Reconstructed from a collapsed unified diff of plugins/util/http.py.
# Only the imports the patch adds and the definitions that are fully visible
# in it are reproduced here.

try:  # Python 2 module names, as used by the patch
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:  # Python 3 locations of the same modules
    from html.parser import HTMLParser
    import html.entities as htmlentitydefs

try:
    unichr
except NameError:  # Python 3 has no unichr(); chr() plays the same role
    unichr = chr


class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, dropping all tags.

    Text nodes are kept verbatim; character and entity references are
    decoded to the characters they stand for.  References that cannot be
    decoded (unknown entity name, code point outside the Unicode range) are
    kept as literal ``&...;`` text instead of raising, so malformed markup
    never aborts the extraction.
    """

    def __init__(self):
        try:
            # Python 3.4+ decodes references itself by default, bypassing
            # handle_charref/handle_entityref; turn that off so behaviour is
            # the same as on Python 2.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Parser without the convert_charrefs keyword (Python 2).
            HTMLParser.__init__(self)
        self.result = []

    def handle_data(self, d):
        """Record a run of plain text."""
        self.result.append(d)

    def handle_charref(self, number):
        """Decode a numeric reference such as ``&#65;`` or ``&#x41;``.

        ``number`` is the text between ``&#`` and the terminator, with a
        leading ``x``/``X`` marking hexadecimal.
        """
        try:
            if number[0] in (u'x', u'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a valid code point: re-emit the reference as literal text.
            text = u'&#%s;' % number
        self.result.append(text)

    def handle_entityref(self, name):
        """Decode a named reference such as ``&amp;``.

        Names missing from ``htmlentitydefs.name2codepoint`` are re-emitted
        as literal ``&name;`` text.
        """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            self.result.append(u'&%s;' % name)
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        """Return everything collected so far as one unicode string."""
        return u''.join(self.result)


def get(*args, **kwargs):
    """Return ``open(*args, **kwargs).read()``.

    NOTE(review): the patch does not show which ``open`` is in scope here;
    presumably a module-level opener defined elsewhere in http.py -- confirm.
    """
    return open(*args, **kwargs).read()


def strip_html(html):
    """Return the text of ``html`` with all markup removed.

    References are decoded as described on ``HTMLTextExtractor``.  The
    parameter name is kept for caller compatibility; inside this function it
    shadows any module-level name ``html``.
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()