From 6c022dac1c10ef0ac33ce408714989fe45303882 Mon Sep 17 00:00:00 2001 From: Luke Rogers Date: Wed, 5 Sep 2012 07:41:52 +1200 Subject: [PATCH] Tweaked http.py, started rewriting urlparse.py --- plugins/urlparse.py | 45 +++++++++++++++----------------------------- plugins/util/http.py | 15 +++++++++------ 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/plugins/urlparse.py b/plugins/urlparse.py index 9378051..7bf5abc 100755 --- a/plugins/urlparse.py +++ b/plugins/urlparse.py @@ -1,36 +1,21 @@ from util import hook, http, urlnorm -import re - -titler = re.compile(r'(?si)(.+?)') - - -def get_title(url): - url = urlnorm.normalize(url.encode('utf-8')) - url = url.decode('utf-8') - # add http if its missing - if not "://" in url: - url = "http://" + url - try: - # get the title - request = http.open(url) - real_url = request.geturl() - text = request.read() - text = text.decode('utf8') - match = titler.search(text) - title = match.group(1) - except: - return "Could not parse URL! Are you sure its valid?" - - title = http.unescape(title) - - # if the url has been redirected, show us - if real_url == url: - return title - else: - return u"%s [%s]" % (title, real_url) @hook.command def title(inp): "title -- gets the title of a web page" - return get_title(inp) + url = urlnorm.normalize(inp.encode('utf-8')) + + try: + page = http.get_html(url) + except: + return "Could not fetch page." + + try: + title = page.find(".//title").text + except: + return "Could not find title." + + title = http.unescape(title) + + return title \ No newline at end of file diff --git a/plugins/util/http.py b/plugins/util/http.py index b420311..58cddf8 100755 --- a/plugins/util/http.py +++ b/plugins/util/http.py @@ -6,10 +6,7 @@ import urllib import urllib2 import urlparse -from urllib import quote, quote_plus as _quote_plus -from urllib2 import HTTPError, URLError - - +from urllib import quote as _quote, quote_plus as _quote_plus from lxml import etree, html @@ -30,9 +27,11 @@ def get(*args, **kwargs): def get_url(*args, **kwargs): return open(*args, **kwargs).geturl() + def get_html(*args, **kwargs): return html.fromstring(get(*args, **kwargs)) + def get_xml(*args, **kwargs): return etree.fromstring(get(*args, **kwargs)) @@ -60,7 +59,7 @@ def open(url, query_params=None, user_agent=None, post_data=None, request.get_method = lambda: get_method request.add_header('User-Agent', user_agent) - + if referer is not None: request.add_header('Referer', referer) @@ -97,7 +96,11 @@ def quote_plus(s): return _quote_plus(to_utf8(s)) +def quote(s): + return _quote(to_utf8(s)) + + def unescape(s): if not s.strip(): return s - return html.fromstring(s).text_content() + return html.fromstring(s).text_content() \ No newline at end of file