Tweaked http.py, started rewriting urlparse.py

2012-09-05 07:41:52 +12:00 · 2012-09-05 07:41:52 +12:00 · 6c022dac1c
commit 6c022dac1c
parent 8e877416a7
2 changed files with 24 additions and 36 deletions
--- a/plugins/urlparse.py
+++ b/plugins/urlparse.py
@@ -1,36 +1,21 @@
 from util import hook, http, urlnorm
-import re
-
-titler = re.compile(r'(?si)<title>(.+?)</title>')
-
-
-def get_title(url):
-    url = urlnorm.normalize(url.encode('utf-8'))
-    url = url.decode('utf-8')
-    # add http if its missing
-    if not "://" in url:
-        url = "http://" + url
-    try:
-        # get the title
-        request = http.open(url)
-        real_url = request.geturl()
-        text = request.read()
-        text = text.decode('utf8')
-        match = titler.search(text)
-        title = match.group(1)
-    except:
-        return "Could not parse URL! Are you sure its valid?"
-
-    title = http.unescape(title)
-
-    # if the url has been redirected, show us
-    if real_url == url:
-        return title
-    else:
-        return u"%s [%s]" % (title, real_url)


@hook.command
 def title(inp):
    "title <url> -- gets the title of a web page"
-    return get_title(inp)
+    url = urlnorm.normalize(inp.encode('utf-8'))
+
+    try:
+        page = http.get_html(url)
+    except:
+        return "Could not fetch page."
+
+    try:
+        title = page.find(".//title").text
+    except:
+        return "Could not find title."
+
+    title = http.unescape(title)
+
+    return title
--- a/plugins/util/http.py
+++ b/plugins/util/http.py
@@ -6,10 +6,7 @@ import urllib
 import urllib2
 import urlparse

-from urllib import quote, quote_plus as _quote_plus
-from urllib2 import HTTPError, URLError
-
-
+from urllib import quote as _quote, quote_plus as _quote_plus

 from lxml import etree, html

@@ -30,9 +27,11 @@ def get(*args, **kwargs):
 def get_url(*args, **kwargs):
    return open(*args, **kwargs).geturl()

+
 def get_html(*args, **kwargs):
    return html.fromstring(get(*args, **kwargs))

+
 def get_xml(*args, **kwargs):
    return etree.fromstring(get(*args, **kwargs))

@@ -60,7 +59,7 @@ def open(url, query_params=None, user_agent=None, post_data=None,
        request.get_method = lambda: get_method

    request.add_header('User-Agent', user_agent)
-    
+
    if referer is not None:
        request.add_header('Referer', referer)

@@ -97,7 +96,11 @@ def quote_plus(s):
    return _quote_plus(to_utf8(s))


+def quote(s):
+    return _quote(to_utf8(s))
+
+
 def unescape(s):
    if not s.strip():
        return s
-    return html.fromstring(s).text_content()
+    return html.fromstring(s).text_content()