Tweaked http.py, started rewriting urlparse.py
This commit is contained in:
parent
8e877416a7
commit
6c022dac1c
|
@ -1,36 +1,21 @@
|
||||||
from util import hook, http, urlnorm
|
from util import hook, http, urlnorm
|
||||||
import re
|
|
||||||
|
|
||||||
# Captures the text between <title> and </title>. The (?s) flag lets "."
# span newlines and (?i) makes the tag match case-insensitive.
titler = re.compile(r'(?si)<title>(.+?)</title>')
|
|
||||||
|
|
||||||
|
|
||||||
def get_title(url):
    """Fetch *url* and return the text of its <title> element.

    The URL is passed through ``urlnorm.normalize`` and gets an ``http://``
    prefix when it contains no scheme separator.  When the URL reported by
    the response differs from the requested one, it is appended to the
    title in square brackets.

    Returns an error message string when the page cannot be fetched or
    decoded as UTF-8, or when no title is found in it.
    """
    failure = "Could not parse URL! Are you sure its valid?"

    url = urlnorm.normalize(url.encode('utf-8'))
    url = url.decode('utf-8')

    # add http if it's missing
    if "://" not in url:
        url = "http://" + url

    try:
        # fetch the page; only the network/decoding steps are guarded
        request = http.open(url)
        real_url = request.geturl()
        text = request.read().decode('utf8')
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return failure

    match = titler.search(text)
    if match is None:
        # no <title>...</title> in the page
        return failure

    title = http.unescape(match.group(1))

    # if the url has been redirected, show us
    if real_url == url:
        return title
    return u"%s [%s]" % (title, real_url)
|
|
||||||
|
|
||||||
|
|
||||||
@hook.command
def title(inp):
    "title <url> -- gets the title of a web page"
    # Normalize the user-supplied URL before fetching it.
    url = urlnorm.normalize(inp.encode('utf-8'))

    try:
        page = http.get_html(url)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return "Could not fetch page."

    try:
        node = page.find(".//title")
        # find() yields None when no <title> exists; .text is None when
        # the element is present but empty.
        text = node.text if node is not None else None
    except Exception:
        return "Could not find title."

    if not text:
        # Previously a None title reached http.unescape() and crashed there.
        return "Could not find title."

    return http.unescape(text)
|
|
@ -6,10 +6,7 @@ import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
from urllib import quote, quote_plus as _quote_plus
|
from urllib import quote as _quote, quote_plus as _quote_plus
|
||||||
from urllib2 import HTTPError, URLError
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
|
|
||||||
|
@ -30,9 +27,11 @@ def get(*args, **kwargs):
|
||||||
def get_url(*args, **kwargs):
    """Open a request with this module's open() and return the URL the
    response reports via geturl()."""
    response = open(*args, **kwargs)
    return response.geturl()
|
||||||
|
|
||||||
|
|
||||||
def get_html(*args, **kwargs):
    """Fetch content with get() and parse it with lxml's html.fromstring."""
    markup = get(*args, **kwargs)
    return html.fromstring(markup)
|
||||||
|
|
||||||
|
|
||||||
def get_xml(*args, **kwargs):
    """Fetch content with get() and parse it with lxml's etree.fromstring."""
    document = get(*args, **kwargs)
    return etree.fromstring(document)
|
||||||
|
|
||||||
|
@ -60,7 +59,7 @@ def open(url, query_params=None, user_agent=None, post_data=None,
|
||||||
request.get_method = lambda: get_method
|
request.get_method = lambda: get_method
|
||||||
|
|
||||||
request.add_header('User-Agent', user_agent)
|
request.add_header('User-Agent', user_agent)
|
||||||
|
|
||||||
if referer is not None:
|
if referer is not None:
|
||||||
request.add_header('Referer', referer)
|
request.add_header('Referer', referer)
|
||||||
|
|
||||||
|
@ -97,7 +96,11 @@ def quote_plus(s):
|
||||||
return _quote_plus(to_utf8(s))
|
return _quote_plus(to_utf8(s))
|
||||||
|
|
||||||
|
|
||||||
|
def quote(s):
    """Pass *s* through to_utf8() and quote the result with urllib's
    quote (imported as _quote)."""
    encoded = to_utf8(s)
    return _quote(encoded)
|
||||||
|
|
||||||
|
|
||||||
def unescape(s):
    """Return the plain-text content of the HTML fragment *s*.

    ``None``, empty and whitespace-only values are returned unchanged
    instead of being handed to the HTML parser.
    """
    # "not s" also covers None, which callers can pass when an element
    # has no text; previously that raised AttributeError on .strip().
    if not s or not s.strip():
        return s
    return html.fromstring(s).text_content()
|
Reference in a new issue