This repository has been archived on 2023-04-13. You can view files and clone it, but cannot push or open issues or pull requests.
CloudBot/plugins/urlparse.py

37 lines
989 B
Python
Raw Normal View History

2011-11-20 10:23:31 +01:00
from util import hook, http, urlnorm
import re
# Pre-compiled pattern that captures the text between the first <title> and
# </title> tags. The inline flags make "." match newlines too ((?s)) and make
# the tag match case-insensitive ((?i)); the capture is non-greedy.
titler = re.compile(r'(?si)<title>(.+?)</title>')
def parse(url):
    """Fetch a web page and return the contents of its <title> tag.

    The URL is first passed through ``urlnorm.normalize`` and given an
    ``http://`` prefix when it has neither an ``http://`` nor an
    ``https://`` prefix. The page is then fetched with ``http.open`` and
    searched with the module-level ``titler`` pattern.

    Args:
        url: the address to look up, as text.

    Returns:
        The title passed through ``http.unescape``. When the URL reported
        by the response differs from the requested one (e.g. after a
        redirect), the result is ``"<title> [<final url>]"``. If the page
        cannot be fetched or has no title, a fixed error message is
        returned instead of raising.
    """
    failure = "Could not parse URL! Are you sure its valid?"

    url = urlnorm.normalize(url.encode('utf-8'))
    url = url.decode('utf-8')

    # add http if its missing
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    try:
        # get the page
        request = http.open(url)
        real_url = request.geturl()
        # Decode leniently so a single invalid byte does not discard an
        # otherwise readable title.
        text = request.read().decode('utf8', 'replace')
    except Exception:
        # Best-effort command: any fetch/decode failure becomes a friendly
        # message, but KeyboardInterrupt/SystemExit are no longer swallowed.
        return failure

    # get the title
    match = titler.search(text)
    if match is None:
        return failure

    title = http.unescape(match.group(1))

    # if the url has been redirected, show us
    if real_url == url:
        return title
    return u"%s [%s]" % (title, real_url)
@hook.command
def title(inp):
    ".title <url> -- gets the title of a web page"
    # NOTE(review): the docstring above is kept verbatim; hook.command
    # presumably exposes it as the command's help text -- confirm before
    # rewording it.
    page_title = parse(inp)
    return page_title