Removed poken factoid prefixes, added more HTML entity codes to the URL parser

This commit is contained in:
Luke Rogers 2011-11-26 13:15:43 +13:00
parent 2098397120
commit fbde2addaa

View file

@ -1,9 +1,25 @@
from util import hook, http, urlnorm from util import hook, http, urlnorm
import urllib import urllib
from urllib2 import urlopen, Request, HTTPError
import re import re
import BeautifulSoup import BeautifulSoup
ignored_urls = ["http://google.com","http://youtube.com"] ignored_urls = ["http://google.com","http://youtube.com","http://pastebin.com","http://mibpaste.com","http://fpaste.com"]
# Maps HTML character references that show up in scraped page titles to the
# literal characters they stand for.  Each character is listed twice: once by
# numeric reference (&#NN;) and once by named reference.
# NOTE(review): the first three keys and the last one had been rendered as the
# characters themselves (which also made the "'" entry a syntax error); they
# are restored here from the numeric/named pairing of the table.  The final
# key is assumed to be the non-breaking-space entity -- confirm against the
# repository copy.
wordDic = {
    '&#34;': '"',
    '&#39;': '\'',
    '&#38;': '&',
    '&#60;': '<',
    '&#62;': '>',
    '&#171;': '«',
    '&quot;': '"',
    '&apos;': '\'',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&laquo;': '«',
    '&nbsp;': ' '}
def parse(match): def parse(match):
url = urlnorm.normalize(match.encode('utf-8')) url = urlnorm.normalize(match.encode('utf-8'))
@ -15,18 +31,36 @@ def parse(match):
except: except:
return "fail" return "fail"
# Ask the bit.ly v3 "shorten" endpoint for a short form of `url`, sending
# `user` as the login and `apikey` as the API key, and return the short URL
# taken from the JSON reply.
# NOTE(review): the block's indentation has been lost in this view and no
# except/finally clause for the `try` below is visible -- what a caller sees
# on failure depends on that missing handler; confirm against the real file.
def tiny(url, user, apikey):
try:
# Build the query string: long URL, credentials, and a request for JSON output.
params = urllib.urlencode({'longUrl': url, 'login': user, 'apiKey': apikey, 'format': 'json'})
j = http.get_json("http://api.bit.ly/v3/shorten?%s" % params)
# A status_code of 200 in the reply body is treated as success.
if j['status_code'] == 200:
return j['data']['url']
# Any other status is turned into an exception carrying the reply's status_txt.
raise Exception('%s'%j['status_txt'])
def multiwordReplace(text, wordDic):
    """Replace every occurrence of each key of ``wordDic`` in ``text``.

    All substitutions happen in a single left-to-right pass, so text that was
    produced by one replacement is never scanned again for further keys.

    Args:
        text: The string to process.
        wordDic: Mapping of literal substrings to their replacements.

    Returns:
        ``text`` with each matched key replaced by its mapped value.  An empty
        mapping returns ``text`` unchanged.
    """
    # An empty mapping would compile to the empty pattern, which matches at
    # every position and then fails the lookup with KeyError('').
    if not wordDic:
        return text

    # Regex alternation takes the first branch that matches, so try longer
    # keys first; otherwise a short key such as '&' would shadow '&amp;'.
    keys = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, keys)))

    def translate(match):
        return wordDic[match.group(0)]

    return rc.sub(translate, text)
#@hook.regex(r'^(?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?$') #@hook.regex(r'^(?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?$')
@hook.regex(r'([a-zA-Z]+://|www\.)[^ ]+')
def urlparser(match, say=None, bot=None):
    """Announce the title and a shortened form of a URL seen in chat.

    The matched URL is normalized, skipped if it contains any entry of
    ``ignored_urls``, and skipped if ``parse`` reports "fail".  Otherwise the
    URL is shortened with ``tiny`` using the bit.ly credentials from the bot
    configuration, HTML entities in the title are replaced via ``wordDic``,
    and the result is sent through ``say``.
    """
    url = urlnorm.normalize(match.group().encode('utf-8'))

    # Credentials are read up front, before any of the skip checks below.
    api_keys = bot.config['api_keys']
    user = api_keys['bitly_user']
    api = api_keys['bitly_api']

    if any(ignored in url for ignored in ignored_urls):
        return

    title = parse(url)
    if title == "fail":
        return

    short_url = tiny(url, user, api)
    clean_title = multiwordReplace(title, wordDic)
    say("(Link) %s [%s]" % (clean_title, short_url))