Removed poken factoid prefixes, added more HTML entity codes to the URL parser

2011-11-26 13:15:43 +13:00 · 2011-11-26 13:15:43 +13:00 · fbde2addaa
commit fbde2addaa
parent 2098397120
1 changed files with 37 additions and 3 deletions
--- a/plugins/urltools.py
+++ b/plugins/urltools.py
@ -1,9 +1,25 @@
 from util import hook, http, urlnorm
 import urllib
 from urllib2 import urlopen, Request, HTTPError
 import re
 import BeautifulSoup
-ignored_urls = ["http://google.com","http://youtube.com"]
+ignored_urls = ["http://google.com","http://youtube.com","http://pastebin.com","http://mibpaste.com","http://fpaste.com"]
 # Replacement table handed to multiwordReplace() by urlparser(): each key
 # found in a page title is swapped for the corresponding value.
 wordDic = {
    # Numeric character references.
    '&#34;': '"',
    '&#39;': "'",
    '&#38;': '&',
    '&#60;': '<',
    '&#62;': '>',
    '&#171;': '«',
    # Named entities for the same characters.
    '&quot;': '"',
    '&apos;': "'",
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&laquo;': '«',
    # Not an entity: turns two consecutive spaces into one.
    '  ': ' ',
 }
 def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
@ -15,18 +31,36 @@ def parse(match):
        except:
            return "fail"
 # Shorten *url* through the bit.ly v3 "shorten" endpoint.
 #   url    -- long URL, sent as the 'longUrl' query parameter
 #   user   -- sent as the 'login' query parameter
 #   apikey -- sent as the 'apiKey' query parameter
 # Returns j['data']['url'] when the JSON reply's 'status_code' is 200;
 # otherwise raises Exception carrying the reply's 'status_txt'.
 def tiny(url, user, apikey):
  # NOTE(review): no except/finally clause for this try is visible in this
  # view of the file -- presumably the handler was elided; confirm against
  # the full source before relying on what callers actually receive.
  try:
    # URL-encode the request parameters; 'format' asks for a JSON reply.
    params = urllib.urlencode({'longUrl': url, 'login': user, 'apiKey': apikey, 'format': 'json'})
    j = http.get_json("http://api.bit.ly/v3/shorten?%s" % params)
    # Only a 200 status_code yields a short URL.
    if j['status_code'] == 200:
      return j['data']['url']
    # Any other status is surfaced using the service's own status text.
    raise Exception('%s'%j['status_txt'])
 def multiwordReplace(text, wordDic):
    """Replace every occurrence of each key of *wordDic* in *text*.

    Keys are matched literally (regex metacharacters are escaped) in one
    left-to-right pass, so a replacement value is never re-scanned for
    further substitutions.

    text    -- the string to rewrite.
    wordDic -- mapping of search string -> replacement string.

    Returns the rewritten string; *text* is returned unchanged when
    *wordDic* is empty.
    """
    # An empty alternation compiles to a pattern that matches the empty
    # string at every position, and the lookup in translate() would then
    # fail with KeyError(''), so bail out early.
    if not wordDic:
        return text
    # Regex alternation takes the first branch that matches, not the
    # longest one; try longer keys first so a key that is a prefix of
    # another key cannot shadow it.
    keys = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, keys)))
    def translate(match):
        return wordDic[match.group(0)]
    return rc.sub(translate, text)
 #@hook.regex(r'^(?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?$')
@hook.regex(r'([a-zA-Z]+://|www\.)[^ ]+')
-def urlparser(match, say = None):
+def urlparser(match, say = None, bot = None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    user = bot.config['api_keys']['bitly_user']
    api = bot.config['api_keys']['bitly_api']
    for x in ignored_urls:
        if x in url:
            return
    title = parse(url)
    if title == "fail":
        return
-    say("(Link) %s [%s]" % (title, url))
+    short_url = tiny(url, user, api)
    title = multiwordReplace(title, wordDic)
    say("(Link) %s [%s]" % (title, short_url))