CloudBot/plugins/urltools.py

# -*- coding: utf-8 -*-
from util import hook, http, urlnorm
import re
import BeautifulSoup

ignored_urls = ["http://google.com", "http://youtube.com", "http://pastebin.com",
                "http://mibpaste.com", "http://fpaste.com", "beastnode.com"]

# map of HTML entities to the characters they stand for, used to clean titles
wordDic = {
    '&#34;': '"',
    '&#39;': '\'',
    '&#38;': '&',
    '&#60;': '<',
    '&#62;': '>',
    '&#171;': '«',
    '&quot;': '"',
    '&apos;': '\'',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&laquo;': '«',
    '&#33;': '!',
    '&#036;': '$',
    '&nbsp;': ' '}

def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        try:
            # grab the page and return the contents of its <title> tag
            soup = BeautifulSoup.BeautifulSoup(http.get(url))
            return soup.title.string
        except:
            return "fail"

def multiwordReplace(text, wordDic):
    # compile one alternation over all the keys (re.escape keeps '&#036;'
    # and friends from being read as regex syntax), then substitute every
    # match with its mapped character in a single pass
    rc = re.compile('|'.join(map(re.escape, wordDic)))
    def translate(match):
        return wordDic[match.group(0)]
    return rc.sub(translate, text)
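
# For example, a title that still carries escaped entities comes out clean:
#     multiwordReplace('Tom &amp; Jerry &lt;3', wordDic)  ->  'Tom & Jerry <3'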

# Both regex hooks are commented out, so the plugin is currently disabled;
# uncomment one of them to have the bot call urlparser on matching messages.
#@hook.regex(r'^(?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?$')
#@hook.regex(r'([a-zA-Z]+://|www\.)[^ ]+')
def urlparser(match, say=None):
    print "[debug] URL found"
    url = urlnorm.normalize(match.group().encode('utf-8'))
    # skip ignored sites; this is a substring check, so bare domains like
    # "beastnode.com" are caught on any URL that contains them
    for x in ignored_urls:
        if x in url:
            return
    title = parse(url)
    if title == "fail":
        print "[url] No title found"
        return
    title = multiwordReplace(title, wordDic)
    # resolve redirects; only show the final URL when it differs
    realurl = http.get_url(url)
    if realurl == url:
        say("(Link) %s" % title)
    else:
        say("(Link) %s [%s]" % (title, realurl))
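
# A rough way to exercise the plugin by hand (a sketch: assumes the bot's
# util package is importable and that http.get_url returns the post-redirect
# URL, as its use above suggests):
#     import re, sys
#     m = re.search(r'([a-zA-Z]+://|www\.)[^ ]+', 'see http://example.com')
#     urlparser(m, say=lambda line: sys.stdout.write(line + '\n'))
# which should print something like "(Link) Example Domain".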