First :D
This commit is contained in:
commit
37588421f3
100 changed files with 22673 additions and 0 deletions
0
plugins/util/__init__.py
Normal file
0
plugins/util/__init__.py
Normal file
101
plugins/util/hook.py
Normal file
101
plugins/util/hook.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
import inspect
|
||||
import re
|
||||
|
||||
|
||||
def _hook_add(func, add, name=''):
|
||||
if not hasattr(func, '_hook'):
|
||||
func._hook = []
|
||||
func._hook.append(add)
|
||||
|
||||
if not hasattr(func, '_filename'):
|
||||
func._filename = func.func_code.co_filename
|
||||
|
||||
if not hasattr(func, '_args'):
|
||||
argspec = inspect.getargspec(func)
|
||||
if name:
|
||||
n_args = len(argspec.args)
|
||||
if argspec.defaults:
|
||||
n_args -= len(argspec.defaults)
|
||||
if argspec.keywords:
|
||||
n_args -= 1
|
||||
if argspec.varargs:
|
||||
n_args -= 1
|
||||
if n_args != 1:
|
||||
err = '%ss must take 1 non-keyword argument (%s)' % (name,
|
||||
func.__name__)
|
||||
raise ValueError(err)
|
||||
|
||||
args = []
|
||||
if argspec.defaults:
|
||||
end = bool(argspec.keywords) + bool(argspec.varargs)
|
||||
args.extend(argspec.args[-len(argspec.defaults):
|
||||
end if end else None])
|
||||
if argspec.keywords:
|
||||
args.append(0) # means kwargs present
|
||||
func._args = args
|
||||
|
||||
if not hasattr(func, '_thread'): # does function run in its own thread?
|
||||
func._thread = False
|
||||
|
||||
|
||||
def sieve(func):
    """Register *func* as a sieve hook and return it unchanged.

    Raises:
        ValueError: if *func* does not declare exactly five arguments.
    """
    # __code__ is the portable spelling of func_code (Python 2.6+ and 3).
    if func.__code__.co_argcount != 5:
        raise ValueError(
            'sieves must take 5 arguments: (bot, input, func, type, args)')
    _hook_add(func, ['sieve', (func,)])
    return func
|
||||
|
||||
|
||||
def command(arg=None, **kwargs):
    """Decorator registering a function as a command hook.

    Usable bare (``@command``), with a name (``@command('name')``) and/or
    with keyword options, which are stored in the hook's argument dict.
    When no name is supplied the function's own name is used.
    """
    args = {}

    def command_wrapper(func):
        # __name__ is the portable spelling of func_name.
        args.setdefault('name', func.__name__)
        _hook_add(func, ['command', (func, args)], 'command')
        return func

    # Bare usage: the decorated function itself arrived as ``arg``.
    if not kwargs and inspect.isfunction(arg):
        return command_wrapper(arg)

    if arg is not None:
        args['name'] = arg
    args.update(kwargs)
    return command_wrapper
|
||||
|
||||
|
||||
def event(arg=None, **kwargs):
    """Decorator registering a function as an event hook.

    Usable bare (``@event``), in which case the hook listens to ``'*'``
    unless an ``events`` keyword was given, or with a whitespace-separated
    string of event names (``@event('JOIN PART')``).
    """
    args = kwargs

    def event_wrapper(func):
        # __name__ is the portable spelling of func_name.
        args['name'] = func.__name__
        args.setdefault('events', ['*'])
        _hook_add(func, ['event', (func, args)], 'event')
        return func

    if inspect.isfunction(arg):
        # The wrapper takes exactly one argument; passing kwargs as a second
        # positional argument raised TypeError for every bare @event use.
        return event_wrapper(arg)

    if arg is not None:
        args['events'] = arg.split()
    return event_wrapper
|
||||
|
||||
|
||||
def singlethread(func):
    """Flag *func* with ``_thread = True`` and hand it back unchanged."""
    setattr(func, '_thread', True)
    return func
|
||||
|
||||
|
||||
def regex(regex, flags=0, **kwargs):
    """Decorator registering a function as a regex hook.

    Args:
        regex: pattern string to match against.
        flags: flags forwarded to ``re.compile``.
        **kwargs: extra options stored in the hook's argument dict.

    Raises:
        ValueError: if used bare, i.e. the "pattern" is the decorated
            function itself.
    """
    if inspect.isfunction(regex):
        raise ValueError("regex decorators require a regex to match against")

    args = kwargs

    def regex_wrapper(func):
        # __name__ is the portable spelling of func_name.
        args['name'] = func.__name__
        args['regex'] = regex
        args['re'] = re.compile(regex, flags)
        _hook_add(func, ['regex', (func, args)], 'regex')
        return func

    return regex_wrapper
|
103
plugins/util/http.py
Normal file
103
plugins/util/http.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
# convenience wrapper for urllib2 & friends
|
||||
|
||||
import cookielib
|
||||
import json
|
||||
import urllib
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
from urllib import quote, quote_plus as _quote_plus
|
||||
from urllib2 import HTTPError, URLError
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
|
||||
from lxml import etree, html
|
||||
|
||||
|
||||
ua_skybot = 'Cloudbot/3.4 http://github.com/lukeroge/cloudbot'
|
||||
|
||||
ua_firefox = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) ' \
|
||||
'Gecko/20070725 Firefox/2.0.0.6'
|
||||
ua_internetexplorer = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
|
||||
|
||||
jar = cookielib.CookieJar()
|
||||
|
||||
|
||||
def get(*args, **kwargs):
    """Fetch a URL and return the response body.

    All arguments are forwarded to this module's ``open``.  The response is
    closed once the body has been read, instead of being left for the
    garbage collector.
    """
    response = open(*args, **kwargs)
    try:
        return response.read()
    finally:
        response.close()
|
||||
|
||||
|
||||
def get_html(*args, **kwargs):
    """Fetch a URL and parse the body with ``lxml.html.fromstring``."""
    markup = get(*args, **kwargs)
    return html.fromstring(markup)
|
||||
|
||||
|
||||
def get_soup(*args, **kwargs):
    """Fetch a URL and wrap the body in a ``BeautifulSoup`` object."""
    markup = get(*args, **kwargs)
    return BeautifulSoup(markup)
|
||||
|
||||
|
||||
def get_xml(*args, **kwargs):
    """Fetch a URL and parse the body with ``lxml.etree.fromstring``."""
    document = get(*args, **kwargs)
    return etree.fromstring(document)
|
||||
|
||||
|
||||
def get_json(*args, **kwargs):
    """Fetch a URL and decode the body with ``json.loads``."""
    payload = get(*args, **kwargs)
    return json.loads(payload)
|
||||
|
||||
|
||||
def open(url, query_params=None, user_agent=None, post_data=None,
         get_method=None, cookies=False, **kwargs):
    """Open *url* and return the response object from the urllib2 opener.

    Args:
        url: address to request.
        query_params: mapping merged into the URL's query string.
        user_agent: User-Agent header value; defaults to ``ua_skybot``.
        post_data: request body passed to ``urllib2.Request``.
        get_method: when given, overrides the request's HTTP method.
        cookies: when true, the module-level cookie jar is attached.
        **kwargs: additional query parameters.
    """
    # Merge into a private copy: updating the caller's dict in place leaked
    # the keyword parameters into every later call that reused that dict.
    params = dict(query_params) if query_params else {}
    params.update(kwargs)

    if user_agent is None:
        user_agent = ua_skybot

    url = prepare_url(url, params)

    request = urllib2.Request(url, post_data)

    if get_method is not None:
        request.get_method = lambda: get_method

    request.add_header('User-Agent', user_agent)

    if cookies:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    else:
        opener = urllib2.build_opener()

    return opener.open(request)
|
||||
|
||||
|
||||
def prepare_url(url, queries):
    """Return *url* with *queries* merged into its query string.

    An empty/falsy *queries* returns *url* untouched.  Existing query
    parameters are parsed into a dict first, so entries from *queries*
    replace same-named ones already present in the URL.
    """
    if not queries:
        return url

    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)

    merged = dict(urlparse.parse_qsl(query))
    merged.update(queries)

    # Keys and values are byte-encoded before being url-encoded.
    encoded = {}
    for key, value in merged.iteritems():
        encoded[to_utf8(key)] = to_utf8(value)
    query = urllib.urlencode(encoded)

    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
|
||||
|
||||
|
||||
def to_utf8(s):
    """Return *s* as a byte string.

    ``unicode`` objects are encoded as UTF-8 (unencodable characters are
    dropped); anything else is passed through ``str()``.
    """
    if not isinstance(s, unicode):
        return str(s)
    return s.encode('utf8', 'ignore')
|
||||
|
||||
|
||||
def quote_plus(s):
    """Apply ``urllib.quote_plus`` to the UTF-8 byte form of *s*."""
    encoded = to_utf8(s)
    return _quote_plus(encoded)
|
||||
|
||||
|
||||
def unescape(s):
    """Return the text content of the HTML fragment *s*.

    Empty or whitespace-only input is returned unchanged rather than being
    handed to the lxml parser.
    """
    if s.strip():
        return html.fromstring(s).text_content()
    return s
|
54
plugins/util/misc.py
Normal file
54
plugins/util/misc.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
from htmlentitydefs import name2codepoint
|
||||
from time import time as unix_time
|
||||
from HTMLParser import HTMLParser
|
||||
from datetime import datetime
|
||||
import tempfile
|
||||
import logging as log
|
||||
import errno
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
class HTMLStripper(HTMLParser):
    """HTML parser that collects the textual content of a document.

    The markup is fed to the parser on construction; the collected text is
    available through the ``stripped`` property.  ``<br>`` tags become
    newlines, character and entity references become the characters they
    name, and all other tags are dropped.
    """

    def __init__(self, data):
        HTMLParser.__init__(self)
        self._stripped = []  # text fragments, in document order
        self.feed(data)

    def handle_starttag(self, tag, attrs):
        if tag.lower() == 'br':
            self._stripped.append('\n')

    def handle_charref(self, name):
        # Numeric reference: "xNN" is hexadecimal, anything else decimal.
        try:
            is_hex = name.lower().startswith('x')
            codepoint = int(name[1:], 16) if is_hex else int(name)
            self._stripped.append(unichr(codepoint))
        except Exception as error:
            log.warn('invalid entity: %s' % error)

    def handle_entityref(self, name):
        # Named reference; unknown names are kept in their "&name;" form.
        try:
            char = unichr(name2codepoint[name])
        except Exception as error:
            log.warn('unknown entity: %s' % error)
            char = u'&%s;' % name
        self._stripped.append(char)

    def handle_data(self, data):
        self._stripped.append(data)

    @property
    def stripped(self):
        """All collected text fragments joined into one string."""
        return ''.join(self._stripped)
|
||||
|
||||
def superscript(text):
    """Translate *text* through the ``SUPER_MAP`` table.

    Byte strings are decoded as UTF-8 first so that ``translate`` operates
    on a unicode object.
    """
    if isinstance(text, str):
        # There is no free function ``decode``; decoding is a str method.
        text = text.decode('utf-8')
    # NOTE(review): SUPER_MAP is not defined in the visible part of this
    # module -- confirm it exists, otherwise this line raises NameError.
    return text.translate(SUPER_MAP)
|
||||
|
||||
def strip_html(data):
    """Return the text of *data* as collected by ``HTMLStripper``."""
    stripper = HTMLStripper(data)
    return stripper.stripped
|
219
plugins/util/molecular.py
Normal file
219
plugins/util/molecular.py
Normal file
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# molecular.py
|
||||
# Copyright (c) 2001, Chris Gonnerman
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
#
|
||||
# Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# Neither the name of the author nor the names of any contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""molecular.py -- molecular (ngenoid) name generator
|
||||
|
||||
This module knows how to generate "random" names for RPG characters.
|
||||
It uses the same method as the "ngen" name generator by Kimmo Kulovesi,
|
||||
and in fact it can use the same name files. molecular.py knows how
|
||||
to merge multiple tables also, which can be handy...
|
||||
|
||||
If run as a command-line program, use the following options:
|
||||
|
||||
-r namefile -- read the given name file and add to the
|
||||
current name table.
|
||||
nnn -- generate nnn (a number) names and print
|
||||
on standard output.
|
||||
|
||||
To generate names from a name file:
|
||||
|
||||
python molecular.py -r file 10
|
||||
|
||||
As a module (to be imported) you get the following classes and functions:
|
||||
|
||||
NameFile (class) -- a file wrapper with a disabled close() method,
|
||||
used internally and probably not useful otherwise.
|
||||
nameopen (function) -- opens a file; takes filename and mode options,
|
||||
searches the default name file directory if not
|
||||
found in current directory, handles "-" filenames,
|
||||
and uses NameFile to disable closing of sys.stdin/
|
||||
sys.stdout.
|
||||
Molecule (class) -- the meat of the matter. A Molecule instance has
|
||||
the following methods:
|
||||
|
||||
.load(file) -- loads a name file,
|
||||
which may be a file-like
|
||||
object with a .readline()
|
||||
method or a filename as a
|
||||
string.
|
||||
.name() -- generate one name and
|
||||
return it.
|
||||
"""
|
||||
|
||||
__version__ = "1.0"
|
||||
|
||||
import string, re, sys, random
|
||||
|
||||
NAMEDIR = "/home/ircbot/bot/plugins/util/names"
|
||||
NAMESECTIONS = [ "inf", "first", "mid", "final", "notes", "end" ]
|
||||
|
||||
class NameFile:
    """File wrapper whose ``close()`` does nothing.

    Every other file operation is delegated to the wrapped object, which
    makes it safe to hand out ``sys.stdin``/``sys.stdout`` to code that
    closes its files when done.
    """

    # Attributes that are read from / written to the wrapped file.
    __file_attributes = ('closed', 'mode', 'name', 'softspace')

    def __init__(self, file):
        self.fd = file

    def close(self):
        # Deliberately a no-op: the wrapped file must stay open.
        pass

    def flush(self):
        return self.fd.flush()

    def isatty(self):
        return self.fd.isatty()

    def fileno(self):
        return self.fd.fileno()

    # Argument unpacking replaces the Python-2-only apply() builtin.
    def read(self, *args):
        return self.fd.read(*args)

    def readline(self, *args):
        return self.fd.readline(*args)

    def readlines(self, *args):
        return self.fd.readlines(*args)

    def seek(self, *args):
        return self.fd.seek(*args)

    def tell(self):
        return self.fd.tell()

    def write(self, str):
        return self.fd.write(str)

    def writelines(self, list):
        return self.fd.writelines(list)

    def __repr__(self):
        return repr(self.fd)

    def __getattr__(self, name):
        if name in self.__file_attributes:
            return getattr(self.fd, name)
        try:
            return self.__dict__[name]
        except KeyError:
            # The attribute protocol expects AttributeError, not KeyError;
            # hasattr()/getattr(..., default) rely on it.
            raise AttributeError(name)

    def __setattr__(self, name, value):
        if name in self.__file_attributes:
            setattr(self.fd, name, value)
        else:
            self.__dict__[name] = value

    def __cmp__(self, file):
        """I'm not sure what the correct behavior is, and therefore
        this implementation is just a guess."""
        # Compare against a raw file directly, otherwise against the file
        # wrapped by the other NameFile.
        if type(file) == type(self.fd):
            return cmp(self.fd, file)
        else:
            return cmp(self.fd, file.fd)
|
||||
|
||||
|
||||
class NameReader:
    """Line reader that remembers the most recently read line in ``line``."""

    def __init__(self, file):
        self.line = ""
        self.file = file

    def next(self):
        """Read one line, store it in ``self.line`` and return it."""
        current = self.file.readline()
        self.line = current
        return current

    def close(self):
        """Close the underlying file and return whatever its close() does."""
        return self.file.close()
|
||||
|
||||
|
||||
def safeopen(filename, mode):
    """Open *filename* in *mode*; return None instead of raising IOError."""
    try:
        handle = open(filename, mode)
    except IOError:
        handle = None
    return handle
|
||||
|
||||
def nameopen(filename, mode):
    """Open a name file, searching the default name directory for reads.

    ``"-"`` yields a ``NameFile`` around ``sys.stdin`` (when *mode*
    contains ``"r"``) or ``sys.stdout``.  Otherwise *filename* is tried as
    given, then with a ``.nam`` suffix; for read modes the same two names
    are then tried inside ``NAMEDIR``.
    """
    if filename == "-":
        stream = sys.stdin if "r" in mode else sys.stdout
        return NameFile(stream)

    fp = safeopen(filename, mode)
    if fp is None:
        fp = safeopen(filename + ".nam", mode)

    # NOTE(review): the NAMEDIR fallback, including the final open(), is
    # assumed to apply to read modes only -- confirm against the original
    # indentation.
    if fp is None and "r" in mode:
        fp = safeopen(NAMEDIR + "/" + filename, mode)
        if fp is None:
            # Plain open() on the last attempt so that the exception is
            # finally raised if the file cannot be found anywhere.
            fp = open(NAMEDIR + "/" + filename + ".nam", mode)
    return fp
|
||||
|
||||
|
||||
class Molecule:
    """Name generator backed by tables loaded from ngen-style name files.

    ``nametbl`` maps a section name to the list of lines read for that
    section; ``cursection`` is the list that lines are currently appended
    to (the unnamed ``""`` section until a ``[section]`` header is read).
    """

    def __init__(self):
        self.nametbl = dict((section, []) for section in NAMESECTIONS)
        self.nametbl[""] = []
        self.cursection = self.nametbl[""]

    def load(self, fp):
        """Load a name file into the tables.

        Args:
            fp: a filename (opened through ``nameopen``) or a file-like
                object with a ``readline()`` method.  Files passed in by
                the caller are left open; files opened here are closed
                even when reading fails.
        """
        # Byte and unicode strings both count as filenames.
        opened_here = isinstance(fp, (str, type(u"")))
        if opened_here:
            fp = nameopen(fp, "r")
        try:
            while True:
                raw = fp.readline()
                if not raw:
                    break
                # Drop only a real line terminator; a final line without a
                # newline used to lose its last character.
                line = raw[:-1] if raw.endswith("\n") else raw
                if len(line) > 0 and line[0] == '[' and line[-1] == ']':
                    # "[section]" header: switch to (or create) that table.
                    section = line[1:-1]
                    self.cursection = self.nametbl.setdefault(section, [])
                else:
                    self.cursection.append(line)
        finally:
            if opened_here:
                fp.close()

    def name(self):
        """Generate one name: a random "first", "mid" and "final" part.

        Sections without entries contribute nothing.
        """
        parts = [random.choice(self.nametbl[section])
                 for section in ("first", "mid", "final")
                 if len(self.nametbl[section]) > 0]
        return "".join(parts)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line driver: "-r file" loads a name file, a number prints
    # that many generated names.  Arguments are processed left to right.

    if len(sys.argv) <= 1:
        sys.stderr.write("Usage: molecular.py [ -r file ] [ nn ]\n")
        sys.exit(0)

    name = Molecule()

    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == "-r":
            i += 1
            if i >= len(sys.argv):
                # "-r" as the last argument used to die with IndexError.
                sys.stderr.write("molecular.py: -r requires a file name\n")
                sys.exit(2)
            name.load(sys.argv[i])
        else:
            # A dedicated loop variable: reusing ``i`` here overwrote the
            # argv index and could re-process arguments forever.
            for _ in range(int(arg)):
                sys.stdout.write(name.name() + "\n")
        i += 1
|
102
plugins/util/timesince.py
Normal file
102
plugins/util/timesince.py
Normal file
|
@ -0,0 +1,102 @@
|
|||
# Copyright (c) Django Software Foundation and individual contributors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of Django nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"AND
|
||||
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
#DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
#ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
#ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
def _to_datetime(value):
    """Coerce an epoch number or a date into a ``datetime.datetime``."""
    # Convert int or float (unix epoch) to datetime.datetime for comparison
    if isinstance(value, (int, float)):
        return datetime.datetime.fromtimestamp(value)
    # Convert datetime.date to datetime.datetime for comparison.
    if not isinstance(value, datetime.datetime):
        return datetime.datetime(value.year, value.month, value.day)
    return value


def timesince(d, now=None):
    """
    Takes two datetime objects and returns the time between d and now
    as a nicely formatted string, e.g. "10 minutes".  If d occurs after now,
    then "0 minutes" is returned.

    Both d and now may also be given as a date or as a unix epoch number
    (int or float); a missing/falsy now means the current local time.

    Units used are years, months, weeks, days, hours, and minutes.
    Seconds and microseconds are ignored.  Up to two adjacent units will be
    displayed.  For example, "2 weeks, 3 days" and "1 year, 3 months" are
    possible outputs, but "2 weeks, 3 hours" and "1 year, 5 days" are not.

    Adapted from http://blog.natbat.co.uk/archive/2003/Jun/14/time_since
    """
    chunks = (
        (60 * 60 * 24 * 365, ('year', 'years')),
        (60 * 60 * 24 * 30, ('month', 'months')),
        (60 * 60 * 24 * 7, ('week', 'weeks')),
        (60 * 60 * 24, ('day', 'days')),
        (60 * 60, ('hour', 'hours')),
        (60, ('minute', 'minutes'))
    )

    d = _to_datetime(d)
    if not now:
        now = datetime.datetime.now()
    else:
        # Epoch numbers used to be accepted for d only; an epoch ``now``
        # (as produced by timeuntil(<epoch>)) raised AttributeError.
        now = _to_datetime(now)

    # ignore microsecond part of 'd'
    delta = now - (d - datetime.timedelta(0, 0, d.microsecond))
    since = delta.days * 24 * 60 * 60 + delta.seconds
    if since <= 0:
        # d is in the future compared to now, stop processing.
        return u'0 ' + 'minutes'

    # Find the largest unit that fits at least once; when none does the
    # loop ends on minutes with a count of 0.
    for i, (seconds, name) in enumerate(chunks):
        count = since // seconds
        if count != 0:
            break

    if count == 1:
        s = '%(number)d %(type)s' % {'number': count, 'type': name[0]}
    else:
        s = '%(number)d %(type)s' % {'number': count, 'type': name[1]}

    if i + 1 < len(chunks):
        # Now get the second item
        seconds2, name2 = chunks[i + 1]
        count2 = (since - (seconds * count)) // seconds2
        if count2 != 0:
            if count2 == 1:
                s += ', %d %s' % (count2, name2[0])
            else:
                s += ', %d %s' % (count2, name2[1])
    return s
|
||||
|
||||
|
||||
def timeuntil(d, now=None):
    """
    Counterpart of timesince: returns a string measuring the time from
    now (the current time when not given) until d.
    """
    reference = now if now else datetime.datetime.now()
    return timesince(reference, d)
|
133
plugins/util/urlnorm.py
Normal file
133
plugins/util/urlnorm.py
Normal file
|
@ -0,0 +1,133 @@
|
|||
"""
|
||||
URI Normalization function:
|
||||
* Always provide the URI scheme in lowercase characters.
|
||||
* Always provide the host, if any, in lowercase characters.
|
||||
* Only perform percent-encoding where it is essential.
|
||||
* Always use uppercase A-through-F characters when percent-encoding.
|
||||
* Prevent dot-segments appearing in non-relative URI paths.
|
||||
* For schemes that define a default authority, use an empty authority if the
|
||||
default is desired.
|
||||
* For schemes that define an empty path to be equivalent to a path of "/",
|
||||
use "/".
|
||||
* For schemes that define a port, use an empty port if the default is desired
|
||||
* All portions of the URI must be utf-8 encoded NFC from Unicode strings
|
||||
|
||||
implements:
|
||||
http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
|
||||
http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
|
||||
|
||||
inspired by:
|
||||
Tony J. Ibbs, http://starship.python.net/crew/tibs/python/tji_url.py
|
||||
Mark Nottingham, http://www.mnot.net/python/urlnorm.py
|
||||
"""
|
||||
|
||||
__license__ = "Python"
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import urlparse
|
||||
from urllib import quote, unquote
|
||||
|
||||
default_port = {
|
||||
'http': 80,
|
||||
}
|
||||
|
||||
|
||||
class Normalizer(object):
    """Pairs a compiled pattern with the callable that rewrites a match.

    ``regex`` is matched against a URL; ``normalize`` is called with the
    resulting match object and returns the canonical URL.
    """

    def __init__(self, regex, normalize_func):
        self.regex, self.normalize = regex, normalize_func
|
||||
|
||||
# Site-specific rewrite rules, tried in order against the normalized URL.
normalizers = (
    # Amazon product pages -> http://amazon.<tld>/dp/<ASIN>
    Normalizer(
        re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'),
        lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))),
    # waffleimages links -> canonical host plus the 40-hex-digit image hash
    Normalizer(
        re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'),
        lambda m: r'http://img.waffleimages.com/%s' % m.group(1)),
    # YouTube links -> http://youtube.com/watch?v=<id>
    # Video ids are case-sensitive; the id character class used to lack
    # A-Z, which cut the captured id off at its first uppercase letter.
    Normalizer(
        re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-zA-Z0-9]+)'),
        lambda m: r'http://youtube.com/watch?v=%s' % m.group(1)),
)
|
||||
|
||||
|
||||
def normalize(url):
    """Normalize a URL.

    Lower-cases scheme and host, drops a trailing host dot and a leading
    "www.", re-applies percent-encoding on NFC-normalized UTF-8, removes
    dot-segments, drops the scheme's default port, and finally lets the
    first matching entry of ``normalizers`` rewrite the result.
    """
    scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip())
    userinfo, host, port = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme and the host in lowercase characters.
    scheme = scheme.lower()
    host = host.lower()
    if host.endswith('.'):
        host = host[:-1]

    # A leading "www." is removed from the host, or from the path when the
    # URL carried no host; such URLs default to the http scheme.
    www_in_host = host.startswith("www.")
    if www_in_host or path.startswith("www."):
        if not scheme:
            scheme = "http"
        if www_in_host:
            host = host[4:]
        else:
            path = path[4:]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    def clean(string):
        decoded = unicode(unquote(string), 'utf-8', 'replace')
        return unicodedata.normalize('NFC', decoded).encode('utf-8')

    path = quote(clean(path), "~:/?#[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # note care must be taken to only encode & and = characters as values
    encoded_pairs = []
    for pair in query.split("&"):
        pieces = [quote(clean(piece), "~:/?#[]@!$'()*+,;=")
                  for piece in pair.split("=", 1)]
        encoded_pairs.append("=".join(pieces))
    query = "&".join(encoded_pairs)

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ["", "http", "https", "ftp", "file"]:
        segments = path.split('/')
        output = []
        for segment in segments:
            if segment == "..":
                if len(output) > 1:
                    output.pop()
            elif segment == "":
                # only the leading empty segment (absolute path) is kept
                if not output:
                    output.append(segment)
            elif segment != ".":
                output.append(segment)
        # A path ending in "/", "." or ".." keeps its trailing slash.
        if segments[-1] in ["", ".", ".."]:
            output.append("")
        path = '/'.join(output)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ["@", ":@"]:
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ["http", "https", "ftp", "file"]:
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired
    if port and scheme in default_port and port.isdigit():
        port = str(int(port))
        if int(port) == default_port[scheme]:
            port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"
    normal_url = urlparse.urlunsplit(
        (scheme, auth, path, query, fragment)).replace("http:///", "http://")

    # The first site-specific rule that matches decides the final form.
    for norm in normalizers:
        m = norm.regex.match(normal_url)
        if m:
            return norm.normalize(m)
    return normal_url
|
Reference in a new issue