diff --git a/.gitignore b/.gitignore index 719b47b..5fa7446 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ persist -config.json +config +config.ssl gitflow *.db *.log @@ -11,8 +12,4 @@ gitflow *.sublime-project *.sublime-workspace .idea/ -data/GeoLiteCity.dat -plugins/New Text Document.txt -plugins/srvv.py -run.cmd -config +plugins/data/GeoLiteCity.dat diff --git a/README.md b/README.md index cfb69ac..06d64ac 100644 --- a/README.md +++ b/README.md @@ -14,37 +14,33 @@ Unzip the resulting file, and continue to read this document. ### Install -Before you can run the bot, you need to install a few Python dependencies. LXML is required while Enchant, PyGeoIP, TweePy and PyDNS are needed for several plugins. +Before you can run the bot, you need to install a few Python dependencies. LXML is required, while Enchant and PyDNS are needed for several plugins. -These can be installed with `pip` (The Python package manager) by running the following command in the bot directory: +These can be installed with `pip` (the Python package manager): - pip install -r requirements.txt + [sudo] pip install -r requirements.txt -**Note:** If you use `pip`, you will also need the following packages on linux or `pip` will fail to install the requirements. +If you use `pip`, you will also need the following packages on Linux, or `pip` will fail to install the requirements. ```python, python-dev, libenchant-dev, libenchant1c2a, libxslt-dev, libxml2-dev.``` - -(this can be done using your package manager (eg: *apt-get* or *yum*) #### How to install `pip` -You can usually install pip on linux by installing the `python-pip` package using your package manager (eg. *apt-get install python-pip* or *yum install python-pip* as root), or you can try the below code to download and install it manually. - curl -O http://python-distribute.org/distribute_setup.py # or download with your browser on windows python distribute_setup.py easy_install pip -If you need help installing pip on Windows, follow [this guide](http://simpledeveloper.com/how-to-install-easy_install/) and then run `easy_install pip` on the command line. +If you are unable to use pip, there are Windows installers for LXML available for [64 bit](https://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win-amd64-py2.7.exe) and [32 bit](https://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win32-py2.7.exe) versions of Python. ### Run -Before you run the bot, rename `config.default` to `config.json` and edit it with your preferred settings. You can check if your JSON is valid on [this site](http://jsonlint.com/)! +Before you run the bot, rename `config.default` to `config` and edit it with your preferred settings. Once you have installed the required dependencies and renamed the config file, you can run the bot! Make sure you are in the correct folder and run the following command: -`python cloudbot.py` +`python bot.py` -On Windows you can usually just double-click `cloudbot.py` to start the bot, as long as you have Python installed correctly. +On Windows you can usually just double-click `bot.py` to start the bot, as long as you have Python installed correctly. ## Getting help with CloudBot @@ -62,17 +58,15 @@ More at the [Wiki Main Page](http://git.io/cloudbotircwiki). The developers reside in [#CloudBot](irc://irc.esper.net/cloudbot) on [EsperNet](http://esper.net) and would be glad to help you. -If you think you have found a bug/have a idea/suggestion, please **open a issue** here on Github and contact us on IRC! 
+If you think you have found a bug or have an idea/suggestion, please **open an issue** here on GitHub. ### Requirements CloudBot runs on **Python** *2.7.x*. It is currently developed on **Windows** *8* with **Python** *2.7.5*. -It **requires the Python modules** lXML, watchdog and BeautifulSoup4. +It **requires the Python module** LXML. The module `Enchant` is needed for the spellcheck plugin. The module `PyDNS` is needed for SRV record lookup in the mcping plugin. -The module `PyGeoIP` is needed for location lookup in the geoip plugin. -The module `TweePy` is needed for the twitter plugin. **Windows** users: Windows compatibility with some plugins is **broken** (such as ping), but we do intend to add it. Eventually. diff --git a/cloudbot.py b/cloudbot.py index f567315..91515db 100755 --- a/cloudbot.py +++ b/cloudbot.py @@ -1,52 +1,74 @@ #!/usr/bin/env python -from core import bot import os +import Queue import sys import time -import signal +import re -# check python version -if sys.version_info < (3, 2, 0): - print("CloudBot3 requires Python 3.2 or newer.") - sys.exit(1) - -# set up environment +sys.path += ['plugins', 'lib'] # add stuff to the sys.path for easy imports os.chdir(sys.path[0] or '.') # do stuff relative to the install directory -# this is not the code you are looking for -if os.path.exists(os.path.abspath('lib')): - sys.path += ['lib'] -print('CloudBot3 ') +class Bot(object): + pass +print 'CloudBot DEV ' -def exit_gracefully(signum, frame): - # this doesn't really work at all - cloudbot.stop() +# create new bot object +bot = Bot() +bot.vars = {} - # restore the original handler so if they do it again it triggers - signal.signal(signal.SIGINT, original_sigint) +# record start time for the uptime command +bot.start_time = time.time() -# store the original SIGINT handler -original_sigint = signal.getsignal(signal.SIGINT) -signal.signal(signal.SIGINT, exit_gracefully) +print 'Begin Plugin Loading.' -# create a bot master and start it -cloudbot = bot.CloudBot() -cloudbot.start() +# bootstrap the reloader +eval(compile(open(os.path.join('core', 'reload.py'), 'U').read(), + os.path.join('core', 'reload.py'), 'exec')) +reload(init=True) -# watch to see if the bot stops running or needs a restart -while True: - if cloudbot.running: - time.sleep(.1) - else: - if cloudbot.do_restart: - # create a new bot thread and start it - # Todo: Make this work - del cloudbot - cloudbot = bot.Bot() - cloudbot.start() - continue +config() +if not hasattr(bot, 'config'): + exit() + +print 'Connecting to IRC...' + +bot.conns = {} + +try: + for name, conf in bot.config['connections'].iteritems(): + # strip all spaces and capitalization from the connection name + name = name.replace(" ", "_") + name = re.sub('[^A-Za-z0-9_]+', '', name) + print 'Connecting to server: %s' % conf['server'] + if conf.get('ssl'): + bot.conns[name] = SSLIRC(name, conf['server'], conf['nick'], conf=conf, + port=conf.get('port', 6667), channels=conf['channels'], + ignore_certificate_errors=conf.get('ignore_cert', True)) else: - break \ No newline at end of file + bot.conns[name] = IRC(name, conf['server'], conf['nick'], conf=conf, + port=conf.get('port', 6667), channels=conf['channels']) +except Exception as e: + print 'ERROR: malformed config file', e + sys.exit() + +bot.persist_dir = os.path.abspath('persist') +if not os.path.exists(bot.persist_dir): + os.mkdir(bot.persist_dir) + +print 'Connection(s) made, starting main loop.' 
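+# Event loop: reload() and config() below are cheap polls that only act when +# files on disk change; each connection's parsed-line queue is then drained +# and handed to main() for dispatch, sleeping briefly once all queues are empty.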
+ +while True: + reload() # these functions only do things + config() # if changes have occurred + + for conn in bot.conns.itervalues(): + try: + out = conn.out.get_nowait() + main(conn, out) + except Queue.Empty: + pass + while all(conn.out.empty() for conn in bot.conns.itervalues()): + time.sleep(.1) diff --git a/config.default b/config.default index 237482c..4bda1b0 100644 --- a/config.default +++ b/config.default @@ -1,62 +1,77 @@ { - "connections": - [ - { - "name": "esper", - "connection": { - "server": "irc.esper.net", - "port": 6667, - "ssl": false, - "ignore_cert": true - }, - "nick": "MyCloueqerdBot", - "user": "cloudbot", - "real_name": "CloudBot - http://git.io/cloudbotirc", - "channels": ["#cloudbot", "#cloudbot2"], - "disabled_commands": [], - "acls": {}, - "nickserv": { - "enabled": false, - "nickserv_password": "", - "nickserv_user": "", - "nickserv_name": "nickserv", - "nickserv_command": "IDENTIFY" - }, - "permissions": { + "connections": { + "hackint": { + "server": "irc.hackint.eu", + "nick": "antibot", + "user": "antibot", + "realname": "CloudBot - http://git.io/cloudbotirc", + "mode": "", + "_nickserv_password": "", + "-nickserv_user": "", + "channels": [ + "#ChaosChemnitz", + "#logbot" + ], + "invite_join": true, + "auto_rejoin": false, + "command_prefix": "." + } + }, + "disabled_plugins": [], + "disabled_commands": [], + "acls": {}, + "api_keys": { + "tvdb": "", + "wolframalpha": "", + "lastfm": "", + "rottentomatoes": "", + "soundcloud": "", + "twitter_consumer_key": "", + "twitter_consumer_secret": "", + "twitter_access_token": "", + "twitter_access_secret": "", + "wunderground": "", + "googletranslate": "", + "rdio_key": "", + "rdio_secret": "" + }, + "permissions": { "admins": { - "perms": ["adminonly", "addfactoid", "delfactoid", "ignore", "botcontrol", "permissions_users", "op"], - "users": ["examplea!user@example.com", "exampleb!user@example.com"] + "perms": [ + "adminonly", + "addfactoid", + "delfactoid", + "ignore", + "botcontrol", + "permissions_users", + "op" + ], + "users": [ + "examplea!user@example.com", + "exampleb!user@example.com" + ] }, "moderators": { - "perms": ["addfactoid", "delfactoid", "ignore"], - "users": ["examplec!user@example.com"] - }, - "trusted": { - "perms": ["addfactoid", "delfactoid"], - "users": ["exampled!user@example.com"] + "perms": [ + "addfactoid", + "delfactoid", + "ignore" + ], + "users": [ + "stummi!~Stummi@stummi.org" + ] } - }, - "plugins": { - - }, - "command_prefix": "." 
- } - ], - "api_keys": - { - "tvdb": "", - "wolframalpha": "", - "lastfm": "", - "rottentomatoes": "", - "soundcloud": "", - "twitter_consumer_key": "", - "twitter_consumer_secret": "", - "twitter_access_token": "", - "twitter_access_secret": "", - "wunderground": "", - "googletranslate": "", - "rdio_key": "", - "rdio_secret": "" - }, - "disabled_plugins": [] + }, + "plugins": { + "factoids": { + "prefix": false + }, + "ignore": { + "ignored": [] + } + }, + "censored_strings": [ + "mypass", + "mysecret" + ] } diff --git a/core/bot.py b/core/bot.py deleted file mode 100644 index 95d0dfe..0000000 --- a/core/bot.py +++ /dev/null @@ -1,175 +0,0 @@ -import time -import logging -import re -import os -import queue -import collections -import threading - -from sqlalchemy.orm import scoped_session, sessionmaker -from sqlalchemy import create_engine - -from core import config, irc, main -from core.permissions import PermissionManager -from core.loader import PluginLoader - - -def clean_name(n): - """strip all spaces and capitalization""" - return re.sub('[^A-Za-z0-9_]+', '', n.replace(" ", "_")) - - -def get_logger(): - """create and return a new logger object""" - # create logger - logger = logging.getLogger("cloudbot") - logger.setLevel(logging.DEBUG) - - # add a file handler - log_name = "bot.log" - fh = logging.FileHandler(log_name) - fh.setLevel(logging.INFO) - - # stdout handler - sh = logging.StreamHandler() - sh.setLevel(logging.DEBUG) - - # create a formatter and set the formatter for the handler. - frmt = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s') - fh.setFormatter(frmt) - simple_frmt = logging.Formatter('[%(levelname)s] %(message)s') - sh.setFormatter(simple_frmt) - - # add the Handlers to the logger - logger.addHandler(fh) - logger.addHandler(sh) - return logger - - -class CloudBot(threading.Thread): - def __init__(self): - # basic variables - self.start_time = time.time() - self.running = True - self.do_restart = False - - # stores each instance of the - self.instances = [] - - # set up config and logging - self.setup() - self.logger.debug("Bot setup completed.") - - # start bot instances - self.create() - - for instance in self.instances: - instance.permissions = PermissionManager(self, instance) - - # run plugin loader - self.plugins = collections.defaultdict(list) - - """ self.plugins format - {'PLUGIN_TYPE': [(, - {PLUGIN_ARGS}), - (, - {PLUGIN_ARGS})], - 'PLUGIN_TYPE': [(, - {PLUGIN_ARGS})] - } - """ - - self.threads = {} - - self.loader = PluginLoader(self) - - threading.Thread.__init__(self) - - def run(self): - """recieves input from the IRC engine and processes it""" - self.logger.info("Starting main thread.") - while self.running: - for instance in self.instances: - try: - incoming = instance.parsed_queue.get_nowait() - if incoming == StopIteration: - print("StopIteration") - # IRC engine has signalled timeout, so reconnect (ugly) - instance.connection.reconnect() - main.main(self, instance, incoming) - except queue.Empty: - pass - - # if no messages are in the incoming queue, sleep - while self.running and all(i.parsed_queue.empty() for i in self.instances): - time.sleep(.1) - - def setup(self): - """create the logger and config objects""" - # logging - self.logger = get_logger() - self.logger.debug("Logging system initalised.") - - # data folder - self.data_dir = os.path.abspath('persist') - if not os.path.exists(self.data_dir): - self.logger.debug("Data folder not found, creating.") - os.mkdir(self.data_dir) - - # config - self.config = config.Config(self) 
- self.logger.debug("Config system initalised.") - - # db - engine = create_engine('sqlite:///cloudbot.db') - db_factory = sessionmaker(bind=engine) - self.db_session = scoped_session(db_factory) - self.logger.debug("Database system initalised.") - - def create(self): - """ Create a BotInstance for all the networks defined in the config """ - for conf in self.config['instances']: - - # strip all spaces and capitalization from the connection name - name = clean_name(conf['name']) - nick = conf['nick'] - server = conf['connection']['server'] - port = conf['connection'].get('port', 6667) - - self.logger.debug("Creating BotInstance for {}.".format(name)) - - self.instances.append(irc.BotInstance(name, server, nick, config=conf, - port=port, logger=self.logger, channels=conf['channels'], - ssl=conf['connection'].get('ssl', False))) - self.logger.debug("({}) Created connection.".format(name)) - - - def stop(self, reason=None): - """quits all networks and shuts the bot down""" - self.logger.info("Stopping bot.") - - self.config.observer.stop() - self.logger.debug("Stopping config reloader.") - - self.loader.stop() - self.logger.debug("Stopping plugin loader.") - - for connection in self.connections: - self.logger.debug("({}) Closing connection.".format(connection.name)) - - if reason: - connection.cmd("QUIT", [reason]) - else: - connection.cmd("QUIT") - - connection.stop() - - self.logger.debug("Logging engine stopped") - logging.shutdown() - - self.running = False - - def restart(self, reason=None): - """shuts the bot down and restarts it""" - self.do_restart = True - self.stop(reason) \ No newline at end of file diff --git a/core/config.py b/core/config.py index 9d8d010..c813ea5 100644 --- a/core/config.py +++ b/core/config.py @@ -1,69 +1,27 @@ +import inspect import json import os -import time -import sys - -from watchdog.observers import Observer -from watchdog.tricks import Trick -class Config(dict): - def __init__(self, bot, *args, **kwargs): - self.filename = "config.json" - self.path = os.path.abspath(self.filename) - self.bot = bot - self.logger = bot.logger - self.update(*args, **kwargs) +def save(conf): + json.dump(conf, open('config', 'w'), sort_keys=True, indent=2) - # populate self with config data - self.load_config() - - # start watcher - self.watcher() +if not os.path.exists('config'): + print "Please rename 'config.default' to 'config' to set up your bot!" + print "For help, see http://git.io/cloudbotirc" + print "Thank you for using CloudBot!" + sys.exit() - def load_config(self): - """(re)loads the bot config from the config file""" - if not os.path.exists(self.path): - # if there is no config, show an error and die - self.logger.critical("No config file found, bot shutting down!") - print("No config file found! Bot shutting down in five seconds.") - print("Copy 'config.default' to 'config.json' for defaults.") - print("For help, see http://git.io/cloudbotirc. 
Thank you for using CloudBot!") - time.sleep(5) - sys.exit() +def config(): + # reload config from file if file has changed + config_mtime = os.stat('config').st_mtime + if bot._config_mtime != config_mtime: + try: + bot.config = json.load(open('config')) + bot._config_mtime = config_mtime + except ValueError, e: + print 'error: malformed config', e - with open(self.path) as f: - self.update(json.load(f)) - self.logger.info("Config loaded from file.") - # reload permissions - if self.bot.instances: - for instance in self.bot.instances: - instance.permissions.reload() - - def save_config(self): - """saves the contents of the config dict to the config file""" - json.dump(self, open(self.path, 'w'), sort_keys=True, indent=2) - self.logger.info("Config saved to file.") - - def watcher(self): - """starts the watchdog to automatically reload the config when it changes on disk""" - self.observer = Observer() - - pattern = "*{}".format(self.filename) - - self.event_handler = ConfigEventHandler(self, patterns=[pattern]) - self.observer.schedule(self.event_handler, path='.', recursive=False) - self.observer.start() - - -class ConfigEventHandler(Trick): - def __init__(self, config, *args, **kwargs): - self.config = config - self.logger = config.logger - Trick.__init__(self, *args, **kwargs) - - def on_any_event(self, event): - self.logger.info("Config changed, triggering reload.") - self.config.load_config() +bot._config_mtime = 0 diff --git a/core/db.py b/core/db.py index d4226a2..6bdf8fa 100644 --- a/core/db.py +++ b/core/db.py @@ -1,6 +1,6 @@ import os import sqlite3 -import _thread +import thread threaddbs = {} @@ -11,10 +11,10 @@ def get_db_connection(conn, name=''): if not name: name = '{}.db'.format(conn.name) - threadid = _thread.get_ident() + threadid = thread.get_ident() if name in threaddbs and threadid in threaddbs[name]: return threaddbs[name][threadid] - filename = os.path.join(bot.data_dir, name) + filename = os.path.join(bot.persist_dir, name) db = sqlite3.connect(filename, timeout=10) if name in threaddbs: diff --git a/core/irc.py b/core/irc.py index 88357d5..40831e3 100644 --- a/core/irc.py +++ b/core/irc.py @@ -1,18 +1,11 @@ import re import socket import time -import threading -import queue - -from core import permissions +import thread +import Queue from ssl import wrap_socket, CERT_NONE, CERT_REQUIRED, SSLError -irc_prefix_rem = re.compile(r'(.*?) (.*?) (.*)').match -irc_noprefix_rem = re.compile(r'()(.*?) 
(.*)').match -irc_netmask_rem = re.compile(r':?([^!@]*)!?([^@]*)@?(.*)').match -irc_param_ref = re.compile(r'(?:^|(?<= ))(:.*|[^ ]+)').findall - def decode(txt): for codec in ('utf-8', 'iso-8859-1', 'shift_jis', 'cp1252'): @@ -24,44 +17,78 @@ def decode(txt): def censor(text): + text = text.replace('\n', '').replace('\r', '') + replacement = '[censored]' + if 'censored_strings' in bot.config: + if bot.config['censored_strings']: + words = map(re.escape, bot.config['censored_strings']) + regex = re.compile('({})'.format("|".join(words))) + text = regex.sub(replacement, text) return text -class ReceiveThread(threading.Thread): - """receives messages from IRC and puts them in the input_queue""" - def __init__(self, sock, input_queue, timeout): - self.input_buffer = b"" - self.input_queue = input_queue - self.socket = sock +class crlf_tcp(object): + """Handles tcp connections that consist of utf-8 lines ending with crlf""" + + def __init__(self, host, port, timeout=300): + self.ibuffer = "" + self.obuffer = "" + self.oqueue = Queue.Queue() # lines to be sent out + self.iqueue = Queue.Queue() # lines that were received + self.socket = self.create_socket() + self.host = host + self.port = port self.timeout = timeout - self.shutdown = False - threading.Thread.__init__(self) + def create_socket(self): + return socket.socket(socket.AF_INET, socket.SOCK_STREAM) + + def run(self): + while 1: + try: + self.socket.connect((self.host, self.port)) + break + except socket.gaierror: + time.sleep(5) + except socket.timeout: + time.sleep(5) + + thread.start_new_thread(self.recv_loop, ()) + thread.start_new_thread(self.send_loop, ()) def recv_from_socket(self, nbytes): return self.socket.recv(nbytes) + def get_timeout_exception_type(self): + return socket.timeout + def handle_receive_exception(self, error, last_timestamp): + print("Receive exception: %s" % (error)) if time.time() - last_timestamp > self.timeout: - self.input_queue.put(StopIteration) + print("Receive timeout. 
Restart connection.") + self.iqueue.put(StopIteration) self.socket.close() return True return False - def get_timeout_exception_type(self): - return socket.timeout + def handle_send_exception(self, error): + print("Send exception: %s" % (error)) + self.iqueue.put(StopIteration) + self.socket.close() + return True - def run(self): + def recv_loop(self): last_timestamp = time.time() - while not self.shutdown: + while True: try: - data = self.recv_from_socket(4096) - self.input_buffer += data + data = self.recv_from_socket(4096) + self.ibuffer += data if data: last_timestamp = time.time() else: if time.time() - last_timestamp > self.timeout: - self.input_queue.put(StopIteration) + self.iqueue.put(StopIteration) self.socket.close() return time.sleep(1) @@ -69,16 +96,40 @@ class ReceiveThread(threading.Thread): if self.handle_receive_exception(e, last_timestamp): return continue + except AttributeError: + return - while b'\r\n' in self.input_buffer: - line, self.input_buffer = self.input_buffer.split(b'\r\n', 1) - print(decode(line)) - self.input_queue.put(decode(line)) + while '\r\n' in self.ibuffer: + line, self.ibuffer = self.ibuffer.split('\r\n', 1) + self.iqueue.put(decode(line)) + def send_loop(self): + while True: + try: + line = self.oqueue.get().splitlines()[0][:500] + if line == StopIteration: + return + print ">>> %r" % line + self.obuffer += line.encode('utf-8', 'replace') + '\r\n' + while self.obuffer: + sent = self.socket.send(self.obuffer) + self.obuffer = self.obuffer[sent:] -class SSLReceiveThread(ReceiveThread): - def __init__(self, sock, input_queue, timeout): - ReceiveThread.__init__(self, sock, input_queue, timeout) + except socket.error as e: + self.handle_send_exception(e) + return + +class crlf_ssl_tcp(crlf_tcp): + """Handles ssl tcp connetions that consist of utf-8 lines ending with crlf""" + + def __init__(self, host, port, ignore_cert_errors, timeout=300): + self.ignore_cert_errors = ignore_cert_errors + crlf_tcp.__init__(self, host, port, timeout) + + def create_socket(self): + return wrap_socket(crlf_tcp.create_socket(self), server_side=False, + cert_reqs=CERT_NONE if self.ignore_cert_errors else + CERT_REQUIRED) def recv_from_socket(self, nbytes): return self.socket.read(nbytes) @@ -87,50 +138,60 @@ class SSLReceiveThread(ReceiveThread): return SSLError def handle_receive_exception(self, error, last_timestamp): - # this is terrible - if not "timed out" in error.args[0]: - raise - return ReceiveThread.handle_receive_exception(self, error, last_timestamp) + # this is terrible + #if not "timed out" in error.args[0]: + # raise + return crlf_tcp.handle_receive_exception(self, error, last_timestamp) + + def handle_send_exception(self, error): + return crlf_tcp.handle_send_exception(self, error) -class SendThread(threading.Thread): - """sends messages from output_queue to IRC""" - def __init__(self, sock, conn_name, output_queue): - self.output_buffer = b"" - self.output_queue = output_queue - self.conn_name = conn_name - self.socket = sock - - self.shutdown = False - threading.Thread.__init__(self) - - def run(self): - while not self.shutdown: - line = self.output_queue.get().splitlines()[0][:500] - self.output_buffer += line.encode('utf-8', 'replace') + b'\r\n' - while self.output_buffer: - sent = self.socket.send(self.output_buffer) - self.output_buffer = self.output_buffer[sent:] +irc_prefix_rem = re.compile(r'(.*?) (.*?) (.*)').match +irc_noprefix_rem = re.compile(r'()(.*?) 
(.*)').match +irc_netmask_rem = re.compile(r':?([^!@]*)!?([^@]*)@?(.*)').match +irc_param_ref = re.compile(r'(?:^|(?<= ))(:.*|[^ ]+)').findall -class ParseThread(threading.Thread): - """parses messages from input_queue and puts them in parsed_queue""" - def __init__(self, input_queue, output_queue, parsed_queue): - self.input_queue = input_queue # lines that were received - self.output_queue = output_queue # lines to be sent out - self.parsed_queue = parsed_queue # lines that have been parsed +class IRC(object): + """handles the IRC protocol""" - threading.Thread.__init__(self) + def __init__(self, name, server, nick, port=6667, channels=[], conf={}): + self.name = name + self.channels = channels + self.conf = conf + self.server = server + self.port = port + self.nick = nick + self.history = {} + self.vars = {} - def run(self): + self.out = Queue.Queue() # responses from the server are placed here + # format: [rawline, prefix, command, params, + # nick, user, host, paramlist, msg] + self.connect() + + thread.start_new_thread(self.parse_loop, ()) + + def create_connection(self): + return crlf_tcp(self.server, self.port) + + def connect(self): + self.conn = self.create_connection() + thread.start_new_thread(self.conn.run, ()) + self.set_pass(self.conf.get('server_password')) + self.set_nick(self.nick) + self.cmd("USER", + [self.conf.get('user', 'cloudbot'), "3", "*", self.conf.get('realname', + 'CloudBot - http://git.io/cloudbot')]) + + def parse_loop(self): while True: # get a message from the input queue - msg = self.input_queue.get() + msg = self.conn.iqueue.get() if msg == StopIteration: - # got a StopIteration from the receive thread, pass it on - # so the main thread can restart the connection - self.parsed_queue.put(StopIteration) + self.connect() continue # parse the message @@ -141,115 +202,17 @@ class ParseThread(threading.Thread): nick, user, host = irc_netmask_rem(prefix).groups() mask = nick + "!" 
+ user + "@" + host paramlist = irc_param_ref(params) - lastparam = "" + lastparam = "" if paramlist: if paramlist[-1].startswith(':'): paramlist[-1] = paramlist[-1][1:] lastparam = paramlist[-1] # put the parsed message in the response queue - self.parsed_queue.put([msg, prefix, command, params, nick, user, host, - mask, paramlist, lastparam]) + self.out.put([msg, prefix, command, params, nick, user, host, + mask, paramlist, lastparam]) # if the server pings us, pong them back if command == "PING": - string = "PONG :" + paramlist[0] - self.output_queue.put(string) - - -class IRCConnection(object): - """handles an IRC connection""" - def __init__(self, name, host, port, input_queue, output_queue): - self.output_queue = output_queue # lines to be sent out - self.input_queue = input_queue # lines that were received - self.socket = self.create_socket() - self.conn_name = name - self.host = host - self.port = port - self.timeout = 300 - - def create_socket(self): - return socket.socket(socket.AF_INET, socket.TCP_NODELAY) - - def connect(self): - self.socket.connect((self.host, self.port)) - - self.receive_thread = ReceiveThread(self.socket, self.input_queue, self.timeout) - self.receive_thread.start() - - self.send_thread = SendThread(self.socket, self.conn_name, self.output_queue) - self.send_thread.start() - - def stop(self): - self.send_thread.shutdown = True - self.receive_thread.shutdown = True - time.sleep(0.1) - self.socket.close() - - def reconnect(self): - self.stop() - self.connect() - - -class SSLIRCConnection(IRCConnection): - """handles a SSL IRC connection""" - - def __init__(self, name, host, port, input_queue, output_queue, ignore_cert_errors): - self.ignore_cert_errors = ignore_cert_errors - IRCConnection.__init__(self, name, host, port, input_queue, output_queue) - - def create_socket(self): - return wrap_socket(IRCConnection.create_socket(self), server_side=False, - cert_reqs=CERT_NONE if self.ignore_cert_errors else - CERT_REQUIRED) - - -class BotInstance(object): - """ A BotInstance represents each connection the bot makes to an IRC server """ - - def __init__(self, name, server, nick, port=6667, ssl=False, logger=None, channels=[], config={}): - self.name = name - self.channels = channels - self.config = config - self.ssl = ssl - self.server = server - self.port = port - self.logger = logger - self.nick = nick - self.vars = {} - self.history = {} - - self.parsed_queue = queue.Queue() # responses from the server are placed here - # format: [rawline, prefix, command, params, - # nick, user, host, paramlist, msg] - - self.parsed_queue = queue.Queue() - self.input_queue = queue.Queue() - self.output_queue = queue.Queue() - - # create the IRC connection and connect - self.connection = self.create_connection() - self.connection.connect() - - self.set_pass(self.config.get('server_password')) - self.set_nick(self.nick) - self.cmd("USER", - [self.config.get('user', 'cloudbot'), "3", "*", - self.config.get('realname', 'CloudBot - http://git.io/cloudbot')]) - - self.parse_thread = ParseThread(self.input_queue, self.output_queue, - self.parsed_queue) - self.parse_thread.daemon = True - self.parse_thread.start() - - def create_connection(self): - if self.ssl: - return SSLIRCConnection(self.name, self.server, self.port, self.input_queue, - self.output_queue, True) - else: - return IRCConnection(self.name, self.server, self.port, - self.input_queue, self.output_queue) - - def stop(self): - self.connection.stop() + self.cmd("PONG", paramlist) def set_pass(self, password): if password: @@ 
-276,20 +239,25 @@ class BotInstance(object): def ctcp(self, target, ctcp_type, text): """ makes the bot send a PRIVMSG CTCP to a target """ - out = "\x01{} {}\x01".format(ctcp_type, text) + out = u"\x01{} {}\x01".format(ctcp_type, text) self.cmd("PRIVMSG", [target, out]) def cmd(self, command, params=None): if params: - params[-1] = ':' + params[-1] - self.send("{} {}".format(command, ' '.join(params))) + params[-1] = u':' + params[-1] + self.send(u"{} {}".format(command, ' '.join(params))) else: self.send(command) - def send(self, string): - try: - self.logger.info("{} >> {}".format(self.name.upper(), string)) - except: - # if this doesn't work, no big deal - pass - self.output_queue.put(string) \ No newline at end of file + def send(self, str): + self.conn.oqueue.put(str) + + +class SSLIRC(IRC): + def __init__(self, name, server, nick, port=6667, channels=[], conf={}, + ignore_certificate_errors=True): + self.ignore_cert_errors = ignore_certificate_errors + IRC.__init__(self, name, server, nick, port, channels, conf) + + def create_connection(self): + return crlf_ssl_tcp(self.server, self.port, self.ignore_cert_errors) diff --git a/core/loader.py b/core/loader.py deleted file mode 100644 index 2374041..0000000 --- a/core/loader.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import re -import glob -import collections - -from watchdog.observers import Observer -from watchdog.tricks import Trick -from pprint import pprint - -from core import main - - -def make_signature(f): - return f.__code__.co_filename, f.__name__, f.__code__.co_firstlineno - - -def format_plug(plug, kind='', lpad=0): - out = ' ' * lpad + '{}:{}:{}'.format(*make_signature(plug[0])) - if kind == 'command': - out += ' ' * (50 - len(out)) + plug[1]['name'] - - if kind == 'event': - out += ' ' * (50 - len(out)) + ', '.join(plug[1]['events']) - - if kind == 'regex': - out += ' ' * (50 - len(out)) + plug[1]['regex'] - - return out - - -class PluginLoader(object): - def __init__(self, bot): - self.observer = Observer() - self.path = os.path.abspath("plugins") - self.bot = bot - - self.event_handler = PluginEventHandler(self, patterns=["*.py"]) - self.observer.schedule(self.event_handler, self.path, recursive=False) - self.observer.start() - - self.load_all() - - def stop(self): - """shuts down the plugin reloader""" - self.observer.stop() - - def load_all(self): - """runs load_file() on all python files in the plugins folder""" - files = set(glob.glob(os.path.join(self.path, '*.py'))) - for f in files: - self.load_file(f, rebuild=True) - self.rebuild() - - def load_file(self, path, rebuild=False): - """loads (or reloads) all valid plugins from a specified file""" - filename = os.path.basename(path) - title = os.path.splitext(filename)[0] - - disabled = self.bot.config.get('disabled_plugins', []) - if title in disabled: - self.bot.logger.info("Did not load plugins from: {} (plugin disabled)".format(filename)) - return - - # compile the file and eval it in a namespace - try: - code = compile(open(path, 'U').read(), filename, 'exec') - namespace = {} - eval(code, namespace) - except Exception: - self.bot.logger.exception("Error compiling {}:".format(filename)) - return - - # remove plugins already loaded from this file - for plug_type, data in self.bot.plugins.items(): - self.bot.plugins[plug_type] = [x for x in data - if x[0]._filename != filename] - - # stop all currently running instances of the plugins from this file - for func, handler in list(self.bot.threads.items()): - if func._filename == filename: - handler.stop() - del 
self.bot.threads[func] - - # find objects with hooks in the plugin namespace - # TODO: kill it with fire, kill it all - for obj in namespace.values(): - if hasattr(obj, '_hook'): # check for magic - if obj._thread: - self.bot.threads[obj] = main.Handler(self.bot, obj) - for plug_type, data in obj._hook: - # add plugin to the plugin list - self.bot.plugins[plug_type] += [data] - self.bot.logger.info("Loaded plugin: {} ({})".format(format_plug(data), plug_type)) - - # do a rebuild, unless the bot is loading all plugins (rebuild happens after load_all) - if not rebuild: - self.rebuild() - - def unload_file(self, path): - """unloads all loaded plugins from a specified file""" - filename = os.path.basename(path) - self.bot.logger.info("Unloading plugins from: {}".format(filename)) - - # remove plugins loaded from this file - for plugin_type, plugins in self.bot.plugins.items(): - self.bot.plugins[plugin_type] = [x for x in plugins if x[0]._filename != filename] - - # stop all currently running instances of the plugins from this file - for func, handler in list(self.bot.threads.items()): - if func._filename == filename: - handler.stop() - del self.bot.threads[func] - - self.rebuild() - - def rebuild(self): - """rebuilds the cloudbot command and event hook lists""" - self.bot.commands = {} - for plugin in self.bot.plugins['command']: - name = plugin[1]['name'].lower() - if not re.match(r'^\w+$', name): - self.bot.logger.error('Invalid command name: "{}" ({})'.format(name, format_plug(plugin))) - continue - if name in self.bot.commands: - self.bot.logger.error('Command already registered: "{}" ({}, {})'.format(name, - format_plug(self.bot.commands[name]), - format_plug(plugin))) - continue - self.bot.commands[name] = plugin - - self.bot.events = collections.defaultdict(list) - for func, args in self.bot.plugins['event']: - for event in args['events']: - self.bot.events[event].append((func, args)) - - -class PluginEventHandler(Trick): - def __init__(self, loader, *args, **kwargs): - self.loader = loader - Trick.__init__(self, *args, **kwargs) - - def on_created(self, event): - self.loader.load_file(event.src_path) - - def on_deleted(self, event): - self.loader.unload_file(event.src_path) - - def on_modified(self, event): - self.loader.load_file(event.src_path) - - def on_moved(self, event): - self.loader.unload_file(event.src_path) - self.loader.load_file(event.dest_path) diff --git a/core/main.py b/core/main.py index da120ce..0054b0a 100644 --- a/core/main.py +++ b/core/main.py @@ -1,16 +1,12 @@ -import _thread +import thread import traceback -import queue -import re - -from sqlalchemy.orm import scoped_session - -_thread.stack_size(1024 * 512) # reduce vm size -#TODO: redesign this messy thing +thread.stack_size(1024 * 512) # reduce vm size + + class Input(dict): - def __init__(self, bot, conn, raw, prefix, command, params, + def __init__(self, conn, raw, prefix, command, params, nick, user, host, mask, paraml, msg): chan = paraml[0].lower() @@ -26,7 +22,7 @@ class Input(dict): if target == nick: conn.msg(target, message) else: - conn.msg(target, "({}) {}".format(nick, message)) + conn.msg(target, u"({}) {}".format(nick, message)) def action(message, target=chan): """sends an action to the current channel/user or a specific channel/user""" @@ -54,59 +50,67 @@ class Input(dict): self[key] = value -def run(bot, func, input): - uses_db = True - # TODO: change to bot.get_db_session() - print(input) - if 'text' not in input: - input.text = input.paraml +def run(func, input): + args = func._args - if 
uses_db: - # create SQLAlchemy session - bot.logger.debug("Opened DB session for: {}".format(func._filename)) - input.db = input.bot.db_session() - - try: - out = func(input, input.conn) - except: - bot.logger.exception("Error in plugin {}:".format(func._filename)) - return - finally: - if uses_db: - bot.logger.debug("Closed DB session for: {}".format(func._filename)) - input.db.close() + if 'inp' not in input: + input.inp = input.paraml + if args: + if 'db' in args and 'db' not in input: + input.db = get_db_connection(input.conn) + if 'input' in args: + input.input = input + if 0 in args: + out = func(input.inp, **input) + else: + kw = dict((key, input[key]) for key in args if key in input) + out = func(input.inp, **kw) + else: + out = func(input.inp) if out is not None: - input.reply(str(out)) + input.reply(unicode(out)) def do_sieve(sieve, bot, input, func, type, args): try: return sieve(bot, input, func, type, args) except Exception: - bot.logger.exception("Error in sieve {}:".format(func._filename)) + print 'sieve error', + traceback.print_exc() return None class Handler(object): """Runs plugins in their own threads (ensures order)""" - def __init__(self, bot, func): + def __init__(self, func): self.func = func - self.bot = bot - self.input_queue = queue.Queue() - _thread.start_new_thread(self.start, ()) + self.input_queue = Queue.Queue() + thread.start_new_thread(self.start, ()) def start(self): - uses_db = True + uses_db = 'db' in self.func._args + db_conns = {} while True: input = self.input_queue.get() if input == StopIteration: break - run(self.bot, self.func, input) + if uses_db: + db = db_conns.get(input.conn) + if db is None: + db = bot.get_db_connection(input.conn) + db_conns[input.conn] = db + input.db = db + try: + run(self.func, input) + except: + import traceback + + traceback.print_exc() def stop(self): self.input_queue.put(StopIteration) @@ -115,27 +119,27 @@ class Handler(object): self.input_queue.put(value) -def dispatch(bot, input, kind, func, args, autohelp=False): - for sieve, in bot.plugins['sieve']: +def dispatch(input, kind, func, args, autohelp=False): + for sieve, in bot.plugs['sieve']: input = do_sieve(sieve, bot, input, func, kind, args) if input is None: return if not (not autohelp or not args.get('autohelp', True) or input.inp or not (func.__doc__ is not None)): - input.notice(input.conn.config["command_prefix"] + func.__doc__) + input.notice(input.conn.conf["command_prefix"] + func.__doc__) return if func._thread: bot.threads[func].put(input) else: - _thread.start_new_thread(run, (bot, func, input)) + thread.start_new_thread(run, (func, input)) -def match_command(bot, command): +def match_command(command): commands = list(bot.commands) # do some fuzzy matching - prefix = [x for x in commands if x.startswith(command)] + prefix = filter(lambda x: x.startswith(command), commands) if len(prefix) == 1: return prefix[0] elif prefix and command not in prefix: @@ -144,13 +148,13 @@ def match_command(bot, command): return command -def main(bot, conn, out): - inp = Input(bot, conn, *out) - command_prefix = conn.config.get('command_prefix', '.') +def main(conn, out): + inp = Input(conn, *out) + command_prefix = conn.conf.get('command_prefix', '.') # EVENTS for func, args in bot.events[inp.command] + bot.events['*']: - dispatch(bot, Input(bot, conn, *out), "event", func, args) + dispatch(Input(conn, *out), "event", func, args) if inp.command == 'PRIVMSG': # COMMANDS @@ -158,6 +162,7 @@ def main(bot, conn, out): prefix = '^(?:[{}]?|'.format(command_prefix) else: 
prefix = '^(?:[{}]|'.format(command_prefix) + command_re = prefix + inp.conn.nick command_re += r'[,;:]+\s+)(\w+)(?:$|\s+)(.*)' @@ -165,26 +170,26 @@ def main(bot, conn, out): if m: trigger = m.group(1).lower() - command = match_command(bot, trigger) + command = match_command(trigger) if isinstance(command, list): # multiple potential matches - input = Input(bot, conn, *out) + input = Input(conn, *out) input.notice("Did you mean {} or {}?".format (', '.join(command[:-1]), command[-1])) elif command in bot.commands: - input = Input(bot, conn, *out) + input = Input(conn, *out) input.trigger = trigger - input.text_unstripped = m.group(2) - input.text = input.text_unstripped.strip() + input.inp_unstripped = m.group(2) + input.inp = input.inp_unstripped.strip() func, args = bot.commands[command] - dispatch(bot, input, "command", func, args, autohelp=True) + dispatch(input, "command", func, args, autohelp=True) # REGEXES - for func, args in bot.plugins['regex']: + for func, args in bot.plugs['regex']: m = args['re'].search(inp.lastparam) if m: - input = Input(bot, conn, *out) - input.text = m + input = Input(conn, *out) + input.inp = m - dispatch(bot, input, "regex", func, args) + dispatch(input, "regex", func, args) diff --git a/core/permissions.py b/core/permissions.py deleted file mode 100644 index 103542a..0000000 --- a/core/permissions.py +++ /dev/null @@ -1,48 +0,0 @@ -from fnmatch import fnmatch - - -class PermissionManager(object): - def __init__(self, bot, conn): - - # this is all legacy code, needs to be redone with classes and whatnot - self.logger = bot.logger - - self.logger.info("Creating simple permission manager for {}.".format(conn.name)) - - # stuff - self.bot = bot - self.conn = conn - self.config = conn.config - - self.group_perms = {} - self.group_users = {} - self.perm_users = {} - - self.reload() - - def reload(self): - self.logger.info("Reloading permissions for {}.".format(self.conn.name)) - groups = self.conn.config.get("permissions", []) - # work out the permissions and users each group has - for key, value in groups.items(): - self.group_perms[key] = [] - self.group_users[key] = [] - for permission in value["perms"]: - self.group_perms[key].append(permission) - for user in value["users"]: - self.group_users[key].append(user) - - for group, users in self.group_users.items(): - group_perms = self.group_perms[group] - for perm in group_perms: - self.perm_users[perm] = [] - self.perm_users[perm] = users - - def has_perm_mask(self, mask, perm): - - allowed_users = self.perm_users[perm] - - for pattern in allowed_users: - if fnmatch(mask.lower(), pattern.lower()): - return input - diff --git a/core/reload.py b/core/reload.py new file mode 100644 index 0000000..f1bfeb6 --- /dev/null +++ b/core/reload.py @@ -0,0 +1,160 @@ +import collections +import glob +import os +import re +import sys +import traceback + + +if 'mtimes' not in globals(): + mtimes = {} + +if 'lastfiles' not in globals(): + lastfiles = set() + + +def make_signature(f): + return f.func_code.co_filename, f.func_name, f.func_code.co_firstlineno + + +def format_plug(plug, kind='', lpad=0): + out = ' ' * lpad + '{}:{}:{}'.format(*make_signature(plug[0])) + if kind == 'command': + out += ' ' * (50 - len(out)) + plug[1]['name'] + + if kind == 'event': + out += ' ' * (50 - len(out)) + ', '.join(plug[1]['events']) + + if kind == 'regex': + out += ' ' * (50 - len(out)) + plug[1]['regex'] + + return out + + +def reload(init=False): + changed = False + + if init: + bot.plugs = collections.defaultdict(list) + bot.threads 
= {} + + core_fileset = set(glob.glob(os.path.join("core", "*.py"))) + + for filename in core_fileset: + mtime = os.stat(filename).st_mtime + if mtime != mtimes.get(filename): + mtimes[filename] = mtime + + changed = True + + try: + eval(compile(open(filename, 'U').read(), filename, 'exec'), + globals()) + except Exception: + traceback.print_exc() + if init: # stop if there's an error (syntax?) in a core + sys.exit() # script on startup + continue + + if filename == os.path.join('core', 'reload.py'): + reload(init=init) + return + + fileset = set(glob.glob(os.path.join('plugins', '*.py'))) + + # remove deleted/moved plugins + for name, data in bot.plugs.iteritems(): + bot.plugs[name] = [x for x in data if x[0]._filename in fileset] + + for filename in list(mtimes): + if filename not in fileset and filename not in core_fileset: + mtimes.pop(filename) + + for func, handler in list(bot.threads.iteritems()): + if func._filename not in fileset: + handler.stop() + del bot.threads[func] + + # compile new plugins + for filename in fileset: + mtime = os.stat(filename).st_mtime + if mtime != mtimes.get(filename): + mtimes[filename] = mtime + + changed = True + + try: + code = compile(open(filename, 'U').read(), filename, 'exec') + namespace = {} + eval(code, namespace) + except Exception: + traceback.print_exc() + continue + + # remove plugins already loaded from this filename + for name, data in bot.plugs.iteritems(): + bot.plugs[name] = [x for x in data + if x[0]._filename != filename] + + for func, handler in list(bot.threads.iteritems()): + if func._filename == filename: + handler.stop() + del bot.threads[func] + + for obj in namespace.itervalues(): + if hasattr(obj, '_hook'): # check for magic + if obj._thread: + bot.threads[obj] = Handler(obj) + + for type, data in obj._hook: + bot.plugs[type] += [data] + + if not init: + print '### new plugin (type: %s) loaded:' % \ + type, format_plug(data) + + if changed: + bot.commands = {} + for plug in bot.plugs['command']: + name = plug[1]['name'].lower() + if not re.match(r'^\w+$', name): + print '### ERROR: invalid command name "{}" ({})'.format(name, format_plug(plug)) + continue + if name in bot.commands: + print "### ERROR: command '{}' already registered ({}, {})".format(name, + format_plug(bot.commands[name]), + format_plug(plug)) + continue + bot.commands[name] = plug + + bot.events = collections.defaultdict(list) + for func, args in bot.plugs['event']: + for event in args['events']: + bot.events[event].append((func, args)) + + if init: + print ' plugin listing:' + + if bot.commands: + # hack to make commands with multiple aliases + # print nicely + + print ' command:' + commands = collections.defaultdict(list) + + for name, (func, args) in bot.commands.iteritems(): + commands[make_signature(func)].append(name) + + for sig, names in sorted(commands.iteritems()): + names.sort(key=lambda x: (-len(x), x)) # long names first + out = ' ' * 6 + '%s:%s:%s' % sig + out += ' ' * (50 - len(out)) + ', '.join(names) + print out + + for kind, plugs in sorted(bot.plugs.iteritems()): + if kind == 'command': + continue + print ' {}:'.format(kind) + for plug in plugs: + print format_plug(plug, kind=kind, lpad=6) + print diff --git a/plugins/attacks.py b/disabled_stuff/attacks.py similarity index 92% rename from plugins/attacks.py rename to disabled_stuff/attacks.py index ebc1064..feb00b8 100644 --- a/plugins/attacks.py +++ b/disabled_stuff/attacks.py @@ -3,15 +3,15 @@ import random from util import hook -with open("data/larts.txt") as f: +with 
open("plugins/data/larts.txt") as f: larts = [line.strip() for line in f.readlines() if not line.startswith("//")] -with open("data/insults.txt") as f: +with open("plugins/data/insults.txt") as f: insults = [line.strip() for line in f.readlines() if not line.startswith("//")] -with open("data/flirts.txt") as f: +with open("plugins/data/flirts.txt") as f: flirts = [line.strip() for line in f.readlines() if not line.startswith("//")] diff --git a/plugins/brainfuck.py b/disabled_stuff/brainfuck.py similarity index 100% rename from plugins/brainfuck.py rename to disabled_stuff/brainfuck.py diff --git a/plugins/choose.py b/disabled_stuff/choose.py similarity index 100% rename from plugins/choose.py rename to disabled_stuff/choose.py diff --git a/disabled_stuff/cleverbot.py b/disabled_stuff/cleverbot.py new file mode 100644 index 0000000..6604d8b --- /dev/null +++ b/disabled_stuff/cleverbot.py @@ -0,0 +1,121 @@ +# from jessi bot +import urllib2 +import hashlib +import re +import unicodedata +from util import hook + +# these are just parts required +# TODO: Merge them. + +arglist = ['', 'y', '', '', '', '', '', '', '', '', 'wsf', '', + '', '', '', '', '', '', '', '0', 'Say', '1', 'false'] + +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') + +headers = {'X-Moz': 'prefetch', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1)Gecko/20100101 Firefox/7.0', + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Referer': 'http://www.cleverbot.com', + 'Pragma': 'no-cache', 'Cache-Control': 'no-cache, no-cache', 'Accept-Language': 'en-us;q=0.8,en;q=0.5'} + +keylist = ['stimulus', 'start', 'sessionid', 'vText8', 'vText7', 'vText6', + 'vText5', 'vText4', 'vText3', 'vText2', 'icognoid', + 'icognocheck', 'prevref', 'emotionaloutput', 'emotionalhistory', + 'asbotname', 'ttsvoice', 'typing', 'lineref', 'fno', 'sub', + 'islearning', 'cleanslate'] + +MsgList = list() + + +def quote(s, safe='/'): # quote('abc def') -> 'abc%20def' + s = s.encode('utf-8') + s = s.decode('utf-8') + print "s= " + s + print "safe= " + safe + safe += always_safe + safe_map = dict() + for i in range(256): + c = chr(i) + safe_map[c] = (c in safe) and c or ('%%%02X' % i) + try: + res = map(safe_map.__getitem__, s) + except: + print "blank" + return '' + print "res= " + ''.join(res) + return ''.join(res) + + +def encode(keylist, arglist): + text = str() + for i in range(len(keylist)): + k = keylist[i] + v = quote(arglist[i]) + text += '&' + k + '=' + v + text = text[1:] + return text + + +def Send(): + data = encode(keylist, arglist) + digest_txt = data[9:29] + new_hash = hashlib.md5(digest_txt).hexdigest() + arglist[keylist.index('icognocheck')] = new_hash + data = encode(keylist, arglist) + req = urllib2.Request('http://www.cleverbot.com/webservicemin', + data, headers) + f = urllib2.urlopen(req) + reply = f.read() + return reply + + +def parseAnswers(text): + d = dict() + keys = ['text', 'sessionid', 'logurl', 'vText8', 'vText7', 'vText6', + 'vText5', 'vText4', 'vText3', 'vText2', 'prevref', 'foo', + 'emotionalhistory', 'ttsLocMP3', 'ttsLocTXT', 'ttsLocTXT3', + 'ttsText', 'lineRef', 'lineURL', 'linePOST', 'lineChoices', + 'lineChoicesAbbrev', 'typingData', 'divert'] + values = text.split('\r') + i = 0 + for key in keys: + d[key] = values[i] + i += 1 + return d + + +def ask(inp): + arglist[keylist.index('stimulus')] = inp + if MsgList: + arglist[keylist.index('lineref')] = '!0' + str(len( + MsgList) / 
2) + asw = Send() + MsgList.append(inp) + answer = parseAnswers(asw) + for k, v in answer.iteritems(): + try: + arglist[keylist.index(k)] = v + except ValueError: + pass + arglist[keylist.index('emotionaloutput')] = str() + text = answer['ttsText'] + MsgList.append(text) + return text + + +@hook.command("cb") +def cleverbot(inp, reply=None): + reply(ask(inp)) + + +''' # TODO: add in command to control extra verbose per channel +@hook.event('PRIVMSG') +def cbevent(inp, reply=None): + reply(ask(inp)) + +@hook.command("cbver", permissions=['cleverbot']) +def cleverbotverbose(inp, notice=None): + if on in input +''' diff --git a/disabled_stuff/cloudbot.sh b/disabled_stuff/cloudbot.sh new file mode 100644 index 0000000..877c4ea --- /dev/null +++ b/disabled_stuff/cloudbot.sh @@ -0,0 +1,126 @@ +#!/bin/bash +echo "" +echo " ________ ______ __ " +echo " / ____/ /___ __ ______/ / __ )____ / /_" +echo " / / / / __ \/ / / / __ / __ / __ \/ __/" +echo "/ /___/ / /_/ / /_/ / /_/ / /_/ / /_/ / /_ " +echo "\____/_/\____/\__,_/\__,_/_____/\____/\__/ " +echo " http://git.io/cloudbotirc by ClouDev " +echo "" +locatefiles() { + botfile="/bot.py" + botfile=$(pwd)$botfile + logfile="/bot.log" + logfile=$(pwd)$logfile +} + +running() { + if [[ $(ps aux|grep bot.py|grep -v grep|grep -v daemon|grep -v SCREEN) != "" ]]; then + true + else + false + fi +} + +checkbackend() { + if dpkg -l| grep ^ii|grep daemon|grep 'turns other' > /dev/null; then + backend="daemon" + elif dpkg -l| grep ^ii|grep screen|grep 'terminal multi' > /dev/null; then + backend="screen" + else + backend="manual" + fi + return 0 +} + +setcommands() { + status() { + if running; then + echo "CloudBot is running!" + else + echo "CloudBot is not running!" + fi + } + clear() { + : > $logfile + } + if [ "$backend" == "daemon" ]; then + start() { + daemon -r -n cloudbot -O $logfile python $botfile + } + stop() { + daemon -n cloudbot --stop + } + elif [ "$backend" == "screen" ]; then + start() { + screen -d -m -S cloudbot -t cloudbot python $botfile > $logfile 2>&1 + } + stop() { + pid=`ps ax|grep -v grep|grep python|grep -v SCREEN|grep $botfile|awk '{print $1}'` + kill $pid + } + elif [ "$backend" == "manual" ]; then + start() { + $botfile + } + stop() { + pid=`ps ax|grep -v grep|grep python|grep $botfile|awk '{print $1}'` + kill $pid + } + fi +} + +processargs() { + case $1 in + start|-start|--start) + if running; then + echo "Cannot start! Bot is already running!" + exit 1 + else + echo "Starting CloudBot... ($backend)" + start + fi + ;; + stop|-stop|--stop) + if running; then + echo "Stopping CloudBot... ($backend)" + stop + else + echo "Cannot stop! Bot is not already running!" + exit 1 + fi + ;; + restart|-restart|--restart) + if running; then + echo "Restarting CloudBot... ($backend)" + stop + sleep 3 + start + else + echo "Cannot restart! Bot is not already running!" + exit 1 + fi + ;; + clear|-clear|--clear) + echo "Clearing logs..." 
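+        # truncate the log file using the clear() helper defined above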
+ clear + ;; + status|-status|--status) + status + ;; + *) + usage="usage: ./cloudbot {start|stop|restart|clear|status}" + echo $usage + ;; + esac +} + +main() { + locatefiles + checkbackend + setcommands + processargs $1 +} + +main $* +exit 0 \ No newline at end of file diff --git a/plugins/coin.py b/disabled_stuff/coin.py similarity index 100% rename from plugins/coin.py rename to disabled_stuff/coin.py diff --git a/disabled_stuff/correction.py b/disabled_stuff/correction.py new file mode 100644 index 0000000..7617e11 --- /dev/null +++ b/disabled_stuff/correction.py @@ -0,0 +1,37 @@ +from util import hook + +import re + +CORRECTION_RE = r'^(s|S)/.*/.*/?\S*$' + + +@hook.regex(CORRECTION_RE) +def correction(match, input=None, conn=None, message=None): + split = input.msg.split("/") + + if len(split) == 4: + nick = split[3].lower() + else: + nick = None + + find = split[1] + replace = split[2] + + for item in conn.history[input.chan].__reversed__(): + name, timestamp, msg = item + if msg.startswith("s/"): + # don't correct corrections, it gets really confusing + continue + if nick: + if nick != name.lower(): + continue + if find in msg: + if "\x01ACTION" in msg: + msg = msg.replace("\x01ACTION ", "/me ").replace("\x01", "") + message(u"Correction, <{}> {}".format(name, msg.replace(find, "\x02" + replace + "\x02"))) + return + else: + continue + + return u"Did not find {} in any recent messages.".format(find) + diff --git a/plugins/cryptocoins.py b/disabled_stuff/cryptocoins.py similarity index 75% rename from plugins/cryptocoins.py rename to disabled_stuff/cryptocoins.py index 7ede4ad..42d5945 100644 --- a/plugins/cryptocoins.py +++ b/disabled_stuff/cryptocoins.py @@ -5,21 +5,21 @@ from util import http, hook exchanges = { "blockchain": { "api_url": "https://blockchain.info/ticker", - "func": lambda data: "Blockchain // Buy: \x0307${:,.2f}\x0f -" - " Sell: \x0307${:,.2f}\x0f".format(data["USD"]["buy"], data["USD"]["sell"]) + "func": lambda data: u"Blockchain // Buy: \x0307${:,.2f}\x0f -" + u" Sell: \x0307${:,.2f}\x0f".format(data["USD"]["buy"], data["USD"]["sell"]) }, "coinbase": { "api_url": "https://coinbase.com/api/v1/prices/spot_rate", - "func": lambda data: "Coinbase // Current: \x0307${:,.2f}\x0f".format(float(data['amount'])) + "func": lambda data: u"Coinbase // Current: \x0307${:,.2f}\x0f".format(float(data['amount'])) }, "bitpay": { "api_url": "https://bitpay.com/api/rates", - "func": lambda data: "Bitpay // Current: \x0307${:,.2f}\x0f".format(data[0]['rate']) + "func": lambda data: u"Bitpay // Current: \x0307${:,.2f}\x0f".format(data[0]['rate']) }, "bitstamp": { "api_url": "https://www.bitstamp.net/api/ticker/", - "func": lambda data: "BitStamp // Current: \x0307${:,.2f}\x0f - High: \x0307${:,.2f}\x0f -" - " Low: \x0307${:,.2f}\x0f - Volume: {:,.2f} BTC".format(float(data['last']), + "func": lambda data: u"BitStamp // Current: \x0307${:,.2f}\x0f - High: \x0307${:,.2f}\x0f -" + u" Low: \x0307${:,.2f}\x0f - Volume: {:,.2f} BTC".format(float(data['last']), float(data['high']), float(data['low']), float(data['volume'])) diff --git a/plugins/cypher.py b/disabled_stuff/cypher.py similarity index 100% rename from plugins/cypher.py rename to disabled_stuff/cypher.py diff --git a/data/8ball_responses.txt b/disabled_stuff/data/8ball_responses.txt similarity index 100% rename from data/8ball_responses.txt rename to disabled_stuff/data/8ball_responses.txt diff --git a/disabled_stuff/data/GeoLiteCity.dat b/disabled_stuff/data/GeoLiteCity.dat new file mode 100644 index 0000000..e94f60e Binary 
files /dev/null and b/disabled_stuff/data/GeoLiteCity.dat differ diff --git a/data/flirts.txt b/disabled_stuff/data/flirts.txt similarity index 100% rename from data/flirts.txt rename to disabled_stuff/data/flirts.txt diff --git a/data/fortunes.txt b/disabled_stuff/data/fortunes.txt similarity index 100% rename from data/fortunes.txt rename to disabled_stuff/data/fortunes.txt diff --git a/data/geoip_regions.json b/disabled_stuff/data/geoip_regions.json similarity index 100% rename from data/geoip_regions.json rename to disabled_stuff/data/geoip_regions.json diff --git a/data/insults.txt b/disabled_stuff/data/insults.txt similarity index 100% rename from data/insults.txt rename to disabled_stuff/data/insults.txt diff --git a/data/itemids.txt b/disabled_stuff/data/itemids.txt similarity index 100% rename from data/itemids.txt rename to disabled_stuff/data/itemids.txt diff --git a/data/kills.json b/disabled_stuff/data/kills.json similarity index 100% rename from data/kills.json rename to disabled_stuff/data/kills.json diff --git a/data/kills.txt b/disabled_stuff/data/kills.txt similarity index 100% rename from data/kills.txt rename to disabled_stuff/data/kills.txt diff --git a/data/larts.txt b/disabled_stuff/data/larts.txt similarity index 100% rename from data/larts.txt rename to disabled_stuff/data/larts.txt diff --git a/data/name_files/dragons.json b/disabled_stuff/data/name_files/dragons.json similarity index 100% rename from data/name_files/dragons.json rename to disabled_stuff/data/name_files/dragons.json diff --git a/data/name_files/dwarves.json b/disabled_stuff/data/name_files/dwarves.json similarity index 100% rename from data/name_files/dwarves.json rename to disabled_stuff/data/name_files/dwarves.json diff --git a/data/name_files/elves_female.json b/disabled_stuff/data/name_files/elves_female.json similarity index 100% rename from data/name_files/elves_female.json rename to disabled_stuff/data/name_files/elves_female.json diff --git a/data/name_files/elves_male.json b/disabled_stuff/data/name_files/elves_male.json similarity index 100% rename from data/name_files/elves_male.json rename to disabled_stuff/data/name_files/elves_male.json diff --git a/data/name_files/fantasy.json b/disabled_stuff/data/name_files/fantasy.json similarity index 100% rename from data/name_files/fantasy.json rename to disabled_stuff/data/name_files/fantasy.json diff --git a/data/name_files/female.json b/disabled_stuff/data/name_files/female.json similarity index 100% rename from data/name_files/female.json rename to disabled_stuff/data/name_files/female.json diff --git a/data/name_files/general.json b/disabled_stuff/data/name_files/general.json similarity index 100% rename from data/name_files/general.json rename to disabled_stuff/data/name_files/general.json diff --git a/data/name_files/hobbits.json b/disabled_stuff/data/name_files/hobbits.json similarity index 100% rename from data/name_files/hobbits.json rename to disabled_stuff/data/name_files/hobbits.json diff --git a/data/name_files/inns.json b/disabled_stuff/data/name_files/inns.json similarity index 100% rename from data/name_files/inns.json rename to disabled_stuff/data/name_files/inns.json diff --git a/data/name_files/items.json b/disabled_stuff/data/name_files/items.json similarity index 100% rename from data/name_files/items.json rename to disabled_stuff/data/name_files/items.json diff --git a/data/name_files/male.json b/disabled_stuff/data/name_files/male.json similarity index 100% rename from data/name_files/male.json rename to 
disabled_stuff/data/name_files/male.json diff --git a/data/name_files/narn.json b/disabled_stuff/data/name_files/narn.json similarity index 100% rename from data/name_files/narn.json rename to disabled_stuff/data/name_files/narn.json diff --git a/data/name_files/warrior_cats.json b/disabled_stuff/data/name_files/warrior_cats.json similarity index 100% rename from data/name_files/warrior_cats.json rename to disabled_stuff/data/name_files/warrior_cats.json diff --git a/data/recipes.txt b/disabled_stuff/data/recipes.txt similarity index 100% rename from data/recipes.txt rename to disabled_stuff/data/recipes.txt diff --git a/data/slaps.json b/disabled_stuff/data/slaps.json similarity index 100% rename from data/slaps.json rename to disabled_stuff/data/slaps.json diff --git a/data/slogans.txt b/disabled_stuff/data/slogans.txt similarity index 100% rename from data/slogans.txt rename to disabled_stuff/data/slogans.txt diff --git a/plugins/dice.py b/disabled_stuff/dice.py similarity index 91% rename from plugins/dice.py rename to disabled_stuff/dice.py index 4ec9f03..a89f3d5 100644 --- a/plugins/dice.py +++ b/disabled_stuff/dice.py @@ -1,6 +1,5 @@ # Written by Scaevolus, updated by Lukeroge - import re import random @@ -18,15 +17,15 @@ split_re = re.compile(r'([\d+-]*)d?(F|\d*)', re.I) def n_rolls(count, n): """roll an n-sided die count times""" if n == "F": - return [random.randint(-1, 1) for x in range(min(count, 100))] + return [random.randint(-1, 1) for x in xrange(min(count, 100))] if n < 2: # it's a coin if count < 100: - return [random.randint(0, 1) for x in range(count)] + return [random.randint(0, 1) for x in xrange(count)] else: # fake it return [int(random.normalvariate(.5 * count, (.75 * count) ** .5))] else: if count < 100: - return [random.randint(1, n) for x in range(count)] + return [random.randint(1, n) for x in xrange(count)] else: # fake it return [int(random.normalvariate(.5 * (1 + n) * count, (((n + 1) * (2 * n + 1) / 6. - @@ -75,7 +74,7 @@ def dice(inp): try: if count > 0: d = n_rolls(count, side) - rolls += list(map(str, d)) + rolls += map(str, d) total += sum(d) else: d = n_rolls(-count, side) diff --git a/plugins/dictionary.py b/disabled_stuff/dictionary.py similarity index 89% rename from plugins/dictionary.py rename to disabled_stuff/dictionary.py index e80a9b1..5b4123b 100644 --- a/plugins/dictionary.py +++ b/disabled_stuff/dictionary.py @@ -19,10 +19,10 @@ def define(inp): '//div[@class="example"]') if not definition: - return 'No results for ' + inp + ' :(' + return u'No results for {} :('.format(inp) def format_output(show_examples): - result = '{}: '.format(h.xpath('//dt[@class="title-word"]/a/text()')[0]) + result = u'{}: '.format(h.xpath('//dt[@class="title-word"]/a/text()')[0]) correction = h.xpath('//span[@class="correct-word"]/text()') if correction: @@ -41,7 +41,7 @@ def define(inp): for article in sections: result += article[0] if len(article) > 2: - result += ' '.join('{}. {}'.format(n + 1, section) + result += u' '.join(u'{}. 
{}'.format(n + 1, section) for n, section in enumerate(article[1:])) else: result += article[1] + ' ' @@ -77,7 +77,7 @@ def etymology(inp): etym = h.xpath('//dl') if not etym: - return 'No etymology found for {} :('.format(inp) + return u'No etymology found for {} :('.format(inp) etym = etym[0].text_content() diff --git a/plugins/domainr.py b/disabled_stuff/domainr.py similarity index 100% rename from plugins/domainr.py rename to disabled_stuff/domainr.py diff --git a/plugins/down.py b/disabled_stuff/down.py similarity index 85% rename from plugins/down.py rename to disabled_stuff/down.py index 8c16d34..f03c078 100644 --- a/plugins/down.py +++ b/disabled_stuff/down.py @@ -1,4 +1,4 @@ -import urllib.parse +import urlparse from util import hook, http @@ -10,7 +10,7 @@ def down(inp): if 'http://' not in inp: inp = 'http://' + inp - inp = 'http://' + urllib.parse.urlparse(inp).netloc + inp = 'http://' + urlparse.urlparse(inp).netloc # http://mail.python.org/pipermail/python-list/2006-December/589854.html try: diff --git a/plugins/drama.py b/disabled_stuff/drama.py similarity index 100% rename from plugins/drama.py rename to disabled_stuff/drama.py diff --git a/plugins/eightball.py b/disabled_stuff/eightball.py similarity index 72% rename from plugins/eightball.py rename to disabled_stuff/eightball.py index 4730bc4..8d91303 100644 --- a/plugins/eightball.py +++ b/disabled_stuff/eightball.py @@ -9,15 +9,15 @@ color_codes = { "": "\x02" } -with open("./data/8ball_responses.txt") as f: +with open("plugins/data/8ball_responses.txt") as f: responses = [line.strip() for line in f.readlines() if not line.startswith("//")] -@hook.command() -def eightball(input, conn): +@hook.command('8ball') +def eightball(inp, action=None): """8ball <question> -- The all knowing magic eight ball, in electronic form. Ask and it shall be answered!""" magic = text.multiword_replace(random.choice(responses), color_codes) - input.action("shakes the magic 8 ball... {}".format(magic)) + action("shakes the magic 8 ball... 
{}".format(magic)) diff --git a/plugins/encrypt.py b/disabled_stuff/encrypt.py similarity index 91% rename from plugins/encrypt.py rename to disabled_stuff/encrypt.py index 119e37d..e391a04 100644 --- a/plugins/encrypt.py +++ b/disabled_stuff/encrypt.py @@ -37,8 +37,8 @@ def get_salt(bot): """generate an encryption salt if none exists, then returns the salt""" if not bot.config.get("random_salt", False): bot.config["random_salt"] = hashlib.md5(os.urandom(16)).hexdigest() - bot.config.save_config() - return bot.config.get("random_salt") + json.dump(bot.config, open('config', 'w'), sort_keys=True, indent=2) + return bot.config["random_salt"] @hook.command @@ -69,8 +69,7 @@ def encrypt(inp, bot=None, db=None, notice=None): # store the encoded text and IV in the DB for decoding later db.execute("insert or replace into encryption(encrypted, iv)" - "values(:encoded,:iv)", {'encoded': encoded, - 'iv': iv_encoded}) + "values(?,?)", (encoded, iv_encoded)) db.commit() return encoded @@ -98,7 +97,7 @@ def decrypt(inp, bot=None, db=None, notice=None): # get the encoded IV from the database and decode it iv_encoded = db.execute("select iv from encryption where" - " encrypted=:text", {'text': text}).fetchone()[0] + " encrypted=?", (text,)).fetchone()[0] iv = base64.b64decode(iv_encoded) # create AES cipher, decode text, decrypt text, and unpad it diff --git a/plugins/fact.py b/disabled_stuff/fact.py similarity index 100% rename from plugins/fact.py rename to disabled_stuff/fact.py diff --git a/plugins/factoids.py b/disabled_stuff/factoids.py similarity index 93% rename from plugins/factoids.py rename to disabled_stuff/factoids.py index a702b70..403e6f5 100644 --- a/plugins/factoids.py +++ b/disabled_stuff/factoids.py @@ -114,6 +114,10 @@ def info(inp, notice=None, db=None): @hook.regex(r'^\? ?(.+)') def factoid(inp, message=None, db=None, bot=None, action=None, conn=None, input=None): """? 
<word> -- Shows what data is associated with <word>.""" + try: + prefix_on = bot.config["plugins"]["factoids"].get("prefix", False) + except KeyError: + prefix_on = False db_init(db) @@ -152,4 +156,7 @@ def factoid(inp, message=None, db=None, bot=None, action=None, conn=None, input= except http.HttpError: message("Could not fetch URL.") else: - message(result) + if prefix_on: + message("\x02[{}]:\x02 {}".format(factoid_id, result)) + else: + message(result) diff --git a/plugins/fishbans.py b/disabled_stuff/fishbans.py similarity index 94% rename from plugins/fishbans.py rename to disabled_stuff/fishbans.py index df29d1e..aa76676 100644 --- a/plugins/fishbans.py +++ b/disabled_stuff/fishbans.py @@ -1,4 +1,4 @@ -from urllib.parse import quote_plus +from urllib import quote_plus from util import hook, http @@ -44,7 +44,7 @@ def bancount(inp): services = request["stats"]["service"] out = [] - for service, ban_count in list(services.items()): + for service, ban_count in services.items(): if ban_count != 0: out.append("{}: \x02{}\x02".format(service, ban_count)) else: diff --git a/plugins/fmylife.py b/disabled_stuff/fmylife.py similarity index 100% rename from plugins/fmylife.py rename to disabled_stuff/fmylife.py diff --git a/plugins/fortune.py b/disabled_stuff/fortune.py similarity index 85% rename from plugins/fortune.py rename to disabled_stuff/fortune.py index 0c766b3..5f1c478 100644 --- a/plugins/fortune.py +++ b/disabled_stuff/fortune.py @@ -3,8 +3,7 @@ import random from util import hook -with open("./data/fortunes.txt") as f: - +with open("plugins/data/fortunes.txt") as f: fortunes = [line.strip() for line in f.readlines() if not line.startswith("//")] diff --git a/disabled_stuff/freddy.py b/disabled_stuff/freddy.py new file mode 100644 index 0000000..c77fa5a --- /dev/null +++ b/disabled_stuff/freddy.py @@ -0,0 +1,13 @@ +from util import hook, http, web +from subprocess import check_output, CalledProcessError + +@hook.command +def freddycode(inp): + """freddycode - Check if the Freddy Fresh code is correct.""" + + try: + return "Freddy: '%s' ist %s" % (inp, \ + check_output(["/bin/freddycheck", inp])) + except CalledProcessError as err: + return "Freddy: Skript returned %s" % (str(err)) + diff --git a/plugins/geoip.py b/disabled_stuff/geoip.py similarity index 71% rename from plugins/geoip.py rename to disabled_stuff/geoip.py index 86e1829..b7ca61d 100644 --- a/plugins/geoip.py +++ b/disabled_stuff/geoip.py @@ -1,7 +1,7 @@ import os.path import json import gzip -from io import StringIO +from StringIO import StringIO import pygeoip @@ -9,22 +9,22 @@ from util import hook, http # load region database -with open("./data/geoip_regions.json", "rb") as f: +with open("./plugins/data/geoip_regions.json", "rb") as f: regions = json.loads(f.read()) -if os.path.isfile(os.path.abspath("./data/GeoLiteCity.dat")): +if os.path.isfile(os.path.abspath("./plugins/data/GeoLiteCity.dat")): # initialise geolocation database - geo = pygeoip.GeoIP(os.path.abspath("./data/GeoLiteCity.dat")) + geo = pygeoip.GeoIP(os.path.abspath("./plugins/data/GeoLiteCity.dat")) else: download = http.get("http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz") string_io = StringIO(download) geoip_file = gzip.GzipFile(fileobj=string_io, mode='rb') - output = open(os.path.abspath("./data/GeoLiteCity.dat"), 'wb') + output = open(os.path.abspath("./plugins/data/GeoLiteCity.dat"), 'wb') output.write(geoip_file.read()) output.close() - geo = pygeoip.GeoIP(os.path.abspath("./data/GeoLiteCity.dat")) + geo = 
pygeoip.GeoIP(os.path.abspath("./plugins/data/GeoLiteCity.dat")) @hook.command @@ -51,4 +51,4 @@ def geoip(inp): data["cc"] = record["country_code"] or "N/A" data["country"] = record["country_name"] or "Unknown" data["city"] = record["city"] or "Unknown" - return "\x02Country:\x02 {country} ({cc}), \x02City:\x02 {city}{region}".format(**data) + return u"\x02Country:\x02 {country} ({cc}), \x02City:\x02 {city}{region}".format(**data) diff --git a/plugins/github.py b/disabled_stuff/github.py similarity index 88% rename from plugins/github.py rename to disabled_stuff/github.py index c7e5d63..18033ef 100644 --- a/plugins/github.py +++ b/disabled_stuff/github.py @@ -1,5 +1,5 @@ import json -import urllib.request, urllib.error, urllib.parse +import urllib2 from util import hook, http @@ -37,18 +37,18 @@ def ghissues(inp): except IndexError: return "Invalid syntax. .github issues username/repo [number]" try: - url += "/{}".format(args[1]) + url += "/%s" % args[1] number = True except IndexError: number = False try: data = json.loads(http.open(url).read()) - print(url) + print url if not number: try: data = data[0] except IndexError: - print(data) + print data return "Repo has no open issues" except ValueError: return "Invalid data returned. Check arguments (.github issues username/repo [number]" @@ -56,9 +56,9 @@ def ghissues(inp): fmt1 = "Issue: #%s (%s) by %s: %s %s" # (number, state, user.login, title, gitio.gitio(data.url)) number = data["number"] if data["state"] == "open": - state = "\x033\x02OPEN\x02\x0f" + state = u"\x033\x02OPEN\x02\x0f" else: - state = "\x034\x02CLOSED\x02\x0f by {}".format(data["closed_by"]["login"]) + state = u"\x034\x02CLOSED\x02\x0f by {}".format(data["closed_by"]["login"]) user = data["user"]["login"] title = data["title"] summary = truncate(data["body"]) @@ -93,12 +93,12 @@ def gitio(inp): url = 'url=' + str(url) if code: url = url + '&code=' + str(code) - req = urllib.request.Request(url='http://git.io', data=url) + req = urllib2.Request(url='http://git.io', data=url) # try getting url, catch http error try: - f = urllib.request.urlopen(req) - except urllib.error.HTTPError: + f = urllib2.urlopen(req) + except urllib2.HTTPError: return "Failed to get URL!" urlinfo = str(f.info()) @@ -110,7 +110,7 @@ def gitio(inp): if row.find("Location") != -1: location = row - print(status) + print status if not "201" in status: return "Failed to get URL!" diff --git a/plugins/google.py b/disabled_stuff/google.py similarity index 94% rename from plugins/google.py rename to disabled_stuff/google.py index e36432f..fe9e288 100644 --- a/plugins/google.py +++ b/disabled_stuff/google.py @@ -48,4 +48,4 @@ def google(inp): content = http.html.fromstring(content).text_content() content = text.truncate_str(content, 150) - return '{} -- \x02{}\x02: "{}"'.format(result['unescapedUrl'], title, content) + return u'{} -- \x02{}\x02: "{}"'.format(result['unescapedUrl'], title, content) diff --git a/plugins/google_translate.py b/disabled_stuff/google_translate.py similarity index 94% rename from plugins/google_translate.py rename to disabled_stuff/google_translate.py index 87f089c..a9d4ea3 100644 --- a/plugins/google_translate.py +++ b/disabled_stuff/google_translate.py @@ -3,7 +3,7 @@ A Google API key is required and retrieved from the bot config file. Since December 1, 2011, the Google Translate API is a paid service only. 
""" -import html.entities +import htmlentitydefs import re from util import hook, http @@ -22,15 +22,15 @@ def unescape(text): # character reference try: if text[:3] == "&#x": - return chr(int(text[3:-1], 16)) + return unichr(int(text[3:-1], 16)) else: - return chr(int(text[2:-1])) + return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: - text = chr(html.entities.name2codepoint[text[1:-1]]) + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is @@ -83,7 +83,7 @@ def translate(inp, bot=None): if not api_key: return "This command requires a paid API key." - args = inp.split(' ', 2) + args = inp.split(u' ', 2) try: if len(args) >= 2: @@ -100,7 +100,7 @@ def translate(inp, bot=None): return goog_trans(api_key, args[1] + ' ' + args[2], sl, 'en') return goog_trans(api_key, args[2], sl, tl) return goog_trans(api_key, inp, '', 'en') - except IOError as e: + except IOError, e: return e diff --git a/plugins/googleurlparse.py b/disabled_stuff/googleurlparse.py similarity index 96% rename from plugins/googleurlparse.py rename to disabled_stuff/googleurlparse.py index 3b858eb..cbea897 100644 --- a/plugins/googleurlparse.py +++ b/disabled_stuff/googleurlparse.py @@ -1,5 +1,5 @@ from util import hook -from urllib.parse import unquote +from urllib import unquote @hook.command(autohelp=False) def googleurl(inp, db=None, nick=None): diff --git a/plugins/history.py b/disabled_stuff/history.py similarity index 84% rename from plugins/history.py rename to disabled_stuff/history.py index 06283ae..c703bcf 100644 --- a/plugins/history.py +++ b/disabled_stuff/history.py @@ -22,11 +22,8 @@ def track_seen(input, message_time, db, conn): # keep private messages private if input.chan[:1] == "#" and not re.findall('^s/.*/.*/$', input.msg.lower()): db.execute("insert or replace into seen_user(name, time, quote, chan, host)" - "values(:name,:time,:quote,:chan,:host)", {'name': input.nick.lower(), - 'time': time.time(), - 'quote': input.msg, - 'chan': input.chan, - 'host': input.mask}) + "values(?,?,?,?,?)", (input.nick.lower(), message_time, input.msg, + input.chan, input.mask)) db.commit() @@ -59,6 +56,7 @@ def resethistory(inp, input=None, conn=None): # wat return "There is no history for this channel." +"""seen.py: written by sklnd in about two beers July 2009""" @hook.command def seen(inp, nick='', chan='', db=None, input=None, conn=None): @@ -76,7 +74,7 @@ def seen(inp, nick='', chan='', db=None, input=None, conn=None): db_init(db, conn.name) last_seen = db.execute("select name, time, quote from seen_user where name" - " like :name and chan = :chan", {'name': inp, 'chan': chan}).fetchone() + " like ? 
and chan = ?", (inp, chan)).fetchone() if last_seen: reltime = timesince.timesince(last_seen[1]) diff --git a/plugins/horoscope.py b/disabled_stuff/horoscope.py similarity index 81% rename from plugins/horoscope.py rename to disabled_stuff/horoscope.py index 8f8e6b3..e4404cf 100644 --- a/plugins/horoscope.py +++ b/disabled_stuff/horoscope.py @@ -14,11 +14,6 @@ def db_init(db): db_ready = True -@hook.onload -def init(paraml, db=None): - db_init(db) - - @hook.command(autohelp=False) def horoscope(inp, db=None, notice=None, nick=None): """horoscope -- Get your horoscope.""" @@ -34,8 +29,8 @@ def horoscope(inp, db=None, notice=None, nick=None): db.execute("create table if not exists horoscope(nick primary key, sign)") if not sign: - sign = db.execute("select sign from horoscope where " - "nick=lower(:nick)", {'nick': nick}).fetchone() + sign = db.execute("select sign from horoscope where nick=lower(?)", + (nick,)).fetchone() if not sign: notice("horoscope -- Get your horoscope") return @@ -46,7 +41,7 @@ def horoscope(inp, db=None, notice=None, nick=None): title = soup.find_all('h1', {'class': 'h1b'})[1] horoscope_text = soup.find('div', {'class': 'fontdef1'}) - result = "\x02{}\x02 {}".format(title, horoscope_text) + result = u"\x02%s\x02 %s" % (title, horoscope_text) result = text.strip_html(result) #result = unicode(result, "utf8").replace('flight ','') @@ -54,8 +49,8 @@ def horoscope(inp, db=None, notice=None, nick=None): return "Could not get the horoscope for {}.".format(inp) if inp and not dontsave: - db.execute("insert or replace into horoscope(nick, sign) values (:nick, :sign)", - {'nick': nick.lower(), 'sign': sign}) + db.execute("insert or replace into horoscope(nick, sign) values (?,?)", + (nick.lower(), sign)) db.commit() return result diff --git a/plugins/hulu.py b/disabled_stuff/hulu.py similarity index 97% rename from plugins/hulu.py rename to disabled_stuff/hulu.py index b220380..74e6b00 100644 --- a/plugins/hulu.py +++ b/disabled_stuff/hulu.py @@ -1,4 +1,4 @@ -from urllib.parse import urlencode +from urllib import urlencode import re from util import hook, http, timeformat diff --git a/plugins/imdb.py b/disabled_stuff/imdb.py similarity index 100% rename from plugins/imdb.py rename to disabled_stuff/imdb.py diff --git a/plugins/imgur.py b/disabled_stuff/imgur.py similarity index 100% rename from plugins/imgur.py rename to disabled_stuff/imgur.py diff --git a/plugins/isup.py b/disabled_stuff/isup.py similarity index 80% rename from plugins/isup.py rename to disabled_stuff/isup.py index ff248a9..5fc95d6 100644 --- a/plugins/isup.py +++ b/disabled_stuff/isup.py @@ -1,4 +1,4 @@ -import urllib.parse +import urlparse from util import hook, http, urlnorm @@ -8,15 +8,14 @@ def isup(inp): """isup -- uses isup.me to see if a site is up or not""" # slightly overcomplicated, esoteric URL parsing - scheme, auth, path, query, fragment = urllib.parse.urlsplit(inp.strip()) + scheme, auth, path, query, fragment = urlparse.urlsplit(inp.strip()) domain = auth.encode('utf-8') or path.encode('utf-8') url = urlnorm.normalize(domain, assume_scheme="http") try: soup = http.get_soup('http://isup.me/' + domain) - except http.HTTPError as xxx_todo_changeme: - http.URLError = xxx_todo_changeme + except http.HTTPError, http.URLError: return "Could not get status." 
content = soup.find('div').text.strip() diff --git a/plugins/kernel.py b/disabled_stuff/kernel.py similarity index 100% rename from plugins/kernel.py rename to disabled_stuff/kernel.py diff --git a/plugins/kill.py b/disabled_stuff/kill.py similarity index 94% rename from plugins/kill.py rename to disabled_stuff/kill.py index 727274c..d25228e 100644 --- a/plugins/kill.py +++ b/disabled_stuff/kill.py @@ -26,7 +26,7 @@ def kill(inp, action=None, nick=None, conn=None, notice=None): "user": target } - with open("./data/kills.json") as f: + with open("plugins/data/kills.json") as f: generator = get_generator(f.read(), variables) # act out the message diff --git a/plugins/lastfm.py b/disabled_stuff/lastfm.py similarity index 84% rename from plugins/lastfm.py rename to disabled_stuff/lastfm.py index 36e2889..b928b1e 100644 --- a/plugins/lastfm.py +++ b/disabled_stuff/lastfm.py @@ -25,8 +25,8 @@ def lastfm(inp, nick='', db=None, bot=None, notice=None): db.execute("create table if not exists lastfm(nick primary key, acc)") if not user: - user = db.execute("select acc from lastfm where nick=lower(:nick)", - {'nick': nick}).fetchone() + user = db.execute("select acc from lastfm where nick=lower(?)", + (nick,)).fetchone() if not user: notice(lastfm.__doc__) return @@ -36,10 +36,10 @@ def lastfm(inp, nick='', db=None, bot=None, notice=None): api_key=api_key, user=user, limit=1) if 'error' in response: - return "Error: {}.".format(response["message"]) + return u"Error: {}.".format(response["message"]) if not "track" in response["recenttracks"] or len(response["recenttracks"]["track"]) == 0: - return 'No recent tracks for user "{}" found.'.format(user) + return u'No recent tracks for user "{}" found.'.format(user) tracks = response["recenttracks"]["track"] @@ -66,18 +66,18 @@ def lastfm(inp, nick='', db=None, bot=None, notice=None): album = track["album"]["#text"] artist = track["artist"]["#text"] - out = '{} {} "{}"'.format(user, status, title) + out = u'{} {} "{}"'.format(user, status, title) if artist: - out += " by \x02{}\x0f".format(artist) + out += u" by \x02{}\x0f".format(artist) if album: - out += " from the album \x02{}\x0f".format(album) + out += u" from the album \x02{}\x0f".format(album) # append ending based on what type it was out += ending if inp and not dontsave: - db.execute("insert or replace into lastfm(nick, acc) values " - "(:nick, :account)", {'nick': nick.lower(), 'account': user}) + db.execute("insert or replace into lastfm(nick, acc) values (?,?)", + (nick.lower(), user)) db.commit() return out diff --git a/plugins/lmgtfy.py b/disabled_stuff/lmgtfy.py similarity index 80% rename from plugins/lmgtfy.py rename to disabled_stuff/lmgtfy.py index c8dcee3..768075f 100644 --- a/plugins/lmgtfy.py +++ b/disabled_stuff/lmgtfy.py @@ -6,7 +6,7 @@ from util import hook, web, http def lmgtfy(inp): """lmgtfy [phrase] - Posts a google link for the specified phrase""" - link = "http://lmgtfy.com/?q={}".format(http.quote_plus(inp)) + link = u"http://lmgtfy.com/?q={}".format(http.quote_plus(inp)) try: return web.isgd(link) diff --git a/plugins/log.py b/disabled_stuff/log.py similarity index 93% rename from plugins/log.py rename to disabled_stuff/log.py index a08832f..d72dc1a 100644 --- a/plugins/log.py +++ b/disabled_stuff/log.py @@ -3,7 +3,6 @@ log.py: written by Scaevolus 2009 """ import os -import sys import codecs import time import re @@ -89,12 +88,12 @@ def get_log_fd(dir, server, chan): return fd -#@hook.singlethread +@hook.singlethread @hook.event('*') def log(paraml, input=None, bot=None): 
timestamp = gmtime(timestamp_format) - fd = get_log_fd(bot.data_dir, input.server, 'raw') + fd = get_log_fd(bot.persist_dir, input.server, 'raw') fd.write(timestamp + ' ' + input.raw + '\n') if input.command == 'QUIT': # these are temporary fixes until proper @@ -108,9 +107,7 @@ def log(paraml, input=None, bot=None): return if input.chan: - fd = get_log_fd(bot.data_dir, input.server, input.chan) + fd = get_log_fd(bot.persist_dir, input.server, input.chan) fd.write(timestamp + ' ' + beau + '\n') - out = "{} {} {}".format(timestamp, input.chan, beau) - - bot.logger.debug(out) + print timestamp, input.chan, beau.encode('utf8', 'ignore') diff --git a/plugins/lyrics.py b/disabled_stuff/lyrics.py similarity index 100% rename from plugins/lyrics.py rename to disabled_stuff/lyrics.py diff --git a/plugins/metacritic.py b/disabled_stuff/metacritic.py similarity index 98% rename from plugins/metacritic.py rename to disabled_stuff/metacritic.py index cac0546..92d0933 100644 --- a/plugins/metacritic.py +++ b/disabled_stuff/metacritic.py @@ -1,7 +1,7 @@ # metacritic.com scraper import re -from urllib.error import HTTPError +from urllib2 import HTTPError from util import hook, http diff --git a/plugins/minecraft_bukget.py b/disabled_stuff/minecraft_bukget.py similarity index 90% rename from plugins/minecraft_bukget.py rename to disabled_stuff/minecraft_bukget.py index 5777094..496f169 100644 --- a/plugins/minecraft_bukget.py +++ b/disabled_stuff/minecraft_bukget.py @@ -56,7 +56,7 @@ def plugin_random(): while not results: plugin_number = random.randint(1, count_total) - print("trying {}".format(plugin_number)) + print "trying {}".format(plugin_number) try: results = http.get_json(random_url.format(plugin_number)) except (http.HTTPError, http.URLError) as e: @@ -84,7 +84,7 @@ def format_output(data): description = text.truncate_str(data['description'], 30) url = data['website'] authors = data['authors'][0] - authors = authors[0] + "\u200b" + authors[1:] + authors = authors[0] + u"\u200b" + authors[1:] stage = data['stage'] current_version = data['versions'][0] @@ -97,11 +97,11 @@ def format_output(data): link = web.try_isgd(current_version['link']) if description: - line_a = "\x02{}\x02, by \x02{}\x02 - {} - ({}) \x02{}".format(name, authors, description, stage, url) + line_a = u"\x02{}\x02, by \x02{}\x02 - {} - ({}) \x02{}".format(name, authors, description, stage, url) else: - line_a = "\x02{}\x02, by \x02{}\x02 ({}) \x02{}".format(name, authors, stage, url) + line_a = u"\x02{}\x02, by \x02{}\x02 ({}) \x02{}".format(name, authors, stage, url) - line_b = "Last release: \x02v{}\x02 for \x02{}\x02 at {} \x02{}\x02".format(version_number, bukkit_versions, + line_b = u"Last release: \x02v{}\x02 for \x02{}\x02 at {} \x02{}\x02".format(version_number, bukkit_versions, last_update, link) return line_a, line_b diff --git a/plugins/minecraft_items.py b/disabled_stuff/minecraft_items.py similarity index 96% rename from plugins/minecraft_items.py rename to disabled_stuff/minecraft_items.py index 4646269..f1e94f9 100644 --- a/plugins/minecraft_items.py +++ b/disabled_stuff/minecraft_items.py @@ -24,7 +24,7 @@ class Recipe(object): return self.line -with open("./data/recipes.txt") as f: +with open("plugins/data/recipes.txt") as f: for line in f.readlines(): if line.startswith("//"): continue @@ -39,7 +39,7 @@ with open("./data/recipes.txt") as f: ids = [] -with open("./data/itemids.txt") as f: +with open("plugins/data/itemids.txt") as f: for line in f.readlines(): if line.startswith("//"): continue diff --git 
a/plugins/minecraft_ping.py b/disabled_stuff/minecraft_ping.py similarity index 83% rename from plugins/minecraft_ping.py rename to disabled_stuff/minecraft_ping.py index eab1190..978ca19 100644 --- a/plugins/minecraft_ping.py +++ b/disabled_stuff/minecraft_ping.py @@ -13,12 +13,12 @@ except ImportError: has_dns = False -mc_colors = [('\xa7f', '\x0300'), ('\xa70', '\x0301'), ('\xa71', '\x0302'), ('\xa72', '\x0303'), - ('\xa7c', '\x0304'), ('\xa74', '\x0305'), ('\xa75', '\x0306'), ('\xa76', '\x0307'), - ('\xa7e', '\x0308'), ('\xa7a', '\x0309'), ('\xa73', '\x0310'), ('\xa7b', '\x0311'), - ('\xa71', '\x0312'), ('\xa7d', '\x0313'), ('\xa78', '\x0314'), ('\xa77', '\x0315'), - ('\xa7l', '\x02'), ('\xa79', '\x0310'), ('\xa7o', '\t'), ('\xa7m', '\x13'), - ('\xa7r', '\x0f'), ('\xa7n', '\x15')] +mc_colors = [(u'\xa7f', u'\x0300'), (u'\xa70', u'\x0301'), (u'\xa71', u'\x0302'), (u'\xa72', u'\x0303'), + (u'\xa7c', u'\x0304'), (u'\xa74', u'\x0305'), (u'\xa75', u'\x0306'), (u'\xa76', u'\x0307'), + (u'\xa7e', u'\x0308'), (u'\xa7a', u'\x0309'), (u'\xa73', u'\x0310'), (u'\xa7b', u'\x0311'), + (u'\xa71', u'\x0312'), (u'\xa7d', u'\x0313'), (u'\xa78', u'\x0314'), (u'\xa77', u'\x0315'), + (u'\xa7l', u'\x02'), (u'\xa79', u'\x0310'), (u'\xa7o', u'\t'), (u'\xa7m', u'\x13'), + (u'\xa7r', u'\x0f'), (u'\xa7n', u'\x15')] ## EXCEPTIONS @@ -98,9 +98,9 @@ def mcping_modern(host, port): try: version = data["version"]["name"] try: - desc = " ".join(data["description"]["text"].split()) + desc = u" ".join(data["description"]["text"].split()) except TypeError: - desc = " ".join(data["description"].split()) + desc = u" ".join(data["description"].split()) max_players = data["players"]["max"] online = data["players"]["online"] except Exception as e: @@ -136,10 +136,10 @@ def mcping_legacy(host, port): length = struct.unpack('!h', sock.recv(2))[0] values = sock.recv(length * 2).decode('utf-16be') - data = values.split('\x00') # try to decode data using new format + data = values.split(u'\x00') # try to decode data using new format if len(data) == 1: # failed to decode data, server is using old format - data = values.split('\xa7') + data = values.split(u'\xa7') output = { "motd": format_colors(" ".join(data[0].split())), "motd_raw": data[0], @@ -199,17 +199,17 @@ def parse_input(inp): def format_colors(motd): for original, replacement in mc_colors: motd = motd.replace(original, replacement) - motd = motd.replace("\xa7k", "") + motd = motd.replace(u"\xa7k", "") return motd def format_output(data): if data["version"]: - return "{motd}\x0f - {version}\x0f - {players}/{players_max}" \ - " players.".format(**data).replace("\n", "\x0f - ") + return u"{motd}\x0f - {version}\x0f - {players}/{players_max}" \ + u" players.".format(**data).replace("\n", u"\x0f - ") else: - return "{motd}\x0f - {players}/{players_max}" \ - " players.".format(**data).replace("\n", "\x0f - ") + return u"{motd}\x0f - {players}/{players_max}" \ + u" players.".format(**data).replace("\n", u"\x0f - ") @hook.command diff --git a/plugins/minecraft_status.py b/disabled_stuff/minecraft_status.py similarity index 96% rename from plugins/minecraft_status.py rename to disabled_stuff/minecraft_status.py index 0a57ab0..4ca67d3 100644 --- a/plugins/minecraft_status.py +++ b/disabled_stuff/minecraft_status.py @@ -21,7 +21,7 @@ def mcstatus(inp): green = [] yellow = [] red = [] - for server, status in list(data.items()): + for server, status in data.items(): if status == "green": green.append(server) elif status == "yellow": diff --git a/plugins/minecraft_user.py 
b/disabled_stuff/minecraft_user.py similarity index 84% rename from plugins/minecraft_user.py rename to disabled_stuff/minecraft_user.py index 525c41b..4026994 100644 --- a/plugins/minecraft_user.py +++ b/disabled_stuff/minecraft_user.py @@ -87,15 +87,15 @@ def mcuser(inp): profile["lt"] = ", legacy" if profile["legacy"] else "" if profile["paid"]: - return "The account \x02{name}\x02 ({id}{lt}) exists. It is a \x02paid\x02" \ - " account.".format(**profile) + return u"The account \x02{name}\x02 ({id}{lt}) exists. It is a \x02paid\x02" \ + u" account.".format(**profile) else: - return "The account \x02{name}\x02 ({id}{lt}) exists. It \x034\x02is NOT\x02\x0f a paid" \ - " account.".format(**profile) + return u"The account \x02{name}\x02 ({id}{lt}) exists. It \x034\x02is NOT\x02\x0f a paid" \ + u" account.".format(**profile) elif name_status == "free": - return "The account \x02{}\x02 does not exist.".format(user) + return u"The account \x02{}\x02 does not exist.".format(user) elif name_status == "invalid": - return "The name \x02{}\x02 contains invalid characters.".format(user) + return u"The name \x02{}\x02 contains invalid characters.".format(user) else: # if you see this, panic return "Unknown Error." \ No newline at end of file diff --git a/plugins/minecraft_wiki.py b/disabled_stuff/minecraft_wiki.py similarity index 96% rename from plugins/minecraft_wiki.py rename to disabled_stuff/minecraft_wiki.py index 4101aae..072a8ac 100644 --- a/plugins/minecraft_wiki.py +++ b/disabled_stuff/minecraft_wiki.py @@ -45,7 +45,7 @@ def mcwiki(inp): summary = " ".join(p.text_content().splitlines()) summary = re.sub("\[\d+\]", "", summary) summary = text.truncate_str(summary, 200) - return "{} :: {}".format(summary, url) + return u"{} :: {}".format(summary, url) # this shouldn't happen return "Unknown Error." diff --git a/plugins/mlia.py b/disabled_stuff/mlia.py similarity index 100% rename from plugins/mlia.py rename to disabled_stuff/mlia.py diff --git a/disabled_stuff/mtg.py b/disabled_stuff/mtg.py new file mode 100644 index 0000000..3db8306 --- /dev/null +++ b/disabled_stuff/mtg.py @@ -0,0 +1,183 @@ +import re + +from util import hook, http + + +@hook.command +def mtg(inp): + ".mtg <name> -- Gets information about Magic the Gathering card <name>." + + url = 'http://magiccards.info/query?v=card&s=cname' + h = http.get_html(url, q=inp) + + name = h.find('body/table/tr/td/span/a') + if name is None: + return "No cards found :(" + card = name.getparent().getparent().getparent() + + type = card.find('td/p').text.replace('\n', '') + + # this is ugly + text = http.html.tostring(card.xpath("//p[@class='ctext']/b")[0]) + text = text.replace('<br>
', '$') + text = http.html.fromstring(text).text_content() + text = re.sub(r'(\w+\s*)\$+(\s*\w+)', r'\1. \2', text) + text = text.replace('$', ' ') + text = re.sub(r'\(.*?\)', '', text) # strip parenthetical explanations + text = re.sub(r'\.(\S)', r'. \1', text) # fix spacing + + printings = card.find('td/small').text_content() + printings = re.search(r'Editions:(.*)Languages:', printings).group(1) + printings = re.findall(r'\s*(.+?(?: \([^)]+\))*) \((.*?)\)', + ' '.join(printings.split())) + + printing_out = ', '.join('%s (%s)' % (set_abbrevs.get(x[0], x[0]), + rarity_abbrevs.get(x[1], x[1])) + for x in printings) + + name.make_links_absolute(base_url=url) + link = name.attrib['href'] + name = name.text_content().strip() + type = type.strip() + text = ' '.join(text.split()) + + return ' | '.join((name, type, text, printing_out, link)) + + +set_abbrevs = { + '15th Anniversary': '15ANN', + 'APAC Junior Series': 'AJS', + 'Alara Reborn': 'ARB', + 'Alliances': 'AI', + 'Anthologies': 'AT', + 'Antiquities': 'AQ', + 'Apocalypse': 'AP', + 'Arabian Nights': 'AN', + 'Arena League': 'ARENA', + 'Asia Pacific Land Program': 'APAC', + 'Battle Royale': 'BR', + 'Battle Royale Box Set': 'BRB', + 'Beatdown': 'BTD', + 'Beatdown Box Set': 'BTD', + 'Betrayers of Kamigawa': 'BOK', + 'Celebration Cards': 'UQC', + 'Champions of Kamigawa': 'CHK', + 'Champs': 'CP', + 'Chronicles': 'CH', + 'Classic Sixth Edition': '6E', + 'Coldsnap': 'CS', + 'Coldsnap Theme Decks': 'CSTD', + 'Conflux': 'CFX', + 'Core Set - Eighth Edition': '8E', + 'Core Set - Ninth Edition': '9E', + 'Darksteel': 'DS', + 'Deckmasters': 'DM', + 'Dissension': 'DI', + 'Dragon Con': 'DRC', + 'Duel Decks: Divine vs. Demonic': 'DVD', + 'Duel Decks: Elves vs. Goblins': 'EVG', + 'Duel Decks: Garruk vs. Liliana': 'GVL', + 'Duel Decks: Jace vs. 
Chandra': 'JVC', + 'Eighth Edition': '8ED', + 'Eighth Edition Box Set': '8EB', + 'European Land Program': 'EURO', + 'Eventide': 'EVE', + 'Exodus': 'EX', + 'Fallen Empires': 'FE', + 'Fifth Dawn': '5DN', + 'Fifth Edition': '5E', + 'Fourth Edition': '4E', + 'Friday Night Magic': 'FNMP', + 'From the Vault: Dragons': 'FVD', + 'From the Vault: Exiled': 'FVE', + 'Future Sight': 'FUT', + 'Gateway': 'GRC', + 'Grand Prix': 'GPX', + 'Guildpact': 'GP', + 'Guru': 'GURU', + 'Happy Holidays': 'HHO', + 'Homelands': 'HL', + 'Ice Age': 'IA', + 'Introductory Two-Player Set': 'ITP', + 'Invasion': 'IN', + 'Judge Gift Program': 'JR', + 'Judgment': 'JU', + 'Junior Series': 'JSR', + 'Legend Membership': 'DCILM', + 'Legends': 'LG', + 'Legions': 'LE', + 'Limited Edition (Alpha)': 'LEA', + 'Limited Edition (Beta)': 'LEB', + 'Limited Edition Alpha': 'LEA', + 'Limited Edition Beta': 'LEB', + 'Lorwyn': 'LW', + 'MTGO Masters Edition': 'MED', + 'MTGO Masters Edition II': 'ME2', + 'MTGO Masters Edition III': 'ME3', + 'Magic 2010': 'M10', + 'Magic Game Day Cards': 'MGDC', + 'Magic Player Rewards': 'MPRP', + 'Magic Scholarship Series': 'MSS', + 'Magic: The Gathering Launch Parties': 'MLP', + 'Media Inserts': 'MBP', + 'Mercadian Masques': 'MM', + 'Mirage': 'MR', + 'Mirrodin': 'MI', + 'Morningtide': 'MT', + 'Multiverse Gift Box Cards': 'MGBC', + 'Nemesis': 'NE', + 'Ninth Edition Box Set': '9EB', + 'Odyssey': 'OD', + 'Onslaught': 'ON', + 'Planar Chaos': 'PC', + 'Planechase': 'PCH', + 'Planeshift': 'PS', + 'Portal': 'PO', + 'Portal Demogame': 'POT', + 'Portal Second Age': 'PO2', + 'Portal Three Kingdoms': 'P3K', + 'Premium Deck Series: Slivers': 'PDS', + 'Prerelease Events': 'PTC', + 'Pro Tour': 'PRO', + 'Prophecy': 'PR', + 'Ravnica: City of Guilds': 'RAV', + 'Release Events': 'REP', + 'Revised Edition': 'RV', + 'Saviors of Kamigawa': 'SOK', + 'Scourge': 'SC', + 'Seventh Edition': '7E', + 'Shadowmoor': 'SHM', + 'Shards of Alara': 'ALA', + 'Starter': 'ST', + 'Starter 1999': 'S99', + 'Starter 2000 Box Set': 'ST2K', + 'Stronghold': 'SH', + 'Summer of Magic': 'SOM', + 'Super Series': 'SUS', + 'Tempest': 'TP', + 'Tenth Edition': '10E', + 'The Dark': 'DK', + 'Time Spiral': 'TS', + 'Time Spiral Timeshifted': 'TSTS', + 'Torment': 'TR', + 'Two-Headed Giant Tournament': 'THGT', + 'Unglued': 'UG', + 'Unhinged': 'UH', + 'Unhinged Alternate Foils': 'UHAA', + 'Unlimited Edition': 'UN', + "Urza's Destiny": 'UD', + "Urza's Legacy": 'UL', + "Urza's Saga": 'US', + 'Visions': 'VI', + 'Weatherlight': 'WL', + 'Worlds': 'WRL', + 'WotC Online Store': 'WOTC', + 'Zendikar': 'ZEN'} + +rarity_abbrevs = { + 'Land': 'L', + 'Common': 'C', + 'Uncommon': 'UC', + 'Rare': 'R', + 'Special': 'S', + 'Mythic Rare': 'MR'} diff --git a/disabled_stuff/mygengo_translate.py b/disabled_stuff/mygengo_translate.py new file mode 100644 index 0000000..6e7b006 --- /dev/null +++ b/disabled_stuff/mygengo_translate.py @@ -0,0 +1,115 @@ +# BING translation plugin by Lukeroge and neersighted +from util import hook +from util import http +import re +import htmlentitydefs +import mygengo + +gengo = mygengo.MyGengo( + public_key='PlwtF1CZ2tu27IdX_SXNxTFmfN0j|_-pJ^Rf({O-oLl--r^QM4FygRdt^jusSSDE', + private_key='wlXpL=SU[#JpPu[dQaf$v{S3@rg[=95$$TA(k$sb3_6~B_zDKkTbd4#hXxaorIae', + sandbox=False, +) + +def gengo_translate(text, source, target): + try: + translation = gengo.postTranslationJob(job={ + 'type': 'text', + 'slug': 'Translating '+source+' to '+target+' with the myGengo API', + 'body_src': text, + 'lc_src': source, + 'lc_tgt': target, + 'tier': 'machine', + }) + translated = 
translation['response']['job']['body_tgt'] + return u"(%s > %s) %s" % (source, target, translated) + except mygengo.MyGengoError: + return "error: could not translate" + +def match_language(fragment): + fragment = fragment.lower() + for short, _ in lang_pairs: + if fragment in short.lower().split(): + return short.split()[0] + + for short, full in lang_pairs: + if fragment in full.lower(): + return short.split()[0] + return None + +@hook.command +def translate(inp): + ".translate <source> <target> <sentence> -- Translates <sentence> from <source> to <target> using MyGengo." + args = inp.split(' ') + sl = match_language(args[0]) + tl = match_language(args[1]) + txt = unicode(" ".join(args[2:])) + if sl and tl: + return unicode(gengo_translate(txt, sl, tl)) + else: + return "error: translate could not reliably determine one or both languages" + +languages = 'ja fr de ko ru zh'.split() +language_pairs = zip(languages[:-1], languages[1:]) +lang_pairs = [ + ("no", "Norwegian"), + ("it", "Italian"), + ("ht", "Haitian Creole"), + ("af", "Afrikaans"), + ("sq", "Albanian"), + ("ar", "Arabic"), + ("hy", "Armenian"), + ("az", "Azerbaijani"), + ("eu", "Basque"), + ("be", "Belarusian"), + ("bg", "Bulgarian"), + ("ca", "Catalan"), + ("zh-CN zh", "Chinese"), + ("hr", "Croatian"), + ("cs cz", "Czech"), + ("da dk", "Danish"), + ("nl", "Dutch"), + ("en", "English"), + ("et", "Estonian"), + ("tl", "Filipino"), + ("fi", "Finnish"), + ("fr", "French"), + ("gl", "Galician"), + ("ka", "Georgian"), + ("de", "German"), + ("el", "Greek"), + ("ht", "Haitian Creole"), + ("iw", "Hebrew"), + ("hi", "Hindi"), + ("hu", "Hungarian"), + ("is", "Icelandic"), + ("id", "Indonesian"), + ("ga", "Irish"), + ("it", "Italian"), + ("ja jp jpn", "Japanese"), + ("ko", "Korean"), + ("lv", "Latvian"), + ("lt", "Lithuanian"), + ("mk", "Macedonian"), + ("ms", "Malay"), + ("mt", "Maltese"), + ("no", "Norwegian"), + ("fa", "Persian"), + ("pl", "Polish"), + ("pt", "Portuguese"), + ("ro", "Romanian"), + ("ru", "Russian"), + ("sr", "Serbian"), + ("sk", "Slovak"), + ("sl", "Slovenian"), + ("es", "Spanish"), + ("sw", "Swahili"), + ("sv", "Swedish"), + ("th", "Thai"), + ("tr", "Turkish"), + ("uk", "Ukrainian"), + ("ur", "Urdu"), + ("vi", "Vietnamese"), + ("cy", "Welsh"), + ("yi", "Yiddish") +] diff --git a/plugins/namegen.py b/disabled_stuff/namegen.py similarity index 94% rename from plugins/namegen.py rename to disabled_stuff/namegen.py index 5f5a169..7a1f0e6 100644 --- a/plugins/namegen.py +++ b/disabled_stuff/namegen.py @@ -14,12 +14,12 @@ def get_generator(_json): @hook.command(autohelp=False) -def namegen(input, instance, bot): +def namegen(inp, notice=None): """namegen [generator] -- Generates some names using the chosen generator. 
'namegen list' will display a list of all generators.""" # clean up the input - inp = input.text.strip().lower() + inp = inp.strip().lower() # get a list of available name generators files = os.listdir(GEN_DIR) @@ -33,7 +33,7 @@ def namegen(input, instance, bot): if inp == "list": message = "Available generators: " message += text.get_text_list(all_modules, 'and') - input.notice(message) + notice(message) return if inp: diff --git a/plugins/newegg.py b/disabled_stuff/newegg.py similarity index 91% rename from plugins/newegg.py rename to disabled_stuff/newegg.py index d0c4b32..68d604d 100644 --- a/plugins/newegg.py +++ b/disabled_stuff/newegg.py @@ -46,18 +46,18 @@ def format_item(item, show_url=True): tags.append("\x02Featured\x02") if item["IsShellShockerItem"]: - tags.append("\x02SHELL SHOCKER\u00AE\x02") + tags.append(u"\x02SHELL SHOCKER\u00AE\x02") # join all the tags together in a comma separated string ("tag1, tag2, tag3") - tag_text = ", ".join(tags) + tag_text = u", ".join(tags) if show_url: # create the item URL and shorten it url = web.try_isgd(ITEM_URL.format(item["NeweggItemNumber"])) - return "\x02{}\x02 ({}) - {} - {} - {}".format(title, price, rating, + return u"\x02{}\x02 ({}) - {} - {} - {}".format(title, price, rating, tag_text, url) else: - return "\x02{}\x02 ({}) - {} - {}".format(title, price, rating, + return u"\x02{}\x02 ({}) - {} - {}".format(title, price, rating, tag_text) diff --git a/plugins/newgrounds.py b/disabled_stuff/newgrounds.py similarity index 90% rename from plugins/newgrounds.py rename to disabled_stuff/newgrounds.py index 94c115f..b26ffe4 100644 --- a/plugins/newgrounds.py +++ b/disabled_stuff/newgrounds.py @@ -15,7 +15,7 @@ def test(s): def newgrounds_url(match): location = match.group(4).split("/")[-1] if not test(location): - print("Not a valid Newgrounds portal ID. Example: http://www.newgrounds.com/portal/view/593993") + print "Not a valid Newgrounds portal ID. 
Example: http://www.newgrounds.com/portal/view/593993" return None soup = http.get_soup("http://www.newgrounds.com/portal/view/" + location) @@ -31,7 +31,7 @@ def newgrounds_url(match): # get rating try: rating_info = soup.find('dd', {'class': 'star-variable'})['title'].split("Stars –")[0].strip() - rating = " - rated \x02{}\x02/\x025.0\x02".format(rating_info) + rating = u" - rated \x02{}\x02/\x025.0\x02".format(rating_info) except: rating = "" diff --git a/plugins/notes.py b/disabled_stuff/notes.py similarity index 100% rename from plugins/notes.py rename to disabled_stuff/notes.py diff --git a/plugins/osrc.py b/disabled_stuff/osrc.py similarity index 100% rename from plugins/osrc.py rename to disabled_stuff/osrc.py diff --git a/plugins/password.py b/disabled_stuff/password.py similarity index 96% rename from plugins/password.py rename to disabled_stuff/password.py index a4db2f3..34a379b 100644 --- a/plugins/password.py +++ b/disabled_stuff/password.py @@ -29,7 +29,7 @@ def password(inp, notice=None): # add numbers if "numeric" in inp or "number" in inp: - okay = okay + [str(x) for x in range(0, 10)] + okay = okay + [str(x) for x in xrange(0, 10)] # add symbols if "symbol" in inp: diff --git a/plugins/plpaste.py b/disabled_stuff/plpaste.py similarity index 100% rename from plugins/plpaste.py rename to disabled_stuff/plpaste.py diff --git a/plugins/potato.py b/disabled_stuff/potato.py similarity index 100% rename from plugins/potato.py rename to disabled_stuff/potato.py diff --git a/plugins/pre.py b/disabled_stuff/pre.py similarity index 97% rename from plugins/pre.py rename to disabled_stuff/pre.py index e346195..f4e61a3 100644 --- a/plugins/pre.py +++ b/disabled_stuff/pre.py @@ -36,5 +36,3 @@ def pre(inp): size = '' return '{} - {}{} - {} ({} ago)'.format(section, name, size, date_string, since) - -print(pre("top gear")) diff --git a/plugins/python.py b/disabled_stuff/python.py similarity index 100% rename from plugins/python.py rename to disabled_stuff/python.py diff --git a/plugins/qrcode.py b/disabled_stuff/qrcode.py similarity index 100% rename from plugins/qrcode.py rename to disabled_stuff/qrcode.py diff --git a/plugins/quote.py b/disabled_stuff/quote.py similarity index 100% rename from plugins/quote.py rename to disabled_stuff/quote.py diff --git a/plugins/rdio.py b/disabled_stuff/rdio.py similarity index 80% rename from plugins/rdio.py rename to disabled_stuff/rdio.py index 870b99d..2677090 100644 --- a/plugins/rdio.py +++ b/disabled_stuff/rdio.py @@ -1,4 +1,4 @@ -import urllib.request, urllib.parse, urllib.error +import urllib import json import re @@ -11,7 +11,7 @@ def getdata(inp, types, api_key, api_secret): consumer = oauth.Consumer(api_key, api_secret) client = oauth.Client(consumer) response = client.request('http://api.rdio.com/1/', 'POST', - urllib.parse.urlencode({'method': 'search', 'query': inp, 'types': types, 'count': '1'})) + urllib.urlencode({'method': 'search', 'query': inp, 'types': types, 'count': '1'})) data = json.loads(response[1]) return data @@ -34,16 +34,16 @@ def rdio(inp, bot=None): artist = info['artist'] album = info['album'] url = info['shortUrl'] - return "\x02{}\x02 by \x02{}\x02 - {} {}".format(name, artist, album, url) + return u"\x02{}\x02 by \x02{}\x02 - {} {}".format(name, artist, album, url) elif 'artist' in info and not 'album' in info: # Album name = info['name'] artist = info['artist'] url = info['shortUrl'] - return "\x02{}\x02 by \x02{}\x02 - {}".format(name, artist, url) + return u"\x02{}\x02 by \x02{}\x02 - {}".format(name, artist, 
url) else: # Artist name = info['name'] url = info['shortUrl'] - return "\x02{}\x02 - {}".format(name, url) + return u"\x02{}\x02 - {}".format(name, url) @hook.command @@ -62,7 +62,7 @@ def rdiot(inp, bot=None): artist = info['artist'] album = info['album'] url = info['shortUrl'] - return "\x02{}\x02 by \x02{}\x02 - {} - {}".format(name, artist, album, url) + return u"\x02{}\x02 by \x02{}\x02 - {} - {}".format(name, artist, album, url) @hook.command @@ -79,7 +79,7 @@ def rdioar(inp, bot=None): return "No results." name = info['name'] url = info['shortUrl'] - return "\x02{}\x02 - {}".format(name, url) + return u"\x02{}\x02 - {}".format(name, url) @hook.command @@ -97,7 +97,7 @@ def rdioal(inp, bot=None): name = info['name'] artist = info['artist'] url = info['shortUrl'] - return "\x02{}\x02 by \x02{}\x02 - {}".format(name, artist, url) + return u"\x02{}\x02 by \x02{}\x02 - {}".format(name, artist, url) rdio_re = (r'(.*:)//(rd.io|www.rdio.com|rdio.com)(:[0-9]+)?(.*)', re.I) @@ -113,7 +113,7 @@ def rdio_url(match, bot=None): consumer = oauth.Consumer(api_key, api_secret) client = oauth.Client(consumer) response = client.request('http://api.rdio.com/1/', 'POST', - urllib.parse.urlencode({'method': 'getObjectFromUrl', 'url': url})) + urllib.urlencode({'method': 'getObjectFromUrl', 'url': url})) data = json.loads(response[1]) info = data['result'] if 'name' in info: @@ -121,11 +121,11 @@ def rdio_url(match, bot=None): name = info['name'] artist = info['artist'] album = info['album'] - return "Rdio track: \x02{}\x02 by \x02{}\x02 - {}".format(name, artist, album) + return u"Rdio track: \x02{}\x02 by \x02{}\x02 - {}".format(name, artist, album) elif 'artist' in info and not 'album' in info: # Album name = info['name'] artist = info['artist'] - return "Rdio album: \x02{}\x02 by \x02{}\x02".format(name, artist) + return u"Rdio album: \x02{}\x02 by \x02{}\x02".format(name, artist) else: # Artist name = info['name'] - return "Rdio artist: \x02{}\x02".format(name) + return u"Rdio artist: \x02{}\x02".format(name) diff --git a/disabled_stuff/recipe.py b/disabled_stuff/recipe.py new file mode 100644 index 0000000..0e04572 --- /dev/null +++ b/disabled_stuff/recipe.py @@ -0,0 +1,106 @@ +import random + +from util import hook, http, web + +metadata_url = "http://omnidator.appspot.com/microdata/json/?url={}" + +base_url = "http://www.cookstr.com" +search_url = base_url + "/searches" +random_url = search_url + "/surprise" + +# set this to true to censor this plugin! 
+censor = True +phrases = [ + u"EAT SOME FUCKING \x02{}\x02", + u"YOU WON'T NOT MAKE SOME FUCKING \x02{}\x02", + u"HOW ABOUT SOME FUCKING \x02{}?\x02", + u"WHY DON'T YOU EAT SOME FUCKING \x02{}?\x02", + u"MAKE SOME FUCKING \x02{}\x02", + u"INDUCE FOOD COMA WITH SOME FUCKING \x02{}\x02" +] + +clean_key = lambda i: i.split("#")[1] + + +class ParseError(Exception): + pass + + +def get_data(url): + """ Uses the omnidator API to parse the metadata from the provided URL """ + try: + omni = http.get_json(metadata_url.format(url)) + except (http.HTTPError, http.URLError) as e: + raise ParseError(e) + schemas = omni["@"] + for d in schemas: + if d["a"] == "<http://schema.org/Recipe>": + data = {clean_key(key): value for (key, value) in d.iteritems() + if key.startswith("http://schema.org/Recipe")} + return data + raise ParseError("No recipe data found") + + +@hook.command(autohelp=False) +def recipe(inp): + """recipe [term] - Gets a recipe for [term], or gets a random recipe if [term] is not provided""" + if inp: + # get the recipe URL by searching + try: + search = http.get_soup(search_url, query=inp.strip()) + except (http.HTTPError, http.URLError) as e: + return "Could not get recipe: {}".format(e) + + # find the list of results + result_list = search.find('div', {'class': 'found_results'}) + + if result_list: + results = result_list.find_all('div', {'class': 'recipe_result'}) + else: + return "No results" + + # pick a random front page result + result = random.choice(results) + + # extract the URL from the result + url = base_url + result.find('div', {'class': 'image-wrapper'}).find('a')['href'] + + else: + # get a random recipe URL + try: + page = http.open(random_url) + except (http.HTTPError, http.URLError) as e: + return "Could not get recipe: {}".format(e) + url = page.geturl() + + # use get_data() to get the recipe info from the URL + try: + data = get_data(url) + except ParseError as e: + return "Could not parse recipe: {}".format(e) + + name = data["name"].strip() + return u"Try eating \x02{}!\x02 - {}".format(name, web.try_isgd(url)) + + +@hook.command(autohelp=False) +def dinner(inp): + """dinner - WTF IS FOR DINNER""" + try: + page = http.open(random_url) + except (http.HTTPError, http.URLError) as e: + return "Could not get recipe: {}".format(e) + url = page.geturl() + + try: + data = get_data(url) + except ParseError as e: + return "Could not parse recipe: {}".format(e) + + name = data["name"].strip().upper() + text = random.choice(phrases).format(name) + + if censor: + text = text.replace("FUCK", "F**K") + + return u"{} - {}".format(text, web.try_isgd(url)) diff --git a/plugins/reddit.py b/disabled_stuff/reddit.py similarity index 93% rename from plugins/reddit.py rename to disabled_stuff/reddit.py index 11907f3..80fcb76 100644 --- a/plugins/reddit.py +++ b/disabled_stuff/reddit.py @@ -22,7 +22,7 @@ def reddit_url(match): timeago = thread.xpath("//div[@id='siteTable']//p[@class='tagline']/time/text()")[0] comments = thread.xpath("//div[@id='siteTable']//a[@class='comments']/text()")[0] - return '\x02{}\x02 - posted by \x02{}\x02 {} ago - {} upvotes, {} downvotes - {}'.format( + return u'\x02{}\x02 - posted by \x02{}\x02 {} ago - {} upvotes, {} downvotes - {}'.format( title, author, timeago, upvotes, downvotes, comments) @@ -74,6 +74,6 @@ def reddit(inp): else: item["warning"] = "" - return "\x02{title} : {subreddit}\x02 - posted by \x02{author}\x02" \ + return u"\x02{title} : {subreddit}\x02 - posted by \x02{author}\x02" \ " {timesince} ago - {ups} upvotes, {downs} downvotes -" \ " {link}{warning}".format(**item)
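A pattern worth noting, since it recurs throughout this changeset (the reddit.py hunk above, and the geoip, lastfm, newegg and minecraft hunks earlier): format-string templates gain a u'' prefix wherever they interpolate text that comes back from the network. Under Python 2, str and unicode only mix safely while everything is ASCII; as soon as non-ASCII data meets the implicit ASCII codec, formatting raises. A minimal standalone sketch of the pitfall and the fix (illustrative only, not code from this repo):

    # -*- coding: utf-8 -*-
    # Python 2: keep templates and interpolated values unicode end to end.
    raw = 'caf\xc3\xa9'                   # UTF-8 bytes, e.g. straight off the wire

    try:
        u'\x02{}\x02'.format(raw)         # bytes forced through the implicit ascii codec
    except UnicodeDecodeError as e:
        print 'implicit ascii decode failed: %s' % e

    # Decode once at the boundary, then format with a unicode template:
    print u'\x02{}\x02'.format(raw.decode('utf-8')).encode('utf-8')

Keeping the templates unicode, as these hunks do, means a non-ASCII nick, title or MOTD no longer depends on where Python happens to guess an encoding.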
{link}{warning}".format(**item) diff --git a/disabled_stuff/regex_chans.py b/disabled_stuff/regex_chans.py new file mode 100644 index 0000000..c16c250 --- /dev/null +++ b/disabled_stuff/regex_chans.py @@ -0,0 +1,128 @@ +from util import hook + + +# Default value. +# If True, all channels without a setting will have regex enabled +# If False, all channels without a setting will have regex disabled +default_enabled = True + +db_ready = False + + +def db_init(db): + global db_ready + if not db_ready: + db.execute("CREATE TABLE IF NOT EXISTS regexchans(channel PRIMARY KEY, status)") + db.commit() + db_ready = True + + +def get_status(db, channel): + row = db.execute("SELECT status FROM regexchans WHERE channel = ?", [channel]).fetchone() + if row: + return row[0] + else: + return None + + +def set_status(db, channel, status): + row = db.execute("REPLACE INTO regexchans (channel, status) VALUES(?, ?)", [channel, status]) + db.commit() + + +def delete_status(db, channel): + row = db.execute("DELETE FROM regexchans WHERE channel = ?", [channel]) + db.commit() + + +def list_status(db): + row = db.execute("SELECT * FROM regexchans").fetchall() + result = None + for values in row: + if result: + result += u", {}: {}".format(values[0], values[1]) + else: + result = u"{}: {}".format(values[0], values[1]) + return result + + +@hook.sieve +def sieve_regex(bot, inp, func, kind, args): + db = bot.get_db_connection(inp.conn) + db_init(db) + if kind == 'regex' and inp.chan.startswith("#") and func.__name__ != 'factoid': + chanstatus = get_status(db, inp.chan) + if chanstatus != "ENABLED" and (chanstatus == "DISABLED" or not default_enabled): + print u"Denying input.raw={}, kind={}, args={} from {}".format(inp.raw, kind, args, inp.chan) + return None + print u"Allowing input.raw={}, kind={}, args={} from {}".format(inp.raw, kind, args, inp.chan) + + return inp + + +@hook.command(permissions=["botcontrol"]) +def enableregex(inp, db=None, message=None, notice=None, chan=None, nick=None): + db_init(db) + inp = inp.strip().lower() + if not inp: + channel = chan + elif inp.startswith("#"): + channel = inp + else: + channel = u"#{}".format(inp) + + message(u"Enabling regex matching (youtube, etc) (issued by {})".format(nick), target=channel) + notice(u"Enabling regex matching (youtube, etc) in channel {}".format(channel)) + set_status(db, channel, "ENABLED") + + +@hook.command(permissions=["botcontrol"]) +def disableregex(inp, db=None, message=None, notice=None, chan=None, nick=None): + db_init(db) + inp = inp.strip().lower() + if not inp: + channel = chan + elif inp.startswith("#"): + channel = inp + else: + channel = u"#{}".format(inp) + + message(u"Disabling regex matching (youtube, etc) (issued by {})".format(nick), target=channel) + notice(u"Disabling regex matching (youtube, etc) in channel {}".format(channel)) + set_status(db, channel, "DISABLED") + + +@hook.command(permissions=["botcontrol"]) +def resetregex(inp, db=None, message=None, notice=None, chan=None, nick=None): + db_init(db) + inp = inp.strip().lower() + if not inp: + channel = chan + elif inp.startswith("#"): + channel = inp + else: + channel = u"#{}".format(inp) + + message(u"Resetting regex matching setting (youtube, etc) (issued by {})".format(nick), target=channel) + notice(u"Resetting regex matching setting (youtube, etc) in channel {}".format(channel)) + delete_status(db, channel) + + +@hook.command(permissions=["botcontrol"]) +def regexstatus(inp, db=None, chan=None): + db_init(db) + inp = inp.strip().lower() + if not inp: + channel = 
chan + elif inp.startswith("#"): + channel = inp + else: + channel = u"#{}".format(inp) + + return u"Regex status for {}: {}".format(channel, get_status(db, channel)) + + +@hook.command(permissions=["botcontrol"]) +def listregex(inp, db=None): + db_init(db) + return list_status(db) diff --git a/disabled_stuff/religion.py b/disabled_stuff/religion.py new file mode 100644 index 0000000..552b23f --- /dev/null +++ b/disabled_stuff/religion.py @@ -0,0 +1,38 @@ +from util import hook, http + + +@hook.command('god') +@hook.command +def bible(inp): + """.bible -- gets from the Bible (ESV)""" + + base_url = ('http://www.esvapi.org/v2/rest/passageQuery?key=IP&' + 'output-format=plain-text&include-heading-horizontal-lines&' + 'include-headings=false&include-passage-horizontal-lines=false&' + 'include-passage-references=false&include-short-copyright=false&' + 'include-footnotes=false&line-length=0&' + 'include-heading-horizontal-lines=false') + + text = http.get(base_url, passage=inp) + + text = ' '.join(text.split()) + + if len(text) > 400: + text = text[:text.rfind(' ', 0, 400)] + '...' + + return text + + +@hook.command('allah') +@hook.command +def koran(inp): # Koran look-up plugin by Ghetto Wizard + """.koran -- gets from the Koran""" + + url = 'http://quod.lib.umich.edu/cgi/k/koran/koran-idx?type=simple' + + results = http.get_html(url, q1=inp).xpath('//li') + + if not results: + return 'No results for ' + inp + + return results[0].text_content() diff --git a/disabled_stuff/repaste.py b/disabled_stuff/repaste.py new file mode 100644 index 0000000..1443345 --- /dev/null +++ b/disabled_stuff/repaste.py @@ -0,0 +1,180 @@ +from util import hook, http + +import urllib +import random +import urllib2 +import htmlentitydefs +import re + +re_htmlent = re.compile("&(" + "|".join(htmlentitydefs.name2codepoint.keys()) + ");") +re_numeric = re.compile(r'&#(x?)([a-fA-F0-9]+);') + + +def db_init(db): + db.execute("create table if not exists repaste(chan, manual, primary key(chan))") + db.commit() + + +def decode_html(text): + text = re.sub(re_htmlent, + lambda m: unichr(htmlentitydefs.name2codepoint[m.group(1)]), + text) + + text = re.sub(re_numeric, + lambda m: unichr(int(m.group(2), 16 if m.group(1) else 10)), + text) + return text + + +def scrape_mibpaste(url): + if not url.startswith("http"): + url = "http://" + url + pagesource = http.get(url) + rawpaste = re.search(r'(?s)(?<=\n).+(?=
<hr>)', pagesource).group(0) + filterbr = rawpaste.replace("<br/>
", "") + unescaped = decode_html(filterbr) + stripped = unescaped.strip() + + return stripped + + +def scrape_pastebin(url): + id = re.search(r'(?:www\.)?pastebin.com/([a-zA-Z0-9]+)$', url).group(1) + rawurl = "http://pastebin.com/raw.php?i=" + id + text = http.get(rawurl) + + return text + + +autorepastes = {} + + +#@hook.regex('(pastebin\.com)(/[^ ]+)') +@hook.regex('(mibpaste\.com)(/[^ ]+)') +def autorepaste(inp, input=None, notice=None, db=None, chan=None, nick=None): + db_init(db) + manual = db.execute("select manual from repaste where chan=?", (chan, )).fetchone() + if manual and len(manual) and manual[0]: + return + url = inp.group(1) + inp.group(2) + urllib.unquote(url) + if url in autorepastes: + out = autorepastes[url] + notice("In the future, please use a less awful pastebin (e.g. pastebin.com)") + else: + out = repaste("http://" + url, input, db, False) + autorepastes[url] = out + notice("In the future, please use a less awful pastebin (e.g. pastebin.com) instead of %s." % inp.group(1)) + input.say("%s (repasted for %s)" % (out, nick)) + + +scrapers = { + r'mibpaste\.com': scrape_mibpaste, + r'pastebin\.com': scrape_pastebin +} + + +def scrape(url): + for pat, scraper in scrapers.iteritems(): + print "matching " + repr(pat) + " " + url + if re.search(pat, url): + break + else: + return None + + return scraper(url) + + +def paste_sprunge(text, syntax=None, user=None): + data = urllib.urlencode({"sprunge": text}) + url = urllib2.urlopen("http://sprunge.us/", data).read().strip() + + if syntax: + url += "?" + syntax + + return url + + +def paste_ubuntu(text, user=None, syntax='text'): + data = urllib.urlencode({"poster": user, + "syntax": syntax, + "content": text}) + + return urllib2.urlopen("http://paste.ubuntu.com/", data).url + + +def paste_gist(text, user=None, syntax=None, description=None): + data = { + 'file_contents[gistfile1]': text, + 'action_button': "private" + } + + if description: + data['description'] = description + + if syntax: + data['file_ext[gistfile1]'] = "." + syntax + + req = urllib2.urlopen('https://gist.github.com/gists', urllib.urlencode(data).encode('utf8')) + return req.url + + +def paste_strictfp(text, user=None, syntax="plain"): + data = urllib.urlencode(dict( + language=syntax, + paste=text, + private="private", + submit="Paste")) + req = urllib2.urlopen("http://paste.strictfp.com/", data) + return req.url + + +pasters = dict( + ubuntu=paste_ubuntu, + sprunge=paste_sprunge, + gist=paste_gist, + strictfp=paste_strictfp +) + + +@hook.command +def repaste(inp, input=None, db=None, isManual=True): + ".repaste mode|list|[provider] [syntax] -- Reuploads mibpaste to [provider]." 
+ + parts = inp.split() + db_init(db) + if parts[0] == 'list': + return " ".join(pasters.keys()) + + paster = paste_gist + args = {} + + if not parts[0].startswith("http"): + p = parts[0].lower() + + if p in pasters: + paster = pasters[p] + parts = parts[1:] + + if not parts[0].startswith("http"): + p = parts[0].lower() + parts = parts[1:] + + args["syntax"] = p + + if len(parts) > 1: + return "PEBKAC" + + args["user"] = input.user + + url = parts[0] + + scraped = scrape(url) + + if not scraped: + return "No scraper for given url" + + args["text"] = scraped + pasted = paster(**args) + + return pasted diff --git a/plugins/rottentomatoes.py b/disabled_stuff/rottentomatoes.py similarity index 94% rename from plugins/rottentomatoes.py rename to disabled_stuff/rottentomatoes.py index 34e6f6e..2d7af38 100644 --- a/plugins/rottentomatoes.py +++ b/disabled_stuff/rottentomatoes.py @@ -35,5 +35,5 @@ def rottentomatoes(inp, bot=None): fresh = critics_score * review_count / 100 rotten = review_count - fresh - return "{} - Critics Rating: \x02{}%\x02 ({} liked, {} disliked) " \ + return u"{} - Critics Rating: \x02{}%\x02 ({} liked, {} disliked) " \ "Audience Rating: \x02{}%\x02 - {}".format(title, critics_score, fresh, rotten, audience_score, url) diff --git a/plugins/rss.py b/disabled_stuff/rss.py similarity index 95% rename from plugins/rss.py rename to disabled_stuff/rss.py index d7e93d0..f7ed1c4 100644 --- a/plugins/rss.py +++ b/disabled_stuff/rss.py @@ -31,7 +31,7 @@ def rss(inp, message=None): link = web.isgd(row["link"]) except (web.ShortenError, http.HTTPError, http.URLError): link = row["link"] - message("{} - {}".format(title, link)) + message(u"{} - {}".format(title, link)) @hook.command(autohelp=False) diff --git a/plugins/shorten.py b/disabled_stuff/shorten.py similarity index 100% rename from plugins/shorten.py rename to disabled_stuff/shorten.py diff --git a/plugins/slap.py b/disabled_stuff/slap.py similarity index 94% rename from plugins/slap.py rename to disabled_stuff/slap.py index 4147781..37dfbbd 100644 --- a/plugins/slap.py +++ b/disabled_stuff/slap.py @@ -26,7 +26,7 @@ def slap(inp, action=None, nick=None, conn=None, notice=None): "user": target } - with open("./data/slaps.json") as f: + with open("plugins/data/slaps.json") as f: generator = get_generator(f.read(), variables) # act out the message diff --git a/plugins/slogan.py b/disabled_stuff/slogan.py similarity index 89% rename from plugins/slogan.py rename to disabled_stuff/slogan.py index b1be9a7..279c41d 100644 --- a/plugins/slogan.py +++ b/disabled_stuff/slogan.py @@ -3,7 +3,7 @@ import random from util import hook, text -with open("./data/slogans.txt") as f: +with open("plugins/data/slogans.txt") as f: slogans = [line.strip() for line in f.readlines() if not line.startswith("//")] diff --git a/plugins/snopes.py b/disabled_stuff/snopes.py similarity index 91% rename from plugins/snopes.py rename to disabled_stuff/snopes.py index 5dadaf7..9850a68 100644 --- a/plugins/snopes.py +++ b/disabled_stuff/snopes.py @@ -25,8 +25,8 @@ def snopes(inp): if status is not None: status = status.group(0).strip() else: # new-style statuses - status = "Status: {}.".format(re.search(r"FALSE|TRUE|MIXTURE|UNDETERMINED", - snopes_text).group(0).title()) + status = "Status: %s." 
% re.search(r"FALSE|TRUE|MIXTURE|UNDETERMINED", + snopes_text).group(0).title() claim = re.sub(r"[\s\xa0]+", " ", claim) # compress whitespace status = re.sub(r"[\s\xa0]+", " ", status) diff --git a/plugins/soundcloud.py b/disabled_stuff/soundcloud.py similarity index 78% rename from plugins/soundcloud.py rename to disabled_stuff/soundcloud.py index c0e615f..d31f103 100644 --- a/plugins/soundcloud.py +++ b/disabled_stuff/soundcloud.py @@ -1,4 +1,4 @@ -from urllib.parse import urlencode +from urllib import urlencode import re from util import hook, http, web, text @@ -13,17 +13,17 @@ def soundcloud(url, api_key): data = http.get_json(api_url + '/resolve.json?' + urlencode({'url': url, 'client_id': api_key})) if data['description']: - desc = ": {} ".format(text.truncate_str(data['description'], 50)) + desc = u": {} ".format(text.truncate_str(data['description'], 50)) else: desc = "" if data['genre']: - genre = "- Genre: \x02{}\x02 ".format(data['genre']) + genre = u"- Genre: \x02{}\x02 ".format(data['genre']) else: genre = "" url = web.try_isgd(data['permalink_url']) - return "SoundCloud track: \x02{}\x02 by \x02{}\x02 {}{}- {} plays, {} downloads, {} comments - {}".format( + return u"SoundCloud track: \x02{}\x02 by \x02{}\x02 {}{}- {} plays, {} downloads, {} comments - {}".format( data['title'], data['user']['username'], desc, genre, data['playback_count'], data['download_count'], data['comment_count'], url) @@ -32,7 +32,7 @@ def soundcloud(url, api_key): def soundcloud_url(match, bot=None): api_key = bot.config.get("api_keys", {}).get("soundcloud") if not api_key: - print("Error: no api key set") + print "Error: no api key set" return None url = match.group(1).split(' ')[-1] + "//" + (match.group(2) if match.group(2) else "") + match.group(3) + \ match.group(4).split(' ')[0] @@ -43,7 +43,7 @@ def soundcloud_url(match, bot=None): def sndsc_url(match, bot=None): api_key = bot.config.get("api_keys", {}).get("soundcloud") if not api_key: - print("Error: no api key set") + print "Error: no api key set" return None url = match.group(1).split(' ')[-1] + "//" + (match.group(2) if match.group(2) else "") + match.group(3) + \ match.group(4).split(' ')[0] diff --git a/plugins/spellcheck.py b/disabled_stuff/spellcheck.py similarity index 100% rename from plugins/spellcheck.py rename to disabled_stuff/spellcheck.py diff --git a/plugins/spotify.py b/disabled_stuff/spotify.py similarity index 83% rename from plugins/spotify.py rename to disabled_stuff/spotify.py index 9a9f1f6..9897235 100644 --- a/plugins/spotify.py +++ b/disabled_stuff/spotify.py @@ -1,5 +1,5 @@ import re -from urllib.parse import urlencode +from urllib import urlencode from util import hook, http, web @@ -24,8 +24,8 @@ def sptfy(inp, sptfy=False): link = soup.find('div', {'class': 'resultLink'}).text.strip() return link except: - message = "Unable to shorten URL: {}".format(soup.find('div', { - 'class': 'messagebox_text'}).find('p').text.split("
")[0]) + message = "Unable to shorten URL: %s" % \ + soup.find('div', {'class': 'messagebox_text'}).find('p').text.split("
")[0] return message else: return web.try_isgd(inp) @@ -45,8 +45,7 @@ def spotify(inp): except IndexError: return "Could not find track." url = sptfy(gateway.format(type, id)) - - return "\x02{}\x02 by \x02{}\x02 - {}".format(data["tracks"][0]["name"], + return u"\x02{}\x02 by \x02{}\x02 - {}".format(data["tracks"][0]["name"], data["tracks"][0]["artists"][0]["name"], url) @@ -63,8 +62,7 @@ def spalbum(inp): except IndexError: return "Could not find album." url = sptfy(gateway.format(type, id)) - - return "\x02{}\x02 by \x02{}\x02 - {}".format(data["albums"][0]["name"], + return u"\x02{}\x02 by \x02{}\x02 - {}".format(data["albums"][0]["name"], data["albums"][0]["artists"][0]["name"], url) @@ -81,8 +79,7 @@ def spartist(inp): except IndexError: return "Could not find artist." url = sptfy(gateway.format(type, id)) - - return "\x02{}\x02 - {}".format(data["artists"][0]["name"], url) + return u"\x02{}\x02 - {}".format(data["artists"][0]["name"], url) @hook.regex(*http_re) @@ -97,14 +94,13 @@ def spotify_url(match): name = data["track"]["name"] artist = data["track"]["artists"][0]["name"] album = data["track"]["album"]["name"] - - return "Spotify Track: \x02{}\x02 by \x02{}\x02 from the album \x02{}\x02 - {}".format(name, artist, + return u"Spotify Track: \x02{}\x02 by \x02{}\x02 from the album \x02{}\x02 - {}".format(name, artist, album, sptfy( gateway.format(type, spotify_id))) elif type == "artist": - return "Spotify Artist: \x02{}\x02 - {}".format(data["artist"]["name"], + return u"Spotify Artist: \x02{}\x02 - {}".format(data["artist"]["name"], sptfy(gateway.format(type, spotify_id))) elif type == "album": - return "Spotify Album: \x02{}\x02 - \x02{}\x02 - {}".format(data["album"]["artist"], + return u"Spotify Album: \x02{}\x02 - \x02{}\x02 - {}".format(data["album"]["artist"], data["album"]["name"], sptfy(gateway.format(type, spotify_id))) diff --git a/disabled_stuff/status.py b/disabled_stuff/status.py new file mode 100644 index 0000000..977ac8e --- /dev/null +++ b/disabled_stuff/status.py @@ -0,0 +1,53 @@ +from util import hook +import re +import time +from subprocess import check_output + +def getstatus(): + try: + return check_output("sudo /bin/chch-status", shell=True).strip("\n").decode("utf-8") + except: + return "unbekannt" + +@hook.command("status", autohelp=False) +def cmd_status(inp, reply=None): + """status - Return the door status""" + reply("Chaostreff Status: %s" % (getstatus())) + +@hook.event("TOPIC") +def topic_update(info, conn=None, chan=None): + """topic_update -- Update the topic on TOPIC command""" + status = getstatus() + + topic = info[-1] + + sstr = "Status: %s" % (status) + if sstr in topic: + return + + if 'Status: ' in topic: + new_topic = re.sub("Status: [^ ]*", sstr, topic) + else: + new_topic = "%s | %s" % (topic.rstrip(' |'), sstr) + + if new_topic != topic: + conn.send("TOPIC %s :%s" % (chan, new_topic)) + +@hook.event("332") +def e332_update(info, conn=None, chan=None): + """e332_update -- run after current topic was requested""" + chan = info[1] + topic_update(info, conn=conn, chan=chan) + +@hook.singlethread +@hook.event("353") +def e353_update(info, conn=None, chan=None): + """e353_update -- runs after a channel was joined""" + chan = info[2] + if chan.lower() == "#chaoschemnitz": + conn.send("PRIVMSG Chanserv :op #chaoschemnitz") + + while True: + conn.send("TOPIC %s" % (chan)) + time.sleep(60) + diff --git a/plugins/steam.py b/disabled_stuff/steam.py similarity index 94% rename from plugins/steam.py rename to disabled_stuff/steam.py index 
afa86f8..f3814db 100644 --- a/plugins/steam.py +++ b/disabled_stuff/steam.py @@ -57,8 +57,8 @@ def get_steam_info(url): data["price"] = soup.find('div', {'class': 'game_purchase_price price'}).text.strip() - return "\x02{name}\x02: {desc}, \x02Genre\x02: {genre}, \x02Release Date\x02: {release date}," \ - " \x02Price\x02: {price}".format(**data) + return u"\x02{name}\x02: {desc}, \x02Genre\x02: {genre}, \x02Release Date\x02: {release date}," \ + u" \x02Price\x02: {price}".format(**data) @hook.regex(*steam_re) diff --git a/plugins/steam_calc.py b/disabled_stuff/steam_calc.py similarity index 94% rename from plugins/steam_calc.py rename to disabled_stuff/steam_calc.py index c5cd902..6684eba 100644 --- a/plugins/steam_calc.py +++ b/disabled_stuff/steam_calc.py @@ -1,5 +1,5 @@ import csv -import io +import StringIO from util import hook, http, text @@ -29,7 +29,7 @@ def is_number(s): def unicode_dictreader(utf8_data, **kwargs): csv_reader = csv.DictReader(utf8_data, **kwargs) for row in csv_reader: - yield dict([(key.lower(), str(value, 'utf-8')) for key, value in row.items()]) + yield dict([(key.lower(), unicode(value, 'utf-8')) for key, value in row.iteritems()]) @hook.command('sc') @@ -66,7 +66,7 @@ def steamcalc(inp, reply=None): except (http.HTTPError, http.URLError): return "Could not get data for this user." - csv_data = io.StringIO(request) # we use StringIO because CSV can't read a string + csv_data = StringIO.StringIO(request) # we use StringIO because CSV can't read a string reader = unicode_dictreader(csv_data) # put the games in a list diff --git a/plugins/stock.py b/disabled_stuff/stock.py similarity index 90% rename from plugins/stock.py rename to disabled_stuff/stock.py index f61fc5c..aedf051 100644 --- a/plugins/stock.py +++ b/disabled_stuff/stock.py @@ -22,9 +22,9 @@ def stock(inp): quote['color'] = "3" quote['PercentChange'] = 100 * change / (price - change) - print(quote) + print quote - return "\x02{Name}\x02 (\x02{symbol}\x02) - {LastTradePriceOnly} " \ + return u"\x02{Name}\x02 (\x02{symbol}\x02) - {LastTradePriceOnly} " \ "\x03{color}{Change} ({PercentChange:.2f}%)\x03 " \ "Day Range: {DaysRange} " \ "MCAP: {MarketCapitalization}".format(**quote) diff --git a/plugins/suggest.py b/disabled_stuff/suggest.py similarity index 50% rename from plugins/suggest.py rename to disabled_stuff/suggest.py index e76ac3c..ec66144 100644 --- a/plugins/suggest.py +++ b/disabled_stuff/suggest.py @@ -1,5 +1,3 @@ -import json - from util import hook, http, text from bs4 import BeautifulSoup @@ -7,20 +5,14 @@ from bs4 import BeautifulSoup @hook.command def suggest(inp): """suggest -- Gets suggested phrases for a google search""" - - page = http.get('http://google.com/complete/search', - output='json', client='hp', q=inp) - page_json = page.split('(', 1)[1][:-1] - - suggestions = json.loads(page_json)[1] - suggestions = [suggestion[0] for suggestion in suggestions] + suggestions = http.get_json('http://suggestqueries.google.com/complete/search', client='firefox', q=inp)[1] if not suggestions: return 'no suggestions found' - out = ", ".join(suggestions) + out = u", ".join(suggestions) - # defuckify text + # defuckify text (might not be needed now, but I'll keep it) soup = BeautifulSoup(out) out = soup.get_text() diff --git a/plugins/system.py b/disabled_stuff/system.py similarity index 100% rename from plugins/system.py rename to disabled_stuff/system.py diff --git a/plugins/tell.py b/disabled_stuff/tell.py similarity index 93% rename from plugins/tell.py rename to disabled_stuff/tell.py index 
2310cbd..52a0aa1 100644 --- a/plugins/tell.py +++ b/disabled_stuff/tell.py @@ -22,7 +22,8 @@ def db_init(db, conn): def get_tells(db, user_to): return db.execute("select user_from, message, time, chan from tell where" - " user_to=lower(:user) order by time", {'user': user_to}).fetchall() + " user_to=lower(?) order by time", + (user_to.lower(),)).fetchall() @hook.singlethread @@ -44,8 +45,8 @@ def tellinput(inp, input=None, notice=None, db=None, nick=None, conn=None): if len(tells) > 1: reply += " (+{} more, {}showtells to view)".format(len(tells) - 1, conn.conf["command_prefix"]) - db.execute("delete from tell where user_to=lower(:user) and message=:message", - {'user': nick, 'message': message}) + db.execute("delete from tell where user_to=lower(?) and message=?", + (nick, message)) db.commit() notice(reply) diff --git a/plugins/time_plugin.py b/disabled_stuff/time_plugin.py similarity index 100% rename from plugins/time_plugin.py rename to disabled_stuff/time_plugin.py diff --git a/plugins/title.py b/disabled_stuff/title.py similarity index 91% rename from plugins/title.py rename to disabled_stuff/title.py index 506c793..4264188 100644 --- a/plugins/title.py +++ b/disabled_stuff/title.py @@ -20,4 +20,4 @@ def title(inp): if not page_title: return "Could not find title." - return "{} [{}]".format(page_title, real_url) + return u"{} [{}]".format(page_title, real_url) diff --git a/plugins/tvdb.py b/disabled_stuff/tvdb.py similarity index 98% rename from plugins/tvdb.py rename to disabled_stuff/tvdb.py index 46b25db..b5fa12f 100644 --- a/plugins/tvdb.py +++ b/disabled_stuff/tvdb.py @@ -44,7 +44,7 @@ def get_episode_info(episode, api_key): first_aired = episode.findtext("FirstAired") try: - air_date = datetime.date(*list(map(int, first_aired.split('-')))) + air_date = datetime.date(*map(int, first_aired.split('-'))) except (ValueError, TypeError): return None diff --git a/plugins/twitch.py b/disabled_stuff/twitch.py similarity index 95% rename from plugins/twitch.py rename to disabled_stuff/twitch.py index f6a4864..7e1a56a 100644 --- a/plugins/twitch.py +++ b/disabled_stuff/twitch.py @@ -1,5 +1,5 @@ import re -from html.parser import HTMLParser +from HTMLParser import HTMLParser from util import hook, http @@ -36,7 +36,7 @@ def multitwitch_url(match): out = "" for i in usernames: if not test(i): - print("Not a valid username") + print "Not a valid username" return None if out == "": out = twitch_lookup(i) @@ -50,7 +50,7 @@ def twitch_url(match): bit = match.group(4).split("#")[0] location = "/".join(bit.split("/")[1:]) if not test(location): - print("Not a valid username") + print "Not a valid username" return None return twitch_lookup(location) @@ -100,9 +100,9 @@ def twitch_lookup(location): title = data['title'] playing = data['meta_game'] viewers = "\x033\x02Online now!\x02\x0f " + str(data["channel_count"]) + " viewer" - print(viewers) + print viewers viewers = viewers + "s" if not " 1 view" in viewers else viewers - print(viewers) + print viewers return h.unescape(fmt.format(title, channel, playing, viewers)) else: try: diff --git a/plugins/twitter.py b/disabled_stuff/twitter.py similarity index 82% rename from plugins/twitter.py rename to disabled_stuff/twitter.py index 60a3c1e..c83ea67 100644 --- a/plugins/twitter.py +++ b/disabled_stuff/twitter.py @@ -45,13 +45,13 @@ def twitter_url(match, bot=None): text = " ".join(tweet.text.split()) if user.verified: - prefix = "\u2713" + prefix = u"\u2713" else: prefix = "" time = timesince.timesince(tweet.created_at, datetime.utcnow()) - return 
"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, text, time) + return u"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, text, time) @hook.command("tw") @@ -74,7 +74,7 @@ def twitter(inp, bot=None): if e[0][0]['code'] == 34: return "Could not find tweet." else: - return "Error {}: {}".format(e[0][0]['code'], e[0][0]['message']) + return u"Error {}: {}".format(e[0][0]['code'], e[0][0]['message']) user = tweet.user @@ -98,21 +98,21 @@ def twitter(inp, bot=None): if e[0][0]['code'] == 34: return "Could not find user." else: - return "Error {}: {}".format(e[0][0]['code'], e[0][0]['message']) + return u"Error {}: {}".format(e[0][0]['code'], e[0][0]['message']) # get the users tweets user_timeline = api.user_timeline(id=user.id, count=tweet_number + 1) # if the timeline is empty, return an error if not user_timeline: - return "The user \x02{}\x02 has no tweets.".format(user.screen_name) + return u"The user \x02{}\x02 has no tweets.".format(user.screen_name) # grab the newest tweet from the users timeline try: tweet = user_timeline[tweet_number] except IndexError: tweet_count = len(user_timeline) - return "The user \x02{}\x02 only has \x02{}\x02 tweets.".format(user.screen_name, tweet_count) + return u"The user \x02{}\x02 only has \x02{}\x02 tweets.".format(user.screen_name, tweet_count) elif re.match(r'^#\w+$', inp): # user is searching by hashtag @@ -131,13 +131,13 @@ def twitter(inp, bot=None): text = " ".join(tweet.text.split()) if user.verified: - prefix = "\u2713" + prefix = u"\u2713" else: prefix = "" time = timesince.timesince(tweet.created_at, datetime.utcnow()) - return "{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, text, time) + return u"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, text, time) @hook.command("twinfo") @@ -159,20 +159,20 @@ def twuser(inp, bot=None): return "Unknown error" if user.verified: - prefix = "\u2713" + prefix = u"\u2713" else: prefix = "" if user.location: - loc_str = " is located in \x02{}\x02 and".format(user.location) + loc_str = u" is located in \x02{}\x02 and".format(user.location) else: loc_str = "" if user.description: - desc_str = " The users description is \"{}\"".format(user.description) + desc_str = u" The users description is \"{}\"".format(user.description) else: desc_str = "" - return "{}@\x02{}\x02 ({}){} has \x02{:,}\x02 tweets and \x02{:,}\x02 followers.{}" \ + return u"{}@\x02{}\x02 ({}){} has \x02{:,}\x02 tweets and \x02{:,}\x02 followers.{}" \ "".format(prefix, user.screen_name, user.name, loc_str, user.statuses_count, user.followers_count, desc_str) diff --git a/plugins/update.py b/disabled_stuff/update.py similarity index 100% rename from plugins/update.py rename to disabled_stuff/update.py diff --git a/disabled_stuff/urban.py b/disabled_stuff/urban.py new file mode 100644 index 0000000..48da433 --- /dev/null +++ b/disabled_stuff/urban.py @@ -0,0 +1,66 @@ +import re +import random + +from util import hook, http, text + + +base_url = 'http://api.urbandictionary.com/v0' +define_url = base_url + "/define" +random_url = base_url + "/random" + +@hook.command('u', autohelp=False) +@hook.command(autohelp=False) +def urban(inp): + """urban [id] -- Looks up on urbandictionary.com.""" + + if inp: + # clean and split the input + inp = inp.lower().strip() + parts = inp.split() + + # if the last word is a number, set the ID to that number + if parts[-1].isdigit(): + id_num = int(parts[-1]) + # remove the ID from the input string + del 
parts[-1] + inp = " ".join(parts) + else: + id_num = 1 + + # fetch the definitions + page = http.get_json(define_url, term=inp, referer="http://m.urbandictionary.com") + + if page['result_type'] == 'no_results': + return 'Not found.' + else: + # get a random definition! + page = http.get_json(random_url, referer="http://m.urbandictionary.com") + id_num = None + + definitions = page['list'] + + if id_num: + # try getting the requested definition + try: + definition = definitions[id_num - 1] + + def_text = " ".join(definition['definition'].split()) # remove excess spaces + def_text = text.truncate_str(def_text, 200) + except IndexError: + return 'Not found.' + + url = definition['permalink'] + output = u"[%i/%i] %s :: %s" % \ + (id_num, len(definitions), def_text, url) + + else: + definition = random.choice(definitions) + + def_text = " ".join(definition['definition'].split()) # remove excess spaces + def_text = text.truncate_str(def_text, 200) + + name = definition['word'] + url = definition['permalink'] + output = u"\x02{}\x02: {} :: {}".format(name, def_text, url) + + return output diff --git a/disabled_stuff/urlhistory.py b/disabled_stuff/urlhistory.py new file mode 100644 index 0000000..c5e344e --- /dev/null +++ b/disabled_stuff/urlhistory.py @@ -0,0 +1,80 @@ +import math +import re +import time + +from util import hook, urlnorm, timesince + + +expiration_period = 60 * 60 * 24 # 1 day + +ignored_urls = [urlnorm.normalize("http://google.com"),] + + +def db_init(db): + db.execute("create table if not exists urlhistory" + "(chan, url, nick, time)") + db.commit() + + +def insert_history(db, chan, url, nick): + now = time.time() + db.execute("insert into urlhistory(chan, url, nick, time) " + "values(?,?,?,?)", (chan, url, nick, time.time())) + db.commit() + + +def get_history(db, chan, url): + db.execute("delete from urlhistory where time < ?", + (time.time() - expiration_period,)) + return db.execute("select nick, time from urlhistory where " + "chan=? and url=? order by time desc", (chan, url)).fetchall() + + +def nicklist(nicks): + nicks = sorted(dict(nicks), key=unicode.lower) + if len(nicks) <= 2: + return ' and '.join(nicks) + else: + return ', and '.join((', '.join(nicks[:-1]), nicks[-1])) + + +def format_reply(history): + if not history: + return + + last_nick, recent_time = history[0] + last_time = timesince.timesince(recent_time) + + if len(history) == 1: + return #"%s linked that %s ago." % (last_nick, last_time) + + hour_span = math.ceil((time.time() - history[-1][1]) / 3600) + hour_span = '%.0f hours' % hour_span if hour_span > 1 else 'hour' + + hlen = len(history) + ordinal = ["once", "twice", "%d times" % hlen][min(hlen, 3) - 1] + + if len(dict(history)) == 1: + last = "last linked %s ago" % last_time + else: + last = "last linked by %s %s ago" % (last_nick, last_time) + + return #"that url has been posted %s in the past %s by %s (%s)." % (ordinal, + +@hook.command +def url(inp, nick='', chan='', db=None, bot=None): + db_init(db) + url = urlnorm.normalize(inp.group().encode('utf-8')) + if url not in ignored_urls: + url = url.decode('utf-8') + history = get_history(db, chan, url) + insert_history(db, chan, url, nick) + + inp = match.string.lower() + + for name in dict(history): + if name.lower() in inp: # person was probably quoting a line + return # that had a link. don't remind them. 
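+ # Only speak up when the poster is absent from this URL's recent history.
+ # Note that format_reply() above has both of its return strings commented
+ # out, so as written this (disabled) plugin never actually replies.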
+ + if nick not in dict(history): + return format_reply(history) diff --git a/plugins/utility.py b/disabled_stuff/utility.py similarity index 97% rename from plugins/utility.py rename to disabled_stuff/utility.py index 0bb6b41..b0afa5b 100644 --- a/plugins/utility.py +++ b/disabled_stuff/utility.py @@ -161,9 +161,9 @@ def munge(inp): @hook.command def rainbow(inp): - inp = str(inp) + inp = unicode(inp) inp = strip(inp) - col = list(colors.items()) + col = colors.items() out = "" l = len(colors) for i, t in enumerate(inp): @@ -176,8 +176,8 @@ def rainbow(inp): @hook.command def wrainbow(inp): - inp = str(inp) - col = list(colors.items()) + inp = unicode(inp) + col = colors.items() inp = strip(inp).split(' ') out = [] l = len(colors) diff --git a/plugins/validate.py b/disabled_stuff/validate.py similarity index 100% rename from plugins/validate.py rename to disabled_stuff/validate.py diff --git a/plugins/valvesounds.py b/disabled_stuff/valvesounds.py similarity index 95% rename from plugins/valvesounds.py rename to disabled_stuff/valvesounds.py index 0a6a127..88bc8ce 100644 --- a/plugins/valvesounds.py +++ b/disabled_stuff/valvesounds.py @@ -1,5 +1,5 @@ import json -import urllib.request, urllib.error, urllib.parse +import urllib2 from util import hook, http, web @@ -8,14 +8,14 @@ def get_sound_info(game, search): search = search.replace(" ", "+") try: data = http.get_json("http://p2sounds.blha303.com.au/search/%s/%s?format=json" % (game, search)) - except urllib.error.HTTPError as e: + except urllib2.HTTPError as e: return "Error: " + json.loads(e.read())["error"] items = [] for item in data["items"]: if "music" in game: textsplit = item["text"].split('"') text = "" - for i in range(len(textsplit)): + for i in xrange(len(textsplit)): if i % 2 != 0 and i < 6: if text: text += " / " + textsplit[i] diff --git a/plugins/vimeo.py b/disabled_stuff/vimeo.py similarity index 100% rename from plugins/vimeo.py rename to disabled_stuff/vimeo.py diff --git a/plugins/weather.py b/disabled_stuff/weather.py similarity index 100% rename from plugins/weather.py rename to disabled_stuff/weather.py diff --git a/plugins/wikipedia.py b/disabled_stuff/wikipedia.py similarity index 89% rename from plugins/wikipedia.py rename to disabled_stuff/wikipedia.py index 6b3827a..90461f4 100644 --- a/plugins/wikipedia.py +++ b/disabled_stuff/wikipedia.py @@ -42,8 +42,8 @@ def wiki(inp): if title.lower() not in desc.lower(): desc = title + desc - desc = re.sub('\s+', ' ', desc).strip() # remove excess spaces + desc = u' '.join(desc.split()) # remove excess spaces desc = text.truncate_str(desc, 200) - return '{} :: {}'.format(desc, http.quote(url, ':/')) + return u'{} :: {}'.format(desc, http.quote(url, ':/')) diff --git a/plugins/wolframalpha.py b/disabled_stuff/wolframalpha.py similarity index 88% rename from plugins/wolframalpha.py rename to disabled_stuff/wolframalpha.py index 9a98774..b20ffed 100644 --- a/plugins/wolframalpha.py +++ b/disabled_stuff/wolframalpha.py @@ -36,9 +36,9 @@ def wolframalpha(inp, bot=None): if subpod: results.append(subpod) if results: - pod_texts.append(title + ': ' + ', '.join(results)) + pod_texts.append(title + u': ' + u', '.join(results)) - ret = ' - '.join(pod_texts) + ret = u' - '.join(pod_texts) if not pod_texts: return 'No results.' 
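The next hunk swaps chr() for unichr(): Wolfram|Alpha escapes non-ASCII output as "\:xxxx", which must be turned back into a Unicode character on Python 2. A minimal standalone sketch of that substitution (the sample string is made up):

    import re

    def unicode_sub(match):
        # "\:xxxx" carries four hex digits of a Unicode code point
        return unichr(int(match.group(1), 16))

    ret = re.sub(r'\\:([0-9a-z]{4})', unicode_sub, r'x \:2248 3.1416')
    print repr(ret)  # u'x \u2248 3.1416', i.e. "x ≈ 3.1416"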
@@ -46,7 +46,7 @@ def wolframalpha(inp, bot=None): ret = re.sub(r'\\(.)', r'\1', ret) def unicode_sub(match): - return chr(int(match.group(1), 16)) + return unichr(int(match.group(1), 16)) ret = re.sub(r'\\:([0-9a-z]{4})', unicode_sub, ret) @@ -55,4 +55,4 @@ def wolframalpha(inp, bot=None): if not ret: return 'No results.' - return "{} - {}".format(ret, short_url) + return u"{} - {}".format(ret, short_url) diff --git a/disabled_stuff/wordoftheday.py b/disabled_stuff/wordoftheday.py new file mode 100644 index 0000000..7b7a19b --- /dev/null +++ b/disabled_stuff/wordoftheday.py @@ -0,0 +1,20 @@ +import re +from util import hook, http, misc +from BeautifulSoup import BeautifulSoup + + +@hook.command(autohelp=False) +def word(inp, say=False, nick=False): + "word -- Gets the word of the day." + page = http.get('http://merriam-webster.com/word-of-the-day') + + soup = BeautifulSoup(page) + + word = soup.find('strong', {'class': 'main_entry_word'}).renderContents() + function = soup.find('p', {'class': 'word_function'}).renderContents() + + #definitions = re.findall(r':' + # r' *([^<]+)', content) + + say("(%s) The word of the day is:"\ + " \x02%s\x02 (%s)" % (nick, word, function)) diff --git a/disabled_stuff/wrapper.old b/disabled_stuff/wrapper.old new file mode 100644 index 0000000..d2f2cda --- /dev/null +++ b/disabled_stuff/wrapper.old @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# Bot Wrapper by neersighted + +# Import required modules +import os +import sys +import subprocess +import json +import re + +# Files +configfile = os.path.isfile("./config") +botfile = os.path.isfile("./bot.py") + +# Colors +nocol = "\033[1;m" +red = "\033[1;31m" +green = "\033[1;32m" + +# Messages +firstrun = "Welclome to your first run of: " +usage = "usage: ./cloudbot {start|stop|restart|status}" +iusage = "{1|start} {2|stop} {3|restart} {4|status} {5|exit}" +quit = "Thanks for using CloudBot!" + +error1 = red + "Neither screen nor daemon is installed! "\ + "This program cannot run! {ERROR 1}" + nocol +error2 = red + "Could not find bot.py! Are you in the wrong folder? "\ + "{ERROR 2}" + nocol +error3 = red + "Invalid choice, exiting! {ERROR 3}" + nocol +error4 = red + "Program killed by user! {ERROR 4}" + nocol +error5 = red + "Invalid backend in config! (Or, backend not installed)"\ + " {ERROR 5}" + nocol +error6 = red + "Author error! We be derpin'! 
{ERROR 6}" + nocol + + +# Commands +pwd = os.getcwd() +clearlog = ": > ./bot.log" + +start = "echo " + "'" + error1 + "'" +stop = "echo " + "'" + error1 + "'" +restart = "echo " + "'" + error1 + "'" +pid = "echo 'Cannot get pid'" + +daemonstart = "daemon -r -n cloudbot -O " + pwd + \ + "/bot.log python " + pwd + "/bot.py" +daemonstop = "daemon -n cloudbot --stop" +daemonrestart = "./cloudbot stop > /dev/null 2>&1 && ./cloudbot start > /dev/null 2>&1" +daemonpid = "pidof /usr/bin/daemon" + +screenstart = "screen -d -m -S cloudbot -t cloudbot python " + pwd +\ + "/bot.py > " + pwd + "/bot.log 2>&1" +screenstop = "kill `pidof /usr/bin/screen`" +screenrestart = "./cloudbot stop > /dev/null 2>&1 && ./cloudbot start > /dev/null 2>&1" +screenpid = "pidof /usr/bin/screen" + +# Checks +if configfile: + try: + config = json.load(open('config')) + command = ":" + except ValueError, e: + print 'error: malformed config', e +else: + config = False + command = "python bot.py" + +daemoncheck = subprocess.check_output("locate /usr/bin/daemon", shell=True) +daemon = re.match(r'^/usr/bin/daemon$', daemoncheck) + +screencheck = subprocess.check_output("locate /usr/bin/screen", shell=True) +screen = re.match(r'^/usr/bin/screen$', screencheck) + +if configfile: + backend = config.get("wrapper", {}).get("backend", "daemon") + daemonloc = config.get("wrapper", {}).get("daemonloc", "/usr/bin/daemon") + screenloc = config.get("wrapper", {}).get("screenloc", "/usr/bin/screen") +else: + backend = False + daemonloc = "/usr/bin/daemon" + screenloc = "/usr/bin/screen" + +try: + runningcheck = subprocess.check_output("ps ax|grep cloudbot|"\ + "grep -v grep|grep -v ./cloudbot", shell=True) + running = re.match(r'^[1-9]+', runningcheck) +except (subprocess.CalledProcessError): + running = False + +# Set commands +if (backend == "daemon"): + if daemon: + start = daemonstart + stop = daemonstop + restart = daemonrestart + pid = daemonpid + else: + print error5 + exit +elif (backend == "screen"): + if screen: + start = screenstart + stop = screenstop + restart = screenrestart + pid = screenpid + else: + print error5 + exit +elif (backend == False): + print firstrun +else: + print error5 + exit + +# Fancy banner +print " ______ __ ______ __ __ "\ +" _______ .______ ______ .___________." +print " / || | / __ \ | | | | "\ +"| \ | _ \ / __ \ | |" +print "| ,----'| | | | | | | | | | "\ +"| .--. || |_) | | | | | `---| |----`" +print "| | | | | | | | | | | | "\ +"| | | || _ < | | | | | | " +print "| `----.| `----.| `--' | | `--' | "\ +"| '--' || |_) | | `--' | | | " +print " \______||_______| \______/ \______/ "\ +"|_______/ |______/ \______/ |__| " +print "http://git.io/cloudbot "\ +" by lukeroge" + +# Read arguments/turn interactive +try: + if (len(sys.argv) > 1): + read = 0 + else: + sys.argv = "interactive" + print iusage + read = int(raw_input("Please choose a option: ")) + + if (sys.argv[1] == "start") or (read == 1): + if running: + print "Bot is already running, cannot start!" + else: + command = start + print "Starting... (" + backend + ")" + elif (sys.argv[1] == "stop") or (read == 2): + if running: + command = stop + print "Stopping... (" + backend + ")" + else: + print "Bot is not running, cannot stop!" + elif (sys.argv[1] == "restart") or (read == 3): + if running: + command = restart + print "Restarting... (" + backend + ")" + else: + print "Bot is not running, cannot restart!" + elif (sys.argv[1] == "status") or (read == 4): + if running: + command = pid + print green + "Bot is running! 
" + nocol + else: + print red + "Bot is not running! " + nocol + elif (sys.argv[1] == "clear"): + command = clearlog + elif (sys.argv[1] == "exit") or (read == 5): + exit + elif (sys.argv[1] == "interactive"): + pass + else: + print usage + exit + +# Pretify errors +except (TypeError, ValueError), e: + print error3 + exit +except (KeyboardInterrupt), e: + print error4 + exit +except (NameError, SyntaxError), e: + print error6 + exit + +# Check for bot files +if botfile: + pass +else: + print error2 + exit + +# Call command +subprocess.call(command, shell=True) +print quit +exit diff --git a/plugins/xkcd.py b/disabled_stuff/xkcd.py similarity index 86% rename from plugins/xkcd.py rename to disabled_stuff/xkcd.py index 6ec286c..d7fad59 100644 --- a/plugins/xkcd.py +++ b/disabled_stuff/xkcd.py @@ -11,10 +11,10 @@ months = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'Jun def xkcd_info(xkcd_id, url=False): """ takes an XKCD entry ID and returns a formatted string """ data = http.get_json("http://www.xkcd.com/" + xkcd_id + "/info.0.json") - date = "{} {} {}".format(data['day'], months[int(data['month'])], data['year']) + date = "%s %s %s" % (data['day'], months[int(data['month'])], data['year']) if url: url = " | http://xkcd.com/" + xkcd_id.replace("/", "") - return "xkcd: \x02{}\x02 ({}){}".format(data['title'], date, url if url else "") + return "xkcd: \x02%s\x02 (%s)%s" % (data['title'], date, url if url else "") def xkcd_search(term): @@ -25,7 +25,7 @@ def xkcd_search(term): if result: url = result.find('div', {'class': 'tinylink'}).text xkcd_id = url[:-1].split("/")[-1] - print(xkcd_id) + print xkcd_id return xkcd_info(xkcd_id, url=True) else: return "No results found!" diff --git a/plugins/yahooanswers.py b/disabled_stuff/yahooanswers.py similarity index 85% rename from plugins/yahooanswers.py rename to disabled_stuff/yahooanswers.py index 3301b97..e28ed63 100644 --- a/plugins/yahooanswers.py +++ b/disabled_stuff/yahooanswers.py @@ -13,4 +13,4 @@ def answer(inp): # we split the answer and .join() it to remove newlines/extra spaces answer_text = text.truncate_str(' '.join(result["ChosenAnswer"].split()), 80) - return '\x02{}\x02 "{}" - {}'.format(result["Subject"], answer_text, short_url) + return u'\x02{}\x02 "{}" - {}'.format(result["Subject"], answer_text, short_url) diff --git a/plugins/youtube.py b/disabled_stuff/youtube.py similarity index 82% rename from plugins/youtube.py rename to disabled_stuff/youtube.py index 27d68e0..e63bca3 100644 --- a/plugins/youtube.py +++ b/disabled_stuff/youtube.py @@ -25,26 +25,25 @@ def get_video_description(video_id): data = request['data'] - out = '\x02{}\x02'.format(data['title']) + out = u'\x02{}\x02'.format(data['title']) if not data.get('duration'): return out length = data['duration'] - out += ' - length \x02{}\x02'.format(timeformat.format_time(length, simple=True)) + out += u' - length \x02{}\x02'.format(timeformat.format_time(length, simple=True)) if 'ratingCount' in data: - # format likes = plural(int(data['likeCount']), "like") dislikes = plural(data['ratingCount'] - int(data['likeCount']), "dislike") percent = 100 * float(data['likeCount']) / float(data['ratingCount']) - out += ' - {}, {} (\x02{:.1f}\x02%)'.format(likes, + out += u' - {}, {} (\x02{:.1f}\x02%)'.format(likes, dislikes, percent) if 'viewCount' in data: views = data['viewCount'] - out += ' - \x02{:,}\x02 view{}'.format(views, "s"[views == 1:]) + out += u' - \x02{:,}\x02 view{}'.format(views, "s"[views == 1:]) try: uploader = http.get_json(base_url + 
"users/{}?alt=json".format(data["uploader"]))["entry"]["author"][0]["name"][ @@ -53,11 +52,11 @@ def get_video_description(video_id): uploader = data["uploader"] upload_time = time.strptime(data['uploaded'], "%Y-%m-%dT%H:%M:%S.000Z") - out += ' - \x02{}\x02 on \x02{}\x02'.format(uploader, + out += u' - \x02{}\x02 on \x02{}\x02'.format(uploader, time.strftime("%Y.%m.%d", upload_time)) if 'contentRating' in data: - out += ' - \x034NSFW\x02' + out += u' - \x034NSFW\x02' return out @@ -83,7 +82,7 @@ def youtube(inp): video_id = request['data']['items'][0]['id'] - return get_video_description(video_id) + " - " + video_url % video_id + return get_video_description(video_id) + u" - " + video_url % video_id @hook.command('ytime') @@ -115,8 +114,8 @@ def youtime(inp): length_text = timeformat.format_time(length, simple=True) total_text = timeformat.format_time(total, accuracy=8) - return 'The video \x02{}\x02 has a length of {} and has been viewed {:,} times for ' \ - 'a total run time of {}!'.format(data['title'], length_text, views, + return u'The video \x02{}\x02 has a length of {} and has been viewed {:,} times for ' \ + u'a total run time of {}!'.format(data['title'], length_text, views, total_text) @@ -134,4 +133,4 @@ def ytplaylist_url(match): author = soup.find('img', {'class': 'channel-header-profile-image'})['title'] num_videos = soup.find('ul', {'class': 'header-stats'}).findAll('li')[0].text.split(' ')[0] views = soup.find('ul', {'class': 'header-stats'}).findAll('li')[1].text.split(' ')[0] - return "\x02{}\x02 - \x02{}\x02 views - \x02{}\x02 videos - \x0{}\x02".format(title, views, num_videos, author) + return u"\x02%s\x02 - \x02%s\x02 views - \x02%s\x02 videos - \x02%s\x02" % (title, views, num_videos, author) diff --git a/lib/bs4/AUTHORS.txt b/lib/bs4/AUTHORS.txt new file mode 100644 index 0000000..2ac8fcc --- /dev/null +++ b/lib/bs4/AUTHORS.txt @@ -0,0 +1,43 @@ +Behold, mortal, the origins of Beautiful Soup... +================================================ + +Leonard Richardson is the primary programmer. + +Aaron DeVore is awesome. + +Mark Pilgrim provided the encoding detection code that forms the base +of UnicodeDammit. + +Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful +Soup 4 working under Python 3. + +Simon Willison wrote soupselect, which was used to make Beautiful Soup +support CSS selectors. + +Sam Ruby helped with a lot of edge cases. + +Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his +work in solving the nestable tags conundrum. 
+ +An incomplete list of people have contributed patches to Beautiful +Soup: + + Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, + Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris + Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, + Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed + Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko + Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn + Webster, Paul Wright, Danny Yoo + +An incomplete list of people who made suggestions or found bugs or +found ways to break Beautiful Soup: + + Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, + Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, + Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams, + warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, + Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed + Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart + Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de + Sousa Rocha, Yichun Wei, Per Vognsen diff --git a/lib/bs4/COPYING.txt b/lib/bs4/COPYING.txt new file mode 100644 index 0000000..d668d13 --- /dev/null +++ b/lib/bs4/COPYING.txt @@ -0,0 +1,26 @@ +Beautiful Soup is made available under the MIT license: + + Copyright (c) 2004-2012 Leonard Richardson + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE, DAMMIT. + +Beautiful Soup incorporates code from the html5lib library, which is +also made available under the MIT license. diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py new file mode 100644 index 0000000..03b2416 --- /dev/null +++ b/lib/bs4/__init__.py @@ -0,0 +1,365 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides provides methods and Pythonic idioms that make it easy to +navigate, search, and modify the parse tree. + +Beautiful Soup works with Python 2.6 and up. It works better if lxml +and/or html5lib is installed. 
+ +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.2.1" +__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import re +import warnings + +from .builder import builder_registry +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's
<br>
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = u'[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, **kwargs): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if 'convertEntities' in kwargs: + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. You can pass in features='html' " + "or features='xml' to get a builder capable of handling " + "one or the other.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if len(kwargs) > 0: + arg = kwargs.keys().pop() + raise TypeError( + "__init__() got an unexpected keyword argument '%s'" % arg) + + if builder is None: + if isinstance(features, basestring): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + builder = builder_class() + self.builder = builder + self.is_xml = builder.is_xml + self.builder.soup = self + + self.parse_only = parse_only + + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. 
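+ # e.g. BeautifulSoup(open("index.html")) or a urllib2 response object;
+ # the whole stream is read eagerly here rather than parsed incrementally.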
+ markup = markup.read() + (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) = ( + self.builder.prepare_markup(markup, from_encoding)) + + try: + self._feed() + except StopParsing: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + """Create a new tag associated with this soup.""" + return Tag(None, self.builder, name, namespace, nsprefix, attrs) + + def new_string(self, s, subclass=NavigableString): + """Create a new NavigableString associated with this soup.""" + navigable = subclass(s) + navigable.setup() + return navigable + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not set([tag.name for tag in self.tagStack]).intersection( + self.builder.preserve_whitespace_tags)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(currentData)): + return + o = containerClass(currentData) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Add an object to the parse tree.""" + parent = parent or self.currentTag + most_recent_element = most_recent_element or self._most_recent_element + o.setup(parent, most_recent_element) + if most_recent_element is not None: + most_recent_element.next_element = o + self._most_recent_element = o + parent.contents.append(o) + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. 
If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + + for i in range(len(self.tagStack) - 1, 0, -1): + if (name == self.tagStack[i].name + and nsprefix == self.tagStack[i].prefix): + numPops = len(self.tagStack) - i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occured + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. + """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self._most_recent_element) + if tag is None: + return tag + if self._most_recent_element: + self._most_recent_element.next_element = tag + self._most_recent_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.currentData.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'\n' % encoding_part + else: + prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py new file mode 100644 index 0000000..bae453e --- /dev/null +++ b/lib/bs4/builder/__init__.py @@ -0,0 +1,316 @@ +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + whitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. 
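+ # Builders advertise a subset of these via their `features` list, and
+ # BeautifulSoup's default lookup asks the registry for ['html', 'fast']
+ # (DEFAULT_BUILDER_FEATURES above), taking the best registered match.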
+FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. +builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a document into a Beautiful Soup object tree.""" + + features = [] + + is_xml = False + preserve_whitespace_tags = set() + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + cdata_list_attributes = {} + + + def __init__(self): + self.soup = None + + def reset(self): + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a
<p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p/>
". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "" will become "", and "bar" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + def set_up_substitutions(self, tag): + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), []) + for cdata_list_attr in itertools.chain(universal, tag_specific): + if cdata_list_attr in attrs: + # Basically, we have a "class" attribute whose + # value is a whitespace-separated list of CSS + # classes. Split it into a list. + value = attrs[cdata_list_attr] + if isinstance(value, basestring): + values = whitespace_re.split(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. + values = value + attrs[cdata_list_attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. 
+ """ + + preserve_whitespace_tags = set(['pre', 'textarea']) + empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. + cdata_list_attributes = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } + + def set_up_substitutions(self, tag): + # We are only interested in tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in tags that say what encoding the + # document was originally in. This means HTML 5-style + # tags that provide the "charset" attribute. It also means + # HTML 4-style tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + meta_encoding = None + if charset is not None: + # HTML 5 style: + # + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + # I'm fairly sure this is not the best way to do this. + this_module = sys.modules['bs4.builder'] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last result. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. 
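+    # With lxml missing, builder_registry.lookup('html') now resolves to
+    # HTML5TreeBuilder (if html5lib is installed) and finally to
+    # HTMLParserTreeBuilder, since later registrations are searched first.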
+ pass diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py new file mode 100644 index 0000000..e439ac8 --- /dev/null +++ b/lib/bs4/builder/_html5lib.py @@ -0,0 +1,222 @@ +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import NamespacedAttribute +import html5lib +from html5lib.constants import namespaces +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + features = ['html5lib', PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + return markup, None, None, False + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + doc = parser.parse(markup, encoding=self.user_specified_encoding) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, unicode): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment + + +class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. 
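+        # node.element is the Beautiful Soup Tag (or NavigableString)
+        # wrapped by this html5lib node; appending it to self.soup hangs
+        # it directly off the document root.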
+ self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(html5lib.treebuilders._base.Node): + def __init__(self, element, soup, namespace): + html5lib.treebuilders._base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # XXX This has O(n^2) performance, for input like + # "aaa..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + node.element) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + self.soup.object_was_parsed(node.element, parent=self.element) + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. 
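+            # On a <meta> tag this swaps the 'charset'/'content' values
+            # for CharsetMetaAttributeValue/ContentMetaAttributeValue
+            # stand-ins, so they track the eventual output encoding.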
+ self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild( + Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild( + TextNode(child, self.soup)) + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + html5lib.treebuilders._base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py new file mode 100644 index 0000000..65ee618 --- /dev/null +++ b/lib/bs4/builder/_htmlparser.py @@ -0,0 +1,249 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import ( + HTMLParser, + HTMLParseError, + ) +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. +major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = ( + major > 3 + or (major == 3 and minor > 2) + or (major == 3 and minor == 2 and release >= 3)) + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + def handle_starttag(self, name, attrs): + # XXX namespace + self.soup.handle_starttag(name, None, None, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed. 
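+        # HTMLParser passes the reference body without the '&#' prefix,
+        # so a hex reference like &#x20AC; arrives here as 'x20AC':
+        #
+        #     int('x20AC'.lstrip('x'), 16) == 0x20AC  ->  u'\u20ac' (EURO SIGN)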
+ if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + elif name.startswith('X'): + real_name = int(name.lstrip('X'), 16) + else: + real_name = int(name) + + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + data = u"\N{REPLACEMENT CHARACTER}" + + self.handle_data(data) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. "" + data = '' + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + if data.endswith("?") and data.lower().startswith("xml"): + # "An XHTML processing instruction using the trailing '?' + # will cause the '?' to be included in data." - HTMLParser + # docs. + # + # Strip the question mark so we don't end up with two + # question marks. + data = data[:-1] + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + except HTMLParseError, e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like
<p></p>
as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py new file mode 100644 index 0000000..be35d70 --- /dev/null +++ b/lib/bs4/builder/_lxml.py @@ -0,0 +1,199 @@ +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +from io import BytesIO +from StringIO import StringIO +import collections +from lxml import etree +from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + TreeBuilder, + XML) +from bs4.dammit import UnicodeDammit + +LXML = 'lxml' + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + + # Well, it's permissive by XML parser standards. + features = [LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. 
+ DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + + @property + def default_parser(self): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + return etree.XMLParser(target=self, strip_cdata=False, recover=True) + + def __init__(self, parser=None, empty_element_tags=None): + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + if parser is None: + # Use the default parser. + parser = self.default_parser + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False) + self.parser = parser + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS] + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). + """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, unicode): + markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + self.parser.feed(data) + while data != '': + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if data != '': + self.parser.feed(data) + self.parser.close() + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. 
+ new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + + def pi(self, target, data): + pass + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + features = [LXML, HTML, FAST, PERMISSIVE] + is_xml = False + + @property + def default_parser(self): + return etree.HTMLParser + + def feed(self, markup): + self.parser.feed(markup) + self.parser.close() + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py new file mode 100644 index 0000000..a733cad --- /dev/null +++ b/lib/bs4/dammit.py @@ -0,0 +1,827 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's the tree builder's job. +""" + +import codecs +from htmlentitydefs import codepoint2name +import re +import logging + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. 
+try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + for codepoint, name in list(codepoint2name.items()): + character = unichr(codepoint) + if codepoint != 34: + # There's no point in turning the quotation mark into + # ", unless it happens within an attribute value, which + # is handled elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to turn " into the quotation mark. + reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") + + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. 
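+        # e.g. substitute_xml(u'AT&T <spam>')  ->  u'AT&amp;T &lt;spam&gt;'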
+ value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False): + self.declared_html_encoding = None + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + + if markup == '' or isinstance(markup, unicode): + self.markup = markup + self.unicode_markup = unicode(markup) + self.original_encoding = None + return + + new_markup, document_encoding, sniffed_encoding = \ + self._detectEncoding(markup, is_html) + self.markup = new_markup + + u = None + if new_markup != markup: + # _detectEncoding modified the markup, then converted it to + # Unicode and then to UTF-8. So convert it from UTF-8. + u = self._convert_from("utf8") + self.original_encoding = sniffed_encoding + + if not u: + for proposed_encoding in ( + override_encodings + [document_encoding, sniffed_encoding]): + if proposed_encoding is not None: + u = self._convert_from(proposed_encoding) + if u: + break + + # If no luck and we have auto-detection library, try that: + if not u and not isinstance(self.markup, unicode): + u = self._convert_from(chardet_dammit(self.markup)) + + # As a last resort, try utf-8 and windows-1252: + if not u: + for proposed_encoding in ("utf-8", "windows-1252"): + u = self._convert_from(proposed_encoding) + if u: + break + + # As an absolute last resort, try the encodings again with + # character replacement. 
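+        # errors="replace" swaps anything undecodable for
+        # U+FFFD REPLACEMENT CHARACTER, so the caller still gets Unicode.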
+ if not u: + for proposed_encoding in ( + override_encodings + [ + document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): + if proposed_encoding != "ascii": + u = self._convert_from(proposed_encoding, "replace") + if u is not None: + logging.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER.") + self.contains_replacement_characters = True + break + + # We could at this point force it to ASCII, but that would + # destroy so much data that I think giving up is better + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" + #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. 
+ %encoding is a string recognized by encodings.aliases''' + + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding, errors) + return newdata + + def _detectEncoding(self, xml_data, is_html=False): + """Given a document, tries to detect its XML encoding.""" + xml_encoding = sniffed_xml_encoding = None + try: + if xml_data[:4] == b'\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = self._ebcdic_to_ascii(xml_data) + elif xml_data[:4] == b'\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ + and (xml_data[2:4] != b'\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == b'\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ + (xml_data[2:4] != b'\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == b'\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == b'\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == b'\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == b'\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == b'\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + sniffed_xml_encoding = 'ascii' + pass + except: + xml_encoding_match = None + xml_encoding_match = xml_encoding_re.match(xml_data) + if not xml_encoding_match and is_html: + xml_encoding_match = html_meta_re.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if is_html: + self.declared_html_encoding = xml_encoding + if sniffed_xml_encoding and \ + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding + + def find_codec(self, charset): + return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + 
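+            # An unknown name simply leaves codec as None; find_codec()
+            # then tries the next candidate spelling of the charset.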
pass + return codec + + EBCDIC_TO_ASCII_MAP = None + + def _ebcdic_to_ascii(self, s): + c = self.__class__ + if not c.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + import string + c.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) + return s.translate(c.EBCDIC_TO_ASCII_MAP) + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. + MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn Ć” into a, but also + # contains non-horrors like turning ā€œ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. 
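+        # _sub_ms_char() above consults this table when UnicodeDammit is
+        # constructed with smart_quotes_to='ascii'.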
+ b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. + # + # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in + # Windows-1252. + WINDOWS_1252_TO_UTF8 = { + 0x80 : b'\xe2\x82\xac', # € + 0x82 : b'\xe2\x80\x9a', # ā€š + 0x83 : b'\xc6\x92', # ʒ + 0x84 : b'\xe2\x80\x9e', # ā€ž + 0x85 : b'\xe2\x80\xa6', # … + 0x86 : b'\xe2\x80\xa0', # † + 0x87 : b'\xe2\x80\xa1', # — + 0x88 : b'\xcb\x86', # ˆ + 0x89 : b'\xe2\x80\xb0', # ‰ + 0x8a : b'\xc5\xa0', # Å  + 0x8b : b'\xe2\x80\xb9', # ‹ + 0x8c : b'\xc5\x92', # Œ + 0x8e : b'\xc5\xbd', # Ž + 0x91 : b'\xe2\x80\x98', # ā€˜ + 0x92 : b'\xe2\x80\x99', # ’ + 0x93 : b'\xe2\x80\x9c', # ā€œ + 0x94 : b'\xe2\x80\x9d', # ā€ + 0x95 : b'\xe2\x80\xa2', # • + 0x96 : b'\xe2\x80\x93', # – + 0x97 : b'\xe2\x80\x94', # — + 0x98 : b'\xcb\x9c', # ˜ + 0x99 : b'\xe2\x84\xa2', # ā„¢ + 0x9a : b'\xc5\xa1', # Å” + 0x9b : b'\xe2\x80\xba', # › + 0x9c : b'\xc5\x93', # œ + 0x9e : b'\xc5\xbe', # ž + 0x9f : b'\xc5\xb8', # Åø + 0xa0 : b'\xc2\xa0', # Ā  + 0xa1 : b'\xc2\xa1', # Ā” + 0xa2 : b'\xc2\xa2', # Ā¢ + 0xa3 : b'\xc2\xa3', # Ā£ + 0xa4 : b'\xc2\xa4', # ¤ + 0xa5 : b'\xc2\xa5', # Ā„ + 0xa6 : b'\xc2\xa6', # ¦ + 0xa7 : b'\xc2\xa7', # § + 0xa8 : b'\xc2\xa8', # ĀØ + 0xa9 : b'\xc2\xa9', # Ā© + 0xaa : b'\xc2\xaa', # ĀŖ + 0xab : b'\xc2\xab', # Ā« + 0xac : b'\xc2\xac', # ¬ + 0xad : b'\xc2\xad', # Ā­ + 0xae : b'\xc2\xae', # Ā® + 0xaf : b'\xc2\xaf', # ĀÆ + 0xb0 : b'\xc2\xb0', # ° + 0xb1 : b'\xc2\xb1', # ± + 0xb2 : b'\xc2\xb2', # ² + 0xb3 : b'\xc2\xb3', # ³ + 0xb4 : b'\xc2\xb4', # Ā“ + 0xb5 : b'\xc2\xb5', # µ + 0xb6 : b'\xc2\xb6', # ¶ + 0xb7 : b'\xc2\xb7', # Ā· + 0xb8 : b'\xc2\xb8', # Āø + 0xb9 : b'\xc2\xb9', # ¹ + 0xba : b'\xc2\xba', # Āŗ + 0xbb : b'\xc2\xbb', # Ā» + 0xbc : b'\xc2\xbc', # ¼ + 0xbd : b'\xc2\xbd', # ½ + 0xbe : b'\xc2\xbe', # ¾ + 0xbf : b'\xc2\xbf', # Āæ + 0xc0 : b'\xc3\x80', # ƀ + 0xc1 : b'\xc3\x81', # Ɓ + 0xc2 : b'\xc3\x82', # Ƃ + 0xc3 : b'\xc3\x83', # ƃ + 0xc4 : 
b'\xc3\x84', # Ƅ + 0xc5 : b'\xc3\x85', # ƅ + 0xc6 : b'\xc3\x86', # Ɔ + 0xc7 : b'\xc3\x87', # Ƈ + 0xc8 : b'\xc3\x88', # ƈ + 0xc9 : b'\xc3\x89', # Ɖ + 0xca : b'\xc3\x8a', # Ê + 0xcb : b'\xc3\x8b', # Ƌ + 0xcc : b'\xc3\x8c', # Ì + 0xcd : b'\xc3\x8d', # ƍ + 0xce : b'\xc3\x8e', # Ǝ + 0xcf : b'\xc3\x8f', # Ə + 0xd0 : b'\xc3\x90', # Ɛ + 0xd1 : b'\xc3\x91', # Ƒ + 0xd2 : b'\xc3\x92', # ƒ + 0xd3 : b'\xc3\x93', # Ɠ + 0xd4 : b'\xc3\x94', # Ɣ + 0xd5 : b'\xc3\x95', # ƕ + 0xd6 : b'\xc3\x96', # Ɩ + 0xd7 : b'\xc3\x97', # Ɨ + 0xd8 : b'\xc3\x98', # Ƙ + 0xd9 : b'\xc3\x99', # ƙ + 0xda : b'\xc3\x9a', # Ú + 0xdb : b'\xc3\x9b', # ƛ + 0xdc : b'\xc3\x9c', # Ü + 0xdd : b'\xc3\x9d', # Ɲ + 0xde : b'\xc3\x9e', # ƞ + 0xdf : b'\xc3\x9f', # ß + 0xe0 : b'\xc3\xa0', # Ć  + 0xe1 : b'\xa1', # Ć” + 0xe2 : b'\xc3\xa2', # Ć¢ + 0xe3 : b'\xc3\xa3', # Ć£ + 0xe4 : b'\xc3\xa4', # Ƥ + 0xe5 : b'\xc3\xa5', # Ć„ + 0xe6 : b'\xc3\xa6', # Ʀ + 0xe7 : b'\xc3\xa7', # Ƨ + 0xe8 : b'\xc3\xa8', # ĆØ + 0xe9 : b'\xc3\xa9', # Ć© + 0xea : b'\xc3\xaa', # ĆŖ + 0xeb : b'\xc3\xab', # Ć« + 0xec : b'\xc3\xac', # Ƭ + 0xed : b'\xc3\xad', # Ć­ + 0xee : b'\xc3\xae', # Ć® + 0xef : b'\xc3\xaf', # ĆÆ + 0xf0 : b'\xc3\xb0', # ư + 0xf1 : b'\xc3\xb1', # Ʊ + 0xf2 : b'\xc3\xb2', # ò + 0xf3 : b'\xc3\xb3', # ó + 0xf4 : b'\xc3\xb4', # Ć“ + 0xf5 : b'\xc3\xb5', # Ƶ + 0xf6 : b'\xc3\xb6', # ƶ + 0xf7 : b'\xc3\xb7', # Ć· + 0xf8 : b'\xc3\xb8', # Ćø + 0xf9 : b'\xc3\xb9', # ù + 0xfa : b'\xc3\xba', # Ćŗ + 0xfb : b'\xc3\xbb', # Ć» + 0xfc : b'\xc3\xbc', # ü + 0xfd : b'\xc3\xbd', # ý + 0xfe : b'\xc3\xbe', # þ + } + + MULTIBYTE_MARKERS_AND_SIZES = [ + (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF + (0xe0, 0xef, 3), # 3-byte characters start with E0-EF + (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 + ] + + FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] + LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + The input must be a bytestring. If you've already converted + the document to Unicode, you're too late. + + The output is a bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. 
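+                # (The UTF-8 replacement may be longer than one byte;
+                # "one-byte" refers to the single Windows-1252 byte that
+                # is being replaced here.)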
+ byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py new file mode 100644 index 0000000..25fda5c --- /dev/null +++ b/lib/bs4/diagnose.py @@ -0,0 +1,178 @@ +"""Diagnostic functions, mainly for use when doing tech support.""" +from StringIO import StringIO +from HTMLParser import HTMLParser +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry +import os +import random +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems.""" + print "Diagnostic running on Beautiful Soup %s" % __version__ + print "Python version %s" % sys.version + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print ( + "I noticed that %s is not installed. Installing it may help." % + name) + + if 'lxml' in basic_parsers: + basic_parsers.append(["lxml", "xml"]) + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + + if 'html5lib' in basic_parsers: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + + if hasattr(data, 'read'): + data = data.read() + elif os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' % data + data = open(data).read() + elif data.startswith("http:") or data.startswith("https:"): + print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data + print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + return + print + + for parser in basic_parsers: + print "Trying to parse your markup with %s" % parser + success = False + try: + soup = BeautifulSoup(data, parser) + success = True + except Exception, e: + print "%s could not parse the markup." % parser + traceback.print_exc() + if success: + print "Here's what %s did with the markup:" % parser + print soup.prettify() + + print "-" * 80 + +def lxml_trace(data, html=True): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html): + print("%s, %4s, %s" % (event, element.tag, element.text)) + +class AnnouncingParser(HTMLParser): + """Announces HTMLParser parse events, without doing anything else.""" + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. 
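+
+    For example, feeding "<b>hi</b>" through it prints, via the
+    AnnouncingParser defined above:
+
+        b START
+        hi DATA
+        b END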
+
+    This lets you see how HTMLParser parses a document when no
+    Beautiful Soup code is running.
+    """
+    parser = AnnouncingParser()
+    parser.feed(data)
+
+_vowels = "aeiou"
+_consonants = "bcdfghjklmnpqrstvwxyz"
+
+def rword(length=5):
+    "Generate a random word-like string."
+    s = ''
+    for i in range(length):
+        if i % 2 == 0:
+            t = _consonants
+        else:
+            t = _vowels
+        s += random.choice(t)
+    return s
+
+def rsentence(length=4):
+    "Generate a random sentence-like string."
+    return " ".join(rword(random.randint(4,9)) for i in range(length))
+
+def rdoc(num_elements=1000):
+    """Randomly generate an invalid HTML document."""
+    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
+    elements = []
+    for i in range(num_elements):
+        choice = random.randint(0,3)
+        if choice == 0:
+            # New tag.
+            tag_name = random.choice(tag_names)
+            elements.append("<%s>" % tag_name)
+        elif choice == 1:
+            elements.append(rsentence(random.randint(1,4)))
+        elif choice == 2:
+            # Close a tag.
+            tag_name = random.choice(tag_names)
+            elements.append("</%s>" % tag_name)
+    return "<html>" + "\n".join(elements) + "</html>"
+
+def benchmark_parsers(num_elements=100000):
+    """Very basic head-to-head performance benchmark."""
+    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+    data = rdoc(num_elements)
+    print "Generated a large invalid HTML document (%d bytes)." % len(data)
+
+    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
+        success = False
+        try:
+            a = time.time()
+            soup = BeautifulSoup(data, parser)
+            b = time.time()
+            success = True
+        except Exception, e:
+            print "%s could not parse the markup." % parser
+            traceback.print_exc()
+        if success:
+            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+
+    from lxml import etree
+    a = time.time()
+    etree.HTML(data)
+    b = time.time()
+    print "Raw lxml parsed the markup in %.2fs." % (b-a)
+
+if __name__ == '__main__':
+    diagnose(sys.stdin.read())
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
new file mode 100644
index 0000000..f6864f2
--- /dev/null
+++ b/lib/bs4/element.py
@@ -0,0 +1,1598 @@
+import collections
+import re
+import sys
+import warnings
+from bs4.dammit import EntitySubstitution
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+PY3K = (sys.version_info[0] > 2)
+
+whitespace_re = re.compile("\s+")
+
+def _alias(attr):
+    """Alias one attribute name to another for backward compatibility"""
+    @property
+    def alias(self):
+        return getattr(self, attr)
+
+    @alias.setter
+    def alias(self, value):
+        return setattr(self, attr, value)
+    return alias
+
+
+class NamespacedAttribute(unicode):
+
+    def __new__(cls, prefix, name, namespace=None):
+        if name is None:
+            obj = unicode.__new__(cls, prefix)
+        elif prefix is None:
+            # Not really namespaced.
+            obj = unicode.__new__(cls, name)
+        else:
+            obj = unicode.__new__(cls, prefix + ":" + name)
+        obj.prefix = prefix
+        obj.name = name
+        obj.namespace = namespace
+        return obj
+
+class AttributeValueWithCharsetSubstitution(unicode):
+    """A stand-in object for a character encoding specified in HTML."""
+
+class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'charset' attribute.
+
+    When Beautiful Soup parses the markup '<meta charset="utf8">', the
+    value of the 'charset' attribute will be one of these objects.
+ """ + + def __new__(cls, original_value): + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return unicode.__new__(unicode, original_value) + + obj = unicode.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + +class HTMLAwareEntitySubstitution(EntitySubstitution): + + """Entity substitution rules that are aware of some HTML quirks. + + Specifically, the contents of +""" + soup = BeautifulSoup(doc, "xml") + # lxml would have stripped this while parsing, but we can add + # it later. + soup.script.string = 'console.log("< < hey > > ");' + encoded = soup.encode() + self.assertTrue(b"< < hey > >" in encoded) + + def test_popping_namespaced_tag(self): + markup = 'b2012-07-02T20:33:42Zcd' + soup = self.soup(markup) + self.assertEqual( + unicode(soup.rss), markup) + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("") + self.assertEqual( + soup.encode("latin1"), + b'\n') + + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = (b'\n' + + b'0' * (2**12) + + b'') + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assertSoupEquals("
<p>", "<p/>")
+        self.assertSoupEquals("<p>foo</p>")
+
+    def test_namespaces_are_preserved(self):
+        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
+        soup = self.soup(markup)
+        root = soup.root
+        self.assertEqual("http://example.com/", root['xmlns:a'])
+        self.assertEqual("http://example.net/", root['xmlns:b'])
+
+    def test_closing_namespaced_tag(self):
+        markup = '

20010504

'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.p), markup)
+
+    def test_namespaced_attributes(self):
+        markup = '<foo xsi:method="POST"></foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
+    def test_namespaced_attributes_xml_namespace(self):
+        markup = '<foo xml:lang="fr">bar</foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
+class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
+    """Smoke test for a tree builder that supports HTML5."""
+
+    def test_real_xhtml_document(self):
+        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
+        # XHTML documents in any particular way.
+        pass
+
+    def test_html_tags_have_namespace(self):
+        markup = "<a>"
+        soup = self.soup(markup)
+        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
+
+    def test_svg_tags_have_namespace(self):
+        markup = '<svg><circle/></svg>'
+        soup = self.soup(markup)
+        namespace = "http://www.w3.org/2000/svg"
+        self.assertEqual(namespace, soup.svg.namespace)
+        self.assertEqual(namespace, soup.circle.namespace)
+
+
+    def test_mathml_tags_have_namespace(self):
+        markup = '<math><msqrt>5</msqrt></math>'
+        soup = self.soup(markup)
+        namespace = 'http://www.w3.org/1998/Math/MathML'
+        self.assertEqual(namespace, soup.math.namespace)
+        self.assertEqual(namespace, soup.msqrt.namespace)
+
+    def test_xml_declaration_becomes_comment(self):
+        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
+        soup = self.soup(markup)
+        self.assertTrue(isinstance(soup.contents[0], Comment))
+        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
+        self.assertEqual("html", soup.contents[0].next_element.name)
+
+def skipIf(condition, reason):
+    def nothing(test, *args, **kwargs):
+        return None
+
+    def decorator(test_item):
+        if condition:
+            return nothing
+        else:
+            return test_item
+
+    return decorator
diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py
new file mode 100644
index 0000000..142c8cc
--- /dev/null
+++ b/lib/bs4/tests/__init__.py
@@ -0,0 +1 @@
+"The beautifulsoup tests."
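The skipIf() helper defined just above is how the per-parser test modules below opt out when an optional dependency is missing. A sketch of the pattern they all follow (the class name here is hypothetical):

```python
# Sketch: skipIf from bs4.testing. When the condition is true, the
# decorated test class is swapped for a no-op callable, so the suite
# is skipped instead of dying with an ImportError.
from bs4.testing import SoupTest, skipIf

try:
    import html5lib
    HTML5LIB_PRESENT = True
except ImportError:
    HTML5LIB_PRESENT = False

@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HypotheticalHTML5LibTest(SoupTest):
    def test_something(self):
        pass
```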
diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py new file mode 100644 index 0000000..92ad10f --- /dev/null +++ b/lib/bs4/tests/test_builder_registry.py @@ -0,0 +1,141 @@ +"""Tests of the builder registry.""" + +import unittest + +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry as registry, + HTMLParserTreeBuilder, + TreeBuilderRegistry, +) + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError: + HTML5LIB_PRESENT = False + +try: + from bs4.builder import ( + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + ) + LXML_PRESENT = True +except ImportError: + LXML_PRESENT = False + + +class BuiltInRegistryTest(unittest.TestCase): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('fast', 'html'), + LXMLTreeBuilder) + + if LXML_PRESENT: + self.assertEqual(registry.lookup('permissive', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('strict', 'html'), + HTMLParserTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib', 'html'), + HTML5TreeBuilder) + + def test_lookup_by_markup_type(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) + self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) + else: + self.assertEqual(registry.lookup('xml'), None) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) + else: + self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) + + def test_named_library(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('lxml', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('lxml', 'html'), + LXMLTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib'), + HTML5TreeBuilder) + + self.assertEqual(registry.lookup('html.parser'), + HTMLParserTreeBuilder) + + def test_beautifulsoup_constructor_does_lookup(self): + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) + + # You'll get an exception if BS can't find an appropriate + # builder. + self.assertRaises(ValueError, BeautifulSoup, + "", features="no-such-feature") + +class RegistryTest(unittest.TestCase): + """Test the TreeBuilderRegistry class in general.""" + + def setUp(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list): + cls = type('Builder_' + '_'.join(feature_list), + (object,), {'features' : feature_list}) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + self.assertEqual(self.registry.lookup('foo'), None) + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. 
+ self.assertEqual(self.registry.lookup(), builder) + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('foo'), builder) + self.assertEqual(self.registry.lookup('bar'), builder) + + def test_lookup_fails_when_no_builder_implements_feature(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('baz'), None) + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + builder1 = self.builder_for_features('foo') + builder2 = self.builder_for_features('bar') + self.assertEqual(self.registry.lookup(), builder2) + + def test_lookup_fails_when_no_tree_builders_registered(self): + self.assertEqual(self.registry.lookup(), None) + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + has_one = self.builder_for_features('foo') + has_the_other = self.builder_for_features('bar') + has_both_early = self.builder_for_features('foo', 'bar', 'baz') + has_both_late = self.builder_for_features('foo', 'bar', 'quux') + lacks_one = self.builder_for_features('bar') + has_the_other = self.builder_for_features('foo') + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + self.assertEqual(self.registry.lookup('foo', 'bar'), + has_both_late) + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), + has_both_early) + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + builder1 = self.builder_for_features('foo', 'bar') + builder2 = self.builder_for_features('foo', 'baz') + self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/lib/bs4/tests/test_docs.py b/lib/bs4/tests/test_docs.py new file mode 100644 index 0000000..5b9f677 --- /dev/null +++ b/lib/bs4/tests/test_docs.py @@ -0,0 +1,36 @@ +"Test harness for doctests." 
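These registry tests pin down the lookup rules: each requested feature narrows the candidates, and among builders that support every feature the most recently registered one wins. A sketch of the same calls outside the test harness (results depend on which parsers are installed):

```python
# Sketch: feature lookup against bs4.builder.builder_registry, the
# built-in TreeBuilderRegistry instance exercised by the tests above.
from bs4 import BeautifulSoup
from bs4.builder import builder_registry

builder_registry.lookup('html')            # best available HTML builder
builder_registry.lookup('xml')             # None unless lxml is installed
builder_registry.lookup('strict', 'html')  # the html.parser-based builder

# BeautifulSoup runs the same lookup on its 'features' argument:
soup = BeautifulSoup("<p>hi</p>", features=["html", "fast"])
```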
+ +# pylint: disable-msg=E0611,W0142 + +__metaclass__ = type +__all__ = [ + 'additional_tests', + ] + +import atexit +import doctest +import os +#from pkg_resources import ( +# resource_filename, resource_exists, resource_listdir, cleanup_resources) +import unittest + +DOCTEST_FLAGS = ( + doctest.ELLIPSIS | + doctest.NORMALIZE_WHITESPACE | + doctest.REPORT_NDIFF) + + +# def additional_tests(): +# "Run the doc tests (README.txt and docs/*, if any exist)" +# doctest_files = [ +# os.path.abspath(resource_filename('bs4', 'README.txt'))] +# if resource_exists('bs4', 'docs'): +# for name in resource_listdir('bs4', 'docs'): +# if name.endswith('.txt'): +# doctest_files.append( +# os.path.abspath( +# resource_filename('bs4', 'docs/%s' % name))) +# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) +# atexit.register(cleanup_resources) +# return unittest.TestSuite(( +# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py new file mode 100644 index 0000000..2a3b41e --- /dev/null +++ b/lib/bs4/tests/test_html5lib.py @@ -0,0 +1,72 @@ +"""Tests to ensure that the html5lib tree builder generates good trees.""" + +import warnings + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError, e: + HTML5LIB_PRESENT = False +from bs4.element import SoupStrainer +from bs4.testing import ( + HTML5TreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing its tree builder.") +class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_soupstrainer(self): + # The html5lib tree builder does not support SoupStrainers. + strainer = SoupStrainer("b") + markup = "
<p>A <b>bold</b> statement.</p>"
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(
+            soup.decode(), self.document_for(markup))
+
+        self.assertTrue(
+            "the html5lib tree builder doesn't support parse_only" in
+            str(w[0].message))
+
+    def test_correctly_nested_tables(self):
+        """html5lib inserts <tbody> tags where other parsers don't."""
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
+
+        self.assertSoupEquals(
+            markup,
+            '<table id="1"><tbody><tr><td>Here\'s another table:'
+            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+            '</td></tr></tbody></table>')
+
+        self.assertSoupEquals(
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_xml_declaration_followed_by_doctype(self):
+        markup = '''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html>
+  <head>
+  </head>
+  <body>
+   <p>foo</p>
+  </body>
+</html>'''
+        soup = self.soup(markup)
+        # Verify that we can reach the <p> tag; this means the tree is connected.
+        self.assertEqual(b"<p>foo</p>
", soup.p.encode()) diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py new file mode 100644 index 0000000..bcb5ed2 --- /dev/null +++ b/lib/bs4/tests/test_htmlparser.py @@ -0,0 +1,19 @@ +"""Tests to ensure that the html.parser tree builder generates good +trees.""" + +from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest +from bs4.builder import HTMLParserTreeBuilder + +class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + + @property + def default_builder(self): + return HTMLParserTreeBuilder() + + def test_namespaced_system_doctype(self): + # html.parser can't handle namespaced doctypes, so skip this one. + pass + + def test_namespaced_public_doctype(self): + # html.parser can't handle namespaced doctypes, so skip this one. + pass diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py new file mode 100644 index 0000000..80458de --- /dev/null +++ b/lib/bs4/tests/test_lxml.py @@ -0,0 +1,88 @@ +"""Tests to ensure that the lxml tree builder generates good trees.""" + +import re +import warnings + +try: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + LXML_PRESENT = True + import lxml.etree + LXML_VERSION = lxml.etree.LXML_VERSION +except ImportError, e: + LXML_PRESENT = False + LXML_VERSION = (0,) + +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, + ) +from bs4.element import Comment, Doctype, SoupStrainer +from bs4.testing import skipIf +from bs4.tests import test_htmlparser +from bs4.testing import ( + HTMLTreeBuilderSmokeTest, + XMLTreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its tree builder.") +class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilder() + + def test_out_of_range_entity(self): + self.assertSoupEquals( + "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
+        self.assertSoupEquals(
+            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
+        self.assertSoupEquals(
+            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>
") + + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if an old version of lxml is installed. + + @skipIf( + not LXML_PRESENT or LXML_VERSION < (2,3,5,0), + "Skipping doctype test for old version of lxml to avoid segfault.") + def test_empty_doctype(self): + soup = self.soup("") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_beautifulstonesoup_is_xml_parser(self): + # Make sure that the deprecated BSS class uses an xml builder + # if one is installed. + with warnings.catch_warnings(record=False) as w: + soup = BeautifulStoneSoup("") + self.assertEqual(u"", unicode(soup.b)) + + def test_real_xhtml_document(self): + """lxml strips the XML definition from an XHTML doc, which is fine.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b''), + markup.replace(b'\n', b'').replace( + b'', b'')) + + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its XML tree builder.") +class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilderForXML() diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py new file mode 100644 index 0000000..b127716 --- /dev/null +++ b/lib/bs4/tests/test_soup.py @@ -0,0 +1,383 @@ +# -*- coding: utf-8 -*- +"""Tests of Beautiful Soup as a whole.""" + +import logging +import unittest +import sys +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, +) +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + SoupStrainer, + NamespacedAttribute, + ) +import bs4.dammit +from bs4.dammit import EntitySubstitution, UnicodeDammit +from bs4.testing import ( + SoupTest, + skipIf, +) +import warnings + +try: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + LXML_PRESENT = True +except ImportError, e: + LXML_PRESENT = False + +PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) +PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) + +class TestDeprecatedConstructorArguments(SoupTest): + + def test_parseOnlyThese_renamed_to_parse_only(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("
", parseOnlyThese=SoupStrainer("b")) + msg = str(w[0].message) + self.assertTrue("parseOnlyThese" in msg) + self.assertTrue("parse_only" in msg) + self.assertEqual(b"", soup.encode()) + + def test_fromEncoding_renamed_to_from_encoding(self): + with warnings.catch_warnings(record=True) as w: + utf8 = b"\xc3\xa9" + soup = self.soup(utf8, fromEncoding="utf8") + msg = str(w[0].message) + self.assertTrue("fromEncoding" in msg) + self.assertTrue("from_encoding" in msg) + self.assertEqual("utf8", soup.original_encoding) + + def test_unrecognized_keyword_argument(self): + self.assertRaises( + TypeError, self.soup, "", no_such_argument=True) + + @skipIf( + not LXML_PRESENT, + "lxml not present, not testing BeautifulStoneSoup.") + def test_beautifulstonesoup(self): + with warnings.catch_warnings(record=True) as w: + soup = BeautifulStoneSoup("") + self.assertTrue(isinstance(soup, BeautifulSoup)) + self.assertTrue("BeautifulStoneSoup class is deprecated") + +class TestSelectiveParsing(SoupTest): + + def test_parse_with_soupstrainer(self): + markup = "NoYesNoYes Yes" + strainer = SoupStrainer("b") + soup = self.soup(markup, parse_only=strainer) + self.assertEqual(soup.encode(), b"YesYes Yes") + + +class TestEntitySubstitution(unittest.TestCase): + """Standalone tests of the EntitySubstitution class.""" + def setUp(self): + self.sub = EntitySubstitution + + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. + s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEqual(self.sub.substitute_html(s), + u"foo∀\N{SNOWMAN}õbar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. + quotes = b"\x91\x92foo\x93\x94" + dammit = UnicodeDammit(quotes) + self.assertEqual(self.sub.substitute_html(dammit.markup), + "‘’foo“”") + + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): + s = 'Welcome to "my bar"' + self.assertEqual(self.sub.substitute_xml(s, False), s) + + def test_xml_attribute_quoting_normally_uses_double_quotes(self): + self.assertEqual(self.sub.substitute_xml("Welcome", True), + '"Welcome"') + self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), + '"Bob\'s Bar"') + + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): + s = 'Welcome to "my bar"' + self.assertEqual(self.sub.substitute_xml(s, True), + "'Welcome to \"my bar\"'") + + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): + s = 'Welcome to "Bob\'s Bar"' + self.assertEqual( + self.sub.substitute_xml(s, True), + '"Welcome to "Bob\'s Bar""') + + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): + quoted = 'Welcome to "Bob\'s Bar"' + self.assertEqual(self.sub.substitute_xml(quoted), quoted) + + def test_xml_quoting_handles_angle_brackets(self): + self.assertEqual( + self.sub.substitute_xml("foo"), + "foo<bar>") + + def test_xml_quoting_handles_ampersands(self): + self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") + + def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): + self.assertEqual( + self.sub.substitute_xml("ÁT&T"), + "&Aacute;T&T") + + def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): + self.assertEqual( + self.sub.substitute_xml_containing_entities("ÁT&T"), + "ÁT&T") + + def test_quotes_not_html_substituted(self): + """There's no need to do this except 
inside attribute values.""" + text = 'Bob\'s "bar"' + self.assertEqual(self.sub.substitute_html(text), text) + + +class TestEncodingConversion(SoupTest): + # Test Beautiful Soup's ability to decode and encode from various + # encodings. + + def setUp(self): + super(TestEncodingConversion, self).setUp() + self.unicode_data = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + self.utf8_data = self.unicode_data.encode("utf-8") + # Just so you know what it looks like. + self.assertEqual( + self.utf8_data, + b'Sacr\xc3\xa9 bleu!') + + def test_ascii_in_unicode_out(self): + # ASCII input is converted to Unicode. The original_encoding + # attribute is set. + ascii = b"a" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + self.assertTrue(isinstance(unicode_output, unicode)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii") + + def test_unicode_in_unicode_out(self): + # Unicode input is left alone. The original_encoding attribute + # is not set. + soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.decode(), self.unicode_data) + self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') + self.assertEqual(soup_from_unicode.original_encoding, None) + + def test_utf8_in_unicode_out(self): + # UTF-8 input is converted to Unicode. The original_encoding + # attribute is set. + soup_from_utf8 = self.soup(self.utf8_data) + self.assertEqual(soup_from_utf8.decode(), self.unicode_data) + self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') + + def test_utf8_out(self): + # The internal data structures can be encoded as UTF-8. + soup_from_unicode = self.soup(self.unicode_data) + self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) + + @skipIf( + PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, + "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") + def test_attribute_name_containing_unicode_characters(self): + markup = u'
' + self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) + +class TestUnicodeDammit(unittest.TestCase): + """Standalone tests of Unicode, Dammit.""" + + def test_smart_quotes_to_unicode(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup) + self.assertEqual( + dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") + + def test_smart_quotes_to_xml_entities(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") + self.assertEqual( + dammit.unicode_markup, "‘’“”") + + def test_smart_quotes_to_html_entities(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="html") + self.assertEqual( + dammit.unicode_markup, "‘’“”") + + def test_smart_quotes_to_ascii(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="ascii") + self.assertEqual( + dammit.unicode_markup, """''""""") + + def test_detect_utf8(self): + utf8 = b"\xc3\xa9" + dammit = UnicodeDammit(utf8) + self.assertEqual(dammit.unicode_markup, u'\xe9') + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + + def test_convert_hebrew(self): + hebrew = b"\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') + self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') + + def test_dont_see_smart_quotes_where_there_are_none(self): + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + dammit = UnicodeDammit(utf_8) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) + + def test_ignore_inappropriate_codecs(self): + utf8_data = u"RƤksmƶrgĆ„s".encode("utf-8") + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + + def test_ignore_invalid_codecs(self): + utf8_data = u"RƤksmƶrgĆ„s".encode("utf-8") + for bad_encoding in ['.utf8', '...', 'utF---16.!']: + dammit = UnicodeDammit(utf8_data, [bad_encoding]) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + + def test_detect_html5_style_meta_tag(self): + + for data in ( + b'', + b"", + b"", + b""): + dammit = UnicodeDammit(data, is_html=True) + self.assertEqual( + "euc-jp", dammit.original_encoding) + + def test_last_ditch_entity_replacement(self): + # This is a UTF-8 document that contains bytestrings + # completely incompatible with UTF-8 (ie. encoded with some other + # encoding). + # + # Since there is no consistent encoding for the document, + # Unicode, Dammit will eventually encode the document as UTF-8 + # and encode the incompatible characters as REPLACEMENT + # CHARACTER. + # + # If chardet is installed, it will detect that the document + # can be converted into ISO-8859-1 without errors. This happens + # to be the wrong encoding, but it is a consistent encoding, so the + # code we're testing here won't run. + # + # So we temporarily disable chardet if it's present. 
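A compact sketch of the UnicodeDammit behaviors this suite pins down (the expected values are taken from the assertions above):

```python
# Sketch: the UnicodeDammit API covered by TestUnicodeDammit above.
from bs4.dammit import UnicodeDammit

# Bytes in, unicode out, with the detected encoding recorded.
dammit = UnicodeDammit(b"\xc3\xa9")
dammit.unicode_markup       # u'\xe9'
dammit.original_encoding    # 'utf-8'

# Candidate encodings can be suggested and are tried first.
dammit = UnicodeDammit(b"\xed\xe5\xec\xf9", ["iso-8859-8"])
dammit.original_encoding    # 'iso-8859-8'

# Windows smart quotes can be rewritten while decoding.
dammit = UnicodeDammit(b"\x91\x92\x93\x94", smart_quotes_to="html")
dammit.unicode_markup       # u'&lsquo;&rsquo;&ldquo;&rdquo;'
```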
+ doc = b"""\357\273\277 +\330\250\330\252\330\261 +\310\322\321\220\312\321\355\344""" + chardet = bs4.dammit.chardet_dammit + logging.disable(logging.WARNING) + try: + def noop(str): + return None + bs4.dammit.chardet_dammit = noop + dammit = UnicodeDammit(doc) + self.assertEqual(True, dammit.contains_replacement_characters) + self.assertTrue(u"\ufffd" in dammit.unicode_markup) + + soup = BeautifulSoup(doc, "html.parser") + self.assertTrue(soup.contains_replacement_characters) + finally: + logging.disable(logging.NOTSET) + bs4.dammit.chardet_dammit = chardet + + def test_sniffed_xml_encoding(self): + # A document written in UTF-16LE will be converted by a different + # code path that sniffs the byte order markers. + data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' + dammit = UnicodeDammit(data) + self.assertEqual(u"ƔƩ", dammit.unicode_markup) + self.assertEqual("utf-16le", dammit.original_encoding) + + def test_detwingle(self): + # Here's a UTF8 document. + utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") + + # Here's a Windows-1252 document. + windows_1252 = ( + u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + + # Through some unholy alchemy, they've been stuck together. + doc = utf8 + windows_1252 + utf8 + + # The document can't be turned into UTF-8: + self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") + + # Unicode, Dammit thinks the whole document is Windows-1252, + # and decodes it into "Ć¢ĖœĘ’Ć¢ĖœĘ’Ć¢ĖœĘ’ā€œHi, I like Windows!ā€Ć¢ĖœĘ’Ć¢ĖœĘ’Ć¢ĖœĘ’" + + # But if we run it through fix_embedded_windows_1252, it's fixed: + + fixed = UnicodeDammit.detwingle(doc) + self.assertEqual( + u"ā˜ƒā˜ƒā˜ƒā€œHi, I like Windows!ā€ā˜ƒā˜ƒā˜ƒ", fixed.decode("utf8")) + + def test_detwingle_ignores_multibyte_characters(self): + # Each of these characters has a UTF-8 representation ending + # in \x93. \x93 is a smart quote if interpreted as + # Windows-1252. But our code knows to skip over multibyte + # UTF-8 characters, so they'll survive the process unscathed. + for tricky_unicode_char in ( + u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + ): + input = tricky_unicode_char.encode("utf8") + self.assertTrue(input.endswith(b'\x93')) + output = UnicodeDammit.detwingle(input) + self.assertEqual(output, input) + +class TestNamedspacedAttribute(SoupTest): + + def test_name_may_be_none(self): + a = NamespacedAttribute("xmlns", None) + self.assertEqual(a, "xmlns") + + def test_attribute_is_equivalent_to_colon_separated_string(self): + a = NamespacedAttribute("a", "b") + self.assertEqual("a:b", a) + + def test_attributes_are_equivalent_if_prefix_and_name_identical(self): + a = NamespacedAttribute("a", "b", "c") + b = NamespacedAttribute("a", "b", "c") + self.assertEqual(a, b) + + # The actual namespace is not considered. + c = NamespacedAttribute("a", "b", None) + self.assertEqual(a, c) + + # But name and prefix are important. 
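To make the point in the comment above concrete: NamespacedAttribute (defined in lib/bs4/element.py earlier in this patch) is a unicode subclass, so equality is plain string equality on prefix:name, and the namespace URL rides along only as metadata. A sketch:

```python
# Sketch: NamespacedAttribute equality, as pinned down by the tests above.
from bs4.element import NamespacedAttribute

a = NamespacedAttribute("xml", "lang", "http://www.w3.org/XML/1998/namespace")
a == u"xml:lang"     # True: the object is the string "xml:lang"
a.prefix, a.name     # ('xml', 'lang')
a.namespace          # carried along, but ignored in comparisons
```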
+ d = NamespacedAttribute("a", "z", "c") + self.assertNotEqual(a, d) + + e = NamespacedAttribute("z", "b", "c") + self.assertNotEqual(a, e) + + +class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): + + def test_content_meta_attribute_value(self): + value = CharsetMetaAttributeValue("euc-jp") + self.assertEqual("euc-jp", value) + self.assertEqual("euc-jp", value.original_value) + self.assertEqual("utf8", value.encode("utf8")) + + + def test_content_meta_attribute_value(self): + value = ContentMetaAttributeValue("text/html; charset=euc-jp") + self.assertEqual("text/html; charset=euc-jp", value) + self.assertEqual("text/html; charset=euc-jp", value.original_value) + self.assertEqual("text/html; charset=utf8", value.encode("utf8")) diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py new file mode 100644 index 0000000..2d09f96 --- /dev/null +++ b/lib/bs4/tests/test_tree.py @@ -0,0 +1,1800 @@ +# -*- coding: utf-8 -*- +"""Tests for Beautiful Soup's tree traversal methods. + +The tree traversal methods are the main advantage of using Beautiful +Soup over just using a parser. + +Different parsers will build different Beautiful Soup trees given the +same markup, but all Beautiful Soup trees can be traversed with the +methods tested here. +""" + +import copy +import pickle +import re +import warnings +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry, + HTMLParserTreeBuilder, +) +from bs4.element import ( + CData, + Comment, + Doctype, + NavigableString, + SoupStrainer, + Tag, +) +from bs4.testing import ( + SoupTest, + skipIf, +) + +XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) +LXML_PRESENT = (builder_registry.lookup("lxml") is not None) + +class TreeTest(SoupTest): + + def assertSelects(self, tags, should_match): + """Make sure that the given tags have the correct text. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + self.assertEqual([tag.string for tag in tags], should_match) + + def assertSelectsIDs(self, tags, should_match): + """Make sure that the given tags have the correct IDs. + + This is used in tests that define a bunch of tags, each + containing a single string, and then select certain strings by + some mechanism. + """ + self.assertEqual([tag['id'] for tag in tags], should_match) + + +class TestFind(TreeTest): + """Basic tests of the find() method. + + find() just calls find_all() with limit=1, so it's not tested all + that thouroughly here. + """ + + def test_find_tag(self): + soup = self.soup("1234") + self.assertEqual(soup.find("b").string, "2") + + def test_unicode_text_find(self): + soup = self.soup(u'
<h1>Räksmörgås</h1>')
+        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')
+
+class TestFindAll(TreeTest):
+    """Basic tests of the find_all() method."""
+
+    def test_find_all_text_nodes(self):
+        """You can search the tree for text nodes."""
+        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
+        # Exact match.
+        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
+        # Match any of a number of strings.
+        self.assertEqual(
+            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
+        # Match a regular expression.
+        self.assertEqual(soup.find_all(text=re.compile('.*')),
+                         [u"Foo", u"bar", u'\xbb'])
+        # Match anything.
+        self.assertEqual(soup.find_all(text=True),
+                         [u"Foo", u"bar", u'\xbb'])
+
+    def test_find_all_limit(self):
+        """You can limit the number of items returned by find_all."""
+        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
+        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
+        self.assertSelects(soup.find_all('a', limit=1), ["1"])
+        self.assertSelects(
+            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
+
+        # A limit of 0 means no limit.
+        self.assertSelects(
+            soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
+
+    def test_calling_a_tag_is_calling_findall(self):
+        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
+        self.assertSelects(soup('a', limit=1), ["1"])
+        self.assertSelects(soup.b(id="foo"), ["3"])
+
+    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
+        soup = self.soup("<a></a>")
+        # Create a self-referential list.
+        l = []
+        l.append(l)
+
+        # Without special code in _normalize_search_value, this would cause infinite
+        # recursion.
+        self.assertEqual([], soup.find_all(l))
+
+class TestFindAllBasicNamespaces(TreeTest):
+
+    def test_find_by_namespaced_name(self):
+        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
+        self.assertEqual("4", soup.find("mathml:msqrt").string)
+        self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
+
+
+class TestFindAllByName(TreeTest):
+    """Test ways of finding tags by tag name."""
+
+    def setUp(self):
+        super(TreeTest, self).setUp()
+        self.tree = self.soup("""<a>First tag.</a>
+                                 <b>Second tag.</b>
+                                 <c>Third <a>Nested tag.</a> tag.</c>""")
+
+    def test_find_all_by_tag_name(self):
+        # Find all the <a> tags.
+        self.assertSelects(
+            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])
+
+    def test_find_all_by_name_and_text(self):
+        self.assertSelects(
+            self.tree.find_all('a', text='First tag.'), ['First tag.'])
+
+        self.assertSelects(
+            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])
+
+        self.assertSelects(
+            self.tree.find_all('a', text=re.compile("tag")),
+            ['First tag.', 'Nested tag.'])
+
+
+    def test_find_all_on_non_root_element(self):
+        # You can call find_all on any node, not just the root.
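The suite above is a tour of find_all()'s matching modes; a sketch of the same calls on a toy document (parser choice is left to the registry):

```python
# Sketch: the find_all() matching modes exercised by TestFindAll above.
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup("<a>1</a><a>2</a><b>3</b>")

soup.find_all('a')                    # by tag name
soup.find_all(['a', 'b'])             # by any of several names
soup.find_all(re.compile('^[ab]$'))   # by regular expression
soup.find_all(text=True)              # text nodes instead of tags
soup.find_all('a', limit=1)           # stop after the first match
soup('a')                             # calling the soup/tag is find_all
```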
+ self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) + + def test_calling_element_invokes_find_all(self): + self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_strainer(self): + self.assertSelects( + self.tree.find_all(SoupStrainer('a')), + ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_names(self): + self.assertSelects( + self.tree.find_all(['a', 'b']), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_dict(self): + self.assertSelects( + self.tree.find_all({'a' : True, 'b' : True}), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_re(self): + self.assertSelects( + self.tree.find_all(re.compile('^[ab]$')), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_with_tags_matching_method(self): + # You can define an oracle method that determines whether + # a tag matches the search. + def id_matches_name(tag): + return tag.name == tag.get('id') + + tree = self.soup("""Match 1. + Does not match. + Match 2.""") + + self.assertSelects( + tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) + + +class TestFindAllByAttribute(TreeTest): + + def test_find_all_by_attribute_name(self): + # You can pass in keyword arguments to find_all to search by + # attribute. + tree = self.soup(""" + Matching a. + + Non-matching Matching b.a. + """) + self.assertSelects(tree.find_all(id='first'), + ["Matching a.", "Matching b."]) + + def test_find_all_by_utf8_attribute_value(self): + peace = u"םולש".encode("utf8") + data = u''.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) + self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) + + def test_find_all_by_attribute_dict(self): + # You can pass in a dictionary as the argument 'attrs'. This + # lets you search for attributes like 'name' (a fixed argument + # to find_all) and 'class' (a reserved word in Python.) + tree = self.soup(""" + Name match. + Class match. + Non-match. + A tag called 'name1'. + """) + + # This doesn't do what you want. + self.assertSelects(tree.find_all(name='name1'), + ["A tag called 'name1'."]) + # This does what you want. + self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), + ["Name match."]) + + self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), + ["Class match."]) + + def test_find_all_by_class(self): + tree = self.soup(""" + Class 1. + Class 2. + Class 1. + Class 3 and 4. + """) + + # Passing in the class_ keyword argument will search against + # the 'class' attribute. + self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) + self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) + + # Passing in a string to 'attrs' will also search the CSS class. 
+ self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) + self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) + self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) + + def test_find_by_class_when_multiple_classes_present(self): + tree = self.soup("Found it") + + f = tree.find_all("gar", class_=re.compile("o")) + self.assertSelects(f, ["Found it"]) + + f = tree.find_all("gar", class_=re.compile("a")) + self.assertSelects(f, ["Found it"]) + + # Since the class is not the string "foo bar", but the two + # strings "foo" and "bar", this will not find anything. + f = tree.find_all("gar", class_=re.compile("o b")) + self.assertSelects(f, []) + + def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): + soup = self.soup("Found it") + + self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) + + def big_attribute_value(value): + return len(value) > 3 + + self.assertSelects(soup.find_all("a", big_attribute_value), []) + + def small_attribute_value(value): + return len(value) <= 3 + + self.assertSelects( + soup.find_all("a", small_attribute_value), ["Found it"]) + + def test_find_all_with_string_for_attrs_finds_multiple_classes(self): + soup = self.soup('') + a, a2 = soup.find_all("a") + self.assertEqual([a, a2], soup.find_all("a", "foo")) + self.assertEqual([a], soup.find_all("a", "bar")) + + # If you specify the class as a string that contains a + # space, only that specific value will be found. + self.assertEqual([a], soup.find_all("a", class_="foo bar")) + self.assertEqual([a], soup.find_all("a", "foo bar")) + self.assertEqual([], soup.find_all("a", "bar foo")) + + def test_find_all_by_attribute_soupstrainer(self): + tree = self.soup(""" + Match. + Non-match.""") + + strainer = SoupStrainer(attrs={'id' : 'first'}) + self.assertSelects(tree.find_all(strainer), ['Match.']) + + def test_find_all_with_missing_atribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that do not have that attribute set. + tree = self.soup("""ID present. + No ID present. + ID is empty.""") + self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) + + def test_find_all_with_defined_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that have that attribute set to any value. + tree = self.soup("""ID present. + No ID present. + ID is empty.""") + self.assertSelects( + tree.find_all(id=True), ["ID present.", "ID is empty."]) + + def test_find_all_with_numeric_attribute(self): + # If you search for a number, it's treated as a string. + tree = self.soup("""Unquoted attribute. + Quoted attribute.""") + + expected = ["Unquoted attribute.", "Quoted attribute."] + self.assertSelects(tree.find_all(id=1), expected) + self.assertSelects(tree.find_all(id="1"), expected) + + def test_find_all_with_list_attribute_values(self): + # You can pass a list of attribute values instead of just one, + # and you'll get tags that match any of the values. + tree = self.soup("""1 + 2 + 3 + No ID.""") + self.assertSelects(tree.find_all(id=["1", "3", "4"]), + ["1", "3"]) + + def test_find_all_with_regular_expression_attribute_value(self): + # You can pass a regular expression as an attribute value, and + # you'll get tags whose values for that attribute match the + # regular expression. + tree = self.soup("""One a. + Two as. + Mixed as and bs. + One b. 
+ No ID.""") + + self.assertSelects(tree.find_all(id=re.compile("^a+$")), + ["One a.", "Two as."]) + + def test_find_by_name_and_containing_string(self): + soup = self.soup("foobarfoo") + a = soup.a + + self.assertEqual([a], soup.find_all("a", text="foo")) + self.assertEqual([], soup.find_all("a", text="bar")) + self.assertEqual([], soup.find_all("a", text="bar")) + + def test_find_by_name_and_containing_string_when_string_is_buried(self): + soup = self.soup("foofoo") + self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) + + def test_find_by_attribute_and_containing_string(self): + soup = self.soup('foofoo') + a = soup.a + + self.assertEqual([a], soup.find_all(id=2, text="foo")) + self.assertEqual([], soup.find_all(id=1, text="bar")) + + + + +class TestIndex(TreeTest): + """Test Tag.index""" + def test_index(self): + tree = self.soup("""
+ Identical + Not identical + Identical + + Identical with child + Also not identical + Identical with child +
""") + div = tree.div + for i, element in enumerate(div.contents): + self.assertEqual(i, div.index(element)) + self.assertRaises(ValueError, tree.index, 1) + + +class TestParentOperations(TreeTest): + """Test navigation and searching through an element's parents.""" + + def setUp(self): + super(TestParentOperations, self).setUp() + self.tree = self.soup('''
<ul id="top">
+                            <ul id="middle">
+                              <ul id="bottom">
+                                <b>Start here</b>
+                              </ul>
      ''') + self.start = self.tree.b + + + def test_parent(self): + self.assertEqual(self.start.parent['id'], 'bottom') + self.assertEqual(self.start.parent.parent['id'], 'middle') + self.assertEqual(self.start.parent.parent.parent['id'], 'top') + + def test_parent_of_top_tag_is_soup_object(self): + top_tag = self.tree.contents[0] + self.assertEqual(top_tag.parent, self.tree) + + def test_soup_object_has_no_parent(self): + self.assertEqual(None, self.tree.parent) + + def test_find_parents(self): + self.assertSelectsIDs( + self.start.find_parents('ul'), ['bottom', 'middle', 'top']) + self.assertSelectsIDs( + self.start.find_parents('ul', id="middle"), ['middle']) + + def test_find_parent(self): + self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') + self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') + + def test_parent_of_text_element(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.parent.name, 'b') + + def test_text_element_find_parent(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.find_parent('ul')['id'], 'bottom') + + def test_parent_generator(self): + parents = [parent['id'] for parent in self.start.parents + if parent is not None and 'id' in parent.attrs] + self.assertEqual(parents, ['bottom', 'middle', 'top']) + + +class ProximityTest(TreeTest): + + def setUp(self): + super(TreeTest, self).setUp() + self.tree = self.soup( + 'OneTwoThree') + + +class TestNextOperations(ProximityTest): + + def setUp(self): + super(TestNextOperations, self).setUp() + self.start = self.tree.b + + def test_next(self): + self.assertEqual(self.start.next_element, "One") + self.assertEqual(self.start.next_element.next_element['id'], "2") + + def test_next_of_last_item_is_none(self): + last = self.tree.find(text="Three") + self.assertEqual(last.next_element, None) + + def test_next_of_root_is_none(self): + # The document root is outside the next/previous chain. + self.assertEqual(self.tree.next_element, None) + + def test_find_all_next(self): + self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) + self.start.find_all_next(id=3) + self.assertSelects(self.start.find_all_next(id=3), ["Three"]) + + def test_find_next(self): + self.assertEqual(self.start.find_next('b')['id'], '2') + self.assertEqual(self.start.find_next(text="Three"), "Three") + + def test_find_next_for_text_element(self): + text = self.tree.find(text="One") + self.assertEqual(text.find_next("b").string, "Two") + self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) + + def test_next_generator(self): + start = self.tree.find(text="Two") + successors = [node for node in start.next_elements] + # There are two successors: the final tag and its text contents. + tag, contents = successors + self.assertEqual(tag['id'], '3') + self.assertEqual(contents, "Three") + +class TestPreviousOperations(ProximityTest): + + def setUp(self): + super(TestPreviousOperations, self).setUp() + self.end = self.tree.find(text="Three") + + def test_previous(self): + self.assertEqual(self.end.previous_element['id'], "3") + self.assertEqual(self.end.previous_element.previous_element, "Two") + + def test_previous_of_first_item_is_none(self): + first = self.tree.find('html') + self.assertEqual(first.previous_element, None) + + def test_previous_of_root_is_none(self): + # The document root is outside the next/previous chain. + # XXX This is broken! 
+ #self.assertEqual(self.tree.previous_element, None) + pass + + def test_find_all_previous(self): + # The tag containing the "Three" node is the predecessor + # of the "Three" node itself, which is why "Three" shows up + # here. + self.assertSelects( + self.end.find_all_previous('b'), ["Three", "Two", "One"]) + self.assertSelects(self.end.find_all_previous(id=1), ["One"]) + + def test_find_previous(self): + self.assertEqual(self.end.find_previous('b')['id'], '3') + self.assertEqual(self.end.find_previous(text="One"), "One") + + def test_find_previous_for_text_element(self): + text = self.tree.find(text="Three") + self.assertEqual(text.find_previous("b").string, "Three") + self.assertSelects( + text.find_all_previous("b"), ["Three", "Two", "One"]) + + def test_previous_generator(self): + start = self.tree.find(text="One") + predecessors = [node for node in start.previous_elements] + + # There are four predecessors: the tag containing "One" + # the tag, the tag, and the tag. + b, body, head, html = predecessors + self.assertEqual(b['id'], '1') + self.assertEqual(body.name, "body") + self.assertEqual(head.name, "head") + self.assertEqual(html.name, "html") + + +class SiblingTest(TreeTest): + + def setUp(self): + super(SiblingTest, self).setUp() + markup = ''' + + + + + + + + + + + ''' + # All that whitespace looks good but makes the tests more + # difficult. Get rid of it. + markup = re.compile("\n\s*").sub("", markup) + self.tree = self.soup(markup) + + +class TestNextSibling(SiblingTest): + + def setUp(self): + super(TestNextSibling, self).setUp() + self.start = self.tree.find(id="1") + + def test_next_sibling_of_root_is_none(self): + self.assertEqual(self.tree.next_sibling, None) + + def test_next_sibling(self): + self.assertEqual(self.start.next_sibling['id'], '2') + self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') + + # Note the difference between next_sibling and next_element. + self.assertEqual(self.start.next_element['id'], '1.1') + + def test_next_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.next_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.next_sibling, None) + + last_span = self.tree.find(id="4") + self.assertEqual(last_span.next_sibling, None) + + def test_find_next_sibling(self): + self.assertEqual(self.start.find_next_sibling('span')['id'], '2') + + def test_next_siblings(self): + self.assertSelectsIDs(self.start.find_next_siblings("span"), + ['2', '3', '4']) + + self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) + + def test_next_sibling_for_text_element(self): + soup = self.soup("Foobarbaz") + start = soup.find(text="Foo") + self.assertEqual(start.next_sibling.name, 'b') + self.assertEqual(start.next_sibling.next_sibling, 'baz') + + self.assertSelects(start.find_next_siblings('b'), ['bar']) + self.assertEqual(start.find_next_sibling(text="baz"), "baz") + self.assertEqual(start.find_next_sibling(text="nonesuch"), None) + + +class TestPreviousSibling(SiblingTest): + + def setUp(self): + super(TestPreviousSibling, self).setUp() + self.end = self.tree.find(id="4") + + def test_previous_sibling_of_root_is_none(self): + self.assertEqual(self.tree.previous_sibling, None) + + def test_previous_sibling(self): + self.assertEqual(self.end.previous_sibling['id'], '3') + self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') + + # Note the difference between previous_sibling and previous_element. 
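The distinction flagged in the comment above (and its next_* mirror earlier) deserves one concrete illustration: *_sibling steps between nodes at the same level, while *_element follows flat document order, descending into a tag's children. A sketch:

```python
# Sketch: sibling navigation vs. document-order navigation.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<span id="1"><b>child</b></span><span id="2"></span>')
first = soup.find(id="1")

first.next_sibling   # <span id="2">: the next node at the same level
first.next_element   # <b>child</b>: the next node in document order
```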
+ self.assertEqual(self.end.previous_element['id'], '3.1') + + def test_previous_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.previous_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.previous_sibling, None) + + first_span = self.tree.find(id="1") + self.assertEqual(first_span.previous_sibling, None) + + def test_find_previous_sibling(self): + self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') + + def test_previous_siblings(self): + self.assertSelectsIDs(self.end.find_previous_siblings("span"), + ['3', '2', '1']) + + self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) + + def test_previous_sibling_for_text_element(self): + soup = self.soup("Foobarbaz") + start = soup.find(text="baz") + self.assertEqual(start.previous_sibling.name, 'b') + self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') + + self.assertSelects(start.find_previous_siblings('b'), ['bar']) + self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") + self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) + + +class TestTagCreation(SoupTest): + """Test the ability to create new tags.""" + def test_new_tag(self): + soup = self.soup("") + new_tag = soup.new_tag("foo", bar="baz") + self.assertTrue(isinstance(new_tag, Tag)) + self.assertEqual("foo", new_tag.name) + self.assertEqual(dict(bar="baz"), new_tag.attrs) + self.assertEqual(None, new_tag.parent) + + def test_tag_inherits_self_closing_rules_from_builder(self): + if XML_BUILDER_PRESENT: + xml_soup = BeautifulSoup("", "xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the
<br/> and <p/> tags are empty-element, just because
+            # they have no contents.
+            self.assertEqual(b"<br/>", xml_br.encode())
+            self.assertEqual(b"<p/>", xml_p.encode())
+
+        html_soup = BeautifulSoup("", "html")
+        html_br = html_soup.new_tag("br")
+        html_p = html_soup.new_tag("p")
+
+        # The HTML builder uses HTML's rules about which tags are
+        # empty-element tags, and the new tags reflect these rules.
+        self.assertEqual(b"<br/>", html_br.encode())
+        self.assertEqual(b"<p></p>
      ", html_p.encode()) + + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, NavigableString)) + + def test_new_string_can_create_navigablestring_subclass(self): + soup = self.soup("") + s = soup.new_string("foo", Comment) + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, Comment)) + +class TestTreeModification(SoupTest): + + def test_attribute_modification(self): + soup = self.soup('') + soup.a['id'] = 2 + self.assertEqual(soup.decode(), self.document_for('')) + del(soup.a['id']) + self.assertEqual(soup.decode(), self.document_for('')) + soup.a['id2'] = 'foo' + self.assertEqual(soup.decode(), self.document_for('')) + + def test_new_tag_creation(self): + builder = builder_registry.lookup('html')() + soup = self.soup("", builder=builder) + a = Tag(soup, builder, 'a') + ol = Tag(soup, builder, 'ol') + a['href'] = 'http://foo.com/' + soup.body.insert(0, a) + soup.body.insert(1, ol) + self.assertEqual( + soup.body.encode(), + b'
        ') + + def test_append_to_contents_moves_tag(self): + doc = """

        Don't leave me here.

        +

        Don\'t leave!

        """ + soup = self.soup(doc) + second_para = soup.find(id='2') + bold = soup.b + + # Move the tag to the end of the second paragraph. + soup.find(id='2').append(soup.b) + + # The tag is now a child of the second paragraph. + self.assertEqual(bold.parent, second_para) + + self.assertEqual( + soup.decode(), self.document_for( + '

        Don\'t leave me .

        \n' + '

        Don\'t leave!here

        ')) + + def test_replace_with_returns_thing_that_was_replaced(self): + text = "" + soup = self.soup(text) + a = soup.a + new_a = a.replace_with(soup.c) + self.assertEqual(a, new_a) + + def test_unwrap_returns_thing_that_was_replaced(self): + text = "" + soup = self.soup(text) + a = soup.a + new_a = a.unwrap() + self.assertEqual(a, new_a) + + def test_replace_tag_with_itself(self): + text = "Foo" + soup = self.soup(text) + c = soup.c + soup.c.replace_with(c) + self.assertEqual(soup.decode(), self.document_for(text)) + + def test_replace_tag_with_its_parent_raises_exception(self): + text = "" + soup = self.soup(text) + self.assertRaises(ValueError, soup.b.replace_with, soup.a) + + def test_insert_tag_into_itself_raises_exception(self): + text = "" + soup = self.soup(text) + self.assertRaises(ValueError, soup.a.insert, 0, soup.a) + + def test_replace_with_maintains_next_element_throughout(self): + soup = self.soup('

        onethree

+        a = soup.a
+        b = a.contents[0]
+        # Make it so the <a> tag has two text children.
+        a.insert(1, "two")
+
+        # Now replace each one with the empty string.
+        left, right = a.contents
+        left.replaceWith('')
+        right.replaceWith('')
+
+        # The <b> tag is still connected to the tree.
+        self.assertEqual("three", soup.b.string)
+
+    def test_replace_final_node(self):
+        soup = self.soup("<b>Argh!</b>")
+        soup.find(text="Argh!").replace_with("Hooray!")
+        new_text = soup.find(text="Hooray!")
+        b = soup.b
+        self.assertEqual(new_text.previous_element, b)
+        self.assertEqual(new_text.parent, b)
+        self.assertEqual(new_text.previous_element.next_element, new_text)
+        self.assertEqual(new_text.next_element, None)
+
+    def test_consecutive_text_nodes(self):
+        # A builder should never create two consecutive text nodes,
+        # but if you insert one next to another, Beautiful Soup will
+        # handle it correctly.
+        soup = self.soup("<a><b>Argh!</b><c></c></a>")
+        soup.b.insert(1, "Hooray!")
+
+        self.assertEqual(
+            soup.decode(), self.document_for(
+                "<a><b>Argh!Hooray!</b><c></c></a>"))
+
+        new_text = soup.find(text="Hooray!")
+        self.assertEqual(new_text.previous_element, "Argh!")
+        self.assertEqual(new_text.previous_element.next_element, new_text)
+
+        self.assertEqual(new_text.previous_sibling, "Argh!")
+        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
+
+        self.assertEqual(new_text.next_sibling, None)
+        self.assertEqual(new_text.next_element, soup.c)
+
+    def test_insert_string(self):
+        soup = self.soup("<a></a>")
+        soup.a.insert(0, "bar")
+        soup.a.insert(0, "foo")
+        # The strings were added to the tag.
+        self.assertEqual(["foo", "bar"], soup.a.contents)
+        # And they were converted to NavigableStrings.
+        self.assertEqual(soup.a.contents[0].next_element, "bar")
+
+    def test_insert_tag(self):
+        builder = self.default_builder
+        soup = self.soup(
+            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
+        magic_tag = Tag(soup, builder, 'magictag')
+        magic_tag.insert(0, "the")
+        soup.a.insert(1, magic_tag)
+
+        self.assertEqual(
+            soup.decode(), self.document_for(
+                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
+
+        # Make sure all the relationships are hooked up correctly.
+        b_tag = soup.b
+        self.assertEqual(b_tag.next_sibling, magic_tag)
+        self.assertEqual(magic_tag.previous_sibling, b_tag)
+
+        find = b_tag.find(text="Find")
+        self.assertEqual(find.next_element, magic_tag)
+        self.assertEqual(magic_tag.previous_element, find)
+
+        c_tag = soup.c
+        self.assertEqual(magic_tag.next_sibling, c_tag)
+        self.assertEqual(c_tag.previous_sibling, magic_tag)
+
+        the = magic_tag.find(text="the")
+        self.assertEqual(the.parent, magic_tag)
+        self.assertEqual(the.next_element, c_tag)
+        self.assertEqual(c_tag.previous_element, the)
+
+    def test_append_child_thats_already_at_the_end(self):
+        data = "<a><b></b></a>"
+        soup = self.soup(data)
+        soup.a.append(soup.b)
+        self.assertEqual(data, soup.decode())
+
+    def test_move_tag_to_beginning_of_parent(self):
+        data = "<a><b></b><c></c><d></d></a>"
+        soup = self.soup(data)
+        soup.a.insert(0, soup.d)
+        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
+
+    def test_insert_works_on_empty_element_tag(self):
+        # This is a little strange, since most HTML parsers don't allow
+        # markup like this to come through. But in general, we don't
+        # know what the parser would or wouldn't have allowed, so
+        # I'm letting this succeed for now.
+        soup = self.soup("<br/>")
+        soup.br.insert(1, "Contents")
+        self.assertEqual(str(soup.br), "<br>Contents</br>")
        ") + + def test_insert_before(self): + soup = self.soup("foobar") + soup.b.insert_before("BAZ") + soup.a.insert_before("QUUX") + self.assertEqual( + soup.decode(), self.document_for("QUUXfooBAZbar")) + + soup.a.insert_before(soup.b) + self.assertEqual( + soup.decode(), self.document_for("QUUXbarfooBAZ")) + + def test_insert_after(self): + soup = self.soup("foobar") + soup.b.insert_after("BAZ") + soup.a.insert_after("QUUX") + self.assertEqual( + soup.decode(), self.document_for("fooQUUXbarBAZ")) + soup.b.insert_after(soup.a) + self.assertEqual( + soup.decode(), self.document_for("QUUXbarfooBAZ")) + + def test_insert_after_raises_exception_if_after_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_after, tag) + self.assertRaises(NotImplementedError, soup.insert_after, tag) + self.assertRaises(ValueError, tag.insert_after, tag) + + def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_before, tag) + self.assertRaises(NotImplementedError, soup.insert_before, tag) + self.assertRaises(ValueError, tag.insert_before, tag) + + def test_replace_with(self): + soup = self.soup( + "

        There's no business like show business

        ") + no, show = soup.find_all('b') + show.replace_with(no) + self.assertEqual( + soup.decode(), + self.document_for( + "

        There's business like no business

        ")) + + self.assertEqual(show.parent, None) + self.assertEqual(no.parent, soup.p) + self.assertEqual(no.next_element, "no") + self.assertEqual(no.next_sibling, " business") + + def test_replace_first_child(self): + data = "" + soup = self.soup(data) + soup.b.replace_with(soup.c) + self.assertEqual("", soup.decode()) + + def test_replace_last_child(self): + data = "" + soup = self.soup(data) + soup.c.replace_with(soup.b) + self.assertEqual("", soup.decode()) + + def test_nested_tag_replace_with(self): + soup = self.soup( + """Wereservetherighttorefuseservice""") + + # Replace the entire tag and its contents ("reserve the + # right") with the tag ("refuse"). + remove_tag = soup.b + move_tag = soup.f + remove_tag.replace_with(move_tag) + + self.assertEqual( + soup.decode(), self.document_for( + "Werefusetoservice")) + + # The tag is now an orphan. + self.assertEqual(remove_tag.parent, None) + self.assertEqual(remove_tag.find(text="right").next_element, None) + self.assertEqual(remove_tag.previous_element, None) + self.assertEqual(remove_tag.next_sibling, None) + self.assertEqual(remove_tag.previous_sibling, None) + + # The tag is now connected to the tag. + self.assertEqual(move_tag.parent, soup.a) + self.assertEqual(move_tag.previous_element, "We") + self.assertEqual(move_tag.next_element.next_element, soup.e) + self.assertEqual(move_tag.next_sibling, None) + + # The gap where the tag used to be has been mended, and + # the word "to" is now connected to the tag. + to_text = soup.find(text="to") + g_tag = soup.g + self.assertEqual(to_text.next_element, g_tag) + self.assertEqual(to_text.next_sibling, g_tag) + self.assertEqual(g_tag.previous_element, to_text) + self.assertEqual(g_tag.previous_sibling, to_text) + + def test_unwrap(self): + tree = self.soup(""" +

        Unneeded formatting is unneeded

        + """) + tree.em.unwrap() + self.assertEqual(tree.em, None) + self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") + + def test_wrap(self): + soup = self.soup("I wish I was bold.") + value = soup.string.wrap(soup.new_tag("b")) + self.assertEqual(value.decode(), "I wish I was bold.") + self.assertEqual( + soup.decode(), self.document_for("I wish I was bold.")) + + def test_wrap_extracts_tag_from_elsewhere(self): + soup = self.soup("I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual( + soup.decode(), self.document_for("I wish I was bold.")) + + def test_wrap_puts_new_contents_at_the_end(self): + soup = self.soup("I like being bold.I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual(2, len(soup.b.contents)) + self.assertEqual( + soup.decode(), self.document_for( + "I like being bold.I wish I was bold.")) + + def test_extract(self): + soup = self.soup( + 'Some content. More content.') + + self.assertEqual(len(soup.body.contents), 3) + extracted = soup.find(id="nav").extract() + + self.assertEqual( + soup.decode(), "Some content. More content.") + self.assertEqual(extracted.decode(), '') + + # The extracted tag is now an orphan. + self.assertEqual(len(soup.body.contents), 2) + self.assertEqual(extracted.parent, None) + self.assertEqual(extracted.previous_element, None) + self.assertEqual(extracted.next_element.next_element, None) + + # The gap where the extracted tag used to be has been mended. + content_1 = soup.find(text="Some content. ") + content_2 = soup.find(text=" More content.") + self.assertEqual(content_1.next_element, content_2) + self.assertEqual(content_1.next_sibling, content_2) + self.assertEqual(content_2.previous_element, content_1) + self.assertEqual(content_2.previous_sibling, content_1) + + def test_extract_distinguishes_between_identical_strings(self): + soup = self.soup("
        foobar") + foo_1 = soup.a.string + bar_1 = soup.b.string + foo_2 = soup.new_string("foo") + bar_2 = soup.new_string("bar") + soup.a.append(foo_2) + soup.b.append(bar_2) + + # Now there are two identical strings in the tag, and two + # in the tag. Let's remove the first "foo" and the second + # "bar". + foo_1.extract() + bar_2.extract() + self.assertEqual(foo_2, soup.a.string) + self.assertEqual(bar_2, soup.b.string) + + def test_clear(self): + """Tag.clear()""" + soup = self.soup("

        String Italicized and another

        ") + # clear using extract() + a = soup.a + soup.p.clear() + self.assertEqual(len(soup.p.contents), 0) + self.assertTrue(hasattr(a, "contents")) + + # clear using decompose() + em = a.em + a.clear(decompose=True) + self.assertEqual(0, len(em.contents)) + + def test_string_set(self): + """Tag.string = 'string'""" + soup = self.soup(" ") + soup.a.string = "foo" + self.assertEqual(soup.a.contents, ["foo"]) + soup.b.string = "bar" + self.assertEqual(soup.b.contents, ["bar"]) + + def test_string_set_does_not_affect_original_string(self): + soup = self.soup("foobar") + soup.b.string = soup.c.string + self.assertEqual(soup.a.encode(), b"barbar") + + def test_set_string_preserves_class_of_string(self): + soup = self.soup("") + cdata = CData("foo") + soup.a.string = cdata + self.assertTrue(isinstance(soup.a.string, CData)) + +class TestElementObjects(SoupTest): + """Test various features of element objects.""" + + def test_len(self): + """The length of an element is its number of children.""" + soup = self.soup("123") + + # The BeautifulSoup object itself contains one element: the + # tag. + self.assertEqual(len(soup.contents), 1) + self.assertEqual(len(soup), 1) + + # The tag contains three elements: the text node "1", the + # tag, and the text node "3". + self.assertEqual(len(soup.top), 3) + self.assertEqual(len(soup.top.contents), 3) + + def test_member_access_invokes_find(self): + """Accessing a Python member .foo invokes find('foo')""" + soup = self.soup('') + self.assertEqual(soup.b, soup.find('b')) + self.assertEqual(soup.b.i, soup.find('b').find('i')) + self.assertEqual(soup.a, None) + + def test_deprecated_member_access(self): + soup = self.soup('') + with warnings.catch_warnings(record=True) as w: + tag = soup.bTag + self.assertEqual(soup.b, tag) + self.assertEqual( + '.bTag is deprecated, use .find("b") instead.', + str(w[0].message)) + + def test_has_attr(self): + """has_attr() checks for the presence of an attribute. + + Please note note: has_attr() is different from + __in__. has_attr() checks the tag's attributes and __in__ + checks the tag's chidlren. + """ + soup = self.soup("") + self.assertTrue(soup.foo.has_attr('attr')) + self.assertFalse(soup.foo.has_attr('attr2')) + + + def test_attributes_come_out_in_alphabetical_order(self): + markup = '' + self.assertSoupEquals(markup, '') + + def test_string(self): + # A tag that contains only a text node makes that node + # available as .string. + soup = self.soup("foo") + self.assertEqual(soup.b.string, 'foo') + + def test_empty_tag_has_no_string(self): + # A tag with no children has no .stirng. + soup = self.soup("") + self.assertEqual(soup.b.string, None) + + def test_tag_with_multiple_children_has_no_string(self): + # A tag with no children has no .string. + soup = self.soup("foo") + self.assertEqual(soup.b.string, None) + + soup = self.soup("foobar
        ") + self.assertEqual(soup.b.string, None) + + # Even if all the children are strings, due to trickery, + # it won't work--but this would be a good optimization. + soup = self.soup("foo
        ") + soup.a.insert(1, "bar") + self.assertEqual(soup.a.string, None) + + def test_tag_with_recursive_string_has_string(self): + # A tag with a single child which has a .string inherits that + # .string. + soup = self.soup("foo") + self.assertEqual(soup.a.string, "foo") + self.assertEqual(soup.string, "foo") + + def test_lack_of_string(self): + """Only a tag containing a single text node has a .string.""" + soup = self.soup("feo") + self.assertFalse(soup.b.string) + + soup = self.soup("") + self.assertFalse(soup.b.string) + + def test_all_text(self): + """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" + soup = self.soup("ar t ") + self.assertEqual(soup.a.text, "ar t ") + self.assertEqual(soup.a.get_text(strip=True), "art") + self.assertEqual(soup.a.get_text(","), "a,r, , t ") + self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") + + def test_get_text_ignores_comments(self): + soup = self.soup("foobar") + self.assertEqual(soup.get_text(), "foobar") + + self.assertEqual( + soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") + self.assertEqual( + soup.get_text(types=None), "fooIGNOREbar") + + def test_all_strings_ignores_comments(self): + soup = self.soup("foobar") + self.assertEqual(['foo', 'bar'], list(soup.strings)) + +class TestCDAtaListAttributes(SoupTest): + + """Testing cdata-list attributes like 'class'. + """ + def test_single_value_becomes_list(self): + soup = self.soup("") + self.assertEqual(["foo"],soup.a['class']) + + def test_multiple_values_becomes_list(self): + soup = self.soup("") + self.assertEqual(["foo", "bar"], soup.a['class']) + + def test_multiple_values_separated_by_weird_whitespace(self): + soup = self.soup("") + self.assertEqual(["foo", "bar", "baz"],soup.a['class']) + + def test_attributes_joined_into_string_on_output(self): + soup = self.soup("") + self.assertEqual(b'', soup.a.encode()) + + def test_accept_charset(self): + soup = self.soup('
+        self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
+
+    def test_cdata_attribute_applying_only_to_one_tag(self):
+        data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
+        soup = self.soup(data)
+        # We saw in another test that accept-charset is a cdata-list
+        # attribute for the <form> tag. But it's not a cdata-list
+        # attribute for any other tag.
+        self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
+
+
+class TestPersistence(SoupTest):
+    "Testing features like pickle and deepcopy."
+
+    def setUp(self):
+        super(TestPersistence, self).setUp()
+        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+"http://www.w3.org/TR/REC-html40/transitional.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
+<link rev="made" href="mailto:leonardr@segfault.org">
+<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
+<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
+<meta name="author" content="Leonard Richardson">
+</head>
+<body>
+<a href="foo">foo</a>
+<a href="foo"><b>bar</b></a>
+</body>
+</html>"""
+        self.tree = self.soup(self.page)
+
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        dumped = pickle.dumps(self.tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), self.tree.decode())
+
+    def test_deepcopy_identity(self):
+        # Making a deepcopy of a tree yields an identical tree.
+        copied = copy.deepcopy(self.tree)
+        self.assertEqual(copied.decode(), self.tree.decode())
+
+    def test_unicode_pickle(self):
+        # A tree containing Unicode characters can be pickled.
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.decode(), soup.decode())
+
+
+class TestSubstitutions(SoupTest):
+
+    def test_default_formatter_is_minimal(self):
+        markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_html(self):
+        markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html")
+        self.assertEqual(
+            decoded,
+            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
+    def test_formatter_minimal(self):
+        markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_null(self):
+        markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=None)
+        # Neither the angle brackets nor the e-with-acute are converted.
+        # This is not valid HTML, but it's what the user wanted.
+        self.assertEqual(decoded,
+                          self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+
+    def test_formatter_custom(self):
+        markup = u"<b><foo></b><b>bar</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter = lambda x: x.upper())
+        # Instead of normal entity conversion code, the custom
+        # callable is called on every string.
+        self.assertEqual(
+            decoded,
+            self.document_for(u"<b><FOO></b><b>BAR</b>"))
+
+    def test_formatter_is_run_on_attribute_values(self):
+        markup = u'<a href="http://a.com?a=b&c=\N{LATIN SMALL LETTER E WITH ACUTE}">e</a>'
+        soup = self.soup(markup)
+        a = soup.a
+
+        expect_minimal = u'<a href="http://a.com?a=b&amp;c=\N{LATIN SMALL LETTER E WITH ACUTE}">e</a>'
+
+        self.assertEqual(expect_minimal, a.decode())
+        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
+
+        expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        self.assertEqual(expect_html, a.decode(formatter="html"))
+
+        self.assertEqual(markup, a.decode(formatter=None))
+        expect_upper = u'<a href="HTTP://A.COM?A=B&C=\N{LATIN CAPITAL LETTER E WITH ACUTE}">E</a>'
+        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
+
+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc).encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc).encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        self.assertEqual(
+            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
+            soup.div.prettify())
+
+    def test_prettify_accepts_formatter(self):
+        soup = BeautifulSoup("<html><body>foo</body></html>")
+        pretty = soup.prettify(formatter = lambda x: x.upper())
+        self.assertTrue("FOO" in pretty)
+
+    def test_prettify_outputs_unicode_by_default(self):
+        soup = self.soup("<a></a>")
+        self.assertEqual(unicode, type(soup.prettify()))
+
+    def test_prettify_can_encode_data(self):
+        soup = self.soup("<a></a>")
+        self.assertEqual(bytes, type(soup.prettify("utf-8")))
+
+    def test_html_entity_substitution_off_by_default(self):
+        markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        soup = self.soup(markup)
+        encoded = soup.b.encode("utf-8")
+        self.assertEqual(encoded, markup.encode('utf-8'))
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset appears unchanged.
+        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        self.assertTrue(b"charset=utf-8" in utf_8)
+
+        euc_jp = soup.encode("euc_jp")
+        self.assertTrue(b"charset=euc_jp" in euc_jp)
+
+        shift_jis = soup.encode("shift-jis")
+        self.assertTrue(b"charset=shift-jis" in shift_jis)
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        self.assertTrue("charset=utf-16" in utf_16_u)
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = ('<head><meta content="text/html; charset=x-sjis" '
+                  'http-equiv="Content-type"/></head><pre>foo</pre>')
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer('pre')
+        soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(soup.contents[0].name, 'pre')
+
+class TestEncoding(SoupTest):
+    """Test the ability to encode objects into strings."""
+
+    def test_unicode_string_can_be_encoded(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(soup.b.string.encode("utf-8"),
+                          u"\N{SNOWMAN}".encode("utf-8"))
+
+    def test_tag_containing_unicode_string_can_be_encoded(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            soup.b.encode("utf-8"), html.encode("utf-8"))
+
+    def test_encoding_substitutes_unrecognized_characters_by_default(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
+
+    def test_encoding_can_be_made_strict(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertRaises(
+            UnicodeEncodeError, soup.encode, "ascii", errors="strict")
+
+    def test_decode_contents(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
+
+    def test_encode_contents(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
+                encoding="utf8"))
+
+    def test_deprecated_renderContents(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
+
+class TestNavigableStringSubclasses(SoupTest):
+
+    def test_cdata(self):
+        # None of the current builders turn CDATA sections into CData
+        # objects, but you can create them manually.
+        soup = self.soup("")
+        cdata = CData("foo")
+        soup.insert(1, cdata)
+        self.assertEqual(str(soup), "<![CDATA[foo]]>")
+        self.assertEqual(soup.find(text="foo"), "foo")
+        self.assertEqual(soup.contents[0], "foo")
+
+    def test_cdata_is_never_formatted(self):
+        """Text inside a CData object is passed into the formatter.
+
+        But the return value is ignored.
+        """
+
+        self.count = 0
+        def increment(*args):
+            self.count += 1
+            return "BITTER FAILURE"
+
+        soup = self.soup("")
+        cdata = CData("<><><>")
+        soup.insert(1, cdata)
+        self.assertEqual(
+            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
+        self.assertEqual(1, self.count)
+
+    def test_doctype_ends_in_newline(self):
+        # Unlike other NavigableString subclasses, a DOCTYPE always ends
+        # in a newline.
+        doctype = Doctype("foo")
+        soup = self.soup("")
+        soup.insert(1, doctype)
+        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
+
+
+class TestSoupSelector(TreeTest):
+
+    HTML = """
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>The title</title>
+<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
+</head>
+<body>
+
+<div id="main">
+<div id="inner">
+<h1 id="header1">An H1</h1>
+<p>Some text</p>
+<p class="onep" id="p1">Some more text</p>
+<h2 id="header2">An H2</h2>
+<p class="class1 class2 class3" id="pmulti">Another</p>
+<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
+<h2 id="header3">Another H2</h2>
+<a id="me" href="http://simonwillison.net/" rel="me">me</a>
+<span class="s1">
+<a href="#" id="s1a1">span1a1</a>
+<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
+<span class="span2">
+<a href="#" id="s2a1">span2a1</a>
+</span>
+<span class="span3"></span>
+</span>
+</div>
+<p lang="en" id="lang-en">English</p>
+<p lang="en-gb" id="lang-en-gb">English UK</p>
+<p lang="en-us" id="lang-en-us">English US</p>
+<p lang="fr" id="lang-fr">French</p>
+</div>
+
+<div id="footer">
+</div>
+</body>
+</html>
+"""
+
+    def setUp(self):
+        self.soup = BeautifulSoup(self.HTML)
+
+    def assertSelects(self, selector, expected_ids):
+        el_ids = [el['id'] for el in self.soup.select(selector)]
+        el_ids.sort()
+        expected_ids.sort()
+        self.assertEqual(expected_ids, el_ids,
+            "Selector %s, expected [%s], got [%s]" % (
+                selector, ', '.join(expected_ids), ', '.join(el_ids)
+            )
+        )
+
+    assertSelect = assertSelects
+
+    def assertSelectMultiple(self, *tests):
+        for selector, expected_ids in tests:
+            self.assertSelect(selector, expected_ids)
+
+    def test_one_tag_one(self):
+        els = self.soup.select('title')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].name, 'title')
+        self.assertEqual(els[0].contents, [u'The title'])
+
+    def test_one_tag_many(self):
+        els = self.soup.select('div')
+        self.assertEqual(len(els), 3)
+        for div in els:
+            self.assertEqual(div.name, 'div')
+
+    def test_tag_in_tag_one(self):
+        els = self.soup.select('div div')
+        self.assertSelects('div div', ['inner'])
+
+    def test_tag_in_tag_many(self):
+        for selector in ('html div', 'html body div', 'body div'):
+            self.assertSelects(selector, ['main', 'inner', 'footer'])
+
+    def test_tag_no_match(self):
+        self.assertEqual(len(self.soup.select('del')), 0)
+
+    def test_invalid_tag(self):
+        self.assertRaises(ValueError, self.soup.select, 'tag%t')
+
+    def test_header_tags(self):
+        self.assertSelectMultiple(
+            ('h1', ['header1']),
+            ('h2', ['header2', 'header3']),
+        )
+
+    def test_class_one(self):
+        for selector in ('.onep', 'p.onep', 'html p.onep'):
+            els = self.soup.select(selector)
+            self.assertEqual(len(els), 1)
+            self.assertEqual(els[0].name, 'p')
+            self.assertEqual(els[0]['class'], ['onep'])
+
+    def test_class_mismatched_tag(self):
+        els = self.soup.select('div.onep')
+        self.assertEqual(len(els), 0)
+
+    def test_one_id(self):
+        for selector in ('div#inner', '#inner', 'div div#inner'):
+            self.assertSelects(selector, ['inner'])
+
+    def test_bad_id(self):
+        els = self.soup.select('#doesnotexist')
+        self.assertEqual(len(els), 0)
+
+    def test_items_in_id(self):
+        els = self.soup.select('div#inner p')
+        self.assertEqual(len(els), 3)
+        for el in els:
+            self.assertEqual(el.name, 'p')
+        self.assertEqual(els[1]['class'], ['onep'])
+        self.assertFalse(els[0].has_attr('class'))
+
+    def test_a_bunch_of_emptys(self):
+        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
+            self.assertEqual(len(self.soup.select(selector)), 0)
+
+    def test_multi_class_support(self):
+        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
+            '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
+            self.assertSelects(selector, ['pmulti'])
+
+    def test_multi_class_selection(self):
+        for selector in ('.class1.class3', '.class3.class2',
+                         '.class1.class2.class3'):
+            self.assertSelects(selector, ['pmulti'])
+
+    def test_child_selector(self):
+        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
+        self.assertSelects('.s1 > a span', ['s1a2s1'])
+
+    def test_child_selector_id(self):
+        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
+
+    def test_attribute_equals(self):
+        self.assertSelectMultiple(
+            ('p[class="onep"]', ['p1']),
+            ('p[id="p1"]', ['p1']),
+            ('[class="onep"]', ['p1']),
+            ('[id="p1"]', ['p1']),
+            ('link[rel="stylesheet"]', ['l1']),
+            ('link[type="text/css"]', ['l1']),
+            ('link[href="blah.css"]', ['l1']),
+            ('link[href="no-blah.css"]', []),
+            ('[rel="stylesheet"]', ['l1']),
+            ('[type="text/css"]', ['l1']),
+            ('[href="blah.css"]', ['l1']),
+            ('[href="no-blah.css"]', []),
+            ('p[href="no-blah.css"]', []),
+            ('[href="no-blah.css"]', []),
+        )
+
+    def test_attribute_tilde(self):
+        self.assertSelectMultiple(
+            ('p[class~="class1"]', ['pmulti']),
+            ('p[class~="class2"]', ['pmulti']),
+            ('p[class~="class3"]', ['pmulti']),
+            ('[class~="class1"]', ['pmulti']),
+            ('[class~="class2"]', ['pmulti']),
+            ('[class~="class3"]', ['pmulti']),
+            ('a[rel~="friend"]', ['bob']),
+            ('a[rel~="met"]', ['bob']),
+            ('[rel~="friend"]', ['bob']),
+            ('[rel~="met"]', ['bob']),
+        )
+
+    def test_attribute_startswith(self):
+        self.assertSelectMultiple(
+            ('[rel^="style"]', ['l1']),
+            ('link[rel^="style"]', ['l1']),
+            ('notlink[rel^="notstyle"]', []),
+            ('[rel^="notstyle"]', []),
+            ('link[rel^="notstyle"]', []),
+            ('link[href^="bla"]', ['l1']),
+            ('a[href^="http://"]', ['bob', 'me']),
+            ('[href^="http://"]', ['bob', 'me']),
+            ('[id^="p"]', ['pmulti', 'p1']),
+            ('[id^="m"]', ['me', 'main']),
+            ('div[id^="m"]', ['main']),
+            ('a[id^="m"]', ['me']),
+        )
+
+    def test_attribute_endswith(self):
+        self.assertSelectMultiple(
+            ('[href$=".css"]', ['l1']),
+            ('link[href$=".css"]', ['l1']),
+            ('link[id$="1"]', ['l1']),
+            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
+            ('div[id$="1"]', []),
+            ('[id$="noending"]', []),
+        )
+
+    def test_attribute_contains(self):
+        self.assertSelectMultiple(
+            # From test_attribute_startswith
+            ('[rel*="style"]', ['l1']),
+            ('link[rel*="style"]', ['l1']),
+            ('notlink[rel*="notstyle"]', []),
+            ('[rel*="notstyle"]', []),
+            ('link[rel*="notstyle"]', []),
+            ('link[href*="bla"]', ['l1']),
+            ('a[href*="http://"]', ['bob', 'me']),
+            ('[href*="http://"]', ['bob', 'me']),
+            ('[id*="p"]', ['pmulti', 'p1']),
+            ('div[id*="m"]', ['main']),
+            ('a[id*="m"]', ['me']),
+            # From test_attribute_endswith
+            ('[href*=".css"]', ['l1']),
+            ('link[href*=".css"]', ['l1']),
+            ('link[id*="1"]', ['l1']),
+            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
+            ('div[id*="1"]', []),
+            ('[id*="noending"]', []),
+            # New for this test
+            ('[href*="."]', ['bob', 'me', 'l1']),
+            ('a[href*="."]', ['bob', 'me']),
+            ('link[href*="."]', ['l1']),
+            ('div[id*="n"]', ['main', 'inner']),
+            ('div[id*="nn"]', ['inner']),
+        )
+
+    def test_attribute_exact_or_hypen(self):
+        self.assertSelectMultiple(
+            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
+            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
+            ('p[lang|="fr"]', ['lang-fr']),
+            ('p[lang|="gb"]', []),
+        )
+
+    def test_attribute_exists(self):
+        self.assertSelectMultiple(
+            ('[rel]', ['l1', 'bob', 'me']),
+            ('link[rel]', ['l1']),
+            ('a[rel]', ['bob', 'me']),
+            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
+            ('p[class]', ['p1', 'pmulti']),
+            ('[blah]', []),
+            ('p[blah]', []),
+        )
+
+    def test_nth_of_type(self):
+        # Try to select first paragraph
+        els = self.soup.select('div#inner p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+        # Try to select third paragraph
+        els = self.soup.select('div#inner p:nth-of-type(3)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Another')
+
+        # Try to select (non-existent!) fourth paragraph
+        els = self.soup.select('div#inner p:nth-of-type(4)')
+        self.assertEqual(len(els), 0)
+
+        # Pass in an invalid value.
+        self.assertRaises(
+            ValueError, self.soup.select, 'div p:nth-of-type(0)')
+
+    def test_nth_of_type_direct_descendant(self):
+        els = self.soup.select('div#inner > p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+    def test_id_child_selector_nth_of_type(self):
+        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
+
+    def test_select_on_element(self):
+        # Other tests operate on the tree; this operates on an element
+        # within the tree.
+        inner = self.soup.find("div", id="main")
+        selected = inner.select("div")
+        # The <div id="inner"> tag was selected. The