diff --git a/.gitignore b/.gitignore index 20e8101..719b47b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ persist -config +config.json gitflow *.db *.log @@ -11,4 +11,8 @@ gitflow *.sublime-project *.sublime-workspace .idea/ -plugins/data/GeoLiteCity.dat +data/GeoLiteCity.dat +plugins/New Text Document.txt +plugins/srvv.py +run.cmd +config diff --git a/README.md b/README.md index 06d64ac..cfb69ac 100644 --- a/README.md +++ b/README.md @@ -14,33 +14,37 @@ Unzip the resulting file, and continue to read this document. ### Install -Before you can run the bot, you need to install a few Python dependencies. LXML is required while Enchant and PyDNS are needed for several plugins. +Before you can run the bot, you need to install a few Python dependencies. LXML is required while Enchant, PyGeoIP, TweePy and PyDNS are needed for several plugins. -These can be installed with `pip` (The Python package manager): +These can be installed with `pip` (The Python package manager) by running the following command in the bot directory: - [sudo] pip install -r requirements.txt + pip install -r requirements.txt -If you use `pip`, you will also need the following packages on linux or `pip` will fail to install the requirements. +**Note:** If you use `pip`, you will also need the following packages on linux or `pip` will fail to install the requirements. ```python, python-dev, libenchant-dev, libenchant1c2a, libxslt-dev, libxml2-dev.``` + +(this can be done using your package manager (eg: *apt-get* or *yum*) #### How to install `pip` +You can usually install pip on linux by installing the `python-pip` package using your package manager (eg. *apt-get install python-pip* or *yum install python-pip* as root), or you can try the below code to download and install it manually. + curl -O http://python-distribute.org/distribute_setup.py # or download with your browser on windows python distribute_setup.py easy_install pip -If you are unable to use pip, there are Windows installers for LXML available for [64 bit](https://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win-amd64-py2.7.exe) and [32 bit](https://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win32-py2.7.exe) versions of Python. +If you need help installing pip on Windows, follow [this guide](http://simpledeveloper.com/how-to-install-easy_install/) and then run `easy_install pip` on the command line. ### Run -Before you run the bot, rename `config.default` to `config` and edit it with your preferred settings. +Before you run the bot, rename `config.default` to `config.json` and edit it with your preferred settings. You can check if your JSON is valid on [this site](http://jsonlint.com/)! Once you have installed the required dependencies and renamed the config file, you can run the bot! Make sure you are in the correct folder and run the following command: -`python bot.py` +`python cloudbot.py` -On Windows you can usually just double-click `bot.py` to start the bot, as long as you have Python installed correctly. +On Windows you can usually just double-click `cloudbot.py` to start the bot, as long as you have Python installed correctly. ## Getting help with CloudBot @@ -58,15 +62,17 @@ More at the [Wiki Main Page](http://git.io/cloudbotircwiki). The developers reside in [#CloudBot](irc://irc.esper.net/cloudbot) on [EsperNet](http://esper.net) and would be glad to help you. -If you think you have found a bug/have a idea/suggestion, please **open a issue** here on Github. 
+If you think you have found a bug/have a idea/suggestion, please **open a issue** here on Github and contact us on IRC! ### Requirements CloudBot runs on **Python** *2.7.x*. It is currently developed on **Windows** *8* with **Python** *2.7.5*. -It **requires the Python module** lXML. +It **requires the Python modules** lXML, watchdog and BeautifulSoup4. The module `Enchant` is needed for the spellcheck plugin. The module `PyDNS` is needed for SRV record lookup in the mcping plugin. +The module `PyGeoIP` is needed for location lookup in the geoip plugin. +The module `TweePy` is needed for the twitter plugin. **Windows** users: Windows compatibility some plugins is **broken** (such as ping), but we do intend to add it. Eventually. diff --git a/cloudbot.py b/cloudbot.py index 91515db..f567315 100755 --- a/cloudbot.py +++ b/cloudbot.py @@ -1,74 +1,52 @@ #!/usr/bin/env python +from core import bot import os -import Queue import sys import time -import re +import signal -sys.path += ['plugins', 'lib'] # add stuff to the sys.path for easy imports +# check python version +if sys.version_info < (3, 2, 0): + print("CloudBot3 requires Python 3.2 or newer.") + sys.exit(1) + +# set up environment os.chdir(sys.path[0] or '.') # do stuff relative to the install directory +# this is not the code you are looking for +if os.path.exists(os.path.abspath('lib')): + sys.path += ['lib'] -class Bot(object): - pass +print('CloudBot3 ') -print 'CloudBot DEV ' -# create new bot object -bot = Bot() -bot.vars = {} +def exit_gracefully(signum, frame): + # this doesn't really work at all + cloudbot.stop() -# record start time for the uptime command -bot.start_time = time.time() + # restore the original handler so if they do it again it triggers + signal.signal(signal.SIGINT, original_sigint) -print 'Begin Plugin Loading.' +# store the original SIGINT handler +original_sigint = signal.getsignal(signal.SIGINT) +signal.signal(signal.SIGINT, exit_gracefully) -# bootstrap the reloader -eval(compile(open(os.path.join('core', 'reload.py'), 'U').read(), - os.path.join('core', 'reload.py'), 'exec')) -reload(init=True) - -config() -if not hasattr(bot, 'config'): - exit() - -print 'Connecting to IRC...' - -bot.conns = {} - -try: - for name, conf in bot.config['connections'].iteritems(): - # strip all spaces and capitalization from the connection name - name = name.replace(" ", "_") - name = re.sub('[^A-Za-z0-9_]+', '', name) - print 'Connecting to server: %s' % conf['server'] - if conf.get('ssl'): - bot.conns[name] = SSLIRC(name, conf['server'], conf['nick'], conf=conf, - port=conf.get('port', 6667), channels=conf['channels'], - ignore_certificate_errors=conf.get('ignore_cert', True)) - else: - bot.conns[name] = IRC(name, conf['server'], conf['nick'], conf=conf, - port=conf.get('port', 6667), channels=conf['channels']) -except Exception as e: - print 'ERROR: malformed config file', e - sys.exit() - -bot.persist_dir = os.path.abspath('persist') -if not os.path.exists(bot.persist_dir): - os.mkdir(bot.persist_dir) - -print 'Connection(s) made, starting main loop.' 
+# create a bot master and start it +cloudbot = bot.CloudBot() +cloudbot.start() +# watch to see if the bot stops running or needs a restart while True: - reload() # these functions only do things - config() # if changes have occured - - for conn in bot.conns.itervalues(): - try: - out = conn.out.get_nowait() - main(conn, out) - except Queue.Empty: - pass - while all(conn.out.empty() for conn in bot.conns.itervalues()): + if cloudbot.running: time.sleep(.1) + else: + if cloudbot.do_restart: + # create a new bot thread and start it + # Todo: Make this work + del cloudbot + cloudbot = bot.Bot() + cloudbot.start() + continue + else: + break \ No newline at end of file diff --git a/config.default b/config.default index 97710d6..237482c 100644 --- a/config.default +++ b/config.default @@ -1,24 +1,47 @@ { "connections": - { - "esper": + [ { - "server": "irc.esper.net", - "nick": "MyCloudBot", + "name": "esper", + "connection": { + "server": "irc.esper.net", + "port": 6667, + "ssl": false, + "ignore_cert": true + }, + "nick": "MyCloueqerdBot", "user": "cloudbot", - "realname": "CloudBot - http://git.io/cloudbotirc", - "mode": "", - "nickserv_password": "", - "nickserv_user": "", + "real_name": "CloudBot - http://git.io/cloudbotirc", "channels": ["#cloudbot", "#cloudbot2"], - "invite_join": true, - "auto_rejoin": false, + "disabled_commands": [], + "acls": {}, + "nickserv": { + "enabled": false, + "nickserv_password": "", + "nickserv_user": "", + "nickserv_name": "nickserv", + "nickserv_command": "IDENTIFY" + }, + "permissions": { + "admins": { + "perms": ["adminonly", "addfactoid", "delfactoid", "ignore", "botcontrol", "permissions_users", "op"], + "users": ["examplea!user@example.com", "exampleb!user@example.com"] + }, + "moderators": { + "perms": ["addfactoid", "delfactoid", "ignore"], + "users": ["examplec!user@example.com"] + }, + "trusted": { + "perms": ["addfactoid", "delfactoid"], + "users": ["exampled!user@example.com"] + } + }, + "plugins": { + + }, "command_prefix": "." 
} - }, - "disabled_plugins": [], - "disabled_commands": [], - "acls": {}, + ], "api_keys": { "tvdb": "", @@ -35,30 +58,5 @@ "rdio_key": "", "rdio_secret": "" }, - "permissions": { - "admins": { - "perms": ["adminonly", "addfactoid", "delfactoid", "ignore", "botcontrol", "permissions_users", "op"], - "users": ["examplea!user@example.com", "exampleb!user@example.com"] - }, - "moderators": { - "perms": ["addfactoid", "delfactoid", "ignore"], - "users": ["examplec!user@example.com"] - } - }, - "plugins": - { - "factoids": - { - "prefix": false - }, - "ignore": - { - "ignored": [] - } - }, - "censored_strings": - [ - "mypass", - "mysecret" - ] + "disabled_plugins": [] } diff --git a/lib/oauth2/clients/__init__.py b/core/__init__.py similarity index 100% rename from lib/oauth2/clients/__init__.py rename to core/__init__.py diff --git a/core/bot.py b/core/bot.py new file mode 100644 index 0000000..95d0dfe --- /dev/null +++ b/core/bot.py @@ -0,0 +1,175 @@ +import time +import logging +import re +import os +import queue +import collections +import threading + +from sqlalchemy.orm import scoped_session, sessionmaker +from sqlalchemy import create_engine + +from core import config, irc, main +from core.permissions import PermissionManager +from core.loader import PluginLoader + + +def clean_name(n): + """strip all spaces and capitalization""" + return re.sub('[^A-Za-z0-9_]+', '', n.replace(" ", "_")) + + +def get_logger(): + """create and return a new logger object""" + # create logger + logger = logging.getLogger("cloudbot") + logger.setLevel(logging.DEBUG) + + # add a file handler + log_name = "bot.log" + fh = logging.FileHandler(log_name) + fh.setLevel(logging.INFO) + + # stdout handler + sh = logging.StreamHandler() + sh.setLevel(logging.DEBUG) + + # create a formatter and set the formatter for the handler. 
+ frmt = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s') + fh.setFormatter(frmt) + simple_frmt = logging.Formatter('[%(levelname)s] %(message)s') + sh.setFormatter(simple_frmt) + + # add the Handlers to the logger + logger.addHandler(fh) + logger.addHandler(sh) + return logger + + +class CloudBot(threading.Thread): + def __init__(self): + # basic variables + self.start_time = time.time() + self.running = True + self.do_restart = False + + # stores each instance of the + self.instances = [] + + # set up config and logging + self.setup() + self.logger.debug("Bot setup completed.") + + # start bot instances + self.create() + + for instance in self.instances: + instance.permissions = PermissionManager(self, instance) + + # run plugin loader + self.plugins = collections.defaultdict(list) + + """ self.plugins format + {'PLUGIN_TYPE': [(, + {PLUGIN_ARGS}), + (, + {PLUGIN_ARGS})], + 'PLUGIN_TYPE': [(, + {PLUGIN_ARGS})] + } + """ + + self.threads = {} + + self.loader = PluginLoader(self) + + threading.Thread.__init__(self) + + def run(self): + """recieves input from the IRC engine and processes it""" + self.logger.info("Starting main thread.") + while self.running: + for instance in self.instances: + try: + incoming = instance.parsed_queue.get_nowait() + if incoming == StopIteration: + print("StopIteration") + # IRC engine has signalled timeout, so reconnect (ugly) + instance.connection.reconnect() + main.main(self, instance, incoming) + except queue.Empty: + pass + + # if no messages are in the incoming queue, sleep + while self.running and all(i.parsed_queue.empty() for i in self.instances): + time.sleep(.1) + + def setup(self): + """create the logger and config objects""" + # logging + self.logger = get_logger() + self.logger.debug("Logging system initalised.") + + # data folder + self.data_dir = os.path.abspath('persist') + if not os.path.exists(self.data_dir): + self.logger.debug("Data folder not found, creating.") + os.mkdir(self.data_dir) + + # config + self.config = config.Config(self) + self.logger.debug("Config system initalised.") + + # db + engine = create_engine('sqlite:///cloudbot.db') + db_factory = sessionmaker(bind=engine) + self.db_session = scoped_session(db_factory) + self.logger.debug("Database system initalised.") + + def create(self): + """ Create a BotInstance for all the networks defined in the config """ + for conf in self.config['instances']: + + # strip all spaces and capitalization from the connection name + name = clean_name(conf['name']) + nick = conf['nick'] + server = conf['connection']['server'] + port = conf['connection'].get('port', 6667) + + self.logger.debug("Creating BotInstance for {}.".format(name)) + + self.instances.append(irc.BotInstance(name, server, nick, config=conf, + port=port, logger=self.logger, channels=conf['channels'], + ssl=conf['connection'].get('ssl', False))) + self.logger.debug("({}) Created connection.".format(name)) + + + def stop(self, reason=None): + """quits all networks and shuts the bot down""" + self.logger.info("Stopping bot.") + + self.config.observer.stop() + self.logger.debug("Stopping config reloader.") + + self.loader.stop() + self.logger.debug("Stopping plugin loader.") + + for connection in self.connections: + self.logger.debug("({}) Closing connection.".format(connection.name)) + + if reason: + connection.cmd("QUIT", [reason]) + else: + connection.cmd("QUIT") + + connection.stop() + + self.logger.debug("Logging engine stopped") + logging.shutdown() + + self.running = False + + def restart(self, reason=None): + 
"""shuts the bot down and restarts it""" + self.do_restart = True + self.stop(reason) \ No newline at end of file diff --git a/core/config.py b/core/config.py index c813ea5..9d8d010 100644 --- a/core/config.py +++ b/core/config.py @@ -1,27 +1,69 @@ -import inspect import json import os +import time +import sys + +from watchdog.observers import Observer +from watchdog.tricks import Trick -def save(conf): - json.dump(conf, open('config', 'w'), sort_keys=True, indent=2) +class Config(dict): + def __init__(self, bot, *args, **kwargs): + self.filename = "config.json" + self.path = os.path.abspath(self.filename) + self.bot = bot + self.logger = bot.logger + self.update(*args, **kwargs) -if not os.path.exists('config'): - print "Please rename 'config.default' to 'config' to set up your bot!" - print "For help, see http://git.io/cloudbotirc" - print "Thank you for using CloudBot!" - sys.exit() + # populate self with config data + self.load_config() + + # start watcher + self.watcher() -def config(): - # reload config from file if file has changed - config_mtime = os.stat('config').st_mtime - if bot._config_mtime != config_mtime: - try: - bot.config = json.load(open('config')) - bot._config_mtime = config_mtime - except ValueError, e: - print 'error: malformed config', e + def load_config(self): + """(re)loads the bot config from the config file""" + if not os.path.exists(self.path): + # if there is no config, show an error and die + self.logger.critical("No config file found, bot shutting down!") + print("No config file found! Bot shutting down in five seconds.") + print("Copy 'config.default' to 'config.json' for defaults.") + print("For help, see http://git.io/cloudbotirc. Thank you for using CloudBot!") + time.sleep(5) + sys.exit() + with open(self.path) as f: + self.update(json.load(f)) + self.logger.info("Config loaded from file.") -bot._config_mtime = 0 + # reload permissions + if self.bot.instances: + for instance in self.bot.instances: + instance.permissions.reload() + + def save_config(self): + """saves the contents of the config dict to the config file""" + json.dump(self, open(self.path, 'w'), sort_keys=True, indent=2) + self.logger.info("Config saved to file.") + + def watcher(self): + """starts the watchdog to automatically reload the config when it changes on disk""" + self.observer = Observer() + + pattern = "*{}".format(self.filename) + + self.event_handler = ConfigEventHandler(self, patterns=[pattern]) + self.observer.schedule(self.event_handler, path='.', recursive=False) + self.observer.start() + + +class ConfigEventHandler(Trick): + def __init__(self, config, *args, **kwargs): + self.config = config + self.logger = config.logger + Trick.__init__(self, *args, **kwargs) + + def on_any_event(self, event): + self.logger.info("Config changed, triggering reload.") + self.config.load_config() diff --git a/core/db.py b/core/db.py index 6bdf8fa..d4226a2 100644 --- a/core/db.py +++ b/core/db.py @@ -1,6 +1,6 @@ import os import sqlite3 -import thread +import _thread threaddbs = {} @@ -11,10 +11,10 @@ def get_db_connection(conn, name=''): if not name: name = '{}.db'.format(conn.name) - threadid = thread.get_ident() + threadid = _thread.get_ident() if name in threaddbs and threadid in threaddbs[name]: return threaddbs[name][threadid] - filename = os.path.join(bot.persist_dir, name) + filename = os.path.join(bot.data_dir, name) db = sqlite3.connect(filename, timeout=10) if name in threaddbs: diff --git a/core/irc.py b/core/irc.py index 35d4efa..88357d5 100644 --- a/core/irc.py +++ 
b/core/irc.py @@ -1,11 +1,18 @@ import re import socket import time -import thread -import Queue +import threading +import queue + +from core import permissions from ssl import wrap_socket, CERT_NONE, CERT_REQUIRED, SSLError +irc_prefix_rem = re.compile(r'(.*?) (.*?) (.*)').match +irc_noprefix_rem = re.compile(r'()(.*?) (.*)').match +irc_netmask_rem = re.compile(r':?([^!@]*)!?([^@]*)@?(.*)').match +irc_param_ref = re.compile(r'(?:^|(?<= ))(:.*|[^ ]+)').findall + def decode(txt): for codec in ('utf-8', 'iso-8859-1', 'shift_jis', 'cp1252'): @@ -17,61 +24,44 @@ def decode(txt): def censor(text): - text = text.replace('\n', '').replace('\r', '') - replacement = '[censored]' - if 'censored_strings' in bot.config: - if bot.config['censored_strings']: - words = map(re.escape, bot.config['censored_strings']) - regex = re.compile('({})'.format("|".join(words))) - text = regex.sub(replacement, text) return text -class crlf_tcp(object): - """Handles tcp connections that consist of utf-8 lines ending with crlf""" - - def __init__(self, host, port, timeout=300): - self.ibuffer = "" - self.obuffer = "" - self.oqueue = Queue.Queue() # lines to be sent out - self.iqueue = Queue.Queue() # lines that were received - self.socket = self.create_socket() - self.host = host - self.port = port +class ReceiveThread(threading.Thread): + """receives messages from IRC and puts them in the input_queue""" + def __init__(self, sock, input_queue, timeout): + self.input_buffer = b"" + self.input_queue = input_queue + self.socket = sock self.timeout = timeout - def create_socket(self): - return socket.socket(socket.AF_INET, socket.TCP_NODELAY) - - def run(self): - self.socket.connect((self.host, self.port)) - thread.start_new_thread(self.recv_loop, ()) - thread.start_new_thread(self.send_loop, ()) + self.shutdown = False + threading.Thread.__init__(self) def recv_from_socket(self, nbytes): return self.socket.recv(nbytes) - def get_timeout_exception_type(self): - return socket.timeout - def handle_receive_exception(self, error, last_timestamp): if time.time() - last_timestamp > self.timeout: - self.iqueue.put(StopIteration) + self.input_queue.put(StopIteration) self.socket.close() return True return False - def recv_loop(self): + def get_timeout_exception_type(self): + return socket.timeout + + def run(self): last_timestamp = time.time() - while True: + while not self.shutdown: try: data = self.recv_from_socket(4096) - self.ibuffer += data + self.input_buffer += data if data: last_timestamp = time.time() else: if time.time() - last_timestamp > self.timeout: - self.iqueue.put(StopIteration) + self.input_queue.put(StopIteration) self.socket.close() return time.sleep(1) @@ -80,31 +70,15 @@ class crlf_tcp(object): return continue - while '\r\n' in self.ibuffer: - line, self.ibuffer = self.ibuffer.split('\r\n', 1) - self.iqueue.put(decode(line)) - - def send_loop(self): - while True: - line = self.oqueue.get().splitlines()[0][:500] - print ">>> %r" % line - self.obuffer += line.encode('utf-8', 'replace') + '\r\n' - while self.obuffer: - sent = self.socket.send(self.obuffer) - self.obuffer = self.obuffer[sent:] + while b'\r\n' in self.input_buffer: + line, self.input_buffer = self.input_buffer.split(b'\r\n', 1) + print(decode(line)) + self.input_queue.put(decode(line)) -class crlf_ssl_tcp(crlf_tcp): - """Handles ssl tcp connetions that consist of utf-8 lines ending with crlf""" - - def __init__(self, host, port, ignore_cert_errors, timeout=300): - self.ignore_cert_errors = ignore_cert_errors - crlf_tcp.__init__(self, host, port, 
timeout) - - def create_socket(self): - return wrap_socket(crlf_tcp.create_socket(self), server_side=False, - cert_reqs=CERT_NONE if self.ignore_cert_errors else - CERT_REQUIRED) +class SSLReceiveThread(ReceiveThread): + def __init__(self, sock, input_queue, timeout): + ReceiveThread.__init__(self, sock, input_queue, timeout) def recv_from_socket(self, nbytes): return self.socket.read(nbytes) @@ -113,57 +87,50 @@ class crlf_ssl_tcp(crlf_tcp): return SSLError def handle_receive_exception(self, error, last_timestamp): - # this is terrible + # this is terrible if not "timed out" in error.args[0]: raise - return crlf_tcp.handle_receive_exception(self, error, last_timestamp) + return ReceiveThread.handle_receive_exception(self, error, last_timestamp) -irc_prefix_rem = re.compile(r'(.*?) (.*?) (.*)').match -irc_noprefix_rem = re.compile(r'()(.*?) (.*)').match -irc_netmask_rem = re.compile(r':?([^!@]*)!?([^@]*)@?(.*)').match -irc_param_ref = re.compile(r'(?:^|(?<= ))(:.*|[^ ]+)').findall +class SendThread(threading.Thread): + """sends messages from output_queue to IRC""" + def __init__(self, sock, conn_name, output_queue): + self.output_buffer = b"" + self.output_queue = output_queue + self.conn_name = conn_name + self.socket = sock + + self.shutdown = False + threading.Thread.__init__(self) + + def run(self): + while not self.shutdown: + line = self.output_queue.get().splitlines()[0][:500] + self.output_buffer += line.encode('utf-8', 'replace') + b'\r\n' + while self.output_buffer: + sent = self.socket.send(self.output_buffer) + self.output_buffer = self.output_buffer[sent:] -class IRC(object): - """handles the IRC protocol""" +class ParseThread(threading.Thread): + """parses messages from input_queue and puts them in parsed_queue""" + def __init__(self, input_queue, output_queue, parsed_queue): + self.input_queue = input_queue # lines that were received + self.output_queue = output_queue # lines to be sent out + self.parsed_queue = parsed_queue # lines that have been parsed - def __init__(self, name, server, nick, port=6667, channels=[], conf={}): - self.name = name - self.channels = channels - self.conf = conf - self.server = server - self.port = port - self.nick = nick - self.history = {} - self.vars = {} + threading.Thread.__init__(self) - self.out = Queue.Queue() # responses from the server are placed here - # format: [rawline, prefix, command, params, - # nick, user, host, paramlist, msg] - self.connect() - - thread.start_new_thread(self.parse_loop, ()) - - def create_connection(self): - return crlf_tcp(self.server, self.port) - - def connect(self): - self.conn = self.create_connection() - thread.start_new_thread(self.conn.run, ()) - self.set_pass(self.conf.get('server_password')) - self.set_nick(self.nick) - self.cmd("USER", - [conf.get('user', 'cloudbot'), "3", "*", conf.get('realname', - 'CloudBot - http://git.io/cloudbot')]) - - def parse_loop(self): + def run(self): while True: # get a message from the input queue - msg = self.conn.iqueue.get() + msg = self.input_queue.get() if msg == StopIteration: - self.connect() + # got a StopIteration from the receive thread, pass it on + # so the main thread can restart the connection + self.parsed_queue.put(StopIteration) continue # parse the message @@ -174,17 +141,115 @@ class IRC(object): nick, user, host = irc_netmask_rem(prefix).groups() mask = nick + "!" 
+ user + "@" + host paramlist = irc_param_ref(params) - lastparam = "" + lastparam = "" if paramlist: if paramlist[-1].startswith(':'): paramlist[-1] = paramlist[-1][1:] lastparam = paramlist[-1] # put the parsed message in the response queue - self.out.put([msg, prefix, command, params, nick, user, host, - mask, paramlist, lastparam]) + self.parsed_queue.put([msg, prefix, command, params, nick, user, host, + mask, paramlist, lastparam]) # if the server pings us, pong them back if command == "PING": - self.cmd("PONG", paramlist) + string = "PONG :" + paramlist[0] + self.output_queue.put(string) + + +class IRCConnection(object): + """handles an IRC connection""" + def __init__(self, name, host, port, input_queue, output_queue): + self.output_queue = output_queue # lines to be sent out + self.input_queue = input_queue # lines that were received + self.socket = self.create_socket() + self.conn_name = name + self.host = host + self.port = port + self.timeout = 300 + + def create_socket(self): + return socket.socket(socket.AF_INET, socket.TCP_NODELAY) + + def connect(self): + self.socket.connect((self.host, self.port)) + + self.receive_thread = ReceiveThread(self.socket, self.input_queue, self.timeout) + self.receive_thread.start() + + self.send_thread = SendThread(self.socket, self.conn_name, self.output_queue) + self.send_thread.start() + + def stop(self): + self.send_thread.shutdown = True + self.receive_thread.shutdown = True + time.sleep(0.1) + self.socket.close() + + def reconnect(self): + self.stop() + self.connect() + + +class SSLIRCConnection(IRCConnection): + """handles a SSL IRC connection""" + + def __init__(self, name, host, port, input_queue, output_queue, ignore_cert_errors): + self.ignore_cert_errors = ignore_cert_errors + IRCConnection.__init__(self, name, host, port, input_queue, output_queue) + + def create_socket(self): + return wrap_socket(IRCConnection.create_socket(self), server_side=False, + cert_reqs=CERT_NONE if self.ignore_cert_errors else + CERT_REQUIRED) + + +class BotInstance(object): + """ A BotInstance represents each connection the bot makes to an IRC server """ + + def __init__(self, name, server, nick, port=6667, ssl=False, logger=None, channels=[], config={}): + self.name = name + self.channels = channels + self.config = config + self.ssl = ssl + self.server = server + self.port = port + self.logger = logger + self.nick = nick + self.vars = {} + self.history = {} + + self.parsed_queue = queue.Queue() # responses from the server are placed here + # format: [rawline, prefix, command, params, + # nick, user, host, paramlist, msg] + + self.parsed_queue = queue.Queue() + self.input_queue = queue.Queue() + self.output_queue = queue.Queue() + + # create the IRC connection and connect + self.connection = self.create_connection() + self.connection.connect() + + self.set_pass(self.config.get('server_password')) + self.set_nick(self.nick) + self.cmd("USER", + [self.config.get('user', 'cloudbot'), "3", "*", + self.config.get('realname', 'CloudBot - http://git.io/cloudbot')]) + + self.parse_thread = ParseThread(self.input_queue, self.output_queue, + self.parsed_queue) + self.parse_thread.daemon = True + self.parse_thread.start() + + def create_connection(self): + if self.ssl: + return SSLIRCConnection(self.name, self.server, self.port, self.input_queue, + self.output_queue, True) + else: + return IRCConnection(self.name, self.server, self.port, + self.input_queue, self.output_queue) + + def stop(self): + self.connection.stop() def set_pass(self, password): if password: @@ 
-211,25 +276,20 @@ class IRC(object): def ctcp(self, target, ctcp_type, text): """ makes the bot send a PRIVMSG CTCP to a target """ - out = u"\x01{} {}\x01".format(ctcp_type, text) + out = "\x01{} {}\x01".format(ctcp_type, text) self.cmd("PRIVMSG", [target, out]) def cmd(self, command, params=None): if params: - params[-1] = u':' + params[-1] - self.send(u"{} {}".format(command, ' '.join(params))) + params[-1] = ':' + params[-1] + self.send("{} {}".format(command, ' '.join(params))) else: self.send(command) - def send(self, str): - self.conn.oqueue.put(str) - - -class SSLIRC(IRC): - def __init__(self, name, server, nick, port=6667, channels=[], conf={}, - ignore_certificate_errors=True): - self.ignore_cert_errors = ignore_certificate_errors - IRC.__init__(self, name, server, nick, port, channels, conf) - - def create_connection(self): - return crlf_ssl_tcp(self.server, self.port, self.ignore_cert_errors) + def send(self, string): + try: + self.logger.info("{} >> {}".format(self.name.upper(), string)) + except: + # if this doesn't work, no big deal + pass + self.output_queue.put(string) \ No newline at end of file diff --git a/core/loader.py b/core/loader.py new file mode 100644 index 0000000..2374041 --- /dev/null +++ b/core/loader.py @@ -0,0 +1,153 @@ +import os +import re +import glob +import collections + +from watchdog.observers import Observer +from watchdog.tricks import Trick +from pprint import pprint + +from core import main + + +def make_signature(f): + return f.__code__.co_filename, f.__name__, f.__code__.co_firstlineno + + +def format_plug(plug, kind='', lpad=0): + out = ' ' * lpad + '{}:{}:{}'.format(*make_signature(plug[0])) + if kind == 'command': + out += ' ' * (50 - len(out)) + plug[1]['name'] + + if kind == 'event': + out += ' ' * (50 - len(out)) + ', '.join(plug[1]['events']) + + if kind == 'regex': + out += ' ' * (50 - len(out)) + plug[1]['regex'] + + return out + + +class PluginLoader(object): + def __init__(self, bot): + self.observer = Observer() + self.path = os.path.abspath("plugins") + self.bot = bot + + self.event_handler = PluginEventHandler(self, patterns=["*.py"]) + self.observer.schedule(self.event_handler, self.path, recursive=False) + self.observer.start() + + self.load_all() + + def stop(self): + """shuts down the plugin reloader""" + self.observer.stop() + + def load_all(self): + """runs load_file() on all python files in the plugins folder""" + files = set(glob.glob(os.path.join(self.path, '*.py'))) + for f in files: + self.load_file(f, rebuild=True) + self.rebuild() + + def load_file(self, path, rebuild=False): + """loads (or reloads) all valid plugins from a specified file""" + filename = os.path.basename(path) + title = os.path.splitext(filename)[0] + + disabled = self.bot.config.get('disabled_plugins', []) + if title in disabled: + self.bot.logger.info("Did not load plugins from: {} (plugin disabled)".format(filename)) + return + + # compile the file and eval it in a namespace + try: + code = compile(open(path, 'U').read(), filename, 'exec') + namespace = {} + eval(code, namespace) + except Exception: + self.bot.logger.exception("Error compiling {}:".format(filename)) + return + + # remove plugins already loaded from this file + for plug_type, data in self.bot.plugins.items(): + self.bot.plugins[plug_type] = [x for x in data + if x[0]._filename != filename] + + # stop all currently running instances of the plugins from this file + for func, handler in list(self.bot.threads.items()): + if func._filename == filename: + handler.stop() + del 
self.bot.threads[func] + + # find objects with hooks in the plugin namespace + # TODO: kill it with fire, kill it all + for obj in namespace.values(): + if hasattr(obj, '_hook'): # check for magic + if obj._thread: + self.bot.threads[obj] = main.Handler(self.bot, obj) + for plug_type, data in obj._hook: + # add plugin to the plugin list + self.bot.plugins[plug_type] += [data] + self.bot.logger.info("Loaded plugin: {} ({})".format(format_plug(data), plug_type)) + + # do a rebuild, unless the bot is loading all plugins (rebuild happens after load_all) + if not rebuild: + self.rebuild() + + def unload_file(self, path): + """unloads all loaded plugins from a specified file""" + filename = os.path.basename(path) + self.bot.logger.info("Unloading plugins from: {}".format(filename)) + + # remove plugins loaded from this file + for plugin_type, plugins in self.bot.plugins.items(): + self.bot.plugins[plugin_type] = [x for x in plugins if x[0]._filename != filename] + + # stop all currently running instances of the plugins from this file + for func, handler in list(self.bot.threads.items()): + if func._filename == filename: + handler.stop() + del self.bot.threads[func] + + self.rebuild() + + def rebuild(self): + """rebuilds the cloudbot command and event hook lists""" + self.bot.commands = {} + for plugin in self.bot.plugins['command']: + name = plugin[1]['name'].lower() + if not re.match(r'^\w+$', name): + self.bot.logger.error('Invalid command name: "{}" ({})'.format(name, format_plug(plugin))) + continue + if name in self.bot.commands: + self.bot.logger.error('Command already registered: "{}" ({}, {})'.format(name, + format_plug(self.bot.commands[name]), + format_plug(plugin))) + continue + self.bot.commands[name] = plugin + + self.bot.events = collections.defaultdict(list) + for func, args in self.bot.plugins['event']: + for event in args['events']: + self.bot.events[event].append((func, args)) + + +class PluginEventHandler(Trick): + def __init__(self, loader, *args, **kwargs): + self.loader = loader + Trick.__init__(self, *args, **kwargs) + + def on_created(self, event): + self.loader.load_file(event.src_path) + + def on_deleted(self, event): + self.loader.unload_file(event.src_path) + + def on_modified(self, event): + self.loader.load_file(event.src_path) + + def on_moved(self, event): + self.loader.unload_file(event.src_path) + self.loader.load_file(event.dest_path) diff --git a/core/main.py b/core/main.py index 0054b0a..da120ce 100644 --- a/core/main.py +++ b/core/main.py @@ -1,12 +1,16 @@ -import thread +import _thread import traceback +import queue +import re + +from sqlalchemy.orm import scoped_session + +_thread.stack_size(1024 * 512) # reduce vm size -thread.stack_size(1024 * 512) # reduce vm size - - +#TODO: redesign this messy thing class Input(dict): - def __init__(self, conn, raw, prefix, command, params, + def __init__(self, bot, conn, raw, prefix, command, params, nick, user, host, mask, paraml, msg): chan = paraml[0].lower() @@ -22,7 +26,7 @@ class Input(dict): if target == nick: conn.msg(target, message) else: - conn.msg(target, u"({}) {}".format(nick, message)) + conn.msg(target, "({}) {}".format(nick, message)) def action(message, target=chan): """sends an action to the current channel/user or a specific channel/user""" @@ -50,67 +54,59 @@ class Input(dict): self[key] = value -def run(func, input): - args = func._args +def run(bot, func, input): + uses_db = True + # TODO: change to bot.get_db_session() + print(input) + if 'text' not in input: + input.text = input.paraml - if 
'inp' not in input: - input.inp = input.paraml + if uses_db: + # create SQLAlchemy session + bot.logger.debug("Opened DB session for: {}".format(func._filename)) + input.db = input.bot.db_session() + + try: + out = func(input, input.conn) + except: + bot.logger.exception("Error in plugin {}:".format(func._filename)) + return + finally: + if uses_db: + bot.logger.debug("Closed DB session for: {}".format(func._filename)) + input.db.close() - if args: - if 'db' in args and 'db' not in input: - input.db = get_db_connection(input.conn) - if 'input' in args: - input.input = input - if 0 in args: - out = func(input.inp, **input) - else: - kw = dict((key, input[key]) for key in args if key in input) - out = func(input.inp, **kw) - else: - out = func(input.inp) if out is not None: - input.reply(unicode(out)) + input.reply(str(out)) def do_sieve(sieve, bot, input, func, type, args): try: return sieve(bot, input, func, type, args) except Exception: - print 'sieve error', - traceback.print_exc() + bot.logger.exception("Error in sieve {}:".format(func._filename)) return None class Handler(object): """Runs plugins in their own threads (ensures order)""" - def __init__(self, func): + def __init__(self, bot, func): self.func = func - self.input_queue = Queue.Queue() - thread.start_new_thread(self.start, ()) + self.bot = bot + self.input_queue = queue.Queue() + _thread.start_new_thread(self.start, ()) def start(self): - uses_db = 'db' in self.func._args - db_conns = {} + uses_db = True while True: input = self.input_queue.get() if input == StopIteration: break - if uses_db: - db = db_conns.get(input.conn) - if db is None: - db = bot.get_db_connection(input.conn) - db_conns[input.conn] = db - input.db = db + run(self.bot, self.func, input) - try: - run(self.func, input) - except: - import traceback - - traceback.print_exc() def stop(self): self.input_queue.put(StopIteration) @@ -119,27 +115,27 @@ class Handler(object): self.input_queue.put(value) -def dispatch(input, kind, func, args, autohelp=False): - for sieve, in bot.plugs['sieve']: +def dispatch(bot, input, kind, func, args, autohelp=False): + for sieve, in bot.plugins['sieve']: input = do_sieve(sieve, bot, input, func, kind, args) if input is None: return if not (not autohelp or not args.get('autohelp', True) or input.inp or not (func.__doc__ is not None)): - input.notice(input.conn.conf["command_prefix"] + func.__doc__) + input.notice(input.conn.config["command_prefix"] + func.__doc__) return if func._thread: bot.threads[func].put(input) else: - thread.start_new_thread(run, (func, input)) + _thread.start_new_thread(run, (bot, func, input)) -def match_command(command): +def match_command(bot, command): commands = list(bot.commands) # do some fuzzy matching - prefix = filter(lambda x: x.startswith(command), commands) + prefix = [x for x in commands if x.startswith(command)] if len(prefix) == 1: return prefix[0] elif prefix and command not in prefix: @@ -148,13 +144,13 @@ def match_command(command): return command -def main(conn, out): - inp = Input(conn, *out) - command_prefix = conn.conf.get('command_prefix', '.') +def main(bot, conn, out): + inp = Input(bot, conn, *out) + command_prefix = conn.config.get('command_prefix', '.') # EVENTS for func, args in bot.events[inp.command] + bot.events['*']: - dispatch(Input(conn, *out), "event", func, args) + dispatch(bot, Input(bot, conn, *out), "event", func, args) if inp.command == 'PRIVMSG': # COMMANDS @@ -162,7 +158,6 @@ def main(conn, out): prefix = '^(?:[{}]?|'.format(command_prefix) else: prefix = 
'^(?:[{}]|'.format(command_prefix) - command_re = prefix + inp.conn.nick command_re += r'[,;:]+\s+)(\w+)(?:$|\s+)(.*)' @@ -170,26 +165,26 @@ def main(conn, out): if m: trigger = m.group(1).lower() - command = match_command(trigger) + command = match_command(bot, trigger) if isinstance(command, list): # multiple potential matches - input = Input(conn, *out) + input = Input(bot, conn, *out) input.notice("Did you mean {} or {}?".format (', '.join(command[:-1]), command[-1])) elif command in bot.commands: - input = Input(conn, *out) + input = Input(bot, conn, *out) input.trigger = trigger - input.inp_unstripped = m.group(2) - input.inp = input.inp_unstripped.strip() + input.text_unstripped = m.group(2) + input.text = input.text_unstripped.strip() func, args = bot.commands[command] - dispatch(input, "command", func, args, autohelp=True) + dispatch(bot, input, "command", func, args, autohelp=True) # REGEXES - for func, args in bot.plugs['regex']: + for func, args in bot.plugins['regex']: m = args['re'].search(inp.lastparam) if m: - input = Input(conn, *out) - input.inp = m + input = Input(bot, conn, *out) + input.text = m - dispatch(input, "regex", func, args) + dispatch(bot, input, "regex", func, args) diff --git a/core/permissions.py b/core/permissions.py new file mode 100644 index 0000000..103542a --- /dev/null +++ b/core/permissions.py @@ -0,0 +1,48 @@ +from fnmatch import fnmatch + + +class PermissionManager(object): + def __init__(self, bot, conn): + + # this is all legacy code, needs to be redone with classes and whatnot + self.logger = bot.logger + + self.logger.info("Creating simple permission manager for {}.".format(conn.name)) + + # stuff + self.bot = bot + self.conn = conn + self.config = conn.config + + self.group_perms = {} + self.group_users = {} + self.perm_users = {} + + self.reload() + + def reload(self): + self.logger.info("Reloading permissions for {}.".format(self.conn.name)) + groups = self.conn.config.get("permissions", []) + # work out the permissions and users each group has + for key, value in groups.items(): + self.group_perms[key] = [] + self.group_users[key] = [] + for permission in value["perms"]: + self.group_perms[key].append(permission) + for user in value["users"]: + self.group_users[key].append(user) + + for group, users in self.group_users.items(): + group_perms = self.group_perms[group] + for perm in group_perms: + self.perm_users[perm] = [] + self.perm_users[perm] = users + + def has_perm_mask(self, mask, perm): + + allowed_users = self.perm_users[perm] + + for pattern in allowed_users: + if fnmatch(mask.lower(), pattern.lower()): + return input + diff --git a/core/reload.py b/core/reload.py deleted file mode 100644 index f1bfeb6..0000000 --- a/core/reload.py +++ /dev/null @@ -1,160 +0,0 @@ -import collections -import glob -import os -import re -import sys -import traceback - - -if 'mtimes' not in globals(): - mtimes = {} - -if 'lastfiles' not in globals(): - lastfiles = set() - - -def make_signature(f): - return f.func_code.co_filename, f.func_name, f.func_code.co_firstlineno - - -def format_plug(plug, kind='', lpad=0): - out = ' ' * lpad + '{}:{}:{}'.format(*make_signature(plug[0])) - if kind == 'command': - out += ' ' * (50 - len(out)) + plug[1]['name'] - - if kind == 'event': - out += ' ' * (50 - len(out)) + ', '.join(plug[1]['events']) - - if kind == 'regex': - out += ' ' * (50 - len(out)) + plug[1]['regex'] - - return out - - -def reload(init=False): - changed = False - - if init: - bot.plugs = collections.defaultdict(list) - bot.threads = {} - - 
core_fileset = set(glob.glob(os.path.join("core", "*.py"))) - - for filename in core_fileset: - mtime = os.stat(filename).st_mtime - if mtime != mtimes.get(filename): - mtimes[filename] = mtime - - changed = True - - try: - eval(compile(open(filename, 'U').read(), filename, 'exec'), - globals()) - except Exception: - traceback.print_exc() - if init: # stop if there's an error (syntax?) in a core - sys.exit() # script on startup - continue - - if filename == os.path.join('core', 'reload.py'): - reload(init=init) - return - - fileset = set(glob.glob(os.path.join('plugins', '*.py'))) - - # remove deleted/moved plugins - for name, data in bot.plugs.iteritems(): - bot.plugs[name] = [x for x in data if x[0]._filename in fileset] - - for filename in list(mtimes): - if filename not in fileset and filename not in core_fileset: - mtimes.pop(filename) - - for func, handler in list(bot.threads.iteritems()): - if func._filename not in fileset: - handler.stop() - del bot.threads[func] - - # compile new plugins - for filename in fileset: - mtime = os.stat(filename).st_mtime - if mtime != mtimes.get(filename): - mtimes[filename] = mtime - - changed = True - - try: - code = compile(open(filename, 'U').read(), filename, 'exec') - namespace = {} - eval(code, namespace) - except Exception: - traceback.print_exc() - continue - - # remove plugins already loaded from this filename - for name, data in bot.plugs.iteritems(): - bot.plugs[name] = [x for x in data - if x[0]._filename != filename] - - for func, handler in list(bot.threads.iteritems()): - if func._filename == filename: - handler.stop() - del bot.threads[func] - - for obj in namespace.itervalues(): - if hasattr(obj, '_hook'): # check for magic - if obj._thread: - bot.threads[obj] = Handler(obj) - - for type, data in obj._hook: - bot.plugs[type] += [data] - - if not init: - print '### new plugin (type: %s) loaded:' % \ - type, format_plug(data) - - if changed: - bot.commands = {} - for plug in bot.plugs['command']: - name = plug[1]['name'].lower() - if not re.match(r'^\w+$', name): - print '### ERROR: invalid command name "{}" ({})'.format(name, format_plug(plug)) - continue - if name in bot.commands: - print "### ERROR: command '{}' already registered ({}, {})".format(name, - format_plug(bot.commands[name]), - format_plug(plug)) - continue - bot.commands[name] = plug - - bot.events = collections.defaultdict(list) - for func, args in bot.plugs['event']: - for event in args['events']: - bot.events[event].append((func, args)) - - if init: - print ' plugin listing:' - - if bot.commands: - # hack to make commands with multiple aliases - # print nicely - - print ' command:' - commands = collections.defaultdict(list) - - for name, (func, args) in bot.commands.iteritems(): - commands[make_signature(func)].append(name) - - for sig, names in sorted(commands.iteritems()): - names.sort(key=lambda x: (-len(x), x)) # long names first - out = ' ' * 6 + '%s:%s:%s' % sig - out += ' ' * (50 - len(out)) + ', '.join(names) - print out - - for kind, plugs in sorted(bot.plugs.iteritems()): - if kind == 'command': - continue - print ' {}:'.format(kind) - for plug in plugs: - print format_plug(plug, kind=kind, lpad=6) - print diff --git a/plugins/data/8ball_responses.txt b/data/8ball_responses.txt similarity index 100% rename from plugins/data/8ball_responses.txt rename to data/8ball_responses.txt diff --git a/plugins/data/flirts.txt b/data/flirts.txt similarity index 100% rename from plugins/data/flirts.txt rename to data/flirts.txt diff --git a/plugins/data/fortunes.txt 
b/data/fortunes.txt similarity index 100% rename from plugins/data/fortunes.txt rename to data/fortunes.txt diff --git a/plugins/data/geoip_regions.json b/data/geoip_regions.json similarity index 100% rename from plugins/data/geoip_regions.json rename to data/geoip_regions.json diff --git a/plugins/data/insults.txt b/data/insults.txt similarity index 100% rename from plugins/data/insults.txt rename to data/insults.txt diff --git a/plugins/data/itemids.txt b/data/itemids.txt similarity index 100% rename from plugins/data/itemids.txt rename to data/itemids.txt diff --git a/plugins/data/kills.json b/data/kills.json similarity index 100% rename from plugins/data/kills.json rename to data/kills.json diff --git a/plugins/data/kills.txt b/data/kills.txt similarity index 100% rename from plugins/data/kills.txt rename to data/kills.txt diff --git a/plugins/data/larts.txt b/data/larts.txt similarity index 100% rename from plugins/data/larts.txt rename to data/larts.txt diff --git a/plugins/data/name_files/dragons.json b/data/name_files/dragons.json similarity index 100% rename from plugins/data/name_files/dragons.json rename to data/name_files/dragons.json diff --git a/plugins/data/name_files/dwarves.json b/data/name_files/dwarves.json similarity index 100% rename from plugins/data/name_files/dwarves.json rename to data/name_files/dwarves.json diff --git a/plugins/data/name_files/elves_female.json b/data/name_files/elves_female.json similarity index 100% rename from plugins/data/name_files/elves_female.json rename to data/name_files/elves_female.json diff --git a/plugins/data/name_files/elves_male.json b/data/name_files/elves_male.json similarity index 100% rename from plugins/data/name_files/elves_male.json rename to data/name_files/elves_male.json diff --git a/plugins/data/name_files/fantasy.json b/data/name_files/fantasy.json similarity index 100% rename from plugins/data/name_files/fantasy.json rename to data/name_files/fantasy.json diff --git a/plugins/data/name_files/female.json b/data/name_files/female.json similarity index 100% rename from plugins/data/name_files/female.json rename to data/name_files/female.json diff --git a/plugins/data/name_files/general.json b/data/name_files/general.json similarity index 100% rename from plugins/data/name_files/general.json rename to data/name_files/general.json diff --git a/plugins/data/name_files/hobbits.json b/data/name_files/hobbits.json similarity index 100% rename from plugins/data/name_files/hobbits.json rename to data/name_files/hobbits.json diff --git a/plugins/data/name_files/inns.json b/data/name_files/inns.json similarity index 100% rename from plugins/data/name_files/inns.json rename to data/name_files/inns.json diff --git a/plugins/data/name_files/items.json b/data/name_files/items.json similarity index 100% rename from plugins/data/name_files/items.json rename to data/name_files/items.json diff --git a/plugins/data/name_files/male.json b/data/name_files/male.json similarity index 100% rename from plugins/data/name_files/male.json rename to data/name_files/male.json diff --git a/plugins/data/name_files/narn.json b/data/name_files/narn.json similarity index 100% rename from plugins/data/name_files/narn.json rename to data/name_files/narn.json diff --git a/plugins/data/name_files/warrior_cats.json b/data/name_files/warrior_cats.json similarity index 100% rename from plugins/data/name_files/warrior_cats.json rename to data/name_files/warrior_cats.json diff --git a/plugins/data/recipes.txt b/data/recipes.txt similarity index 100% rename from 
plugins/data/recipes.txt rename to data/recipes.txt diff --git a/plugins/data/slaps.json b/data/slaps.json similarity index 100% rename from plugins/data/slaps.json rename to data/slaps.json diff --git a/plugins/data/slogans.txt b/data/slogans.txt similarity index 100% rename from plugins/data/slogans.txt rename to data/slogans.txt diff --git a/disabled_stuff/cleverbot.py b/disabled_stuff/cleverbot.py deleted file mode 100644 index 6604d8b..0000000 --- a/disabled_stuff/cleverbot.py +++ /dev/null @@ -1,121 +0,0 @@ -# from jessi bot -import urllib2 -import hashlib -import re -import unicodedata -from util import hook - -# these are just parts required -# TODO: Merge them. - -arglist = ['', 'y', '', '', '', '', '', '', '', '', 'wsf', '', - '', '', '', '', '', '', '', '0', 'Say', '1', 'false'] - -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') - -headers = {'X-Moz': 'prefetch', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1)Gecko/20100101 Firefox/7.0', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Referer': 'http://www.cleverbot.com', - 'Pragma': 'no-cache', 'Cache-Control': 'no-cache, no-cache', 'Accept-Language': 'en-us;q=0.8,en;q=0.5'} - -keylist = ['stimulus', 'start', 'sessionid', 'vText8', 'vText7', 'vText6', - 'vText5', 'vText4', 'vText3', 'vText2', 'icognoid', - 'icognocheck', 'prevref', 'emotionaloutput', 'emotionalhistory', - 'asbotname', 'ttsvoice', 'typing', 'lineref', 'fno', 'sub', - 'islearning', 'cleanslate'] - -MsgList = list() - - -def quote(s, safe='/'): # quote('abc def') -> 'abc%20def' - s = s.encode('utf-8') - s = s.decode('utf-8') - print "s= " + s - print "safe= " + safe - safe += always_safe - safe_map = dict() - for i in range(256): - c = chr(i) - safe_map[c] = (c in safe) and c or ('%%%02X' % i) - try: - res = map(safe_map.__getitem__, s) - except: - print "blank" - return '' - print "res= " + ''.join(res) - return ''.join(res) - - -def encode(keylist, arglist): - text = str() - for i in range(len(keylist)): - k = keylist[i] - v = quote(arglist[i]) - text += '&' + k + '=' + v - text = text[1:] - return text - - -def Send(): - data = encode(keylist, arglist) - digest_txt = data[9:29] - new_hash = hashlib.md5(digest_txt).hexdigest() - arglist[keylist.index('icognocheck')] = new_hash - data = encode(keylist, arglist) - req = urllib2.Request('http://www.cleverbot.com/webservicemin', - data, headers) - f = urllib2.urlopen(req) - reply = f.read() - return reply - - -def parseAnswers(text): - d = dict() - keys = ['text', 'sessionid', 'logurl', 'vText8', 'vText7', 'vText6', - 'vText5', 'vText4', 'vText3', 'vText2', 'prevref', 'foo', - 'emotionalhistory', 'ttsLocMP3', 'ttsLocTXT', 'ttsLocTXT3', - 'ttsText', 'lineRef', 'lineURL', 'linePOST', 'lineChoices', - 'lineChoicesAbbrev', 'typingData', 'divert'] - values = text.split('\r') - i = 0 - for key in keys: - d[key] = values[i] - i += 1 - return d - - -def ask(inp): - arglist[keylist.index('stimulus')] = inp - if MsgList: - arglist[keylist.index('lineref')] = '!0' + str(len( - MsgList) / 2) - asw = Send() - MsgList.append(inp) - answer = parseAnswers(asw) - for k, v in answer.iteritems(): - try: - arglist[keylist.index(k)] = v - except ValueError: - pass - arglist[keylist.index('emotionaloutput')] = str() - text = answer['ttsText'] - MsgList.append(text) - return text - - -@hook.command("cb") -def cleverbot(inp, reply=None): - reply(ask(inp)) - - -''' # TODO: add in command to 
control extra verbose per channel -@hook.event('PRIVMSG') -def cbevent(inp, reply=None): - reply(ask(inp)) - -@hook.command("cbver", permissions=['cleverbot']) -def cleverbotverbose(inp, notice=None): - if on in input -''' diff --git a/disabled_stuff/cloudbot.sh b/disabled_stuff/cloudbot.sh deleted file mode 100644 index 877c4ea..0000000 --- a/disabled_stuff/cloudbot.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/bin/bash -echo "" -echo " ________ ______ __ " -echo " / ____/ /___ __ ______/ / __ )____ / /_" -echo " / / / / __ \/ / / / __ / __ / __ \/ __/" -echo "/ /___/ / /_/ / /_/ / /_/ / /_/ / /_/ / /_ " -echo "\____/_/\____/\__,_/\__,_/_____/\____/\__/ " -echo " http://git.io/cloudbotirc by ClouDev " -echo "" -locatefiles() { - botfile="/bot.py" - botfile=$(pwd)$botfile - logfile="/bot.log" - logfile=$(pwd)$logfile -} - -running() { - if [[ $(ps aux|grep bot.py|grep -v grep|grep -v daemon|grep -v SCREEN) != "" ]]; then - true - else - false - fi -} - -checkbackend() { - if dpkg -l| grep ^ii|grep daemon|grep 'turns other' > /dev/null; then - backend="daemon" - elif dpkg -l| grep ^ii|grep screen|grep 'terminal multi' > /dev/null; then - backend="screen" - else - backend="manual" - fi - return 0 -} - -setcommands() { - status() { - if running; then - echo "CloudBot is running!" - else - echo "CloudBot is not running!" - fi - } - clear() { - : > $logfile - } - if [ "$backend" == "daemon" ]; then - start() { - daemon -r -n cloudbot -O $logfile python $botfile - } - stop() { - daemon -n cloudbot --stop - } - elif [ "$backend" == "screen" ]; then - start() { - screen -d -m -S cloudbot -t cloudbot python $botfile > $logfile 2>&1 - } - stop() { - pid=`ps ax|grep -v grep|grep python|grep -v SCREEN|grep $botfile|awk '{print $1}'` - kill $pid - } - elif [ "$backend" == "manual" ]; then - start() { - $botfile - } - stop() { - pid=`ps ax|grep -v grep|grep python|grep $botfile|awk '{print $1}'` - kill $pid - } - fi -} - -processargs() { - case $1 in - start|-start|--start) - if running; then - echo "Cannot start! Bot is already running!" - exit 1 - else - echo "Starting CloudBot... ($backend)" - start - fi - ;; - stop|-stop|--stop) - if running; then - echo "Stopping CloudBot... ($backend)" - stop - else - echo "Cannot stop! Bot is not already running!" - exit 1 - fi - ;; - restart|-restart|--restart) - if running; then - echo "Restarting CloudBot... ($backend)" - stop - sleep 3 - start - else - echo "Cannot restart! Bot is not already running!" - exit 1 - fi - ;; - clear|-clear|--clear) - echo "Clearing logs..." - clear - ;; - status|-status|--status) - status - ;; - *) - usage="usage: ./cloudbot {start|stop|restart|clear|status}" - echo $usage - ;; - esac -} - -main() { - locatefiles - checkbackend - setcommands - processargs $1 -} - -main $* -exit 0 \ No newline at end of file diff --git a/disabled_stuff/mtg.py b/disabled_stuff/mtg.py deleted file mode 100644 index 3db8306..0000000 --- a/disabled_stuff/mtg.py +++ /dev/null @@ -1,183 +0,0 @@ -import re - -from util import hook, http - - -@hook.command -def mtg(inp): - ".mtg -- Gets information about Magic the Gathering card ." - - url = 'http://magiccards.info/query?v=card&s=cname' - h = http.get_html(url, q=inp) - - name = h.find('body/table/tr/td/span/a') - if name is None: - return "No cards found :(" - card = name.getparent().getparent().getparent() - - type = card.find('td/p').text.replace('\n', '') - - # this is ugly - text = http.html.tostring(card.xpath("//p[@class='ctext']/b")[0]) - text = text.replace('
', '$') - text = http.html.fromstring(text).text_content() - text = re.sub(r'(\w+\s*)\$+(\s*\w+)', r'\1. \2', text) - text = text.replace('$', ' ') - text = re.sub(r'\(.*?\)', '', text) # strip parenthetical explanations - text = re.sub(r'\.(\S)', r'. \1', text) # fix spacing - - printings = card.find('td/small').text_content() - printings = re.search(r'Editions:(.*)Languages:', printings).group(1) - printings = re.findall(r'\s*(.+?(?: \([^)]+\))*) \((.*?)\)', - ' '.join(printings.split())) - - printing_out = ', '.join('%s (%s)' % (set_abbrevs.get(x[0], x[0]), - rarity_abbrevs.get(x[1], x[1])) - for x in printings) - - name.make_links_absolute(base_url=url) - link = name.attrib['href'] - name = name.text_content().strip() - type = type.strip() - text = ' '.join(text.split()) - - return ' | '.join((name, type, text, printing_out, link)) - - -set_abbrevs = { - '15th Anniversary': '15ANN', - 'APAC Junior Series': 'AJS', - 'Alara Reborn': 'ARB', - 'Alliances': 'AI', - 'Anthologies': 'AT', - 'Antiquities': 'AQ', - 'Apocalypse': 'AP', - 'Arabian Nights': 'AN', - 'Arena League': 'ARENA', - 'Asia Pacific Land Program': 'APAC', - 'Battle Royale': 'BR', - 'Battle Royale Box Set': 'BRB', - 'Beatdown': 'BTD', - 'Beatdown Box Set': 'BTD', - 'Betrayers of Kamigawa': 'BOK', - 'Celebration Cards': 'UQC', - 'Champions of Kamigawa': 'CHK', - 'Champs': 'CP', - 'Chronicles': 'CH', - 'Classic Sixth Edition': '6E', - 'Coldsnap': 'CS', - 'Coldsnap Theme Decks': 'CSTD', - 'Conflux': 'CFX', - 'Core Set - Eighth Edition': '8E', - 'Core Set - Ninth Edition': '9E', - 'Darksteel': 'DS', - 'Deckmasters': 'DM', - 'Dissension': 'DI', - 'Dragon Con': 'DRC', - 'Duel Decks: Divine vs. Demonic': 'DVD', - 'Duel Decks: Elves vs. Goblins': 'EVG', - 'Duel Decks: Garruk vs. Liliana': 'GVL', - 'Duel Decks: Jace vs. 
Chandra': 'JVC', - 'Eighth Edition': '8ED', - 'Eighth Edition Box Set': '8EB', - 'European Land Program': 'EURO', - 'Eventide': 'EVE', - 'Exodus': 'EX', - 'Fallen Empires': 'FE', - 'Fifth Dawn': '5DN', - 'Fifth Edition': '5E', - 'Fourth Edition': '4E', - 'Friday Night Magic': 'FNMP', - 'From the Vault: Dragons': 'FVD', - 'From the Vault: Exiled': 'FVE', - 'Future Sight': 'FUT', - 'Gateway': 'GRC', - 'Grand Prix': 'GPX', - 'Guildpact': 'GP', - 'Guru': 'GURU', - 'Happy Holidays': 'HHO', - 'Homelands': 'HL', - 'Ice Age': 'IA', - 'Introductory Two-Player Set': 'ITP', - 'Invasion': 'IN', - 'Judge Gift Program': 'JR', - 'Judgment': 'JU', - 'Junior Series': 'JSR', - 'Legend Membership': 'DCILM', - 'Legends': 'LG', - 'Legions': 'LE', - 'Limited Edition (Alpha)': 'LEA', - 'Limited Edition (Beta)': 'LEB', - 'Limited Edition Alpha': 'LEA', - 'Limited Edition Beta': 'LEB', - 'Lorwyn': 'LW', - 'MTGO Masters Edition': 'MED', - 'MTGO Masters Edition II': 'ME2', - 'MTGO Masters Edition III': 'ME3', - 'Magic 2010': 'M10', - 'Magic Game Day Cards': 'MGDC', - 'Magic Player Rewards': 'MPRP', - 'Magic Scholarship Series': 'MSS', - 'Magic: The Gathering Launch Parties': 'MLP', - 'Media Inserts': 'MBP', - 'Mercadian Masques': 'MM', - 'Mirage': 'MR', - 'Mirrodin': 'MI', - 'Morningtide': 'MT', - 'Multiverse Gift Box Cards': 'MGBC', - 'Nemesis': 'NE', - 'Ninth Edition Box Set': '9EB', - 'Odyssey': 'OD', - 'Onslaught': 'ON', - 'Planar Chaos': 'PC', - 'Planechase': 'PCH', - 'Planeshift': 'PS', - 'Portal': 'PO', - 'Portal Demogame': 'POT', - 'Portal Second Age': 'PO2', - 'Portal Three Kingdoms': 'P3K', - 'Premium Deck Series: Slivers': 'PDS', - 'Prerelease Events': 'PTC', - 'Pro Tour': 'PRO', - 'Prophecy': 'PR', - 'Ravnica: City of Guilds': 'RAV', - 'Release Events': 'REP', - 'Revised Edition': 'RV', - 'Saviors of Kamigawa': 'SOK', - 'Scourge': 'SC', - 'Seventh Edition': '7E', - 'Shadowmoor': 'SHM', - 'Shards of Alara': 'ALA', - 'Starter': 'ST', - 'Starter 1999': 'S99', - 'Starter 2000 Box Set': 'ST2K', - 'Stronghold': 'SH', - 'Summer of Magic': 'SOM', - 'Super Series': 'SUS', - 'Tempest': 'TP', - 'Tenth Edition': '10E', - 'The Dark': 'DK', - 'Time Spiral': 'TS', - 'Time Spiral Timeshifted': 'TSTS', - 'Torment': 'TR', - 'Two-Headed Giant Tournament': 'THGT', - 'Unglued': 'UG', - 'Unhinged': 'UH', - 'Unhinged Alternate Foils': 'UHAA', - 'Unlimited Edition': 'UN', - "Urza's Destiny": 'UD', - "Urza's Legacy": 'UL', - "Urza's Saga": 'US', - 'Visions': 'VI', - 'Weatherlight': 'WL', - 'Worlds': 'WRL', - 'WotC Online Store': 'WOTC', - 'Zendikar': 'ZEN'} - -rarity_abbrevs = { - 'Land': 'L', - 'Common': 'C', - 'Uncommon': 'UC', - 'Rare': 'R', - 'Special': 'S', - 'Mythic Rare': 'MR'} diff --git a/disabled_stuff/mygengo_translate.py b/disabled_stuff/mygengo_translate.py deleted file mode 100644 index 6e7b006..0000000 --- a/disabled_stuff/mygengo_translate.py +++ /dev/null @@ -1,115 +0,0 @@ -# BING translation plugin by Lukeroge and neersighted -from util import hook -from util import http -import re -import htmlentitydefs -import mygengo - -gengo = mygengo.MyGengo( - public_key='PlwtF1CZ2tu27IdX_SXNxTFmfN0j|_-pJ^Rf({O-oLl--r^QM4FygRdt^jusSSDE', - private_key='wlXpL=SU[#JpPu[dQaf$v{S3@rg[=95$$TA(k$sb3_6~B_zDKkTbd4#hXxaorIae', - sandbox=False, -) - -def gengo_translate(text, source, target): - try: - translation = gengo.postTranslationJob(job={ - 'type': 'text', - 'slug': 'Translating '+source+' to '+target+' with the myGengo API', - 'body_src': text, - 'lc_src': source, - 'lc_tgt': target, - 'tier': 'machine', - }) - translated = 
translation['response']['job']['body_tgt'] - return u"(%s > %s) %s" % (source, target, translated) - except mygengo.MyGengoError: - return "error: could not translate" - -def match_language(fragment): - fragment = fragment.lower() - for short, _ in lang_pairs: - if fragment in short.lower().split(): - return short.split()[0] - - for short, full in lang_pairs: - if fragment in full.lower(): - return short.split()[0] - return None - -@hook.command -def translate(inp): - ".translate -- Translates from to using MyGengo." - args = inp.split(' ') - sl = match_language(args[0]) - tl = match_language(args[1]) - txt = unicode(" ".join(args[2:])) - if sl and tl: - return unicode(gengo_translate(txt, sl, tl)) - else: - return "error: translate could not reliably determine one or both languages" - -languages = 'ja fr de ko ru zh'.split() -language_pairs = zip(languages[:-1], languages[1:]) -lang_pairs = [ - ("no", "Norwegian"), - ("it", "Italian"), - ("ht", "Haitian Creole"), - ("af", "Afrikaans"), - ("sq", "Albanian"), - ("ar", "Arabic"), - ("hy", "Armenian"), - ("az", "Azerbaijani"), - ("eu", "Basque"), - ("be", "Belarusian"), - ("bg", "Bulgarian"), - ("ca", "Catalan"), - ("zh-CN zh", "Chinese"), - ("hr", "Croatian"), - ("cs cz", "Czech"), - ("da dk", "Danish"), - ("nl", "Dutch"), - ("en", "English"), - ("et", "Estonian"), - ("tl", "Filipino"), - ("fi", "Finnish"), - ("fr", "French"), - ("gl", "Galician"), - ("ka", "Georgian"), - ("de", "German"), - ("el", "Greek"), - ("ht", "Haitian Creole"), - ("iw", "Hebrew"), - ("hi", "Hindi"), - ("hu", "Hungarian"), - ("is", "Icelandic"), - ("id", "Indonesian"), - ("ga", "Irish"), - ("it", "Italian"), - ("ja jp jpn", "Japanese"), - ("ko", "Korean"), - ("lv", "Latvian"), - ("lt", "Lithuanian"), - ("mk", "Macedonian"), - ("ms", "Malay"), - ("mt", "Maltese"), - ("no", "Norwegian"), - ("fa", "Persian"), - ("pl", "Polish"), - ("pt", "Portuguese"), - ("ro", "Romanian"), - ("ru", "Russian"), - ("sr", "Serbian"), - ("sk", "Slovak"), - ("sl", "Slovenian"), - ("es", "Spanish"), - ("sw", "Swahili"), - ("sv", "Swedish"), - ("th", "Thai"), - ("tr", "Turkish"), - ("uk", "Ukrainian"), - ("ur", "Urdu"), - ("vi", "Vietnamese"), - ("cy", "Welsh"), - ("yi", "Yiddish") -] diff --git a/disabled_stuff/religion.py b/disabled_stuff/religion.py deleted file mode 100644 index 552b23f..0000000 --- a/disabled_stuff/religion.py +++ /dev/null @@ -1,38 +0,0 @@ -from util import hook, http - - -@hook.command('god') -@hook.command -def bible(inp): - """.bible -- gets from the Bible (ESV)""" - - base_url = ('http://www.esvapi.org/v2/rest/passageQuery?key=IP&' - 'output-format=plain-text&include-heading-horizontal-lines&' - 'include-headings=false&include-passage-horizontal-lines=false&' - 'include-passage-references=false&include-short-copyright=false&' - 'include-footnotes=false&line-length=0&' - 'include-heading-horizontal-lines=false') - - text = http.get(base_url, passage=inp) - - text = ' '.join(text.split()) - - if len(text) > 400: - text = text[:text.rfind(' ', 0, 400)] + '...' 
- - return text - - -@hook.command('allah') -@hook.command -def koran(inp): # Koran look-up plugin by Ghetto Wizard - """.koran -- gets from the Koran""" - - url = 'http://quod.lib.umich.edu/cgi/k/koran/koran-idx?type=simple' - - results = http.get_html(url, q1=inp).xpath('//li') - - if not results: - return 'No results for ' + inp - - return results[0].text_content() diff --git a/disabled_stuff/repaste.py b/disabled_stuff/repaste.py deleted file mode 100644 index 1443345..0000000 --- a/disabled_stuff/repaste.py +++ /dev/null @@ -1,180 +0,0 @@ -from util import hook, http - -import urllib -import random -import urllib2 -import htmlentitydefs -import re - -re_htmlent = re.compile("&(" + "|".join(htmlentitydefs.name2codepoint.keys()) + ");") -re_numeric = re.compile(r'&#(x?)([a-fA-F0-9]+);') - - -def db_init(db): - db.execute("create table if not exists repaste(chan, manual, primary key(chan))") - db.commit() - - -def decode_html(text): - text = re.sub(re_htmlent, - lambda m: unichr(htmlentitydefs.name2codepoint[m.group(1)]), - text) - - text = re.sub(re_numeric, - lambda m: unichr(int(m.group(2), 16 if m.group(1) else 10)), - text) - return text - - -def scrape_mibpaste(url): - if not url.startswith("http"): - url = "http://" + url - pagesource = http.get(url) - rawpaste = re.search(r'(?s)(?<=\n).+(?=
)', pagesource).group(0) - filterbr = rawpaste.replace("<br>
", "") - unescaped = decode_html(filterbr) - stripped = unescaped.strip() - - return stripped - - -def scrape_pastebin(url): - id = re.search(r'(?:www\.)?pastebin.com/([a-zA-Z0-9]+)$', url).group(1) - rawurl = "http://pastebin.com/raw.php?i=" + id - text = http.get(rawurl) - - return text - - -autorepastes = {} - - -#@hook.regex('(pastebin\.com)(/[^ ]+)') -@hook.regex('(mibpaste\.com)(/[^ ]+)') -def autorepaste(inp, input=None, notice=None, db=None, chan=None, nick=None): - db_init(db) - manual = db.execute("select manual from repaste where chan=?", (chan, )).fetchone() - if manual and len(manual) and manual[0]: - return - url = inp.group(1) + inp.group(2) - urllib.unquote(url) - if url in autorepastes: - out = autorepastes[url] - notice("In the future, please use a less awful pastebin (e.g. pastebin.com)") - else: - out = repaste("http://" + url, input, db, False) - autorepastes[url] = out - notice("In the future, please use a less awful pastebin (e.g. pastebin.com) instead of %s." % inp.group(1)) - input.say("%s (repasted for %s)" % (out, nick)) - - -scrapers = { - r'mibpaste\.com': scrape_mibpaste, - r'pastebin\.com': scrape_pastebin -} - - -def scrape(url): - for pat, scraper in scrapers.iteritems(): - print "matching " + repr(pat) + " " + url - if re.search(pat, url): - break - else: - return None - - return scraper(url) - - -def paste_sprunge(text, syntax=None, user=None): - data = urllib.urlencode({"sprunge": text}) - url = urllib2.urlopen("http://sprunge.us/", data).read().strip() - - if syntax: - url += "?" + syntax - - return url - - -def paste_ubuntu(text, user=None, syntax='text'): - data = urllib.urlencode({"poster": user, - "syntax": syntax, - "content": text}) - - return urllib2.urlopen("http://paste.ubuntu.com/", data).url - - -def paste_gist(text, user=None, syntax=None, description=None): - data = { - 'file_contents[gistfile1]': text, - 'action_button': "private" - } - - if description: - data['description'] = description - - if syntax: - data['file_ext[gistfile1]'] = "." + syntax - - req = urllib2.urlopen('https://gist.github.com/gists', urllib.urlencode(data).encode('utf8')) - return req.url - - -def paste_strictfp(text, user=None, syntax="plain"): - data = urllib.urlencode(dict( - language=syntax, - paste=text, - private="private", - submit="Paste")) - req = urllib2.urlopen("http://paste.strictfp.com/", data) - return req.url - - -pasters = dict( - ubuntu=paste_ubuntu, - sprunge=paste_sprunge, - gist=paste_gist, - strictfp=paste_strictfp -) - - -@hook.command -def repaste(inp, input=None, db=None, isManual=True): - ".repaste mode|list|[provider] [syntax] -- Reuploads mibpaste to [provider]." 
- - parts = inp.split() - db_init(db) - if parts[0] == 'list': - return " ".join(pasters.keys()) - - paster = paste_gist - args = {} - - if not parts[0].startswith("http"): - p = parts[0].lower() - - if p in pasters: - paster = pasters[p] - parts = parts[1:] - - if not parts[0].startswith("http"): - p = parts[0].lower() - parts = parts[1:] - - args["syntax"] = p - - if len(parts) > 1: - return "PEBKAC" - - args["user"] = input.user - - url = parts[0] - - scraped = scrape(url) - - if not scraped: - return "No scraper for given url" - - args["text"] = scraped - pasted = paster(**args) - - return pasted diff --git a/disabled_stuff/urlhistory.py b/disabled_stuff/urlhistory.py deleted file mode 100644 index c5e344e..0000000 --- a/disabled_stuff/urlhistory.py +++ /dev/null @@ -1,80 +0,0 @@ -import math -import re -import time - -from util import hook, urlnorm, timesince - - -expiration_period = 60 * 60 * 24 # 1 day - -ignored_urls = [urlnorm.normalize("http://google.com"),] - - -def db_init(db): - db.execute("create table if not exists urlhistory" - "(chan, url, nick, time)") - db.commit() - - -def insert_history(db, chan, url, nick): - now = time.time() - db.execute("insert into urlhistory(chan, url, nick, time) " - "values(?,?,?,?)", (chan, url, nick, time.time())) - db.commit() - - -def get_history(db, chan, url): - db.execute("delete from urlhistory where time < ?", - (time.time() - expiration_period,)) - return db.execute("select nick, time from urlhistory where " - "chan=? and url=? order by time desc", (chan, url)).fetchall() - - -def nicklist(nicks): - nicks = sorted(dict(nicks), key=unicode.lower) - if len(nicks) <= 2: - return ' and '.join(nicks) - else: - return ', and '.join((', '.join(nicks[:-1]), nicks[-1])) - - -def format_reply(history): - if not history: - return - - last_nick, recent_time = history[0] - last_time = timesince.timesince(recent_time) - - if len(history) == 1: - return #"%s linked that %s ago." % (last_nick, last_time) - - hour_span = math.ceil((time.time() - history[-1][1]) / 3600) - hour_span = '%.0f hours' % hour_span if hour_span > 1 else 'hour' - - hlen = len(history) - ordinal = ["once", "twice", "%d times" % hlen][min(hlen, 3) - 1] - - if len(dict(history)) == 1: - last = "last linked %s ago" % last_time - else: - last = "last linked by %s %s ago" % (last_nick, last_time) - - return #"that url has been posted %s in the past %s by %s (%s)." % (ordinal, - -@hook.command -def url(inp, nick='', chan='', db=None, bot=None): - db_init(db) - url = urlnorm.normalize(inp.group().encode('utf-8')) - if url not in ignored_urls: - url = url.decode('utf-8') - history = get_history(db, chan, url) - insert_history(db, chan, url, nick) - - inp = match.string.lower() - - for name in dict(history): - if name.lower() in inp: # person was probably quoting a line - return # that had a link. don't remind them. - - if nick not in dict(history): - return format_reply(history) diff --git a/disabled_stuff/wordoftheday.py b/disabled_stuff/wordoftheday.py deleted file mode 100644 index 7b7a19b..0000000 --- a/disabled_stuff/wordoftheday.py +++ /dev/null @@ -1,20 +0,0 @@ -import re -from util import hook, http, misc -from BeautifulSoup import BeautifulSoup - - -@hook.command(autohelp=False) -def word(inp, say=False, nick=False): - "word -- Gets the word of the day." 
- page = http.get('http://merriam-webster.com/word-of-the-day') - - soup = BeautifulSoup(page) - - word = soup.find('strong', {'class': 'main_entry_word'}).renderContents() - function = soup.find('p', {'class': 'word_function'}).renderContents() - - #definitions = re.findall(r':' - # r' *([^<]+)', content) - - say("(%s) The word of the day is:"\ - " \x02%s\x02 (%s)" % (nick, word, function)) diff --git a/disabled_stuff/wrapper.old b/disabled_stuff/wrapper.old deleted file mode 100644 index d2f2cda..0000000 --- a/disabled_stuff/wrapper.old +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python -# Bot Wrapper by neersighted - -# Import required modules -import os -import sys -import subprocess -import json -import re - -# Files -configfile = os.path.isfile("./config") -botfile = os.path.isfile("./bot.py") - -# Colors -nocol = "\033[1;m" -red = "\033[1;31m" -green = "\033[1;32m" - -# Messages -firstrun = "Welclome to your first run of: " -usage = "usage: ./cloudbot {start|stop|restart|status}" -iusage = "{1|start} {2|stop} {3|restart} {4|status} {5|exit}" -quit = "Thanks for using CloudBot!" - -error1 = red + "Neither screen nor daemon is installed! "\ - "This program cannot run! {ERROR 1}" + nocol -error2 = red + "Could not find bot.py! Are you in the wrong folder? "\ - "{ERROR 2}" + nocol -error3 = red + "Invalid choice, exiting! {ERROR 3}" + nocol -error4 = red + "Program killed by user! {ERROR 4}" + nocol -error5 = red + "Invalid backend in config! (Or, backend not installed)"\ - " {ERROR 5}" + nocol -error6 = red + "Author error! We be derpin'! {ERROR 6}" + nocol - - -# Commands -pwd = os.getcwd() -clearlog = ": > ./bot.log" - -start = "echo " + "'" + error1 + "'" -stop = "echo " + "'" + error1 + "'" -restart = "echo " + "'" + error1 + "'" -pid = "echo 'Cannot get pid'" - -daemonstart = "daemon -r -n cloudbot -O " + pwd + \ - "/bot.log python " + pwd + "/bot.py" -daemonstop = "daemon -n cloudbot --stop" -daemonrestart = "./cloudbot stop > /dev/null 2>&1 && ./cloudbot start > /dev/null 2>&1" -daemonpid = "pidof /usr/bin/daemon" - -screenstart = "screen -d -m -S cloudbot -t cloudbot python " + pwd +\ - "/bot.py > " + pwd + "/bot.log 2>&1" -screenstop = "kill `pidof /usr/bin/screen`" -screenrestart = "./cloudbot stop > /dev/null 2>&1 && ./cloudbot start > /dev/null 2>&1" -screenpid = "pidof /usr/bin/screen" - -# Checks -if configfile: - try: - config = json.load(open('config')) - command = ":" - except ValueError, e: - print 'error: malformed config', e -else: - config = False - command = "python bot.py" - -daemoncheck = subprocess.check_output("locate /usr/bin/daemon", shell=True) -daemon = re.match(r'^/usr/bin/daemon$', daemoncheck) - -screencheck = subprocess.check_output("locate /usr/bin/screen", shell=True) -screen = re.match(r'^/usr/bin/screen$', screencheck) - -if configfile: - backend = config.get("wrapper", {}).get("backend", "daemon") - daemonloc = config.get("wrapper", {}).get("daemonloc", "/usr/bin/daemon") - screenloc = config.get("wrapper", {}).get("screenloc", "/usr/bin/screen") -else: - backend = False - daemonloc = "/usr/bin/daemon" - screenloc = "/usr/bin/screen" - -try: - runningcheck = subprocess.check_output("ps ax|grep cloudbot|"\ - "grep -v grep|grep -v ./cloudbot", shell=True) - running = re.match(r'^[1-9]+', runningcheck) -except (subprocess.CalledProcessError): - running = False - -# Set commands -if (backend == "daemon"): - if daemon: - start = daemonstart - stop = daemonstop - restart = daemonrestart - pid = daemonpid - else: - print error5 - exit -elif (backend 
== "screen"): - if screen: - start = screenstart - stop = screenstop - restart = screenrestart - pid = screenpid - else: - print error5 - exit -elif (backend == False): - print firstrun -else: - print error5 - exit - -# Fancy banner -print " ______ __ ______ __ __ "\ -" _______ .______ ______ .___________." -print " / || | / __ \ | | | | "\ -"| \ | _ \ / __ \ | |" -print "| ,----'| | | | | | | | | | "\ -"| .--. || |_) | | | | | `---| |----`" -print "| | | | | | | | | | | | "\ -"| | | || _ < | | | | | | " -print "| `----.| `----.| `--' | | `--' | "\ -"| '--' || |_) | | `--' | | | " -print " \______||_______| \______/ \______/ "\ -"|_______/ |______/ \______/ |__| " -print "http://git.io/cloudbot "\ -" by lukeroge" - -# Read arguments/turn interactive -try: - if (len(sys.argv) > 1): - read = 0 - else: - sys.argv = "interactive" - print iusage - read = int(raw_input("Please choose a option: ")) - - if (sys.argv[1] == "start") or (read == 1): - if running: - print "Bot is already running, cannot start!" - else: - command = start - print "Starting... (" + backend + ")" - elif (sys.argv[1] == "stop") or (read == 2): - if running: - command = stop - print "Stopping... (" + backend + ")" - else: - print "Bot is not running, cannot stop!" - elif (sys.argv[1] == "restart") or (read == 3): - if running: - command = restart - print "Restarting... (" + backend + ")" - else: - print "Bot is not running, cannot restart!" - elif (sys.argv[1] == "status") or (read == 4): - if running: - command = pid - print green + "Bot is running! " + nocol - else: - print red + "Bot is not running! " + nocol - elif (sys.argv[1] == "clear"): - command = clearlog - elif (sys.argv[1] == "exit") or (read == 5): - exit - elif (sys.argv[1] == "interactive"): - pass - else: - print usage - exit - -# Pretify errors -except (TypeError, ValueError), e: - print error3 - exit -except (KeyboardInterrupt), e: - print error4 - exit -except (NameError, SyntaxError), e: - print error6 - exit - -# Check for bot files -if botfile: - pass -else: - print error2 - exit - -# Call command -subprocess.call(command, shell=True) -print quit -exit diff --git a/lib/bs4/AUTHORS.txt b/lib/bs4/AUTHORS.txt deleted file mode 100644 index 2ac8fcc..0000000 --- a/lib/bs4/AUTHORS.txt +++ /dev/null @@ -1,43 +0,0 @@ -Behold, mortal, the origins of Beautiful Soup... -================================================ - -Leonard Richardson is the primary programmer. - -Aaron DeVore is awesome. - -Mark Pilgrim provided the encoding detection code that forms the base -of UnicodeDammit. - -Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful -Soup 4 working under Python 3. - -Simon Willison wrote soupselect, which was used to make Beautiful Soup -support CSS selectors. - -Sam Ruby helped with a lot of edge cases. - -Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his -work in solving the nestable tags conundrum. 
- -An incomplete list of people have contributed patches to Beautiful -Soup: - - Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, - Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris - Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren, - Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed - Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko - Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn - Webster, Paul Wright, Danny Yoo - -An incomplete list of people who made suggestions or found bugs or -found ways to break Beautiful Soup: - - Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, - Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, - Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams, - warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, - Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed - Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart - Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de - Sousa Rocha, Yichun Wei, Per Vognsen diff --git a/lib/bs4/COPYING.txt b/lib/bs4/COPYING.txt deleted file mode 100644 index d668d13..0000000 --- a/lib/bs4/COPYING.txt +++ /dev/null @@ -1,26 +0,0 @@ -Beautiful Soup is made available under the MIT license: - - Copyright (c) 2004-2012 Leonard Richardson - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE, DAMMIT. - -Beautiful Soup incorporates code from the html5lib library, which is -also made available under the MIT license. diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py deleted file mode 100644 index 03b2416..0000000 --- a/lib/bs4/__init__.py +++ /dev/null @@ -1,365 +0,0 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup uses a pluggable XML or HTML parser to parse a -(possibly invalid) document into a tree representation. Beautiful Soup -provides provides methods and Pythonic idioms that make it easy to -navigate, search, and modify the parse tree. - -Beautiful Soup works with Python 2.6 and up. It works better if lxml -and/or html5lib is installed. 
- -For more than you ever wanted to know about Beautiful Soup, see the -documentation: -http://www.crummy.com/software/BeautifulSoup/bs4/doc/ -""" - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.2.1" -__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" -__license__ = "MIT" - -__all__ = ['BeautifulSoup'] - -import re -import warnings - -from .builder import builder_registry -from .dammit import UnicodeDammit -from .element import ( - CData, - Comment, - DEFAULT_OUTPUT_ENCODING, - Declaration, - Doctype, - NavigableString, - PageElement, - ProcessingInstruction, - ResultSet, - SoupStrainer, - Tag, - ) - -# The very first thing we do is give a useful error if someone is -# running this code under Python 3 without converting it. -syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' - -class BeautifulSoup(Tag): - """ - This class defines the basic interface called by the tree builders. - - These methods will be called by the parser: - reset() - feed(markup) - - The tree builder may call these methods from its feed() implementation: - handle_starttag(name, attrs) # See note about return value - handle_endtag(name) - handle_data(data) # Appends to the current data node - endData(containerClass=NavigableString) # Ends the current data node - - No matter how complicated the underlying parser is, you should be - able to build a tree using 'start tag' events, 'end tag' events, - 'data' events, and "done with data" events. - - If you encounter an empty-element tag (aka a self-closing tag, - like HTML's
tag), call handle_starttag and then - handle_endtag. - """ - ROOT_TAG_NAME = u'[document]' - - # If the end-user gives no indication which tree builder they - # want, look for one with these features. - DEFAULT_BUILDER_FEATURES = ['html', 'fast'] - - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. - STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } - - def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, **kwargs): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser.""" - - if 'convertEntities' in kwargs: - warnings.warn( - "BS4 does not respect the convertEntities argument to the " - "BeautifulSoup constructor. Entities are always converted " - "to Unicode characters.") - - if 'markupMassage' in kwargs: - del kwargs['markupMassage'] - warnings.warn( - "BS4 does not respect the markupMassage argument to the " - "BeautifulSoup constructor. The tree builder is responsible " - "for any necessary markup massage.") - - if 'smartQuotesTo' in kwargs: - del kwargs['smartQuotesTo'] - warnings.warn( - "BS4 does not respect the smartQuotesTo argument to the " - "BeautifulSoup constructor. Smart quotes are always converted " - "to Unicode characters.") - - if 'selfClosingTags' in kwargs: - del kwargs['selfClosingTags'] - warnings.warn( - "BS4 does not respect the selfClosingTags argument to the " - "BeautifulSoup constructor. The tree builder is responsible " - "for understanding self-closing tags.") - - if 'isHTML' in kwargs: - del kwargs['isHTML'] - warnings.warn( - "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. You can pass in features='html' " - "or features='xml' to get a builder capable of handling " - "one or the other.") - - def deprecated_argument(old_name, new_name): - if old_name in kwargs: - warnings.warn( - 'The "%s" argument to the BeautifulSoup constructor ' - 'has been renamed to "%s."' % (old_name, new_name)) - value = kwargs[old_name] - del kwargs[old_name] - return value - return None - - parse_only = parse_only or deprecated_argument( - "parseOnlyThese", "parse_only") - - from_encoding = from_encoding or deprecated_argument( - "fromEncoding", "from_encoding") - - if len(kwargs) > 0: - arg = kwargs.keys().pop() - raise TypeError( - "__init__() got an unexpected keyword argument '%s'" % arg) - - if builder is None: - if isinstance(features, basestring): - features = [features] - if features is None or len(features) == 0: - features = self.DEFAULT_BUILDER_FEATURES - builder_class = builder_registry.lookup(*features) - if builder_class is None: - raise FeatureNotFound( - "Couldn't find a tree builder with the features you " - "requested: %s. Do you need to install a parser library?" - % ",".join(features)) - builder = builder_class() - self.builder = builder - self.is_xml = builder.is_xml - self.builder.soup = self - - self.parse_only = parse_only - - self.reset() - - if hasattr(markup, 'read'): # It's a file-type object. 
- markup = markup.read() - (self.markup, self.original_encoding, self.declared_html_encoding, - self.contains_replacement_characters) = ( - self.builder.prepare_markup(markup, from_encoding)) - - try: - self._feed() - except StopParsing: - pass - - # Clear out the markup and remove the builder's circular - # reference to this object. - self.markup = None - self.builder.soup = None - - def _feed(self): - # Convert the document to Unicode. - self.builder.reset() - - self.builder.feed(self.markup) - # Close out any unfinished strings and close all the open tags. - self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def reset(self): - Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) - self.hidden = 1 - self.builder.reset() - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.pushTag(self) - - def new_tag(self, name, namespace=None, nsprefix=None, **attrs): - """Create a new tag associated with this soup.""" - return Tag(None, self.builder, name, namespace, nsprefix, attrs) - - def new_string(self, s, subclass=NavigableString): - """Create a new NavigableString associated with this soup.""" - navigable = subclass(s) - navigable.setup() - return navigable - - def insert_before(self, successor): - raise NotImplementedError("BeautifulSoup objects don't support insert_before().") - - def insert_after(self, successor): - raise NotImplementedError("BeautifulSoup objects don't support insert_after().") - - def popTag(self): - tag = self.tagStack.pop() - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - #print "Push", tag.name - if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self, containerClass=NavigableString): - if self.currentData: - currentData = u''.join(self.currentData) - if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not set([tag.name for tag in self.tagStack]).intersection( - self.builder.preserve_whitespace_tags)): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - self.currentData = [] - if self.parse_only and len(self.tagStack) <= 1 and \ - (not self.parse_only.text or \ - not self.parse_only.search(currentData)): - return - o = containerClass(currentData) - self.object_was_parsed(o) - - def object_was_parsed(self, o, parent=None, most_recent_element=None): - """Add an object to the parse tree.""" - parent = parent or self.currentTag - most_recent_element = most_recent_element or self._most_recent_element - o.setup(parent, most_recent_element) - if most_recent_element is not None: - most_recent_element.next_element = o - self._most_recent_element = o - parent.contents.append(o) - - def _popToTag(self, name, nsprefix=None, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. 
If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - - for i in range(len(self.tagStack) - 1, 0, -1): - if (name == self.tagStack[i].name - and nsprefix == self.tagStack[i].prefix): - numPops = len(self.tagStack) - i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def handle_starttag(self, name, namespace, nsprefix, attrs): - """Push a start tag on to the stack. - - If this method returns None, the tag was rejected by the - SoupStrainer. You should proceed as if the tag had not occured - in the document. For instance, if this was a self-closing tag, - don't call handle_endtag. - """ - - # print "Start tag %s: %s" % (name, attrs) - self.endData() - - if (self.parse_only and len(self.tagStack) <= 1 - and (self.parse_only.text - or not self.parse_only.search_tag(name, attrs))): - return None - - tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, - self.currentTag, self._most_recent_element) - if tag is None: - return tag - if self._most_recent_element: - self._most_recent_element.next_element = tag - self._most_recent_element = tag - self.pushTag(tag) - return tag - - def handle_endtag(self, name, nsprefix=None): - #print "End tag: " + name - self.endData() - self._popToTag(name, nsprefix) - - def handle_data(self, data): - self.currentData.append(data) - - def decode(self, pretty_print=False, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Returns a string or Unicode representation of this document. - To get Unicode, pass None for encoding.""" - - if self.is_xml: - # Print the XML declaration - encoding_part = '' - if eventual_encoding != None: - encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'\n' % encoding_part - else: - prefix = u'' - if not pretty_print: - indent_level = None - else: - indent_level = 0 - return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, formatter) - -# Alias to make it easier to type import: 'from bs4 import _soup' -_s = BeautifulSoup -_soup = BeautifulSoup - -class BeautifulStoneSoup(BeautifulSoup): - """Deprecated interface to an XML parser.""" - - def __init__(self, *args, **kwargs): - kwargs['features'] = 'xml' - warnings.warn( - 'The BeautifulStoneSoup class is deprecated. Instead of using ' - 'it, pass features="xml" into the BeautifulSoup constructor.') - super(BeautifulStoneSoup, self).__init__(*args, **kwargs) - - -class StopParsing(Exception): - pass - - -class FeatureNotFound(ValueError): - pass - - -#By default, act as an HTML pretty-printer. -if __name__ == '__main__': - import sys - soup = BeautifulSoup(sys.stdin) - print soup.prettify() diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py deleted file mode 100644 index bae453e..0000000 --- a/lib/bs4/builder/__init__.py +++ /dev/null @@ -1,316 +0,0 @@ -from collections import defaultdict -import itertools -import sys -from bs4.element import ( - CharsetMetaAttributeValue, - ContentMetaAttributeValue, - whitespace_re - ) - -__all__ = [ - 'HTMLTreeBuilder', - 'SAXTreeBuilder', - 'TreeBuilder', - 'TreeBuilderRegistry', - ] - -# Some useful features for a TreeBuilder to have. 
-FAST = 'fast' -PERMISSIVE = 'permissive' -STRICT = 'strict' -XML = 'xml' -HTML = 'html' -HTML_5 = 'html5' - - -class TreeBuilderRegistry(object): - - def __init__(self): - self.builders_for_feature = defaultdict(list) - self.builders = [] - - def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features.""" - for feature in treebuilder_class.features: - self.builders_for_feature[feature].insert(0, treebuilder_class) - self.builders.insert(0, treebuilder_class) - - def lookup(self, *features): - if len(self.builders) == 0: - # There are no builders at all. - return None - - if len(features) == 0: - # They didn't ask for any features. Give them the most - # recently registered builder. - return self.builders[0] - - # Go down the list of features in order, and eliminate any builders - # that don't match every feature. - features = list(features) - features.reverse() - candidates = None - candidate_set = None - while len(features) > 0: - feature = features.pop() - we_have_the_feature = self.builders_for_feature.get(feature, []) - if len(we_have_the_feature) > 0: - if candidates is None: - candidates = we_have_the_feature - candidate_set = set(candidates) - else: - # Eliminate any candidates that don't have this feature. - candidate_set = candidate_set.intersection( - set(we_have_the_feature)) - - # The only valid candidates are the ones in candidate_set. - # Go through the original list of candidates and pick the first one - # that's in candidate_set. - if candidate_set is None: - return None - for candidate in candidates: - if candidate in candidate_set: - return candidate - return None - -# The BeautifulSoup class will take feature lists from developers and use them -# to look up builders in this registry. -builder_registry = TreeBuilderRegistry() - -class TreeBuilder(object): - """Turn a document into a Beautiful Soup object tree.""" - - features = [] - - is_xml = False - preserve_whitespace_tags = set() - empty_element_tags = None # A tag will be considered an empty-element - # tag when and only when it has no contents. - - # A value for these tag/attribute combinations is a space- or - # comma-separated list of CDATA, rather than a single CDATA. - cdata_list_attributes = {} - - - def __init__(self): - self.soup = None - - def reset(self): - pass - - def can_be_empty_element(self, tag_name): - """Might a tag with this name be an empty-element tag? - - The final markup may or may not actually present this tag as - self-closing. - - For instance: an HTMLBuilder does not consider a

<p> tag to be - an empty-element tag (it's not in - HTMLBuilder.empty_element_tags). This means an empty <p> tag - will be presented as "<p></p>", not "<p/>
". - - The default implementation has no opinion about which tags are - empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no contents. - "" will become "", and "bar" will - be left alone. - """ - if self.empty_element_tags is None: - return True - return tag_name in self.empty_element_tags - - def feed(self, markup): - raise NotImplementedError() - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - return markup, None, None, False - - def test_fragment_to_document(self, fragment): - """Wrap an HTML fragment to make it look like a document. - - Different parsers do this differently. For instance, lxml - introduces an empty tag, and html5lib - doesn't. Abstracting this away lets us write simple tests - which run HTML fragments through the parser and compare the - results against other HTML fragments. - - This method should not be used outside of tests. - """ - return fragment - - def set_up_substitutions(self, tag): - return False - - def _replace_cdata_list_attribute_values(self, tag_name, attrs): - """Replaces class="foo bar" with class=["foo", "bar"] - - Modifies its input in place. - """ - if self.cdata_list_attributes: - universal = self.cdata_list_attributes.get('*', []) - tag_specific = self.cdata_list_attributes.get( - tag_name.lower(), []) - for cdata_list_attr in itertools.chain(universal, tag_specific): - if cdata_list_attr in attrs: - # Basically, we have a "class" attribute whose - # value is a whitespace-separated list of CSS - # classes. Split it into a list. - value = attrs[cdata_list_attr] - if isinstance(value, basestring): - values = whitespace_re.split(value) - else: - # html5lib sometimes calls setAttributes twice - # for the same tag when rearranging the parse - # tree. On the second call the attribute value - # here is already a list. If this happens, - # leave the value alone rather than trying to - # split it again. - values = value - attrs[cdata_list_attr] = values - return attrs - -class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events.""" - - def feed(self, markup): - raise NotImplementedError() - - def close(self): - pass - - def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in list(attrs.items())) - #print "Start %s, %r" % (name, attrs) - self.soup.handle_starttag(name, attrs) - - def endElement(self, name): - #print "End %s" % name - self.soup.handle_endtag(name) - - def startElementNS(self, nsTuple, nodeName, attrs): - # Throw away (ns, nodeName) for now. - self.startElement(nodeName, attrs) - - def endElementNS(self, nsTuple, nodeName): - # Throw away (ns, nodeName) for now. - self.endElement(nodeName) - #handler.endElementNS((ns, node.nodeName), node.nodeName) - - def startPrefixMapping(self, prefix, nodeValue): - # Ignore the prefix for now. - pass - - def endPrefixMapping(self, prefix): - # Ignore the prefix for now. - # handler.endPrefixMapping(prefix) - pass - - def characters(self, content): - self.soup.handle_data(content) - - def startDocument(self): - pass - - def endDocument(self): - pass - - -class HTMLTreeBuilder(TreeBuilder): - """This TreeBuilder knows facts about HTML. - - Such as which tags are empty-element tags. 
- """ - - preserve_whitespace_tags = set(['pre', 'textarea']) - empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - - # The HTML standard defines these attributes as containing a - # space-separated list of values, not a single value. That is, - # class="foo bar" means that the 'class' attribute has two values, - # 'foo' and 'bar', not the single value 'foo bar'. When we - # encounter one of these attributes, we will parse its value into - # a list of values if possible. Upon output, the list will be - # converted back into a string. - cdata_list_attributes = { - "*" : ['class', 'accesskey', 'dropzone'], - "a" : ['rel', 'rev'], - "link" : ['rel', 'rev'], - "td" : ["headers"], - "th" : ["headers"], - "td" : ["headers"], - "form" : ["accept-charset"], - "object" : ["archive"], - - # These are HTML5 specific, as are *.accesskey and *.dropzone above. - "area" : ["rel"], - "icon" : ["sizes"], - "iframe" : ["sandbox"], - "output" : ["for"], - } - - def set_up_substitutions(self, tag): - # We are only interested in tags - if tag.name != 'meta': - return False - - http_equiv = tag.get('http-equiv') - content = tag.get('content') - charset = tag.get('charset') - - # We are interested in tags that say what encoding the - # document was originally in. This means HTML 5-style - # tags that provide the "charset" attribute. It also means - # HTML 4-style tags that provide the "content" - # attribute and have "http-equiv" set to "content-type". - # - # In both cases we will replace the value of the appropriate - # attribute with a standin object that can take on any - # encoding. - meta_encoding = None - if charset is not None: - # HTML 5 style: - # - meta_encoding = charset - tag['charset'] = CharsetMetaAttributeValue(charset) - - elif (content is not None and http_equiv is not None - and http_equiv.lower() == 'content-type'): - # HTML 4 style: - # - tag['content'] = ContentMetaAttributeValue(content) - - return (meta_encoding is not None) - -def register_treebuilders_from(module): - """Copy TreeBuilders from the given module into this module.""" - # I'm fairly sure this is not the best way to do this. - this_module = sys.modules['bs4.builder'] - for name in module.__all__: - obj = getattr(module, name) - - if issubclass(obj, TreeBuilder): - setattr(this_module, name, obj) - this_module.__all__.append(name) - # Register the builder while we're at it. - this_module.builder_registry.register(obj) - -# Builders are registered in reverse order of priority, so that custom -# builder registrations will take precedence. In general, we want lxml -# to take precedence over html5lib, because it's faster. And we only -# want to use HTMLParser as a last result. -from . import _htmlparser -register_treebuilders_from(_htmlparser) -try: - from . import _html5lib - register_treebuilders_from(_html5lib) -except ImportError: - # They don't have html5lib installed. - pass -try: - from . import _lxml - register_treebuilders_from(_lxml) -except ImportError: - # They don't have lxml installed. 
- pass diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py deleted file mode 100644 index e439ac8..0000000 --- a/lib/bs4/builder/_html5lib.py +++ /dev/null @@ -1,222 +0,0 @@ -__all__ = [ - 'HTML5TreeBuilder', - ] - -import warnings -from bs4.builder import ( - PERMISSIVE, - HTML, - HTML_5, - HTMLTreeBuilder, - ) -from bs4.element import NamespacedAttribute -import html5lib -from html5lib.constants import namespaces -from bs4.element import ( - Comment, - Doctype, - NavigableString, - Tag, - ) - -class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree.""" - - features = ['html5lib', PERMISSIVE, HTML_5, HTML] - - def prepare_markup(self, markup, user_specified_encoding): - # Store the user-specified encoding for use later on. - self.user_specified_encoding = user_specified_encoding - return markup, None, None, False - - # These methods are defined by Beautiful Soup. - def feed(self, markup): - if self.soup.parse_only is not None: - warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") - parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) - - # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): - # We need to special-case this because html5lib sets - # charEncoding to UTF-8 if it gets Unicode input. - doc.original_encoding = None - else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] - - def create_treebuilder(self, namespaceHTMLElements): - self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) - return self.underlying_builder - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'%s' % fragment - - -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): - - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup - super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) - - def documentClass(self): - self.soup.reset() - return Element(self.soup, self.soup, None) - - def insertDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - - doctype = Doctype.for_name_and_ids(name, publicId, systemId) - self.soup.object_was_parsed(doctype) - - def elementClass(self, name, namespace): - tag = self.soup.new_tag(name, namespace) - return Element(tag, self.soup, namespace) - - def commentClass(self, data): - return TextNode(Comment(data), self.soup) - - def fragmentClass(self): - self.soup = BeautifulSoup("") - self.soup.name = "[document_fragment]" - return Element(self.soup, self.soup, None) - - def appendChild(self, node): - # XXX This code is not covered by the BS4 tests. 
- self.soup.append(node.element) - - def getDocument(self): - return self.soup - - def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element - -class AttrList(object): - def __init__(self, element): - self.element = element - self.attrs = dict(self.element.attrs) - def __iter__(self): - return list(self.attrs.items()).__iter__() - def __setitem__(self, name, value): - "set attr", name, value - self.element[name] = value - def items(self): - return list(self.attrs.items()) - def keys(self): - return list(self.attrs.keys()) - def __len__(self): - return len(self.attrs) - def __getitem__(self, name): - return self.attrs[name] - def __contains__(self, name): - return name in list(self.attrs.keys()) - - -class Element(html5lib.treebuilders._base.Node): - def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) - self.element = element - self.soup = soup - self.namespace = namespace - - def appendChild(self, node): - if (node.element.__class__ == NavigableString and self.element.contents - and self.element.contents[-1].__class__ == NavigableString): - # Concatenate new text onto old text node - # XXX This has O(n^2) performance, for input like - # "aaa..." - old_element = self.element.contents[-1] - new_element = self.soup.new_string(old_element + node.element) - old_element.replace_with(new_element) - self.soup._most_recent_element = new_element - else: - self.soup.object_was_parsed(node.element, parent=self.element) - - def getAttributes(self): - return AttrList(self.element) - - def setAttributes(self, attributes): - if attributes is not None and len(attributes) > 0: - - converted_attributes = [] - for name, value in list(attributes.items()): - if isinstance(name, tuple): - new_name = NamespacedAttribute(*name) - del attributes[name] - attributes[new_name] = value - - self.soup.builder._replace_cdata_list_attribute_values( - self.name, attributes) - for name, value in attributes.items(): - self.element[name] = value - - # The attributes may contain variables that need substitution. - # Call set_up_substitutions manually. - # - # The Tag constructor called this method when the Tag was created, - # but we just set/changed the attributes, so call it again. 
- self.soup.builder.set_up_substitutions(self.element) - attributes = property(getAttributes, setAttributes) - - def insertText(self, data, insertBefore=None): - text = TextNode(self.soup.new_string(data), self.soup) - if insertBefore: - self.insertBefore(text, insertBefore) - else: - self.appendChild(text) - - def insertBefore(self, node, refNode): - index = self.element.index(refNode.element) - if (node.element.__class__ == NavigableString and self.element.contents - and self.element.contents[index-1].__class__ == NavigableString): - # (See comments in appendChild) - old_node = self.element.contents[index-1] - new_str = self.soup.new_string(old_node + node.element) - old_node.replace_with(new_str) - else: - self.element.insert(index, node.element) - node.parent = self - - def removeChild(self, node): - node.element.extract() - - def reparentChildren(self, newParent): - while self.element.contents: - child = self.element.contents[0] - child.extract() - if isinstance(child, Tag): - newParent.appendChild( - Element(child, self.soup, namespaces["html"])) - else: - newParent.appendChild( - TextNode(child, self.soup)) - - def cloneNode(self): - tag = self.soup.new_tag(self.element.name, self.namespace) - node = Element(tag, self.soup, self.namespace) - for key,value in self.attributes: - node.attributes[key] = value - return node - - def hasContent(self): - return self.element.contents - - def getNameTuple(self): - if self.namespace == None: - return namespaces["html"], self.name - else: - return self.namespace, self.name - - nameTuple = property(getNameTuple) - -class TextNode(Element): - def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) - self.element = element - self.soup = soup - - def cloneNode(self): - raise NotImplementedError diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py deleted file mode 100644 index 65ee618..0000000 --- a/lib/bs4/builder/_htmlparser.py +++ /dev/null @@ -1,249 +0,0 @@ -"""Use the HTMLParser library to parse HTML files that aren't too bad.""" - -__all__ = [ - 'HTMLParserTreeBuilder', - ] - -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) -import sys -import warnings - -# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' -# argument, which we'd like to set to False. Unfortunately, -# http://bugs.python.org/issue13273 makes strict=True a better bet -# before Python 3.2.3. -# -# At the end of this file, we monkeypatch HTMLParser so that -# strict=True works well on Python 3.2.2. -major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = ( - major > 3 - or (major == 3 and minor > 2) - or (major == 3 and minor == 2 and release >= 3)) - -from bs4.element import ( - CData, - Comment, - Declaration, - Doctype, - ProcessingInstruction, - ) -from bs4.dammit import EntitySubstitution, UnicodeDammit - -from bs4.builder import ( - HTML, - HTMLTreeBuilder, - STRICT, - ) - - -HTMLPARSER = 'html.parser' - -class BeautifulSoupHTMLParser(HTMLParser): - def handle_starttag(self, name, attrs): - # XXX namespace - self.soup.handle_starttag(name, None, None, dict(attrs)) - - def handle_endtag(self, name): - self.soup.handle_endtag(name) - - def handle_data(self, data): - self.soup.handle_data(data) - - def handle_charref(self, name): - # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed. 
- if name.startswith('x'): - real_name = int(name.lstrip('x'), 16) - elif name.startswith('X'): - real_name = int(name.lstrip('X'), 16) - else: - real_name = int(name) - - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" - - self.handle_data(data) - - def handle_entityref(self, name): - character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) - if character is not None: - data = character - else: - data = "&%s;" % name - self.handle_data(data) - - def handle_comment(self, data): - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(Comment) - - def handle_decl(self, data): - self.soup.endData() - if data.startswith("DOCTYPE "): - data = data[len("DOCTYPE "):] - elif data == 'DOCTYPE': - # i.e. "" - data = '' - self.soup.handle_data(data) - self.soup.endData(Doctype) - - def unknown_decl(self, data): - if data.upper().startswith('CDATA['): - cls = CData - data = data[len('CDATA['):] - else: - cls = Declaration - self.soup.endData() - self.soup.handle_data(data) - self.soup.endData(cls) - - def handle_pi(self, data): - self.soup.endData() - if data.endswith("?") and data.lower().startswith("xml"): - # "An XHTML processing instruction using the trailing '?' - # will cause the '?' to be included in data." - HTMLParser - # docs. - # - # Strip the question mark so we don't end up with two - # question marks. - data = data[:-1] - self.soup.handle_data(data) - self.soup.endData(ProcessingInstruction) - - -class HTMLParserTreeBuilder(HTMLTreeBuilder): - - is_xml = False - features = [HTML, STRICT, HTMLPARSER] - - def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: - kwargs['strict'] = False - self.parser_args = (args, kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). - """ - if isinstance(markup, unicode): - return markup, None, None, False - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - args, kwargs = self.parser_args - parser = BeautifulSoupHTMLParser(*args, **kwargs) - parser.soup = self.soup - try: - parser.feed(markup) - except HTMLParseError, e: - warnings.warn(RuntimeWarning( - "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) - raise e - -# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some -# 3.2.3 code. This ensures they don't treat markup like
<a/>
as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py deleted file mode 100644 index be35d70..0000000 --- a/lib/bs4/builder/_lxml.py +++ /dev/null @@ -1,199 +0,0 @@ -__all__ = [ - 'LXMLTreeBuilderForXML', - 'LXMLTreeBuilder', - ] - -from io import BytesIO -from StringIO import StringIO -import collections -from lxml import etree -from bs4.element import Comment, Doctype, NamespacedAttribute -from bs4.builder import ( - FAST, - HTML, - HTMLTreeBuilder, - PERMISSIVE, - TreeBuilder, - XML) -from bs4.dammit import UnicodeDammit - -LXML = 'lxml' - -class LXMLTreeBuilderForXML(TreeBuilder): - DEFAULT_PARSER_CLASS = etree.XMLParser - - is_xml = True - - # Well, it's permissive by XML parser standards. 
- features = [LXML, XML, FAST, PERMISSIVE] - - CHUNK_SIZE = 512 - - # This namespace mapping is specified in the XML Namespace - # standard. - DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} - - @property - def default_parser(self): - # This can either return a parser object or a class, which - # will be instantiated with default arguments. - return etree.XMLParser(target=self, strip_cdata=False, recover=True) - - def __init__(self, parser=None, empty_element_tags=None): - if empty_element_tags is not None: - self.empty_element_tags = set(empty_element_tags) - if parser is None: - # Use the default parser. - parser = self.default_parser - if isinstance(parser, collections.Callable): - # Instantiate the parser with default arguments - parser = parser(target=self, strip_cdata=False) - self.parser = parser - self.soup = None - self.nsmaps = [self.DEFAULT_NSMAPS] - - def _getNsTag(self, tag): - # Split the namespace URL out of a fully-qualified lxml tag - # name. Copied from lxml's src/lxml/sax.py. - if tag[0] == '{': - return tuple(tag[1:].split('}', 1)) - else: - return (None, tag) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 3-tuple (markup, original encoding, encoding - declared within markup). - """ - if isinstance(markup, unicode): - return markup, None, None, False - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - if isinstance(markup, bytes): - markup = BytesIO(markup) - elif isinstance(markup, unicode): - markup = StringIO(markup) - # Call feed() at least once, even if the markup is empty, - # or the parser won't be initialized. - data = markup.read(self.CHUNK_SIZE) - self.parser.feed(data) - while data != '': - # Now call feed() on the rest of the data, chunk by chunk. - data = markup.read(self.CHUNK_SIZE) - if data != '': - self.parser.feed(data) - self.parser.close() - - def close(self): - self.nsmaps = [self.DEFAULT_NSMAPS] - - def start(self, name, attrs, nsmap={}): - # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. - attrs = dict(attrs) - nsprefix = None - # Invert each namespace map as it comes in. - if len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) - elif len(nsmap) > 0: - # A new namespace mapping has come into play. - inverted_nsmap = dict((value, key) for key, value in nsmap.items()) - self.nsmaps.append(inverted_nsmap) - # Also treat the namespace mapping as a set of attributes on the - # tag, so we can recreate it later. - attrs = attrs.copy() - for prefix, namespace in nsmap.items(): - attribute = NamespacedAttribute( - "xmlns", prefix, "http://www.w3.org/2000/xmlns/") - attrs[attribute] = namespace - - # Namespaces are in play. Find any attributes that came in - # from lxml with namespaces attached to their names, and - # turn then into NamespacedAttribute objects. 
- new_attrs = {} - for attr, value in attrs.items(): - namespace, attr = self._getNsTag(attr) - if namespace is None: - new_attrs[attr] = value - else: - nsprefix = self._prefix_for_namespace(namespace) - attr = NamespacedAttribute(nsprefix, attr, namespace) - new_attrs[attr] = value - attrs = new_attrs - - namespace, name = self._getNsTag(name) - nsprefix = self._prefix_for_namespace(namespace) - self.soup.handle_starttag(name, namespace, nsprefix, attrs) - - def _prefix_for_namespace(self, namespace): - """Find the currently active prefix for the given namespace.""" - if namespace is None: - return None - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - return inverted_nsmap[namespace] - return None - - def end(self, name): - self.soup.endData() - completed_tag = self.soup.tagStack[-1] - namespace, name = self._getNsTag(name) - nsprefix = None - if namespace is not None: - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - nsprefix = inverted_nsmap[namespace] - break - self.soup.handle_endtag(name, nsprefix) - if len(self.nsmaps) > 1: - # This tag, or one of its parents, introduced a namespace - # mapping, so pop it off the stack. - self.nsmaps.pop() - - def pi(self, target, data): - pass - - def data(self, content): - self.soup.handle_data(content) - - def doctype(self, name, pubid, system): - self.soup.endData() - doctype = Doctype.for_name_and_ids(name, pubid, system) - self.soup.object_was_parsed(doctype) - - def comment(self, content): - "Handle comments as Comment objects." - self.soup.endData() - self.soup.handle_data(content) - self.soup.endData(Comment) - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'\n%s' % fragment - - -class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - - features = [LXML, HTML, FAST, PERMISSIVE] - is_xml = False - - @property - def default_parser(self): - return etree.HTMLParser - - def feed(self, markup): - self.parser.feed(markup) - self.parser.close() - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'%s' % fragment diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py deleted file mode 100644 index a733cad..0000000 --- a/lib/bs4/dammit.py +++ /dev/null @@ -1,827 +0,0 @@ -# -*- coding: utf-8 -*- -"""Beautiful Soup bonus library: Unicode, Dammit - -This class forces XML data into a standard format (usually to UTF-8 or -Unicode). It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's the tree builder's job. -""" - -import codecs -from htmlentitydefs import codepoint2name -import re -import logging - -# Import a library to autodetect character encodings. -chardet_type = None -try: - # First try the fast C implementation. - # PyPI package: cchardet - import cchardet - def chardet_dammit(s): - return cchardet.detect(s)['encoding'] -except ImportError: - try: - # Fall back to the pure Python implementation - # Debian package: python-chardet - # PyPI package: chardet - import chardet - def chardet_dammit(s): - return chardet.detect(s)['encoding'] - #import chardet.constants - #chardet.constants._debug = 1 - except ImportError: - # No chardet available. - def chardet_dammit(s): - return None - -# Available from http://cjkpython.i18n.org/. 
-try: - import iconv_codec -except ImportError: - pass - -xml_encoding_re = re.compile( - '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) -html_meta_re = re.compile( - '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) - -class EntitySubstitution(object): - - """Substitute XML or HTML entities for the corresponding characters.""" - - def _populate_class_variables(): - lookup = {} - reverse_lookup = {} - characters_for_re = [] - for codepoint, name in list(codepoint2name.items()): - character = unichr(codepoint) - if codepoint != 34: - # There's no point in turning the quotation mark into - # ", unless it happens within an attribute value, which - # is handled elsewhere. - characters_for_re.append(character) - lookup[character] = name - # But we do want to turn " into the quotation mark. - reverse_lookup[name] = character - re_definition = "[%s]" % "".join(characters_for_re) - return lookup, reverse_lookup, re.compile(re_definition) - (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, - CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() - - CHARACTER_TO_XML_ENTITY = { - "'": "apos", - '"': "quot", - "&": "amp", - "<": "lt", - ">": "gt", - } - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - ")") - - AMPERSAND_OR_BRACKET = re.compile("([<>&])") - - @classmethod - def _substitute_html_entity(cls, matchobj): - entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) - return "&%s;" % entity - - @classmethod - def _substitute_xml_entity(cls, matchobj): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] - return "&%s;" % entity - - @classmethod - def quoted_attribute_value(self, value): - """Make a value into a quoted XML attribute, possibly escaping it. - - Most strings will be quoted using double quotes. - - Bob's Bar -> "Bob's Bar" - - If a string contains double quotes, it will be quoted using - single quotes. - - Welcome to "my bar" -> 'Welcome to "my bar"' - - If a string contains both single and double quotes, the - double quotes will be escaped, and the string will be quoted - using double quotes. - - Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" - """ - quote_with = '"' - if '"' in value: - if "'" in value: - # The string contains both single and double - # quotes. Turn the double quotes into - # entities. We quote the double quotes rather than - # the single quotes because the entity name is - # """ whether this is HTML or XML. If we - # quoted the single quotes, we'd have to decide - # between ' and &squot;. - replace_with = """ - value = value.replace('"', replace_with) - else: - # There are double quotes but no single quotes. - # We can use single quotes to quote the attribute. - quote_with = "'" - return quote_with + value + quote_with - - @classmethod - def substitute_xml(cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. - - :param value: A string to be substituted. The less-than sign - will become <, the greater-than sign will become >, - and any ampersands will become &. If you want ampersands - that appear to be part of an entity definition to be left - alone, use substitute_xml_containing_entities() instead. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - """ - # Escape angle brackets and ampersands. 
- value = cls.AMPERSAND_OR_BRACKET.sub( - cls._substitute_xml_entity, value) - - if make_quoted_attribute: - value = cls.quoted_attribute_value(value) - return value - - @classmethod - def substitute_xml_containing_entities( - cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. - - :param value: A string to be substituted. The less-than sign will - become <, the greater-than sign will become >, and any - ampersands that are not part of an entity defition will - become &. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - """ - # Escape angle brackets, and ampersands that aren't part of - # entities. - value = cls.BARE_AMPERSAND_OR_BRACKET.sub( - cls._substitute_xml_entity, value) - - if make_quoted_attribute: - value = cls.quoted_attribute_value(value) - return value - - - @classmethod - def substitute_html(cls, s): - """Replace certain Unicode characters with named HTML entities. - - This differs from data.encode(encoding, 'xmlcharrefreplace') - in that the goal is to make the result more readable (to those - with ASCII displays) rather than to recover from - errors. There's absolutely nothing wrong with a UTF-8 string - containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that - character with "é" will make it more readable to some - people. - """ - return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( - cls._substitute_html_entity, s) - - -class UnicodeDammit: - """A class for detecting the encoding of a *ML document and - converting it to a Unicode string. If the source encoding is - windows-1252, can replace MS smart quotes with their HTML or XML - equivalents.""" - - # This dictionary maps commonly seen values for "charset" in HTML - # meta tags to the corresponding Python codec names. It only covers - # values that aren't in Python's aliases and can't be determined - # by the heuristics in find_codec. - CHARSET_ALIASES = {"macintosh": "mac-roman", - "x-sjis": "shift-jis"} - - ENCODINGS_WITH_SMART_QUOTES = [ - "windows-1252", - "iso-8859-1", - "iso-8859-2", - ] - - def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False): - self.declared_html_encoding = None - self.smart_quotes_to = smart_quotes_to - self.tried_encodings = [] - self.contains_replacement_characters = False - - if markup == '' or isinstance(markup, unicode): - self.markup = markup - self.unicode_markup = unicode(markup) - self.original_encoding = None - return - - new_markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, is_html) - self.markup = new_markup - - u = None - if new_markup != markup: - # _detectEncoding modified the markup, then converted it to - # Unicode and then to UTF-8. So convert it from UTF-8. - u = self._convert_from("utf8") - self.original_encoding = sniffed_encoding - - if not u: - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break - - # If no luck and we have auto-detection library, try that: - if not u and not isinstance(self.markup, unicode): - u = self._convert_from(chardet_dammit(self.markup)) - - # As a last resort, try utf-8 and windows-1252: - if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convert_from(proposed_encoding) - if u: - break - - # As an absolute last resort, try the encodings again with - # character replacement. 
- if not u: - for proposed_encoding in ( - override_encodings + [ - document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): - if proposed_encoding != "ascii": - u = self._convert_from(proposed_encoding, "replace") - if u is not None: - logging.warning( - "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER.") - self.contains_replacement_characters = True - break - - # We could at this point force it to ASCII, but that would - # destroy so much data that I think giving up is better - self.unicode_markup = u - if not u: - self.original_encoding = None - - def _sub_ms_char(self, match): - """Changes a MS smart quote character to an XML or HTML - entity, or an ASCII character.""" - orig = match.group(1) - if self.smart_quotes_to == 'ascii': - sub = self.MS_CHARS_TO_ASCII.get(orig).encode() - else: - sub = self.MS_CHARS.get(orig) - if type(sub) == tuple: - if self.smart_quotes_to == 'xml': - sub = '&#x'.encode() + sub[1].encode() + ';'.encode() - else: - sub = '&'.encode() + sub[0].encode() + ';'.encode() - else: - sub = sub.encode() - return sub - - def _convert_from(self, proposed, errors="strict"): - proposed = self.find_codec(proposed) - if not proposed or (proposed, errors) in self.tried_encodings: - return None - self.tried_encodings.append((proposed, errors)) - markup = self.markup - # Convert smart quotes to HTML if coming from an encoding - # that might have them. - if (self.smart_quotes_to is not None - and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): - smart_quotes_re = b"([\x80-\x9f])" - smart_quotes_compiled = re.compile(smart_quotes_re) - markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) - - try: - #print "Trying to convert document to %s (errors=%s)" % ( - # proposed, errors) - u = self._to_unicode(markup, proposed, errors) - self.markup = u - self.original_encoding = proposed - except Exception as e: - #print "That didn't work!" - #print e - return None - #print "Correct encoding: %s" % proposed - return self.markup - - def _to_unicode(self, data, encoding, errors="strict"): - '''Given a string and its encoding, decodes the string into Unicode. 
- %encoding is a string recognized by encodings.aliases''' - - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding, errors) - return newdata - - def _detectEncoding(self, xml_data, is_html=False): - """Given a document, tries to detect its XML encoding.""" - xml_encoding = sniffed_xml_encoding = None - try: - if xml_data[:4] == b'\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == b'\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ - and (xml_data[2:4] != b'\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ - (xml_data[2:4] != b'\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == b'\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - sniffed_xml_encoding = 'ascii' - pass - except: - xml_encoding_match = None - xml_encoding_match = xml_encoding_re.match(xml_data) - if not xml_encoding_match and is_html: - xml_encoding_match = html_meta_re.search(xml_data) - if xml_encoding_match is not None: - xml_encoding = xml_encoding_match.groups()[0].decode( - 'ascii').lower() - if is_html: - self.declared_html_encoding = xml_encoding - if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - return xml_data, xml_encoding, sniffed_xml_encoding - - def find_codec(self, charset): - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ - or charset - - def _codec(self, charset): - if not charset: - return charset - codec = None - try: - codecs.lookup(charset) - codec = charset - except (LookupError, ValueError): - 
pass - return codec - - EBCDIC_TO_ASCII_MAP = None - - def _ebcdic_to_ascii(self, s): - c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( - ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) - return s.translate(c.EBCDIC_TO_ASCII_MAP) - - # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. - MS_CHARS = {b'\x80': ('euro', '20AC'), - b'\x81': ' ', - b'\x82': ('sbquo', '201A'), - b'\x83': ('fnof', '192'), - b'\x84': ('bdquo', '201E'), - b'\x85': ('hellip', '2026'), - b'\x86': ('dagger', '2020'), - b'\x87': ('Dagger', '2021'), - b'\x88': ('circ', '2C6'), - b'\x89': ('permil', '2030'), - b'\x8A': ('Scaron', '160'), - b'\x8B': ('lsaquo', '2039'), - b'\x8C': ('OElig', '152'), - b'\x8D': '?', - b'\x8E': ('#x17D', '17D'), - b'\x8F': '?', - b'\x90': '?', - b'\x91': ('lsquo', '2018'), - b'\x92': ('rsquo', '2019'), - b'\x93': ('ldquo', '201C'), - b'\x94': ('rdquo', '201D'), - b'\x95': ('bull', '2022'), - b'\x96': ('ndash', '2013'), - b'\x97': ('mdash', '2014'), - b'\x98': ('tilde', '2DC'), - b'\x99': ('trade', '2122'), - b'\x9a': ('scaron', '161'), - b'\x9b': ('rsaquo', '203A'), - b'\x9c': ('oelig', '153'), - b'\x9d': '?', - b'\x9e': ('#x17E', '17E'), - b'\x9f': ('Yuml', ''),} - - # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains - # horrors like stripping diacritical marks to turn á into a, but also - # contains non-horrors like turning “ into ". - MS_CHARS_TO_ASCII = { - b'\x80' : 'EUR', - b'\x81' : ' ', - b'\x82' : ',', - b'\x83' : 'f', - b'\x84' : ',,', - b'\x85' : '...', - b'\x86' : '+', - b'\x87' : '++', - b'\x88' : '^', - b'\x89' : '%', - b'\x8a' : 'S', - b'\x8b' : '<', - b'\x8c' : 'OE', - b'\x8d' : '?', - b'\x8e' : 'Z', - b'\x8f' : '?', - b'\x90' : '?', - b'\x91' : "'", - b'\x92' : "'", - b'\x93' : '"', - b'\x94' : '"', - b'\x95' : '*', - b'\x96' : '-', - b'\x97' : '--', - b'\x98' : '~', - b'\x99' : '(TM)', - b'\x9a' : 's', - b'\x9b' : '>', - b'\x9c' : 'oe', - b'\x9d' : '?', - b'\x9e' : 'z', - b'\x9f' : 'Y', - b'\xa0' : ' ', - b'\xa1' : '!', - b'\xa2' : 'c', - b'\xa3' : 'GBP', - b'\xa4' : '$', #This approximation is especially parochial--this is the - #generic currency symbol. 
- b'\xa5' : 'YEN', - b'\xa6' : '|', - b'\xa7' : 'S', - b'\xa8' : '..', - b'\xa9' : '', - b'\xaa' : '(th)', - b'\xab' : '<<', - b'\xac' : '!', - b'\xad' : ' ', - b'\xae' : '(R)', - b'\xaf' : '-', - b'\xb0' : 'o', - b'\xb1' : '+-', - b'\xb2' : '2', - b'\xb3' : '3', - b'\xb4' : ("'", 'acute'), - b'\xb5' : 'u', - b'\xb6' : 'P', - b'\xb7' : '*', - b'\xb8' : ',', - b'\xb9' : '1', - b'\xba' : '(th)', - b'\xbb' : '>>', - b'\xbc' : '1/4', - b'\xbd' : '1/2', - b'\xbe' : '3/4', - b'\xbf' : '?', - b'\xc0' : 'A', - b'\xc1' : 'A', - b'\xc2' : 'A', - b'\xc3' : 'A', - b'\xc4' : 'A', - b'\xc5' : 'A', - b'\xc6' : 'AE', - b'\xc7' : 'C', - b'\xc8' : 'E', - b'\xc9' : 'E', - b'\xca' : 'E', - b'\xcb' : 'E', - b'\xcc' : 'I', - b'\xcd' : 'I', - b'\xce' : 'I', - b'\xcf' : 'I', - b'\xd0' : 'D', - b'\xd1' : 'N', - b'\xd2' : 'O', - b'\xd3' : 'O', - b'\xd4' : 'O', - b'\xd5' : 'O', - b'\xd6' : 'O', - b'\xd7' : '*', - b'\xd8' : 'O', - b'\xd9' : 'U', - b'\xda' : 'U', - b'\xdb' : 'U', - b'\xdc' : 'U', - b'\xdd' : 'Y', - b'\xde' : 'b', - b'\xdf' : 'B', - b'\xe0' : 'a', - b'\xe1' : 'a', - b'\xe2' : 'a', - b'\xe3' : 'a', - b'\xe4' : 'a', - b'\xe5' : 'a', - b'\xe6' : 'ae', - b'\xe7' : 'c', - b'\xe8' : 'e', - b'\xe9' : 'e', - b'\xea' : 'e', - b'\xeb' : 'e', - b'\xec' : 'i', - b'\xed' : 'i', - b'\xee' : 'i', - b'\xef' : 'i', - b'\xf0' : 'o', - b'\xf1' : 'n', - b'\xf2' : 'o', - b'\xf3' : 'o', - b'\xf4' : 'o', - b'\xf5' : 'o', - b'\xf6' : 'o', - b'\xf7' : '/', - b'\xf8' : 'o', - b'\xf9' : 'u', - b'\xfa' : 'u', - b'\xfb' : 'u', - b'\xfc' : 'u', - b'\xfd' : 'y', - b'\xfe' : 'b', - b'\xff' : 'y', - } - - # A map used when removing rogue Windows-1252/ISO-8859-1 - # characters in otherwise UTF-8 documents. - # - # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in - # Windows-1252. - WINDOWS_1252_TO_UTF8 = { - 0x80 : b'\xe2\x82\xac', # € - 0x82 : b'\xe2\x80\x9a', # ‚ - 0x83 : b'\xc6\x92', # Æ’ - 0x84 : b'\xe2\x80\x9e', # „ - 0x85 : b'\xe2\x80\xa6', # … - 0x86 : b'\xe2\x80\xa0', # † - 0x87 : b'\xe2\x80\xa1', # ‡ - 0x88 : b'\xcb\x86', # ˆ - 0x89 : b'\xe2\x80\xb0', # ‰ - 0x8a : b'\xc5\xa0', # Å  - 0x8b : b'\xe2\x80\xb9', # ‹ - 0x8c : b'\xc5\x92', # Å’ - 0x8e : b'\xc5\xbd', # Ž - 0x91 : b'\xe2\x80\x98', # ‘ - 0x92 : b'\xe2\x80\x99', # ’ - 0x93 : b'\xe2\x80\x9c', # “ - 0x94 : b'\xe2\x80\x9d', # †- 0x95 : b'\xe2\x80\xa2', # • - 0x96 : b'\xe2\x80\x93', # – - 0x97 : b'\xe2\x80\x94', # — - 0x98 : b'\xcb\x9c', # Ëœ - 0x99 : b'\xe2\x84\xa2', # â„¢ - 0x9a : b'\xc5\xa1', # Å¡ - 0x9b : b'\xe2\x80\xba', # › - 0x9c : b'\xc5\x93', # Å“ - 0x9e : b'\xc5\xbe', # ž - 0x9f : b'\xc5\xb8', # Ÿ - 0xa0 : b'\xc2\xa0', #   - 0xa1 : b'\xc2\xa1', # ¡ - 0xa2 : b'\xc2\xa2', # ¢ - 0xa3 : b'\xc2\xa3', # £ - 0xa4 : b'\xc2\xa4', # ¤ - 0xa5 : b'\xc2\xa5', # Â¥ - 0xa6 : b'\xc2\xa6', # ¦ - 0xa7 : b'\xc2\xa7', # § - 0xa8 : b'\xc2\xa8', # ¨ - 0xa9 : b'\xc2\xa9', # © - 0xaa : b'\xc2\xaa', # ª - 0xab : b'\xc2\xab', # « - 0xac : b'\xc2\xac', # ¬ - 0xad : b'\xc2\xad', # ­ - 0xae : b'\xc2\xae', # ® - 0xaf : b'\xc2\xaf', # ¯ - 0xb0 : b'\xc2\xb0', # ° - 0xb1 : b'\xc2\xb1', # ± - 0xb2 : b'\xc2\xb2', # ² - 0xb3 : b'\xc2\xb3', # ³ - 0xb4 : b'\xc2\xb4', # ´ - 0xb5 : b'\xc2\xb5', # µ - 0xb6 : b'\xc2\xb6', # ¶ - 0xb7 : b'\xc2\xb7', # · - 0xb8 : b'\xc2\xb8', # ¸ - 0xb9 : b'\xc2\xb9', # ¹ - 0xba : b'\xc2\xba', # º - 0xbb : b'\xc2\xbb', # » - 0xbc : b'\xc2\xbc', # ¼ - 0xbd : b'\xc2\xbd', # ½ - 0xbe : b'\xc2\xbe', # ¾ - 0xbf : b'\xc2\xbf', # ¿ - 0xc0 : b'\xc3\x80', # À - 0xc1 : b'\xc3\x81', # à - 0xc2 : b'\xc3\x82', #  - 0xc3 : b'\xc3\x83', # à - 0xc4 : b'\xc3\x84', # Ä - 0xc5 : 
b'\xc3\x85', # Ã… - 0xc6 : b'\xc3\x86', # Æ - 0xc7 : b'\xc3\x87', # Ç - 0xc8 : b'\xc3\x88', # È - 0xc9 : b'\xc3\x89', # É - 0xca : b'\xc3\x8a', # Ê - 0xcb : b'\xc3\x8b', # Ë - 0xcc : b'\xc3\x8c', # ÃŒ - 0xcd : b'\xc3\x8d', # à - 0xce : b'\xc3\x8e', # ÃŽ - 0xcf : b'\xc3\x8f', # à - 0xd0 : b'\xc3\x90', # à - 0xd1 : b'\xc3\x91', # Ñ - 0xd2 : b'\xc3\x92', # Ã’ - 0xd3 : b'\xc3\x93', # Ó - 0xd4 : b'\xc3\x94', # Ô - 0xd5 : b'\xc3\x95', # Õ - 0xd6 : b'\xc3\x96', # Ö - 0xd7 : b'\xc3\x97', # × - 0xd8 : b'\xc3\x98', # Ø - 0xd9 : b'\xc3\x99', # Ù - 0xda : b'\xc3\x9a', # Ú - 0xdb : b'\xc3\x9b', # Û - 0xdc : b'\xc3\x9c', # Ü - 0xdd : b'\xc3\x9d', # à - 0xde : b'\xc3\x9e', # Þ - 0xdf : b'\xc3\x9f', # ß - 0xe0 : b'\xc3\xa0', # à - 0xe1 : b'\xa1', # á - 0xe2 : b'\xc3\xa2', # â - 0xe3 : b'\xc3\xa3', # ã - 0xe4 : b'\xc3\xa4', # ä - 0xe5 : b'\xc3\xa5', # Ã¥ - 0xe6 : b'\xc3\xa6', # æ - 0xe7 : b'\xc3\xa7', # ç - 0xe8 : b'\xc3\xa8', # è - 0xe9 : b'\xc3\xa9', # é - 0xea : b'\xc3\xaa', # ê - 0xeb : b'\xc3\xab', # ë - 0xec : b'\xc3\xac', # ì - 0xed : b'\xc3\xad', # í - 0xee : b'\xc3\xae', # î - 0xef : b'\xc3\xaf', # ï - 0xf0 : b'\xc3\xb0', # ð - 0xf1 : b'\xc3\xb1', # ñ - 0xf2 : b'\xc3\xb2', # ò - 0xf3 : b'\xc3\xb3', # ó - 0xf4 : b'\xc3\xb4', # ô - 0xf5 : b'\xc3\xb5', # õ - 0xf6 : b'\xc3\xb6', # ö - 0xf7 : b'\xc3\xb7', # ÷ - 0xf8 : b'\xc3\xb8', # ø - 0xf9 : b'\xc3\xb9', # ù - 0xfa : b'\xc3\xba', # ú - 0xfb : b'\xc3\xbb', # û - 0xfc : b'\xc3\xbc', # ü - 0xfd : b'\xc3\xbd', # ý - 0xfe : b'\xc3\xbe', # þ - } - - MULTIBYTE_MARKERS_AND_SIZES = [ - (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF - (0xe0, 0xef, 3), # 3-byte characters start with E0-EF - (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 - ] - - FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] - LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] - - @classmethod - def detwingle(cls, in_bytes, main_encoding="utf8", - embedded_encoding="windows-1252"): - """Fix characters from one encoding embedded in some other encoding. - - Currently the only situation supported is Windows-1252 (or its - subset ISO-8859-1), embedded in UTF-8. - - The input must be a bytestring. If you've already converted - the document to Unicode, you're too late. - - The output is a bytestring in which `embedded_encoding` - characters have been converted to their `main_encoding` - equivalents. - """ - if embedded_encoding.replace('_', '-').lower() not in ( - 'windows-1252', 'windows_1252'): - raise NotImplementedError( - "Windows-1252 and ISO-8859-1 are the only currently supported " - "embedded encodings.") - - if main_encoding.lower() not in ('utf8', 'utf-8'): - raise NotImplementedError( - "UTF-8 is the only currently supported main encoding.") - - byte_chunks = [] - - chunk_start = 0 - pos = 0 - while pos < len(in_bytes): - byte = in_bytes[pos] - if not isinstance(byte, int): - # Python 2.x - byte = ord(byte) - if (byte >= cls.FIRST_MULTIBYTE_MARKER - and byte <= cls.LAST_MULTIBYTE_MARKER): - # This is the start of a UTF-8 multibyte character. Skip - # to the end. - for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: - if byte >= start and byte <= end: - pos += size - break - elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: - # We found a Windows-1252 character! - # Save the string up to this point as a chunk. - byte_chunks.append(in_bytes[chunk_start:pos]) - - # Now translate the Windows-1252 character into UTF-8 - # and add it as another, one-byte chunk. 
- byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) - pos += 1 - chunk_start = pos - else: - # Go on to the next character. - pos += 1 - if chunk_start == 0: - # The string is unchanged. - return in_bytes - else: - # Store the final chunk. - byte_chunks.append(in_bytes[chunk_start:]) - return b''.join(byte_chunks) - diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py deleted file mode 100644 index 25fda5c..0000000 --- a/lib/bs4/diagnose.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Diagnostic functions, mainly for use when doing tech support.""" -from StringIO import StringIO -from HTMLParser import HTMLParser -from bs4 import BeautifulSoup, __version__ -from bs4.builder import builder_registry -import os -import random -import time -import traceback -import sys -import cProfile - -def diagnose(data): - """Diagnostic suite for isolating common problems.""" - print "Diagnostic running on Beautiful Soup %s" % __version__ - print "Python version %s" % sys.version - - basic_parsers = ["html.parser", "html5lib", "lxml"] - for name in basic_parsers: - for builder in builder_registry.builders: - if name in builder.features: - break - else: - basic_parsers.remove(name) - print ( - "I noticed that %s is not installed. Installing it may help." % - name) - - if 'lxml' in basic_parsers: - basic_parsers.append(["lxml", "xml"]) - from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) - - if 'html5lib' in basic_parsers: - import html5lib - print "Found html5lib version %s" % html5lib.__version__ - - if hasattr(data, 'read'): - data = data.read() - elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data - data = open(data).read() - elif data.startswith("http:") or data.startswith("https:"): - print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data - print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." - return - print - - for parser in basic_parsers: - print "Trying to parse your markup with %s" % parser - success = False - try: - soup = BeautifulSoup(data, parser) - success = True - except Exception, e: - print "%s could not parse the markup." % parser - traceback.print_exc() - if success: - print "Here's what %s did with the markup:" % parser - print soup.prettify() - - print "-" * 80 - -def lxml_trace(data, html=True): - """Print out the lxml events that occur during parsing. - - This lets you see how lxml parses a document when no Beautiful - Soup code is running. - """ - from lxml import etree - for event, element in etree.iterparse(StringIO(data), html=html): - print("%s, %4s, %s" % (event, element.tag, element.text)) - -class AnnouncingParser(HTMLParser): - """Announces HTMLParser parse events, without doing anything else.""" - - def _p(self, s): - print(s) - - def handle_starttag(self, name, attrs): - self._p("%s START" % name) - - def handle_endtag(self, name): - self._p("%s END" % name) - - def handle_data(self, data): - self._p("%s DATA" % data) - - def handle_charref(self, name): - self._p("%s CHARREF" % name) - - def handle_entityref(self, name): - self._p("%s ENTITYREF" % name) - - def handle_comment(self, data): - self._p("%s COMMENT" % data) - - def handle_decl(self, data): - self._p("%s DECL" % data) - - def unknown_decl(self, data): - self._p("%s UNKNOWN-DECL" % data) - - def handle_pi(self, data): - self._p("%s PI" % data) - -def htmlparser_trace(data): - """Print out the HTMLParser events that occur during parsing. 
- - This lets you see how HTMLParser parses a document when no - Beautiful Soup code is running. - """ - parser = AnnouncingParser() - parser.feed(data) - -_vowels = "aeiou" -_consonants = "bcdfghjklmnpqrstvwxyz" - -def rword(length=5): - "Generate a random word-like string." - s = '' - for i in range(length): - if i % 2 == 0: - t = _consonants - else: - t = _vowels - s += random.choice(t) - return s - -def rsentence(length=4): - "Generate a random sentence-like string." - return " ".join(rword(random.randint(4,9)) for i in range(length)) - -def rdoc(num_elements=1000): - """Randomly generate an invalid HTML document.""" - tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] - elements = [] - for i in range(num_elements): - choice = random.randint(0,3) - if choice == 0: - # New tag. - tag_name = random.choice(tag_names) - elements.append("<%s>" % tag_name) - elif choice == 1: - elements.append(rsentence(random.randint(1,4))) - elif choice == 2: - # Close a tag. - tag_name = random.choice(tag_names) - elements.append("" % tag_name) - return "" + "\n".join(elements) + "" - -def benchmark_parsers(num_elements=100000): - """Very basic head-to-head performance benchmark.""" - print "Comparative parser benchmark on Beautiful Soup %s" % __version__ - data = rdoc(num_elements) - print "Generated a large invalid HTML document (%d bytes)." % len(data) - - for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: - success = False - try: - a = time.time() - soup = BeautifulSoup(data, parser) - b = time.time() - success = True - except Exception, e: - print "%s could not parse the markup." % parser - traceback.print_exc() - if success: - print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) - - from lxml import etree - a = time.time() - etree.HTML(data) - b = time.time() - print "Raw lxml parsed the markup in %.2fs." % (b-a) - -if __name__ == '__main__': - diagnose(sys.stdin.read()) diff --git a/lib/bs4/element.py b/lib/bs4/element.py deleted file mode 100644 index f6864f2..0000000 --- a/lib/bs4/element.py +++ /dev/null @@ -1,1598 +0,0 @@ -import collections -import re -import sys -import warnings -from bs4.dammit import EntitySubstitution - -DEFAULT_OUTPUT_ENCODING = "utf-8" -PY3K = (sys.version_info[0] > 2) - -whitespace_re = re.compile("\s+") - -def _alias(attr): - """Alias one attribute name to another for backward compatibility""" - @property - def alias(self): - return getattr(self, attr) - - @alias.setter - def alias(self): - return setattr(self, attr) - return alias - - -class NamespacedAttribute(unicode): - - def __new__(cls, prefix, name, namespace=None): - if name is None: - obj = unicode.__new__(cls, prefix) - elif prefix is None: - # Not really namespaced. - obj = unicode.__new__(cls, name) - else: - obj = unicode.__new__(cls, prefix + ":" + name) - obj.prefix = prefix - obj.name = name - obj.namespace = namespace - return obj - -class AttributeValueWithCharsetSubstitution(unicode): - """A stand-in object for a character encoding specified in HTML.""" - -class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): - """A generic stand-in for the value of a meta tag's 'charset' attribute. - - When Beautiful Soup parses the markup '', the - value of the 'charset' attribute will be one of these objects. 
- """ - - def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) - obj.original_value = original_value - return obj - - def encode(self, encoding): - return encoding - - -class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): - """A generic stand-in for the value of a meta tag's 'content' attribute. - - When Beautiful Soup parses the markup: - - - The value of the 'content' attribute will be one of these objects. - """ - - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - - def __new__(cls, original_value): - match = cls.CHARSET_RE.search(original_value) - if match is None: - # No substitution necessary. - return unicode.__new__(unicode, original_value) - - obj = unicode.__new__(cls, original_value) - obj.original_value = original_value - return obj - - def encode(self, encoding): - def rewrite(match): - return match.group(1) + encoding - return self.CHARSET_RE.sub(rewrite, self.original_value) - -class HTMLAwareEntitySubstitution(EntitySubstitution): - - """Entity substitution rules that are aware of some HTML quirks. - - Specifically, the contents of -""" - soup = BeautifulSoup(doc, "xml") - # lxml would have stripped this while parsing, but we can add - # it later. - soup.script.string = 'console.log("< < hey > > ");' - encoded = soup.encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_popping_namespaced_tag(self): - markup = 'b2012-07-02T20:33:42Zcd' - soup = self.soup(markup) - self.assertEqual( - unicode(soup.rss), markup) - - def test_docstring_includes_correct_encoding(self): - soup = self.soup("") - self.assertEqual( - soup.encode("latin1"), - b'\n') - - def test_large_xml_document(self): - """A large XML document should come out the same as it went in.""" - markup = (b'\n' - + b'0' * (2**12) - + b'') - soup = self.soup(markup) - self.assertEqual(soup.encode("utf-8"), markup) - - - def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): - self.assertSoupEquals("
<p>", "<p/>
") - self.assertSoupEquals("

<p>foo</p>
") - - def test_namespaces_are_preserved(self): - markup = 'This tag is in the a namespaceThis tag is in the b namespace' - soup = self.soup(markup) - root = soup.root - self.assertEqual("http://example.com/", root['xmlns:a']) - self.assertEqual("http://example.net/", root['xmlns:b']) - - def test_closing_namespaced_tag(self): - markup = '

<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>
' - soup = self.soup(markup) - self.assertEqual(unicode(soup.p), markup) - - def test_namespaced_attributes(self): - markup = '' - soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) - - def test_namespaced_attributes_xml_namespace(self): - markup = 'bar' - soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) - -class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): - """Smoke test for a tree builder that supports HTML5.""" - - def test_real_xhtml_document(self): - # Since XHTML is not HTML5, HTML5 parsers are not tested to handle - # XHTML documents in any particular way. - pass - - def test_html_tags_have_namespace(self): - markup = "" - soup = self.soup(markup) - self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) - - def test_svg_tags_have_namespace(self): - markup = '' - soup = self.soup(markup) - namespace = "http://www.w3.org/2000/svg" - self.assertEqual(namespace, soup.svg.namespace) - self.assertEqual(namespace, soup.circle.namespace) - - - def test_mathml_tags_have_namespace(self): - markup = '5' - soup = self.soup(markup) - namespace = 'http://www.w3.org/1998/Math/MathML' - self.assertEqual(namespace, soup.math.namespace) - self.assertEqual(namespace, soup.msqrt.namespace) - - def test_xml_declaration_becomes_comment(self): - markup = '' - soup = self.soup(markup) - self.assertTrue(isinstance(soup.contents[0], Comment)) - self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') - self.assertEqual("html", soup.contents[0].next_element.name) - -def skipIf(condition, reason): - def nothing(test, *args, **kwargs): - return None - - def decorator(test_item): - if condition: - return nothing - else: - return test_item - - return decorator diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py deleted file mode 100644 index 142c8cc..0000000 --- a/lib/bs4/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"The beautifulsoup tests." 
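With the vendored copy of Beautiful Soup removed from `lib/`, the library presumably has to come from the Python environment instead (for example, a pip-installed `beautifulsoup4` package picked up from site-packages); nothing in this diff pins the exact replacement, so the package name and parser choice below are assumptions. A minimal sketch of what a plugin-side import and parse could look like under that assumption:

```python
# Minimal sketch, assuming Beautiful Soup 4 is installed from PyPI
# (pip install beautifulsoup4) now that the copy under lib/bs4 is gone.
from bs4 import BeautifulSoup

# Parse a small HTML fragment with the stdlib parser; "lxml" or "html5lib"
# can be passed instead when those optional parsers are installed.
soup = BeautifulSoup("<p>Hello from <b>CloudBot</b></p>", "html.parser")
print(soup.p.get_text())  # prints: Hello from CloudBot
```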
diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py deleted file mode 100644 index 92ad10f..0000000 --- a/lib/bs4/tests/test_builder_registry.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Tests of the builder registry.""" - -import unittest - -from bs4 import BeautifulSoup -from bs4.builder import ( - builder_registry as registry, - HTMLParserTreeBuilder, - TreeBuilderRegistry, -) - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError: - HTML5LIB_PRESENT = False - -try: - from bs4.builder import ( - LXMLTreeBuilderForXML, - LXMLTreeBuilder, - ) - LXML_PRESENT = True -except ImportError: - LXML_PRESENT = False - - -class BuiltInRegistryTest(unittest.TestCase): - """Test the built-in registry with the default builders registered.""" - - def test_combination(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('fast', 'html'), - LXMLTreeBuilder) - - if LXML_PRESENT: - self.assertEqual(registry.lookup('permissive', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('strict', 'html'), - HTMLParserTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib', 'html'), - HTML5TreeBuilder) - - def test_lookup_by_markup_type(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) - self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) - else: - self.assertEqual(registry.lookup('xml'), None) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) - else: - self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) - - def test_named_library(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('lxml', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('lxml', 'html'), - LXMLTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib'), - HTML5TreeBuilder) - - self.assertEqual(registry.lookup('html.parser'), - HTMLParserTreeBuilder) - - def test_beautifulsoup_constructor_does_lookup(self): - # You can pass in a string. - BeautifulSoup("", features="html") - # Or a list of strings. - BeautifulSoup("", features=["html", "fast"]) - - # You'll get an exception if BS can't find an appropriate - # builder. - self.assertRaises(ValueError, BeautifulSoup, - "", features="no-such-feature") - -class RegistryTest(unittest.TestCase): - """Test the TreeBuilderRegistry class in general.""" - - def setUp(self): - self.registry = TreeBuilderRegistry() - - def builder_for_features(self, *feature_list): - cls = type('Builder_' + '_'.join(feature_list), - (object,), {'features' : feature_list}) - - self.registry.register(cls) - return cls - - def test_register_with_no_features(self): - builder = self.builder_for_features() - - # Since the builder advertises no features, you can't find it - # by looking up features. - self.assertEqual(self.registry.lookup('foo'), None) - - # But you can find it by doing a lookup with no features, if - # this happens to be the only registered builder. 
- self.assertEqual(self.registry.lookup(), builder) - - def test_register_with_features_makes_lookup_succeed(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('foo'), builder) - self.assertEqual(self.registry.lookup('bar'), builder) - - def test_lookup_fails_when_no_builder_implements_feature(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('baz'), None) - - def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): - builder1 = self.builder_for_features('foo') - builder2 = self.builder_for_features('bar') - self.assertEqual(self.registry.lookup(), builder2) - - def test_lookup_fails_when_no_tree_builders_registered(self): - self.assertEqual(self.registry.lookup(), None) - - def test_lookup_gets_most_recent_builder_supporting_all_features(self): - has_one = self.builder_for_features('foo') - has_the_other = self.builder_for_features('bar') - has_both_early = self.builder_for_features('foo', 'bar', 'baz') - has_both_late = self.builder_for_features('foo', 'bar', 'quux') - lacks_one = self.builder_for_features('bar') - has_the_other = self.builder_for_features('foo') - - # There are two builders featuring 'foo' and 'bar', but - # the one that also features 'quux' was registered later. - self.assertEqual(self.registry.lookup('foo', 'bar'), - has_both_late) - - # There is only one builder featuring 'foo', 'bar', and 'baz'. - self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), - has_both_early) - - def test_lookup_fails_when_cannot_reconcile_requested_features(self): - builder1 = self.builder_for_features('foo', 'bar') - builder2 = self.builder_for_features('foo', 'baz') - self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/lib/bs4/tests/test_docs.py b/lib/bs4/tests/test_docs.py deleted file mode 100644 index 5b9f677..0000000 --- a/lib/bs4/tests/test_docs.py +++ /dev/null @@ -1,36 +0,0 @@ -"Test harness for doctests." 
- -# pylint: disable-msg=E0611,W0142 - -__metaclass__ = type -__all__ = [ - 'additional_tests', - ] - -import atexit -import doctest -import os -#from pkg_resources import ( -# resource_filename, resource_exists, resource_listdir, cleanup_resources) -import unittest - -DOCTEST_FLAGS = ( - doctest.ELLIPSIS | - doctest.NORMALIZE_WHITESPACE | - doctest.REPORT_NDIFF) - - -# def additional_tests(): -# "Run the doc tests (README.txt and docs/*, if any exist)" -# doctest_files = [ -# os.path.abspath(resource_filename('bs4', 'README.txt'))] -# if resource_exists('bs4', 'docs'): -# for name in resource_listdir('bs4', 'docs'): -# if name.endswith('.txt'): -# doctest_files.append( -# os.path.abspath( -# resource_filename('bs4', 'docs/%s' % name))) -# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) -# atexit.register(cleanup_resources) -# return unittest.TestSuite(( -# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py deleted file mode 100644 index 2a3b41e..0000000 --- a/lib/bs4/tests/test_html5lib.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Tests to ensure that the html5lib tree builder generates good trees.""" - -import warnings - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError, e: - HTML5LIB_PRESENT = False -from bs4.element import SoupStrainer -from bs4.testing import ( - HTML5TreeBuilderSmokeTest, - SoupTest, - skipIf, -) - -@skipIf( - not HTML5LIB_PRESENT, - "html5lib seems not to be present, not testing its tree builder.") -class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): - """See ``HTML5TreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return HTML5TreeBuilder() - - def test_soupstrainer(self): - # The html5lib tree builder does not support SoupStrainers. - strainer = SoupStrainer("b") - markup = "

<p>A <b>bold</b> statement.</p>
" - with warnings.catch_warnings(record=True) as w: - soup = self.soup(markup, parse_only=strainer) - self.assertEqual( - soup.decode(), self.document_for(markup)) - - self.assertTrue( - "the html5lib tree builder doesn't support parse_only" in - str(w[0].message)) - - def test_correctly_nested_tables(self): - """html5lib inserts tags where other parsers don't.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_xml_declaration_followed_by_doctype(self): - markup = ''' - - - - - -

<p>foo</p>
- -''' - soup = self.soup(markup) - # Verify that we can reach the

<p> tag; this means the tree is connected. - self.assertEqual(b"

<p>foo</p>
", soup.p.encode()) diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py deleted file mode 100644 index bcb5ed2..0000000 --- a/lib/bs4/tests/test_htmlparser.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Tests to ensure that the html.parser tree builder generates good -trees.""" - -from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest -from bs4.builder import HTMLParserTreeBuilder - -class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - - @property - def default_builder(self): - return HTMLParserTreeBuilder() - - def test_namespaced_system_doctype(self): - # html.parser can't handle namespaced doctypes, so skip this one. - pass - - def test_namespaced_public_doctype(self): - # html.parser can't handle namespaced doctypes, so skip this one. - pass diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py deleted file mode 100644 index 80458de..0000000 --- a/lib/bs4/tests/test_lxml.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Tests to ensure that the lxml tree builder generates good trees.""" - -import re -import warnings - -try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True - import lxml.etree - LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError, e: - LXML_PRESENT = False - LXML_VERSION = (0,) - -from bs4 import ( - BeautifulSoup, - BeautifulStoneSoup, - ) -from bs4.element import Comment, Doctype, SoupStrainer -from bs4.testing import skipIf -from bs4.tests import test_htmlparser -from bs4.testing import ( - HTMLTreeBuilderSmokeTest, - XMLTreeBuilderSmokeTest, - SoupTest, - skipIf, -) - -@skipIf( - not LXML_PRESENT, - "lxml seems not to be present, not testing its tree builder.") -class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - """See ``HTMLTreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return LXMLTreeBuilder() - - def test_out_of_range_entity(self): - self.assertSoupEquals( - "

<p>foo&#10000000000000;bar</p>
", "

<p>foobar</p>
") - self.assertSoupEquals( - "

<p>foo&#x10000000000000;bar</p>
", "

<p>foobar</p>
") - self.assertSoupEquals( - "

<p>foo&#1000000000;bar</p>
", "

<p>foobar</p>
") - - # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this - # test if an old version of lxml is installed. - - @skipIf( - not LXML_PRESENT or LXML_VERSION < (2,3,5,0), - "Skipping doctype test for old version of lxml to avoid segfault.") - def test_empty_doctype(self): - soup = self.soup("") - doctype = soup.contents[0] - self.assertEqual("", doctype.strip()) - - def test_beautifulstonesoup_is_xml_parser(self): - # Make sure that the deprecated BSS class uses an xml builder - # if one is installed. - with warnings.catch_warnings(record=False) as w: - soup = BeautifulStoneSoup("") - self.assertEqual(u"", unicode(soup.b)) - - def test_real_xhtml_document(self): - """lxml strips the XML definition from an XHTML doc, which is fine.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8").replace(b"\n", b''), - markup.replace(b'\n', b'').replace( - b'', b'')) - - -@skipIf( - not LXML_PRESENT, - "lxml seems not to be present, not testing its XML tree builder.") -class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): - """See ``HTMLTreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return LXMLTreeBuilderForXML() diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py deleted file mode 100644 index b127716..0000000 --- a/lib/bs4/tests/test_soup.py +++ /dev/null @@ -1,383 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests of Beautiful Soup as a whole.""" - -import logging -import unittest -import sys -from bs4 import ( - BeautifulSoup, - BeautifulStoneSoup, -) -from bs4.element import ( - CharsetMetaAttributeValue, - ContentMetaAttributeValue, - SoupStrainer, - NamespacedAttribute, - ) -import bs4.dammit -from bs4.dammit import EntitySubstitution, UnicodeDammit -from bs4.testing import ( - SoupTest, - skipIf, -) -import warnings - -try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True -except ImportError, e: - LXML_PRESENT = False - -PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) -PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) - -class TestDeprecatedConstructorArguments(SoupTest): - - def test_parseOnlyThese_renamed_to_parse_only(self): - with warnings.catch_warnings(record=True) as w: - soup = self.soup("
", parseOnlyThese=SoupStrainer("b")) - msg = str(w[0].message) - self.assertTrue("parseOnlyThese" in msg) - self.assertTrue("parse_only" in msg) - self.assertEqual(b"", soup.encode()) - - def test_fromEncoding_renamed_to_from_encoding(self): - with warnings.catch_warnings(record=True) as w: - utf8 = b"\xc3\xa9" - soup = self.soup(utf8, fromEncoding="utf8") - msg = str(w[0].message) - self.assertTrue("fromEncoding" in msg) - self.assertTrue("from_encoding" in msg) - self.assertEqual("utf8", soup.original_encoding) - - def test_unrecognized_keyword_argument(self): - self.assertRaises( - TypeError, self.soup, "", no_such_argument=True) - - @skipIf( - not LXML_PRESENT, - "lxml not present, not testing BeautifulStoneSoup.") - def test_beautifulstonesoup(self): - with warnings.catch_warnings(record=True) as w: - soup = BeautifulStoneSoup("") - self.assertTrue(isinstance(soup, BeautifulSoup)) - self.assertTrue("BeautifulStoneSoup class is deprecated") - -class TestSelectiveParsing(SoupTest): - - def test_parse_with_soupstrainer(self): - markup = "NoYesNoYes Yes" - strainer = SoupStrainer("b") - soup = self.soup(markup, parse_only=strainer) - self.assertEqual(soup.encode(), b"YesYes Yes") - - -class TestEntitySubstitution(unittest.TestCase): - """Standalone tests of the EntitySubstitution class.""" - def setUp(self): - self.sub = EntitySubstitution - - def test_simple_html_substitution(self): - # Unicode characters corresponding to named HTML entites - # are substituted, and no others. - s = u"foo\u2200\N{SNOWMAN}\u00f5bar" - self.assertEqual(self.sub.substitute_html(s), - u"foo∀\N{SNOWMAN}õbar") - - def test_smart_quote_substitution(self): - # MS smart quotes are a common source of frustration, so we - # give them a special test. - quotes = b"\x91\x92foo\x93\x94" - dammit = UnicodeDammit(quotes) - self.assertEqual(self.sub.substitute_html(dammit.markup), - "‘’foo“”") - - def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): - s = 'Welcome to "my bar"' - self.assertEqual(self.sub.substitute_xml(s, False), s) - - def test_xml_attribute_quoting_normally_uses_double_quotes(self): - self.assertEqual(self.sub.substitute_xml("Welcome", True), - '"Welcome"') - self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), - '"Bob\'s Bar"') - - def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): - s = 'Welcome to "my bar"' - self.assertEqual(self.sub.substitute_xml(s, True), - "'Welcome to \"my bar\"'") - - def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): - s = 'Welcome to "Bob\'s Bar"' - self.assertEqual( - self.sub.substitute_xml(s, True), - '"Welcome to "Bob\'s Bar""') - - def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): - quoted = 'Welcome to "Bob\'s Bar"' - self.assertEqual(self.sub.substitute_xml(quoted), quoted) - - def test_xml_quoting_handles_angle_brackets(self): - self.assertEqual( - self.sub.substitute_xml("foo"), - "foo<bar>") - - def test_xml_quoting_handles_ampersands(self): - self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") - - def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): - self.assertEqual( - self.sub.substitute_xml("ÁT&T"), - "&Aacute;T&T") - - def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): - self.assertEqual( - self.sub.substitute_xml_containing_entities("ÁT&T"), - "ÁT&T") - - def test_quotes_not_html_substituted(self): - """There's no need to do this except 
inside attribute values.""" - text = 'Bob\'s "bar"' - self.assertEqual(self.sub.substitute_html(text), text) - - -class TestEncodingConversion(SoupTest): - # Test Beautiful Soup's ability to decode and encode from various - # encodings. - - def setUp(self): - super(TestEncodingConversion, self).setUp() - self.unicode_data = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - self.utf8_data = self.unicode_data.encode("utf-8") - # Just so you know what it looks like. - self.assertEqual( - self.utf8_data, - b'Sacr\xc3\xa9 bleu!') - - def test_ascii_in_unicode_out(self): - # ASCII input is converted to Unicode. The original_encoding - # attribute is set. - ascii = b"a" - soup_from_ascii = self.soup(ascii) - unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) - self.assertEqual(unicode_output, self.document_for(ascii.decode())) - self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii") - - def test_unicode_in_unicode_out(self): - # Unicode input is left alone. The original_encoding attribute - # is not set. - soup_from_unicode = self.soup(self.unicode_data) - self.assertEqual(soup_from_unicode.decode(), self.unicode_data) - self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') - self.assertEqual(soup_from_unicode.original_encoding, None) - - def test_utf8_in_unicode_out(self): - # UTF-8 input is converted to Unicode. The original_encoding - # attribute is set. - soup_from_utf8 = self.soup(self.utf8_data) - self.assertEqual(soup_from_utf8.decode(), self.unicode_data) - self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') - - def test_utf8_out(self): - # The internal data structures can be encoded as UTF-8. - soup_from_unicode = self.soup(self.unicode_data) - self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) - - @skipIf( - PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, - "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") - def test_attribute_name_containing_unicode_characters(self): - markup = u'
' - self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) - -class TestUnicodeDammit(unittest.TestCase): - """Standalone tests of Unicode, Dammit.""" - - def test_smart_quotes_to_unicode(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup) - self.assertEqual( - dammit.unicode_markup, u"\u2018\u2019\u201c\u201d") - - def test_smart_quotes_to_xml_entities(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="xml") - self.assertEqual( - dammit.unicode_markup, "‘’“”") - - def test_smart_quotes_to_html_entities(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="html") - self.assertEqual( - dammit.unicode_markup, "‘’“”") - - def test_smart_quotes_to_ascii(self): - markup = b"\x91\x92\x93\x94" - dammit = UnicodeDammit(markup, smart_quotes_to="ascii") - self.assertEqual( - dammit.unicode_markup, """''""""") - - def test_detect_utf8(self): - utf8 = b"\xc3\xa9" - dammit = UnicodeDammit(utf8) - self.assertEqual(dammit.unicode_markup, u'\xe9') - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - - def test_convert_hebrew(self): - hebrew = b"\xed\xe5\xec\xf9" - dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') - self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') - - def test_dont_see_smart_quotes_where_there_are_none(self): - utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" - dammit = UnicodeDammit(utf_8) - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) - - def test_ignore_inappropriate_codecs(self): - utf8_data = u"RäksmörgÃ¥s".encode("utf-8") - dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - - def test_ignore_invalid_codecs(self): - utf8_data = u"RäksmörgÃ¥s".encode("utf-8") - for bad_encoding in ['.utf8', '...', 'utF---16.!']: - dammit = UnicodeDammit(utf8_data, [bad_encoding]) - self.assertEqual(dammit.original_encoding.lower(), 'utf-8') - - def test_detect_html5_style_meta_tag(self): - - for data in ( - b'', - b"", - b"", - b""): - dammit = UnicodeDammit(data, is_html=True) - self.assertEqual( - "euc-jp", dammit.original_encoding) - - def test_last_ditch_entity_replacement(self): - # This is a UTF-8 document that contains bytestrings - # completely incompatible with UTF-8 (ie. encoded with some other - # encoding). - # - # Since there is no consistent encoding for the document, - # Unicode, Dammit will eventually encode the document as UTF-8 - # and encode the incompatible characters as REPLACEMENT - # CHARACTER. - # - # If chardet is installed, it will detect that the document - # can be converted into ISO-8859-1 without errors. This happens - # to be the wrong encoding, but it is a consistent encoding, so the - # code we're testing here won't run. - # - # So we temporarily disable chardet if it's present. 
- doc = b"""\357\273\277 -\330\250\330\252\330\261 -\310\322\321\220\312\321\355\344""" - chardet = bs4.dammit.chardet_dammit - logging.disable(logging.WARNING) - try: - def noop(str): - return None - bs4.dammit.chardet_dammit = noop - dammit = UnicodeDammit(doc) - self.assertEqual(True, dammit.contains_replacement_characters) - self.assertTrue(u"\ufffd" in dammit.unicode_markup) - - soup = BeautifulSoup(doc, "html.parser") - self.assertTrue(soup.contains_replacement_characters) - finally: - logging.disable(logging.NOTSET) - bs4.dammit.chardet_dammit = chardet - - def test_sniffed_xml_encoding(self): - # A document written in UTF-16LE will be converted by a different - # code path that sniffs the byte order markers. - data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' - dammit = UnicodeDammit(data) - self.assertEqual(u"áé", dammit.unicode_markup) - self.assertEqual("utf-16le", dammit.original_encoding) - - def test_detwingle(self): - # Here's a UTF8 document. - utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") - - # Here's a Windows-1252 document. - windows_1252 = ( - u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" - u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") - - # Through some unholy alchemy, they've been stuck together. - doc = utf8 + windows_1252 + utf8 - - # The document can't be turned into UTF-8: - self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") - - # Unicode, Dammit thinks the whole document is Windows-1252, - # and decodes it into "☃☃☃“Hi, I like Windows!â€Ã¢ËœÆ’☃☃" - - # But if we run it through fix_embedded_windows_1252, it's fixed: - - fixed = UnicodeDammit.detwingle(doc) - self.assertEqual( - u"☃☃☃“Hi, I like Windows!â€â˜ƒâ˜ƒâ˜ƒ", fixed.decode("utf8")) - - def test_detwingle_ignores_multibyte_characters(self): - # Each of these characters has a UTF-8 representation ending - # in \x93. \x93 is a smart quote if interpreted as - # Windows-1252. But our code knows to skip over multibyte - # UTF-8 characters, so they'll survive the process unscathed. - for tricky_unicode_char in ( - u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' - u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' - u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. - ): - input = tricky_unicode_char.encode("utf8") - self.assertTrue(input.endswith(b'\x93')) - output = UnicodeDammit.detwingle(input) - self.assertEqual(output, input) - -class TestNamedspacedAttribute(SoupTest): - - def test_name_may_be_none(self): - a = NamespacedAttribute("xmlns", None) - self.assertEqual(a, "xmlns") - - def test_attribute_is_equivalent_to_colon_separated_string(self): - a = NamespacedAttribute("a", "b") - self.assertEqual("a:b", a) - - def test_attributes_are_equivalent_if_prefix_and_name_identical(self): - a = NamespacedAttribute("a", "b", "c") - b = NamespacedAttribute("a", "b", "c") - self.assertEqual(a, b) - - # The actual namespace is not considered. - c = NamespacedAttribute("a", "b", None) - self.assertEqual(a, c) - - # But name and prefix are important. 
- d = NamespacedAttribute("a", "z", "c") - self.assertNotEqual(a, d) - - e = NamespacedAttribute("z", "b", "c") - self.assertNotEqual(a, e) - - -class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): - - def test_content_meta_attribute_value(self): - value = CharsetMetaAttributeValue("euc-jp") - self.assertEqual("euc-jp", value) - self.assertEqual("euc-jp", value.original_value) - self.assertEqual("utf8", value.encode("utf8")) - - - def test_content_meta_attribute_value(self): - value = ContentMetaAttributeValue("text/html; charset=euc-jp") - self.assertEqual("text/html; charset=euc-jp", value) - self.assertEqual("text/html; charset=euc-jp", value.original_value) - self.assertEqual("text/html; charset=utf8", value.encode("utf8")) diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py deleted file mode 100644 index 2d09f96..0000000 --- a/lib/bs4/tests/test_tree.py +++ /dev/null @@ -1,1800 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests for Beautiful Soup's tree traversal methods. - -The tree traversal methods are the main advantage of using Beautiful -Soup over just using a parser. - -Different parsers will build different Beautiful Soup trees given the -same markup, but all Beautiful Soup trees can be traversed with the -methods tested here. -""" - -import copy -import pickle -import re -import warnings -from bs4 import BeautifulSoup -from bs4.builder import ( - builder_registry, - HTMLParserTreeBuilder, -) -from bs4.element import ( - CData, - Comment, - Doctype, - NavigableString, - SoupStrainer, - Tag, -) -from bs4.testing import ( - SoupTest, - skipIf, -) - -XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) -LXML_PRESENT = (builder_registry.lookup("lxml") is not None) - -class TreeTest(SoupTest): - - def assertSelects(self, tags, should_match): - """Make sure that the given tags have the correct text. - - This is used in tests that define a bunch of tags, each - containing a single string, and then select certain strings by - some mechanism. - """ - self.assertEqual([tag.string for tag in tags], should_match) - - def assertSelectsIDs(self, tags, should_match): - """Make sure that the given tags have the correct IDs. - - This is used in tests that define a bunch of tags, each - containing a single string, and then select certain strings by - some mechanism. - """ - self.assertEqual([tag['id'] for tag in tags], should_match) - - -class TestFind(TreeTest): - """Basic tests of the find() method. - - find() just calls find_all() with limit=1, so it's not tested all - that thouroughly here. - """ - - def test_find_tag(self): - soup = self.soup("1234") - self.assertEqual(soup.find("b").string, "2") - - def test_unicode_text_find(self): - soup = self.soup(u'

<html><h1>Räksmörgås</h1></html>
') - self.assertEqual(soup.find(text=u'RäksmörgÃ¥s'), u'RäksmörgÃ¥s') - -class TestFindAll(TreeTest): - """Basic tests of the find_all() method.""" - - def test_find_all_text_nodes(self): - """You can search the tree for text nodes.""" - soup = self.soup("Foobar\xbb") - # Exact match. - self.assertEqual(soup.find_all(text="bar"), [u"bar"]) - # Match any of a number of strings. - self.assertEqual( - soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) - # Match a regular expression. - self.assertEqual(soup.find_all(text=re.compile('.*')), - [u"Foo", u"bar", u'\xbb']) - # Match anything. - self.assertEqual(soup.find_all(text=True), - [u"Foo", u"bar", u'\xbb']) - - def test_find_all_limit(self): - """You can limit the number of items returned by find_all.""" - soup = self.soup("12345") - self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"]) - self.assertSelects(soup.find_all('a', limit=1), ["1"]) - self.assertSelects( - soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"]) - - # A limit of 0 means no limit. - self.assertSelects( - soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) - - def test_calling_a_tag_is_calling_findall(self): - soup = self.soup("123") - self.assertSelects(soup('a', limit=1), ["1"]) - self.assertSelects(soup.b(id="foo"), ["3"]) - - def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self): - soup = self.soup("") - # Create a self-referential list. - l = [] - l.append(l) - - # Without special code in _normalize_search_value, this would cause infinite - # recursion. - self.assertEqual([], soup.find_all(l)) - -class TestFindAllBasicNamespaces(TreeTest): - - def test_find_by_namespaced_name(self): - soup = self.soup('4') - self.assertEqual("4", soup.find("mathml:msqrt").string) - self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) - - -class TestFindAllByName(TreeTest): - """Test ways of finding tags by tag name.""" - - def setUp(self): - super(TreeTest, self).setUp() - self.tree = self.soup("""First tag. - Second tag. - Third Nested tag. tag.""") - - def test_find_all_by_tag_name(self): - # Find all the tags. - self.assertSelects( - self.tree.find_all('a'), ['First tag.', 'Nested tag.']) - - def test_find_all_by_name_and_text(self): - self.assertSelects( - self.tree.find_all('a', text='First tag.'), ['First tag.']) - - self.assertSelects( - self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.']) - - self.assertSelects( - self.tree.find_all('a', text=re.compile("tag")), - ['First tag.', 'Nested tag.']) - - - def test_find_all_on_non_root_element(self): - # You can call find_all on any node, not just the root. 
- self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) - - def test_calling_element_invokes_find_all(self): - self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) - - def test_find_all_by_tag_strainer(self): - self.assertSelects( - self.tree.find_all(SoupStrainer('a')), - ['First tag.', 'Nested tag.']) - - def test_find_all_by_tag_names(self): - self.assertSelects( - self.tree.find_all(['a', 'b']), - ['First tag.', 'Second tag.', 'Nested tag.']) - - def test_find_all_by_tag_dict(self): - self.assertSelects( - self.tree.find_all({'a' : True, 'b' : True}), - ['First tag.', 'Second tag.', 'Nested tag.']) - - def test_find_all_by_tag_re(self): - self.assertSelects( - self.tree.find_all(re.compile('^[ab]$')), - ['First tag.', 'Second tag.', 'Nested tag.']) - - def test_find_all_with_tags_matching_method(self): - # You can define an oracle method that determines whether - # a tag matches the search. - def id_matches_name(tag): - return tag.name == tag.get('id') - - tree = self.soup("""Match 1. - Does not match. - Match 2.""") - - self.assertSelects( - tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) - - -class TestFindAllByAttribute(TreeTest): - - def test_find_all_by_attribute_name(self): - # You can pass in keyword arguments to find_all to search by - # attribute. - tree = self.soup(""" - Matching a. - - Non-matching Matching b.a. - """) - self.assertSelects(tree.find_all(id='first'), - ["Matching a.", "Matching b."]) - - def test_find_all_by_utf8_attribute_value(self): - peace = u"×ולש".encode("utf8") - data = u''.encode("utf8") - soup = self.soup(data) - self.assertEqual([soup.a], soup.find_all(title=peace)) - self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) - self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) - - def test_find_all_by_attribute_dict(self): - # You can pass in a dictionary as the argument 'attrs'. This - # lets you search for attributes like 'name' (a fixed argument - # to find_all) and 'class' (a reserved word in Python.) - tree = self.soup(""" - Name match. - Class match. - Non-match. - A tag called 'name1'. - """) - - # This doesn't do what you want. - self.assertSelects(tree.find_all(name='name1'), - ["A tag called 'name1'."]) - # This does what you want. - self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), - ["Name match."]) - - self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), - ["Class match."]) - - def test_find_all_by_class(self): - tree = self.soup(""" - Class 1. - Class 2. - Class 1. - Class 3 and 4. - """) - - # Passing in the class_ keyword argument will search against - # the 'class' attribute. - self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) - self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) - self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) - - # Passing in a string to 'attrs' will also search the CSS class. 
- self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) - self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) - self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) - self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) - - def test_find_by_class_when_multiple_classes_present(self): - tree = self.soup("Found it") - - f = tree.find_all("gar", class_=re.compile("o")) - self.assertSelects(f, ["Found it"]) - - f = tree.find_all("gar", class_=re.compile("a")) - self.assertSelects(f, ["Found it"]) - - # Since the class is not the string "foo bar", but the two - # strings "foo" and "bar", this will not find anything. - f = tree.find_all("gar", class_=re.compile("o b")) - self.assertSelects(f, []) - - def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): - soup = self.soup("Found it") - - self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) - - def big_attribute_value(value): - return len(value) > 3 - - self.assertSelects(soup.find_all("a", big_attribute_value), []) - - def small_attribute_value(value): - return len(value) <= 3 - - self.assertSelects( - soup.find_all("a", small_attribute_value), ["Found it"]) - - def test_find_all_with_string_for_attrs_finds_multiple_classes(self): - soup = self.soup('') - a, a2 = soup.find_all("a") - self.assertEqual([a, a2], soup.find_all("a", "foo")) - self.assertEqual([a], soup.find_all("a", "bar")) - - # If you specify the class as a string that contains a - # space, only that specific value will be found. - self.assertEqual([a], soup.find_all("a", class_="foo bar")) - self.assertEqual([a], soup.find_all("a", "foo bar")) - self.assertEqual([], soup.find_all("a", "bar foo")) - - def test_find_all_by_attribute_soupstrainer(self): - tree = self.soup(""" - Match. - Non-match.""") - - strainer = SoupStrainer(attrs={'id' : 'first'}) - self.assertSelects(tree.find_all(strainer), ['Match.']) - - def test_find_all_with_missing_atribute(self): - # You can pass in None as the value of an attribute to find_all. - # This will match tags that do not have that attribute set. - tree = self.soup("""ID present. - No ID present. - ID is empty.""") - self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) - - def test_find_all_with_defined_attribute(self): - # You can pass in None as the value of an attribute to find_all. - # This will match tags that have that attribute set to any value. - tree = self.soup("""ID present. - No ID present. - ID is empty.""") - self.assertSelects( - tree.find_all(id=True), ["ID present.", "ID is empty."]) - - def test_find_all_with_numeric_attribute(self): - # If you search for a number, it's treated as a string. - tree = self.soup("""Unquoted attribute. - Quoted attribute.""") - - expected = ["Unquoted attribute.", "Quoted attribute."] - self.assertSelects(tree.find_all(id=1), expected) - self.assertSelects(tree.find_all(id="1"), expected) - - def test_find_all_with_list_attribute_values(self): - # You can pass a list of attribute values instead of just one, - # and you'll get tags that match any of the values. - tree = self.soup("""1 - 2 - 3 - No ID.""") - self.assertSelects(tree.find_all(id=["1", "3", "4"]), - ["1", "3"]) - - def test_find_all_with_regular_expression_attribute_value(self): - # You can pass a regular expression as an attribute value, and - # you'll get tags whose values for that attribute match the - # regular expression. - tree = self.soup("""One a. - Two as. - Mixed as and bs. - One b. 
- No ID.""") - - self.assertSelects(tree.find_all(id=re.compile("^a+$")), - ["One a.", "Two as."]) - - def test_find_by_name_and_containing_string(self): - soup = self.soup("foobarfoo") - a = soup.a - - self.assertEqual([a], soup.find_all("a", text="foo")) - self.assertEqual([], soup.find_all("a", text="bar")) - self.assertEqual([], soup.find_all("a", text="bar")) - - def test_find_by_name_and_containing_string_when_string_is_buried(self): - soup = self.soup("foofoo") - self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) - - def test_find_by_attribute_and_containing_string(self): - soup = self.soup('foofoo') - a = soup.a - - self.assertEqual([a], soup.find_all(id=2, text="foo")) - self.assertEqual([], soup.find_all(id=1, text="bar")) - - - - -class TestIndex(TreeTest): - """Test Tag.index""" - def test_index(self): - tree = self.soup("""
- Identical - Not identical - Identical - - Identical with child - Also not identical - Identical with child -
""") - div = tree.div - for i, element in enumerate(div.contents): - self.assertEqual(i, div.index(element)) - self.assertRaises(ValueError, tree.index, 1) - - -class TestParentOperations(TreeTest): - """Test navigation and searching through an element's parents.""" - - def setUp(self): - super(TestParentOperations, self).setUp() - self.tree = self.soup('''
<ul id="top"> - <ul id="middle"> - <ul id="bottom"> - <b>Start here</b> - </ul> - </ul>
      ''') - self.start = self.tree.b - - - def test_parent(self): - self.assertEqual(self.start.parent['id'], 'bottom') - self.assertEqual(self.start.parent.parent['id'], 'middle') - self.assertEqual(self.start.parent.parent.parent['id'], 'top') - - def test_parent_of_top_tag_is_soup_object(self): - top_tag = self.tree.contents[0] - self.assertEqual(top_tag.parent, self.tree) - - def test_soup_object_has_no_parent(self): - self.assertEqual(None, self.tree.parent) - - def test_find_parents(self): - self.assertSelectsIDs( - self.start.find_parents('ul'), ['bottom', 'middle', 'top']) - self.assertSelectsIDs( - self.start.find_parents('ul', id="middle"), ['middle']) - - def test_find_parent(self): - self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') - self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') - - def test_parent_of_text_element(self): - text = self.tree.find(text="Start here") - self.assertEqual(text.parent.name, 'b') - - def test_text_element_find_parent(self): - text = self.tree.find(text="Start here") - self.assertEqual(text.find_parent('ul')['id'], 'bottom') - - def test_parent_generator(self): - parents = [parent['id'] for parent in self.start.parents - if parent is not None and 'id' in parent.attrs] - self.assertEqual(parents, ['bottom', 'middle', 'top']) - - -class ProximityTest(TreeTest): - - def setUp(self): - super(TreeTest, self).setUp() - self.tree = self.soup( - 'OneTwoThree') - - -class TestNextOperations(ProximityTest): - - def setUp(self): - super(TestNextOperations, self).setUp() - self.start = self.tree.b - - def test_next(self): - self.assertEqual(self.start.next_element, "One") - self.assertEqual(self.start.next_element.next_element['id'], "2") - - def test_next_of_last_item_is_none(self): - last = self.tree.find(text="Three") - self.assertEqual(last.next_element, None) - - def test_next_of_root_is_none(self): - # The document root is outside the next/previous chain. - self.assertEqual(self.tree.next_element, None) - - def test_find_all_next(self): - self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) - self.start.find_all_next(id=3) - self.assertSelects(self.start.find_all_next(id=3), ["Three"]) - - def test_find_next(self): - self.assertEqual(self.start.find_next('b')['id'], '2') - self.assertEqual(self.start.find_next(text="Three"), "Three") - - def test_find_next_for_text_element(self): - text = self.tree.find(text="One") - self.assertEqual(text.find_next("b").string, "Two") - self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) - - def test_next_generator(self): - start = self.tree.find(text="Two") - successors = [node for node in start.next_elements] - # There are two successors: the final tag and its text contents. - tag, contents = successors - self.assertEqual(tag['id'], '3') - self.assertEqual(contents, "Three") - -class TestPreviousOperations(ProximityTest): - - def setUp(self): - super(TestPreviousOperations, self).setUp() - self.end = self.tree.find(text="Three") - - def test_previous(self): - self.assertEqual(self.end.previous_element['id'], "3") - self.assertEqual(self.end.previous_element.previous_element, "Two") - - def test_previous_of_first_item_is_none(self): - first = self.tree.find('html') - self.assertEqual(first.previous_element, None) - - def test_previous_of_root_is_none(self): - # The document root is outside the next/previous chain. - # XXX This is broken! 
- #self.assertEqual(self.tree.previous_element, None) - pass - - def test_find_all_previous(self): - # The tag containing the "Three" node is the predecessor - # of the "Three" node itself, which is why "Three" shows up - # here. - self.assertSelects( - self.end.find_all_previous('b'), ["Three", "Two", "One"]) - self.assertSelects(self.end.find_all_previous(id=1), ["One"]) - - def test_find_previous(self): - self.assertEqual(self.end.find_previous('b')['id'], '3') - self.assertEqual(self.end.find_previous(text="One"), "One") - - def test_find_previous_for_text_element(self): - text = self.tree.find(text="Three") - self.assertEqual(text.find_previous("b").string, "Three") - self.assertSelects( - text.find_all_previous("b"), ["Three", "Two", "One"]) - - def test_previous_generator(self): - start = self.tree.find(text="One") - predecessors = [node for node in start.previous_elements] - - # There are four predecessors: the tag containing "One" - # the tag, the tag, and the tag. - b, body, head, html = predecessors - self.assertEqual(b['id'], '1') - self.assertEqual(body.name, "body") - self.assertEqual(head.name, "head") - self.assertEqual(html.name, "html") - - -class SiblingTest(TreeTest): - - def setUp(self): - super(SiblingTest, self).setUp() - markup = ''' - - - - - - - - - - - ''' - # All that whitespace looks good but makes the tests more - # difficult. Get rid of it. - markup = re.compile("\n\s*").sub("", markup) - self.tree = self.soup(markup) - - -class TestNextSibling(SiblingTest): - - def setUp(self): - super(TestNextSibling, self).setUp() - self.start = self.tree.find(id="1") - - def test_next_sibling_of_root_is_none(self): - self.assertEqual(self.tree.next_sibling, None) - - def test_next_sibling(self): - self.assertEqual(self.start.next_sibling['id'], '2') - self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') - - # Note the difference between next_sibling and next_element. - self.assertEqual(self.start.next_element['id'], '1.1') - - def test_next_sibling_may_not_exist(self): - self.assertEqual(self.tree.html.next_sibling, None) - - nested_span = self.tree.find(id="1.1") - self.assertEqual(nested_span.next_sibling, None) - - last_span = self.tree.find(id="4") - self.assertEqual(last_span.next_sibling, None) - - def test_find_next_sibling(self): - self.assertEqual(self.start.find_next_sibling('span')['id'], '2') - - def test_next_siblings(self): - self.assertSelectsIDs(self.start.find_next_siblings("span"), - ['2', '3', '4']) - - self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) - - def test_next_sibling_for_text_element(self): - soup = self.soup("Foobarbaz") - start = soup.find(text="Foo") - self.assertEqual(start.next_sibling.name, 'b') - self.assertEqual(start.next_sibling.next_sibling, 'baz') - - self.assertSelects(start.find_next_siblings('b'), ['bar']) - self.assertEqual(start.find_next_sibling(text="baz"), "baz") - self.assertEqual(start.find_next_sibling(text="nonesuch"), None) - - -class TestPreviousSibling(SiblingTest): - - def setUp(self): - super(TestPreviousSibling, self).setUp() - self.end = self.tree.find(id="4") - - def test_previous_sibling_of_root_is_none(self): - self.assertEqual(self.tree.previous_sibling, None) - - def test_previous_sibling(self): - self.assertEqual(self.end.previous_sibling['id'], '3') - self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') - - # Note the difference between previous_sibling and previous_element. 
- self.assertEqual(self.end.previous_element['id'], '3.1') - - def test_previous_sibling_may_not_exist(self): - self.assertEqual(self.tree.html.previous_sibling, None) - - nested_span = self.tree.find(id="1.1") - self.assertEqual(nested_span.previous_sibling, None) - - first_span = self.tree.find(id="1") - self.assertEqual(first_span.previous_sibling, None) - - def test_find_previous_sibling(self): - self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') - - def test_previous_siblings(self): - self.assertSelectsIDs(self.end.find_previous_siblings("span"), - ['3', '2', '1']) - - self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) - - def test_previous_sibling_for_text_element(self): - soup = self.soup("Foobarbaz") - start = soup.find(text="baz") - self.assertEqual(start.previous_sibling.name, 'b') - self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') - - self.assertSelects(start.find_previous_siblings('b'), ['bar']) - self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") - self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) - - -class TestTagCreation(SoupTest): - """Test the ability to create new tags.""" - def test_new_tag(self): - soup = self.soup("") - new_tag = soup.new_tag("foo", bar="baz") - self.assertTrue(isinstance(new_tag, Tag)) - self.assertEqual("foo", new_tag.name) - self.assertEqual(dict(bar="baz"), new_tag.attrs) - self.assertEqual(None, new_tag.parent) - - def test_tag_inherits_self_closing_rules_from_builder(self): - if XML_BUILDER_PRESENT: - xml_soup = BeautifulSoup("", "xml") - xml_br = xml_soup.new_tag("br") - xml_p = xml_soup.new_tag("p") - - # Both the
<br> and <p> tag are empty-element, just because - # they have no contents. - self.assertEqual(b"<br/>", xml_br.encode()) - self.assertEqual(b"<p/>", xml_p.encode()) - - html_soup = BeautifulSoup("", "html") - html_br = html_soup.new_tag("br") - html_p = html_soup.new_tag("p") - - # The HTML builder uses HTML's rules about which tags are - # empty-element tags, and the new tags reflect these rules. - self.assertEqual(b"<br/>", html_br.encode()) - self.assertEqual(b"<p></p>
      ", html_p.encode()) - - def test_new_string_creates_navigablestring(self): - soup = self.soup("") - s = soup.new_string("foo") - self.assertEqual("foo", s) - self.assertTrue(isinstance(s, NavigableString)) - - def test_new_string_can_create_navigablestring_subclass(self): - soup = self.soup("") - s = soup.new_string("foo", Comment) - self.assertEqual("foo", s) - self.assertTrue(isinstance(s, Comment)) - -class TestTreeModification(SoupTest): - - def test_attribute_modification(self): - soup = self.soup('') - soup.a['id'] = 2 - self.assertEqual(soup.decode(), self.document_for('')) - del(soup.a['id']) - self.assertEqual(soup.decode(), self.document_for('')) - soup.a['id2'] = 'foo' - self.assertEqual(soup.decode(), self.document_for('')) - - def test_new_tag_creation(self): - builder = builder_registry.lookup('html')() - soup = self.soup("", builder=builder) - a = Tag(soup, builder, 'a') - ol = Tag(soup, builder, 'ol') - a['href'] = 'http://foo.com/' - soup.body.insert(0, a) - soup.body.insert(1, ol) - self.assertEqual( - soup.body.encode(), - b'
<body><a href="http://foo.com/"></a><ol></ol></body>') - - def test_append_to_contents_moves_tag(self): - doc = """

<p id="1">Don't leave me <b>here</b>.</p> - <p id="2">Don\'t leave!</p>
        """ - soup = self.soup(doc) - second_para = soup.find(id='2') - bold = soup.b - - # Move the tag to the end of the second paragraph. - soup.find(id='2').append(soup.b) - - # The tag is now a child of the second paragraph. - self.assertEqual(bold.parent, second_para) - - self.assertEqual( - soup.decode(), self.document_for( - '

<p id="1">Don\'t leave me .</p>\n' - '<p id="2">Don\'t leave!<b>here</b></p>
        ')) - - def test_replace_with_returns_thing_that_was_replaced(self): - text = "" - soup = self.soup(text) - a = soup.a - new_a = a.replace_with(soup.c) - self.assertEqual(a, new_a) - - def test_unwrap_returns_thing_that_was_replaced(self): - text = "" - soup = self.soup(text) - a = soup.a - new_a = a.unwrap() - self.assertEqual(a, new_a) - - def test_replace_tag_with_itself(self): - text = "Foo" - soup = self.soup(text) - c = soup.c - soup.c.replace_with(c) - self.assertEqual(soup.decode(), self.document_for(text)) - - def test_replace_tag_with_its_parent_raises_exception(self): - text = "" - soup = self.soup(text) - self.assertRaises(ValueError, soup.b.replace_with, soup.a) - - def test_insert_tag_into_itself_raises_exception(self): - text = "" - soup = self.soup(text) - self.assertRaises(ValueError, soup.a.insert, 0, soup.a) - - def test_replace_with_maintains_next_element_throughout(self): - soup = self.soup('

<p><a>one</a><b>three</b></p>
        ') - a = soup.a - b = a.contents[0] - # Make it so the tag has two text children. - a.insert(1, "two") - - # Now replace each one with the empty string. - left, right = a.contents - left.replaceWith('') - right.replaceWith('') - - # The tag is still connected to the tree. - self.assertEqual("three", soup.b.string) - - def test_replace_final_node(self): - soup = self.soup("Argh!") - soup.find(text="Argh!").replace_with("Hooray!") - new_text = soup.find(text="Hooray!") - b = soup.b - self.assertEqual(new_text.previous_element, b) - self.assertEqual(new_text.parent, b) - self.assertEqual(new_text.previous_element.next_element, new_text) - self.assertEqual(new_text.next_element, None) - - def test_consecutive_text_nodes(self): - # A builder should never create two consecutive text nodes, - # but if you insert one next to another, Beautiful Soup will - # handle it correctly. - soup = self.soup("Argh!") - soup.b.insert(1, "Hooray!") - - self.assertEqual( - soup.decode(), self.document_for( - "Argh!Hooray!")) - - new_text = soup.find(text="Hooray!") - self.assertEqual(new_text.previous_element, "Argh!") - self.assertEqual(new_text.previous_element.next_element, new_text) - - self.assertEqual(new_text.previous_sibling, "Argh!") - self.assertEqual(new_text.previous_sibling.next_sibling, new_text) - - self.assertEqual(new_text.next_sibling, None) - self.assertEqual(new_text.next_element, soup.c) - - def test_insert_string(self): - soup = self.soup("") - soup.a.insert(0, "bar") - soup.a.insert(0, "foo") - # The string were added to the tag. - self.assertEqual(["foo", "bar"], soup.a.contents) - # And they were converted to NavigableStrings. - self.assertEqual(soup.a.contents[0].next_element, "bar") - - def test_insert_tag(self): - builder = self.default_builder - soup = self.soup( - "Findlady!", builder=builder) - magic_tag = Tag(soup, builder, 'magictag') - magic_tag.insert(0, "the") - soup.a.insert(1, magic_tag) - - self.assertEqual( - soup.decode(), self.document_for( - "Findthelady!")) - - # Make sure all the relationships are hooked up correctly. - b_tag = soup.b - self.assertEqual(b_tag.next_sibling, magic_tag) - self.assertEqual(magic_tag.previous_sibling, b_tag) - - find = b_tag.find(text="Find") - self.assertEqual(find.next_element, magic_tag) - self.assertEqual(magic_tag.previous_element, find) - - c_tag = soup.c - self.assertEqual(magic_tag.next_sibling, c_tag) - self.assertEqual(c_tag.previous_sibling, magic_tag) - - the = magic_tag.find(text="the") - self.assertEqual(the.parent, magic_tag) - self.assertEqual(the.next_element, c_tag) - self.assertEqual(c_tag.previous_element, the) - - def test_append_child_thats_already_at_the_end(self): - data = "" - soup = self.soup(data) - soup.a.append(soup.b) - self.assertEqual(data, soup.decode()) - - def test_move_tag_to_beginning_of_parent(self): - data = "" - soup = self.soup(data) - soup.a.insert(0, soup.d) - self.assertEqual("", soup.decode()) - - def test_insert_works_on_empty_element_tag(self): - # This is a little strange, since most HTML parsers don't allow - # markup like this to come through. But in general, we don't - # know what the parser would or wouldn't have allowed, so - # I'm letting this succeed for now. - soup = self.soup("
        ") - soup.br.insert(1, "Contents") - self.assertEqual(str(soup.br), "
        Contents
        ") - - def test_insert_before(self): - soup = self.soup("foobar") - soup.b.insert_before("BAZ") - soup.a.insert_before("QUUX") - self.assertEqual( - soup.decode(), self.document_for("QUUXfooBAZbar")) - - soup.a.insert_before(soup.b) - self.assertEqual( - soup.decode(), self.document_for("QUUXbarfooBAZ")) - - def test_insert_after(self): - soup = self.soup("foobar") - soup.b.insert_after("BAZ") - soup.a.insert_after("QUUX") - self.assertEqual( - soup.decode(), self.document_for("fooQUUXbarBAZ")) - soup.b.insert_after(soup.a) - self.assertEqual( - soup.decode(), self.document_for("QUUXbarfooBAZ")) - - def test_insert_after_raises_exception_if_after_has_no_meaning(self): - soup = self.soup("") - tag = soup.new_tag("a") - string = soup.new_string("") - self.assertRaises(ValueError, string.insert_after, tag) - self.assertRaises(NotImplementedError, soup.insert_after, tag) - self.assertRaises(ValueError, tag.insert_after, tag) - - def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): - soup = self.soup("") - tag = soup.new_tag("a") - string = soup.new_string("") - self.assertRaises(ValueError, string.insert_before, tag) - self.assertRaises(NotImplementedError, soup.insert_before, tag) - self.assertRaises(ValueError, tag.insert_before, tag) - - def test_replace_with(self): - soup = self.soup( - "

<p>There's <b>no</b> business like <b>show</b> business</p>
        ") - no, show = soup.find_all('b') - show.replace_with(no) - self.assertEqual( - soup.decode(), - self.document_for( - "

<p>There's  business like <b>no</b> business</p>
        ")) - - self.assertEqual(show.parent, None) - self.assertEqual(no.parent, soup.p) - self.assertEqual(no.next_element, "no") - self.assertEqual(no.next_sibling, " business") - - def test_replace_first_child(self): - data = "" - soup = self.soup(data) - soup.b.replace_with(soup.c) - self.assertEqual("", soup.decode()) - - def test_replace_last_child(self): - data = "" - soup = self.soup(data) - soup.c.replace_with(soup.b) - self.assertEqual("", soup.decode()) - - def test_nested_tag_replace_with(self): - soup = self.soup( - """Wereservetherighttorefuseservice""") - - # Replace the entire tag and its contents ("reserve the - # right") with the tag ("refuse"). - remove_tag = soup.b - move_tag = soup.f - remove_tag.replace_with(move_tag) - - self.assertEqual( - soup.decode(), self.document_for( - "Werefusetoservice")) - - # The tag is now an orphan. - self.assertEqual(remove_tag.parent, None) - self.assertEqual(remove_tag.find(text="right").next_element, None) - self.assertEqual(remove_tag.previous_element, None) - self.assertEqual(remove_tag.next_sibling, None) - self.assertEqual(remove_tag.previous_sibling, None) - - # The tag is now connected to the tag. - self.assertEqual(move_tag.parent, soup.a) - self.assertEqual(move_tag.previous_element, "We") - self.assertEqual(move_tag.next_element.next_element, soup.e) - self.assertEqual(move_tag.next_sibling, None) - - # The gap where the tag used to be has been mended, and - # the word "to" is now connected to the tag. - to_text = soup.find(text="to") - g_tag = soup.g - self.assertEqual(to_text.next_element, g_tag) - self.assertEqual(to_text.next_sibling, g_tag) - self.assertEqual(g_tag.previous_element, to_text) - self.assertEqual(g_tag.previous_sibling, to_text) - - def test_unwrap(self): - tree = self.soup(""" -

<p>Unneeded <em>formatting</em> is unneeded</p>
        - """) - tree.em.unwrap() - self.assertEqual(tree.em, None) - self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") - - def test_wrap(self): - soup = self.soup("I wish I was bold.") - value = soup.string.wrap(soup.new_tag("b")) - self.assertEqual(value.decode(), "I wish I was bold.") - self.assertEqual( - soup.decode(), self.document_for("I wish I was bold.")) - - def test_wrap_extracts_tag_from_elsewhere(self): - soup = self.soup("I wish I was bold.") - soup.b.next_sibling.wrap(soup.b) - self.assertEqual( - soup.decode(), self.document_for("I wish I was bold.")) - - def test_wrap_puts_new_contents_at_the_end(self): - soup = self.soup("I like being bold.I wish I was bold.") - soup.b.next_sibling.wrap(soup.b) - self.assertEqual(2, len(soup.b.contents)) - self.assertEqual( - soup.decode(), self.document_for( - "I like being bold.I wish I was bold.")) - - def test_extract(self): - soup = self.soup( - 'Some content. More content.') - - self.assertEqual(len(soup.body.contents), 3) - extracted = soup.find(id="nav").extract() - - self.assertEqual( - soup.decode(), "Some content. More content.") - self.assertEqual(extracted.decode(), '') - - # The extracted tag is now an orphan. - self.assertEqual(len(soup.body.contents), 2) - self.assertEqual(extracted.parent, None) - self.assertEqual(extracted.previous_element, None) - self.assertEqual(extracted.next_element.next_element, None) - - # The gap where the extracted tag used to be has been mended. - content_1 = soup.find(text="Some content. ") - content_2 = soup.find(text=" More content.") - self.assertEqual(content_1.next_element, content_2) - self.assertEqual(content_1.next_sibling, content_2) - self.assertEqual(content_2.previous_element, content_1) - self.assertEqual(content_2.previous_sibling, content_1) - - def test_extract_distinguishes_between_identical_strings(self): - soup = self.soup("
        foobar") - foo_1 = soup.a.string - bar_1 = soup.b.string - foo_2 = soup.new_string("foo") - bar_2 = soup.new_string("bar") - soup.a.append(foo_2) - soup.b.append(bar_2) - - # Now there are two identical strings in the tag, and two - # in the tag. Let's remove the first "foo" and the second - # "bar". - foo_1.extract() - bar_2.extract() - self.assertEqual(foo_2, soup.a.string) - self.assertEqual(bar_2, soup.b.string) - - def test_clear(self): - """Tag.clear()""" - soup = self.soup("

<p><a>String <em>Italicized</em></a> and another</p>
        ") - # clear using extract() - a = soup.a - soup.p.clear() - self.assertEqual(len(soup.p.contents), 0) - self.assertTrue(hasattr(a, "contents")) - - # clear using decompose() - em = a.em - a.clear(decompose=True) - self.assertEqual(0, len(em.contents)) - - def test_string_set(self): - """Tag.string = 'string'""" - soup = self.soup(" ") - soup.a.string = "foo" - self.assertEqual(soup.a.contents, ["foo"]) - soup.b.string = "bar" - self.assertEqual(soup.b.contents, ["bar"]) - - def test_string_set_does_not_affect_original_string(self): - soup = self.soup("foobar") - soup.b.string = soup.c.string - self.assertEqual(soup.a.encode(), b"barbar") - - def test_set_string_preserves_class_of_string(self): - soup = self.soup("") - cdata = CData("foo") - soup.a.string = cdata - self.assertTrue(isinstance(soup.a.string, CData)) - -class TestElementObjects(SoupTest): - """Test various features of element objects.""" - - def test_len(self): - """The length of an element is its number of children.""" - soup = self.soup("123") - - # The BeautifulSoup object itself contains one element: the - # tag. - self.assertEqual(len(soup.contents), 1) - self.assertEqual(len(soup), 1) - - # The tag contains three elements: the text node "1", the - # tag, and the text node "3". - self.assertEqual(len(soup.top), 3) - self.assertEqual(len(soup.top.contents), 3) - - def test_member_access_invokes_find(self): - """Accessing a Python member .foo invokes find('foo')""" - soup = self.soup('') - self.assertEqual(soup.b, soup.find('b')) - self.assertEqual(soup.b.i, soup.find('b').find('i')) - self.assertEqual(soup.a, None) - - def test_deprecated_member_access(self): - soup = self.soup('') - with warnings.catch_warnings(record=True) as w: - tag = soup.bTag - self.assertEqual(soup.b, tag) - self.assertEqual( - '.bTag is deprecated, use .find("b") instead.', - str(w[0].message)) - - def test_has_attr(self): - """has_attr() checks for the presence of an attribute. - - Please note note: has_attr() is different from - __in__. has_attr() checks the tag's attributes and __in__ - checks the tag's chidlren. - """ - soup = self.soup("") - self.assertTrue(soup.foo.has_attr('attr')) - self.assertFalse(soup.foo.has_attr('attr2')) - - - def test_attributes_come_out_in_alphabetical_order(self): - markup = '' - self.assertSoupEquals(markup, '') - - def test_string(self): - # A tag that contains only a text node makes that node - # available as .string. - soup = self.soup("foo") - self.assertEqual(soup.b.string, 'foo') - - def test_empty_tag_has_no_string(self): - # A tag with no children has no .stirng. - soup = self.soup("") - self.assertEqual(soup.b.string, None) - - def test_tag_with_multiple_children_has_no_string(self): - # A tag with no children has no .string. - soup = self.soup("foo") - self.assertEqual(soup.b.string, None) - - soup = self.soup("foobar
        ") - self.assertEqual(soup.b.string, None) - - # Even if all the children are strings, due to trickery, - # it won't work--but this would be a good optimization. - soup = self.soup("foo
        ") - soup.a.insert(1, "bar") - self.assertEqual(soup.a.string, None) - - def test_tag_with_recursive_string_has_string(self): - # A tag with a single child which has a .string inherits that - # .string. - soup = self.soup("foo") - self.assertEqual(soup.a.string, "foo") - self.assertEqual(soup.string, "foo") - - def test_lack_of_string(self): - """Only a tag containing a single text node has a .string.""" - soup = self.soup("feo") - self.assertFalse(soup.b.string) - - soup = self.soup("") - self.assertFalse(soup.b.string) - - def test_all_text(self): - """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" - soup = self.soup("ar t ") - self.assertEqual(soup.a.text, "ar t ") - self.assertEqual(soup.a.get_text(strip=True), "art") - self.assertEqual(soup.a.get_text(","), "a,r, , t ") - self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") - - def test_get_text_ignores_comments(self): - soup = self.soup("foobar") - self.assertEqual(soup.get_text(), "foobar") - - self.assertEqual( - soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") - self.assertEqual( - soup.get_text(types=None), "fooIGNOREbar") - - def test_all_strings_ignores_comments(self): - soup = self.soup("foobar") - self.assertEqual(['foo', 'bar'], list(soup.strings)) - -class TestCDAtaListAttributes(SoupTest): - - """Testing cdata-list attributes like 'class'. - """ - def test_single_value_becomes_list(self): - soup = self.soup("") - self.assertEqual(["foo"],soup.a['class']) - - def test_multiple_values_becomes_list(self): - soup = self.soup("") - self.assertEqual(["foo", "bar"], soup.a['class']) - - def test_multiple_values_separated_by_weird_whitespace(self): - soup = self.soup("") - self.assertEqual(["foo", "bar", "baz"],soup.a['class']) - - def test_attributes_joined_into_string_on_output(self): - soup = self.soup("") - self.assertEqual(b'', soup.a.encode()) - - def test_accept_charset(self): - soup = self.soup('
        ') - self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) - - def test_cdata_attribute_applying_only_to_one_tag(self): - data = '' - soup = self.soup(data) - # We saw in another test that accept-charset is a cdata-list - # attribute for the tag. But it's not a cdata-list - # attribute for any other tag. - self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) - - -class TestPersistence(SoupTest): - "Testing features like pickle and deepcopy." - - def setUp(self): - super(TestPersistence, self).setUp() - self.page = """ - - - -Beautiful Soup: We called him Tortoise because he taught us. - - - - - - -foo -bar - -""" - self.tree = self.soup(self.page) - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - dumped = pickle.dumps(self.tree, 2) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.__class__, BeautifulSoup) - self.assertEqual(loaded.decode(), self.tree.decode()) - - def test_deepcopy_identity(self): - # Making a deepcopy of a tree yields an identical tree. - copied = copy.deepcopy(self.tree) - self.assertEqual(copied.decode(), self.tree.decode()) - - def test_unicode_pickle(self): - # A tree containing Unicode characters can be pickled. - html = u"\N{SNOWMAN}" - soup = self.soup(html) - dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.decode(), soup.decode()) - - -class TestSubstitutions(SoupTest): - - def test_default_formatter_is_minimal(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter="minimal") - # The < is converted back into < but the e-with-acute is left alone. - self.assertEqual( - decoded, - self.document_for( - u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) - - def test_formatter_html(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter="html") - self.assertEqual( - decoded, - self.document_for("<<Sacré bleu!>>")) - - def test_formatter_minimal(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter="minimal") - # The < is converted back into < but the e-with-acute is left alone. - self.assertEqual( - decoded, - self.document_for( - u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) - - def test_formatter_null(self): - markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" - soup = self.soup(markup) - decoded = soup.decode(formatter=None) - # Neither the angle brackets nor the e-with-acute are converted. - # This is not valid HTML, but it's what the user wanted. - self.assertEqual(decoded, - self.document_for(u"<>")) - - def test_formatter_custom(self): - markup = u"<foo>bar" - soup = self.soup(markup) - decoded = soup.decode(formatter = lambda x: x.upper()) - # Instead of normal entity conversion code, the custom - # callable is called on every string. 
- self.assertEqual( - decoded, - self.document_for(u"BAR")) - - def test_formatter_is_run_on_attribute_values(self): - markup = u'e' - soup = self.soup(markup) - a = soup.a - - expect_minimal = u'e' - - self.assertEqual(expect_minimal, a.decode()) - self.assertEqual(expect_minimal, a.decode(formatter="minimal")) - - expect_html = u'e' - self.assertEqual(expect_html, a.decode(formatter="html")) - - self.assertEqual(markup, a.decode(formatter=None)) - expect_upper = u'E' - self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) - - def test_formatter_skips_script_tag_for_html_documents(self): - doc = """ - -""" - encoded = BeautifulSoup(doc).encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_formatter_skips_style_tag_for_html_documents(self): - doc = """ - -""" - encoded = BeautifulSoup(doc).encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_prettify_leaves_preformatted_text_alone(self): - soup = self.soup("
<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ") - # Everything outside the <pre> tag is reformatted, but everything - # inside is left alone. - self.assertEqual( - u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>
        ', - soup.div.prettify()) - - def test_prettify_accepts_formatter(self): - soup = BeautifulSoup("foo") - pretty = soup.prettify(formatter = lambda x: x.upper()) - self.assertTrue("FOO" in pretty) - - def test_prettify_outputs_unicode_by_default(self): - soup = self.soup("") - self.assertEqual(unicode, type(soup.prettify())) - - def test_prettify_can_encode_data(self): - soup = self.soup("") - self.assertEqual(bytes, type(soup.prettify("utf-8"))) - - def test_html_entity_substitution_off_by_default(self): - markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" - soup = self.soup(markup) - encoded = soup.b.encode("utf-8") - self.assertEqual(encoded, markup.encode('utf-8')) - - def test_encoding_substitution(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - soup = self.soup(meta_tag) - - # Parse the document, and the charset apprears unchanged. - self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') - - # Encode the document into some encoding, and the encoding is - # substituted into the meta tag. - utf_8 = soup.encode("utf-8") - self.assertTrue(b"charset=utf-8" in utf_8) - - euc_jp = soup.encode("euc_jp") - self.assertTrue(b"charset=euc_jp" in euc_jp) - - shift_jis = soup.encode("shift-jis") - self.assertTrue(b"charset=shift-jis" in shift_jis) - - utf_16_u = soup.encode("utf-16").decode("utf-16") - self.assertTrue("charset=utf-16" in utf_16_u) - - def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): - markup = ('
<head><meta content="text/html; charset=x-sjis" http-equiv="Content-type"/></head><pre>foo</pre>
        ') - - # Beautiful Soup used to try to rewrite the meta tag even if the - # meta tag got filtered out by the strainer. This test makes - # sure that doesn't happen. - strainer = SoupStrainer('pre') - soup = self.soup(markup, parse_only=strainer) - self.assertEqual(soup.contents[0].name, 'pre') - -class TestEncoding(SoupTest): - """Test the ability to encode objects into strings.""" - - def test_unicode_string_can_be_encoded(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual(soup.b.string.encode("utf-8"), - u"\N{SNOWMAN}".encode("utf-8")) - - def test_tag_containing_unicode_string_can_be_encoded(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual( - soup.b.encode("utf-8"), html.encode("utf-8")) - - def test_encoding_substitutes_unrecognized_characters_by_default(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual(soup.b.encode("ascii"), b"") - - def test_encoding_can_be_made_strict(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertRaises( - UnicodeEncodeError, soup.encode, "ascii", errors="strict") - - def test_decode_contents(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) - - def test_encode_contents(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( - encoding="utf8")) - - def test_deprecated_renderContents(self): - html = u"\N{SNOWMAN}" - soup = self.soup(html) - self.assertEqual( - u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) - -class TestNavigableStringSubclasses(SoupTest): - - def test_cdata(self): - # None of the current builders turn CDATA sections into CData - # objects, but you can create them manually. - soup = self.soup("") - cdata = CData("foo") - soup.insert(1, cdata) - self.assertEqual(str(soup), "") - self.assertEqual(soup.find(text="foo"), "foo") - self.assertEqual(soup.contents[0], "foo") - - def test_cdata_is_never_formatted(self): - """Text inside a CData object is passed into the formatter. - - But the return value is ignored. - """ - - self.count = 0 - def increment(*args): - self.count += 1 - return "BITTER FAILURE" - - soup = self.soup("") - cdata = CData("<><><>") - soup.insert(1, cdata) - self.assertEqual( - b"<><>]]>", soup.encode(formatter=increment)) - self.assertEqual(1, self.count) - - def test_doctype_ends_in_newline(self): - # Unlike other NavigableString subclasses, a DOCTYPE always ends - # in a newline. - doctype = Doctype("foo") - soup = self.soup("") - soup.insert(1, doctype) - self.assertEqual(soup.encode(), b"\n") - - -class TestSoupSelector(TreeTest): - - HTML = """ - - - -The title - - - - -
        -
        -

        An H1

        -

        Some text

        -

        Some more text

        -

        An H2

        -

        Another

        -Bob -

        Another H2

        -me - -span1a1 -span1a2 test - -span2a1 - - - -
        -

        English

        -

        English UK

        -

        English US

        -

        French

        -
        - - -""" - - def setUp(self): - self.soup = BeautifulSoup(self.HTML) - - def assertSelects(self, selector, expected_ids): - el_ids = [el['id'] for el in self.soup.select(selector)] - el_ids.sort() - expected_ids.sort() - self.assertEqual(expected_ids, el_ids, - "Selector %s, expected [%s], got [%s]" % ( - selector, ', '.join(expected_ids), ', '.join(el_ids) - ) - ) - - assertSelect = assertSelects - - def assertSelectMultiple(self, *tests): - for selector, expected_ids in tests: - self.assertSelect(selector, expected_ids) - - def test_one_tag_one(self): - els = self.soup.select('title') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].name, 'title') - self.assertEqual(els[0].contents, [u'The title']) - - def test_one_tag_many(self): - els = self.soup.select('div') - self.assertEqual(len(els), 3) - for div in els: - self.assertEqual(div.name, 'div') - - def test_tag_in_tag_one(self): - els = self.soup.select('div div') - self.assertSelects('div div', ['inner']) - - def test_tag_in_tag_many(self): - for selector in ('html div', 'html body div', 'body div'): - self.assertSelects(selector, ['main', 'inner', 'footer']) - - def test_tag_no_match(self): - self.assertEqual(len(self.soup.select('del')), 0) - - def test_invalid_tag(self): - self.assertRaises(ValueError, self.soup.select, 'tag%t') - - def test_header_tags(self): - self.assertSelectMultiple( - ('h1', ['header1']), - ('h2', ['header2', 'header3']), - ) - - def test_class_one(self): - for selector in ('.onep', 'p.onep', 'html p.onep'): - els = self.soup.select(selector) - self.assertEqual(len(els), 1) - self.assertEqual(els[0].name, 'p') - self.assertEqual(els[0]['class'], ['onep']) - - def test_class_mismatched_tag(self): - els = self.soup.select('div.onep') - self.assertEqual(len(els), 0) - - def test_one_id(self): - for selector in ('div#inner', '#inner', 'div div#inner'): - self.assertSelects(selector, ['inner']) - - def test_bad_id(self): - els = self.soup.select('#doesnotexist') - self.assertEqual(len(els), 0) - - def test_items_in_id(self): - els = self.soup.select('div#inner p') - self.assertEqual(len(els), 3) - for el in els: - self.assertEqual(el.name, 'p') - self.assertEqual(els[1]['class'], ['onep']) - self.assertFalse(els[0].has_attr('class')) - - def test_a_bunch_of_emptys(self): - for selector in ('div#main del', 'div#main div.oops', 'div div#main'): - self.assertEqual(len(self.soup.select(selector)), 0) - - def test_multi_class_support(self): - for selector in ('.class1', 'p.class1', '.class2', 'p.class2', - '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): - self.assertSelects(selector, ['pmulti']) - - def test_multi_class_selection(self): - for selector in ('.class1.class3', '.class3.class2', - '.class1.class2.class3'): - self.assertSelects(selector, ['pmulti']) - - def test_child_selector(self): - self.assertSelects('.s1 > a', ['s1a1', 's1a2']) - self.assertSelects('.s1 > a span', ['s1a2s1']) - - def test_child_selector_id(self): - self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) - - def test_attribute_equals(self): - self.assertSelectMultiple( - ('p[class="onep"]', ['p1']), - ('p[id="p1"]', ['p1']), - ('[class="onep"]', ['p1']), - ('[id="p1"]', ['p1']), - ('link[rel="stylesheet"]', ['l1']), - ('link[type="text/css"]', ['l1']), - ('link[href="blah.css"]', ['l1']), - ('link[href="no-blah.css"]', []), - ('[rel="stylesheet"]', ['l1']), - ('[type="text/css"]', ['l1']), - ('[href="blah.css"]', ['l1']), - ('[href="no-blah.css"]', []), - ('p[href="no-blah.css"]', []), - 
('[href="no-blah.css"]', []), - ) - - def test_attribute_tilde(self): - self.assertSelectMultiple( - ('p[class~="class1"]', ['pmulti']), - ('p[class~="class2"]', ['pmulti']), - ('p[class~="class3"]', ['pmulti']), - ('[class~="class1"]', ['pmulti']), - ('[class~="class2"]', ['pmulti']), - ('[class~="class3"]', ['pmulti']), - ('a[rel~="friend"]', ['bob']), - ('a[rel~="met"]', ['bob']), - ('[rel~="friend"]', ['bob']), - ('[rel~="met"]', ['bob']), - ) - - def test_attribute_startswith(self): - self.assertSelectMultiple( - ('[rel^="style"]', ['l1']), - ('link[rel^="style"]', ['l1']), - ('notlink[rel^="notstyle"]', []), - ('[rel^="notstyle"]', []), - ('link[rel^="notstyle"]', []), - ('link[href^="bla"]', ['l1']), - ('a[href^="http://"]', ['bob', 'me']), - ('[href^="http://"]', ['bob', 'me']), - ('[id^="p"]', ['pmulti', 'p1']), - ('[id^="m"]', ['me', 'main']), - ('div[id^="m"]', ['main']), - ('a[id^="m"]', ['me']), - ) - - def test_attribute_endswith(self): - self.assertSelectMultiple( - ('[href$=".css"]', ['l1']), - ('link[href$=".css"]', ['l1']), - ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), - ('div[id$="1"]', []), - ('[id$="noending"]', []), - ) - - def test_attribute_contains(self): - self.assertSelectMultiple( - # From test_attribute_startswith - ('[rel*="style"]', ['l1']), - ('link[rel*="style"]', ['l1']), - ('notlink[rel*="notstyle"]', []), - ('[rel*="notstyle"]', []), - ('link[rel*="notstyle"]', []), - ('link[href*="bla"]', ['l1']), - ('a[href*="http://"]', ['bob', 'me']), - ('[href*="http://"]', ['bob', 'me']), - ('[id*="p"]', ['pmulti', 'p1']), - ('div[id*="m"]', ['main']), - ('a[id*="m"]', ['me']), - # From test_attribute_endswith - ('[href*=".css"]', ['l1']), - ('link[href*=".css"]', ['l1']), - ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), - ('div[id*="1"]', []), - ('[id*="noending"]', []), - # New for this test - ('[href*="."]', ['bob', 'me', 'l1']), - ('a[href*="."]', ['bob', 'me']), - ('link[href*="."]', ['l1']), - ('div[id*="n"]', ['main', 'inner']), - ('div[id*="nn"]', ['inner']), - ) - - def test_attribute_exact_or_hypen(self): - self.assertSelectMultiple( - ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('p[lang|="fr"]', ['lang-fr']), - ('p[lang|="gb"]', []), - ) - - def test_attribute_exists(self): - self.assertSelectMultiple( - ('[rel]', ['l1', 'bob', 'me']), - ('link[rel]', ['l1']), - ('a[rel]', ['bob', 'me']), - ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), - ('p[class]', ['p1', 'pmulti']), - ('[blah]', []), - ('p[blah]', []), - ) - - def test_nth_of_type(self): - # Try to select first paragraph - els = self.soup.select('div#inner p:nth-of-type(1)') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') - - # Try to select third paragraph - els = self.soup.select('div#inner p:nth-of-type(3)') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Another') - - # Try to select (non-existent!) fourth paragraph - els = self.soup.select('div#inner p:nth-of-type(4)') - self.assertEqual(len(els), 0) - - # Pass in an invalid value. 
- self.assertRaises( - ValueError, self.soup.select, 'div p:nth-of-type(0)') - - def test_nth_of_type_direct_descendant(self): - els = self.soup.select('div#inner > p:nth-of-type(1)') - self.assertEqual(len(els), 1) - self.assertEqual(els[0].string, u'Some text') - - def test_id_child_selector_nth_of_type(self): - self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) - - def test_select_on_element(self): - # Other tests operate on the tree; this operates on an element - # within the tree. - inner = self.soup.find("div", id="main") - selected = inner.select("div") - # The
        tag was selected. The