Updated bundled version of BS4

This commit is contained in:
Luke Rogers 2013-07-14 22:06:37 +12:00
parent 5d30398bc1
commit 2182d5a0fd
14 changed files with 832 additions and 159 deletions

43
lib/bs4/AUTHORS.txt Normal file
View file

@ -0,0 +1,43 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen

View file

@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3" __version__ = "4.2.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT" __license__ = "MIT"
__all__ = ['BeautifulSoup'] __all__ = ['BeautifulSoup']
@ -201,9 +201,9 @@ class BeautifulSoup(Tag):
"""Create a new tag associated with this soup.""" """Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs) return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s): def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup.""" """Create a new NavigableString associated with this soup."""
navigable = NavigableString(s) navigable = subclass(s)
navigable.setup() navigable.setup()
return navigable return navigable
@ -245,13 +245,15 @@ class BeautifulSoup(Tag):
o = containerClass(currentData) o = containerClass(currentData)
self.object_was_parsed(o) self.object_was_parsed(o)
def object_was_parsed(self, o): def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree.""" """Add an object to the parse tree."""
o.setup(self.currentTag, self.previous_element) parent = parent or self.currentTag
if self.previous_element: most_recent_element = most_recent_element or self._most_recent_element
self.previous_element.next_element = o o.setup(parent, most_recent_element)
self.previous_element = o if most_recent_element is not None:
self.currentTag.contents.append(o) most_recent_element.next_element = o
self._most_recent_element = o
parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True): def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent """Pops the tag stack up to and including the most recent
@ -295,12 +297,12 @@ class BeautifulSoup(Tag):
return None return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element) self.currentTag, self._most_recent_element)
if tag is None: if tag is None:
return tag return tag
if self.previous_element: if self._most_recent_element:
self.previous_element.next_element = tag self._most_recent_element.next_element = tag
self.previous_element = tag self._most_recent_element = tag
self.pushTag(tag) self.pushTag(tag)
return tag return tag
@ -333,6 +335,10 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode( return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter) indent_level, eventual_encoding, formatter)
# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup): class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser.""" """Deprecated interface to an XML parser."""

View file

@ -152,7 +152,7 @@ class TreeBuilder(object):
tag_specific = self.cdata_list_attributes.get( tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), []) tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific): for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs): if cdata_list_attr in attrs:
# Basically, we have a "class" attribute whose # Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS # value is a whitespace-separated list of CSS
# classes. Split it into a list. # classes. Split it into a list.

View file

@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
old_element = self.element.contents[-1] old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element) new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element) old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else: else:
self.element.append(node.element) self.soup.object_was_parsed(node.element, parent=self.element)
node.parent = self
def getAttributes(self): def getAttributes(self):
return AttrList(self.element) return AttrList(self.element)

View file

@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
# it's fixed. # it's fixed.
if name.startswith('x'): if name.startswith('x'):
real_name = int(name.lstrip('x'), 16) real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else: else:
real_name = int(name) real_name = int(name)
@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData() self.soup.endData()
if data.startswith("DOCTYPE "): if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):] data = data[len("DOCTYPE "):]
elif data == 'DOCTYPE':
# i.e. "<!DOCTYPE>"
data = ''
self.soup.handle_data(data) self.soup.handle_data(data)
self.soup.endData(Doctype) self.soup.endData(Doctype)

View file

@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder', 'LXMLTreeBuilder',
] ]
from io import BytesIO
from StringIO import StringIO from StringIO import StringIO
import collections import collections
from lxml import etree from lxml import etree
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
CHUNK_SIZE = 512 CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property @property
def default_parser(self): def default_parser(self):
# This can either return a parser object or a class, which # This can either return a parser object or a class, which
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False) parser = parser(target=self, strip_cdata=False)
self.parser = parser self.parser = parser
self.soup = None self.soup = None
self.nsmaps = None self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag): def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag # Split the namespace URL out of a fully-qualified lxml tag
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
dammit.contains_replacement_characters) dammit.contains_replacement_characters)
def feed(self, markup): def feed(self, markup):
if isinstance(markup, basestring): if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup) markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty, # Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized. # or the parser won't be initialized.
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close() self.parser.close()
def close(self): def close(self):
self.nsmaps = None self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}): def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs) attrs = dict(attrs)
nsprefix = None nsprefix = None
# Invert each namespace map as it comes in. # Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None: if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but namespaces # There are no new namespaces for this tag, but
# are in play, so we need a separate tag stack to know # non-default namespaces are in play, so we need a
# when they end. # separate tag stack to know when they end.
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items()) inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap) self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
"xmlns", prefix, "http://www.w3.org/2000/xmlns/") "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace attrs[attribute] = namespace
if self.nsmaps is not None and len(self.nsmaps) > 0: # Namespaces are in play. Find any attributes that came in
# Namespaces are in play. Find any attributes that came in # from lxml with namespaces attached to their names, and
# from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects.
# turn then into NamespacedAttribute objects. new_attrs = {}
new_attrs = {} for attr, value in attrs.items():
for attr, value in attrs.items(): namespace, attr = self._getNsTag(attr)
namespace, attr = self._getNsTag(attr) if namespace is None:
if namespace is None: new_attrs[attr] = value
new_attrs[attr] = value else:
else: nsprefix = self._prefix_for_namespace(namespace)
nsprefix = self._prefix_for_namespace(namespace) attr = NamespacedAttribute(nsprefix, attr, namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace) new_attrs[attr] = value
new_attrs[attr] = value attrs = new_attrs
attrs = new_attrs
namespace, name = self._getNsTag(name) namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace) nsprefix = self._prefix_for_namespace(namespace)
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
for inverted_nsmap in reversed(self.nsmaps): for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap: if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace] return inverted_nsmap[namespace]
return None
def end(self, name): def end(self, name):
self.soup.endData() self.soup.endData()
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
nsprefix = inverted_nsmap[namespace] nsprefix = inverted_nsmap[namespace]
break break
self.soup.handle_endtag(name, nsprefix) self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None: if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace # This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack. # mapping, so pop it off the stack.
self.nsmaps.pop() self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data): def pi(self, target, data):
pass pass

View file

@ -81,6 +81,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")") ")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@classmethod @classmethod
def _substitute_html_entity(cls, matchobj): def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@ -134,6 +136,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False): def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters. """Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign
will become &lt;, the greater-than sign will become &gt;,
and any ampersands will become &amp;. If you want ampersands
that appear to be part of an entity definition to be left
alone, use substitute_xml_containing_entities() instead.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets and ampersands.
value = cls.AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_xml_containing_entities(
cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will :param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity defition will ampersands that are not part of an entity defition will
@ -151,6 +175,7 @@ class EntitySubstitution(object):
value = cls.quoted_attribute_value(value) value = cls.quoted_attribute_value(value)
return value return value
@classmethod @classmethod
def substitute_html(cls, s): def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities. """Replace certain Unicode characters with named HTML entities.
@ -273,7 +298,6 @@ class UnicodeDammit:
return None return None
self.tried_encodings.append((proposed, errors)) self.tried_encodings.append((proposed, errors))
markup = self.markup markup = self.markup
# Convert smart quotes to HTML if coming from an encoding # Convert smart quotes to HTML if coming from an encoding
# that might have them. # that might have them.
if (self.smart_quotes_to is not None if (self.smart_quotes_to is not None

178
lib/bs4/diagnose.py Normal file
View file

@ -0,0 +1,178 @@
"""Diagnostic functions, mainly for use when doing tech support."""
from StringIO import StringIO
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
import os
import random
import time
import traceback
import sys
import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__
print "Python version %s" % sys.version
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
for builder in builder_registry.builders:
if name in builder.features:
break
else:
basic_parsers.remove(name)
print (
"I noticed that %s is not installed. Installing it may help." %
name)
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
if 'html5lib' in basic_parsers:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "Here's what %s did with the markup:" % parser
print soup.prettify()
print "-" * 80
def lxml_trace(data, html=True):
    """Dump the raw lxml events produced while parsing some markup.

    Useful for seeing exactly what lxml does with a document before any
    Beautiful Soup code gets involved.

    :param data: A string of markup.
    :param html: If true, parse as HTML; otherwise parse as XML.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html)
    for event, element in events:
        line = "%s, %4s, %s" % (event, element.tag, element.text)
        print(line)
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single choke point for output, so it could be redirected.
        print(s)

    def handle_starttag(self, name, attrs):
        message = "%s START" % name
        self._p(message)

    def handle_endtag(self, name):
        message = "%s END" % name
        self._p(message)

    def handle_data(self, data):
        message = "%s DATA" % data
        self._p(message)

    def handle_charref(self, name):
        message = "%s CHARREF" % name
        self._p(message)

    def handle_entityref(self, name):
        message = "%s ENTITYREF" % name
        self._p(message)

    def handle_comment(self, data):
        message = "%s COMMENT" % data
        self._p(message)

    def handle_decl(self, data):
        message = "%s DECL" % data
        self._p(message)

    def unknown_decl(self, data):
        message = "%s UNKNOWN-DECL" % data
        self._p(message)

    def handle_pi(self, data):
        message = "%s PI" % data
        self._p(message)
def htmlparser_trace(data):
    """Show the HTMLParser events generated while parsing some markup.

    This reveals how HTMLParser handles a document when no Beautiful
    Soup code is running.

    :param data: A string of markup.
    """
    AnnouncingParser().feed(data)
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"
def rword(length=5):
"Generate a random word-like string."
s = ''
for i in range(length):
if i % 2 == 0:
t = _consonants
else:
t = _vowels
s += random.choice(t)
return s
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
elements = []
for i in range(num_elements):
choice = random.randint(0,3)
if choice == 0:
# New tag.
tag_name = random.choice(tag_names)
elements.append("<%s>" % tag_name)
elif choice == 1:
elements.append(rsentence(random.randint(1,4)))
elif choice == 2:
# Close a tag.
tag_name = random.choice(tag_names)
elements.append("</%s>" % tag_name)
return "<html>" + "\n".join(elements) + "</html>"
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
try:
a = time.time()
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
if __name__ == '__main__':
    # Script entry point: read markup from standard input and run the
    # diagnostic suite on it.
    diagnose(sys.stdin.read())

View file

@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None): def __new__(cls, prefix, name, namespace=None):
if name is None: if name is None:
obj = unicode.__new__(cls, prefix) obj = unicode.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
obj = unicode.__new__(cls, name)
else: else:
obj = unicode.__new__(cls, prefix + ":" + name) obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix obj.prefix = prefix
@ -78,6 +81,40 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value) return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
"""Entity substitution rules that are aware of some HTML quirks.
Specifically, the contents of <script> and <style> tags should not
undergo entity substitution.
Incoming NavigableString objects are checked to see if they're the
direct children of a <script> or <style> tag.
"""
cdata_containing_tags = set(["script", "style"])
preformatted_tags = set(["pre"])
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in cls.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return f(ns)
@classmethod
def substitute_html(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_html)
@classmethod
def substitute_xml(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
class PageElement(object): class PageElement(object):
"""Contains the navigational information for some part of the page """Contains the navigational information for some part of the page
@ -94,25 +131,60 @@ class PageElement(object):
# converted to entities. This is not recommended, but it's # converted to entities. This is not recommended, but it's
# faster than "minimal". # faster than "minimal".
# A function - This function will be called on every string that # A function - This function will be called on every string that
# needs to undergo entity substition # needs to undergo entity substitution.
FORMATTERS = { #
# In an HTML document, the default "html" and "minimal" functions
# will leave the contents of <script> and <style> tags alone. For
# an XML document, all tags will be given the same treatment.
HTML_FORMATTERS = {
"html" : HTMLAwareEntitySubstitution.substitute_html,
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
None : None
}
XML_FORMATTERS = {
"html" : EntitySubstitution.substitute_html, "html" : EntitySubstitution.substitute_html,
"minimal" : EntitySubstitution.substitute_xml, "minimal" : EntitySubstitution.substitute_xml,
None : None None : None
} }
@classmethod
def format_string(self, s, formatter='minimal'): def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter.""" """Format the given string using the given formatter."""
if not callable(formatter): if not callable(formatter):
formatter = self.FORMATTERS.get( formatter = self._formatter_for_name(formatter)
formatter, EntitySubstitution.substitute_xml)
if formatter is None: if formatter is None:
output = s output = s
else: else:
output = formatter(s) output = formatter(s)
return output return output
@property
def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree?
This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on
the contents of <script> and <style> tags, or not). It's
inefficient, but it should be called very rarely.
"""
if self.parent is None:
# This is the top-level object. It should have .is_xml set
# from tree creation. If not, take a guess--BS is usually
# used on HTML markup.
return getattr(self, 'is_xml', False)
return self.parent._is_xml
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
return self.XML_FORMATTERS.get(
name, EntitySubstitution.substitute_xml)
else:
return self.HTML_FORMATTERS.get(
name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None): def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and """Sets up the initial relations between this element and
other elements.""" other elements."""
@ -366,7 +438,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different # NOTE: We can't use _find_one because findParents takes a different
# set of arguments. # set of arguments.
r = None r = None
l = self.find_parents(name, attrs, 1) l = self.find_parents(name, attrs, 1, **kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
@ -495,6 +567,14 @@ class PageElement(object):
value =" ".join(value) value =" ".join(value)
return value return value
def _tag_name_matches_and(self, function, tag_name):
if not tag_name:
return function
else:
def _match(tag):
return tag.name == tag_name and function(tag)
return _match
def _attribute_checker(self, operator, attribute, value=''): def _attribute_checker(self, operator, attribute, value=''):
"""Create a function that performs a CSS selector operation. """Create a function that performs a CSS selector operation.
@ -536,87 +616,6 @@ class PageElement(object):
else: else:
return lambda el: el.has_attr(attribute) return lambda el: el.has_attr(attribute)
def select(self, selector):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
for index, token in enumerate(tokens):
if tokens[index - 1] == '>':
# already found direct descendants in last step. skip this
# step.
continue
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag, attribute, operator, value = m.groups()
if not tag:
tag = True
checker = self._attribute_checker(operator, attribute, value)
found = []
for context in current_context:
found.extend(
[el for el in context.find_all(tag) if checker(el)])
current_context = found
continue
if '#' in token:
# ID selector
tag, id = token.split('#', 1)
if tag == "":
tag = True
el = current_context[0].find(tag, {'id': id})
if el is None:
return [] # No match
current_context = [el]
continue
if '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
if not tag_name:
tag_name = True
classes = set(klass.split('.'))
found = []
def classes_match(tag):
if tag_name is not True and tag.name != tag_name:
return False
if not tag.has_attr('class'):
return False
return classes.issubset(tag['class'])
for context in current_context:
found.extend(context.find_all(classes_match))
current_context = found
continue
if token == '*':
# Star selector
found = []
for context in current_context:
found.extend(context.findAll(True))
current_context = found
continue
if token == '>':
# Child selector
tag = tokens[index + 1]
if not tag:
tag = True
found = []
for context in current_context:
found.extend(context.find_all(tag, recursive=False))
current_context = found
continue
# Here we should just have a regular tag
if not self.tag_name_re.match(token):
return []
found = []
for context in current_context:
found.extend(context.findAll(token))
current_context = found
return current_context
# Old non-property versions of the generators, for backwards # Old non-property versions of the generators, for backwards
# compatibility with BS3. # compatibility with BS3.
def nextGenerator(self): def nextGenerator(self):
@ -652,6 +651,9 @@ class NavigableString(unicode, PageElement):
return unicode.__new__(cls, value) return unicode.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __copy__(self):
return self
def __getnewargs__(self): def __getnewargs__(self):
return (unicode(self),) return (unicode(self),)
@ -709,7 +711,7 @@ class Doctype(PreformattedString):
@classmethod @classmethod
def for_name_and_ids(cls, name, pub_id, system_id): def for_name_and_ids(cls, name, pub_id, system_id):
value = name value = name or ''
if pub_id is not None: if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id value += ' PUBLIC "%s"' % pub_id
if system_id is not None: if system_id is not None:
@ -803,16 +805,24 @@ class Tag(PageElement):
self.clear() self.clear()
self.append(string.__class__(string)) self.append(string.__class__(string))
def _all_strings(self, strip=False): def _all_strings(self, strip=False, types=(NavigableString, CData)):
"""Yield all child strings, possibly stripping them.""" """Yield all strings of certain classes, possibly stripping them.
By default, yields only NavigableString and CData objects. So
no comments, processing instructions, etc.
"""
for descendant in self.descendants: for descendant in self.descendants:
if not isinstance(descendant, NavigableString): if (
(types is None and not isinstance(descendant, NavigableString))
or
(types is not None and type(descendant) not in types)):
continue continue
if strip: if strip:
descendant = descendant.strip() descendant = descendant.strip()
if len(descendant) == 0: if len(descendant) == 0:
continue continue
yield descendant yield descendant
strings = property(_all_strings) strings = property(_all_strings)
@property @property
@ -820,11 +830,13 @@ class Tag(PageElement):
for string in self._all_strings(True): for string in self._all_strings(True):
yield string yield string
def get_text(self, separator=u"", strip=False): def get_text(self, separator=u"", strip=False,
types=(NavigableString, CData)):
""" """
Get all child strings, concatenated using the given separator. Get all child strings, concatenated using the given separator.
""" """
return separator.join([s for s in self._all_strings(strip)]) return separator.join([s for s in self._all_strings(
strip, types=types)])
getText = get_text getText = get_text
text = property(get_text) text = property(get_text)
@ -835,6 +847,7 @@ class Tag(PageElement):
while i is not None: while i is not None:
next = i.next_element next = i.next_element
i.__dict__.clear() i.__dict__.clear()
i.contents = []
i = next i = next
def clear(self, decompose=False): def clear(self, decompose=False):
@ -966,6 +979,13 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter) u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors) return u.encode(encoding, errors)
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
return (
indent_level is not None and
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
or self._is_xml))
def decode(self, indent_level=None, def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal"):
@ -978,6 +998,12 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's document contains a <META> tag that mentions the document's
encoding. encoding.
""" """
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
attrs = [] attrs = []
if self.attrs: if self.attrs:
for key, val in sorted(self.attrs.items()): for key, val in sorted(self.attrs.items()):
@ -1010,12 +1036,15 @@ class Tag(PageElement):
else: else:
closeTag = '</%s%s>' % (prefix, self.name) closeTag = '</%s%s>' % (prefix, self.name)
pretty_print = (indent_level is not None) pretty_print = self._should_pretty_print(indent_level)
space = ''
indent_space = ''
if indent_level is not None:
indent_space = (' ' * (indent_level - 1))
if pretty_print: if pretty_print:
space = (' ' * (indent_level - 1)) space = indent_space
indent_contents = indent_level + 1 indent_contents = indent_level + 1
else: else:
space = ''
indent_contents = None indent_contents = None
contents = self.decode_contents( contents = self.decode_contents(
indent_contents, eventual_encoding, formatter) indent_contents, eventual_encoding, formatter)
@ -1028,8 +1057,10 @@ class Tag(PageElement):
attribute_string = '' attribute_string = ''
if attrs: if attrs:
attribute_string = ' ' + ' '.join(attrs) attribute_string = ' ' + ' '.join(attrs)
if pretty_print: if indent_level is not None:
s.append(space) # Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % ( s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close)) prefix, self.name, attribute_string, close))
if pretty_print: if pretty_print:
@ -1040,7 +1071,10 @@ class Tag(PageElement):
if pretty_print and closeTag: if pretty_print and closeTag:
s.append(space) s.append(space)
s.append(closeTag) s.append(closeTag)
if pretty_print and closeTag and self.next_sibling: if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n") s.append("\n")
s = ''.join(s) s = ''.join(s)
return s return s
@ -1063,6 +1097,11 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's document contains a <META> tag that mentions the document's
encoding. encoding.
""" """
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None) pretty_print = (indent_level is not None)
s = [] s = []
for c in self: for c in self:
@ -1072,13 +1111,13 @@ class Tag(PageElement):
elif isinstance(c, Tag): elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding, s.append(c.decode(indent_level, eventual_encoding,
formatter)) formatter))
if text and indent_level: if text and indent_level and not self.name == 'pre':
text = text.strip() text = text.strip()
if text: if text:
if pretty_print: if pretty_print and not self.name == 'pre':
s.append(" " * (indent_level - 1)) s.append(" " * (indent_level - 1))
s.append(text) s.append(text)
if pretty_print: if pretty_print and not self.name == 'pre':
s.append("\n") s.append("\n")
return ''.join(s) return ''.join(s)
@ -1145,6 +1184,207 @@ class Tag(PageElement):
yield current yield current
current = current.next_element current = current.next_element
# CSS selector code
_selector_combinators = ['>', '+', '~']
_select_debug = False
def select(self, selector, _candidate_generator=None):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
if tokens[-1] in self._selector_combinators:
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
print 'Running CSS selector "%s"' % selector
for index, token in enumerate(tokens):
if self._select_debug:
print ' Considering token "%s"' % token
recursive_candidate_generator = None
tag_name = None
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
# selector. Candidates are generated by the active
# iterator.
checker = None
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag_name, attribute, operator, value = m.groups()
checker = self._attribute_checker(operator, attribute, value)
elif '#' in token:
# ID selector
tag_name, tag_id = token.split('#', 1)
def id_matches(tag):
return tag.get('id', None) == tag_id
checker = id_matches
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
classes = set(klass.split('.'))
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
elif ':' in token:
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is not None:
pseudo_type, pseudo_value = pseudo_attributes.groups()
if pseudo_type == 'nth-of-type':
try:
pseudo_value = int(pseudo_value)
except:
raise NotImplementedError(
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudo-class value must be at least 1.')
class Counter(object):
def __init__(self, destination):
self.count = 0
self.destination = destination
def nth_child_of_type(self, tag):
self.count += 1
if self.count == self.destination:
return True
if self.count > self.destination:
# Stop the generator that's sending us
# these things.
raise StopIteration()
return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.')
elif token == '*':
# Star selector -- matches everything
pass
elif token == '>':
# Run the next token as a CSS selector against the
# direct children of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.children
elif token == '~':
# Run the next token as a CSS selector against the
# siblings of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.next_siblings
elif token == '+':
# For each tag in the current context, run the next
# token as a CSS selector against the tag's next
# sibling that's a tag.
def next_tag_sibling(tag):
yield tag.find_next_sibling(True)
recursive_candidate_generator = next_tag_sibling
elif self.tag_name_re.match(token):
# Just a tag name.
tag_name = token
else:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator:
# This happens when the selector looks like "> foo".
#
# The generator calls select() recursively on every
# member of the current context, passing in a different
# candidate generator and a different selector.
#
# In the case of "> foo", the candidate generator is
# one that yields a tag's direct children (">"), and
# the selector is "foo".
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
yield i
if self._select_debug:
print '-' * 40
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
# children. If tag_name is defined, only yield tags
# with that name.
if self._select_debug:
if tag_name:
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
# debug log.
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if tag_name and not child.name == tag_name:
continue
yield child
_use_candidate_generator = default_candidate_generator
else:
_use_candidate_generator = lambda tag: tag.descendants
else:
_use_candidate_generator = _candidate_generator
new_context = []
new_context_ids = set([])
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
if tag_name and candidate.name != tag_name:
continue
if checker is not None:
try:
result = checker(candidate)
except StopIteration:
# The checker has decided we should no longer
# run the generator.
break
if checker is None or result:
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context
if self._select_debug:
print "Final verdict:"
for i in current_context:
print " %s %s" % (i.name, i.attrs)
return current_context
# Old names for backwards compatibility # Old names for backwards compatibility
def childGenerator(self): def childGenerator(self):
return self.children return self.children
@ -1152,10 +1392,13 @@ class Tag(PageElement):
def recursiveChildGenerator(self): def recursiveChildGenerator(self):
return self.descendants return self.descendants
# This was kind of misleading because has_key() (attributes) was def has_key(self, key):
# different from __in__ (contents). has_key() is gone in Python 3, """This was kind of misleading because has_key() (attributes)
# anyway. was different from __in__ (contents). has_key() is gone in
has_key = has_attr Python 3, anyway."""
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
key))
return self.has_attr(key)
# Next, a couple classes to represent queries and their results. # Next, a couple classes to represent queries and their results.
class SoupStrainer(object): class SoupStrainer(object):

View file

@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertDoctypeHandled( self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_public_doctype_with_url(self): def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype) self.assertDoctypeHandled(doctype)
@ -159,6 +164,12 @@ class HTMLTreeBuilderSmokeTest(object):
comment = soup.find(text="foobar") comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment) self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self): def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags.""" """Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>") self.assertSoupEquals("<pre> </pre>")
@ -217,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self): def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect) self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect) self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self): def test_quot_entity_converted_to_quotation_mark(self):
@ -235,6 +248,12 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect) self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
def test_basic_namespaces(self): def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the """Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose very least they should not choke on namespaces or lose
@ -453,6 +472,18 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual( self.assertEqual(
soup.encode("utf-8"), markup) soup.encode("utf-8"), markup)
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
</script>
"""
soup = BeautifulSoup(doc, "xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
encoded = soup.encode()
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_popping_namespaced_tag(self): def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup) soup = self.soup(markup)
@ -495,6 +526,11 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(unicode(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5.""" """Smoke test for a tree builder that supports HTML5."""
@ -523,6 +559,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace) self.assertEqual(namespace, soup.msqrt.namespace)
def test_xml_declaration_becomes_comment(self):
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
soup = self.soup(markup)
self.assertTrue(isinstance(soup.contents[0], Comment))
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
self.assertEqual("html", soup.contents[0].next_element.name)
def skipIf(condition, reason): def skipIf(condition, reason):
def nothing(test, *args, **kwargs): def nothing(test, *args, **kwargs):

View file

@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"<table><thead><tr><td>Foo</td></tr></thead>" "<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>" "<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>") "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual(b"<p>foo</p>", soup.p.encode())

View file

@ -6,8 +6,11 @@ import warnings
try: try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True LXML_PRESENT = True
import lxml.etree
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e: except ImportError, e:
LXML_PRESENT = False LXML_PRESENT = False
LXML_VERSION = (0,)
from bs4 import ( from bs4 import (
BeautifulSoup, BeautifulSoup,
@ -41,6 +44,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals( self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>") "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_beautifulstonesoup_is_xml_parser(self): def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder # Make sure that the deprecated BSS class uses an xml builder
# if one is installed. # if one is installed.
@ -72,4 +86,3 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property @property
def default_builder(self): def default_builder(self):
return LXMLTreeBuilderForXML() return LXMLTreeBuilderForXML()

View file

@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase):
def test_xml_quoting_handles_ampersands(self): def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T") self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual( self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"), self.sub.substitute_xml("&Aacute;T&T"),
"&amp;Aacute;T&amp;T")
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T") "&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self): def test_quotes_not_html_substituted(self):

View file

@ -20,6 +20,7 @@ from bs4.builder import (
) )
from bs4.element import ( from bs4.element import (
CData, CData,
Comment,
Doctype, Doctype,
NavigableString, NavigableString,
SoupStrainer, SoupStrainer,
@ -425,6 +426,7 @@ class TestParentOperations(TreeTest):
def test_find_parent(self): def test_find_parent(self):
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
def test_parent_of_text_element(self): def test_parent_of_text_element(self):
text = self.tree.find(text="Start here") text = self.tree.find(text="Start here")
@ -687,6 +689,12 @@ class TestTagCreation(SoupTest):
self.assertEqual("foo", s) self.assertEqual("foo", s)
self.assertTrue(isinstance(s, NavigableString)) self.assertTrue(isinstance(s, NavigableString))
def test_new_string_can_create_navigablestring_subclass(self):
soup = self.soup("")
s = soup.new_string("foo", Comment)
self.assertEqual("foo", s)
self.assertTrue(isinstance(s, Comment))
class TestTreeModification(SoupTest): class TestTreeModification(SoupTest):
def test_attribute_modification(self): def test_attribute_modification(self):
@ -1048,7 +1056,7 @@ class TestTreeModification(SoupTest):
# clear using decompose() # clear using decompose()
em = a.em em = a.em
a.clear(decompose=True) a.clear(decompose=True)
self.assertFalse(hasattr(em, "contents")) self.assertEqual(0, len(em.contents))
def test_string_set(self): def test_string_set(self):
"""Tag.string = 'string'""" """Tag.string = 'string'"""
@ -1166,6 +1174,19 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ") self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
def test_get_text_ignores_comments(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(soup.get_text(), "foobar")
self.assertEqual(
soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
def test_all_strings_ignores_comments(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
class TestCDAtaListAttributes(SoupTest): class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'. """Testing cdata-list attributes like 'class'.
@ -1310,6 +1331,32 @@ class TestSubstitutions(SoupTest):
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self):
doc = """
<script type="text/javascript">
console.log("< < hey > > ");
</script>
"""
encoded = BeautifulSoup(doc).encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_formatter_skips_style_tag_for_html_documents(self):
doc = """
<style type="text/css">
console.log("< < hey > > ");
</style>
"""
encoded = BeautifulSoup(doc).encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify())
def test_prettify_accepts_formatter(self): def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>") soup = BeautifulSoup("<html><body>foo</body></html>")
pretty = soup.prettify(formatter = lambda x: x.upper()) pretty = soup.prettify(formatter = lambda x: x.upper())
@ -1459,7 +1506,7 @@ class TestSoupSelector(TreeTest):
</head> </head>
<body> <body>
<div id="main"> <div id="main" class="fancy">
<div id="inner"> <div id="inner">
<h1 id="header1">An H1</h1> <h1 id="header1">An H1</h1>
<p>Some text</p> <p>Some text</p>
@ -1531,7 +1578,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0) self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self): def test_invalid_tag(self):
self.assertEqual(len(self.soup.select('tag%t')), 0) self.assertRaises(ValueError, self.soup.select, 'tag%t')
def test_header_tags(self): def test_header_tags(self):
self.assertSelectMultiple( self.assertSelectMultiple(
@ -1564,7 +1611,7 @@ class TestSoupSelector(TreeTest):
for el in els: for el in els:
self.assertEqual(el.name, 'p') self.assertEqual(el.name, 'p')
self.assertEqual(els[1]['class'], ['onep']) self.assertEqual(els[1]['class'], ['onep'])
self.assertFalse(els[0].has_key('class')) self.assertFalse(els[0].has_attr('class'))
def test_a_bunch_of_emptys(self): def test_a_bunch_of_emptys(self):
for selector in ('div#main del', 'div#main div.oops', 'div div#main'): for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
@ -1584,6 +1631,9 @@ class TestSoupSelector(TreeTest):
self.assertSelects('.s1 > a', ['s1a1', 's1a2']) self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
self.assertSelects('.s1 > a span', ['s1a2s1']) self.assertSelects('.s1 > a span', ['s1a2s1'])
def test_child_selector_id(self):
self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self): def test_attribute_equals(self):
self.assertSelectMultiple( self.assertSelectMultiple(
('p[class="onep"]', ['p1']), ('p[class="onep"]', ['p1']),
@ -1690,6 +1740,33 @@ class TestSoupSelector(TreeTest):
('p[blah]', []), ('p[blah]', []),
) )
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another')
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
# Pass in an invalid value.
self.assertRaises(
ValueError, self.soup.select, 'div p:nth-of-type(0)')
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
def test_select_on_element(self): def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element # Other tests operate on the tree; this operates on an element
# within the tree. # within the tree.
@ -1698,3 +1775,26 @@ class TestSoupSelector(TreeTest):
# The <div id="inner"> tag was selected. The <div id="footer"> # The <div id="inner"> tag was selected. The <div id="footer">
# tag was not. # tag was not.
self.assertSelectsIDs(selected, ['inner']) self.assertSelectsIDs(selected, ['inner'])
def test_overspecified_child_id(self):
self.assertSelects(".fancy #inner", ['inner'])
self.assertSelects(".normal #inner", [])
def test_adjacent_sibling_selector(self):
self.assertSelects('#p1 + h2', ['header2'])
self.assertSelects('#p1 + h2 + p', ['pmulti'])
self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
self.assertEqual([], self.soup.select('#p1 + p'))
def test_general_sibling_selector(self):
self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
self.assertSelects('#p1 ~ #header2', ['header2'])
self.assertSelects('#p1 ~ h2 + a', ['me'])
self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
self.assertRaises(ValueError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])