Updated bundled version of BS4
This commit is contained in:
parent
5d30398bc1
commit
2182d5a0fd
|
@ -0,0 +1,43 @@
|
|||
Behold, mortal, the origins of Beautiful Soup...
|
||||
================================================
|
||||
|
||||
Leonard Richardson is the primary programmer.
|
||||
|
||||
Aaron DeVore is awesome.
|
||||
|
||||
Mark Pilgrim provided the encoding detection code that forms the base
|
||||
of UnicodeDammit.
|
||||
|
||||
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
|
||||
Soup 4 working under Python 3.
|
||||
|
||||
Simon Willison wrote soupselect, which was used to make Beautiful Soup
|
||||
support CSS selectors.
|
||||
|
||||
Sam Ruby helped with a lot of edge cases.
|
||||
|
||||
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
|
||||
work in solving the nestable tags conundrum.
|
||||
|
||||
An incomplete list of people have contributed patches to Beautiful
|
||||
Soup:
|
||||
|
||||
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
|
||||
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
|
||||
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
|
||||
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
|
||||
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
|
||||
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
|
||||
Webster, Paul Wright, Danny Yoo
|
||||
|
||||
An incomplete list of people who made suggestions or found bugs or
|
||||
found ways to break Beautiful Soup:
|
||||
|
||||
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
|
||||
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
|
||||
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
|
||||
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
|
||||
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
|
||||
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
|
||||
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
|
||||
Sousa Rocha, Yichun Wei, Per Vognsen
|
|
@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
|||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.1.3"
|
||||
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
||||
__version__ = "4.2.1"
|
||||
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
@ -201,9 +201,9 @@ class BeautifulSoup(Tag):
|
|||
"""Create a new tag associated with this soup."""
|
||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
||||
|
||||
def new_string(self, s, subclass=None):
    """Create a new NavigableString associated with this soup.

    :param s: The string content.
    :param subclass: An optional NavigableString subclass to
        instantiate instead of NavigableString itself.
    """
    # Resolve the default lazily rather than in the signature, so the
    # default does not have to exist when the function is defined and
    # callers may still pass any subclass explicitly.
    if subclass is None:
        subclass = NavigableString
    navigable = subclass(s)
    navigable.setup()
    return navigable
|
||||
|
||||
|
@ -245,13 +245,15 @@ class BeautifulSoup(Tag):
|
|||
o = containerClass(currentData)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
    """Add an object to the parse tree.

    :param o: A PageElement to attach.
    :param parent: The tag to attach it to; defaults to the tag
        currently being parsed.
    :param most_recent_element: The element most recently added to the
        tree; defaults to the soup's running bookmark.
    """
    parent = parent or self.currentTag
    most_recent_element = most_recent_element or self._most_recent_element
    o.setup(parent, most_recent_element)
    if most_recent_element is not None:
        # Thread the new object into the linear next_element chain.
        most_recent_element.next_element = o
    self._most_recent_element = o
    parent.contents.append(o)
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
|
@ -295,12 +297,12 @@ class BeautifulSoup(Tag):
|
|||
return None
|
||||
|
||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self.previous_element)
|
||||
self.currentTag, self._most_recent_element)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self.previous_element:
|
||||
self.previous_element.next_element = tag
|
||||
self.previous_element = tag
|
||||
if self._most_recent_element:
|
||||
self._most_recent_element.next_element = tag
|
||||
self._most_recent_element = tag
|
||||
self.pushTag(tag)
|
||||
return tag
|
||||
|
||||
|
@ -333,6 +335,10 @@ class BeautifulSoup(Tag):
|
|||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
|
||||
# Alias to make it easier to type import: 'from bs4 import _soup'
|
||||
_s = BeautifulSoup
|
||||
_soup = BeautifulSoup
|
||||
|
||||
class BeautifulStoneSoup(BeautifulSoup):
|
||||
"""Deprecated interface to an XML parser."""
|
||||
|
||||
|
|
|
@ -152,7 +152,7 @@ class TreeBuilder(object):
|
|||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), [])
|
||||
for cdata_list_attr in itertools.chain(universal, tag_specific):
|
||||
if cdata_list_attr in dict(attrs):
|
||||
if cdata_list_attr in attrs:
|
||||
# Basically, we have a "class" attribute whose
|
||||
# value is a whitespace-separated list of CSS
|
||||
# classes. Split it into a list.
|
||||
|
|
|
@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
|
|||
old_element = self.element.contents[-1]
|
||||
new_element = self.soup.new_string(old_element + node.element)
|
||||
old_element.replace_with(new_element)
|
||||
self.soup._most_recent_element = new_element
|
||||
else:
|
||||
self.element.append(node.element)
|
||||
node.parent = self
|
||||
self.soup.object_was_parsed(node.element, parent=self.element)
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
|
|
@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
# it's fixed.
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
real_name = int(name.lstrip('X'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
|
@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
|||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
elif data == 'DOCTYPE':
|
||||
# i.e. "<!DOCTYPE>"
|
||||
data = ''
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ __all__ = [
|
|||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
|
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||
|
||||
@property
|
||||
def default_parser(self):
|
||||
# This can either return a parser object or a class, which
|
||||
|
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
parser = parser(target=self, strip_cdata=False)
|
||||
self.parser = parser
|
||||
self.soup = None
|
||||
self.nsmaps = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
|
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, basestring):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, unicode):
|
||||
markup = StringIO(markup)
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
|
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
self.parser.close()
|
||||
|
||||
def close(self):
    """Reset the namespace stack to just the default XML namespace map.

    Resetting to a one-element stack (rather than None) lets start()
    and end() treat the stack uniformly between parses.
    """
    self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(nsmap) == 0 and self.nsmaps != None:
|
||||
# There are no new namespaces for this tag, but namespaces
|
||||
# are in play, so we need a separate tag stack to know
|
||||
# when they end.
|
||||
if len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
if self.nsmaps is None:
|
||||
self.nsmaps = []
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
|
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
|
||||
if self.nsmaps is not None and len(self.nsmaps) > 0:
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn them into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn them into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
|
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
|
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if self.nsmaps != None:
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
if len(self.nsmaps) == 0:
|
||||
# Namespaces are no longer in play, so don't bother keeping
|
||||
# track of the namespace stack.
|
||||
self.nsmaps = None
|
||||
|
||||
def pi(self, target, data):
    """Ignore processing instructions (lxml parser-target callback)."""
    return None
|
||||
|
|
|
@ -81,6 +81,8 @@ class EntitySubstitution(object):
|
|||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
||||
")")
|
||||
|
||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||
|
||||
@classmethod
|
||||
def _substitute_html_entity(cls, matchobj):
|
||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||
|
@ -134,6 +136,28 @@ class EntitySubstitution(object):
|
|||
def substitute_xml(cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign
|
||||
will become <, the greater-than sign will become >,
|
||||
and any ampersands will become &. If you want ampersands
|
||||
that appear to be part of an entity definition to be left
|
||||
alone, use substitute_xml_containing_entities() instead.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets and ampersands.
|
||||
value = cls.AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_xml_containing_entities(
|
||||
cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign will
|
||||
become <, the greater-than sign will become >, and any
|
||||
ampersands that are not part of an entity definition will
|
||||
|
@ -151,6 +175,7 @@ class EntitySubstitution(object):
|
|||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
|
||||
@classmethod
|
||||
def substitute_html(cls, s):
|
||||
"""Replace certain Unicode characters with named HTML entities.
|
||||
|
@ -273,7 +298,6 @@ class UnicodeDammit:
|
|||
return None
|
||||
self.tried_encodings.append((proposed, errors))
|
||||
markup = self.markup
|
||||
|
||||
# Convert smart quotes to HTML if coming from an encoding
|
||||
# that might have them.
|
||||
if (self.smart_quotes_to is not None
|
||||
|
|
|
@ -0,0 +1,178 @@
|
|||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
from StringIO import StringIO
|
||||
from HTMLParser import HTMLParser
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
import traceback
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
||||
print "Python version %s" % sys.version
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
for builder in builder_registry.builders:
|
||||
if name in builder.features:
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print (
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name)
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif os.path.exists(data):
|
||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||
data = open(data).read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||
return
|
||||
print
|
||||
|
||||
for parser in basic_parsers:
|
||||
print "Trying to parse your markup with %s" % parser
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, parser)
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "Here's what %s did with the markup:" % parser
|
||||
print soup.prettify()
|
||||
|
||||
print "-" * 80
|
||||
|
||||
def lxml_trace(data, html=True):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html)
    for event, element in events:
        print("%s, %4s, %s" % (event, element.tag, element.text))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        print(s)

    def _announce(self, label, payload):
        # All events are reported as "<payload> <LABEL>".
        self._p("%s %s" % (payload, label))

    def handle_starttag(self, name, attrs):
        self._announce("START", name)

    def handle_endtag(self, name):
        self._announce("END", name)

    def handle_data(self, data):
        self._announce("DATA", data)

    def handle_charref(self, name):
        self._announce("CHARREF", name)

    def handle_entityref(self, name):
        self._announce("ENTITYREF", name)

    def handle_comment(self, data):
        self._announce("COMMENT", data)

    def handle_decl(self, data):
        self._announce("DECL", data)

    def unknown_decl(self, data):
        self._announce("UNKNOWN-DECL", data)

    def handle_pi(self, data):
        self._announce("PI", data)
|
||||
|
||||
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    AnnouncingParser().feed(data)
|
||||
|
||||
_vowels = "aeiou"
|
||||
_consonants = "bcdfghjklmnpqrstvwxyz"
|
||||
|
||||
def rword(length=5):
|
||||
"Generate a random word-like string."
|
||||
s = ''
|
||||
for i in range(length):
|
||||
if i % 2 == 0:
|
||||
t = _consonants
|
||||
else:
|
||||
t = _vowels
|
||||
s += random.choice(t)
|
||||
return s
|
||||
|
||||
def rsentence(length=4):
    """Generate a random sentence-like string."""
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
|
||||
|
||||
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each element is, at random, an open tag, a run of text, a close
    tag, or nothing at all (one of the four choices adds nothing).
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag.
            elements.append("</%s>" % random.choice(tag_names))
    return "<html>" + "\n".join(elements) + "</html>"
|
||||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
||||
data = rdoc(num_elements)
|
||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
try:
|
||||
a = time.time()
|
||||
soup = BeautifulSoup(data, parser)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
||||
|
||||
if __name__ == '__main__':
    # Read markup from stdin and run the diagnostic suite on it.
    markup = sys.stdin.read()
    diagnose(markup)
|
|
@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
|
|||
def __new__(cls, prefix, name, namespace=None):
|
||||
if name is None:
|
||||
obj = unicode.__new__(cls, prefix)
|
||||
elif prefix is None:
|
||||
# Not really namespaced.
|
||||
obj = unicode.__new__(cls, name)
|
||||
else:
|
||||
obj = unicode.__new__(cls, prefix + ":" + name)
|
||||
obj.prefix = prefix
|
||||
|
@ -78,6 +81,40 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
|||
return match.group(1) + encoding
|
||||
return self.CHARSET_RE.sub(rewrite, self.original_value)
|
||||
|
||||
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution rules that are aware of some HTML quirks.

    Specifically, the contents of <script> and <style> tags should not
    undergo entity substitution.

    Incoming NavigableString objects are checked to see if they're the
    direct children of a <script> or <style> tag.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Apply `f` to `ns` unless `ns` sits directly inside a CDATA tag."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Do nothing.
            return ns
        # Substitute.
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_html)

    @classmethod
    def substitute_xml(cls, ns):
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_xml)
|
||||
|
||||
class PageElement(object):
|
||||
"""Contains the navigational information for some part of the page
|
||||
|
@ -94,25 +131,60 @@ class PageElement(object):
|
|||
# converted to entities. This is not recommended, but it's
|
||||
# faster than "minimal".
|
||||
# A function - This function will be called on every string that
|
||||
# needs to undergo entity substition
|
||||
FORMATTERS = {
|
||||
# needs to undergo entity substitution.
|
||||
#
|
||||
|
||||
# In an HTML document, the default "html" and "minimal" functions
|
||||
# will leave the contents of <script> and <style> tags alone. For
|
||||
# an XML document, all tags will be given the same treatment.
|
||||
|
||||
HTML_FORMATTERS = {
|
||||
"html" : HTMLAwareEntitySubstitution.substitute_html,
|
||||
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
|
||||
None : None
|
||||
}
|
||||
|
||||
XML_FORMATTERS = {
|
||||
"html" : EntitySubstitution.substitute_html,
|
||||
"minimal" : EntitySubstitution.substitute_xml,
|
||||
None : None
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def format_string(self, s, formatter='minimal'):
    """Format the given string using the given formatter.

    :param s: The string to format.
    :param formatter: A formatter name ("html", "minimal", None) or a
        callable that performs the substitution itself.
    """
    if not callable(formatter):
        # Map the name to a function appropriate for this tree
        # (HTML-aware or plain XML substitution).
        formatter = self._formatter_for_name(formatter)
    if formatter is None:
        output = s
    else:
        output = formatter(s)
    return output
|
||||
|
||||
@property
|
||||
def _is_xml(self):
|
||||
"""Is this element part of an XML tree or an HTML tree?
|
||||
|
||||
This is used when mapping a formatter name ("minimal") to an
|
||||
appropriate function (one that performs entity-substitution on
|
||||
the contents of <script> and <style> tags, or not). It's
|
||||
inefficient, but it should be called very rarely.
|
||||
"""
|
||||
if self.parent is None:
|
||||
# This is the top-level object. It should have .is_xml set
|
||||
# from tree creation. If not, take a guess--BS is usually
|
||||
# used on HTML markup.
|
||||
return getattr(self, 'is_xml', False)
|
||||
return self.parent._is_xml
|
||||
|
||||
def _formatter_for_name(self, name):
    """Look up a formatter function based on its name and the tree type."""
    if self._is_xml:
        table = self.XML_FORMATTERS
        fallback = EntitySubstitution.substitute_xml
    else:
        table = self.HTML_FORMATTERS
        fallback = HTMLAwareEntitySubstitution.substitute_xml
    return table.get(name, fallback)
|
||||
|
||||
def setup(self, parent=None, previous_element=None):
|
||||
"""Sets up the initial relations between this element and
|
||||
other elements."""
|
||||
|
@ -366,7 +438,7 @@ class PageElement(object):
|
|||
# NOTE: We can't use _find_one because findParents takes a different
|
||||
# set of arguments.
|
||||
r = None
|
||||
l = self.find_parents(name, attrs, 1)
|
||||
l = self.find_parents(name, attrs, 1, **kwargs)
|
||||
if l:
|
||||
r = l[0]
|
||||
return r
|
||||
|
@ -495,6 +567,14 @@ class PageElement(object):
|
|||
value =" ".join(value)
|
||||
return value
|
||||
|
||||
def _tag_name_matches_and(self, function, tag_name):
|
||||
if not tag_name:
|
||||
return function
|
||||
else:
|
||||
def _match(tag):
|
||||
return tag.name == tag_name and function(tag)
|
||||
return _match
|
||||
|
||||
def _attribute_checker(self, operator, attribute, value=''):
|
||||
"""Create a function that performs a CSS selector operation.
|
||||
|
||||
|
@ -536,87 +616,6 @@ class PageElement(object):
|
|||
else:
|
||||
return lambda el: el.has_attr(attribute)
|
||||
|
||||
def select(self, selector):
|
||||
"""Perform a CSS selection operation on the current element."""
|
||||
tokens = selector.split()
|
||||
current_context = [self]
|
||||
for index, token in enumerate(tokens):
|
||||
if tokens[index - 1] == '>':
|
||||
# already found direct descendants in last step. skip this
|
||||
# step.
|
||||
continue
|
||||
m = self.attribselect_re.match(token)
|
||||
if m is not None:
|
||||
# Attribute selector
|
||||
tag, attribute, operator, value = m.groups()
|
||||
if not tag:
|
||||
tag = True
|
||||
checker = self._attribute_checker(operator, attribute, value)
|
||||
found = []
|
||||
for context in current_context:
|
||||
found.extend(
|
||||
[el for el in context.find_all(tag) if checker(el)])
|
||||
current_context = found
|
||||
continue
|
||||
|
||||
if '#' in token:
|
||||
# ID selector
|
||||
tag, id = token.split('#', 1)
|
||||
if tag == "":
|
||||
tag = True
|
||||
el = current_context[0].find(tag, {'id': id})
|
||||
if el is None:
|
||||
return [] # No match
|
||||
current_context = [el]
|
||||
continue
|
||||
|
||||
if '.' in token:
|
||||
# Class selector
|
||||
tag_name, klass = token.split('.', 1)
|
||||
if not tag_name:
|
||||
tag_name = True
|
||||
classes = set(klass.split('.'))
|
||||
found = []
|
||||
def classes_match(tag):
|
||||
if tag_name is not True and tag.name != tag_name:
|
||||
return False
|
||||
if not tag.has_attr('class'):
|
||||
return False
|
||||
return classes.issubset(tag['class'])
|
||||
for context in current_context:
|
||||
found.extend(context.find_all(classes_match))
|
||||
current_context = found
|
||||
continue
|
||||
|
||||
if token == '*':
|
||||
# Star selector
|
||||
found = []
|
||||
for context in current_context:
|
||||
found.extend(context.findAll(True))
|
||||
current_context = found
|
||||
continue
|
||||
|
||||
if token == '>':
|
||||
# Child selector
|
||||
tag = tokens[index + 1]
|
||||
if not tag:
|
||||
tag = True
|
||||
|
||||
found = []
|
||||
for context in current_context:
|
||||
found.extend(context.find_all(tag, recursive=False))
|
||||
current_context = found
|
||||
continue
|
||||
|
||||
# Here we should just have a regular tag
|
||||
if not self.tag_name_re.match(token):
|
||||
return []
|
||||
found = []
|
||||
for context in current_context:
|
||||
found.extend(context.findAll(token))
|
||||
current_context = found
|
||||
return current_context
|
||||
|
||||
# Old non-property versions of the generators, for backwards
|
||||
# compatibility with BS3.
|
||||
def nextGenerator(self):
|
||||
|
@ -652,6 +651,9 @@ class NavigableString(unicode, PageElement):
|
|||
return unicode.__new__(cls, value)
|
||||
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||
|
||||
def __copy__(self):
|
||||
return self
|
||||
|
||||
def __getnewargs__(self):
    """Support pickling: re-create the string from its unicode value."""
    return (unicode(self),)
|
||||
|
||||
|
@ -709,7 +711,7 @@ class Doctype(PreformattedString):
|
|||
|
||||
@classmethod
|
||||
def for_name_and_ids(cls, name, pub_id, system_id):
|
||||
value = name
|
||||
value = name or ''
|
||||
if pub_id is not None:
|
||||
value += ' PUBLIC "%s"' % pub_id
|
||||
if system_id is not None:
|
||||
|
@ -803,16 +805,24 @@ class Tag(PageElement):
|
|||
self.clear()
|
||||
self.append(string.__class__(string))
|
||||
|
||||
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    By default, yields only NavigableString and CData objects, so no
    comments, processing instructions, etc.

    :param strip: If True, strings are stripped and empty ones skipped.
    :param types: A tuple of string classes to yield (matched by exact
        type), or None to yield every NavigableString subclass.
    """
    for descendant in self.descendants:
        if ((types is None and not isinstance(descendant, NavigableString))
            or (types is not None and type(descendant) not in types)):
            continue
        if strip:
            descendant = descendant.strip()
            if len(descendant) == 0:
                continue
        yield descendant

strings = property(_all_strings)
|
||||
|
||||
@property
|
||||
|
@ -820,11 +830,13 @@ class Tag(PageElement):
|
|||
for string in self._all_strings(True):
|
||||
yield string
|
||||
|
||||
def get_text(self, separator=u"", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Placed between every pair of strings.
    :param strip: If True, strings are stripped before joining.
    :param types: Passed through to _all_strings to select which
        string classes are included.
    """
    return separator.join([s for s in self._all_strings(
        strip, types=types)])
getText = get_text
text = property(get_text)
|
||||
|
||||
|
@ -835,6 +847,7 @@ class Tag(PageElement):
|
|||
while i is not None:
|
||||
next = i.next_element
|
||||
i.__dict__.clear()
|
||||
i.contents = []
|
||||
i = next
|
||||
|
||||
def clear(self, decompose=False):
|
||||
|
@ -966,6 +979,13 @@ class Tag(PageElement):
|
|||
u = self.decode(indent_level, encoding, formatter)
|
||||
return u.encode(encoding, errors)
|
||||
|
||||
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Pretty-printing is off entirely when no indent level is given,
    and suppressed inside HTML preformatted tags such as <pre>
    (which have no special meaning in XML trees).
    """
    if indent_level is None:
        return False
    preformatted = (
        self.name in HTMLAwareEntitySubstitution.preformatted_tags)
    return (not preformatted) or self._is_xml
|
||||
|
||||
def decode(self, indent_level=None,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
|
@ -978,6 +998,12 @@ class Tag(PageElement):
|
|||
document contains a <META> tag that mentions the document's
|
||||
encoding.
|
||||
"""
|
||||
|
||||
# First off, turn a string formatter into a function. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not callable(formatter):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
|
||||
attrs = []
|
||||
if self.attrs:
|
||||
for key, val in sorted(self.attrs.items()):
|
||||
|
@ -1010,12 +1036,15 @@ class Tag(PageElement):
|
|||
else:
|
||||
closeTag = '</%s%s>' % (prefix, self.name)
|
||||
|
||||
pretty_print = (indent_level is not None)
|
||||
pretty_print = self._should_pretty_print(indent_level)
|
||||
space = ''
|
||||
indent_space = ''
|
||||
if indent_level is not None:
|
||||
indent_space = (' ' * (indent_level - 1))
|
||||
if pretty_print:
|
||||
space = (' ' * (indent_level - 1))
|
||||
space = indent_space
|
||||
indent_contents = indent_level + 1
|
||||
else:
|
||||
space = ''
|
||||
indent_contents = None
|
||||
contents = self.decode_contents(
|
||||
indent_contents, eventual_encoding, formatter)
|
||||
|
@ -1028,8 +1057,10 @@ class Tag(PageElement):
|
|||
attribute_string = ''
|
||||
if attrs:
|
||||
attribute_string = ' ' + ' '.join(attrs)
|
||||
if pretty_print:
|
||||
s.append(space)
|
||||
if indent_level is not None:
|
||||
# Even if this particular tag is not pretty-printed,
|
||||
# we should indent up to the start of the tag.
|
||||
s.append(indent_space)
|
||||
s.append('<%s%s%s%s>' % (
|
||||
prefix, self.name, attribute_string, close))
|
||||
if pretty_print:
|
||||
|
@ -1040,7 +1071,10 @@ class Tag(PageElement):
|
|||
if pretty_print and closeTag:
|
||||
s.append(space)
|
||||
s.append(closeTag)
|
||||
if pretty_print and closeTag and self.next_sibling:
|
||||
if indent_level is not None and closeTag and self.next_sibling:
|
||||
# Even if this particular tag is not pretty-printed,
|
||||
# we're now done with the tag, and we should add a
|
||||
# newline if appropriate.
|
||||
s.append("\n")
|
||||
s = ''.join(s)
|
||||
return s
|
||||
|
@ -1063,6 +1097,11 @@ class Tag(PageElement):
|
|||
document contains a <META> tag that mentions the document's
|
||||
encoding.
|
||||
"""
|
||||
# First off, turn a string formatter into a function. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not callable(formatter):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
|
||||
pretty_print = (indent_level is not None)
|
||||
s = []
|
||||
for c in self:
|
||||
|
@ -1072,13 +1111,13 @@ class Tag(PageElement):
|
|||
elif isinstance(c, Tag):
|
||||
s.append(c.decode(indent_level, eventual_encoding,
|
||||
formatter))
|
||||
if text and indent_level:
|
||||
if text and indent_level and not self.name == 'pre':
|
||||
text = text.strip()
|
||||
if text:
|
||||
if pretty_print:
|
||||
if pretty_print and not self.name == 'pre':
|
||||
s.append(" " * (indent_level - 1))
|
||||
s.append(text)
|
||||
if pretty_print:
|
||||
if pretty_print and not self.name == 'pre':
|
||||
s.append("\n")
|
||||
return ''.join(s)
|
||||
|
||||
|
@ -1145,6 +1184,207 @@ class Tag(PageElement):
|
|||
yield current
|
||||
current = current.next_element
|
||||
|
||||
# CSS selector code
|
||||
|
||||
_selector_combinators = ['>', '+', '~']
|
||||
_select_debug = False
|
||||
def select(self, selector, _candidate_generator=None):
|
||||
"""Perform a CSS selection operation on the current element."""
|
||||
tokens = selector.split()
|
||||
current_context = [self]
|
||||
|
||||
if tokens[-1] in self._selector_combinators:
|
||||
raise ValueError(
|
||||
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
||||
if self._select_debug:
|
||||
print 'Running CSS selector "%s"' % selector
|
||||
for index, token in enumerate(tokens):
|
||||
if self._select_debug:
|
||||
print ' Considering token "%s"' % token
|
||||
recursive_candidate_generator = None
|
||||
tag_name = None
|
||||
if tokens[index-1] in self._selector_combinators:
|
||||
# This token was consumed by the previous combinator. Skip it.
|
||||
if self._select_debug:
|
||||
print ' Token was consumed by the previous combinator.'
|
||||
continue
|
||||
# Each operation corresponds to a checker function, a rule
|
||||
# for determining whether a candidate matches the
|
||||
# selector. Candidates are generated by the active
|
||||
# iterator.
|
||||
checker = None
|
||||
|
||||
m = self.attribselect_re.match(token)
|
||||
if m is not None:
|
||||
# Attribute selector
|
||||
tag_name, attribute, operator, value = m.groups()
|
||||
checker = self._attribute_checker(operator, attribute, value)
|
||||
|
||||
elif '#' in token:
|
||||
# ID selector
|
||||
tag_name, tag_id = token.split('#', 1)
|
||||
def id_matches(tag):
|
||||
return tag.get('id', None) == tag_id
|
||||
checker = id_matches
|
||||
|
||||
elif '.' in token:
|
||||
# Class selector
|
||||
tag_name, klass = token.split('.', 1)
|
||||
classes = set(klass.split('.'))
|
||||
def classes_match(candidate):
|
||||
return classes.issubset(candidate.get('class', []))
|
||||
checker = classes_match
|
||||
|
||||
elif ':' in token:
|
||||
# Pseudo-class
|
||||
tag_name, pseudo = token.split(':', 1)
|
||||
if tag_name == '':
|
||||
raise ValueError(
|
||||
"A pseudo-class must be prefixed with a tag name.")
|
||||
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||
found = []
|
||||
if pseudo_attributes is not None:
|
||||
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||
if pseudo_type == 'nth-of-type':
|
||||
try:
|
||||
pseudo_value = int(pseudo_value)
|
||||
except:
|
||||
raise NotImplementedError(
|
||||
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
|
||||
if pseudo_value < 1:
|
||||
raise ValueError(
|
||||
'nth-of-type pseudo-class value must be at least 1.')
|
||||
class Counter(object):
|
||||
def __init__(self, destination):
|
||||
self.count = 0
|
||||
self.destination = destination
|
||||
|
||||
def nth_child_of_type(self, tag):
|
||||
self.count += 1
|
||||
if self.count == self.destination:
|
||||
return True
|
||||
if self.count > self.destination:
|
||||
# Stop the generator that's sending us
|
||||
# these things.
|
||||
raise StopIteration()
|
||||
return False
|
||||
checker = Counter(pseudo_value).nth_child_of_type
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
'Only the following pseudo-classes are implemented: nth-of-type.')
|
||||
|
||||
elif token == '*':
|
||||
# Star selector -- matches everything
|
||||
pass
|
||||
elif token == '>':
|
||||
# Run the next token as a CSS selector against the
|
||||
# direct children of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.children
|
||||
elif token == '~':
|
||||
# Run the next token as a CSS selector against the
|
||||
# siblings of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.next_siblings
|
||||
elif token == '+':
|
||||
# For each tag in the current context, run the next
|
||||
# token as a CSS selector against the tag's next
|
||||
# sibling that's a tag.
|
||||
def next_tag_sibling(tag):
|
||||
yield tag.find_next_sibling(True)
|
||||
recursive_candidate_generator = next_tag_sibling
|
||||
|
||||
elif self.tag_name_re.match(token):
|
||||
# Just a tag name.
|
||||
tag_name = token
|
||||
else:
|
||||
raise ValueError(
|
||||
'Unsupported or invalid CSS selector: "%s"' % token)
|
||||
|
||||
if recursive_candidate_generator:
|
||||
# This happens when the selector looks like "> foo".
|
||||
#
|
||||
# The generator calls select() recursively on every
|
||||
# member of the current context, passing in a different
|
||||
# candidate generator and a different selector.
|
||||
#
|
||||
# In the case of "> foo", the candidate generator is
|
||||
# one that yields a tag's direct children (">"), and
|
||||
# the selector is "foo".
|
||||
next_token = tokens[index+1]
|
||||
def recursive_select(tag):
|
||||
if self._select_debug:
|
||||
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||
print '-' * 40
|
||||
for i in tag.select(next_token, recursive_candidate_generator):
|
||||
if self._select_debug:
|
||||
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||
yield i
|
||||
if self._select_debug:
|
||||
print '-' * 40
|
||||
_use_candidate_generator = recursive_select
|
||||
elif _candidate_generator is None:
|
||||
# By default, a tag's candidates are all of its
|
||||
# children. If tag_name is defined, only yield tags
|
||||
# with that name.
|
||||
if self._select_debug:
|
||||
if tag_name:
|
||||
check = "[any]"
|
||||
else:
|
||||
check = tag_name
|
||||
print ' Default candidate generator, tag name="%s"' % check
|
||||
if self._select_debug:
|
||||
# This is redundant with later code, but it stops
|
||||
# a bunch of bogus tags from cluttering up the
|
||||
# debug log.
|
||||
def default_candidate_generator(tag):
|
||||
for child in tag.descendants:
|
||||
if not isinstance(child, Tag):
|
||||
continue
|
||||
if tag_name and not child.name == tag_name:
|
||||
continue
|
||||
yield child
|
||||
_use_candidate_generator = default_candidate_generator
|
||||
else:
|
||||
_use_candidate_generator = lambda tag: tag.descendants
|
||||
else:
|
||||
_use_candidate_generator = _candidate_generator
|
||||
|
||||
new_context = []
|
||||
new_context_ids = set([])
|
||||
for tag in current_context:
|
||||
if self._select_debug:
|
||||
print " Running candidate generator on %s %s" % (
|
||||
tag.name, repr(tag.attrs))
|
||||
for candidate in _use_candidate_generator(tag):
|
||||
if not isinstance(candidate, Tag):
|
||||
continue
|
||||
if tag_name and candidate.name != tag_name:
|
||||
continue
|
||||
if checker is not None:
|
||||
try:
|
||||
result = checker(candidate)
|
||||
except StopIteration:
|
||||
# The checker has decided we should no longer
|
||||
# run the generator.
|
||||
break
|
||||
if checker is None or result:
|
||||
if self._select_debug:
|
||||
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
if id(candidate) not in new_context_ids:
|
||||
# If a tag matches a selector more than once,
|
||||
# don't include it in the context more than once.
|
||||
new_context.append(candidate)
|
||||
new_context_ids.add(id(candidate))
|
||||
elif self._select_debug:
|
||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
|
||||
current_context = new_context
|
||||
|
||||
if self._select_debug:
|
||||
print "Final verdict:"
|
||||
for i in current_context:
|
||||
print " %s %s" % (i.name, i.attrs)
|
||||
return current_context
|
||||
|
||||
# Old names for backwards compatibility
|
||||
def childGenerator(self):
|
||||
return self.children
|
||||
|
@ -1152,10 +1392,13 @@ class Tag(PageElement):
|
|||
def recursiveChildGenerator(self):
|
||||
return self.descendants
|
||||
|
||||
# This was kind of misleading because has_key() (attributes) was
|
||||
# different from __in__ (contents). has_key() is gone in Python 3,
|
||||
# anyway.
|
||||
has_key = has_attr
|
||||
def has_key(self, key):
|
||||
"""This was kind of misleading because has_key() (attributes)
|
||||
was different from __in__ (contents). has_key() is gone in
|
||||
Python 3, anyway."""
|
||||
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
|
||||
key))
|
||||
return self.has_attr(key)
|
||||
|
||||
# Next, a couple classes to represent queries and their results.
|
||||
class SoupStrainer(object):
|
||||
|
|
|
@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
|
|||
self.assertDoctypeHandled(
|
||||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_public_doctype_with_url(self):
|
||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||
self.assertDoctypeHandled(doctype)
|
||||
|
@ -159,6 +164,12 @@ class HTMLTreeBuilderSmokeTest(object):
|
|||
comment = soup.find(text="foobar")
|
||||
self.assertEqual(comment.__class__, Comment)
|
||||
|
||||
# The comment is properly integrated into the tree.
|
||||
foo = soup.find(text="foo")
|
||||
self.assertEqual(comment, foo.next_element)
|
||||
baz = soup.find(text="baz")
|
||||
self.assertEqual(comment, baz.previous_element)
|
||||
|
||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
||||
self.assertSoupEquals("<pre> </pre>")
|
||||
|
@ -217,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
|
|||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
|
||||
def test_quot_entity_converted_to_quotation_mark(self):
|
||||
|
@ -235,6 +248,12 @@ class HTMLTreeBuilderSmokeTest(object):
|
|||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
|
||||
def test_multipart_strings(self):
|
||||
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||
self.assertEqual("p", soup.p.name)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
very least they should not choke on namespaces or lose
|
||||
|
@ -453,6 +472,18 @@ class XMLTreeBuilderSmokeTest(object):
|
|||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||
doc = """
|
||||
<script type="text/javascript">
|
||||
</script>
|
||||
"""
|
||||
soup = BeautifulSoup(doc, "xml")
|
||||
# lxml would have stripped this while parsing, but we can add
|
||||
# it later.
|
||||
soup.script.string = 'console.log("< < hey > > ");'
|
||||
encoded = soup.encode()
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
|
@ -495,6 +526,11 @@ class XMLTreeBuilderSmokeTest(object):
|
|||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
def test_namespaced_attributes_xml_namespace(self):
|
||||
markup = '<foo xml:lang="fr">bar</foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
|
@ -523,6 +559,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
|||
self.assertEqual(namespace, soup.math.namespace)
|
||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
def test_xml_declaration_becomes_comment(self):
|
||||
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||
|
||||
def skipIf(condition, reason):
|
||||
def nothing(test, *args, **kwargs):
|
||||
|
|
|
@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
|||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_xml_declaration_followed_by_doctype(self):
|
||||
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo</p>
|
||||
</body>
|
||||
</html>'''
|
||||
soup = self.soup(markup)
|
||||
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||
|
|
|
@ -6,8 +6,11 @@ import warnings
|
|||
try:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
LXML_PRESENT = True
|
||||
import lxml.etree
|
||||
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||
except ImportError, e:
|
||||
LXML_PRESENT = False
|
||||
LXML_VERSION = (0,)
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
|
@ -41,6 +44,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
|||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
|
||||
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||
# test if an old version of lxml is installed.
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_beautifulstonesoup_is_xml_parser(self):
|
||||
# Make sure that the deprecated BSS class uses an xml builder
|
||||
# if one is installed.
|
||||
|
@ -72,4 +86,3 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
|||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilderForXML()
|
||||
|
||||
|
|
|
@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase):
|
|||
def test_xml_quoting_handles_ampersands(self):
|
||||
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
||||
|
||||
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
|
||||
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml("ÁT&T"),
|
||||
"&Aacute;T&T")
|
||||
|
||||
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
||||
"ÁT&T")
|
||||
|
||||
def test_quotes_not_html_substituted(self):
|
||||
|
|
|
@ -20,6 +20,7 @@ from bs4.builder import (
|
|||
)
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
SoupStrainer,
|
||||
|
@ -425,6 +426,7 @@ class TestParentOperations(TreeTest):
|
|||
|
||||
def test_find_parent(self):
|
||||
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
|
||||
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
|
||||
|
||||
def test_parent_of_text_element(self):
|
||||
text = self.tree.find(text="Start here")
|
||||
|
@ -687,6 +689,12 @@ class TestTagCreation(SoupTest):
|
|||
self.assertEqual("foo", s)
|
||||
self.assertTrue(isinstance(s, NavigableString))
|
||||
|
||||
def test_new_string_can_create_navigablestring_subclass(self):
|
||||
soup = self.soup("")
|
||||
s = soup.new_string("foo", Comment)
|
||||
self.assertEqual("foo", s)
|
||||
self.assertTrue(isinstance(s, Comment))
|
||||
|
||||
class TestTreeModification(SoupTest):
|
||||
|
||||
def test_attribute_modification(self):
|
||||
|
@ -1048,7 +1056,7 @@ class TestTreeModification(SoupTest):
|
|||
# clear using decompose()
|
||||
em = a.em
|
||||
a.clear(decompose=True)
|
||||
self.assertFalse(hasattr(em, "contents"))
|
||||
self.assertEqual(0, len(em.contents))
|
||||
|
||||
def test_string_set(self):
|
||||
"""Tag.string = 'string'"""
|
||||
|
@ -1166,6 +1174,19 @@ class TestElementObjects(SoupTest):
|
|||
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
|
||||
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
|
||||
|
||||
def test_get_text_ignores_comments(self):
|
||||
soup = self.soup("foo<!--IGNORE-->bar")
|
||||
self.assertEqual(soup.get_text(), "foobar")
|
||||
|
||||
self.assertEqual(
|
||||
soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
|
||||
self.assertEqual(
|
||||
soup.get_text(types=None), "fooIGNOREbar")
|
||||
|
||||
def test_all_strings_ignores_comments(self):
|
||||
soup = self.soup("foo<!--IGNORE-->bar")
|
||||
self.assertEqual(['foo', 'bar'], list(soup.strings))
|
||||
|
||||
class TestCDAtaListAttributes(SoupTest):
|
||||
|
||||
"""Testing cdata-list attributes like 'class'.
|
||||
|
@ -1310,6 +1331,32 @@ class TestSubstitutions(SoupTest):
|
|||
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
||||
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
|
||||
|
||||
def test_formatter_skips_script_tag_for_html_documents(self):
|
||||
doc = """
|
||||
<script type="text/javascript">
|
||||
console.log("< < hey > > ");
|
||||
</script>
|
||||
"""
|
||||
encoded = BeautifulSoup(doc).encode()
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_formatter_skips_style_tag_for_html_documents(self):
|
||||
doc = """
|
||||
<style type="text/css">
|
||||
console.log("< < hey > > ");
|
||||
</style>
|
||||
"""
|
||||
encoded = BeautifulSoup(doc).encode()
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_prettify_leaves_preformatted_text_alone(self):
|
||||
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
|
||||
# Everything outside the <pre> tag is reformatted, but everything
|
||||
# inside is left alone.
|
||||
self.assertEqual(
|
||||
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
|
||||
soup.div.prettify())
|
||||
|
||||
def test_prettify_accepts_formatter(self):
|
||||
soup = BeautifulSoup("<html><body>foo</body></html>")
|
||||
pretty = soup.prettify(formatter = lambda x: x.upper())
|
||||
|
@ -1459,7 +1506,7 @@ class TestSoupSelector(TreeTest):
|
|||
</head>
|
||||
<body>
|
||||
|
||||
<div id="main">
|
||||
<div id="main" class="fancy">
|
||||
<div id="inner">
|
||||
<h1 id="header1">An H1</h1>
|
||||
<p>Some text</p>
|
||||
|
@ -1531,7 +1578,7 @@ class TestSoupSelector(TreeTest):
|
|||
self.assertEqual(len(self.soup.select('del')), 0)
|
||||
|
||||
def test_invalid_tag(self):
|
||||
self.assertEqual(len(self.soup.select('tag%t')), 0)
|
||||
self.assertRaises(ValueError, self.soup.select, 'tag%t')
|
||||
|
||||
def test_header_tags(self):
|
||||
self.assertSelectMultiple(
|
||||
|
@ -1564,7 +1611,7 @@ class TestSoupSelector(TreeTest):
|
|||
for el in els:
|
||||
self.assertEqual(el.name, 'p')
|
||||
self.assertEqual(els[1]['class'], ['onep'])
|
||||
self.assertFalse(els[0].has_key('class'))
|
||||
self.assertFalse(els[0].has_attr('class'))
|
||||
|
||||
def test_a_bunch_of_emptys(self):
|
||||
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
|
||||
|
@ -1584,6 +1631,9 @@ class TestSoupSelector(TreeTest):
|
|||
self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
|
||||
self.assertSelects('.s1 > a span', ['s1a2s1'])
|
||||
|
||||
def test_child_selector_id(self):
|
||||
self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
|
||||
|
||||
def test_attribute_equals(self):
|
||||
self.assertSelectMultiple(
|
||||
('p[class="onep"]', ['p1']),
|
||||
|
@ -1690,6 +1740,33 @@ class TestSoupSelector(TreeTest):
|
|||
('p[blah]', []),
|
||||
)
|
||||
|
||||
def test_nth_of_type(self):
|
||||
# Try to select first paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(1)')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].string, u'Some text')
|
||||
|
||||
# Try to select third paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(3)')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].string, u'Another')
|
||||
|
||||
# Try to select (non-existent!) fourth paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(4)')
|
||||
self.assertEqual(len(els), 0)
|
||||
|
||||
# Pass in an invalid value.
|
||||
self.assertRaises(
|
||||
ValueError, self.soup.select, 'div p:nth-of-type(0)')
|
||||
|
||||
def test_nth_of_type_direct_descendant(self):
|
||||
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].string, u'Some text')
|
||||
|
||||
def test_id_child_selector_nth_of_type(self):
|
||||
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
|
||||
|
||||
def test_select_on_element(self):
|
||||
# Other tests operate on the tree; this operates on an element
|
||||
# within the tree.
|
||||
|
@ -1698,3 +1775,26 @@ class TestSoupSelector(TreeTest):
|
|||
# The <div id="inner"> tag was selected. The <div id="footer">
|
||||
# tag was not.
|
||||
self.assertSelectsIDs(selected, ['inner'])
|
||||
|
||||
def test_overspecified_child_id(self):
|
||||
self.assertSelects(".fancy #inner", ['inner'])
|
||||
self.assertSelects(".normal #inner", [])
|
||||
|
||||
def test_adjacent_sibling_selector(self):
|
||||
self.assertSelects('#p1 + h2', ['header2'])
|
||||
self.assertSelects('#p1 + h2 + p', ['pmulti'])
|
||||
self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
|
||||
self.assertEqual([], self.soup.select('#p1 + p'))
|
||||
|
||||
def test_general_sibling_selector(self):
|
||||
self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
|
||||
self.assertSelects('#p1 ~ #header2', ['header2'])
|
||||
self.assertSelects('#p1 ~ h2 + a', ['me'])
|
||||
self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
|
||||
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
||||
|
||||
def test_dangling_combinator(self):
|
||||
self.assertRaises(ValueError, self.soup.select, 'h1 >')
|
||||
|
||||
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
||||
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
||||
|
|
Reference in New Issue