Updated bunded version of BS4

2013-07-14 22:06:37 +12:00 · 2013-07-14 22:06:37 +12:00 · 2182d5a0fd
commit 2182d5a0fd
parent 5d30398bc1
14 changed files with 832 additions and 159 deletions
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -152,7 +152,7 @@ class TreeBuilder(object):
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
-                if cdata_list_attr in dict(attrs):
+                if cdata_list_attr in attrs:
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
+            self.soup._most_recent_element = new_element
        else:
-            self.element.append(node.element)
-            node.parent = self
+            self.soup.object_was_parsed(node.element, parent=self.element)

    def getAttributes(self):
        return AttrList(self.element)
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
+        elif name.startswith('X'):
+            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
+        elif data == 'DOCTYPE':
+            # i.e. "<!DOCTYPE>"
+            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -3,6 +3,7 @@ __all__ = [
    'LXMLTreeBuilder',
    ]

+from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):

    CHUNK_SIZE = 512

+    # This namespace mapping is specified in the XML Namespace
+    # standard.
+    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+
    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
-        self.nsmaps = None
+        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                dammit.contains_replacement_characters)

    def feed(self, markup):
-        if isinstance(markup, basestring):
+        if isinstance(markup, bytes):
+            markup = BytesIO(markup)
+        elif isinstance(markup, unicode):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        self.parser.close()

    def close(self):
-        self.nsmaps = None
+        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap={}):
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
-
        nsprefix = None
        # Invert each namespace map as it comes in.
-        if len(nsmap) == 0 and self.nsmaps != None:
-            # There are no new namespaces for this tag, but namespaces
-            # are in play, so we need a separate tag stack to know
-            # when they end.
+        if len(self.nsmaps) > 1:
+            # There are no new namespaces for this tag, but
+            # non-default namespaces are in play, so we need a
+            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
-            if self.nsmaps is None:
-                self.nsmaps = []
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

-        if self.nsmaps is not None and len(self.nsmaps) > 0:
-            # Namespaces are in play. Find any attributes that came in
-            # from lxml with namespaces attached to their names, and
-            # turn then into NamespacedAttribute objects.
-            new_attrs = {}
-            for attr, value in attrs.items():
-                namespace, attr = self._getNsTag(attr)
-                if namespace is None:
-                    new_attrs[attr] = value
-                else:
-                    nsprefix = self._prefix_for_namespace(namespace)
-                    attr = NamespacedAttribute(nsprefix, attr, namespace)
-                    new_attrs[attr] = value
-            attrs = new_attrs
+        # Namespaces are in play. Find any attributes that came in
+        # from lxml with namespaces attached to their names, and
+        # turn then into NamespacedAttribute objects.
+        new_attrs = {}
+        for attr, value in attrs.items():
+            namespace, attr = self._getNsTag(attr)
+            if namespace is None:
+                new_attrs[attr] = value
+            else:
+                nsprefix = self._prefix_for_namespace(namespace)
+                attr = NamespacedAttribute(nsprefix, attr, namespace)
+                new_attrs[attr] = value
+        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
+        return None

    def end(self, name):
        self.soup.endData()
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
-        if self.nsmaps != None:
+        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()
-            if len(self.nsmaps) == 0:
-                # Namespaces are no longer in play, so don't bother keeping
-                # track of the namespace stack.
-                self.nsmaps = None

    def pi(self, target, data):
        pass