Updated bunded version of BS4
This commit is contained in:
parent
5d30398bc1
commit
2182d5a0fd
14 changed files with 832 additions and 159 deletions
|
@ -3,6 +3,7 @@ __all__ = [
|
|||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
|
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||
|
||||
@property
|
||||
def default_parser(self):
|
||||
# This can either return a parser object or a class, which
|
||||
|
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
parser = parser(target=self, strip_cdata=False)
|
||||
self.parser = parser
|
||||
self.soup = None
|
||||
self.nsmaps = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
|
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, basestring):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, unicode):
|
||||
markup = StringIO(markup)
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
|
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
self.parser.close()
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(nsmap) == 0 and self.nsmaps != None:
|
||||
# There are no new namespaces for this tag, but namespaces
|
||||
# are in play, so we need a separate tag stack to know
|
||||
# when they end.
|
||||
if len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
if self.nsmaps is None:
|
||||
self.nsmaps = []
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
|
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
|
||||
if self.nsmaps is not None and len(self.nsmaps) > 0:
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
|
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
|
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if self.nsmaps != None:
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
if len(self.nsmaps) == 0:
|
||||
# Namespaces are no longer in play, so don't bother keeping
|
||||
# track of the namespace stack.
|
||||
self.nsmaps = None
|
||||
|
||||
def pi(self, target, data):
|
||||
pass
|
||||
|
|
Reference in a new issue