Updated bunded version of BS4

This commit is contained in:
Luke Rogers 2013-07-14 22:06:37 +12:00
parent 5d30398bc1
commit 2182d5a0fd
14 changed files with 832 additions and 159 deletions

View file

@ -152,7 +152,7 @@ class TreeBuilder(object):
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
if cdata_list_attr in attrs:
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.

View file

@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
self.element.append(node.element)
node.parent = self
self.soup.object_was_parsed(node.element, parent=self.element)
def getAttributes(self):
return AttrList(self.element)

View file

@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
elif data == 'DOCTYPE':
# i.e. "<!DOCTYPE>"
data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)

View file

@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder',
]
from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property
def default_parser(self):
# This can either return a parser object or a class, which
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close()
def close(self):
self.nsmaps = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None:
# There are no new namespaces for this tag, but namespaces
# are in play, so we need a separate tag stack to know
# when they end.
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
if self.nsmaps is not None and len(self.nsmaps) > 0:
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name):
self.soup.endData()
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None:
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data):
pass