Updated bundled version of BS4

This commit is contained in:
Luke Rogers 2013-07-14 22:06:37 +12:00
parent 5d30398bc1
commit 2182d5a0fd
14 changed files with 832 additions and 159 deletions

43
lib/bs4/AUTHORS.txt Normal file
View file

@ -0,0 +1,43 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen

View file

@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3" __version__ = "4.2.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT" __license__ = "MIT"
__all__ = ['BeautifulSoup'] __all__ = ['BeautifulSoup']
@ -201,9 +201,9 @@ class BeautifulSoup(Tag):
"""Create a new tag associated with this soup.""" """Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs) return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s): def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup.""" """Create a new NavigableString associated with this soup."""
navigable = NavigableString(s) navigable = subclass(s)
navigable.setup() navigable.setup()
return navigable return navigable
@ -245,13 +245,15 @@ class BeautifulSoup(Tag):
o = containerClass(currentData) o = containerClass(currentData)
self.object_was_parsed(o) self.object_was_parsed(o)
def object_was_parsed(self, o): def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree.""" """Add an object to the parse tree."""
o.setup(self.currentTag, self.previous_element) parent = parent or self.currentTag
if self.previous_element: most_recent_element = most_recent_element or self._most_recent_element
self.previous_element.next_element = o o.setup(parent, most_recent_element)
self.previous_element = o if most_recent_element is not None:
self.currentTag.contents.append(o) most_recent_element.next_element = o
self._most_recent_element = o
parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True): def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent """Pops the tag stack up to and including the most recent
@ -295,12 +297,12 @@ class BeautifulSoup(Tag):
return None return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element) self.currentTag, self._most_recent_element)
if tag is None: if tag is None:
return tag return tag
if self.previous_element: if self._most_recent_element:
self.previous_element.next_element = tag self._most_recent_element.next_element = tag
self.previous_element = tag self._most_recent_element = tag
self.pushTag(tag) self.pushTag(tag)
return tag return tag
@ -333,6 +335,10 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode( return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter) indent_level, eventual_encoding, formatter)
# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup): class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser.""" """Deprecated interface to an XML parser."""

View file

@ -152,7 +152,7 @@ class TreeBuilder(object):
tag_specific = self.cdata_list_attributes.get( tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), []) tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific): for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs): if cdata_list_attr in attrs:
# Basically, we have a "class" attribute whose # Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS # value is a whitespace-separated list of CSS
# classes. Split it into a list. # classes. Split it into a list.

View file

@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
old_element = self.element.contents[-1] old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element) new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element) old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else: else:
self.element.append(node.element) self.soup.object_was_parsed(node.element, parent=self.element)
node.parent = self
def getAttributes(self): def getAttributes(self):
return AttrList(self.element) return AttrList(self.element)

View file

@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
# it's fixed. # it's fixed.
if name.startswith('x'): if name.startswith('x'):
real_name = int(name.lstrip('x'), 16) real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else: else:
real_name = int(name) real_name = int(name)
@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData() self.soup.endData()
if data.startswith("DOCTYPE "): if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):] data = data[len("DOCTYPE "):]
elif data == 'DOCTYPE':
# i.e. "<!DOCTYPE>"
data = ''
self.soup.handle_data(data) self.soup.handle_data(data)
self.soup.endData(Doctype) self.soup.endData(Doctype)

View file

@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder', 'LXMLTreeBuilder',
] ]
from io import BytesIO
from StringIO import StringIO from StringIO import StringIO
import collections import collections
from lxml import etree from lxml import etree
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
CHUNK_SIZE = 512 CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property @property
def default_parser(self): def default_parser(self):
# This can either return a parser object or a class, which # This can either return a parser object or a class, which
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False) parser = parser(target=self, strip_cdata=False)
self.parser = parser self.parser = parser
self.soup = None self.soup = None
self.nsmaps = None self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag): def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag # Split the namespace URL out of a fully-qualified lxml tag
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
dammit.contains_replacement_characters) dammit.contains_replacement_characters)
def feed(self, markup): def feed(self, markup):
if isinstance(markup, basestring): if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup) markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty, # Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized. # or the parser won't be initialized.
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close() self.parser.close()
def close(self): def close(self):
self.nsmaps = None self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}): def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs) attrs = dict(attrs)
nsprefix = None nsprefix = None
# Invert each namespace map as it comes in. # Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None: if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but namespaces # There are no new namespaces for this tag, but
# are in play, so we need a separate tag stack to know # non-default namespaces are in play, so we need a
# when they end. # separate tag stack to know when they end.
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items()) inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap) self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
"xmlns", prefix, "http://www.w3.org/2000/xmlns/") "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace attrs[attribute] = namespace
if self.nsmaps is not None and len(self.nsmaps) > 0: # Namespaces are in play. Find any attributes that came in
# Namespaces are in play. Find any attributes that came in # from lxml with namespaces attached to their names, and
# from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects.
# turn then into NamespacedAttribute objects. new_attrs = {}
new_attrs = {} for attr, value in attrs.items():
for attr, value in attrs.items(): namespace, attr = self._getNsTag(attr)
namespace, attr = self._getNsTag(attr) if namespace is None:
if namespace is None: new_attrs[attr] = value
new_attrs[attr] = value else:
else: nsprefix = self._prefix_for_namespace(namespace)
nsprefix = self._prefix_for_namespace(namespace) attr = NamespacedAttribute(nsprefix, attr, namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace) new_attrs[attr] = value
new_attrs[attr] = value attrs = new_attrs
attrs = new_attrs
namespace, name = self._getNsTag(name) namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace) nsprefix = self._prefix_for_namespace(namespace)
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
for inverted_nsmap in reversed(self.nsmaps): for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap: if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace] return inverted_nsmap[namespace]
return None
def end(self, name): def end(self, name):
self.soup.endData() self.soup.endData()
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
nsprefix = inverted_nsmap[namespace] nsprefix = inverted_nsmap[namespace]
break break
self.soup.handle_endtag(name, nsprefix) self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None: if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace # This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack. # mapping, so pop it off the stack.
self.nsmaps.pop() self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data): def pi(self, target, data):
pass pass

View file

@ -81,6 +81,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")") ")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@classmethod @classmethod
def _substitute_html_entity(cls, matchobj): def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@ -134,6 +136,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False): def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters. """Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign
will become &lt;, the greater-than sign will become &gt;,
and any ampersands will become &amp;. If you want ampersands
that appear to be part of an entity definition to be left
alone, use substitute_xml_containing_entities() instead.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets and ampersands.
value = cls.AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_xml_containing_entities(
cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will :param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity defition will ampersands that are not part of an entity defition will
@ -151,6 +175,7 @@ class EntitySubstitution(object):
value = cls.quoted_attribute_value(value) value = cls.quoted_attribute_value(value)
return value return value
@classmethod @classmethod
def substitute_html(cls, s): def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities. """Replace certain Unicode characters with named HTML entities.
@ -273,7 +298,6 @@ class UnicodeDammit:
return None return None
self.tried_encodings.append((proposed, errors)) self.tried_encodings.append((proposed, errors))
markup = self.markup markup = self.markup
# Convert smart quotes to HTML if coming from an encoding # Convert smart quotes to HTML if coming from an encoding
# that might have them. # that might have them.
if (self.smart_quotes_to is not None if (self.smart_quotes_to is not None

178
lib/bs4/diagnose.py Normal file
View file

@ -0,0 +1,178 @@
"""Diagnostic functions, mainly for use when doing tech support."""
from StringIO import StringIO
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
import os
import random
import time
import traceback
import sys
import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__
print "Python version %s" % sys.version
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
for builder in builder_registry.builders:
if name in builder.features:
break
else:
basic_parsers.remove(name)
print (
"I noticed that %s is not installed. Installing it may help." %
name)
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
if 'html5lib' in basic_parsers:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "Here's what %s did with the markup:" % parser
print soup.prettify()
print "-" * 80
def lxml_trace(data, html=True):
    """Dump the raw lxml events produced while parsing some markup.

    Useful for seeing exactly what lxml does with a document before any
    Beautiful Soup code gets involved.

    :param data: A string of markup.
    :param html: If true, parse as HTML; otherwise parse as XML.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html)
    for event, element in events:
        line = "%s, %4s, %s" % (event, element.tag, element.text)
        print(line)
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single choke point for output, so it could be redirected.
        print(s)

    def handle_starttag(self, name, attrs):
        message = "%s START" % name
        self._p(message)

    def handle_endtag(self, name):
        message = "%s END" % name
        self._p(message)

    def handle_data(self, data):
        message = "%s DATA" % data
        self._p(message)

    def handle_charref(self, name):
        message = "%s CHARREF" % name
        self._p(message)

    def handle_entityref(self, name):
        message = "%s ENTITYREF" % name
        self._p(message)

    def handle_comment(self, data):
        message = "%s COMMENT" % data
        self._p(message)

    def handle_decl(self, data):
        message = "%s DECL" % data
        self._p(message)

    def unknown_decl(self, data):
        message = "%s UNKNOWN-DECL" % data
        self._p(message)

    def handle_pi(self, data):
        message = "%s PI" % data
        self._p(message)
def htmlparser_trace(data):
    """Show the HTMLParser events generated while parsing some markup.

    This reveals how HTMLParser handles a document when no Beautiful
    Soup code is running.

    :param data: A string of markup.
    """
    AnnouncingParser().feed(data)
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"
def rword(length=5):
"Generate a random word-like string."
s = ''
for i in range(length):
if i % 2 == 0:
t = _consonants
else:
t = _vowels
s += random.choice(t)
return s
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
elements = []
for i in range(num_elements):
choice = random.randint(0,3)
if choice == 0:
# New tag.
tag_name = random.choice(tag_names)
elements.append("<%s>" % tag_name)
elif choice == 1:
elements.append(rsentence(random.randint(1,4)))
elif choice == 2:
# Close a tag.
tag_name = random.choice(tag_names)
elements.append("</%s>" % tag_name)
return "<html>" + "\n".join(elements) + "</html>"
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
try:
a = time.time()
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
if __name__ == '__main__':
    # Script entry point: read markup from standard input and run the
    # diagnostic suite on it.
    diagnose(sys.stdin.read())

View file

@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None): def __new__(cls, prefix, name, namespace=None):
if name is None: if name is None:
obj = unicode.__new__(cls, prefix) obj = unicode.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
obj = unicode.__new__(cls, name)
else: else:
obj = unicode.__new__(cls, prefix + ":" + name) obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix obj.prefix = prefix
@ -78,6 +81,40 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value) return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
"""Entity substitution rules that are aware of some HTML quirks.
Specifically, the contents of <script> and <style> tags should not
undergo entity substitution.
Incoming NavigableString objects are checked to see if they're the
direct children of a <script> or <style> tag.
"""
cdata_containing_tags = set(["script", "style"])
preformatted_tags = set(["pre"])
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in cls.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return f(ns)
@classmethod
def substitute_html(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_html)
@classmethod
def substitute_xml(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
class PageElement(object): class PageElement(object):
"""Contains the navigational information for some part of the page """Contains the navigational information for some part of the page
@ -94,25 +131,60 @@ class PageElement(object):
# converted to entities. This is not recommended, but it's # converted to entities. This is not recommended, but it's
# faster than "minimal". # faster than "minimal".
# A function - This function will be called on every string that # A function - This function will be called on every string that
# needs to undergo entity substition # needs to undergo entity substitution.
FORMATTERS = { #
# In an HTML document, the default "html" and "minimal" functions
# will leave the contents of <script> and <style> tags alone. For
# an XML document, all tags will be given the same treatment.
HTML_FORMATTERS = {
"html" : HTMLAwareEntitySubstitution.substitute_html,
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
None : None
}
XML_FORMATTERS = {
"html" : EntitySubstitution.substitute_html, "html" : EntitySubstitution.substitute_html,
"minimal" : EntitySubstitution.substitute_xml, "minimal" : EntitySubstitution.substitute_xml,
None : None None : None
} }
@classmethod
def format_string(self, s, formatter='minimal'): def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter.""" """Format the given string using the given formatter."""
if not callable(formatter): if not callable(formatter):
formatter = self.FORMATTERS.get( formatter = self._formatter_for_name(formatter)
formatter, EntitySubstitution.substitute_xml)
if formatter is None: if formatter is None:
output = s output = s
else: else:
output = formatter(s) output = formatter(s)
return output return output
@property
def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree?
This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on
the contents of <script> and <style> tags, or not). It's
inefficient, but it should be called very rarely.
"""
if self.parent is None:
# This is the top-level object. It should have .is_xml set
# from tree creation. If not, take a guess--BS is usually
# used on HTML markup.
return getattr(self, 'is_xml', False)
return self.parent._is_xml
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
return self.XML_FORMATTERS.get(
name, EntitySubstitution.substitute_xml)
else:
return self.HTML_FORMATTERS.get(
name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None): def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and """Sets up the initial relations between this element and
other elements.""" other elements."""
@ -366,7 +438,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different # NOTE: We can't use _find_one because findParents takes a different
# set of arguments. # set of arguments.
r = None r = None
l = self.find_parents(name, attrs, 1) l = self.find_parents(name, attrs, 1, **kwargs)
if l: if l:
r = l[0] r = l[0]
return r return r
@ -495,6 +567,14 @@ class PageElement(object):
value =" ".join(value) value =" ".join(value)
return value return value
def _tag_name_matches_and(self, function, tag_name):
if not tag_name:
return function
else:
def _match(tag):
return tag.name == tag_name and function(tag)
return _match
def _attribute_checker(self, operator, attribute, value=''): def _attribute_checker(self, operator, attribute, value=''):
"""Create a function that performs a CSS selector operation. """Create a function that performs a CSS selector operation.
@ -536,87 +616,6 @@ class PageElement(object):
else: else:
return lambda el: el.has_attr(attribute) return lambda el: el.has_attr(attribute)
def select(self, selector):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
for index, token in enumerate(tokens):
if tokens[index - 1] == '>':
# already found direct descendants in last step. skip this
# step.
continue
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag, attribute, operator, value = m.groups()
if not tag:
tag = True
checker = self._attribute_checker(operator, attribute, value)
found = []
for context in current_context:
found.extend(
[el for el in context.find_all(tag) if checker(el)])
current_context = found
continue
if '#' in token:
# ID selector
tag, id = token.split('#', 1)
if tag == "":
tag = True
el = current_context[0].find(tag, {'id': id})
if el is None:
return [] # No match
current_context = [el]
continue
if '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
if not tag_name:
tag_name = True
classes = set(klass.split('.'))
found = []
def classes_match(tag):
if tag_name is not True and tag.name != tag_name:
return False
if not tag.has_attr('class'):
return False
return classes.issubset(tag['class'])
for context in current_context:
found.extend(context.find_all(classes_match))
current_context = found
continue
if token == '*':
# Star selector
found = []
for context in current_context:
found.extend(context.findAll(True))
current_context = found
continue
if token == '>':
# Child selector
tag = tokens[index + 1]
if not tag:
tag = True
found = []
for context in current_context:
found.extend(context.find_all(tag, recursive=False))
current_context = found
continue
# Here we should just have a regular tag
if not self.tag_name_re.match(token):
return []
found = []
for context in current_context:
found.extend(context.findAll(token))
current_context = found
return current_context
# Old non-property versions of the generators, for backwards # Old non-property versions of the generators, for backwards
# compatibility with BS3. # compatibility with BS3.
def nextGenerator(self): def nextGenerator(self):
@ -652,6 +651,9 @@ class NavigableString(unicode, PageElement):
return unicode.__new__(cls, value) return unicode.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __copy__(self):
return self
def __getnewargs__(self): def __getnewargs__(self):
return (unicode(self),) return (unicode(self),)
@ -709,7 +711,7 @@ class Doctype(PreformattedString):
@classmethod @classmethod
def for_name_and_ids(cls, name, pub_id, system_id): def for_name_and_ids(cls, name, pub_id, system_id):
value = name value = name or ''
if pub_id is not None: if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id value += ' PUBLIC "%s"' % pub_id
if system_id is not None: if system_id is not None:
@ -803,16 +805,24 @@ class Tag(PageElement):
self.clear() self.clear()
self.append(string.__class__(string)) self.append(string.__class__(string))
def _all_strings(self, strip=False): def _all_strings(self, strip=False, types=(NavigableString, CData)):
"""Yield all child strings, possibly stripping them.""" """Yield all strings of certain classes, possibly stripping them.
By default, yields only NavigableString and CData objects. So
no comments, processing instructions, etc.
"""
for descendant in self.descendants: for descendant in self.descendants:
if not isinstance(descendant, NavigableString): if (
(types is None and not isinstance(descendant, NavigableString))
or
(types is not None and type(descendant) not in types)):
continue continue
if strip: if strip:
descendant = descendant.strip() descendant = descendant.strip()
if len(descendant) == 0: if len(descendant) == 0:
continue continue
yield descendant yield descendant
strings = property(_all_strings) strings = property(_all_strings)
@property @property
@ -820,11 +830,13 @@ class Tag(PageElement):
for string in self._all_strings(True): for string in self._all_strings(True):
yield string yield string
def get_text(self, separator=u"", strip=False): def get_text(self, separator=u"", strip=False,
types=(NavigableString, CData)):
""" """
Get all child strings, concatenated using the given separator. Get all child strings, concatenated using the given separator.
""" """
return separator.join([s for s in self._all_strings(strip)]) return separator.join([s for s in self._all_strings(
strip, types=types)])
getText = get_text getText = get_text
text = property(get_text) text = property(get_text)
@ -835,6 +847,7 @@ class Tag(PageElement):
while i is not None: while i is not None:
next = i.next_element next = i.next_element
i.__dict__.clear() i.__dict__.clear()
i.contents = []
i = next i = next
def clear(self, decompose=False): def clear(self, decompose=False):
@ -966,6 +979,13 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter) u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors) return u.encode(encoding, errors)
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
return (
indent_level is not None and
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
or self._is_xml))
def decode(self, indent_level=None, def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal"):
@ -978,6 +998,12 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's document contains a <META> tag that mentions the document's
encoding. encoding.
""" """
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
attrs = [] attrs = []
if self.attrs: if self.attrs:
for key, val in sorted(self.attrs.items()): for key, val in sorted(self.attrs.items()):
@ -1010,12 +1036,15 @@ class Tag(PageElement):
else: else:
closeTag = '</%s%s>' % (prefix, self.name) closeTag = '</%s%s>' % (prefix, self.name)
pretty_print = (indent_level is not None) pretty_print = self._should_pretty_print(indent_level)
space = ''
indent_space = ''
if indent_level is not None:
indent_space = (' ' * (indent_level - 1))
if pretty_print: if pretty_print:
space = (' ' * (indent_level - 1)) space = indent_space
indent_contents = indent_level + 1 indent_contents = indent_level + 1
else: else:
space = ''
indent_contents = None indent_contents = None
contents = self.decode_contents( contents = self.decode_contents(
indent_contents, eventual_encoding, formatter) indent_contents, eventual_encoding, formatter)
@ -1028,8 +1057,10 @@ class Tag(PageElement):
attribute_string = '' attribute_string = ''
if attrs: if attrs:
attribute_string = ' ' + ' '.join(attrs) attribute_string = ' ' + ' '.join(attrs)
if pretty_print: if indent_level is not None:
s.append(space) # Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % ( s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close)) prefix, self.name, attribute_string, close))
if pretty_print: if pretty_print:
@ -1040,7 +1071,10 @@ class Tag(PageElement):
if pretty_print and closeTag: if pretty_print and closeTag:
s.append(space) s.append(space)
s.append(closeTag) s.append(closeTag)
if pretty_print and closeTag and self.next_sibling: if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n") s.append("\n")
s = ''.join(s) s = ''.join(s)
return s return s
@ -1063,6 +1097,11 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's document contains a <META> tag that mentions the document's
encoding. encoding.
""" """
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None) pretty_print = (indent_level is not None)
s = [] s = []
for c in self: for c in self:
@ -1072,13 +1111,13 @@ class Tag(PageElement):
elif isinstance(c, Tag): elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding, s.append(c.decode(indent_level, eventual_encoding,
formatter)) formatter))
if text and indent_level: if text and indent_level and not self.name == 'pre':
text = text.strip() text = text.strip()
if text: if text:
if pretty_print: if pretty_print and not self.name == 'pre':
s.append(" " * (indent_level - 1)) s.append(" " * (indent_level - 1))
s.append(text) s.append(text)
if pretty_print: if pretty_print and not self.name == 'pre':
s.append("\n") s.append("\n")
return ''.join(s) return ''.join(s)
@ -1145,6 +1184,207 @@ class Tag(PageElement):
yield current yield current
current = current.next_element current = current.next_element
# CSS selector code
_selector_combinators = ['>', '+', '~']
_select_debug = False
def select(self, selector, _candidate_generator=None):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
if tokens[-1] in self._selector_combinators:
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
print 'Running CSS selector "%s"' % selector
for index, token in enumerate(tokens):
if self._select_debug:
print ' Considering token "%s"' % token
recursive_candidate_generator = None
tag_name = None
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
# selector. Candidates are generated by the active
# iterator.
checker = None
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag_name, attribute, operator, value = m.groups()
checker = self._attribute_checker(operator, attribute, value)
elif '#' in token:
# ID selector
tag_name, tag_id = token.split('#', 1)
def id_matches(tag):
return tag.get('id', None) == tag_id
checker = id_matches
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
classes = set(klass.split('.'))
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
elif ':' in token:
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is not None:
pseudo_type, pseudo_value = pseudo_attributes.groups()
if pseudo_type == 'nth-of-type':
try:
pseudo_value = int(pseudo_value)
except:
raise NotImplementedError(
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudo-class value must be at least 1.')
class Counter(object):
def __init__(self, destination):
self.count = 0
self.destination = destination
def nth_child_of_type(self, tag):
self.count += 1
if self.count == self.destination:
return True
if self.count > self.destination:
# Stop the generator that's sending us
# these things.
raise StopIteration()
return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.')
elif token == '*':
# Star selector -- matches everything
pass
elif token == '>':
# Run the next token as a CSS selector against the
# direct children of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.children
elif token == '~':
# Run the next token as a CSS selector against the
# siblings of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.next_siblings
elif token == '+':
# For each tag in the current context, run the next
# token as a CSS selector against the tag's next
# sibling that's a tag.
def next_tag_sibling(tag):
yield tag.find_next_sibling(True)
recursive_candidate_generator = next_tag_sibling
elif self.tag_name_re.match(token):
# Just a tag name.
tag_name = token
else:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator:
# This happens when the selector looks like "> foo".
#
# The generator calls select() recursively on every
# member of the current context, passing in a different
# candidate generator and a different selector.
#
# In the case of "> foo", the candidate generator is
# one that yields a tag's direct children (">"), and
# the selector is "foo".
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
yield i
if self._select_debug:
print '-' * 40
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
# children. If tag_name is defined, only yield tags
# with that name.
if self._select_debug:
if tag_name:
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
# debug log.
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if tag_name and not child.name == tag_name:
continue
yield child
_use_candidate_generator = default_candidate_generator
else:
_use_candidate_generator = lambda tag: tag.descendants
else:
_use_candidate_generator = _candidate_generator
new_context = []
new_context_ids = set([])
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
if tag_name and candidate.name != tag_name:
continue
if checker is not None:
try:
result = checker(candidate)
except StopIteration:
# The checker has decided we should no longer
# run the generator.
break
if checker is None or result:
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context
if self._select_debug:
print "Final verdict:"
for i in current_context:
print " %s %s" % (i.name, i.attrs)
return current_context
# Old names for backwards compatibility # Old names for backwards compatibility
def childGenerator(self): def childGenerator(self):
return self.children return self.children
@ -1152,10 +1392,13 @@ class Tag(PageElement):
def recursiveChildGenerator(self): def recursiveChildGenerator(self):
return self.descendants return self.descendants
# This was kind of misleading because has_key() (attributes) was def has_key(self, key):
# different from __in__ (contents). has_key() is gone in Python 3, """This was kind of misleading because has_key() (attributes)
# anyway. was different from __in__ (contents). has_key() is gone in
has_key = has_attr Python 3, anyway."""
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
key))
return self.has_attr(key)
# Next, a couple classes to represent queries and their results. # Next, a couple classes to represent queries and their results.
class SoupStrainer(object): class SoupStrainer(object):

View file

@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertDoctypeHandled( self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_public_doctype_with_url(self): def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype) self.assertDoctypeHandled(doctype)
@ -159,6 +164,12 @@ class HTMLTreeBuilderSmokeTest(object):
comment = soup.find(text="foobar") comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment) self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self): def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags.""" """Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>") self.assertSoupEquals("<pre> </pre>")
@ -217,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self): def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect) self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect) self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self): def test_quot_entity_converted_to_quotation_mark(self):
@ -235,6 +248,12 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect) self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
def test_basic_namespaces(self): def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the """Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose very least they should not choke on namespaces or lose
@ -453,6 +472,18 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual( self.assertEqual(
soup.encode("utf-8"), markup) soup.encode("utf-8"), markup)
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
</script>
"""
soup = BeautifulSoup(doc, "xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
encoded = soup.encode()
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_popping_namespaced_tag(self): def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup) soup = self.soup(markup)
@ -495,6 +526,11 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(unicode(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5.""" """Smoke test for a tree builder that supports HTML5."""
@ -523,6 +559,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace) self.assertEqual(namespace, soup.msqrt.namespace)
def test_xml_declaration_becomes_comment(self):
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
soup = self.soup(markup)
self.assertTrue(isinstance(soup.contents[0], Comment))
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
self.assertEqual("html", soup.contents[0].next_element.name)
def skipIf(condition, reason): def skipIf(condition, reason):
def nothing(test, *args, **kwargs): def nothing(test, *args, **kwargs):

View file

@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"<table><thead><tr><td>Foo</td></tr></thead>" "<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>" "<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>") "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual(b"<p>foo</p>", soup.p.encode())

View file

@ -6,8 +6,11 @@ import warnings
try: try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True LXML_PRESENT = True
import lxml.etree
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e: except ImportError, e:
LXML_PRESENT = False LXML_PRESENT = False
LXML_VERSION = (0,)
from bs4 import ( from bs4 import (
BeautifulSoup, BeautifulSoup,
@ -41,6 +44,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals( self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>") "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_beautifulstonesoup_is_xml_parser(self): def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder # Make sure that the deprecated BSS class uses an xml builder
# if one is installed. # if one is installed.
@ -72,4 +86,3 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property @property
def default_builder(self): def default_builder(self):
return LXMLTreeBuilderForXML() return LXMLTreeBuilderForXML()

View file

@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase):
def test_xml_quoting_handles_ampersands(self): def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T") self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual( self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"), self.sub.substitute_xml("&Aacute;T&T"),
"&amp;Aacute;T&amp;T")
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T") "&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self): def test_quotes_not_html_substituted(self):

View file

@ -20,6 +20,7 @@ from bs4.builder import (
) )
from bs4.element import ( from bs4.element import (
CData, CData,
Comment,
Doctype, Doctype,
NavigableString, NavigableString,
SoupStrainer, SoupStrainer,
@ -425,6 +426,7 @@ class TestParentOperations(TreeTest):
def test_find_parent(self): def test_find_parent(self):
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
def test_parent_of_text_element(self): def test_parent_of_text_element(self):
text = self.tree.find(text="Start here") text = self.tree.find(text="Start here")
@ -687,6 +689,12 @@ class TestTagCreation(SoupTest):
self.assertEqual("foo", s) self.assertEqual("foo", s)
self.assertTrue(isinstance(s, NavigableString)) self.assertTrue(isinstance(s, NavigableString))
def test_new_string_can_create_navigablestring_subclass(self):
soup = self.soup("")
s = soup.new_string("foo", Comment)
self.assertEqual("foo", s)
self.assertTrue(isinstance(s, Comment))
class TestTreeModification(SoupTest): class TestTreeModification(SoupTest):
def test_attribute_modification(self): def test_attribute_modification(self):
@ -1048,7 +1056,7 @@ class TestTreeModification(SoupTest):
# clear using decompose() # clear using decompose()
em = a.em em = a.em
a.clear(decompose=True) a.clear(decompose=True)
self.assertFalse(hasattr(em, "contents")) self.assertEqual(0, len(em.contents))
def test_string_set(self): def test_string_set(self):
"""Tag.string = 'string'""" """Tag.string = 'string'"""
@ -1166,6 +1174,19 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ") self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
def test_get_text_ignores_comments(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(soup.get_text(), "foobar")
self.assertEqual(
soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
def test_all_strings_ignores_comments(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
class TestCDAtaListAttributes(SoupTest): class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'. """Testing cdata-list attributes like 'class'.
@ -1310,6 +1331,32 @@ class TestSubstitutions(SoupTest):
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self):
doc = """
<script type="text/javascript">
console.log("< < hey > > ");
</script>
"""
encoded = BeautifulSoup(doc).encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_formatter_skips_style_tag_for_html_documents(self):
doc = """
<style type="text/css">
console.log("< < hey > > ");
</style>
"""
encoded = BeautifulSoup(doc).encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify())
def test_prettify_accepts_formatter(self): def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>") soup = BeautifulSoup("<html><body>foo</body></html>")
pretty = soup.prettify(formatter = lambda x: x.upper()) pretty = soup.prettify(formatter = lambda x: x.upper())
@ -1459,7 +1506,7 @@ class TestSoupSelector(TreeTest):
</head> </head>
<body> <body>
<div id="main"> <div id="main" class="fancy">
<div id="inner"> <div id="inner">
<h1 id="header1">An H1</h1> <h1 id="header1">An H1</h1>
<p>Some text</p> <p>Some text</p>
@ -1531,7 +1578,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0) self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self): def test_invalid_tag(self):
self.assertEqual(len(self.soup.select('tag%t')), 0) self.assertRaises(ValueError, self.soup.select, 'tag%t')
def test_header_tags(self): def test_header_tags(self):
self.assertSelectMultiple( self.assertSelectMultiple(
@ -1564,7 +1611,7 @@ class TestSoupSelector(TreeTest):
for el in els: for el in els:
self.assertEqual(el.name, 'p') self.assertEqual(el.name, 'p')
self.assertEqual(els[1]['class'], ['onep']) self.assertEqual(els[1]['class'], ['onep'])
self.assertFalse(els[0].has_key('class')) self.assertFalse(els[0].has_attr('class'))
def test_a_bunch_of_emptys(self): def test_a_bunch_of_emptys(self):
for selector in ('div#main del', 'div#main div.oops', 'div div#main'): for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
@ -1584,6 +1631,9 @@ class TestSoupSelector(TreeTest):
self.assertSelects('.s1 > a', ['s1a1', 's1a2']) self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
self.assertSelects('.s1 > a span', ['s1a2s1']) self.assertSelects('.s1 > a span', ['s1a2s1'])
def test_child_selector_id(self):
self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self): def test_attribute_equals(self):
self.assertSelectMultiple( self.assertSelectMultiple(
('p[class="onep"]', ['p1']), ('p[class="onep"]', ['p1']),
@ -1690,6 +1740,33 @@ class TestSoupSelector(TreeTest):
('p[blah]', []), ('p[blah]', []),
) )
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another')
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
# Pass in an invalid value.
self.assertRaises(
ValueError, self.soup.select, 'div p:nth-of-type(0)')
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
def test_select_on_element(self): def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element # Other tests operate on the tree; this operates on an element
# within the tree. # within the tree.
@ -1698,3 +1775,26 @@ class TestSoupSelector(TreeTest):
# The <div id="inner"> tag was selected. The <div id="footer"> # The <div id="inner"> tag was selected. The <div id="footer">
# tag was not. # tag was not.
self.assertSelectsIDs(selected, ['inner']) self.assertSelectsIDs(selected, ['inner'])
def test_overspecified_child_id(self):
self.assertSelects(".fancy #inner", ['inner'])
self.assertSelects(".normal #inner", [])
def test_adjacent_sibling_selector(self):
self.assertSelects('#p1 + h2', ['header2'])
self.assertSelects('#p1 + h2 + p', ['pmulti'])
self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
self.assertEqual([], self.soup.select('#p1 + p'))
def test_general_sibling_selector(self):
self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
self.assertSelects('#p1 ~ #header2', ['header2'])
self.assertSelects('#p1 ~ h2 + a', ['me'])
self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
self.assertRaises(ValueError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])