Updated bunded version of BS4

This commit is contained in:
Luke Rogers 2013-07-14 22:06:37 +12:00
parent 5d30398bc1
commit 2182d5a0fd
14 changed files with 832 additions and 159 deletions

43
lib/bs4/AUTHORS.txt Normal file
View File

@ -0,0 +1,43 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen

View File

@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.3"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__version__ = "4.2.1"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
@ -201,9 +201,9 @@ class BeautifulSoup(Tag):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable = subclass(s)
navigable.setup()
return navigable
@ -245,13 +245,15 @@ class BeautifulSoup(Tag):
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o):
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
o.setup(self.currentTag, self.previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
self.currentTag.contents.append(o)
parent = parent or self.currentTag
most_recent_element = most_recent_element or self._most_recent_element
o.setup(parent, most_recent_element)
if most_recent_element is not None:
most_recent_element.next_element = o
self._most_recent_element = o
parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
@ -295,12 +297,12 @@ class BeautifulSoup(Tag):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
self.currentTag, self._most_recent_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
if self._most_recent_element:
self._most_recent_element.next_element = tag
self._most_recent_element = tag
self.pushTag(tag)
return tag
@ -333,6 +335,10 @@ class BeautifulSoup(Tag):
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""

View File

@ -152,7 +152,7 @@ class TreeBuilder(object):
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
if cdata_list_attr in attrs:
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.

View File

@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
self.element.append(node.element)
node.parent = self
self.soup.object_was_parsed(node.element, parent=self.element)
def getAttributes(self):
return AttrList(self.element)

View File

@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
elif data == 'DOCTYPE':
# i.e. "<!DOCTYPE>"
data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)

View File

@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder',
]
from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
@property
def default_parser(self):
# This can either return a parser object or a class, which
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close()
def close(self):
self.nsmaps = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None:
# There are no new namespaces for this tag, but namespaces
# are in play, so we need a separate tag stack to know
# when they end.
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
if self.nsmaps is not None and len(self.nsmaps) > 0:
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name):
self.soup.endData()
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None:
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data):
pass

View File

@ -81,6 +81,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@ -134,6 +136,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign
will become &lt;, the greater-than sign will become &gt;,
and any ampersands will become &amp;. If you want ampersands
that appear to be part of an entity definition to be left
alone, use substitute_xml_containing_entities() instead.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets and ampersands.
value = cls.AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_xml_containing_entities(
cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity defition will
@ -151,6 +175,7 @@ class EntitySubstitution(object):
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
@ -273,7 +298,6 @@ class UnicodeDammit:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None

178
lib/bs4/diagnose.py Normal file
View File

@ -0,0 +1,178 @@
"""Diagnostic functions, mainly for use when doing tech support."""
from StringIO import StringIO
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
import os
import random
import time
import traceback
import sys
import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__
print "Python version %s" % sys.version
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
for builder in builder_registry.builders:
if name in builder.features:
break
else:
basic_parsers.remove(name)
print (
"I noticed that %s is not installed. Installing it may help." %
name)
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
if 'html5lib' in basic_parsers:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "Here's what %s did with the markup:" % parser
print soup.prettify()
print "-" * 80
def lxml_trace(data, html=True):
"""Print out the lxml events that occur during parsing.
This lets you see how lxml parses a document when no Beautiful
Soup code is running.
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html):
print("%s, %4s, %s" % (event, element.tag, element.text))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
def _p(self, s):
print(s)
def handle_starttag(self, name, attrs):
self._p("%s START" % name)
def handle_endtag(self, name):
self._p("%s END" % name)
def handle_data(self, data):
self._p("%s DATA" % data)
def handle_charref(self, name):
self._p("%s CHARREF" % name)
def handle_entityref(self, name):
self._p("%s ENTITYREF" % name)
def handle_comment(self, data):
self._p("%s COMMENT" % data)
def handle_decl(self, data):
self._p("%s DECL" % data)
def unknown_decl(self, data):
self._p("%s UNKNOWN-DECL" % data)
def handle_pi(self, data):
self._p("%s PI" % data)
def htmlparser_trace(data):
"""Print out the HTMLParser events that occur during parsing.
This lets you see how HTMLParser parses a document when no
Beautiful Soup code is running.
"""
parser = AnnouncingParser()
parser.feed(data)
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"
def rword(length=5):
"Generate a random word-like string."
s = ''
for i in range(length):
if i % 2 == 0:
t = _consonants
else:
t = _vowels
s += random.choice(t)
return s
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
elements = []
for i in range(num_elements):
choice = random.randint(0,3)
if choice == 0:
# New tag.
tag_name = random.choice(tag_names)
elements.append("<%s>" % tag_name)
elif choice == 1:
elements.append(rsentence(random.randint(1,4)))
elif choice == 2:
# Close a tag.
tag_name = random.choice(tag_names)
elements.append("</%s>" % tag_name)
return "<html>" + "\n".join(elements) + "</html>"
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
try:
a = time.time()
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
if __name__ == '__main__':
diagnose(sys.stdin.read())

View File

@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None):
if name is None:
obj = unicode.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
obj = unicode.__new__(cls, name)
else:
obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
@ -78,6 +81,40 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
"""Entity substitution rules that are aware of some HTML quirks.
Specifically, the contents of <script> and <style> tags should not
undergo entity substitution.
Incoming NavigableString objects are checked to see if they're the
direct children of a <script> or <style> tag.
"""
cdata_containing_tags = set(["script", "style"])
preformatted_tags = set(["pre"])
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in cls.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return f(ns)
@classmethod
def substitute_html(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_html)
@classmethod
def substitute_xml(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
class PageElement(object):
"""Contains the navigational information for some part of the page
@ -94,25 +131,60 @@ class PageElement(object):
# converted to entities. This is not recommended, but it's
# faster than "minimal".
# A function - This function will be called on every string that
# needs to undergo entity substition
FORMATTERS = {
# needs to undergo entity substitution.
#
# In an HTML document, the default "html" and "minimal" functions
# will leave the contents of <script> and <style> tags alone. For
# an XML document, all tags will be given the same treatment.
HTML_FORMATTERS = {
"html" : HTMLAwareEntitySubstitution.substitute_html,
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
None : None
}
XML_FORMATTERS = {
"html" : EntitySubstitution.substitute_html,
"minimal" : EntitySubstitution.substitute_xml,
None : None
}
@classmethod
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
if not callable(formatter):
formatter = self.FORMATTERS.get(
formatter, EntitySubstitution.substitute_xml)
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
output = formatter(s)
return output
@property
def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree?
This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on
the contents of <script> and <style> tags, or not). It's
inefficient, but it should be called very rarely.
"""
if self.parent is None:
# This is the top-level object. It should have .is_xml set
# from tree creation. If not, take a guess--BS is usually
# used on HTML markup.
return getattr(self, 'is_xml', False)
return self.parent._is_xml
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
return self.XML_FORMATTERS.get(
name, EntitySubstitution.substitute_xml)
else:
return self.HTML_FORMATTERS.get(
name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and
other elements."""
@ -366,7 +438,7 @@ class PageElement(object):
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
l = self.find_parents(name, attrs, 1)
l = self.find_parents(name, attrs, 1, **kwargs)
if l:
r = l[0]
return r
@ -495,6 +567,14 @@ class PageElement(object):
value =" ".join(value)
return value
def _tag_name_matches_and(self, function, tag_name):
if not tag_name:
return function
else:
def _match(tag):
return tag.name == tag_name and function(tag)
return _match
def _attribute_checker(self, operator, attribute, value=''):
"""Create a function that performs a CSS selector operation.
@ -536,87 +616,6 @@ class PageElement(object):
else:
return lambda el: el.has_attr(attribute)
def select(self, selector):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
for index, token in enumerate(tokens):
if tokens[index - 1] == '>':
# already found direct descendants in last step. skip this
# step.
continue
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag, attribute, operator, value = m.groups()
if not tag:
tag = True
checker = self._attribute_checker(operator, attribute, value)
found = []
for context in current_context:
found.extend(
[el for el in context.find_all(tag) if checker(el)])
current_context = found
continue
if '#' in token:
# ID selector
tag, id = token.split('#', 1)
if tag == "":
tag = True
el = current_context[0].find(tag, {'id': id})
if el is None:
return [] # No match
current_context = [el]
continue
if '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
if not tag_name:
tag_name = True
classes = set(klass.split('.'))
found = []
def classes_match(tag):
if tag_name is not True and tag.name != tag_name:
return False
if not tag.has_attr('class'):
return False
return classes.issubset(tag['class'])
for context in current_context:
found.extend(context.find_all(classes_match))
current_context = found
continue
if token == '*':
# Star selector
found = []
for context in current_context:
found.extend(context.findAll(True))
current_context = found
continue
if token == '>':
# Child selector
tag = tokens[index + 1]
if not tag:
tag = True
found = []
for context in current_context:
found.extend(context.find_all(tag, recursive=False))
current_context = found
continue
# Here we should just have a regular tag
if not self.tag_name_re.match(token):
return []
found = []
for context in current_context:
found.extend(context.findAll(token))
current_context = found
return current_context
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@ -652,6 +651,9 @@ class NavigableString(unicode, PageElement):
return unicode.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __copy__(self):
return self
def __getnewargs__(self):
return (unicode(self),)
@ -709,7 +711,7 @@ class Doctype(PreformattedString):
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):
value = name
value = name or ''
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
if system_id is not None:
@ -803,16 +805,24 @@ class Tag(PageElement):
self.clear()
self.append(string.__class__(string))
def _all_strings(self, strip=False):
"""Yield all child strings, possibly stripping them."""
def _all_strings(self, strip=False, types=(NavigableString, CData)):
"""Yield all strings of certain classes, possibly stripping them.
By default, yields only NavigableString and CData objects. So
no comments, processing instructions, etc.
"""
for descendant in self.descendants:
if not isinstance(descendant, NavigableString):
if (
(types is None and not isinstance(descendant, NavigableString))
or
(types is not None and type(descendant) not in types)):
continue
if strip:
descendant = descendant.strip()
if len(descendant) == 0:
continue
yield descendant
strings = property(_all_strings)
@property
@ -820,11 +830,13 @@ class Tag(PageElement):
for string in self._all_strings(True):
yield string
def get_text(self, separator=u"", strip=False):
def get_text(self, separator=u"", strip=False,
types=(NavigableString, CData)):
"""
Get all child strings, concatenated using the given separator.
"""
return separator.join([s for s in self._all_strings(strip)])
return separator.join([s for s in self._all_strings(
strip, types=types)])
getText = get_text
text = property(get_text)
@ -835,6 +847,7 @@ class Tag(PageElement):
while i is not None:
next = i.next_element
i.__dict__.clear()
i.contents = []
i = next
def clear(self, decompose=False):
@ -966,6 +979,13 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
return (
indent_level is not None and
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
or self._is_xml))
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@ -978,6 +998,12 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's
encoding.
"""
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@ -1010,12 +1036,15 @@ class Tag(PageElement):
else:
closeTag = '</%s%s>' % (prefix, self.name)
pretty_print = (indent_level is not None)
pretty_print = self._should_pretty_print(indent_level)
space = ''
indent_space = ''
if indent_level is not None:
indent_space = (' ' * (indent_level - 1))
if pretty_print:
space = (' ' * (indent_level - 1))
space = indent_space
indent_contents = indent_level + 1
else:
space = ''
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter)
@ -1028,8 +1057,10 @@ class Tag(PageElement):
attribute_string = ''
if attrs:
attribute_string = ' ' + ' '.join(attrs)
if pretty_print:
s.append(space)
if indent_level is not None:
# Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close))
if pretty_print:
@ -1040,7 +1071,10 @@ class Tag(PageElement):
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
if pretty_print and closeTag and self.next_sibling:
if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n")
s = ''.join(s)
return s
@ -1063,6 +1097,11 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's
encoding.
"""
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
s = []
for c in self:
@ -1072,13 +1111,13 @@ class Tag(PageElement):
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
if text and indent_level:
if text and indent_level and not self.name == 'pre':
text = text.strip()
if text:
if pretty_print:
if pretty_print and not self.name == 'pre':
s.append(" " * (indent_level - 1))
s.append(text)
if pretty_print:
if pretty_print and not self.name == 'pre':
s.append("\n")
return ''.join(s)
@ -1145,6 +1184,207 @@ class Tag(PageElement):
yield current
current = current.next_element
# CSS selector code
_selector_combinators = ['>', '+', '~']
_select_debug = False
def select(self, selector, _candidate_generator=None):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
current_context = [self]
if tokens[-1] in self._selector_combinators:
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
print 'Running CSS selector "%s"' % selector
for index, token in enumerate(tokens):
if self._select_debug:
print ' Considering token "%s"' % token
recursive_candidate_generator = None
tag_name = None
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
# selector. Candidates are generated by the active
# iterator.
checker = None
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag_name, attribute, operator, value = m.groups()
checker = self._attribute_checker(operator, attribute, value)
elif '#' in token:
# ID selector
tag_name, tag_id = token.split('#', 1)
def id_matches(tag):
return tag.get('id', None) == tag_id
checker = id_matches
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
classes = set(klass.split('.'))
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
elif ':' in token:
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is not None:
pseudo_type, pseudo_value = pseudo_attributes.groups()
if pseudo_type == 'nth-of-type':
try:
pseudo_value = int(pseudo_value)
except:
raise NotImplementedError(
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudo-class value must be at least 1.')
class Counter(object):
def __init__(self, destination):
self.count = 0
self.destination = destination
def nth_child_of_type(self, tag):
self.count += 1
if self.count == self.destination:
return True
if self.count > self.destination:
# Stop the generator that's sending us
# these things.
raise StopIteration()
return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.')
elif token == '*':
# Star selector -- matches everything
pass
elif token == '>':
# Run the next token as a CSS selector against the
# direct children of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.children
elif token == '~':
# Run the next token as a CSS selector against the
# siblings of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.next_siblings
elif token == '+':
# For each tag in the current context, run the next
# token as a CSS selector against the tag's next
# sibling that's a tag.
def next_tag_sibling(tag):
yield tag.find_next_sibling(True)
recursive_candidate_generator = next_tag_sibling
elif self.tag_name_re.match(token):
# Just a tag name.
tag_name = token
else:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator:
# This happens when the selector looks like "> foo".
#
# The generator calls select() recursively on every
# member of the current context, passing in a different
# candidate generator and a different selector.
#
# In the case of "> foo", the candidate generator is
# one that yields a tag's direct children (">"), and
# the selector is "foo".
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
yield i
if self._select_debug:
print '-' * 40
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
# children. If tag_name is defined, only yield tags
# with that name.
if self._select_debug:
if tag_name:
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
# debug log.
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if tag_name and not child.name == tag_name:
continue
yield child
_use_candidate_generator = default_candidate_generator
else:
_use_candidate_generator = lambda tag: tag.descendants
else:
_use_candidate_generator = _candidate_generator
new_context = []
new_context_ids = set([])
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
if tag_name and candidate.name != tag_name:
continue
if checker is not None:
try:
result = checker(candidate)
except StopIteration:
# The checker has decided we should no longer
# run the generator.
break
if checker is None or result:
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context
if self._select_debug:
print "Final verdict:"
for i in current_context:
print " %s %s" % (i.name, i.attrs)
return current_context
# Old names for backwards compatibility
def childGenerator(self):
return self.children
@ -1152,10 +1392,13 @@ class Tag(PageElement):
def recursiveChildGenerator(self):
return self.descendants
# This was kind of misleading because has_key() (attributes) was
# different from __in__ (contents). has_key() is gone in Python 3,
# anyway.
has_key = has_attr
def has_key(self, key):
"""This was kind of misleading because has_key() (attributes)
was different from __in__ (contents). has_key() is gone in
Python 3, anyway."""
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
key))
return self.has_attr(key)
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):

View File

@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
@ -159,6 +164,12 @@ class HTMLTreeBuilderSmokeTest(object):
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
@ -217,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
@ -235,6 +248,12 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
@ -453,6 +472,18 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
</script>
"""
soup = BeautifulSoup(doc, "xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
encoded = soup.encode()
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
@ -495,6 +526,11 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
@ -523,6 +559,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def test_xml_declaration_becomes_comment(self):
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
soup = self.soup(markup)
self.assertTrue(isinstance(soup.contents[0], Comment))
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
self.assertEqual("html", soup.contents[0].next_element.name)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):

View File

@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual(b"<p>foo</p>", soup.p.encode())

View File

@ -6,8 +6,11 @@ import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
import lxml.etree
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e:
LXML_PRESENT = False
LXML_VERSION = (0,)
from bs4 import (
BeautifulSoup,
@ -41,6 +44,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
@ -72,4 +86,3 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return LXMLTreeBuilderForXML()

View File

@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase):
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&amp;Aacute;T&amp;T")
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):

View File

@ -20,6 +20,7 @@ from bs4.builder import (
)
from bs4.element import (
CData,
Comment,
Doctype,
NavigableString,
SoupStrainer,
@ -425,6 +426,7 @@ class TestParentOperations(TreeTest):
def test_find_parent(self):
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
def test_parent_of_text_element(self):
text = self.tree.find(text="Start here")
@ -687,6 +689,12 @@ class TestTagCreation(SoupTest):
self.assertEqual("foo", s)
self.assertTrue(isinstance(s, NavigableString))
def test_new_string_can_create_navigablestring_subclass(self):
soup = self.soup("")
s = soup.new_string("foo", Comment)
self.assertEqual("foo", s)
self.assertTrue(isinstance(s, Comment))
class TestTreeModification(SoupTest):
def test_attribute_modification(self):
@ -1048,7 +1056,7 @@ class TestTreeModification(SoupTest):
# clear using decompose()
em = a.em
a.clear(decompose=True)
self.assertFalse(hasattr(em, "contents"))
self.assertEqual(0, len(em.contents))
def test_string_set(self):
"""Tag.string = 'string'"""
@ -1166,6 +1174,19 @@ class TestElementObjects(SoupTest):
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
def test_get_text_ignores_comments(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(soup.get_text(), "foobar")
self.assertEqual(
soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
self.assertEqual(
soup.get_text(types=None), "fooIGNOREbar")
def test_all_strings_ignores_comments(self):
soup = self.soup("foo<!--IGNORE-->bar")
self.assertEqual(['foo', 'bar'], list(soup.strings))
class TestCDAtaListAttributes(SoupTest):
"""Testing cdata-list attributes like 'class'.
@ -1310,6 +1331,32 @@ class TestSubstitutions(SoupTest):
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self):
doc = """
<script type="text/javascript">
console.log("< < hey > > ");
</script>
"""
encoded = BeautifulSoup(doc).encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_formatter_skips_style_tag_for_html_documents(self):
doc = """
<style type="text/css">
console.log("< < hey > > ");
</style>
"""
encoded = BeautifulSoup(doc).encode()
self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self):
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
# Everything outside the <pre> tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify())
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>")
pretty = soup.prettify(formatter = lambda x: x.upper())
@ -1459,7 +1506,7 @@ class TestSoupSelector(TreeTest):
</head>
<body>
<div id="main">
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
@ -1531,7 +1578,7 @@ class TestSoupSelector(TreeTest):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
self.assertEqual(len(self.soup.select('tag%t')), 0)
self.assertRaises(ValueError, self.soup.select, 'tag%t')
def test_header_tags(self):
self.assertSelectMultiple(
@ -1564,7 +1611,7 @@ class TestSoupSelector(TreeTest):
for el in els:
self.assertEqual(el.name, 'p')
self.assertEqual(els[1]['class'], ['onep'])
self.assertFalse(els[0].has_key('class'))
self.assertFalse(els[0].has_attr('class'))
def test_a_bunch_of_emptys(self):
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
@ -1584,6 +1631,9 @@ class TestSoupSelector(TreeTest):
self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
self.assertSelects('.s1 > a span', ['s1a2s1'])
def test_child_selector_id(self):
self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self):
self.assertSelectMultiple(
('p[class="onep"]', ['p1']),
@ -1690,6 +1740,33 @@ class TestSoupSelector(TreeTest):
('p[blah]', []),
)
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another')
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
# Pass in an invalid value.
self.assertRaises(
ValueError, self.soup.select, 'div p:nth-of-type(0)')
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element
# within the tree.
@ -1698,3 +1775,26 @@ class TestSoupSelector(TreeTest):
# The <div id="inner"> tag was selected. The <div id="footer">
# tag was not.
self.assertSelectsIDs(selected, ['inner'])
def test_overspecified_child_id(self):
self.assertSelects(".fancy #inner", ['inner'])
self.assertSelects(".normal #inner", [])
def test_adjacent_sibling_selector(self):
self.assertSelects('#p1 + h2', ['header2'])
self.assertSelects('#p1 + h2 + p', ['pmulti'])
self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
self.assertEqual([], self.soup.select('#p1 + p'))
def test_general_sibling_selector(self):
self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
self.assertSelects('#p1 ~ #header2', ['header2'])
self.assertSelects('#p1 ~ h2 + a', ['me'])
self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
self.assertEqual([], self.soup.select('#inner ~ h2'))
def test_dangling_combinator(self):
self.assertRaises(ValueError, self.soup.select, 'h1 >')
def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])