Updated bundled version of BS4
This commit is contained in:
parent
5d30398bc1
commit
2182d5a0fd
14 changed files with 832 additions and 159 deletions
43
lib/bs4/AUTHORS.txt
Normal file
43
lib/bs4/AUTHORS.txt
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
Behold, mortal, the origins of Beautiful Soup...
|
||||||
|
================================================
|
||||||
|
|
||||||
|
Leonard Richardson is the primary programmer.
|
||||||
|
|
||||||
|
Aaron DeVore is awesome.
|
||||||
|
|
||||||
|
Mark Pilgrim provided the encoding detection code that forms the base
|
||||||
|
of UnicodeDammit.
|
||||||
|
|
||||||
|
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
|
||||||
|
Soup 4 working under Python 3.
|
||||||
|
|
||||||
|
Simon Willison wrote soupselect, which was used to make Beautiful Soup
|
||||||
|
support CSS selectors.
|
||||||
|
|
||||||
|
Sam Ruby helped with a lot of edge cases.
|
||||||
|
|
||||||
|
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
|
||||||
|
work in solving the nestable tags conundrum.
|
||||||
|
|
||||||
|
An incomplete list of people who have contributed patches to Beautiful
|
||||||
|
Soup:
|
||||||
|
|
||||||
|
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
|
||||||
|
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
|
||||||
|
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
|
||||||
|
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
|
||||||
|
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
|
||||||
|
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
|
||||||
|
Webster, Paul Wright, Danny Yoo
|
||||||
|
|
||||||
|
An incomplete list of people who made suggestions or found bugs or
|
||||||
|
found ways to break Beautiful Soup:
|
||||||
|
|
||||||
|
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
|
||||||
|
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
|
||||||
|
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
|
||||||
|
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
|
||||||
|
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
|
||||||
|
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
|
||||||
|
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
|
||||||
|
Sousa Rocha, Yichun Wei, Per Vognsen
|
|
@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.1.3"
|
__version__ = "4.2.1"
|
||||||
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
@ -201,9 +201,9 @@ class BeautifulSoup(Tag):
|
||||||
"""Create a new tag associated with this soup."""
|
"""Create a new tag associated with this soup."""
|
||||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
||||||
|
|
||||||
def new_string(self, s):
|
def new_string(self, s, subclass=NavigableString):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
"""Create a new NavigableString associated with this soup."""
|
||||||
navigable = NavigableString(s)
|
navigable = subclass(s)
|
||||||
navigable.setup()
|
navigable.setup()
|
||||||
return navigable
|
return navigable
|
||||||
|
|
||||||
|
@ -245,13 +245,15 @@ class BeautifulSoup(Tag):
|
||||||
o = containerClass(currentData)
|
o = containerClass(currentData)
|
||||||
self.object_was_parsed(o)
|
self.object_was_parsed(o)
|
||||||
|
|
||||||
def object_was_parsed(self, o):
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||||
"""Add an object to the parse tree."""
|
"""Add an object to the parse tree."""
|
||||||
o.setup(self.currentTag, self.previous_element)
|
parent = parent or self.currentTag
|
||||||
if self.previous_element:
|
most_recent_element = most_recent_element or self._most_recent_element
|
||||||
self.previous_element.next_element = o
|
o.setup(parent, most_recent_element)
|
||||||
self.previous_element = o
|
if most_recent_element is not None:
|
||||||
self.currentTag.contents.append(o)
|
most_recent_element.next_element = o
|
||||||
|
self._most_recent_element = o
|
||||||
|
parent.contents.append(o)
|
||||||
|
|
||||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
"""Pops the tag stack up to and including the most recent
|
"""Pops the tag stack up to and including the most recent
|
||||||
|
@ -295,12 +297,12 @@ class BeautifulSoup(Tag):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||||
self.currentTag, self.previous_element)
|
self.currentTag, self._most_recent_element)
|
||||||
if tag is None:
|
if tag is None:
|
||||||
return tag
|
return tag
|
||||||
if self.previous_element:
|
if self._most_recent_element:
|
||||||
self.previous_element.next_element = tag
|
self._most_recent_element.next_element = tag
|
||||||
self.previous_element = tag
|
self._most_recent_element = tag
|
||||||
self.pushTag(tag)
|
self.pushTag(tag)
|
||||||
return tag
|
return tag
|
||||||
|
|
||||||
|
@ -333,6 +335,10 @@ class BeautifulSoup(Tag):
|
||||||
return prefix + super(BeautifulSoup, self).decode(
|
return prefix + super(BeautifulSoup, self).decode(
|
||||||
indent_level, eventual_encoding, formatter)
|
indent_level, eventual_encoding, formatter)
|
||||||
|
|
||||||
|
# Alias to make it easier to type import: 'from bs4 import _soup'
|
||||||
|
_s = BeautifulSoup
|
||||||
|
_soup = BeautifulSoup
|
||||||
|
|
||||||
class BeautifulStoneSoup(BeautifulSoup):
|
class BeautifulStoneSoup(BeautifulSoup):
|
||||||
"""Deprecated interface to an XML parser."""
|
"""Deprecated interface to an XML parser."""
|
||||||
|
|
||||||
|
|
|
@ -152,7 +152,7 @@ class TreeBuilder(object):
|
||||||
tag_specific = self.cdata_list_attributes.get(
|
tag_specific = self.cdata_list_attributes.get(
|
||||||
tag_name.lower(), [])
|
tag_name.lower(), [])
|
||||||
for cdata_list_attr in itertools.chain(universal, tag_specific):
|
for cdata_list_attr in itertools.chain(universal, tag_specific):
|
||||||
if cdata_list_attr in dict(attrs):
|
if cdata_list_attr in attrs:
|
||||||
# Basically, we have a "class" attribute whose
|
# Basically, we have a "class" attribute whose
|
||||||
# value is a whitespace-separated list of CSS
|
# value is a whitespace-separated list of CSS
|
||||||
# classes. Split it into a list.
|
# classes. Split it into a list.
|
||||||
|
|
|
@ -131,9 +131,9 @@ class Element(html5lib.treebuilders._base.Node):
|
||||||
old_element = self.element.contents[-1]
|
old_element = self.element.contents[-1]
|
||||||
new_element = self.soup.new_string(old_element + node.element)
|
new_element = self.soup.new_string(old_element + node.element)
|
||||||
old_element.replace_with(new_element)
|
old_element.replace_with(new_element)
|
||||||
|
self.soup._most_recent_element = new_element
|
||||||
else:
|
else:
|
||||||
self.element.append(node.element)
|
self.soup.object_was_parsed(node.element, parent=self.element)
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def getAttributes(self):
|
def getAttributes(self):
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
|
@ -58,6 +58,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
# it's fixed.
|
# it's fixed.
|
||||||
if name.startswith('x'):
|
if name.startswith('x'):
|
||||||
real_name = int(name.lstrip('x'), 16)
|
real_name = int(name.lstrip('x'), 16)
|
||||||
|
elif name.startswith('X'):
|
||||||
|
real_name = int(name.lstrip('X'), 16)
|
||||||
else:
|
else:
|
||||||
real_name = int(name)
|
real_name = int(name)
|
||||||
|
|
||||||
|
@ -85,6 +87,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
if data.startswith("DOCTYPE "):
|
if data.startswith("DOCTYPE "):
|
||||||
data = data[len("DOCTYPE "):]
|
data = data[len("DOCTYPE "):]
|
||||||
|
elif data == 'DOCTYPE':
|
||||||
|
# i.e. "<!DOCTYPE>"
|
||||||
|
data = ''
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
self.soup.endData(Doctype)
|
self.soup.endData(Doctype)
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ __all__ = [
|
||||||
'LXMLTreeBuilder',
|
'LXMLTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
from io import BytesIO
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
import collections
|
import collections
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
@ -28,6 +29,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
CHUNK_SIZE = 512
|
CHUNK_SIZE = 512
|
||||||
|
|
||||||
|
# This namespace mapping is specified in the XML Namespace
|
||||||
|
# standard.
|
||||||
|
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_parser(self):
|
def default_parser(self):
|
||||||
# This can either return a parser object or a class, which
|
# This can either return a parser object or a class, which
|
||||||
|
@ -45,7 +50,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
parser = parser(target=self, strip_cdata=False)
|
parser = parser(target=self, strip_cdata=False)
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
self.soup = None
|
self.soup = None
|
||||||
self.nsmaps = None
|
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||||
|
|
||||||
def _getNsTag(self, tag):
|
def _getNsTag(self, tag):
|
||||||
# Split the namespace URL out of a fully-qualified lxml tag
|
# Split the namespace URL out of a fully-qualified lxml tag
|
||||||
|
@ -71,7 +76,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
dammit.contains_replacement_characters)
|
dammit.contains_replacement_characters)
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
if isinstance(markup, basestring):
|
if isinstance(markup, bytes):
|
||||||
|
markup = BytesIO(markup)
|
||||||
|
elif isinstance(markup, unicode):
|
||||||
markup = StringIO(markup)
|
markup = StringIO(markup)
|
||||||
# Call feed() at least once, even if the markup is empty,
|
# Call feed() at least once, even if the markup is empty,
|
||||||
# or the parser won't be initialized.
|
# or the parser won't be initialized.
|
||||||
|
@ -85,23 +92,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.nsmaps = None
|
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||||
|
|
||||||
def start(self, name, attrs, nsmap={}):
|
def start(self, name, attrs, nsmap={}):
|
||||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
|
|
||||||
nsprefix = None
|
nsprefix = None
|
||||||
# Invert each namespace map as it comes in.
|
# Invert each namespace map as it comes in.
|
||||||
if len(nsmap) == 0 and self.nsmaps != None:
|
if len(self.nsmaps) > 1:
|
||||||
# There are no new namespaces for this tag, but namespaces
|
# There are no new namespaces for this tag, but
|
||||||
# are in play, so we need a separate tag stack to know
|
# non-default namespaces are in play, so we need a
|
||||||
# when they end.
|
# separate tag stack to know when they end.
|
||||||
self.nsmaps.append(None)
|
self.nsmaps.append(None)
|
||||||
elif len(nsmap) > 0:
|
elif len(nsmap) > 0:
|
||||||
# A new namespace mapping has come into play.
|
# A new namespace mapping has come into play.
|
||||||
if self.nsmaps is None:
|
|
||||||
self.nsmaps = []
|
|
||||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||||
self.nsmaps.append(inverted_nsmap)
|
self.nsmaps.append(inverted_nsmap)
|
||||||
# Also treat the namespace mapping as a set of attributes on the
|
# Also treat the namespace mapping as a set of attributes on the
|
||||||
|
@ -112,20 +116,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||||
attrs[attribute] = namespace
|
attrs[attribute] = namespace
|
||||||
|
|
||||||
if self.nsmaps is not None and len(self.nsmaps) > 0:
|
# Namespaces are in play. Find any attributes that came in
|
||||||
# Namespaces are in play. Find any attributes that came in
|
# from lxml with namespaces attached to their names, and
|
||||||
# from lxml with namespaces attached to their names, and
|
# turn then into NamespacedAttribute objects.
|
||||||
# turn then into NamespacedAttribute objects.
|
new_attrs = {}
|
||||||
new_attrs = {}
|
for attr, value in attrs.items():
|
||||||
for attr, value in attrs.items():
|
namespace, attr = self._getNsTag(attr)
|
||||||
namespace, attr = self._getNsTag(attr)
|
if namespace is None:
|
||||||
if namespace is None:
|
new_attrs[attr] = value
|
||||||
new_attrs[attr] = value
|
else:
|
||||||
else:
|
nsprefix = self._prefix_for_namespace(namespace)
|
||||||
nsprefix = self._prefix_for_namespace(namespace)
|
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
new_attrs[attr] = value
|
||||||
new_attrs[attr] = value
|
attrs = new_attrs
|
||||||
attrs = new_attrs
|
|
||||||
|
|
||||||
namespace, name = self._getNsTag(name)
|
namespace, name = self._getNsTag(name)
|
||||||
nsprefix = self._prefix_for_namespace(namespace)
|
nsprefix = self._prefix_for_namespace(namespace)
|
||||||
|
@ -138,6 +141,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
for inverted_nsmap in reversed(self.nsmaps):
|
for inverted_nsmap in reversed(self.nsmaps):
|
||||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||||
return inverted_nsmap[namespace]
|
return inverted_nsmap[namespace]
|
||||||
|
return None
|
||||||
|
|
||||||
def end(self, name):
|
def end(self, name):
|
||||||
self.soup.endData()
|
self.soup.endData()
|
||||||
|
@ -150,14 +154,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
nsprefix = inverted_nsmap[namespace]
|
nsprefix = inverted_nsmap[namespace]
|
||||||
break
|
break
|
||||||
self.soup.handle_endtag(name, nsprefix)
|
self.soup.handle_endtag(name, nsprefix)
|
||||||
if self.nsmaps != None:
|
if len(self.nsmaps) > 1:
|
||||||
# This tag, or one of its parents, introduced a namespace
|
# This tag, or one of its parents, introduced a namespace
|
||||||
# mapping, so pop it off the stack.
|
# mapping, so pop it off the stack.
|
||||||
self.nsmaps.pop()
|
self.nsmaps.pop()
|
||||||
if len(self.nsmaps) == 0:
|
|
||||||
# Namespaces are no longer in play, so don't bother keeping
|
|
||||||
# track of the namespace stack.
|
|
||||||
self.nsmaps = None
|
|
||||||
|
|
||||||
def pi(self, target, data):
|
def pi(self, target, data):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -81,6 +81,8 @@ class EntitySubstitution(object):
|
||||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
||||||
")")
|
")")
|
||||||
|
|
||||||
|
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _substitute_html_entity(cls, matchobj):
|
def _substitute_html_entity(cls, matchobj):
|
||||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||||
|
@ -134,6 +136,28 @@ class EntitySubstitution(object):
|
||||||
def substitute_xml(cls, value, make_quoted_attribute=False):
|
def substitute_xml(cls, value, make_quoted_attribute=False):
|
||||||
"""Substitute XML entities for special XML characters.
|
"""Substitute XML entities for special XML characters.
|
||||||
|
|
||||||
|
:param value: A string to be substituted. The less-than sign
|
||||||
|
will become <, the greater-than sign will become >,
|
||||||
|
and any ampersands will become &. If you want ampersands
|
||||||
|
that appear to be part of an entity definition to be left
|
||||||
|
alone, use substitute_xml_containing_entities() instead.
|
||||||
|
|
||||||
|
:param make_quoted_attribute: If True, then the string will be
|
||||||
|
quoted, as befits an attribute value.
|
||||||
|
"""
|
||||||
|
# Escape angle brackets and ampersands.
|
||||||
|
value = cls.AMPERSAND_OR_BRACKET.sub(
|
||||||
|
cls._substitute_xml_entity, value)
|
||||||
|
|
||||||
|
if make_quoted_attribute:
|
||||||
|
value = cls.quoted_attribute_value(value)
|
||||||
|
return value
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def substitute_xml_containing_entities(
|
||||||
|
cls, value, make_quoted_attribute=False):
|
||||||
|
"""Substitute XML entities for special XML characters.
|
||||||
|
|
||||||
:param value: A string to be substituted. The less-than sign will
|
:param value: A string to be substituted. The less-than sign will
|
||||||
become <, the greater-than sign will become >, and any
|
become <, the greater-than sign will become >, and any
|
||||||
ampersands that are not part of an entity defition will
|
ampersands that are not part of an entity defition will
|
||||||
|
@ -151,6 +175,7 @@ class EntitySubstitution(object):
|
||||||
value = cls.quoted_attribute_value(value)
|
value = cls.quoted_attribute_value(value)
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def substitute_html(cls, s):
|
def substitute_html(cls, s):
|
||||||
"""Replace certain Unicode characters with named HTML entities.
|
"""Replace certain Unicode characters with named HTML entities.
|
||||||
|
@ -273,7 +298,6 @@ class UnicodeDammit:
|
||||||
return None
|
return None
|
||||||
self.tried_encodings.append((proposed, errors))
|
self.tried_encodings.append((proposed, errors))
|
||||||
markup = self.markup
|
markup = self.markup
|
||||||
|
|
||||||
# Convert smart quotes to HTML if coming from an encoding
|
# Convert smart quotes to HTML if coming from an encoding
|
||||||
# that might have them.
|
# that might have them.
|
||||||
if (self.smart_quotes_to is not None
|
if (self.smart_quotes_to is not None
|
||||||
|
|
178
lib/bs4/diagnose.py
Normal file
178
lib/bs4/diagnose.py
Normal file
|
@ -0,0 +1,178 @@
|
||||||
|
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||||
|
from StringIO import StringIO
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
from bs4 import BeautifulSoup, __version__
|
||||||
|
from bs4.builder import builder_registry
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
import sys
|
||||||
|
import cProfile
|
||||||
|
|
||||||
|
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, names each basic
    parser that has no registered builder, then tries to parse the
    markup with every remaining parser, printing either the prettified
    tree or the traceback.

    :param data: The markup to diagnose. This may be an object with a
        ``read()`` method, the path of an existing file, or a string of
        markup. A string that starts with "http:" or "https:" is
        reported as a URL and nothing is parsed.
    """
    # Single-argument print(...) produces the same output whether it is
    # read as a Python 2 print statement or a Python 3 function call.
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Loop over a copy: removing from the list that is being iterated
    # would skip the entry that follows each removed parser.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # lxml is available, so also try its XML mode.
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Use a context manager so the file handle is closed promptly.
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, parser)
        except Exception:
            # Best-effort diagnostics: report the failure and move on to
            # the next parser instead of aborting the whole run.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
|
||||||
|
|
||||||
|
def lxml_trace(data, html=True):
    """Print each event lxml reports while parsing `data`.

    No Beautiful Soup code is involved, so the output shows how lxml
    alone sees the document.

    :param data: The markup, wrapped in a StringIO before parsing.
    :param html: Passed straight through to ``etree.iterparse``.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html)
    for event, element in events:
        line = "%s, %4s, %s" % (event, element.tag, element.text)
        print(line)
|
||||||
|
|
||||||
|
class AnnouncingParser(HTMLParser):
    """HTMLParser subclass that prints one line per parse event.

    Each handler formats its argument with a fixed label and passes the
    result to ``_p``; nothing else is done with the event.
    """

    def _p(self, s):
        """Print a single announcement line."""
        print(s)

    def handle_starttag(self, name, attrs):
        """Announce an opening tag; `attrs` is not used."""
        announcement = "%s START" % name
        self._p(announcement)

    def handle_endtag(self, name):
        """Announce a closing tag."""
        announcement = "%s END" % name
        self._p(announcement)

    def handle_data(self, data):
        """Announce a run of character data."""
        announcement = "%s DATA" % data
        self._p(announcement)

    def handle_charref(self, name):
        """Announce a numeric character reference."""
        announcement = "%s CHARREF" % name
        self._p(announcement)

    def handle_entityref(self, name):
        """Announce a named entity reference."""
        announcement = "%s ENTITYREF" % name
        self._p(announcement)

    def handle_comment(self, data):
        """Announce a comment."""
        announcement = "%s COMMENT" % data
        self._p(announcement)

    def handle_decl(self, data):
        """Announce a declaration."""
        announcement = "%s DECL" % data
        self._p(announcement)

    def unknown_decl(self, data):
        """Announce a declaration the parser did not recognize."""
        announcement = "%s UNKNOWN-DECL" % data
        self._p(announcement)

    def handle_pi(self, data):
        """Announce a processing instruction."""
        announcement = "%s PI" % data
        self._p(announcement)
|
||||||
|
|
||||||
|
def htmlparser_trace(data):
    """Print each event HTMLParser reports while parsing `data`.

    The markup is fed to a fresh AnnouncingParser, so no Beautiful Soup
    code runs and the output shows how HTMLParser alone sees the
    document.
    """
    AnnouncingParser().feed(data)
|
||||||
|
|
||||||
|
# Alphabets used by rword() to build pronounceable nonsense.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Return a random word-like string of `length` letters.

    Letters at even positions are drawn from the consonants and letters
    at odd positions from the vowels, so the word starts with a
    consonant.
    """
    alphabets = (_consonants, _vowels)
    letters = [random.choice(alphabets[position % 2])
               for position in range(length)]
    return "".join(letters)
|
||||||
|
|
||||||
|
def rsentence(length=4):
    """Return `length` random words joined by single spaces.

    Each word comes from rword() with a length chosen uniformly from
    4 to 9 inclusive.
    """
    words = []
    for _ in range(length):
        word_length = random.randint(4,9)
        words.append(rword(word_length))
    return " ".join(words)
|
||||||
|
|
||||||
|
def rdoc(num_elements=1000):
    """Build a random, invalid HTML document as a string.

    For each of `num_elements` rounds a number from 0 to 3 is rolled:
    0 adds an opening tag, 1 adds a random sentence of one to four
    words, 2 adds a closing tag, and 3 adds nothing. The pieces are
    joined with newlines and wrapped in <html>...</html>.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0,3)
        if roll == 1:
            pieces.append(rsentence(random.randint(1,4)))
        elif roll in (0, 2):
            # 0 opens a randomly chosen tag, 2 closes one.
            template = "<%s>" if roll == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
    return "<html>" + "\n".join(pieces) + "</html>"
|
||||||
|
|
||||||
|
def benchmark_parsers(num_elements=100000):
    """Time each parser once on the same random document.

    A document is generated with rdoc(), parsed through Beautiful Soup
    with each parser in turn, and finally parsed by lxml directly. The
    wall-clock time of each run is printed; a parser that raises has
    its traceback printed instead.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            # Keep a reference so the tree is not torn down before the
            # second timestamp is taken.
            soup = BeautifulSoup(data, parser)
            finished = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, finished-started))

    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished-started))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
diagnose(sys.stdin.read())
|
|
@ -26,6 +26,9 @@ class NamespacedAttribute(unicode):
|
||||||
def __new__(cls, prefix, name, namespace=None):
|
def __new__(cls, prefix, name, namespace=None):
|
||||||
if name is None:
|
if name is None:
|
||||||
obj = unicode.__new__(cls, prefix)
|
obj = unicode.__new__(cls, prefix)
|
||||||
|
elif prefix is None:
|
||||||
|
# Not really namespaced.
|
||||||
|
obj = unicode.__new__(cls, name)
|
||||||
else:
|
else:
|
||||||
obj = unicode.__new__(cls, prefix + ":" + name)
|
obj = unicode.__new__(cls, prefix + ":" + name)
|
||||||
obj.prefix = prefix
|
obj.prefix = prefix
|
||||||
|
@ -78,6 +81,40 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
||||||
return match.group(1) + encoding
|
return match.group(1) + encoding
|
||||||
return self.CHARSET_RE.sub(rewrite, self.original_value)
|
return self.CHARSET_RE.sub(rewrite, self.original_value)
|
||||||
|
|
||||||
|
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution that leaves <script> and <style> text alone.

    A NavigableString whose direct parent is named in
    ``cdata_containing_tags`` is returned unchanged; any other value is
    handed to the corresponding EntitySubstitution function.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return `ns` untouched if it sits directly inside a
        CDATA-containing tag, otherwise return ``f(ns)``."""
        if not isinstance(ns, NavigableString):
            return f(ns)
        container = ns.parent
        if container is None:
            return f(ns)
        if container.name not in cls.cdata_containing_tags:
            return f(ns)
        # Direct child of <script> or <style>: do not substitute.
        return ns

    @classmethod
    def substitute_html(cls, ns):
        """Apply EntitySubstitution.substitute_html where appropriate."""
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        """Apply EntitySubstitution.substitute_xml where appropriate."""
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
|
||||||
|
|
||||||
class PageElement(object):
|
class PageElement(object):
|
||||||
"""Contains the navigational information for some part of the page
|
"""Contains the navigational information for some part of the page
|
||||||
|
@ -94,25 +131,60 @@ class PageElement(object):
|
||||||
# converted to entities. This is not recommended, but it's
|
# converted to entities. This is not recommended, but it's
|
||||||
# faster than "minimal".
|
# faster than "minimal".
|
||||||
# A function - This function will be called on every string that
|
# A function - This function will be called on every string that
|
||||||
# needs to undergo entity substition
|
# needs to undergo entity substitution.
|
||||||
FORMATTERS = {
|
#
|
||||||
|
|
||||||
|
# In an HTML document, the default "html" and "minimal" functions
|
||||||
|
# will leave the contents of <script> and <style> tags alone. For
|
||||||
|
# an XML document, all tags will be given the same treatment.
|
||||||
|
|
||||||
|
HTML_FORMATTERS = {
|
||||||
|
"html" : HTMLAwareEntitySubstitution.substitute_html,
|
||||||
|
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
|
||||||
|
None : None
|
||||||
|
}
|
||||||
|
|
||||||
|
XML_FORMATTERS = {
|
||||||
"html" : EntitySubstitution.substitute_html,
|
"html" : EntitySubstitution.substitute_html,
|
||||||
"minimal" : EntitySubstitution.substitute_xml,
|
"minimal" : EntitySubstitution.substitute_xml,
|
||||||
None : None
|
None : None
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def format_string(self, s, formatter='minimal'):
|
def format_string(self, s, formatter='minimal'):
|
||||||
"""Format the given string using the given formatter."""
|
"""Format the given string using the given formatter."""
|
||||||
if not callable(formatter):
|
if not callable(formatter):
|
||||||
formatter = self.FORMATTERS.get(
|
formatter = self._formatter_for_name(formatter)
|
||||||
formatter, EntitySubstitution.substitute_xml)
|
|
||||||
if formatter is None:
|
if formatter is None:
|
||||||
output = s
|
output = s
|
||||||
else:
|
else:
|
||||||
output = formatter(s)
|
output = formatter(s)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _is_xml(self):
|
||||||
|
"""Is this element part of an XML tree or an HTML tree?
|
||||||
|
|
||||||
|
This is used when mapping a formatter name ("minimal") to an
|
||||||
|
appropriate function (one that performs entity-substitution on
|
||||||
|
the contents of <script> and <style> tags, or not). It's
|
||||||
|
inefficient, but it should be called very rarely.
|
||||||
|
"""
|
||||||
|
if self.parent is None:
|
||||||
|
# This is the top-level object. It should have .is_xml set
|
||||||
|
# from tree creation. If not, take a guess--BS is usually
|
||||||
|
# used on HTML markup.
|
||||||
|
return getattr(self, 'is_xml', False)
|
||||||
|
return self.parent._is_xml
|
||||||
|
|
||||||
|
def _formatter_for_name(self, name):
    """Return the formatter function registered under `name`.

    XML trees use XML_FORMATTERS and HTML trees use HTML_FORMATTERS.
    An unknown name falls back to the matching ``substitute_xml``.
    """
    if self._is_xml:
        table = self.XML_FORMATTERS
        fallback = EntitySubstitution.substitute_xml
    else:
        table = self.HTML_FORMATTERS
        fallback = HTMLAwareEntitySubstitution.substitute_xml
    return table.get(name, fallback)
|
||||||
|
|
||||||
def setup(self, parent=None, previous_element=None):
|
def setup(self, parent=None, previous_element=None):
|
||||||
"""Sets up the initial relations between this element and
|
"""Sets up the initial relations between this element and
|
||||||
other elements."""
|
other elements."""
|
||||||
|
@ -366,7 +438,7 @@ class PageElement(object):
|
||||||
# NOTE: We can't use _find_one because findParents takes a different
|
# NOTE: We can't use _find_one because findParents takes a different
|
||||||
# set of arguments.
|
# set of arguments.
|
||||||
r = None
|
r = None
|
||||||
l = self.find_parents(name, attrs, 1)
|
l = self.find_parents(name, attrs, 1, **kwargs)
|
||||||
if l:
|
if l:
|
||||||
r = l[0]
|
r = l[0]
|
||||||
return r
|
return r
|
||||||
|
@ -495,6 +567,14 @@ class PageElement(object):
|
||||||
value =" ".join(value)
|
value =" ".join(value)
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
def _tag_name_matches_and(self, function, tag_name):
|
||||||
|
if not tag_name:
|
||||||
|
return function
|
||||||
|
else:
|
||||||
|
def _match(tag):
|
||||||
|
return tag.name == tag_name and function(tag)
|
||||||
|
return _match
|
||||||
|
|
||||||
def _attribute_checker(self, operator, attribute, value=''):
|
def _attribute_checker(self, operator, attribute, value=''):
|
||||||
"""Create a function that performs a CSS selector operation.
|
"""Create a function that performs a CSS selector operation.
|
||||||
|
|
||||||
|
@ -536,87 +616,6 @@ class PageElement(object):
|
||||||
else:
|
else:
|
||||||
return lambda el: el.has_attr(attribute)
|
return lambda el: el.has_attr(attribute)
|
||||||
|
|
||||||
def select(self, selector):
|
|
||||||
"""Perform a CSS selection operation on the current element."""
|
|
||||||
tokens = selector.split()
|
|
||||||
current_context = [self]
|
|
||||||
for index, token in enumerate(tokens):
|
|
||||||
if tokens[index - 1] == '>':
|
|
||||||
# already found direct descendants in last step. skip this
|
|
||||||
# step.
|
|
||||||
continue
|
|
||||||
m = self.attribselect_re.match(token)
|
|
||||||
if m is not None:
|
|
||||||
# Attribute selector
|
|
||||||
tag, attribute, operator, value = m.groups()
|
|
||||||
if not tag:
|
|
||||||
tag = True
|
|
||||||
checker = self._attribute_checker(operator, attribute, value)
|
|
||||||
found = []
|
|
||||||
for context in current_context:
|
|
||||||
found.extend(
|
|
||||||
[el for el in context.find_all(tag) if checker(el)])
|
|
||||||
current_context = found
|
|
||||||
continue
|
|
||||||
|
|
||||||
if '#' in token:
|
|
||||||
# ID selector
|
|
||||||
tag, id = token.split('#', 1)
|
|
||||||
if tag == "":
|
|
||||||
tag = True
|
|
||||||
el = current_context[0].find(tag, {'id': id})
|
|
||||||
if el is None:
|
|
||||||
return [] # No match
|
|
||||||
current_context = [el]
|
|
||||||
continue
|
|
||||||
|
|
||||||
if '.' in token:
|
|
||||||
# Class selector
|
|
||||||
tag_name, klass = token.split('.', 1)
|
|
||||||
if not tag_name:
|
|
||||||
tag_name = True
|
|
||||||
classes = set(klass.split('.'))
|
|
||||||
found = []
|
|
||||||
def classes_match(tag):
|
|
||||||
if tag_name is not True and tag.name != tag_name:
|
|
||||||
return False
|
|
||||||
if not tag.has_attr('class'):
|
|
||||||
return False
|
|
||||||
return classes.issubset(tag['class'])
|
|
||||||
for context in current_context:
|
|
||||||
found.extend(context.find_all(classes_match))
|
|
||||||
current_context = found
|
|
||||||
continue
|
|
||||||
|
|
||||||
if token == '*':
|
|
||||||
# Star selector
|
|
||||||
found = []
|
|
||||||
for context in current_context:
|
|
||||||
found.extend(context.findAll(True))
|
|
||||||
current_context = found
|
|
||||||
continue
|
|
||||||
|
|
||||||
if token == '>':
|
|
||||||
# Child selector
|
|
||||||
tag = tokens[index + 1]
|
|
||||||
if not tag:
|
|
||||||
tag = True
|
|
||||||
|
|
||||||
found = []
|
|
||||||
for context in current_context:
|
|
||||||
found.extend(context.find_all(tag, recursive=False))
|
|
||||||
current_context = found
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Here we should just have a regular tag
|
|
||||||
if not self.tag_name_re.match(token):
|
|
||||||
return []
|
|
||||||
found = []
|
|
||||||
for context in current_context:
|
|
||||||
found.extend(context.findAll(token))
|
|
||||||
current_context = found
|
|
||||||
return current_context
|
|
||||||
|
|
||||||
# Old non-property versions of the generators, for backwards
|
# Old non-property versions of the generators, for backwards
|
||||||
# compatibility with BS3.
|
# compatibility with BS3.
|
||||||
def nextGenerator(self):
|
def nextGenerator(self):
|
||||||
|
@ -652,6 +651,9 @@ class NavigableString(unicode, PageElement):
|
||||||
return unicode.__new__(cls, value)
|
return unicode.__new__(cls, value)
|
||||||
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||||
|
|
||||||
|
def __copy__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
def __getnewargs__(self):
|
def __getnewargs__(self):
|
||||||
return (unicode(self),)
|
return (unicode(self),)
|
||||||
|
|
||||||
|
@ -709,7 +711,7 @@ class Doctype(PreformattedString):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def for_name_and_ids(cls, name, pub_id, system_id):
|
def for_name_and_ids(cls, name, pub_id, system_id):
|
||||||
value = name
|
value = name or ''
|
||||||
if pub_id is not None:
|
if pub_id is not None:
|
||||||
value += ' PUBLIC "%s"' % pub_id
|
value += ' PUBLIC "%s"' % pub_id
|
||||||
if system_id is not None:
|
if system_id is not None:
|
||||||
|
@ -803,16 +805,24 @@ class Tag(PageElement):
|
||||||
self.clear()
|
self.clear()
|
||||||
self.append(string.__class__(string))
|
self.append(string.__class__(string))
|
||||||
|
|
||||||
def _all_strings(self, strip=False):
|
def _all_strings(self, strip=False, types=(NavigableString, CData)):
|
||||||
"""Yield all child strings, possibly stripping them."""
|
"""Yield all strings of certain classes, possibly stripping them.
|
||||||
|
|
||||||
|
By default, yields only NavigableString and CData objects. So
|
||||||
|
no comments, processing instructions, etc.
|
||||||
|
"""
|
||||||
for descendant in self.descendants:
|
for descendant in self.descendants:
|
||||||
if not isinstance(descendant, NavigableString):
|
if (
|
||||||
|
(types is None and not isinstance(descendant, NavigableString))
|
||||||
|
or
|
||||||
|
(types is not None and type(descendant) not in types)):
|
||||||
continue
|
continue
|
||||||
if strip:
|
if strip:
|
||||||
descendant = descendant.strip()
|
descendant = descendant.strip()
|
||||||
if len(descendant) == 0:
|
if len(descendant) == 0:
|
||||||
continue
|
continue
|
||||||
yield descendant
|
yield descendant
|
||||||
|
|
||||||
strings = property(_all_strings)
|
strings = property(_all_strings)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -820,11 +830,13 @@ class Tag(PageElement):
|
||||||
for string in self._all_strings(True):
|
for string in self._all_strings(True):
|
||||||
yield string
|
yield string
|
||||||
|
|
||||||
def get_text(self, separator=u"", strip=False):
|
def get_text(self, separator=u"", strip=False,
|
||||||
|
types=(NavigableString, CData)):
|
||||||
"""
|
"""
|
||||||
Get all child strings, concatenated using the given separator.
|
Get all child strings, concatenated using the given separator.
|
||||||
"""
|
"""
|
||||||
return separator.join([s for s in self._all_strings(strip)])
|
return separator.join([s for s in self._all_strings(
|
||||||
|
strip, types=types)])
|
||||||
getText = get_text
|
getText = get_text
|
||||||
text = property(get_text)
|
text = property(get_text)
|
||||||
|
|
||||||
|
@ -835,6 +847,7 @@ class Tag(PageElement):
|
||||||
while i is not None:
|
while i is not None:
|
||||||
next = i.next_element
|
next = i.next_element
|
||||||
i.__dict__.clear()
|
i.__dict__.clear()
|
||||||
|
i.contents = []
|
||||||
i = next
|
i = next
|
||||||
|
|
||||||
def clear(self, decompose=False):
|
def clear(self, decompose=False):
|
||||||
|
@ -966,6 +979,13 @@ class Tag(PageElement):
|
||||||
u = self.decode(indent_level, encoding, formatter)
|
u = self.decode(indent_level, encoding, formatter)
|
||||||
return u.encode(encoding, errors)
|
return u.encode(encoding, errors)
|
||||||
|
|
||||||
|
def _should_pretty_print(self, indent_level):
|
||||||
|
"""Should this tag be pretty-printed?"""
|
||||||
|
return (
|
||||||
|
indent_level is not None and
|
||||||
|
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
|
||||||
|
or self._is_xml))
|
||||||
|
|
||||||
def decode(self, indent_level=None,
|
def decode(self, indent_level=None,
|
||||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||||
formatter="minimal"):
|
formatter="minimal"):
|
||||||
|
@ -978,6 +998,12 @@ class Tag(PageElement):
|
||||||
document contains a <META> tag that mentions the document's
|
document contains a <META> tag that mentions the document's
|
||||||
encoding.
|
encoding.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# First off, turn a string formatter into a function. This
|
||||||
|
# will stop the lookup from happening over and over again.
|
||||||
|
if not callable(formatter):
|
||||||
|
formatter = self._formatter_for_name(formatter)
|
||||||
|
|
||||||
attrs = []
|
attrs = []
|
||||||
if self.attrs:
|
if self.attrs:
|
||||||
for key, val in sorted(self.attrs.items()):
|
for key, val in sorted(self.attrs.items()):
|
||||||
|
@ -1010,12 +1036,15 @@ class Tag(PageElement):
|
||||||
else:
|
else:
|
||||||
closeTag = '</%s%s>' % (prefix, self.name)
|
closeTag = '</%s%s>' % (prefix, self.name)
|
||||||
|
|
||||||
pretty_print = (indent_level is not None)
|
pretty_print = self._should_pretty_print(indent_level)
|
||||||
|
space = ''
|
||||||
|
indent_space = ''
|
||||||
|
if indent_level is not None:
|
||||||
|
indent_space = (' ' * (indent_level - 1))
|
||||||
if pretty_print:
|
if pretty_print:
|
||||||
space = (' ' * (indent_level - 1))
|
space = indent_space
|
||||||
indent_contents = indent_level + 1
|
indent_contents = indent_level + 1
|
||||||
else:
|
else:
|
||||||
space = ''
|
|
||||||
indent_contents = None
|
indent_contents = None
|
||||||
contents = self.decode_contents(
|
contents = self.decode_contents(
|
||||||
indent_contents, eventual_encoding, formatter)
|
indent_contents, eventual_encoding, formatter)
|
||||||
|
@ -1028,8 +1057,10 @@ class Tag(PageElement):
|
||||||
attribute_string = ''
|
attribute_string = ''
|
||||||
if attrs:
|
if attrs:
|
||||||
attribute_string = ' ' + ' '.join(attrs)
|
attribute_string = ' ' + ' '.join(attrs)
|
||||||
if pretty_print:
|
if indent_level is not None:
|
||||||
s.append(space)
|
# Even if this particular tag is not pretty-printed,
|
||||||
|
# we should indent up to the start of the tag.
|
||||||
|
s.append(indent_space)
|
||||||
s.append('<%s%s%s%s>' % (
|
s.append('<%s%s%s%s>' % (
|
||||||
prefix, self.name, attribute_string, close))
|
prefix, self.name, attribute_string, close))
|
||||||
if pretty_print:
|
if pretty_print:
|
||||||
|
@ -1040,7 +1071,10 @@ class Tag(PageElement):
|
||||||
if pretty_print and closeTag:
|
if pretty_print and closeTag:
|
||||||
s.append(space)
|
s.append(space)
|
||||||
s.append(closeTag)
|
s.append(closeTag)
|
||||||
if pretty_print and closeTag and self.next_sibling:
|
if indent_level is not None and closeTag and self.next_sibling:
|
||||||
|
# Even if this particular tag is not pretty-printed,
|
||||||
|
# we're now done with the tag, and we should add a
|
||||||
|
# newline if appropriate.
|
||||||
s.append("\n")
|
s.append("\n")
|
||||||
s = ''.join(s)
|
s = ''.join(s)
|
||||||
return s
|
return s
|
||||||
|
@ -1063,6 +1097,11 @@ class Tag(PageElement):
|
||||||
document contains a <META> tag that mentions the document's
|
document contains a <META> tag that mentions the document's
|
||||||
encoding.
|
encoding.
|
||||||
"""
|
"""
|
||||||
|
# First off, turn a string formatter into a function. This
|
||||||
|
# will stop the lookup from happening over and over again.
|
||||||
|
if not callable(formatter):
|
||||||
|
formatter = self._formatter_for_name(formatter)
|
||||||
|
|
||||||
pretty_print = (indent_level is not None)
|
pretty_print = (indent_level is not None)
|
||||||
s = []
|
s = []
|
||||||
for c in self:
|
for c in self:
|
||||||
|
@ -1072,13 +1111,13 @@ class Tag(PageElement):
|
||||||
elif isinstance(c, Tag):
|
elif isinstance(c, Tag):
|
||||||
s.append(c.decode(indent_level, eventual_encoding,
|
s.append(c.decode(indent_level, eventual_encoding,
|
||||||
formatter))
|
formatter))
|
||||||
if text and indent_level:
|
if text and indent_level and not self.name == 'pre':
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
if text:
|
if text:
|
||||||
if pretty_print:
|
if pretty_print and not self.name == 'pre':
|
||||||
s.append(" " * (indent_level - 1))
|
s.append(" " * (indent_level - 1))
|
||||||
s.append(text)
|
s.append(text)
|
||||||
if pretty_print:
|
if pretty_print and not self.name == 'pre':
|
||||||
s.append("\n")
|
s.append("\n")
|
||||||
return ''.join(s)
|
return ''.join(s)
|
||||||
|
|
||||||
|
@ -1145,6 +1184,207 @@ class Tag(PageElement):
|
||||||
yield current
|
yield current
|
||||||
current = current.next_element
|
current = current.next_element
|
||||||
|
|
||||||
|
# CSS selector code
|
||||||
|
|
||||||
|
_selector_combinators = ['>', '+', '~']
|
||||||
|
_select_debug = False
|
||||||
|
def select(self, selector, _candidate_generator=None):
|
||||||
|
"""Perform a CSS selection operation on the current element."""
|
||||||
|
tokens = selector.split()
|
||||||
|
current_context = [self]
|
||||||
|
|
||||||
|
if tokens[-1] in self._selector_combinators:
|
||||||
|
raise ValueError(
|
||||||
|
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
||||||
|
if self._select_debug:
|
||||||
|
print 'Running CSS selector "%s"' % selector
|
||||||
|
for index, token in enumerate(tokens):
|
||||||
|
if self._select_debug:
|
||||||
|
print ' Considering token "%s"' % token
|
||||||
|
recursive_candidate_generator = None
|
||||||
|
tag_name = None
|
||||||
|
if tokens[index-1] in self._selector_combinators:
|
||||||
|
# This token was consumed by the previous combinator. Skip it.
|
||||||
|
if self._select_debug:
|
||||||
|
print ' Token was consumed by the previous combinator.'
|
||||||
|
continue
|
||||||
|
# Each operation corresponds to a checker function, a rule
|
||||||
|
# for determining whether a candidate matches the
|
||||||
|
# selector. Candidates are generated by the active
|
||||||
|
# iterator.
|
||||||
|
checker = None
|
||||||
|
|
||||||
|
m = self.attribselect_re.match(token)
|
||||||
|
if m is not None:
|
||||||
|
# Attribute selector
|
||||||
|
tag_name, attribute, operator, value = m.groups()
|
||||||
|
checker = self._attribute_checker(operator, attribute, value)
|
||||||
|
|
||||||
|
elif '#' in token:
|
||||||
|
# ID selector
|
||||||
|
tag_name, tag_id = token.split('#', 1)
|
||||||
|
def id_matches(tag):
|
||||||
|
return tag.get('id', None) == tag_id
|
||||||
|
checker = id_matches
|
||||||
|
|
||||||
|
elif '.' in token:
|
||||||
|
# Class selector
|
||||||
|
tag_name, klass = token.split('.', 1)
|
||||||
|
classes = set(klass.split('.'))
|
||||||
|
def classes_match(candidate):
|
||||||
|
return classes.issubset(candidate.get('class', []))
|
||||||
|
checker = classes_match
|
||||||
|
|
||||||
|
elif ':' in token:
|
||||||
|
# Pseudo-class
|
||||||
|
tag_name, pseudo = token.split(':', 1)
|
||||||
|
if tag_name == '':
|
||||||
|
raise ValueError(
|
||||||
|
"A pseudo-class must be prefixed with a tag name.")
|
||||||
|
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||||
|
found = []
|
||||||
|
if pseudo_attributes is not None:
|
||||||
|
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||||
|
if pseudo_type == 'nth-of-type':
|
||||||
|
try:
|
||||||
|
pseudo_value = int(pseudo_value)
|
||||||
|
except:
|
||||||
|
raise NotImplementedError(
|
||||||
|
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
|
||||||
|
if pseudo_value < 1:
|
||||||
|
raise ValueError(
|
||||||
|
'nth-of-type pseudo-class value must be at least 1.')
|
||||||
|
class Counter(object):
|
||||||
|
def __init__(self, destination):
|
||||||
|
self.count = 0
|
||||||
|
self.destination = destination
|
||||||
|
|
||||||
|
def nth_child_of_type(self, tag):
|
||||||
|
self.count += 1
|
||||||
|
if self.count == self.destination:
|
||||||
|
return True
|
||||||
|
if self.count > self.destination:
|
||||||
|
# Stop the generator that's sending us
|
||||||
|
# these things.
|
||||||
|
raise StopIteration()
|
||||||
|
return False
|
||||||
|
checker = Counter(pseudo_value).nth_child_of_type
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(
|
||||||
|
'Only the following pseudo-classes are implemented: nth-of-type.')
|
||||||
|
|
||||||
|
elif token == '*':
|
||||||
|
# Star selector -- matches everything
|
||||||
|
pass
|
||||||
|
elif token == '>':
|
||||||
|
# Run the next token as a CSS selector against the
|
||||||
|
# direct children of each tag in the current context.
|
||||||
|
recursive_candidate_generator = lambda tag: tag.children
|
||||||
|
elif token == '~':
|
||||||
|
# Run the next token as a CSS selector against the
|
||||||
|
# siblings of each tag in the current context.
|
||||||
|
recursive_candidate_generator = lambda tag: tag.next_siblings
|
||||||
|
elif token == '+':
|
||||||
|
# For each tag in the current context, run the next
|
||||||
|
# token as a CSS selector against the tag's next
|
||||||
|
# sibling that's a tag.
|
||||||
|
def next_tag_sibling(tag):
|
||||||
|
yield tag.find_next_sibling(True)
|
||||||
|
recursive_candidate_generator = next_tag_sibling
|
||||||
|
|
||||||
|
elif self.tag_name_re.match(token):
|
||||||
|
# Just a tag name.
|
||||||
|
tag_name = token
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
'Unsupported or invalid CSS selector: "%s"' % token)
|
||||||
|
|
||||||
|
if recursive_candidate_generator:
|
||||||
|
# This happens when the selector looks like "> foo".
|
||||||
|
#
|
||||||
|
# The generator calls select() recursively on every
|
||||||
|
# member of the current context, passing in a different
|
||||||
|
# candidate generator and a different selector.
|
||||||
|
#
|
||||||
|
# In the case of "> foo", the candidate generator is
|
||||||
|
# one that yields a tag's direct children (">"), and
|
||||||
|
# the selector is "foo".
|
||||||
|
next_token = tokens[index+1]
|
||||||
|
def recursive_select(tag):
|
||||||
|
if self._select_debug:
|
||||||
|
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||||
|
print '-' * 40
|
||||||
|
for i in tag.select(next_token, recursive_candidate_generator):
|
||||||
|
if self._select_debug:
|
||||||
|
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||||
|
yield i
|
||||||
|
if self._select_debug:
|
||||||
|
print '-' * 40
|
||||||
|
_use_candidate_generator = recursive_select
|
||||||
|
elif _candidate_generator is None:
|
||||||
|
# By default, a tag's candidates are all of its
|
||||||
|
# children. If tag_name is defined, only yield tags
|
||||||
|
# with that name.
|
||||||
|
if self._select_debug:
|
||||||
|
if tag_name:
|
||||||
|
check = "[any]"
|
||||||
|
else:
|
||||||
|
check = tag_name
|
||||||
|
print ' Default candidate generator, tag name="%s"' % check
|
||||||
|
if self._select_debug:
|
||||||
|
# This is redundant with later code, but it stops
|
||||||
|
# a bunch of bogus tags from cluttering up the
|
||||||
|
# debug log.
|
||||||
|
def default_candidate_generator(tag):
|
||||||
|
for child in tag.descendants:
|
||||||
|
if not isinstance(child, Tag):
|
||||||
|
continue
|
||||||
|
if tag_name and not child.name == tag_name:
|
||||||
|
continue
|
||||||
|
yield child
|
||||||
|
_use_candidate_generator = default_candidate_generator
|
||||||
|
else:
|
||||||
|
_use_candidate_generator = lambda tag: tag.descendants
|
||||||
|
else:
|
||||||
|
_use_candidate_generator = _candidate_generator
|
||||||
|
|
||||||
|
new_context = []
|
||||||
|
new_context_ids = set([])
|
||||||
|
for tag in current_context:
|
||||||
|
if self._select_debug:
|
||||||
|
print " Running candidate generator on %s %s" % (
|
||||||
|
tag.name, repr(tag.attrs))
|
||||||
|
for candidate in _use_candidate_generator(tag):
|
||||||
|
if not isinstance(candidate, Tag):
|
||||||
|
continue
|
||||||
|
if tag_name and candidate.name != tag_name:
|
||||||
|
continue
|
||||||
|
if checker is not None:
|
||||||
|
try:
|
||||||
|
result = checker(candidate)
|
||||||
|
except StopIteration:
|
||||||
|
# The checker has decided we should no longer
|
||||||
|
# run the generator.
|
||||||
|
break
|
||||||
|
if checker is None or result:
|
||||||
|
if self._select_debug:
|
||||||
|
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||||
|
if id(candidate) not in new_context_ids:
|
||||||
|
# If a tag matches a selector more than once,
|
||||||
|
# don't include it in the context more than once.
|
||||||
|
new_context.append(candidate)
|
||||||
|
new_context_ids.add(id(candidate))
|
||||||
|
elif self._select_debug:
|
||||||
|
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||||
|
|
||||||
|
current_context = new_context
|
||||||
|
|
||||||
|
if self._select_debug:
|
||||||
|
print "Final verdict:"
|
||||||
|
for i in current_context:
|
||||||
|
print " %s %s" % (i.name, i.attrs)
|
||||||
|
return current_context
|
||||||
|
|
||||||
# Old names for backwards compatibility
|
# Old names for backwards compatibility
|
||||||
def childGenerator(self):
|
def childGenerator(self):
|
||||||
return self.children
|
return self.children
|
||||||
|
@ -1152,10 +1392,13 @@ class Tag(PageElement):
|
||||||
def recursiveChildGenerator(self):
|
def recursiveChildGenerator(self):
|
||||||
return self.descendants
|
return self.descendants
|
||||||
|
|
||||||
# This was kind of misleading because has_key() (attributes) was
|
def has_key(self, key):
|
||||||
# different from __in__ (contents). has_key() is gone in Python 3,
|
"""This was kind of misleading because has_key() (attributes)
|
||||||
# anyway.
|
was different from __in__ (contents). has_key() is gone in
|
||||||
has_key = has_attr
|
Python 3, anyway."""
|
||||||
|
warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
|
||||||
|
key))
|
||||||
|
return self.has_attr(key)
|
||||||
|
|
||||||
# Next, a couple classes to represent queries and their results.
|
# Next, a couple classes to represent queries and their results.
|
||||||
class SoupStrainer(object):
|
class SoupStrainer(object):
|
||||||
|
|
|
@ -81,6 +81,11 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
self.assertDoctypeHandled(
|
self.assertDoctypeHandled(
|
||||||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||||
|
|
||||||
|
def test_empty_doctype(self):
|
||||||
|
soup = self.soup("<!DOCTYPE>")
|
||||||
|
doctype = soup.contents[0]
|
||||||
|
self.assertEqual("", doctype.strip())
|
||||||
|
|
||||||
def test_public_doctype_with_url(self):
|
def test_public_doctype_with_url(self):
|
||||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||||
self.assertDoctypeHandled(doctype)
|
self.assertDoctypeHandled(doctype)
|
||||||
|
@ -159,6 +164,12 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
comment = soup.find(text="foobar")
|
comment = soup.find(text="foobar")
|
||||||
self.assertEqual(comment.__class__, Comment)
|
self.assertEqual(comment.__class__, Comment)
|
||||||
|
|
||||||
|
# The comment is properly integrated into the tree.
|
||||||
|
foo = soup.find(text="foo")
|
||||||
|
self.assertEqual(comment, foo.next_element)
|
||||||
|
baz = soup.find(text="baz")
|
||||||
|
self.assertEqual(comment, baz.previous_element)
|
||||||
|
|
||||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||||
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
"""Whitespace must be preserved in <pre> and <textarea> tags."""
|
||||||
self.assertSoupEquals("<pre> </pre>")
|
self.assertSoupEquals("<pre> </pre>")
|
||||||
|
@ -217,12 +228,14 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
|
||||||
def test_entities_in_text_converted_to_unicode(self):
|
def test_entities_in_text_converted_to_unicode(self):
|
||||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
|
||||||
def test_quot_entity_converted_to_quotation_mark(self):
|
def test_quot_entity_converted_to_quotation_mark(self):
|
||||||
|
@ -235,6 +248,12 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
|
|
||||||
|
def test_multipart_strings(self):
|
||||||
|
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
||||||
|
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||||
|
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||||
|
self.assertEqual("p", soup.p.name)
|
||||||
|
|
||||||
def test_basic_namespaces(self):
|
def test_basic_namespaces(self):
|
||||||
"""Parsers don't need to *understand* namespaces, but at the
|
"""Parsers don't need to *understand* namespaces, but at the
|
||||||
very least they should not choke on namespaces or lose
|
very least they should not choke on namespaces or lose
|
||||||
|
@ -453,6 +472,18 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode("utf-8"), markup)
|
soup.encode("utf-8"), markup)
|
||||||
|
|
||||||
|
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||||
|
doc = """
|
||||||
|
<script type="text/javascript">
|
||||||
|
</script>
|
||||||
|
"""
|
||||||
|
soup = BeautifulSoup(doc, "xml")
|
||||||
|
# lxml would have stripped this while parsing, but we can add
|
||||||
|
# it later.
|
||||||
|
soup.script.string = 'console.log("< < hey > > ");'
|
||||||
|
encoded = soup.encode()
|
||||||
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
def test_popping_namespaced_tag(self):
|
def test_popping_namespaced_tag(self):
|
||||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
|
@ -495,6 +526,11 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(unicode(soup.foo), markup)
|
||||||
|
|
||||||
|
def test_namespaced_attributes_xml_namespace(self):
|
||||||
|
markup = '<foo xml:lang="fr">bar</foo>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(unicode(soup.foo), markup)
|
||||||
|
|
||||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
"""Smoke test for a tree builder that supports HTML5."""
|
"""Smoke test for a tree builder that supports HTML5."""
|
||||||
|
|
||||||
|
@ -523,6 +559,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
self.assertEqual(namespace, soup.math.namespace)
|
self.assertEqual(namespace, soup.math.namespace)
|
||||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||||
|
|
||||||
|
def test_xml_declaration_becomes_comment(self):
|
||||||
|
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||||
|
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||||
|
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||||
|
|
||||||
def skipIf(condition, reason):
|
def skipIf(condition, reason):
|
||||||
def nothing(test, *args, **kwargs):
|
def nothing(test, *args, **kwargs):
|
||||||
|
|
|
@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_xml_declaration_followed_by_doctype(self):
|
||||||
|
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>foo</p>
|
||||||
|
</body>
|
||||||
|
</html>'''
|
||||||
|
soup = self.soup(markup)
|
||||||
|
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||||
|
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||||
|
|
|
@ -6,8 +6,11 @@ import warnings
|
||||||
try:
|
try:
|
||||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||||
LXML_PRESENT = True
|
LXML_PRESENT = True
|
||||||
|
import lxml.etree
|
||||||
|
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||||
except ImportError, e:
|
except ImportError, e:
|
||||||
LXML_PRESENT = False
|
LXML_PRESENT = False
|
||||||
|
LXML_VERSION = (0,)
|
||||||
|
|
||||||
from bs4 import (
|
from bs4 import (
|
||||||
BeautifulSoup,
|
BeautifulSoup,
|
||||||
|
@ -41,6 +44,17 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
self.assertSoupEquals(
|
self.assertSoupEquals(
|
||||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
|
||||||
|
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||||
|
# test if an old version of lxml is installed.
|
||||||
|
|
||||||
|
@skipIf(
|
||||||
|
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||||
|
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||||
|
def test_empty_doctype(self):
|
||||||
|
soup = self.soup("<!DOCTYPE>")
|
||||||
|
doctype = soup.contents[0]
|
||||||
|
self.assertEqual("", doctype.strip())
|
||||||
|
|
||||||
def test_beautifulstonesoup_is_xml_parser(self):
|
def test_beautifulstonesoup_is_xml_parser(self):
|
||||||
# Make sure that the deprecated BSS class uses an xml builder
|
# Make sure that the deprecated BSS class uses an xml builder
|
||||||
# if one is installed.
|
# if one is installed.
|
||||||
|
@ -72,4 +86,3 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return LXMLTreeBuilderForXML()
|
return LXMLTreeBuilderForXML()
|
||||||
|
|
||||||
|
|
|
@ -125,9 +125,14 @@ class TestEntitySubstitution(unittest.TestCase):
|
||||||
def test_xml_quoting_handles_ampersands(self):
|
def test_xml_quoting_handles_ampersands(self):
|
||||||
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
||||||
|
|
||||||
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
|
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
self.sub.substitute_xml("ÁT&T"),
|
self.sub.substitute_xml("ÁT&T"),
|
||||||
|
"&Aacute;T&T")
|
||||||
|
|
||||||
|
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
|
||||||
"&Aacute;T&amp;T")
|
"&Aacute;T&amp;T")
|
||||||
|
|
||||||
def test_quotes_not_html_substituted(self):
|
def test_quotes_not_html_substituted(self):
|
||||||
|
|
|
@ -20,6 +20,7 @@ from bs4.builder import (
|
||||||
)
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CData,
|
CData,
|
||||||
|
Comment,
|
||||||
Doctype,
|
Doctype,
|
||||||
NavigableString,
|
NavigableString,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
@ -425,6 +426,7 @@ class TestParentOperations(TreeTest):
|
||||||
|
|
||||||
def test_find_parent(self):
|
def test_find_parent(self):
|
||||||
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
|
self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
|
||||||
|
self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
|
||||||
|
|
||||||
def test_parent_of_text_element(self):
|
def test_parent_of_text_element(self):
|
||||||
text = self.tree.find(text="Start here")
|
text = self.tree.find(text="Start here")
|
||||||
|
@ -687,6 +689,12 @@ class TestTagCreation(SoupTest):
|
||||||
self.assertEqual("foo", s)
|
self.assertEqual("foo", s)
|
||||||
self.assertTrue(isinstance(s, NavigableString))
|
self.assertTrue(isinstance(s, NavigableString))
|
||||||
|
|
||||||
|
def test_new_string_can_create_navigablestring_subclass(self):
|
||||||
|
soup = self.soup("")
|
||||||
|
s = soup.new_string("foo", Comment)
|
||||||
|
self.assertEqual("foo", s)
|
||||||
|
self.assertTrue(isinstance(s, Comment))
|
||||||
|
|
||||||
class TestTreeModification(SoupTest):
|
class TestTreeModification(SoupTest):
|
||||||
|
|
||||||
def test_attribute_modification(self):
|
def test_attribute_modification(self):
|
||||||
|
@ -1048,7 +1056,7 @@ class TestTreeModification(SoupTest):
|
||||||
# clear using decompose()
|
# clear using decompose()
|
||||||
em = a.em
|
em = a.em
|
||||||
a.clear(decompose=True)
|
a.clear(decompose=True)
|
||||||
self.assertFalse(hasattr(em, "contents"))
|
self.assertEqual(0, len(em.contents))
|
||||||
|
|
||||||
def test_string_set(self):
|
def test_string_set(self):
|
||||||
"""Tag.string = 'string'"""
|
"""Tag.string = 'string'"""
|
||||||
|
@ -1166,6 +1174,19 @@ class TestElementObjects(SoupTest):
|
||||||
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
|
self.assertEqual(soup.a.get_text(","), "a,r, , t ")
|
||||||
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
|
self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
|
||||||
|
|
||||||
|
def test_get_text_ignores_comments(self):
|
||||||
|
soup = self.soup("foo<!--IGNORE-->bar")
|
||||||
|
self.assertEqual(soup.get_text(), "foobar")
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.get_text(types=None), "fooIGNOREbar")
|
||||||
|
|
||||||
|
def test_all_strings_ignores_comments(self):
|
||||||
|
soup = self.soup("foo<!--IGNORE-->bar")
|
||||||
|
self.assertEqual(['foo', 'bar'], list(soup.strings))
|
||||||
|
|
||||||
class TestCDAtaListAttributes(SoupTest):
|
class TestCDAtaListAttributes(SoupTest):
|
||||||
|
|
||||||
"""Testing cdata-list attributes like 'class'.
|
"""Testing cdata-list attributes like 'class'.
|
||||||
|
@ -1310,6 +1331,32 @@ class TestSubstitutions(SoupTest):
|
||||||
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
||||||
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
|
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
|
||||||
|
|
||||||
|
def test_formatter_skips_script_tag_for_html_documents(self):
|
||||||
|
doc = """
|
||||||
|
<script type="text/javascript">
|
||||||
|
console.log("< < hey > > ");
|
||||||
|
</script>
|
||||||
|
"""
|
||||||
|
encoded = BeautifulSoup(doc).encode()
|
||||||
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
|
def test_formatter_skips_style_tag_for_html_documents(self):
|
||||||
|
doc = """
|
||||||
|
<style type="text/css">
|
||||||
|
console.log("< < hey > > ");
|
||||||
|
</style>
|
||||||
|
"""
|
||||||
|
encoded = BeautifulSoup(doc).encode()
|
||||||
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
|
def test_prettify_leaves_preformatted_text_alone(self):
|
||||||
|
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
|
||||||
|
# Everything outside the <pre> tag is reformatted, but everything
|
||||||
|
# inside is left alone.
|
||||||
|
self.assertEqual(
|
||||||
|
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
|
||||||
|
soup.div.prettify())
|
||||||
|
|
||||||
def test_prettify_accepts_formatter(self):
|
def test_prettify_accepts_formatter(self):
|
||||||
soup = BeautifulSoup("<html><body>foo</body></html>")
|
soup = BeautifulSoup("<html><body>foo</body></html>")
|
||||||
pretty = soup.prettify(formatter = lambda x: x.upper())
|
pretty = soup.prettify(formatter = lambda x: x.upper())
|
||||||
|
@ -1459,7 +1506,7 @@ class TestSoupSelector(TreeTest):
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|
||||||
<div id="main">
|
<div id="main" class="fancy">
|
||||||
<div id="inner">
|
<div id="inner">
|
||||||
<h1 id="header1">An H1</h1>
|
<h1 id="header1">An H1</h1>
|
||||||
<p>Some text</p>
|
<p>Some text</p>
|
||||||
|
@ -1531,7 +1578,7 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertEqual(len(self.soup.select('del')), 0)
|
self.assertEqual(len(self.soup.select('del')), 0)
|
||||||
|
|
||||||
def test_invalid_tag(self):
|
def test_invalid_tag(self):
|
||||||
self.assertEqual(len(self.soup.select('tag%t')), 0)
|
self.assertRaises(ValueError, self.soup.select, 'tag%t')
|
||||||
|
|
||||||
def test_header_tags(self):
|
def test_header_tags(self):
|
||||||
self.assertSelectMultiple(
|
self.assertSelectMultiple(
|
||||||
|
@ -1564,7 +1611,7 @@ class TestSoupSelector(TreeTest):
|
||||||
for el in els:
|
for el in els:
|
||||||
self.assertEqual(el.name, 'p')
|
self.assertEqual(el.name, 'p')
|
||||||
self.assertEqual(els[1]['class'], ['onep'])
|
self.assertEqual(els[1]['class'], ['onep'])
|
||||||
self.assertFalse(els[0].has_key('class'))
|
self.assertFalse(els[0].has_attr('class'))
|
||||||
|
|
||||||
def test_a_bunch_of_emptys(self):
|
def test_a_bunch_of_emptys(self):
|
||||||
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
|
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
|
||||||
|
@ -1584,6 +1631,9 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
|
self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
|
||||||
self.assertSelects('.s1 > a span', ['s1a2s1'])
|
self.assertSelects('.s1 > a span', ['s1a2s1'])
|
||||||
|
|
||||||
|
def test_child_selector_id(self):
|
||||||
|
self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
|
||||||
|
|
||||||
def test_attribute_equals(self):
|
def test_attribute_equals(self):
|
||||||
self.assertSelectMultiple(
|
self.assertSelectMultiple(
|
||||||
('p[class="onep"]', ['p1']),
|
('p[class="onep"]', ['p1']),
|
||||||
|
@ -1690,6 +1740,33 @@ class TestSoupSelector(TreeTest):
|
||||||
('p[blah]', []),
|
('p[blah]', []),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_nth_of_type(self):
|
||||||
|
# Try to select first paragraph
|
||||||
|
els = self.soup.select('div#inner p:nth-of-type(1)')
|
||||||
|
self.assertEqual(len(els), 1)
|
||||||
|
self.assertEqual(els[0].string, u'Some text')
|
||||||
|
|
||||||
|
# Try to select third paragraph
|
||||||
|
els = self.soup.select('div#inner p:nth-of-type(3)')
|
||||||
|
self.assertEqual(len(els), 1)
|
||||||
|
self.assertEqual(els[0].string, u'Another')
|
||||||
|
|
||||||
|
# Try to select (non-existent!) fourth paragraph
|
||||||
|
els = self.soup.select('div#inner p:nth-of-type(4)')
|
||||||
|
self.assertEqual(len(els), 0)
|
||||||
|
|
||||||
|
# Pass in an invalid value.
|
||||||
|
self.assertRaises(
|
||||||
|
ValueError, self.soup.select, 'div p:nth-of-type(0)')
|
||||||
|
|
||||||
|
def test_nth_of_type_direct_descendant(self):
|
||||||
|
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
||||||
|
self.assertEqual(len(els), 1)
|
||||||
|
self.assertEqual(els[0].string, u'Some text')
|
||||||
|
|
||||||
|
def test_id_child_selector_nth_of_type(self):
|
||||||
|
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
|
||||||
|
|
||||||
def test_select_on_element(self):
|
def test_select_on_element(self):
|
||||||
# Other tests operate on the tree; this operates on an element
|
# Other tests operate on the tree; this operates on an element
|
||||||
# within the tree.
|
# within the tree.
|
||||||
|
@ -1698,3 +1775,26 @@ class TestSoupSelector(TreeTest):
|
||||||
# The <div id="inner"> tag was selected. The <div id="footer">
|
# The <div id="inner"> tag was selected. The <div id="footer">
|
||||||
# tag was not.
|
# tag was not.
|
||||||
self.assertSelectsIDs(selected, ['inner'])
|
self.assertSelectsIDs(selected, ['inner'])
|
||||||
|
|
||||||
|
def test_overspecified_child_id(self):
|
||||||
|
self.assertSelects(".fancy #inner", ['inner'])
|
||||||
|
self.assertSelects(".normal #inner", [])
|
||||||
|
|
||||||
|
def test_adjacent_sibling_selector(self):
|
||||||
|
self.assertSelects('#p1 + h2', ['header2'])
|
||||||
|
self.assertSelects('#p1 + h2 + p', ['pmulti'])
|
||||||
|
self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
|
||||||
|
self.assertEqual([], self.soup.select('#p1 + p'))
|
||||||
|
|
||||||
|
def test_general_sibling_selector(self):
|
||||||
|
self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
|
||||||
|
self.assertSelects('#p1 ~ #header2', ['header2'])
|
||||||
|
self.assertSelects('#p1 ~ h2 + a', ['me'])
|
||||||
|
self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
|
||||||
|
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
||||||
|
|
||||||
|
def test_dangling_combinator(self):
|
||||||
|
self.assertRaises(ValueError, self.soup.select, 'h1 >')
|
||||||
|
|
||||||
|
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
||||||
|
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
||||||
|
|
Reference in a new issue