"""Tests to ensure that the html5lib tree builder generates good trees.""" import warnings try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True except ImportError as e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( HTML5TreeBuilderSmokeTest, SoupTest, skipIf, ) @skipIf( not HTML5LIB_PRESENT, "html5lib seems not to be present, not testing its tree builder.") class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): """See ``HTML5TreeBuilderSmokeTest``.""" @property def default_builder(self): return HTML5TreeBuilder() def test_soupstrainer(self): # The html5lib tree builder does not support SoupStrainers. strainer = SoupStrainer("b") markup = "

A bold statement.

" with warnings.catch_warnings(record=True) as w: soup = self.soup(markup, parse_only=strainer) self.assertEqual( soup.decode(), self.document_for(markup)) self.assertTrue( "the html5lib tree builder doesn't support parse_only" in str(w[0].message)) def test_correctly_nested_tables(self): """html5lib inserts tags where other parsers don't.""" markup = ('' '' "') self.assertSoupEquals( markup, '
Here's another table:" '' '' '
foo
Here\'s another table:' '
foo
' '
') self.assertSoupEquals( "" "" "
Foo
Bar
Baz
") def test_xml_declaration_followed_by_doctype(self): markup = '''

foo

''' soup = self.soup(markup) # Verify that we can reach the

tag; this means the tree is connected. self.assertEqual(b"

foo

", soup.p.encode()) def test_reparented_markup(self): markup = '

foo

\n

bar

' soup = self.soup(markup) self.assertEqual("

foo

\n

bar

", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_reparented_markup_ends_with_whitespace(self): markup = '

foo

\n

bar

\n' soup = self.soup(markup) self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) self.assertEqual(2, len(soup.find_all('p'))) def test_processing_instruction(self): """Processing instructions become comments.""" markup = b"""""" soup = self.soup(markup) assert str(soup).startswith("") def test_cloned_multivalue_node(self): markup = b"""

""" soup = self.soup(markup) a1, a2 = soup.find_all('a') self.assertEqual(a1, a2) assert a1 is not a2