Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames] | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase 
 16   
 17  try: 
 18      unicode = __builtins__["unicode"] 
 19  except (NameError, KeyError): 
 20      unicode = str 
 21   
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases
    """
    # etree implementation under test; the tests reach it via ``self.etree``
    etree = etree

    # Well-formed reference document (bytes).
    html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")

    # The same document laid out over several lines.
    html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")

    # Malformed markup: unclosed <title>, <h1> closed by </h3>, stray </p>.
    broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>")

    # Text (not bytes) document that contains non-ASCII characters.
    uhtml_str = _str("<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>")
def tearDown(self):
    """Run the inherited cleanup, then reset etree's default parser.

    ``set_default_parser()`` is called without arguments after every test
    so that a default parser installed by one test is not carried over
    into the next one.
    """
    parent = super(HtmlParserTestCase, self)
    parent.tearDown()
    self.etree.set_default_parser()
40
def test_module_HTML(self):
    """``etree.HTML`` on well-formed input serializes back to the same bytes."""
    root = self.etree.HTML(self.html_str)
    serialized = self.etree.tostring(root)
    self.assertEqual(serialized, self.html_str)
45
def test_module_HTML_unicode(self):
    """``etree.HTML`` on text input with non-ASCII characters round-trips.

    Both sides of the comparison are decoded from UTF-8 so that the check
    is made on text rather than on encoded bytes.
    """
    root = self.etree.HTML(self.uhtml_str)
    serialized = self.etree.tostring(root, encoding='UTF8')
    expected = self.uhtml_str.encode('UTF8')
    self.assertEqual(unicode(serialized, 'UTF8'), unicode(expected, 'UTF8'))
50
52 element = self.etree.HTML(self.html_str) 53 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 54 self.html_str_pretty)
55
57 parser = self.etree.HTMLParser(recover=False) 58 parse = self.etree.parse 59 f = BytesIO("<html></body>") 60 self.assertRaises(self.etree.XMLSyntaxError, 61 parse, f, parser)
62
64 parser = self.etree.HTMLParser() 65 Element = parser.makeelement 66 67 el = Element('name') 68 self.assertRaises(ValueError, Element, '{}') 69 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 70 71 self.assertRaises(ValueError, Element, '{test}') 72 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
73
75 parser = self.etree.HTMLParser() 76 Element = parser.makeelement 77 78 pname = Element('p:name') 79 self.assertEquals(pname.tag, 'p:name') 80 81 pname = Element('{test}p:name') 82 self.assertEquals(pname.tag, '{test}p:name') 83 84 pname = Element('name') 85 pname.tag = 'p:name' 86 self.assertEquals(pname.tag, 'p:name')
87
89 parser = self.etree.HTMLParser() 90 Element = parser.makeelement 91 92 self.assertRaises(ValueError, Element, 'p"name') 93 self.assertRaises(ValueError, Element, "na'me") 94 self.assertRaises(ValueError, Element, '{test}"name') 95 self.assertRaises(ValueError, Element, "{test}name'") 96 97 el = Element('name') 98 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 99 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 100 self.assertEquals(el.tag, "name")
101
103 parser = self.etree.HTMLParser() 104 Element = parser.makeelement 105 106 self.assertRaises(ValueError, Element, ' name ') 107 self.assertRaises(ValueError, Element, 'na me') 108 self.assertRaises(ValueError, Element, '{test} name') 109 110 el = Element('name') 111 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 112 self.assertEquals(el.tag, "name")
113
115 parser = self.etree.HTMLParser() 116 Element = parser.makeelement 117 118 SubElement = self.etree.SubElement 119 120 el = Element('name') 121 self.assertRaises(ValueError, SubElement, el, '{}') 122 self.assertRaises(ValueError, SubElement, el, '{test}')
123
125 parser = self.etree.HTMLParser() 126 Element = parser.makeelement 127 SubElement = self.etree.SubElement 128 129 el = Element('name') 130 pname = SubElement(el, 'p:name') 131 self.assertEquals(pname.tag, 'p:name') 132 133 pname = SubElement(el, '{test}p:name') 134 self.assertEquals(pname.tag, '{test}p:name')
135
137 parser = self.etree.HTMLParser() 138 Element = parser.makeelement 139 SubElement = self.etree.SubElement 140 141 el = Element('name') 142 self.assertRaises(ValueError, SubElement, el, "name'") 143 self.assertRaises(ValueError, SubElement, el, 'na"me') 144 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 145 self.assertRaises(ValueError, SubElement, el, '{test}"name')
146
148 parser = self.etree.HTMLParser() 149 Element = parser.makeelement 150 SubElement = self.etree.SubElement 151 152 el = Element('name') 153 self.assertRaises(ValueError, SubElement, el, ' name ') 154 self.assertRaises(ValueError, SubElement, el, 'na me') 155 self.assertRaises(ValueError, SubElement, el, '{test} name')
156
158 parser = self.etree.HTMLParser(recover=False) 159 parse = self.etree.parse 160 f = BytesIO(self.broken_html_str) 161 self.assertRaises(self.etree.XMLSyntaxError, 162 parse, f, parser)
163
165 text = _str('Søk på nettet') 166 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 167 168 tree = self.etree.parse( 169 BytesIO(html_latin1), 170 self.etree.HTMLParser(encoding="iso-8859-1")) 171 p = tree.find("//p") 172 self.assertEquals(p.text, text)
173
175 text = _str('Søk på nettet') 176 wrong_head = _str(''' 177 <head> 178 <meta http-equiv="Content-Type" 179 content="text/html; charset=UTF-8" /> 180 </head>''') 181 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 182 text) 183 ).encode('iso-8859-1') 184 185 self.assertRaises(self.etree.ParseError, 186 self.etree.parse, 187 BytesIO(html_latin1)) 188 189 tree = self.etree.parse( 190 BytesIO(html_latin1), 191 self.etree.HTMLParser(encoding="iso-8859-1")) 192 p = tree.find("//p") 193 self.assertEquals(p.text, text)
194
def test_module_HTML_broken(self):
    """``etree.HTML`` turns the broken markup into the well-formed document."""
    root = self.etree.HTML(self.broken_html_str)
    repaired = self.etree.tostring(root)
    self.assertEqual(repaired, self.html_str)
199
def test_module_HTML_cdata(self):
    """The content of a ``<style>`` element is readable through ``.text``."""
    # by default, libxml2 generates CDATA nodes for <script>/<style> content;
    # the <style> case is the one exercised here
    html = _bytes('<html><head><style>foo</style></head></html>')
    element = self.etree.HTML(html)
    # element[0] is <head>, element[0][0] is <style>.
    # assertEqual (as used by the other tests of this class) replaces the
    # deprecated assertEquals alias, which Python 3.12 removed.
    self.assertEqual(element[0][0].text, "foo")
205
def test_module_HTML_access(self):
    """Index access on the parsed tree reaches the ``<title>`` element."""
    root = self.etree.HTML(self.html_str)
    first_child = root[0]
    self.assertEqual(first_child[0].tag, 'title')
209
def test_module_parse_html(self):
    """Parse HTML from a real file on disk and check that it round-trips.

    The reference document is written to a temporary ``.html`` file, read
    back through ``etree.parse`` with an ``HTMLParser`` and serialized
    again; the temporary file is removed whatever the outcome.
    """
    parser = self.etree.HTMLParser()
    # mkstemp() creates and opens the file atomically; mktemp() only hands
    # out a name, leaving a window in which another process can claim it.
    fd, filename = tempfile.mkstemp(suffix=".html")
    try:
        # Context managers close both handles even when writing or parsing
        # fails, so os.remove() below never runs against an open file.
        with os.fdopen(fd, 'wb') as out:
            out.write(self.html_str)
        with open(filename, 'rb') as f:
            tree = self.etree.parse(f, parser)
        self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str)
    finally:
        os.remove(filename)
221
223 parser = self.etree.HTMLParser() 224 f = SillyFileLike(self.html_str) 225 tree = self.etree.parse(f, parser) 226 html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 227 self.assertEqual(html, self.html_str)
228 229 ## def test_module_parse_html_filelike_unicode(self): 230 ## parser = self.etree.HTMLParser() 231 ## f = SillyFileLike(self.uhtml_str) 232 ## tree = self.etree.parse(f, parser) 233 ## html = self.etree.tostring(tree.getroot(), encoding='UTF-8') 234 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 235
def test_html_file_error(self):
    """Parsing a file name that does not exist raises ``IOError``."""
    html_parser = self.etree.HTMLParser()
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_path, html_parser)
242
244 self.assertRaises(self.etree.XMLSyntaxError, 245 self.etree.parse, BytesIO(self.broken_html_str)) 246 247 self.etree.set_default_parser( self.etree.HTMLParser() ) 248 249 tree = self.etree.parse(BytesIO(self.broken_html_str)) 250 self.assertEqual(self.etree.tostring(tree.getroot()), 251 self.html_str) 252 253 self.etree.set_default_parser() 254 255 self.assertRaises(self.etree.XMLSyntaxError, 256 self.etree.parse, BytesIO(self.broken_html_str))
257
def test_html_iterparse(self):
    """``iterparse(..., html=True)`` yields an 'end' event for each element.

    The expected sequence is title, head, p, body and finally the root,
    and ``iterator.root`` is ``None`` until the iterator has been consumed.
    """
    iterparse = self.etree.iterparse
    f = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = iterparse(f, html=True)
    # Nothing has been parsed yet, so no root element is exposed.
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    # assertTrue/assertEqual replace the deprecated assert_/assertEquals
    # aliases, which Python 3.12 removed from unittest.
    self.assertTrue(root is not None)
    self.assertEqual(
        [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
         ('end', root[1]), ('end', root)],
        events)
273
def test_html_iterparse_file(self):
    """``iterparse(..., html=True)`` on a test file yields only 'end' events.

    The fixture ``css_shakespear.html`` is expected to produce exactly 249
    events, and ``iterator.root`` is ``None`` until the iterator has been
    consumed.
    """
    iterparse = self.etree.iterparse
    iterator = iterparse(fileInTestDir("css_shakespear.html"),
                         html=True)

    # assertEqual/assertTrue replace the deprecated assertEquals/assert_
    # aliases, which Python 3.12 removed from unittest.
    self.assertEqual(None, iterator.root)
    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(events))
    # No event kind other than 'end' may show up.
    self.assertEqual(
        [],
        [event for (event, element) in events if event != 'end'])
287
def test_suite():
    """Build and return a suite holding all ``HtmlParserTestCase`` tests.

    Returns:
        A ``unittest.TestSuite`` with the tests of ``HtmlParserTestCase``.
    """
    suite = unittest.TestSuite()
    # loadTestsFromTestCase() collects the same ``test*`` methods as
    # unittest.makeSuite(), which was deprecated in Python 3.11 and removed
    # in Python 3.13.
    loader = unittest.defaultTestLoader
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite
if __name__ == '__main__':
    # This module is not meant to be run directly; tell the user how to
    # run the tests instead.
    hint = 'to test use test.py %s' % __file__
    print(hint)