# -*- coding: utf-8 -*- # Licensed under a 3-clause BSD style license - see LICENSE.rst """ This module tests some of the methods related to the ``HTML`` reader/writer and aims to document its functionality. Requires `BeautifulSoup `_ to be installed. """ from .. import html from .. import core from ....table import Table import pytest import numpy as np from .common import setup_function, teardown_function from ... import ascii from ....extern.six.moves import range, cStringIO as StringIO from ....utils.xml.writer import HAS_BLEACH # Check to see if the BeautifulSoup dependency is present. try: from bs4 import BeautifulSoup, FeatureNotFound HAS_BEAUTIFUL_SOUP = True except ImportError: HAS_BEAUTIFUL_SOUP = False @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_soupstring(): """ Test to make sure the class SoupString behaves properly. """ soup = BeautifulSoup('

foo

') soup_str = html.SoupString(soup) assert isinstance(soup_str, str) assert isinstance(soup_str, html.SoupString) assert soup_str == '

foo

' assert soup_str.soup is soup def test_listwriter(): """ Test to make sure the class ListWriter behaves properly. """ lst = [] writer = html.ListWriter(lst) for i in range(5): writer.write(i) for ch in 'abcde': writer.write(ch) assert lst == [0, 1, 2, 3, 4, 'a', 'b', 'c', 'd', 'e'] @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_identify_table(): """ Test to make sure that identify_table() returns whether the given BeautifulSoup tag is the correct table to process. """ # Should return False on non- tags and None soup = BeautifulSoup('') assert html.identify_table(soup, {}, 0) is False assert html.identify_table(None, {}, 0) is False soup = BeautifulSoup('

' '

A
B

').table assert html.identify_table(soup, {}, 2) is False assert html.identify_table(soup, {}, 1) is True # Default index of 1 # Same tests, but with explicit parameter assert html.identify_table(soup, {'table_id': 2}, 1) is False assert html.identify_table(soup, {'table_id': 1}, 1) is True # Test identification by string ID assert html.identify_table(soup, {'table_id': 'bar'}, 1) is False assert html.identify_table(soup, {'table_id': 'foo'}, 1) is True @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_missing_data(): """ Test reading a table with missing data """ # First with default where blank => '0' table_in = ['', '', '', '', '

A

1

'] dat = Table.read(table_in, format='ascii.html') assert dat.masked is True assert np.all(dat['A'].mask == [True, False]) assert dat['A'].dtype.kind == 'i' # Now with a specific value '...' => missing table_in = ['', '', '', '', '

A
...
1

'] dat = Table.read(table_in, format='ascii.html', fill_values=[('...', '0')]) assert dat.masked is True assert np.all(dat['A'].mask == [True, False]) assert dat['A'].dtype.kind == 'i' @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_rename_cols(): """ Test reading a table and renaming cols """ table_in = ['', '', '', '

A	B
1	2

'] # Swap column names dat = Table.read(table_in, format='ascii.html', names=['B', 'A']) assert dat.colnames == ['B', 'A'] assert len(dat) == 1 # Swap column names and only include A (the renamed version) dat = Table.read(table_in, format='ascii.html', names=['B', 'A'], include_names=['A']) assert dat.colnames == ['A'] assert len(dat) == 1 assert np.all(dat['A'] == 2) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_no_names(): """ Test reading a table witn no column header """ table_in = ['', '', '', '

'] dat = Table.read(table_in, format='ascii.html') assert dat.colnames == ['col1'] assert len(dat) == 2 dat = Table.read(table_in, format='ascii.html', names=['a']) assert dat.colnames == ['a'] assert len(dat) == 2 @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_identify_table_fail(): """ Raise an exception with an informative error message if table_id is not found. """ table_in = ['', '

A
B

'] with pytest.raises(core.InconsistentTableError) as err: Table.read(table_in, format='ascii.html', htmldict={'table_id': 'bad_id'}, guess=False) assert str(err).endswith("ERROR: HTML table id 'bad_id' not found") with pytest.raises(core.InconsistentTableError) as err: Table.read(table_in, format='ascii.html', htmldict={'table_id': 3}, guess=False) assert str(err).endswith("ERROR: HTML table number 3 not found") @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_backend_parsers(): """ Make sure the user can specify which back-end parser to use and that an error is raised if the parser is invalid. """ for parser in ('lxml', 'xml', 'html.parser', 'html5lib'): try: table = Table.read('t/html2.html', format='ascii.html', htmldict={'parser': parser}, guess=False) except FeatureNotFound: if parser == 'html.parser': raise # otherwise ignore if the dependency isn't present # reading should fail if the parser is invalid with pytest.raises(FeatureNotFound): Table.read('t/html2.html', format='ascii.html', htmldict={'parser': 'foo'}, guess=False) @pytest.mark.skipif('HAS_BEAUTIFUL_SOUP') def test_htmlinputter_no_bs4(): """ This should return an OptionalTableImportError if BeautifulSoup is not installed. """ inputter = html.HTMLInputter() with pytest.raises(core.OptionalTableImportError): inputter.process_lines([]) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmlinputter(): """ Test to ensure that HTMLInputter correctly converts input into a list of SoupStrings representing table elements. """ f = 't/html.html' with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} # In absence of table_id, defaults to the first table expected = ['Column 1Column 2Column 3', '1a1.05', '2b2.75', '3c-1.25'] assert [str(x) for x in inputter.get_lines(table)] == expected # Should raise an InconsistentTableError if the table is not found inputter.html = {'table_id': 4} with pytest.raises(core.InconsistentTableError): inputter.get_lines(table) # Identification by string ID inputter.html['table_id'] = 'second' expected = ['Column AColumn BColumn C', '4d10.5', '5e27.5', '6f-12.5'] assert [str(x) for x in inputter.get_lines(table)] == expected # Identification by integer index inputter.html['table_id'] = 3 expected = ['C1C2C3', '7g105.0', '8h275.0', '9i-125.0'] assert [str(x) for x in inputter.get_lines(table)] == expected @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmlsplitter(): """ Test to make sure that HTMLSplitter correctly inputs lines of type SoupString to return a generator that gives all header and data elements. """ splitter = html.HTMLSplitter() lines = [html.SoupString(BeautifulSoup('

Col 1	Col 2

').tr), html.SoupString(BeautifulSoup('

Data 1

Data 2

').tr)] expected_data = [['Col 1', 'Col 2'], ['Data 1', 'Data 2']] assert list(splitter(lines)) == expected_data # Make sure the presence of a non-SoupString triggers a TypeError lines.append('Data 3Data 4') with pytest.raises(TypeError): list(splitter(lines)) # Make sure that passing an empty list triggers an error with pytest.raises(core.InconsistentTableError): list(splitter([])) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmlheader_start(): """ Test to ensure that the start_line method of HTMLHeader returns the first line of header data. Uses t/html.html for sample input. """ f = 't/html.html' with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} header = html.HTMLHeader() lines = inputter.get_lines(table) assert str(lines[header.start_line(lines)]) == \ 'Column 1Column 2Column 3' inputter.html['table_id'] = 'second' lines = inputter.get_lines(table) assert str(lines[header.start_line(lines)]) == \ 'Column AColumn BColumn C' inputter.html['table_id'] = 3 lines = inputter.get_lines(table) assert str(lines[header.start_line(lines)]) == \ 'C1C2C3' # start_line should return None if no valid header is found lines = [html.SoupString(BeautifulSoup('

Data

').tr), html.SoupString(BeautifulSoup('

Text

').p)] assert header.start_line(lines) is None # Should raise an error if a non-SoupString is present lines.append('Header') with pytest.raises(TypeError): header.start_line(lines) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmldata(): """ Test to ensure that the start_line and end_lines methods of HTMLData returns the first line of table data. Uses t/html.html for sample input. """ f = 't/html.html' with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} data = html.HTMLData() lines = inputter.get_lines(table) assert str(lines[data.start_line(lines)]) == \ '1a1.05' # end_line returns the index of the last data element + 1 assert str(lines[data.end_line(lines) - 1]) == \ '3c-1.25' inputter.html['table_id'] = 'second' lines = inputter.get_lines(table) assert str(lines[data.start_line(lines)]) == \ '4d10.5' assert str(lines[data.end_line(lines) - 1]) == \ '6f-12.5' inputter.html['table_id'] = 3 lines = inputter.get_lines(table) assert str(lines[data.start_line(lines)]) == \ '7g105.0' assert str(lines[data.end_line(lines) - 1]) == \ '9i-125.0' # start_line should raise an error if no table data exists lines = [html.SoupString(BeautifulSoup('

').div), html.SoupString(BeautifulSoup('

Text

').p)] with pytest.raises(core.InconsistentTableError): data.start_line(lines) # end_line should return None if no table data exists assert data.end_line(lines) is None # Should raise an error if a non-SoupString is present lines.append('Data') with pytest.raises(TypeError): data.start_line(lines) with pytest.raises(TypeError): data.end_line(lines) def test_multicolumn_write(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of . """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [('a', 'a', 'a'), ('b', 'b', 'b'), ('c', 'c', 'c')] table = Table([col1, col2, col3], names=('C1', 'C2', 'C3')) expected = """\

C1	C2		C3
1	1.0	1.0	a	a	a
2	2.0	2.0	b	b	b
3	3.0	3.0	c	c	c

""" out = html.HTML().write(table)[0].strip() assert out == expected.strip() @pytest.mark.skipif('not HAS_BLEACH') def test_multicolumn_write_escape(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of . """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [('', '', 'a'), ('', 'b', 'b'), ('c', 'c', 'c')] table = Table([col1, col2, col3], names=('C1', 'C2', 'C3')) expected = """\

C1	C2		C3
1	1.0	1.0			a
2	2.0	2.0		b	b
3	3.0	3.0	c	c	c

""" out = html.HTML(htmldict={'raw_html_cols': 'C3'}).write(table)[0].strip() assert out == expected.strip() def test_write_no_multicols(): """ Test to make sure that the HTML writer will not use multi-dimensional columns if the multicol parameter is False. """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [('a', 'a', 'a'), ('b', 'b', 'b'), ('c', 'c', 'c')] table = Table([col1, col2, col3], names=('C1', 'C2', 'C3')) expected = """\

C1	C2	C3
1	1.0 .. 1.0	a .. a
2	2.0 .. 2.0	b .. b
3	3.0 .. 3.0	c .. c

""" assert html.HTML({'multicol': False}).write(table)[0].strip() == \ expected.strip() @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_multicolumn_read(): """ Test to make sure that the HTML reader inputs multidimensional columns (those with iterable elements) using the colspan attribute of . Ensure that any string element within a multidimensional column casts all elements to string prior to type conversion operations. """ table = Table.read('t/html2.html', format='ascii.html') str_type = np.dtype((np.str, 21)) expected = Table(np.array([(['1', '2.5000000000000000001'], 3), (['1a', '1'], 3.5)], dtype=[('A', str_type, (2,)), ('B', 'x'], ['y']], names=['a', 'b']) # One column contains raw HTML (string input) out = StringIO() t.write(out, format='ascii.html', htmldict={'raw_html_cols': 'a'}) expected = """\ x <em>y</em> """ assert expected in out.getvalue() # One column contains raw HTML (list input) out = StringIO() t.write(out, format='ascii.html', htmldict={'raw_html_cols': ['a']}) assert expected in out.getvalue() # Two columns contains raw HTML (list input) out = StringIO() t.write(out, format='ascii.html', htmldict={'raw_html_cols': ['a', 'b']}) expected = """\ x y """ assert expected in out.getvalue() @pytest.mark.skipif('not HAS_BLEACH') def test_raw_html_write_clean(): """ Test that columns can contain raw HTML which is not escaped. """ import bleach t = Table([[''], ['

'], ['y']], names=['a', 'b', 'c']) # Confirm that