From 0911892e8aadd370a4450da091c5c1bcb0199398 Mon Sep 17 00:00:00 2001 From: Pawel Kmiecik Date: Thu, 13 Jun 2024 12:20:33 +0200 Subject: [PATCH] fix: table HTML generation corrected (#355) Currently unstructured-inference generates tables with missing markup: ```html
<table>
  <thead>
    <th>header cell1</th>
    <th>header cell2</th>
  </thead>
  <tr>
    <td>body cell1</td>
    <td>body cell2</td>
  </tr>
</table>
``` When it should look like: ```html
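<table>
  <thead>
    <tr>
      <th>header cell1</th>
      <th>header cell2</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>body cell1</td>
      <td>body cell2</td>
    </tr>
  </tbody>
</table>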
``` Additionally, fixed `fill_cells` function which added redundant cells that break the HTML table layout when spanned cells were found. --- CHANGELOG.md | 3 + .../models/test_tables.py | 550 +++++++++++++++++- unstructured_inference/__version__.py | 2 +- unstructured_inference/models/tables.py | 62 +- 4 files changed, 579 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af8711b0..b7241945 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 0.7.35 +Fix syntax for generated HTML tables + ## 0.7.34 * Reduce excessive logging diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 10f845e7..15c467cd 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -970,7 +970,7 @@ def test_table_prediction_runs_with_empty_recognize( def test_table_prediction_with_ocr_tokens(table_transformer, example_image, mocked_ocr_tokens): prediction = table_transformer.predict(example_image, ocr_tokens=mocked_ocr_tokens) - assert '
' in prediction + assert '" in prediction @@ -1216,26 +1216,6 @@ def test_header_supercell_tree(supercells, expected_len): assert len(supercells) == expected_len -def test_cells_to_html(): - # example table - # +----------+---------------------+ - # | two | two columns | - # | |----------+----------| - # | rows |sub cell 1|sub cell 2| - # +----------+----------+----------+ - cells = [ - {"row_nums": [0, 1], "column_nums": [0], "cell text": "two row", "column header": False}, - {"row_nums": [0], "column_nums": [1, 2], "cell text": "two cols", "column header": False}, - {"row_nums": [1], "column_nums": [1], "cell text": "sub cell 1", "column header": False}, - {"row_nums": [1], "column_nums": [2], "cell text": "sub cell 2", "column header": False}, - ] - expected = ( - '
' in prediction assert "
Blind51434.5%, n=1
two rowtwo ' - "cols
sub cell 1sub cell 2
" - ) - assert tables.cells_to_html(cells) == expected - - @pytest.mark.parametrize("zoom", [1, 0.1, 5, -1, 0]) def test_zoom_image(example_image, zoom): width, height = example_image.size @@ -1247,6 +1227,534 @@ def test_zoom_image(example_image, zoom): assert new_h == np.round(height * zoom, 0) +@pytest.mark.parametrize( + ("input_cells", "expected_html"), + [ + # +----------+---------------------+ + # | row1col1 | row1col2 | row1col3 | + # |----------|----------+----------| + # | row2col1 | row2col2 | row2col3 | + # +----------+----------+----------+ + pytest.param( + [ + { + "row_nums": [0], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [0], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + { + "row_nums": [0], + "column_nums": [2], + "cell text": "row1col3", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [2], + "cell text": "row2col3", + "column header": False, + }, + ], + ( + "" + "
row1col1row1col2row1col3
row2col1row2col2row2col3
" + ), + id="simple table without header", + ), + # +----------+---------------------+ + # | h1col1 | h1col2 | h1col3 | + # |----------|----------+----------| + # | row1col1 | row1col2 | row1col3 | + # |----------|----------+----------| + # | row2col1 | row2col2 | row2col3 | + # +----------+----------+----------+ + pytest.param( + [ + {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True}, + {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True}, + {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [2], + "cell text": "row1col3", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [2], + "cell text": "row2col3", + "column header": False, + }, + ], + ( + "" + "" + "
h1col1h1col2h1col2
row1col1row1col2row1col3
row2col1row2col2row2col3
" + ), + id="simple table with header", + ), + # +----------+---------------------+ + # | h1col1 | h1col2 | h1col3 | + # |----------|----------+----------| + # | row1col1 | row1col2 | row1col3 | + # |----------|----------+----------| + # | row2col1 | row2col2 | row2col3 | + # +----------+----------+----------+ + pytest.param( + [ + {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True}, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [2], + "cell text": "row2col3", + "column header": False, + }, + {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True}, + { + "row_nums": [1], + "column_nums": [2], + "cell text": "row1col3", + "column header": False, + }, + {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True}, + ], + ( + "" + "" + "
h1col1h1col2h1col2
row1col1row1col2row1col3
row2col1row2col2row2col3
" + ), + id="simple table with header, mixed elements", + ), + # +----------+---------------------+ + # | two | two columns | + # | |----------+----------| + # | rows |sub cell 1|sub cell 2| + # +----------+----------+----------+ + pytest.param( + [ + { + "row_nums": [0, 1], + "column_nums": [0], + "cell text": "two row", + "column header": False, + }, + { + "row_nums": [0], + "column_nums": [1, 2], + "cell text": "two cols", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "sub cell 1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [2], + "cell text": "sub cell 2", + "column header": False, + }, + ], + ( + '" + "
two rowtwo ' + "cols
sub cell 1sub cell 2
" + ), + id="various spans, no headers", + ), + # +----------+---------------------+----------+ + # | | h1col23 | h1col4 | + # | h12col1 |----------+----------+----------| + # | | h2col2 | h2col34 | + # |----------|----------+----------+----------+ + # | r3col1 | r3col2 | | + # |----------+----------| r34col34 | + # | r4col12 | | + # +----------+----------+----------+----------+ + pytest.param( + [ + { + "row_nums": [0, 1], + "column_nums": [0], + "cell text": "h12col1", + "column header": True, + }, + { + "row_nums": [0], + "column_nums": [1, 2], + "cell text": "h1col23", + "column header": True, + }, + {"row_nums": [0], "column_nums": [3], "cell text": "h1col4", "column header": True}, + {"row_nums": [1], "column_nums": [1], "cell text": "h2col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [2, 3], + "cell text": "h2col34", + "column header": True, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "r3col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "r3col2", + "column header": False, + }, + { + "row_nums": [2, 3], + "column_nums": [2, 3], + "cell text": "r34col34", + "column header": False, + }, + { + "row_nums": [3], + "column_nums": [0, 1], + "cell text": "r4col12", + "column header": False, + }, + ], + ( + '' + '' + '' + '' + '
h12col1h1col23h1col4
h2col2h2col34
r3col1r3col2r34col34
r4col12
' + ), + id="various spans, with 2 row header", + ), + ], +) +def test_cells_to_html(input_cells, expected_html): + assert tables.cells_to_html(input_cells) == expected_html + + +@pytest.mark.parametrize( + ("input_cells", "expected_cells"), + [ + pytest.param( + [ + {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True}, + {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True}, + {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [2], + "cell text": "row1col3", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [2], + "cell text": "row2col3", + "column header": False, + }, + ], + [ + {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True}, + {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True}, + {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [2], + "cell text": "row1col3", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [2], + "cell text": 
"row2col3", + "column header": False, + }, + ], + id="identical tables, no changes expected", + ), + pytest.param( + [ + {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True}, + {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [2], + "cell text": "row2col3", + "column header": False, + }, + ], + [ + {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True}, + {"row_nums": [0], "column_nums": [1], "cell text": "", "column header": True}, + {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [0], + "cell text": "row1col1", + "column header": False, + }, + { + "row_nums": [1], + "column_nums": [1], + "cell text": "row1col2", + "column header": False, + }, + {"row_nums": [1], "column_nums": [2], "cell text": "", "column header": False}, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "row2col1", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [1], + "cell text": "row2col2", + "column header": False, + }, + { + "row_nums": [2], + "column_nums": [2], + "cell text": "row2col3", + "column header": False, + }, + ], + id="missing column in header and in the middle", + ), + pytest.param( + [ + { + "row_nums": [0, 1], + "column_nums": [0], + "cell text": "h12col1", + "column header": True, + }, + { + "row_nums": [0], + "column_nums": [1, 2], + "cell text": "h1col23", + "column header": True, + }, + {"row_nums": [1], "column_nums": [1], 
"cell text": "h2col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [2, 3], + "cell text": "h2col34", + "column header": True, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "r3col1", + "column header": False, + }, + { + "row_nums": [2, 3], + "column_nums": [2, 3], + "cell text": "r34col34", + "column header": False, + }, + { + "row_nums": [3], + "column_nums": [0, 1], + "cell text": "r4col12", + "column header": False, + }, + ], + [ + { + "row_nums": [0, 1], + "column_nums": [0], + "cell text": "h12col1", + "column header": True, + }, + { + "row_nums": [0], + "column_nums": [1, 2], + "cell text": "h1col23", + "column header": True, + }, + {"row_nums": [0], "column_nums": [3], "cell text": "", "column header": True}, + {"row_nums": [1], "column_nums": [1], "cell text": "h2col2", "column header": True}, + { + "row_nums": [1], + "column_nums": [2, 3], + "cell text": "h2col34", + "column header": True, + }, + { + "row_nums": [2], + "column_nums": [0], + "cell text": "r3col1", + "column header": False, + }, + {"row_nums": [2], "column_nums": [1], "cell text": "", "column header": False}, + { + "row_nums": [2, 3], + "column_nums": [2, 3], + "cell text": "r34col34", + "column header": False, + }, + { + "row_nums": [3], + "column_nums": [0, 1], + "cell text": "r4col12", + "column header": False, + }, + ], + id="missing column in header and in the middle in table with spans", + ), + ], +) +def test_fill_cells(input_cells, expected_cells): + def sort_cells(cells): + return sorted(cells, key=lambda x: (x["row_nums"], x["column_nums"])) + + assert sort_cells(tables.fill_cells(input_cells)) == sort_cells(expected_cells) + + def test_padded_results_has_right_dimensions(table_transformer, example_image): str_class_name2idx = tables.get_class_map("structure") # a simpler mapping so we keep all structure in the returned objs below for test diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 
e6fd9f15..d0586119 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.34" # pragma: no cover +__version__ = "0.7.35" # pragma: no cover diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 48f4c383..d639eb62 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -648,11 +648,8 @@ def structure_to_cells(table_structure, tokens): def fill_cells(cells: List[dict]) -> List[dict]: - """add empty cells to pad cells that spans multiple rows for html conversion - - For example if a cell takes row 0 and 1 and column 0, we add a new empty cell at row 1 and - column 0. This padding ensures the structure of the output table is intact. In this example the - cell data is {"row_nums": [0, 1], "column_nums": [0], ...} + """fills the missing cells in the table by adding a cells with empty text + where there are no cells detected by the model. 
A cell contains the following keys relevent to the html conversion: row_nums: List[int] @@ -663,28 +660,60 @@ def fill_cells(cells: List[dict]) -> List[dict]: than one numbers cell text: str the text in this cell + column header: bool + whether this cell is a column header """ - new_cells = cells.copy() + table_rows_no = max({row for cell in cells for row in cell["row_nums"]}) + table_cols_no = max({col for cell in cells for col in cell["column_nums"]}) + filled = np.zeros((table_rows_no + 1, table_cols_no + 1), dtype=bool) for cell in cells: - for extra_row in sorted(cell["row_nums"][1:]): - new_cell = cell.copy() - new_cell["row_nums"] = [extra_row] - new_cell["cell text"] = "" - new_cells.append(new_cell) + for row in cell["row_nums"]: + for col in cell["column_nums"]: + filled[row, col] = True + # add cells for which filled is false + header_rows = {row for cell in cells if cell["column header"] for row in cell["row_nums"]} + new_cells = cells.copy() + not_filled_idx = np.where(filled == False) # noqa: E712 + for row, col in zip(not_filled_idx[0], not_filled_idx[1]): + new_cell = { + "row_nums": [row], + "column_nums": [col], + "cell text": "", + "column header": row in header_rows, + } + new_cells.append(new_cell) return new_cells -def cells_to_html(cells): - """Convert table structure to html format.""" +def cells_to_html(cells: List[dict]) -> str: + """Convert table structure to html format. 
+ + Args: + cells: List of dictionaries representing table cells, where each dictionary has the + following format: + { + "row_nums": List[int], + "column_nums": List[int], + "cell text": str, + "column header": bool, + } + Returns: + str: HTML table string + """ cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"]))) table = ET.Element("table") current_row = -1 + table_header = None + table_has_header = any(cell["column header"] for cell in cells) + if table_has_header: + table_header = ET.SubElement(table, "thead") + + table_body = ET.SubElement(table, "tbody") for cell in cells: this_row = min(cell["row_nums"]) - attrib = {} colspan = len(cell["column_nums"]) if colspan > 1: @@ -695,11 +724,12 @@ def cells_to_html(cells): if this_row > current_row: current_row = this_row if cell["column header"]: + table_subelement = table_header cell_tag = "th" - row = ET.SubElement(table, "thead") else: + table_subelement = table_body cell_tag = "td" - row = ET.SubElement(table, "tr") + row = ET.SubElement(table_subelement, "tr") # type: ignore tcell = ET.SubElement(row, cell_tag, attrib=attrib) tcell.text = cell["cell text"]