From bd145907a4b909a465b8f3fef2b9aad71877626a Mon Sep 17 00:00:00 2001
From: yamazombie <36588283+yamazombie@users.noreply.github.com>
Date: Sat, 18 Jan 2025 13:06:48 +0900
Subject: [PATCH] fix: preserve text after line breaks in PowerPoint table cells

This commit addresses an issue where text after line breaks in PowerPoint table cells was lost during processing. The issue is resolved by handling cell content similarly to how it is processed for Word documents, using space separation:
https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/docx.py#L494
---
 unstructured/partition/pptx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index 0fc46c773e..7c55bcdefd 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -252,7 +252,7 @@ def _iter_table_element(self, graphfrm: GraphicFrame) -> Iterator[Table]:
             return
 
         html_text = htmlify_matrix_of_cell_texts(
-            [[cell.text for cell in row.cells] for row in rows]
+            [[cell.text.replace("\n", " ") for cell in row.cells] for row in rows]
         )
         html_table = HtmlTable.from_html_text(html_text)
 