From 1cf12e90abda720086bcf03d1aefcbaaeef4fc4f Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Sat, 21 Oct 2023 02:37:55 -0500 Subject: [PATCH] chore: streamline kwarg handling (#264) Improves kwarg handling so that kwargs are handled explicitly when needed, and suppressed otherwise. --- CHANGELOG.md | 3 ++- .../inference/test_layout.py | 4 +++- unstructured_inference/__version__.py | 2 +- unstructured_inference/inference/layout.py | 18 ++++-------------- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca246338..fb399e56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.7.10-dev2 +## 0.7.10 +* Handle kwargs explicitly when needed, suppress otherwise * fix: Reduce Chipper memory consumption on x86_64 cpus * fix: Skips ordering elements coming from Chipper * fix: After refactoring to introduce Chipper, annotate() weren't able to show text with extra info from elements, this is fixed now. diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 49f8d6e4..a75b505c 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -2,7 +2,7 @@ import os.path import tempfile from functools import partial -from unittest.mock import mock_open, patch +from unittest.mock import ANY, mock_open, patch import numpy as np import pytest @@ -675,6 +675,8 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m fixed_layouts=None, extract_tables=False, pdf_image_dpi=200, + extract_images_in_pdf=ANY, + image_output_dir_path=ANY, ) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 5a48fbf0..8f560143 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.10-dev2" # pragma: no cover +__version__ = "0.7.10" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 5ab39aab..e8303e66 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -71,10 +71,7 @@ def from_pages(cls, pages: List[PageLayout]) -> DocumentLayout: def from_file( cls, filename: str, - detection_model: Optional[UnstructuredObjectDetectionModel] = None, - element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, - extract_tables: bool = False, pdf_image_dpi: int = 200, **kwargs, ) -> DocumentLayout: @@ -108,11 +105,8 @@ def from_file( image, number=i + 1, document_filename=filename, - detection_model=detection_model, - element_extraction_model=element_extraction_model, layout=layout, fixed_layout=fixed_layout, - extract_tables=extract_tables, **kwargs, ) pages.append(page) @@ -453,10 +447,6 @@ def from_image( def process_data_with_model( data: BinaryIO, model_name: Optional[str], - is_image: bool = False, - fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, - extract_tables: bool = False, - pdf_image_dpi: int = 200, **kwargs, ) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a @@ -467,10 +457,6 @@ def process_data_with_model( layout = process_file_with_model( tmp_file.name, model_name, - is_image=is_image, - fixed_layouts=fixed_layouts, - extract_tables=extract_tables, - pdf_image_dpi=pdf_image_dpi, **kwargs, ) @@ -484,6 +470,8 @@ def process_file_with_model( fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, pdf_image_dpi: int = 200, + extract_images_in_pdf: bool = False, + image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by @@ -514,6 +502,8 @@ def process_file_with_model( fixed_layouts=fixed_layouts, extract_tables=extract_tables, pdf_image_dpi=pdf_image_dpi, + extract_images_in_pdf=extract_images_in_pdf, + image_output_dir_path=image_output_dir_path, **kwargs, ) )