From 7fdddfbc1e98fb4d7a57ebb11aef4eca07b076e6 Mon Sep 17 00:00:00 2001
From: qued <64741807+qued@users.noreply.github.com>
Date: Sun, 22 Oct 2023 23:48:28 -0500
Subject: [PATCH] chore: improve kwarg handling (#1810)

Closes `unstructured-inference` issue
[#265](https://github.com/Unstructured-IO/unstructured-inference/issues/265).

Cleaned up the kwarg handling, replacing places where kwargs were passed
around and unpacked as dicts with explicit keyword parameters in function
signatures. A minimal sketch of the pattern and an example call using the
new parameters follow the diff.

#### Testing:

Should just pass CI.
---
 CHANGELOG.md                               |  8 ++++
 requirements/embed-huggingface.txt         |  4 +-
 requirements/extra-pdf-image.in            |  2 +-
 requirements/extra-pdf-image.txt           |  4 +-
 .../ingest-azure-cognitive-search.txt      |  2 +-
 requirements/ingest-azure.txt              |  2 +-
 requirements/ingest-bedrock.txt            |  4 +-
 requirements/ingest-confluence.txt         |  2 +-
 requirements/ingest-jira.txt               |  2 +-
 requirements/ingest-openai.txt             |  4 +-
 requirements/test.txt                      |  2 +-
 .../partition/pdf_image/test_pdf.py        | 13 +++++-
 test_unstructured/partition/test_auto.py   |  4 +-
 unstructured/__version__.py                |  2 +-
 unstructured/partition/auto.py             | 10 +++++
 unstructured/partition/common.py           |  7 +---
 unstructured/partition/pdf.py              | 41 +++++++++++--------
 unstructured/partition/utils/constants.py  |  1 +
 unstructured/partition/utils/sorting.py    |  5 +--
 19 files changed, 75 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d99fd4bfeb..bff8ac632f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 0.10.26-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.10.25

 ### Enhancements
diff --git a/requirements/embed-huggingface.txt b/requirements/embed-huggingface.txt
index 883ac5800e..d34404071e 100644
--- a/requirements/embed-huggingface.txt
+++ b/requirements/embed-huggingface.txt
@@ -76,9 +76,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/embed-huggingface.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 markupsafe==2.1.3
     # via jinja2
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
index 5e12c266cd..20a0178ca2 100644
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@@ -6,7 +6,7 @@ pdf2image
 pdfminer.six
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index ac1366f078..2ca39716f9 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -203,7 +203,7 @@ sympy==1.12
     # via
     #   onnxruntime
     #   torch
-timm==0.9.7
+timm==0.9.8
     # via effdet
 tokenizers==0.14.1
     # via transformers
@@ -236,7 +236,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
     # via -r requirements/extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via
diff --git a/requirements/ingest-azure-cognitive-search.txt b/requirements/ingest-azure-cognitive-search.txt
index c3be136ed6..817423eda9 100644
--- a/requirements/ingest-azure-cognitive-search.txt
+++ b/requirements/ingest-azure-cognitive-search.txt
@@ -6,7 +6,7 @@
 #
 azure-common==1.1.28
     # via azure-search-documents
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   azure-search-documents
     #   msrest
diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt
index c621e4b125..9da3719703 100644
--- a/requirements/ingest-azure.txt
+++ b/requirements/ingest-azure.txt
@@ -14,7 +14,7 @@ async-timeout==4.0.3
     # via aiohttp
 attrs==23.1.0
     # via aiohttp
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   adlfs
     #   azure-identity
diff --git a/requirements/ingest-bedrock.txt b/requirements/ingest-bedrock.txt
index 4b73e94b12..313ae62798 100644
--- a/requirements/ingest-bedrock.txt
+++ b/requirements/ingest-bedrock.txt
@@ -61,9 +61,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-bedrock.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via
diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt
index 197504d4d2..ba7e56369e 100644
--- a/requirements/ingest-confluence.txt
+++ b/requirements/ingest-confluence.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-confluence.in
 certifi==2023.7.22
     # via
diff --git a/requirements/ingest-jira.txt b/requirements/ingest-jira.txt
index c56b7f624c..732571fb17 100644
--- a/requirements/ingest-jira.txt
+++ b/requirements/ingest-jira.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-jira.in
 certifi==2023.7.22
     # via
diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt
index e8f6a1ebe4..70a0f0c393 100644
--- a/requirements/ingest-openai.txt
+++ b/requirements/ingest-openai.txt
@@ -50,9 +50,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-openai.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via
diff --git a/requirements/test.txt b/requirements/test.txt
index 409d2ceace..a48f577a3c 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -103,7 +103,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.1.0
+ruff==0.1.1
     # via -r requirements/test.in
 six==1.16.0
     # via
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 35c2223c4e..ede41841b5 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
-        pdf_image_dpi=200,
+        pdf_image_dpi=mock.ANY,
+        extract_tables=mock.ANY,
         model_name="checkbox",
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )


@@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
-        pdf_image_dpi=200,
+        pdf_image_dpi=mock.ANY,
+        extract_tables=mock.ANY,
         model_name="checkbox",
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )


@@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
     mock_process.assert_called_once_with(
         filename,
         is_image=False,
+        extract_tables=mock.ANY,
         model_name=pdf.default_hi_res_model(),
         pdf_image_dpi=100,
+        extract_images_in_pdf=mock.ANY,
+        image_output_dir_path=mock.ANY,
     )
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 834cdb41c9..d6414619ae 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -3,7 +3,7 @@
 import pathlib
 import warnings
 from importlib import import_module
-from unittest.mock import patch
+from unittest.mock import ANY, patch

 import docx
 import pytest
@@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         url=None,
         include_page_breaks=False,
         infer_table_structure=False,
+        extract_images_in_pdf=ANY,
+        image_output_dir_path=ANY,
         strategy="fast",
         languages=None,
     )
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 692c04d172..750a52e559 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.25"  # pragma: no cover
+__version__ = "0.10.26-dev0"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index b5d31843ce..6005068b5c 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -135,6 +135,8 @@ def partition(
     languages: Optional[List[str]] = None,
     detect_language_per_element: bool = False,
     pdf_infer_table_structure: bool = False,
+    pdf_extract_images: bool = False,
+    pdf_image_output_dir_path: Optional[str] = None,
     xml_keep_tags: bool = False,
     data_source_metadata: Optional[DataSourceMetadata] = None,
     metadata_filename: Optional[str] = None,
@@ -186,6 +188,12 @@ def partition(
         additional metadata field, "text_as_html," where the value (string) is a just a
         transformation of the data into an HTML <table>.
         The "text" field for a partitioned Table Element is always present, whether True or False.
+    pdf_extract_images
+        If True and strategy=hi_res, any detected images will be saved in the path specified by
+        pdf_image_output_dir_path.
+    pdf_image_output_dir_path
+        If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
+        given path
     xml_keep_tags
         If True, will retain the XML tags in the output. Otherwise it will simply extract the
         text from within the tags. Only applies to partition_xml.
@@ -367,6 +375,8 @@ def partition(
             infer_table_structure=infer_table_structure,
             strategy=strategy,
             languages=languages,
+            extract_images_in_pdf=pdf_extract_images,
+            image_output_dir_path=pdf_image_output_dir_path,
             **kwargs,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
index 8d41854ef2..3549f0ef28 100644
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -35,10 +35,7 @@
 )
 from unstructured.logger import logger
 from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
-from unstructured.partition.utils.constants import (
-    SORT_MODE_DONT,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
 from unstructured.utils import dependency_exists, first

 if dependency_exists("docx") and dependency_exists("docx.table"):
@@ -551,11 +548,11 @@ def document_to_element_list(
     infer_list_items: bool = True,
     source_format: Optional[str] = None,
     detection_origin: Optional[str] = None,
+    sort_mode: str = SORT_MODE_XY_CUT,
     **kwargs,
 ) -> List[Element]:
     """Converts a DocumentLayout object to a list of unstructured elements."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

     num_pages = len(document.pages)
     for i, page in enumerate(document.pages):
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 79d9dc8420..d30c9f5eea 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -100,6 +100,8 @@ def partition_pdf(
     metadata_last_modified: Optional[str] = None,
     chunking_strategy: Optional[str] = None,
     links: Sequence[Link] = [],
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -135,6 +137,12 @@ def partition_pdf(
         processing text/plain content.
     metadata_last_modified
         The last modified date for the document.
+    extract_images_in_pdf
+        If True and strategy=hi_res, any detected images will be saved in the path specified by
+        image_output_dir_path.
+    image_output_dir_path
+        If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
+        given path
     """
     exactly_one(filename=filename, file=file)
@@ -164,6 +172,8 @@ def partition_pdf(
         max_partition=max_partition,
         min_partition=min_partition,
         metadata_last_modified=metadata_last_modified,
+        extract_images_in_pdf=extract_images_in_pdf,
+        image_output_dir_path=image_output_dir_path,
         **kwargs,
     )
@@ -210,6 +220,8 @@ def partition_pdf_or_image(
     max_partition: Optional[int] = 1500,
     min_partition: Optional[int] = 0,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -292,6 +304,8 @@ def partition_pdf_or_image(
             include_page_breaks=include_page_breaks,
             languages=languages,
             metadata_last_modified=metadata_last_modified or last_modification_date,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
             **kwargs,
         )
         layout_elements = []
@@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     model_name: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
+    pdf_image_dpi: Optional[int] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally."""
@@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
     ocr_languages = prepare_languages_for_tesseract(languages)

     model_name = model_name or default_hi_res_model()
-    pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
     if pdf_image_dpi is None:
         pdf_image_dpi = 300 if model_name == "chipper" else 200
     if (pdf_image_dpi < 300) and (model_name == "chipper"):
@@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )

-    # NOTE(christine): Need to extract images from PDF's
-    extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
-    image_output_dir_path = kwargs.get("image_output_dir_path", None)
-    process_with_model_extra_kwargs = {
-        "extract_images_in_pdf": extract_images_in_pdf,
-        "image_output_dir_path": image_output_dir_path,
-    }
-
-    process_with_model_kwargs = {}
-    for key, value in process_with_model_extra_kwargs.items():
-        if value:
-            process_with_model_kwargs[key] = value
-
     if file is None:
         # NOTE(christine): out_layout = extracted_layout + inferred_layout
         out_layout = process_file_with_model(
             filename,
             is_image=is_image,
+            extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
         out_layout = process_data_with_model(
             file,
             is_image=is_image,
+            extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -528,11 +535,11 @@ def _process_pdfminer_pages(
     filename: str = "",
     include_page_breaks: bool = False,
     metadata_last_modified: Optional[str] = None,
+    sort_mode: str = SORT_MODE_XY_CUT,
     **kwargs,
 ):
     """Uses PDF miner to split a document into pages and process them."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
index e7c976469d..aed1422b37 100644
--- a/unstructured/partition/utils/constants.py
+++ b/unstructured/partition/utils/constants.py
@@ -11,6 +11,7 @@ class OCRMode(Enum):

 SORT_MODE_BASIC = "basic"
 SORT_MODE_DONT = "dont"
+SUBREGION_THRESHOLD_FOR_OCR = 0.5

 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
index 081d60156e..5b98919c82 100644
--- a/unstructured/partition/utils/sorting.py
+++ b/unstructured/partition/utils/sorting.py
@@ -5,10 +5,7 @@
 from unstructured.documents.elements import CoordinatesMetadata, Element
 from unstructured.logger import trace_logger
-from unstructured.partition.utils.constants import (
-    SORT_MODE_BASIC,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
 from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped
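
#### Pattern sketch:

For reviewers who want the gist without reading every hunk: the refactor consistently replaces `kwargs.get(...)` dict plumbing with explicit keyword parameters, as in `_partition_pdf_or_image_local`, `_process_pdfminer_pages`, and `document_to_element_list` above. A minimal before/after sketch of the pattern (simplified, not the verbatim patch code):

```python
from typing import Optional


# Before: options ride along in **kwargs and are pulled out of the dict by
# hand, so defaults are buried in the body and misspelled keys fail silently.
def process_before(filename: str, **kwargs):
    extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
    image_output_dir_path = kwargs.get("image_output_dir_path", None)
    return extract_images_in_pdf, image_output_dir_path


# After: the same options are explicit keyword parameters, so defaults live in
# the signature, type checkers can see them, and tests can assert on them
# directly (which is why the mocked calls above now list every argument).
def process_after(
    filename: str,
    extract_images_in_pdf: bool = False,
    image_output_dir_path: Optional[str] = None,
    **kwargs,
):
    return extract_images_in_pdf, image_output_dir_path
```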
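#### Example usage:

The patch also threads two new image-extraction options through the public API: `partition` accepts `pdf_extract_images` and `pdf_image_output_dir_path` and forwards them to `partition_pdf` as `extract_images_in_pdf` and `image_output_dir_path`. An illustrative call; the filename and output directory are placeholder values:

```python
from unstructured.partition.auto import partition

# With strategy="hi_res", detected images are written to the given directory
# when pdf_extract_images=True. "example.pdf" and "figures/" are placeholders.
elements = partition(
    filename="example.pdf",
    strategy="hi_res",
    pdf_extract_images=True,
    pdf_image_output_dir_path="figures/",
)
```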