From 7fdddfbc1e98fb4d7a57ebb11aef4eca07b076e6 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Sun, 22 Oct 2023 23:48:28 -0500 Subject: [PATCH] chore: improve kwarg handling (#1810) Closes `unstructured-inference` issue [#265](https://github.com/Unstructured-IO/unstructured-inference/issues/265). Cleaned up the kwarg handling: where practical, kwargs that were previously handled as dicts are now declared as normal parameters in function signatures. #### Testing: Should just pass CI. --- CHANGELOG.md | 8 ++++ requirements/embed-huggingface.txt | 4 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 4 +- .../ingest-azure-cognitive-search.txt | 2 +- requirements/ingest-azure.txt | 2 +- requirements/ingest-bedrock.txt | 4 +- requirements/ingest-confluence.txt | 2 +- requirements/ingest-jira.txt | 2 +- requirements/ingest-openai.txt | 4 +- requirements/test.txt | 2 +- .../partition/pdf_image/test_pdf.py | 13 +++++- test_unstructured/partition/test_auto.py | 4 +- unstructured/__version__.py | 2 +- unstructured/partition/auto.py | 10 +++++ unstructured/partition/common.py | 7 +--- unstructured/partition/pdf.py | 41 +++++++++++-------- unstructured/partition/utils/constants.py | 1 + unstructured/partition/utils/sorting.py | 5 +-- 19 files changed, 75 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d99fd4bfeb..bff8ac632f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.10.26-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.10.25 ### Enhancements diff --git a/requirements/embed-huggingface.txt b/requirements/embed-huggingface.txt index 883ac5800e..d34404071e 100644 --- a/requirements/embed-huggingface.txt +++ b/requirements/embed-huggingface.txt @@ -76,9 +76,9 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.318 +langchain==0.0.320 # via -r requirements/embed-huggingface.in -langsmith==0.0.46 +langsmith==0.0.49 # via langchain 
markupsafe==2.1.3 # via jinja2 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 5e12c266cd..20a0178ca2 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -6,7 +6,7 @@ pdf2image pdfminer.six # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.9 +unstructured-inference==0.7.10 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index ac1366f078..2ca39716f9 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -203,7 +203,7 @@ sympy==1.12 # via # onnxruntime # torch -timm==0.9.7 +timm==0.9.8 # via effdet tokenizers==0.14.1 # via transformers @@ -236,7 +236,7 @@ typing-extensions==4.8.0 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.7.9 +unstructured-inference==0.7.10 # via -r requirements/extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/ingest-azure-cognitive-search.txt b/requirements/ingest-azure-cognitive-search.txt index c3be136ed6..817423eda9 100644 --- a/requirements/ingest-azure-cognitive-search.txt +++ b/requirements/ingest-azure-cognitive-search.txt @@ -6,7 +6,7 @@ # azure-common==1.1.28 # via azure-search-documents -azure-core==1.29.4 +azure-core==1.29.5 # via # azure-search-documents # msrest diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index c621e4b125..9da3719703 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -14,7 +14,7 @@ async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via aiohttp -azure-core==1.29.4 +azure-core==1.29.5 # via # adlfs # azure-identity diff --git a/requirements/ingest-bedrock.txt b/requirements/ingest-bedrock.txt index 4b73e94b12..313ae62798 
100644 --- a/requirements/ingest-bedrock.txt +++ b/requirements/ingest-bedrock.txt @@ -61,9 +61,9 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.318 +langchain==0.0.320 # via -r requirements/ingest-bedrock.in -langsmith==0.0.46 +langsmith==0.0.49 # via langchain marshmallow==3.20.1 # via diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index 197504d4d2..ba7e56369e 100644 --- a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -4,7 +4,7 @@ # # pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in # -atlassian-python-api==3.41.2 +atlassian-python-api==3.41.3 # via -r requirements/ingest-confluence.in certifi==2023.7.22 # via diff --git a/requirements/ingest-jira.txt b/requirements/ingest-jira.txt index c56b7f624c..732571fb17 100644 --- a/requirements/ingest-jira.txt +++ b/requirements/ingest-jira.txt @@ -4,7 +4,7 @@ # # pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in # -atlassian-python-api==3.41.2 +atlassian-python-api==3.41.3 # via -r requirements/ingest-jira.in certifi==2023.7.22 # via diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index e8f6a1ebe4..70a0f0c393 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -50,9 +50,9 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.318 +langchain==0.0.320 # via -r requirements/ingest-openai.in -langsmith==0.0.46 +langsmith==0.0.49 # via langchain marshmallow==3.20.1 # via diff --git a/requirements/test.txt b/requirements/test.txt index 409d2ceace..a48f577a3c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -103,7 +103,7 @@ requests==2.31.0 # via # -c requirements/base.txt # label-studio-sdk -ruff==0.1.0 +ruff==0.1.1 # via -r requirements/test.in six==1.16.0 # via diff --git a/test_unstructured/partition/pdf_image/test_pdf.py 
b/test_unstructured/partition/pdf_image/test_pdf.py index 35c2223c4e..ede41841b5 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var( mock_process.assert_called_once_with( filename, is_image=False, - pdf_image_dpi=200, + pdf_image_dpi=mock.ANY, + extract_tables=mock.ANY, model_name="checkbox", + extract_images_in_pdf=mock.ANY, + image_output_dir_path=mock.ANY, ) @@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name( mock_process.assert_called_once_with( filename, is_image=False, - pdf_image_dpi=200, + pdf_image_dpi=mock.ANY, + extract_tables=mock.ANY, model_name="checkbox", + extract_images_in_pdf=mock.ANY, + image_output_dir_path=mock.ANY, ) @@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi(): mock_process.assert_called_once_with( filename, is_image=False, + extract_tables=mock.ANY, model_name=pdf.default_hi_res_model(), pdf_image_dpi=100, + extract_images_in_pdf=mock.ANY, + image_output_dir_path=mock.ANY, ) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 834cdb41c9..d6414619ae 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -3,7 +3,7 @@ import pathlib import warnings from importlib import import_module -from unittest.mock import patch +from unittest.mock import ANY, patch import docx import pytest @@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch): url=None, include_page_breaks=False, infer_table_structure=False, + extract_images_in_pdf=ANY, + image_output_dir_path=ANY, strategy="fast", languages=None, ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 692c04d172..750a52e559 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.25" # pragma: no cover +__version__ = "0.10.26-dev0" # pragma: no cover diff 
--git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index b5d31843ce..6005068b5c 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -135,6 +135,8 @@ def partition( languages: Optional[List[str]] = None, detect_language_per_element: bool = False, pdf_infer_table_structure: bool = False, + pdf_extract_images: bool = False, + pdf_image_output_dir_path: Optional[str] = None, xml_keep_tags: bool = False, data_source_metadata: Optional[DataSourceMetadata] = None, metadata_filename: Optional[str] = None, @@ -186,6 +188,12 @@ def partition( additional metadata field, "text_as_html," where the value (string) is a just a transformation of the data into an HTML