Skip to content

Commit

Permalink
chore: improve kwarg handling (#1810)
Browse files Browse the repository at this point in the history
Closes `unstructured-inference` issue
[#265](Unstructured-IO/unstructured-inference#265).

Cleaned up the kwarg handling, taking opportunities to turn instances of
handling kwargs as dicts into using them as normal parameters in function
signatures.

#### Testing:

Should just pass CI.
  • Loading branch information
qued authored Oct 23, 2023
1 parent 82c8adb commit 7fdddfb
Show file tree
Hide file tree
Showing 19 changed files with 75 additions and 44 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.10.26-dev0

### Enhancements

### Features

### Fixes

## 0.10.25

### Enhancements
Expand Down
4 changes: 2 additions & 2 deletions requirements/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/embed-huggingface.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
markupsafe==2.1.3
# via jinja2
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pdf2image
pdfminer.six
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
4 changes: 2 additions & 2 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ sympy==1.12
# via
# onnxruntime
# torch
timm==0.9.7
timm==0.9.8
# via effdet
tokenizers==0.14.1
# via transformers
Expand Down Expand Up @@ -236,7 +236,7 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# via -r requirements/extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure-cognitive-search.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
azure-common==1.1.28
# via azure-search-documents
azure-core==1.29.4
azure-core==1.29.5
# via
# azure-search-documents
# msrest
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
azure-core==1.29.4
azure-core==1.29.5
# via
# adlfs
# azure-identity
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-bedrock.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-confluence.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-confluence.in
certifi==2023.7.22
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-jira.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-jira.in
certifi==2023.7.22
# via
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-openai.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.1.0
ruff==0.1.1
# via -r requirements/test.in
six==1.16.0
# via
Expand Down
13 changes: 11 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand All @@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down Expand Up @@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
mock_process.assert_called_once_with(
filename,
is_image=False,
extract_tables=mock.ANY,
model_name=pdf.default_hi_res_model(),
pdf_image_dpi=100,
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
import warnings
from importlib import import_module
from unittest.mock import patch
from unittest.mock import ANY, patch

import docx
import pytest
Expand Down Expand Up @@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
url=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
strategy="fast",
languages=None,
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.25" # pragma: no cover
__version__ = "0.10.26-dev0" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def partition(
languages: Optional[List[str]] = None,
detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False,
pdf_image_output_dir_path: Optional[str] = None,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
Expand Down Expand Up @@ -186,6 +188,12 @@ def partition(
additional metadata field, "text_as_html," where the value (string) is just a
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images
If True and strategy=hi_res, any detected images will be saved in the path specified by
pdf_image_output_dir_path.
pdf_image_output_dir_path
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
given path.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
Expand Down Expand Up @@ -367,6 +375,8 @@ def partition(
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
extract_images_in_pdf=pdf_extract_images,
image_output_dir_path=pdf_image_output_dir_path,
**kwargs,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
Expand Down
7 changes: 2 additions & 5 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.partition.utils.constants import (
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.utils import dependency_exists, first

if dependency_exists("docx") and dependency_exists("docx.table"):
Expand Down Expand Up @@ -551,11 +548,11 @@ def document_to_element_list(
infer_list_items: bool = True,
source_format: Optional[str] = None,
detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs,
) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

num_pages = len(document.pages)
for i, page in enumerate(document.pages):
Expand Down
41 changes: 24 additions & 17 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def partition_pdf(
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
links: Sequence[Link] = [],
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -135,6 +137,12 @@ def partition_pdf(
processing text/plain content.
metadata_last_modified
The last modified date for the document.
extract_images_in_pdf
If True and strategy=hi_res, any detected images will be saved in the path specified by
image_output_dir_path.
image_output_dir_path
If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
given path.
"""
exactly_one(filename=filename, file=file)

Expand Down Expand Up @@ -164,6 +172,8 @@ def partition_pdf(
max_partition=max_partition,
min_partition=min_partition,
metadata_last_modified=metadata_last_modified,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)

Expand Down Expand Up @@ -210,6 +220,8 @@ def partition_pdf_or_image(
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -292,6 +304,8 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)
layout_elements = []
Expand Down Expand Up @@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
**kwargs,
) -> List[Element]:
"""Partition using package installed locally."""
Expand All @@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages)

model_name = model_name or default_hi_res_model()
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
if pdf_image_dpi is None:
pdf_image_dpi = 300 if model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (model_name == "chipper"):
Expand All @@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).",
)

# NOTE(christine): Need to extract images from PDF's
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
image_output_dir_path = kwargs.get("image_output_dir_path", None)
process_with_model_extra_kwargs = {
"extract_images_in_pdf": extract_images_in_pdf,
"image_output_dir_path": image_output_dir_path,
}

process_with_model_kwargs = {}
for key, value in process_with_model_extra_kwargs.items():
if value:
process_with_model_kwargs[key] = value

if file is None:
# NOTE(christine): out_layout = extracted_layout + inferred_layout
out_layout = process_file_with_model(
filename,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
out_layout = process_data_with_model(
file,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand Down Expand Up @@ -528,11 +535,11 @@ def _process_pdfminer_pages(
filename: str = "",
include_page_breaks: bool = False,
metadata_last_modified: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs,
):
"""Uses PDF miner to split a document into pages and process them."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

rsrcmgr = PDFResourceManager()
laparams = LAParams()
Expand Down
1 change: 1 addition & 0 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class OCRMode(Enum):
SORT_MODE_BASIC = "basic"
SORT_MODE_DONT = "dont"


SUBREGION_THRESHOLD_FOR_OCR = 0.5
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)

Expand Down
5 changes: 1 addition & 4 deletions unstructured/partition/utils/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@

from unstructured.documents.elements import CoordinatesMetadata, Element
from unstructured.logger import trace_logger
from unstructured.partition.utils.constants import (
SORT_MODE_BASIC,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped


Expand Down

0 comments on commit 7fdddfb

Please sign in to comment.