Unstructured-IO · qued · Oct 23, 2023 · Oct 20, 2023 · Oct 20, 2023 · Oct 20, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 0.10.26-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.10.25
 
 ### Enhancements

diff --git a/requirements/embed-huggingface.txt b/requirements/embed-huggingface.txt
@@ -76,9 +76,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/embed-huggingface.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 markupsafe==2.1.3
     # via jinja2

diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -6,7 +6,7 @@ pdf2image
 pdfminer.six
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -203,7 +203,7 @@ sympy==1.12
     # via
     #   onnxruntime
     #   torch
-timm==0.9.7
+timm==0.9.8
     # via effdet
 tokenizers==0.14.1
     # via transformers
@@ -236,7 +236,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.9
+unstructured-inference==0.7.10
     # via -r requirements/extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via

diff --git a/requirements/ingest-azure-cognitive-search.txt b/requirements/ingest-azure-cognitive-search.txt
@@ -6,7 +6,7 @@
 #
 azure-common==1.1.28
     # via azure-search-documents
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   azure-search-documents
     #   msrest

diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt
@@ -14,7 +14,7 @@ async-timeout==4.0.3
     # via aiohttp
 attrs==23.1.0
     # via aiohttp
-azure-core==1.29.4
+azure-core==1.29.5
     # via
     #   adlfs
     #   azure-identity

diff --git a/requirements/ingest-bedrock.txt b/requirements/ingest-bedrock.txt
@@ -61,9 +61,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-bedrock.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via

diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-confluence.in
 certifi==2023.7.22
     # via

diff --git a/requirements/ingest-jira.txt b/requirements/ingest-jira.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
 #
-atlassian-python-api==3.41.2
+atlassian-python-api==3.41.3
     # via -r requirements/ingest-jira.in
 certifi==2023.7.22
     # via

diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt
@@ -50,9 +50,9 @@ jsonpatch==1.33
     # via langchain
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.318
+langchain==0.0.320
     # via -r requirements/ingest-openai.in
-langsmith==0.0.46
+langsmith==0.0.49
     # via langchain
 marshmallow==3.20.1
     # via

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -103,7 +103,7 @@ requests==2.31.0
     # via
     #   -c requirements/base.txt
     #   label-studio-sdk
-ruff==0.1.0
+ruff==0.1.1
     # via -r requirements/test.in
 six==1.16.0
     # via

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
         mock_process.assert_called_once_with(
             filename,
             is_image=False,
-            pdf_image_dpi=200,
+            pdf_image_dpi=mock.ANY,
+            extract_tables=mock.ANY,
             model_name="checkbox",
+            extract_images_in_pdf=mock.ANY,
+            image_output_dir_path=mock.ANY,
         )
 
 
@@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
         mock_process.assert_called_once_with(
             filename,
             is_image=False,
-            pdf_image_dpi=200,
+            pdf_image_dpi=mock.ANY,
+            extract_tables=mock.ANY,
             model_name="checkbox",
+            extract_images_in_pdf=mock.ANY,
+            image_output_dir_path=mock.ANY,
         )
 
 
@@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
         mock_process.assert_called_once_with(
             filename,
             is_image=False,
+            extract_tables=mock.ANY,
             model_name=pdf.default_hi_res_model(),
             pdf_image_dpi=100,
+            extract_images_in_pdf=mock.ANY,
+            image_output_dir_path=mock.ANY,
         )
 
 

diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
@@ -23,7 +23,7 @@
     contains_emoji,
     document_to_element_list,
 )
-from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
+from unstructured.partition.utils.constants import SortMode
 
 
 class MockPageLayout(layout.PageLayout):
@@ -460,7 +460,7 @@ def test_document_to_element_list_handles_parent():
 
 @pytest.mark.parametrize(
     ("sort_mode", "call_count"),
-    [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
+    [(SortMode.SORT_MODE_DONT, 0), (SortMode.SORT_MODE_BASIC, 1), (SortMode.SORT_MODE_XY_CUT, 1)],
 )
 def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
     block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")

diff --git a/test_unstructured/partition/utils/test_sorting.py b/test_unstructured/partition/utils/test_sorting.py
@@ -2,7 +2,7 @@
 
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import CoordinatesMetadata, Element, Text
-from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
+from unstructured.partition.utils.constants import SortMode
 from unstructured.partition.utils.sorting import (
     coord_has_valid_points,
     coordinates_to_bbox,
@@ -60,7 +60,7 @@ def test_sort_xycut_neg_coordinates():
         elements.append(elem)
 
     # NOTE(crag): xycut not attempted, sort_page_elements returns original list
-    assert sort_page_elements(elements, sort_mode=SORT_MODE_XY_CUT) is not elements
+    assert sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_XY_CUT) is not elements
 
 
 def test_sort_xycut_pos_coordinates():
@@ -74,7 +74,7 @@ def test_sort_xycut_pos_coordinates():
         elements.append(elem)
 
     # NOTE(crag): xycut ran, so different list reference returned from input list
-    assert sort_page_elements(elements, sort_mode=SORT_MODE_XY_CUT) is not elements
+    assert sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_XY_CUT) is not elements
 
 
 def test_sort_basic_neg_coordinates():
@@ -87,7 +87,7 @@ def test_sort_basic_neg_coordinates():
         )
         elements.append(elem)
 
-    sorted_page_elements = sort_page_elements(elements, sort_mode=SORT_MODE_BASIC)
+    sorted_page_elements = sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_BASIC)
     sorted_elem_text = " ".join([str(elem.text) for elem in sorted_page_elements])
     assert sorted_elem_text == "2 1 0"
 
@@ -102,7 +102,7 @@ def test_sort_basic_pos_coordinates():
         )
         elements.append(elem)
 
-    sorted_page_elements = sort_page_elements(elements, sort_mode=SORT_MODE_BASIC)
+    sorted_page_elements = sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_BASIC)
     assert sorted_page_elements is not elements
 
     sorted_elem_text = " ".join([str(elem.text) for elem in sorted_page_elements])

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.25"  # pragma: no cover
+__version__ = "0.10.26-dev0"  # pragma: no cover
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
@@ -35,10 +35,7 @@
 )
 from unstructured.logger import logger
 from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
-from unstructured.partition.utils.constants import (
-    SORT_MODE_DONT,
-    SORT_MODE_XY_CUT,
-)
+from unstructured.partition.utils.constants import SortMode
 from unstructured.utils import dependency_exists, first
 
 if dependency_exists("docx") and dependency_exists("docx.table"):
@@ -551,11 +548,11 @@ def document_to_element_list(
     infer_list_items: bool = True,
     source_format: Optional[str] = None,
     detection_origin: Optional[str] = None,
+    sort_mode: SortMode = SortMode.SORT_MODE_XY_CUT,
     **kwargs,
 ) -> List[Element]:
     """Converts a DocumentLayout object to a list of unstructured elements."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)
 
     num_pages = len(document.pages)
     for i, page in enumerate(document.pages):
@@ -630,7 +627,7 @@ def document_to_element_list(
                 )
                 element.metadata.parent_id = element_parent.id
         sorted_page_elements = page_elements
-        if sortable and sort_mode != SORT_MODE_DONT:
+        if sortable and sort_mode != SortMode.SORT_MODE_DONT:
             sorted_page_elements = sort_page_elements(page_elements, sort_mode)
 
         if include_page_breaks and i < num_pages - 1:

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -60,12 +60,7 @@
 )
 from unstructured.partition.strategies import determine_pdf_or_image_strategy
 from unstructured.partition.text import element_from_text, partition_text
-from unstructured.partition.utils.constants import (
-    SORT_MODE_BASIC,
-    SORT_MODE_DONT,
-    SORT_MODE_XY_CUT,
-    OCRMode,
-)
+from unstructured.partition.utils.constants import OCRMode, SortMode
 from unstructured.partition.utils.sorting import (
     coord_has_valid_points,
     sort_page_elements,
@@ -334,6 +329,9 @@ def _partition_pdf_or_image_local(
     ocr_mode: str = OCRMode.FULL_PAGE.value,
     model_name: Optional[str] = None,
     metadata_last_modified: Optional[str] = None,
+    extract_images_in_pdf: bool = False,
+    image_output_dir_path: Optional[str] = None,
+    pdf_image_dpi: Optional[int] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally."""
@@ -350,7 +348,6 @@ def _partition_pdf_or_image_local(
     ocr_languages = prepare_languages_for_tesseract(languages)
 
     model_name = model_name or default_hi_res_model()
-    pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
     if pdf_image_dpi is None:
         pdf_image_dpi = 300 if model_name == "chipper" else 200
     if (pdf_image_dpi < 300) and (model_name == "chipper"):
@@ -359,27 +356,16 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )
 
-    # NOTE(christine): Need to extract images from PDF's
-    extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
-    image_output_dir_path = kwargs.get("image_output_dir_path", None)
-    process_with_model_extra_kwargs = {
-        "extract_images_in_pdf": extract_images_in_pdf,
-        "image_output_dir_path": image_output_dir_path,
-    }
-
-    process_with_model_kwargs = {}
-    for key, value in process_with_model_extra_kwargs.items():
-        if value:
-            process_with_model_kwargs[key] = value
-
     if file is None:
         # NOTE(christine): out_layout = extracted_layout + inferred_layout
         out_layout = process_file_with_model(
             filename,
             is_image=is_image,
+            extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -398,9 +384,11 @@ def _partition_pdf_or_image_local(
         out_layout = process_data_with_model(
             file,
             is_image=is_image,
+            extract_tables=infer_table_structure,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            **process_with_model_kwargs,
+            extract_images_in_pdf=extract_images_in_pdf,
+            image_output_dir_path=image_output_dir_path,
         )
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -420,7 +408,7 @@ def _partition_pdf_or_image_local(
 
     # NOTE(alan): starting with v2, chipper sorts the elements itself.
     if model_name == "chipper":
-        kwargs["sort_mode"] = SORT_MODE_DONT
+        kwargs["sort_mode"] = SortMode.SORT_MODE_DONT
 
     elements = document_to_element_list(
         final_layout,
@@ -528,11 +516,11 @@ def _process_pdfminer_pages(
     filename: str = "",
     include_page_breaks: bool = False,
     metadata_last_modified: Optional[str] = None,
+    sort_mode: SortMode = SortMode.SORT_MODE_XY_CUT,
     **kwargs,
 ):
     """Uses PDF miner to split a document into pages and process them."""
     elements: List[Element] = []
-    sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)
 
     rsrcmgr = PDFResourceManager()
     laparams = LAParams()
@@ -661,8 +649,8 @@ def _process_pdfminer_pages(
 
         # NOTE(crag, christine): always do the basic sort first for deterministic order across
         # python versions.
-        sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC)
-        if sort_mode != SORT_MODE_BASIC:
+        sorted_page_elements = sort_page_elements(page_elements, SortMode.SORT_MODE_BASIC)
+        if sort_mode != SortMode.SORT_MODE_BASIC:
             sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode)
 
         elements += sorted_page_elements

diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
@@ -7,9 +7,11 @@ class OCRMode(Enum):
     FULL_PAGE = "entire_page"
 
 
-SORT_MODE_XY_CUT = "xy-cut"
-SORT_MODE_BASIC = "basic"
-SORT_MODE_DONT = "dont"
+class SortMode(Enum):
+    SORT_MODE_XY_CUT = "xy-cut"
+    SORT_MODE_BASIC = "basic"
+    SORT_MODE_DONT = "dont"
+
 
 SUBREGION_THRESHOLD_FOR_OCR = 0.5
 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.10.25" # pragma: no cover
		__version__ = "0.10.26-dev0" # pragma: no cover