Skip to content

Commit

Permalink
chore: improve kwarg handling (#1810)
Browse files Browse the repository at this point in the history
Closes `unstructured-inference` issue
[#265](Unstructured-IO/unstructured-inference#265).

Cleaned up the kwarg handling, taking opportunities to turn instances of
handling kwargs as dicts into using them as normal parameters in function
signatures.

#### Testing:

Should just pass CI.
  • Loading branch information
qued authored Oct 23, 2023
1 parent 82c8adb commit 7fdddfb
Show file tree
Hide file tree
Showing 19 changed files with 75 additions and 44 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.10.26-dev0

### Enhancements

### Features

### Fixes

## 0.10.25

### Enhancements
Expand Down
4 changes: 2 additions & 2 deletions requirements/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/embed-huggingface.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
markupsafe==2.1.3
# via jinja2
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pdf2image
pdfminer.six
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
4 changes: 2 additions & 2 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ sympy==1.12
# via
# onnxruntime
# torch
timm==0.9.7
timm==0.9.8
# via effdet
tokenizers==0.14.1
# via transformers
Expand Down Expand Up @@ -236,7 +236,7 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# via -r requirements/extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure-cognitive-search.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
azure-common==1.1.28
# via azure-search-documents
azure-core==1.29.4
azure-core==1.29.5
# via
# azure-search-documents
# msrest
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
azure-core==1.29.4
azure-core==1.29.5
# via
# adlfs
# azure-identity
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-bedrock.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-confluence.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-confluence.in
certifi==2023.7.22
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-jira.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-jira.in
certifi==2023.7.22
# via
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-openai.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.1.0
ruff==0.1.1
# via -r requirements/test.in
six==1.16.0
# via
Expand Down
13 changes: 11 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand All @@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down Expand Up @@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
mock_process.assert_called_once_with(
filename,
is_image=False,
extract_tables=mock.ANY,
model_name=pdf.default_hi_res_model(),
pdf_image_dpi=100,
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
import warnings
from importlib import import_module
from unittest.mock import patch
from unittest.mock import ANY, patch

import docx
import pytest
Expand Down Expand Up @@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
url=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
strategy="fast",
languages=None,
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.25" # pragma: no cover
__version__ = "0.10.26-dev0" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def partition(
languages: Optional[List[str]] = None,
detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False,
pdf_image_output_dir_path: Optional[str] = None,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
Expand Down Expand Up @@ -186,6 +188,12 @@ def partition(
additional metadata field, "text_as_html," where the value (string) is just a
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images
If True and strategy=hi_res, any detected images will be saved in the path specified by
pdf_image_output_dir_path.
pdf_image_output_dir_path
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
given path.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
Expand Down Expand Up @@ -367,6 +375,8 @@ def partition(
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
extract_images_in_pdf=pdf_extract_images,
image_output_dir_path=pdf_image_output_dir_path,
**kwargs,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
Expand Down
7 changes: 2 additions & 5 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.partition.utils.constants import (
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.utils import dependency_exists, first

if dependency_exists("docx") and dependency_exists("docx.table"):
Expand Down Expand Up @@ -551,11 +548,11 @@ def document_to_element_list(
infer_list_items: bool = True,
source_format: Optional[str] = None,
detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs,
) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

num_pages = len(document.pages)
for i, page in enumerate(document.pages):
Expand Down
41 changes: 24 additions & 17 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def partition_pdf(
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
links: Sequence[Link] = [],
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -135,6 +137,12 @@ def partition_pdf(
processing text/plain content.
metadata_last_modified
The last modified date for the document.
extract_images_in_pdf
If True and strategy=hi_res, any detected images will be saved in the path specified by
image_output_dir_path.
image_output_dir_path
If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
given path.
"""
exactly_one(filename=filename, file=file)

Expand Down Expand Up @@ -164,6 +172,8 @@ def partition_pdf(
max_partition=max_partition,
min_partition=min_partition,
metadata_last_modified=metadata_last_modified,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)

Expand Down Expand Up @@ -210,6 +220,8 @@ def partition_pdf_or_image(
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -292,6 +304,8 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)
layout_elements = []
Expand Down Expand Up @@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
**kwargs,
) -> List[Element]:
"""Partition using package installed locally."""
Expand All @@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages)

model_name = model_name or default_hi_res_model()
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
if pdf_image_dpi is None:
pdf_image_dpi = 300 if model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (model_name == "chipper"):
Expand All @@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).",
)

# NOTE(christine): Need to extract images from PDF's
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
image_output_dir_path = kwargs.get("image_output_dir_path", None)
process_with_model_extra_kwargs = {
"extract_images_in_pdf": extract_images_in_pdf,
"image_output_dir_path": image_output_dir_path,
}

process_with_model_kwargs = {}
for key, value in process_with_model_extra_kwargs.items():
if value:
process_with_model_kwargs[key] = value

if file is None:
# NOTE(christine): out_layout = extracted_layout + inferred_layout
out_layout = process_file_with_model(
filename,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
out_layout = process_data_with_model(
file,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand Down Expand Up @@ -528,11 +535,11 @@ def _process_pdfminer_pages(
filename: str = "",
include_page_breaks: bool = False,
metadata_last_modified: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs,
):
"""Uses PDF miner to split a document into pages and process them."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

rsrcmgr = PDFResourceManager()
laparams = LAParams()
Expand Down
1 change: 1 addition & 0 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class OCRMode(Enum):
SORT_MODE_BASIC = "basic"
SORT_MODE_DONT = "dont"


SUBREGION_THRESHOLD_FOR_OCR = 0.5
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)

Expand Down
5 changes: 1 addition & 4 deletions unstructured/partition/utils/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@

from unstructured.documents.elements import CoordinatesMetadata, Element
from unstructured.logger import trace_logger
from unstructured.partition.utils.constants import (
SORT_MODE_BASIC,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped


Expand Down

0 comments on commit 7fdddfb

Please sign in to comment.