Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: improve kwarg handling #1810

Merged
merged 12 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.10.26-dev0

### Enhancements

### Features

### Fixes

## 0.10.25

### Enhancements
Expand Down
4 changes: 2 additions & 2 deletions requirements/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/embed-huggingface.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
markupsafe==2.1.3
# via jinja2
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pdf2image
pdfminer.six
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
4 changes: 2 additions & 2 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ sympy==1.12
# via
# onnxruntime
# torch
timm==0.9.7
timm==0.9.8
# via effdet
tokenizers==0.14.1
# via transformers
Expand Down Expand Up @@ -236,7 +236,7 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# via -r requirements/extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure-cognitive-search.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
azure-common==1.1.28
# via azure-search-documents
azure-core==1.29.4
azure-core==1.29.5
# via
# azure-search-documents
# msrest
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
azure-core==1.29.4
azure-core==1.29.5
# via
# adlfs
# azure-identity
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-bedrock.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-confluence.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-confluence.in
certifi==2023.7.22
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-jira.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-jira.in
certifi==2023.7.22
# via
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-openai.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.1.0
ruff==0.1.1
# via -r requirements/test.in
six==1.16.0
# via
Expand Down
13 changes: 11 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand All @@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down Expand Up @@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
mock_process.assert_called_once_with(
filename,
is_image=False,
extract_tables=mock.ANY,
model_name=pdf.default_hi_res_model(),
pdf_image_dpi=100,
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/partition/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
contains_emoji,
document_to_element_list,
)
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.partition.utils.constants import SortMode


class MockPageLayout(layout.PageLayout):
Expand Down Expand Up @@ -460,7 +460,7 @@ def test_document_to_element_list_handles_parent():

@pytest.mark.parametrize(
("sort_mode", "call_count"),
[(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
[(SortMode.SORT_MODE_DONT, 0), (SortMode.SORT_MODE_BASIC, 1), (SortMode.SORT_MODE_XY_CUT, 1)],
)
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
Expand Down
10 changes: 5 additions & 5 deletions test_unstructured/partition/utils/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import CoordinatesMetadata, Element, Text
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.constants import SortMode
from unstructured.partition.utils.sorting import (
coord_has_valid_points,
coordinates_to_bbox,
Expand Down Expand Up @@ -60,7 +60,7 @@ def test_sort_xycut_neg_coordinates():
elements.append(elem)

# NOTE(crag): xycut not attempted, sort_page_elements returns original list
assert sort_page_elements(elements, sort_mode=SORT_MODE_XY_CUT) is not elements
assert sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_XY_CUT) is not elements


def test_sort_xycut_pos_coordinates():
Expand All @@ -74,7 +74,7 @@ def test_sort_xycut_pos_coordinates():
elements.append(elem)

# NOTE(crag): xycut ran, so different list reference returned from input list
assert sort_page_elements(elements, sort_mode=SORT_MODE_XY_CUT) is not elements
assert sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_XY_CUT) is not elements


def test_sort_basic_neg_coordinates():
Expand All @@ -87,7 +87,7 @@ def test_sort_basic_neg_coordinates():
)
elements.append(elem)

sorted_page_elements = sort_page_elements(elements, sort_mode=SORT_MODE_BASIC)
sorted_page_elements = sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_BASIC)
sorted_elem_text = " ".join([str(elem.text) for elem in sorted_page_elements])
assert sorted_elem_text == "2 1 0"

Expand All @@ -102,7 +102,7 @@ def test_sort_basic_pos_coordinates():
)
elements.append(elem)

sorted_page_elements = sort_page_elements(elements, sort_mode=SORT_MODE_BASIC)
sorted_page_elements = sort_page_elements(elements, sort_mode=SortMode.SORT_MODE_BASIC)
assert sorted_page_elements is not elements

sorted_elem_text = " ".join([str(elem.text) for elem in sorted_page_elements])
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.25" # pragma: no cover
__version__ = "0.10.26-dev0" # pragma: no cover
9 changes: 3 additions & 6 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.partition.utils.constants import (
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SortMode
from unstructured.utils import dependency_exists, first

if dependency_exists("docx") and dependency_exists("docx.table"):
Expand Down Expand Up @@ -551,11 +548,11 @@ def document_to_element_list(
infer_list_items: bool = True,
source_format: Optional[str] = None,
detection_origin: Optional[str] = None,
sort_mode: SortMode = SortMode.SORT_MODE_XY_CUT,
**kwargs,
) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

num_pages = len(document.pages)
for i, page in enumerate(document.pages):
Expand Down Expand Up @@ -630,7 +627,7 @@ def document_to_element_list(
)
element.metadata.parent_id = element_parent.id
sorted_page_elements = page_elements
if sortable and sort_mode != SORT_MODE_DONT:
if sortable and sort_mode != SortMode.SORT_MODE_DONT:
sorted_page_elements = sort_page_elements(page_elements, sort_mode)

if include_page_breaks and i < num_pages - 1:
Expand Down
40 changes: 14 additions & 26 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,7 @@
)
from unstructured.partition.strategies import determine_pdf_or_image_strategy
from unstructured.partition.text import element_from_text, partition_text
from unstructured.partition.utils.constants import (
SORT_MODE_BASIC,
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
OCRMode,
)
from unstructured.partition.utils.constants import OCRMode, SortMode
from unstructured.partition.utils.sorting import (
coord_has_valid_points,
sort_page_elements,
Expand Down Expand Up @@ -334,6 +329,9 @@ def _partition_pdf_or_image_local(
ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
qued marked this conversation as resolved.
Show resolved Hide resolved
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
**kwargs,
) -> List[Element]:
"""Partition using package installed locally."""
Expand All @@ -350,7 +348,6 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages)

model_name = model_name or default_hi_res_model()
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
if pdf_image_dpi is None:
pdf_image_dpi = 300 if model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (model_name == "chipper"):
Expand All @@ -359,27 +356,16 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).",
)

# NOTE(christine): Need to extract images from PDFs
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
image_output_dir_path = kwargs.get("image_output_dir_path", None)
process_with_model_extra_kwargs = {
"extract_images_in_pdf": extract_images_in_pdf,
"image_output_dir_path": image_output_dir_path,
}

process_with_model_kwargs = {}
for key, value in process_with_model_extra_kwargs.items():
if value:
process_with_model_kwargs[key] = value

if file is None:
# NOTE(christine): out_layout = extracted_layout + inferred_layout
out_layout = process_file_with_model(
filename,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -398,9 +384,11 @@ def _partition_pdf_or_image_local(
out_layout = process_data_with_model(
file,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -420,7 +408,7 @@ def _partition_pdf_or_image_local(

# NOTE(alan): starting with v2, chipper sorts the elements itself.
if model_name == "chipper":
kwargs["sort_mode"] = SORT_MODE_DONT
kwargs["sort_mode"] = SortMode.SORT_MODE_DONT
qued marked this conversation as resolved.
Show resolved Hide resolved

elements = document_to_element_list(
final_layout,
Expand Down Expand Up @@ -528,11 +516,11 @@ def _process_pdfminer_pages(
filename: str = "",
include_page_breaks: bool = False,
metadata_last_modified: Optional[str] = None,
sort_mode: SortMode = SortMode.SORT_MODE_XY_CUT,
**kwargs,
):
"""Uses PDF miner to split a document into pages and process them."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

rsrcmgr = PDFResourceManager()
laparams = LAParams()
Expand Down Expand Up @@ -661,8 +649,8 @@ def _process_pdfminer_pages(

# NOTE(crag, christine): always do the basic sort first for deterministic order across
# Python versions.
sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC)
if sort_mode != SORT_MODE_BASIC:
sorted_page_elements = sort_page_elements(page_elements, SortMode.SORT_MODE_BASIC)
if sort_mode != SortMode.SORT_MODE_BASIC:
sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode)

elements += sorted_page_elements
Expand Down
8 changes: 5 additions & 3 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ class OCRMode(Enum):
FULL_PAGE = "entire_page"


SORT_MODE_XY_CUT = "xy-cut"
SORT_MODE_BASIC = "basic"
SORT_MODE_DONT = "dont"
class SortMode(Enum):
SORT_MODE_XY_CUT = "xy-cut"
SORT_MODE_BASIC = "basic"
SORT_MODE_DONT = "dont"


SUBREGION_THRESHOLD_FOR_OCR = 0.5
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
Expand Down
Loading
Loading