Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: improve kwarg handling #1810

Merged
merged 12 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.10.26-dev0

### Enhancements

### Features

### Fixes

## 0.10.25

### Enhancements
Expand Down
4 changes: 2 additions & 2 deletions requirements/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/embed-huggingface.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
markupsafe==2.1.3
# via jinja2
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pdf2image
pdfminer.six
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
4 changes: 2 additions & 2 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ sympy==1.12
# via
# onnxruntime
# torch
timm==0.9.7
timm==0.9.8
# via effdet
tokenizers==0.14.1
# via transformers
Expand Down Expand Up @@ -236,7 +236,7 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.7.9
unstructured-inference==0.7.10
# via -r requirements/extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure-cognitive-search.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
azure-common==1.1.28
# via azure-search-documents
azure-core==1.29.4
azure-core==1.29.5
# via
# azure-search-documents
# msrest
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
azure-core==1.29.4
azure-core==1.29.5
# via
# adlfs
# azure-identity
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-bedrock.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-confluence.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-confluence.in
certifi==2023.7.22
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-jira.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
#
atlassian-python-api==3.41.2
atlassian-python-api==3.41.3
# via -r requirements/ingest-jira.in
certifi==2023.7.22
# via
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.318
langchain==0.0.320
# via -r requirements/ingest-openai.in
langsmith==0.0.46
langsmith==0.0.49
# via langchain
marshmallow==3.20.1
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ requests==2.31.0
# via
# -c requirements/base.txt
# label-studio-sdk
ruff==0.1.0
ruff==0.1.1
# via -r requirements/test.in
six==1.16.0
# via
Expand Down
13 changes: 11 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,11 @@ def test_partition_pdf_with_model_name_env_var(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand All @@ -199,8 +202,11 @@ def test_partition_pdf_with_model_name(
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=200,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down Expand Up @@ -440,8 +446,11 @@ def test_partition_pdf_with_dpi():
mock_process.assert_called_once_with(
filename,
is_image=False,
extract_tables=mock.ANY,
model_name=pdf.default_hi_res_model(),
pdf_image_dpi=100,
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)


Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
import warnings
from importlib import import_module
from unittest.mock import patch
from unittest.mock import ANY, patch

import docx
import pytest
Expand Down Expand Up @@ -347,6 +347,8 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
url=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
strategy="fast",
languages=None,
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.25" # pragma: no cover
__version__ = "0.10.26-dev0" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def partition(
languages: Optional[List[str]] = None,
detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False,
pdf_image_output_dir_path: Optional[str] = None,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
Expand Down Expand Up @@ -186,6 +188,12 @@ def partition(
additional metadata field, "text_as_html," where the value (string) is a just a
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images
If True and strategy=hi_res, any detected images will be saved in the path specified by
pdf_image_output_dir_path.
pdf_image_output_dir_path
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
given path
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
Expand Down Expand Up @@ -367,6 +375,8 @@ def partition(
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
extract_images_in_pdf=pdf_extract_images,
image_output_dir_path=pdf_image_output_dir_path,
**kwargs,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
Expand Down
7 changes: 2 additions & 5 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@
)
from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.partition.utils.constants import (
SORT_MODE_DONT,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.utils import dependency_exists, first

if dependency_exists("docx") and dependency_exists("docx.table"):
Expand Down Expand Up @@ -551,11 +548,11 @@ def document_to_element_list(
infer_list_items: bool = True,
source_format: Optional[str] = None,
detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs,
) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

num_pages = len(document.pages)
for i, page in enumerate(document.pages):
Expand Down
41 changes: 24 additions & 17 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def partition_pdf(
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
links: Sequence[Link] = [],
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -135,6 +137,12 @@ def partition_pdf(
processing text/plain content.
metadata_last_modified
The last modified date for the document.
extract_images_in_pdf
If True and strategy=hi_res, any detected images will be saved in the path specified by
image_output_dir_path.
image_output_dir_path
If extract_images_in_pdf=True and strategy=hi_res, any detected images will be saved in the
given path
"""
exactly_one(filename=filename, file=file)

Expand Down Expand Up @@ -164,6 +172,8 @@ def partition_pdf(
max_partition=max_partition,
min_partition=min_partition,
metadata_last_modified=metadata_last_modified,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)

Expand Down Expand Up @@ -210,6 +220,8 @@ def partition_pdf_or_image(
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -292,6 +304,8 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
**kwargs,
)
layout_elements = []
Expand Down Expand Up @@ -334,6 +348,9 @@ def _partition_pdf_or_image_local(
ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
extract_images_in_pdf: bool = False,
qued marked this conversation as resolved.
Show resolved Hide resolved
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
**kwargs,
) -> List[Element]:
"""Partition using package installed locally."""
Expand All @@ -350,7 +367,6 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages)

model_name = model_name or default_hi_res_model()
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
if pdf_image_dpi is None:
pdf_image_dpi = 300 if model_name == "chipper" else 200
if (pdf_image_dpi < 300) and (model_name == "chipper"):
Expand All @@ -359,27 +375,16 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).",
)

# NOTE(christine): Need to extract images from PDF's
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
image_output_dir_path = kwargs.get("image_output_dir_path", None)
process_with_model_extra_kwargs = {
"extract_images_in_pdf": extract_images_in_pdf,
"image_output_dir_path": image_output_dir_path,
}

process_with_model_kwargs = {}
for key, value in process_with_model_extra_kwargs.items():
if value:
process_with_model_kwargs[key] = value

if file is None:
# NOTE(christine): out_layout = extracted_layout + inferred_layout
out_layout = process_file_with_model(
filename,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand All @@ -398,9 +403,11 @@ def _partition_pdf_or_image_local(
out_layout = process_data_with_model(
file,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
**process_with_model_kwargs,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
Expand Down Expand Up @@ -528,11 +535,11 @@ def _process_pdfminer_pages(
filename: str = "",
include_page_breaks: bool = False,
metadata_last_modified: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
**kwargs,
):
"""Uses PDF miner to split a document into pages and process them."""
elements: List[Element] = []
sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT)

rsrcmgr = PDFResourceManager()
laparams = LAParams()
Expand Down
1 change: 1 addition & 0 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class OCRMode(Enum):
SORT_MODE_BASIC = "basic"
SORT_MODE_DONT = "dont"


SUBREGION_THRESHOLD_FOR_OCR = 0.5
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)

Expand Down
5 changes: 1 addition & 4 deletions unstructured/partition/utils/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@

from unstructured.documents.elements import CoordinatesMetadata, Element
from unstructured.logger import trace_logger
from unstructured.partition.utils.constants import (
SORT_MODE_BASIC,
SORT_MODE_XY_CUT,
)
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped


Expand Down
Loading