Skip to content

Commit

Permalink
feat: pass list type parameters via client sdk (#2567)
Browse files Browse the repository at this point in the history
The purpose of this PR is to support using the same type of parameters
as `partition_*()` when using `partition_via_api()`. This PR works
together with `unstructured-api` [PR
#368](Unstructured-IO/unstructured-api#368).

**Note:** This PR will support extracting image blocks ("Image", "Table")
via partition_via_api().

### Summary
- update `partition_via_api()` to convert all list type parameters to
JSON formatted strings before passing them to the unstructured client
SDK
- add a unit test function to test extracting image blocks via
`partition_via_api()`
- add a unit test function to test list type parameters passed to API
via unstructured client sdk

### Testing
```
from unstructured.partition.api import partition_via_api

elements = partition_via_api(
    filename="example-docs/embedded-images-tables.pdf",
    api_key="YOUR-API-KEY",
    strategy="hi_res",
    extract_image_block_types=["image", "table"],
)

image_block_elements = [el for el in elements if el.category == "Image" or el.category == "Table"]
print("\n\n".join([el.metadata.image_mime_type for el in image_block_elements]))
print("\n\n".join([el.metadata.image_base64 for el in image_block_elements]))
```
  • Loading branch information
christinestraub authored Feb 26, 2024
1 parent 8f78538 commit ee8b0f9
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

### Fixes

* **Fix passing list type parameters when calling unstructured API via `partition_via_api()`** Update `partition_via_api()` to convert all list type parameters to JSON formatted strings before calling the unstructured client SDK. This will support image block extraction via `partition_via_api()`.
* **Add OctoAI embedder** Adds support for embeddings via OctoAI.
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors**
* **Fix don't treat plain text files with double quotes as JSON** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON.
Expand Down
49 changes: 48 additions & 1 deletion test_unstructured/partition/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import base64
import contextlib
import json
import os
Expand All @@ -8,7 +9,7 @@
import requests
from unstructured_client.general import General

from unstructured.documents.elements import NarrativeText
from unstructured.documents.elements import ElementType, NarrativeText
from unstructured.partition.api import partition_multiple_via_api, partition_via_api

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand Down Expand Up @@ -210,6 +211,52 @@ def test_partition_via_api_valid_request_data_kwargs():
assert isinstance(elements, list)


def test_partition_via_api_image_block_extraction():
    """Check that Image elements returned by the API carry an embedded image payload.

    Partitions ``example-docs/embedded-images-tables.pdf`` through
    ``partition_via_api()`` with ``strategy="hi_res"`` and
    ``extract_image_block_types=["image", "table"]``, then verifies that each
    element whose category is ``ElementType.IMAGE`` has both ``image_base64``
    and ``image_mime_type`` set and that the base64 payload decodes to bytes.

    NOTE(review): only Image elements are inspected even though "table" blocks
    are requested as well, and the test passes vacuously when the response
    contains no Image elements -- confirm whether that is intended.
    """
    pdf_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf")
    api_elements = partition_via_api(
        filename=pdf_path,
        strategy="hi_res",
        extract_image_block_types=["image", "table"],
        api_key=get_api_key(),
    )

    for element in api_elements:
        if element.category == ElementType.IMAGE:
            metadata = element.metadata
            assert metadata.image_base64 is not None
            assert metadata.image_mime_type is not None
            # b64decode raises on malformed input, so reaching the isinstance
            # check already means the payload was decodable.
            decoded = base64.b64decode(metadata.image_base64)
            assert isinstance(decoded, bytes)


def test_partition_via_api_pass_list_type_parameters(monkeypatch):
    """Check that list-valued parameters reach the HTTP layer as JSON strings.

    ``requests.Session.request`` is replaced with a ``Mock`` that returns
    ``MockResponse(status_code=200)``, so no real request is sent. After calling
    ``partition_via_api()`` with three list-valued keyword arguments, the test
    asserts that the most recent call to the mock received a ``files`` payload
    in which every list has been rendered as a JSON-formatted string, while the
    plain string ``strategy`` value is passed through unchanged.
    """
    fake_request = Mock(return_value=MockResponse(status_code=200))
    monkeypatch.setattr(requests.Session, "request", fake_request)

    pdf_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf")

    partition_via_api(
        filename=pdf_path,
        strategy="hi_res",
        extract_image_block_types=["image", "table"],
        skip_infer_table_types=["pdf", "docx"],
        languages=["eng"],
    )

    # Each entry is [field_name, [filename, value]]; the uploaded document
    # itself is matched loosely with ANY.
    expected_files = [
        ["extract_image_block_types", [None, '["image", "table"]']],
        ["files", ANY],
        ["languages", [None, '["eng"]']],
        ["skip_infer_table_types", [None, '["pdf", "docx"]']],
        ["strategy", [None, "hi_res"]],
    ]
    fake_request.assert_called_with(
        "POST",
        ANY,
        data=ANY,
        files=expected_files,
        headers=ANY,
        params=ANY,
    )


# Note(austin) - This test is way too noisy against the hosted api
# def test_partition_via_api_invalid_request_data_kwargs():
# filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")
Expand Down
8 changes: 8 additions & 0 deletions unstructured/partition/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextlib
import json
from typing import (
IO,
List,
Expand Down Expand Up @@ -88,6 +89,13 @@ def partition_via_api(
file_name=metadata_filename,
)

# NOTE(christine): Converts all list type parameters to JSON formatted strings
# (e.g. ["image", "table"] -> '["image", "table"]')
# This can be removed if "speakeasy" supports passing list type parameters to FastAPI.
for k, v in request_kwargs.items():
if isinstance(v, list):
request_kwargs[k] = json.dumps(v)

req = shared.PartitionParameters(
files=files,
**request_kwargs,
Expand Down

0 comments on commit ee8b0f9

Please sign in to comment.