From ee8b0f93dcba5adb2e4e032346fbaafbe0f45e54 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Mon, 26 Feb 2024 11:17:06 -0800 Subject: [PATCH] feat: pass list type parameters via client sdk (#2567) The purpose of this PR is to support using the same type of parameters as `partition_*()` when using `partition_via_api()`. This PR works together with `unstructured-api` [PR #368](https://github.com/Unstructured-IO/unstructured-api/pull/368). **Note:** This PR will support extracting image blocks ("Image", "Table") via partition_via_api(). ### Summary - update `partition_via_api()` to convert all list type parameters to JSON formatted strings before passing them to the unstructured client SDK - add a unit test function to test extracting image blocks via `partition_via_api()` - add a unit test function to test list type parameters passed to API via unstructured client sdk ### Testing ``` from unstructured.partition.api import partition_via_api elements = partition_via_api( filename="example-docs/embedded-images-tables.pdf", api_key="YOUR-API-KEY", strategy="hi_res", extract_image_block_types=["image", "table"], ) image_block_elements = [el for el in elements if el.category == "Image" or el.category == "Table"] print("\n\n".join([el.metadata.image_mime_type for el in image_block_elements])) print("\n\n".join([el.metadata.image_base64 for el in image_block_elements])) ``` --- CHANGELOG.md | 1 + test_unstructured/partition/test_api.py | 49 ++++++++++++++++++++++++- unstructured/partition/api.py | 8 ++++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a32edbd4b2..e726a4e914 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ### Fixes +* **Fix passing list type parameters when calling unstructured API via `partition_via_api()`** Update `partition_via_api()` to convert all list type parameters to JSON formatted strings before calling the unstructured client SDK. 
This will support image block extraction via `partition_via_api()`. * **Add OctoAI embedder** Adds support for embeddings via OctoAI. * **Fix `check_connection` in opensearch, databricks, postgres, azure connectors** * **Fix don't treat plain text files with double quotes as JSON ** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON. diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 6cb01df6ce..cf61f837c1 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -1,3 +1,4 @@ +import base64 import contextlib import json import os @@ -8,7 +9,7 @@ import requests from unstructured_client.general import General -from unstructured.documents.elements import NarrativeText +from unstructured.documents.elements import ElementType, NarrativeText from unstructured.partition.api import partition_multiple_via_api, partition_via_api DIRECTORY = pathlib.Path(__file__).parent.resolve() @@ -210,6 +211,52 @@ def test_partition_via_api_valid_request_data_kwargs(): assert isinstance(elements, list) +def test_partition_via_api_image_block_extraction(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf") + elements = partition_via_api( + filename=filename, + strategy="hi_res", + extract_image_block_types=["image", "table"], + api_key=get_api_key(), + ) + image_elements = [el for el in elements if el.category == ElementType.IMAGE] + for el in image_elements: + assert el.metadata.image_base64 is not None + assert el.metadata.image_mime_type is not None + image_data = base64.b64decode(el.metadata.image_base64) + assert isinstance(image_data, bytes) + + +def test_partition_via_api_pass_list_type_parameters(monkeypatch): + mock_request = Mock(return_value=MockResponse(status_code=200)) + monkeypatch.setattr(requests.Session, "request", mock_request) + + filename = 
os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf") + + partition_via_api( + filename=filename, + strategy="hi_res", + extract_image_block_types=["image", "table"], + skip_infer_table_types=["pdf", "docx"], + languages=["eng"], + ) + + mock_request.assert_called_with( + "POST", + ANY, + data=ANY, + files=[ + ["extract_image_block_types", [None, '["image", "table"]']], + ["files", ANY], + ["languages", [None, '["eng"]']], + ["skip_infer_table_types", [None, '["pdf", "docx"]']], + ["strategy", [None, "hi_res"]], + ], + headers=ANY, + params=ANY, + ) + + # Note(austin) - This test is way too noisy against the hosted api # def test_partition_via_api_invalid_request_data_kwargs(): # filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") diff --git a/unstructured/partition/api.py b/unstructured/partition/api.py index ef209d7911..f2edd5922a 100644 --- a/unstructured/partition/api.py +++ b/unstructured/partition/api.py @@ -1,4 +1,5 @@ import contextlib +import json from typing import ( IO, List, @@ -88,6 +89,13 @@ def partition_via_api( file_name=metadata_filename, ) + # NOTE(christine): Converts all list type parameters to JSON formatted strings + # (e.g. ["image", "table"] -> '["image", "table"]') + # This can be removed if "speakeasy" supports passing list type parameters to FastAPI. + for k, v in request_kwargs.items(): + if isinstance(v, list): + request_kwargs[k] = json.dumps(v) + req = shared.PartitionParameters( files=files, **request_kwargs,