From 5e10d2b682f25aef44e3ab667993d04e66c2a671 Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:18:05 +0530 Subject: [PATCH 1/6] Changes for line-item extraction prompt type Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- .../static/select_choices.json | 3 +- ...003_alter_toolstudioprompt_enforce_type.py | 36 +++++++++ .../prompt_studio/prompt_studio_v2/models.py | 1 + .../src/unstract/prompt_service/constants.py | 1 + .../src/unstract/prompt_service/helper.py | 73 ++++++++++++++++++- .../src/unstract/prompt_service/main.py | 35 +++++++++ 6 files changed, 146 insertions(+), 3 deletions(-) create mode 100644 backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py diff --git a/backend/prompt_studio/prompt_studio_core_v2/static/select_choices.json b/backend/prompt_studio/prompt_studio_core_v2/static/select_choices.json index f9e002f7d..2e260a452 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/static/select_choices.json +++ b/backend/prompt_studio/prompt_studio_core_v2/static/select_choices.json @@ -15,7 +15,8 @@ "boolean":"boolean", "json":"json", "table":"table", - "record":"record" + "record":"record", + "line_item":"line-item" }, "output_processing":{ "DEFAULT":"Default" diff --git a/backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py b/backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py new file mode 100644 index 000000000..af359ec9a --- /dev/null +++ b/backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py @@ -0,0 +1,36 @@ +# Generated by Django 4.2.1 on 2024-12-10 04:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("prompt_studio_v2", "0002_alter_toolstudioprompt_enforce_type"), + ] + + operations = [ + migrations.AlterField( + model_name="toolstudioprompt", + name="enforce_type", + field=models.TextField( + blank=True, + choices=[ + ("Text", "Response sent as Text"), + ("number", "Response sent as number"), + ("email", "Response sent as email"), + ("date", "Response sent as date"), + ("boolean", "Response sent as boolean"), + ("json", "Response sent as json"), + ("table", "Response sent as table"), + ( + "record", + "Response sent for records. Entries of records are list of logical and organized individual entities with distint values", + ), + ("line-item", "Response sent as line-item"), + ], + db_comment="Field to store the type in which the response to be returned.", + default="Text", + ), + ), + ] diff --git a/backend/prompt_studio/prompt_studio_v2/models.py b/backend/prompt_studio/prompt_studio_v2/models.py index 9cd37c36f..afe978763 100644 --- a/backend/prompt_studio/prompt_studio_v2/models.py +++ b/backend/prompt_studio/prompt_studio_v2/models.py @@ -27,6 +27,7 @@ class EnforceType(models.TextChoices): "logical and organized individual " "entities with distint values" ) + LINE_ITEM = "line-item", ("Response sent as line-item") class PromptType(models.TextChoices): PROMPT = "PROMPT", "Response sent as Text" diff --git a/prompt-service/src/unstract/prompt_service/constants.py b/prompt-service/src/unstract/prompt_service/constants.py index e905eec1e..d6e9f36d7 100644 --- a/prompt-service/src/unstract/prompt_service/constants.py +++ b/prompt-service/src/unstract/prompt_service/constants.py @@ -71,6 +71,7 @@ class PromptServiceContants: ENABLE_HIGHLIGHT = "enable_highlight" FILE_PATH = "file_path" HIGHLIGHT_DATA = "highlight_data" + LINE_ITEM = "line-item" class RunLevel(Enum): diff --git a/prompt-service/src/unstract/prompt_service/helper.py b/prompt-service/src/unstract/prompt_service/helper.py index cf4ef63cb..50ce64542 100644 --- a/prompt-service/src/unstract/prompt_service/helper.py +++ b/prompt-service/src/unstract/prompt_service/helper.py @@ -16,6 +16,11 @@ from unstract.sdk.exceptions import SdkError from unstract.sdk.llm import LLM +PAID_FEATURE_MSG = ( + "It is a cloud / enterprise feature. If you have purchased a plan and still " + "face this issue, please contact support" +) + load_dotenv() # Global variable to store plugins @@ -295,8 +300,8 @@ def run_completion( extract_json=prompt_type.lower() != PSKeys.TEXT, ) answer: str = completion[PSKeys.RESPONSE].text - highlight_data = completion.get(PSKeys.HIGHLIGHT_DATA) - if all([metadata, highlight_data, prompt_key]): + highlight_data = completion.get(PSKeys.HIGHLIGHT_DATA, []) + if all([metadata, prompt_key]): metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {})[prompt_key] = highlight_data return answer # TODO: Catch and handle specific exception here @@ -333,3 +338,67 @@ def extract_table( except table_extractor["exception_cls"] as e: msg = f"Couldn't extract table. {e}" raise APIError(message=msg) + + +def extract_line_item( + tool_settings: dict[str, Any], + output: dict[str, Any], + plugins: dict[str, dict[str, Any]], + structured_output: dict[str, Any], + llm: LLM, + file_path: str, +) -> dict[str, Any]: + # Adjust file path to read from the extract folder + base_name = os.path.splitext(os.path.basename(file_path))[ + 0 + ] # Get the base name without extension + extract_file_path = os.path.join( + os.path.dirname(file_path), "extract", f"{base_name}.txt" + ) + + # Read file content into context + if not os.path.exists(extract_file_path): + raise FileNotFoundError( + f"The file at path '{extract_file_path}' does not exist." + ) + + with open(extract_file_path, encoding="utf-8") as file: + context = file.read() + + prompt = construct_prompt( + preamble=tool_settings.get(PSKeys.PREAMBLE, ""), + prompt=output["promptx"], + postamble=tool_settings.get(PSKeys.POSTAMBLE, ""), + grammar_list=tool_settings.get(PSKeys.GRAMMAR, []), + context=context, + platform_postamble="", + ) + # return run_completion( + # llm=llm, + # prompt=prompt, + # metadata=metadata, + # prompt_key=output[PSKeys.NAME], + # prompt_type=output.get(PSKeys.TYPE, PSKeys.TEXT), + # enable_highlight=enable_highlight, + # file_path=file_path, + # ) + line_item_extraction_plugin: dict[str, Any] = plugins.get( + "line-item-extraction", {} + ) + if not line_item_extraction_plugin: + raise APIError(PAID_FEATURE_MSG) + try: + line_item_extraction = line_item_extraction_plugin["entrypoint_cls"]( + llm=llm, + tool_settings=tool_settings, + output=output, + structured_output=structured_output, + logger=current_app.logger, + prompt=prompt, + ) + answer = line_item_extraction.run() + structured_output[output[PSKeys.NAME]] = answer + return structured_output + except line_item_extraction["exception_cls"] as e: + msg = f"Couldn't extract table. {e}" + raise APIError(message=msg) diff --git a/prompt-service/src/unstract/prompt_service/main.py b/prompt-service/src/unstract/prompt_service/main.py index 12c2242e0..43ea0052e 100644 --- a/prompt-service/src/unstract/prompt_service/main.py +++ b/prompt-service/src/unstract/prompt_service/main.py @@ -12,6 +12,7 @@ from unstract.prompt_service.exceptions import APIError, ErrorResponse, NoPayloadError from unstract.prompt_service.helper import ( construct_and_run_prompt, + extract_line_item, extract_table, extract_variable, get_cleaned_context, @@ -250,6 +251,40 @@ def prompt_processor() -> Any: "Error while extracting table for the prompt", ) raise api_error + elif output[PSKeys.TYPE] == PSKeys.LINE_ITEM: + try: + structured_output = extract_line_item( + tool_settings=tool_settings, + output=output, + plugins=plugins, + structured_output=structured_output, + llm=llm, + file_path=file_path, + ) + metadata = query_usage_metadata(token=platform_key, metadata=metadata) + response = { + PSKeys.METADATA: metadata, + PSKeys.OUTPUT: structured_output, + } + return response + except APIError as e: + app.logger.error( + "Failed to extract line-item for the prompt %s: %s", + output[PSKeys.NAME], + str(e), + ) + publish_log( + log_events_id, + { + "tool_id": tool_id, + "prompt_key": prompt_name, + "doc_name": doc_name, + }, + LogLevel.ERROR, + RunLevel.RUN, + "Error while extracting line-item for the prompt", + ) + raise e try: context: set[str] = set() From 668ed04bd21ae341e91edc62d4a462fde2627fc0 Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:32:22 +0530 Subject: [PATCH 2/6] Removed commented out code Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- prompt-service/src/unstract/prompt_service/helper.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/helper.py b/prompt-service/src/unstract/prompt_service/helper.py index 50ce64542..ac6152d48 100644 --- a/prompt-service/src/unstract/prompt_service/helper.py +++ b/prompt-service/src/unstract/prompt_service/helper.py @@ -373,15 +373,6 @@ def extract_line_item( context=context, platform_postamble="", ) - # return run_completion( - # llm=llm, - # prompt=prompt, - # metadata=metadata, - # prompt_key=output[PSKeys.NAME], - # prompt_type=output.get(PSKeys.TYPE, PSKeys.TEXT), - # enable_highlight=enable_highlight, - # file_path=file_path, - # ) line_item_extraction_plugin: dict[str, Any] = plugins.get( "line-item-extraction", {} ) From bd0e83228b27c2953ecc7155d61e1e1ac349e77f Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Mon, 16 Dec 2024 05:01:53 +0530 Subject: [PATCH 3/6] Minor fix Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- prompt-service/src/unstract/prompt_service/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/helper.py b/prompt-service/src/unstract/prompt_service/helper.py index ac6152d48..a9c927c5d 100644 --- a/prompt-service/src/unstract/prompt_service/helper.py +++ b/prompt-service/src/unstract/prompt_service/helper.py @@ -383,13 +383,13 @@ def extract_line_item( llm=llm, tool_settings=tool_settings, output=output, + prompt=prompt, structured_output=structured_output, logger=current_app.logger, - prompt=prompt, ) answer = line_item_extraction.run() structured_output[output[PSKeys.NAME]] = answer return structured_output - except line_item_extraction["exception_cls"] as e: + except line_item_extraction_plugin["exception_cls"] as e: msg = f"Couldn't extract table. {e}" raise APIError(message=msg) From 3a631441f51daf1da1674ccb9b398d7d88d9d272 Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:46:07 +0530 Subject: [PATCH 4/6] Minor improvements Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- .../output_manager_helper.py | 2 +- ...=> 0006_alter_toolstudioprompt_enforce_type.py} | 9 ++++++--- backend/prompt_studio/prompt_studio_v2/models.py | 7 ++++++- .../src/unstract/prompt_service/helper.py | 14 +++++++++----- prompt-service/src/unstract/prompt_service/main.py | 5 ++++- 5 files changed, 26 insertions(+), 11 deletions(-) rename backend/prompt_studio/prompt_studio_v2/migrations/{0003_alter_toolstudioprompt_enforce_type.py => 0006_alter_toolstudioprompt_enforce_type.py} (74%) diff --git a/backend/prompt_studio/prompt_studio_output_manager_v2/output_manager_helper.py b/backend/prompt_studio/prompt_studio_output_manager_v2/output_manager_helper.py index 4e6a25daa..f6ec9bf9d 100644 --- a/backend/prompt_studio/prompt_studio_output_manager_v2/output_manager_helper.py +++ b/backend/prompt_studio/prompt_studio_output_manager_v2/output_manager_helper.py @@ -148,7 +148,7 @@ def update_or_create_prompt_output( output = outputs.get(prompt.prompt_key) # TODO: use enums here - if prompt.enforce_type in {"json", "table", "record"}: + if prompt.enforce_type in {"json", "table", "record", "line-item"}: output = json.dumps(output) profile_manager = default_profile eval_metrics = outputs.get(f"{prompt.prompt_key}__evaluation", []) diff --git a/backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py b/backend/prompt_studio/prompt_studio_v2/migrations/0006_alter_toolstudioprompt_enforce_type.py similarity index 74% rename from backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py rename to backend/prompt_studio/prompt_studio_v2/migrations/0006_alter_toolstudioprompt_enforce_type.py index af359ec9a..8dc5a79d4 100644 --- a/backend/prompt_studio/prompt_studio_v2/migrations/0003_alter_toolstudioprompt_enforce_type.py +++ b/backend/prompt_studio/prompt_studio_v2/migrations/0006_alter_toolstudioprompt_enforce_type.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.1 on 2024-12-10 04:13 +# Generated by Django 4.2.1 on 2025-01-09 21:09 from django.db import migrations, models @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ("prompt_studio_v2", "0002_alter_toolstudioprompt_enforce_type"), + ("prompt_studio_v2", "0005_alter_toolstudioprompt_required"), ] operations = [ @@ -27,7 +27,10 @@ class Migration(migrations.Migration): "record", "Response sent for records. Entries of records are list of logical and organized individual entities with distint values", ), - ("line-item", "Response sent as line-item"), + ( + "line-item", + "Response sent as line-item which is large a JSON output. If extraction stopped due to token limitation, we try to continue extraction from where it stopped", + ), ], db_comment="Field to store the type in which the response to be returned.", default="Text", diff --git a/backend/prompt_studio/prompt_studio_v2/models.py b/backend/prompt_studio/prompt_studio_v2/models.py index 315659d2f..68f14d39f 100644 --- a/backend/prompt_studio/prompt_studio_v2/models.py +++ b/backend/prompt_studio/prompt_studio_v2/models.py @@ -27,7 +27,12 @@ class EnforceType(models.TextChoices): "logical and organized individual " "entities with distint values" ) - LINE_ITEM = "line-item", ("Response sent as line-item") + LINE_ITEM = "line-item", ( + "Response sent as line-item " + "which is large a JSON output. " + "If extraction stopped due to token limitation, " + "we try to continue extraction from where it stopped" + ) class PromptType(models.TextChoices): PROMPT = "PROMPT", "Response sent as Text" diff --git a/prompt-service/src/unstract/prompt_service/helper.py b/prompt-service/src/unstract/prompt_service/helper.py index c27887c74..bbb7f999d 100644 --- a/prompt-service/src/unstract/prompt_service/helper.py +++ b/prompt-service/src/unstract/prompt_service/helper.py @@ -408,7 +408,14 @@ def extract_line_item( structured_output: dict[str, Any], llm: LLM, file_path: str, + metadata: Optional[dict[str, str]], ) -> dict[str, Any]: + line_item_extraction_plugin: dict[str, Any] = plugins.get( + "line-item-extraction", {} + ) + if not line_item_extraction_plugin: + raise APIError(PAID_FEATURE_MSG) + # Adjust file path to read from the extract folder base_name = os.path.splitext(os.path.basename(file_path))[ 0 @@ -434,11 +441,7 @@ def extract_line_item( context=context, platform_postamble="", ) - line_item_extraction_plugin: dict[str, Any] = plugins.get( - "line-item-extraction", {} - ) - if not line_item_extraction_plugin: - raise APIError(PAID_FEATURE_MSG) + try: line_item_extraction = line_item_extraction_plugin["entrypoint_cls"]( llm=llm, @@ -450,6 +453,7 @@ def extract_line_item( ) answer = line_item_extraction.run() structured_output[output[PSKeys.NAME]] = answer + metadata[PSKeys.CONTEXT][output[PSKeys.NAME]] = get_cleaned_context(context) return structured_output except line_item_extraction_plugin["exception_cls"] as e: msg = f"Couldn't extract table. {e}" diff --git a/prompt-service/src/unstract/prompt_service/main.py b/prompt-service/src/unstract/prompt_service/main.py index 5ab01c57b..5fb5d5146 100644 --- a/prompt-service/src/unstract/prompt_service/main.py +++ b/prompt-service/src/unstract/prompt_service/main.py @@ -274,13 +274,16 @@ def prompt_processor() -> Any: structured_output=structured_output, llm=llm, file_path=file_path, + metadata=metadata, ) metadata = query_usage_metadata(token=platform_key, metadata=metadata) + # TODO: Handle metrics for line-item extraction response = { PSKeys.METADATA: metadata, PSKeys.OUTPUT: structured_output, + PSKeys.METRICS: metrics, } - return response + continue except APIError as e: app.logger.error( "Failed to extract line-item for the prompt %s: %s", From e5e9d1e115bb18fa93fcfbd2fdefffc5d6d563e7 Mon Sep 17 00:00:00 2001 From: harini-venkataraman <115449948+harini-venkataraman@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:29:27 +0530 Subject: [PATCH 5/6] [FEAT] FS APIs for Line item extractor (#1060) * FS APIs for Line item extractor * Optimizing if-else branch --- .../src/unstract/prompt_service/helper.py | 32 +++++++++++++++---- .../src/unstract/prompt_service/main.py | 1 + 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/helper.py b/prompt-service/src/unstract/prompt_service/helper.py index bbb7f999d..08d18d907 100644 --- a/prompt-service/src/unstract/prompt_service/helper.py +++ b/prompt-service/src/unstract/prompt_service/helper.py @@ -409,6 +409,7 @@ def extract_line_item( llm: LLM, file_path: str, metadata: Optional[dict[str, str]], + execution_source: str, ) -> dict[str, Any]: line_item_extraction_plugin: dict[str, Any] = plugins.get( "line-item-extraction", {} @@ -425,13 +426,32 @@ def extract_line_item( ) # Read file content into context - if not os.path.exists(extract_file_path): - raise FileNotFoundError( - f"The file at path '{extract_file_path}' does not exist." - ) + if check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE): + fs_instance: FileStorage = FileStorage(FileStorageProvider.LOCAL) + if execution_source == ExecutionSource.IDE.value: + fs_instance = EnvHelper.get_storage( + storage_type=StorageType.PERMANENT, + env_name=FileStorageKeys.PERMANENT_REMOTE_STORAGE, + ) + if execution_source == ExecutionSource.TOOL.value: + fs_instance = EnvHelper.get_storage( + storage_type=StorageType.TEMPORARY, + env_name=FileStorageKeys.TEMPORARY_REMOTE_STORAGE, + ) + + if not fs_instance.exists(extract_file_path): + raise FileNotFoundError( + f"The file at path '{extract_file_path}' does not exist." + ) + context = fs_instance.read(path=extract_file_path, encoding="utf-8", mode="rb") + else: + if not os.path.exists(extract_file_path): + raise FileNotFoundError( + f"The file at path '{extract_file_path}' does not exist." + ) - with open(extract_file_path, encoding="utf-8") as file: - context = file.read() + with open(extract_file_path, encoding="utf-8") as file: + context = file.read() prompt = construct_prompt( preamble=tool_settings.get(PSKeys.PREAMBLE, ""), diff --git a/prompt-service/src/unstract/prompt_service/main.py b/prompt-service/src/unstract/prompt_service/main.py index 5fb5d5146..a66d2778f 100644 --- a/prompt-service/src/unstract/prompt_service/main.py +++ b/prompt-service/src/unstract/prompt_service/main.py @@ -275,6 +275,7 @@ def prompt_processor() -> Any: llm=llm, file_path=file_path, metadata=metadata, + execution_source=execution_source, ) metadata = query_usage_metadata(token=platform_key, metadata=metadata) # TODO: Handle metrics for line-item extraction From e8ce466a2a0366b9c84631f9e6bab410d2b2c4ca Mon Sep 17 00:00:00 2001 From: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> Date: Fri, 10 Jan 2025 14:25:09 +0530 Subject: [PATCH 6/6] Minor fix Signed-off-by: Deepak <89829542+Deepak-Kesavan@users.noreply.github.com> --- .../src/unstract/prompt_service/helper.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/helper.py b/prompt-service/src/unstract/prompt_service/helper.py index 08d18d907..3797fcb52 100644 --- a/prompt-service/src/unstract/prompt_service/helper.py +++ b/prompt-service/src/unstract/prompt_service/helper.py @@ -417,13 +417,13 @@ def extract_line_item( if not line_item_extraction_plugin: raise APIError(PAID_FEATURE_MSG) - # Adjust file path to read from the extract folder - base_name = os.path.splitext(os.path.basename(file_path))[ - 0 - ] # Get the base name without extension - extract_file_path = os.path.join( - os.path.dirname(file_path), "extract", f"{base_name}.txt" - ) + extract_file_path = file_path + if execution_source == ExecutionSource.IDE.value: + # Adjust file path to read from the extract folder + base_name = os.path.splitext(os.path.basename(file_path))[0] + extract_file_path = os.path.join( + os.path.dirname(file_path), "extract", f"{base_name}.txt" + ) # Read file content into context if check_feature_flag_status(FeatureFlag.REMOTE_FILE_STORAGE): @@ -473,7 +473,7 @@ def extract_line_item( ) answer = line_item_extraction.run() structured_output[output[PSKeys.NAME]] = answer - metadata[PSKeys.CONTEXT][output[PSKeys.NAME]] = get_cleaned_context(context) + metadata[PSKeys.CONTEXT][output[PSKeys.NAME]] = [context] return structured_output except line_item_extraction_plugin["exception_cls"] as e: msg = f"Couldn't extract table. {e}"