From de8918fb4a7a4b65a3dcfd5d2e064212b23dc060 Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Wed, 11 Dec 2024 15:39:44 -0600 Subject: [PATCH 01/10] Initial CLIP Block --- inference/core/workflows/core_steps/loader.py | 4 + .../models/foundation/clip/__init__.py | 0 .../core_steps/models/foundation/clip/v1.py | 227 ++++++++++++++++++ .../execution_engine/entities/types.py | 16 ++ 4 files changed, 247 insertions(+) create mode 100644 inference/core/workflows/core_steps/models/foundation/clip/__init__.py create mode 100644 inference/core/workflows/core_steps/models/foundation/clip/v1.py diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py index d72daa36b..a6297eecf 100644 --- a/inference/core/workflows/core_steps/loader.py +++ b/inference/core/workflows/core_steps/loader.py @@ -141,6 +141,9 @@ from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import ( AntropicClaudeBlockV1, ) +from inference.core.workflows.core_steps.models.foundation.clip.v1 import ( + ClipModelBlockV1, +) from inference.core.workflows.core_steps.models.foundation.clip_comparison.v1 import ( ClipComparisonBlockV1, ) @@ -485,6 +488,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]: CircleVisualizationBlockV1, ClipComparisonBlockV1, ClipComparisonBlockV2, + ClipModelBlockV1, CogVLMBlockV1, ColorVisualizationBlockV1, ConvertGrayscaleBlockV1, diff --git a/inference/core/workflows/core_steps/models/foundation/clip/__init__.py b/inference/core/workflows/core_steps/models/foundation/clip/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py new file mode 100644 index 000000000..c3f817171 --- /dev/null +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -0,0 +1,227 @@ +from functools import partial +from typing import List, Literal, Optional, Type, Union + +import numpy as np +from pydantic import ConfigDict, Field + +from inference.core.entities.requests.clip import ClipImageEmbeddingRequest, ClipTextEmbeddingRequest +from inference.core.env import ( + HOSTED_CORE_MODEL_URL, + LOCAL_INFERENCE_API_URL, + WORKFLOWS_REMOTE_API_TARGET, + WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS, +) +from inference.core.managers.base import ModelManager +from inference.core.workflows.core_steps.common.entities import StepExecutionMode +from inference.core.workflows.core_steps.common.utils import ( + load_core_model, + run_in_parallel, +) +from inference.core.workflows.execution_engine.constants import ( + PARENT_ID_KEY, + ROOT_PARENT_ID_KEY, +) +from inference.core.workflows.execution_engine.entities.base import ( + Batch, + OutputDefinition, + WorkflowImageData, +) +from inference.core.workflows.execution_engine.entities.types import ( + EMBEDDING_KIND, + IMAGE_KIND, + STRING_KIND, + ImageInputField, + Selector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) +from inference_sdk import InferenceHTTPClient + +LONG_DESCRIPTION = """ +Use a CLIP model to create semantic embeddings of text and images. + +This block accepts an image or string and returns an embedding. +The embedding can be used to compare the similarity between different +images or between images and text. 
+""" + + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "CLIP Embedding Model", + "version": "v1", + "short_description": "Generate an embedding of an image or string.", + "long_description": LONG_DESCRIPTION, + "license": "MIT", + "block_type": "model", + } + ) + type: Literal["roboflow_core/clip@v1"] + name: str = Field(description="Unique name of step in workflows") + data: Selector(kind=[IMAGE_KIND, STRING_KIND]) = Field( + title="Data", + description="The string or image to generate an embedding for.", + examples=["$inputs.image", "$steps.cropping.crops"] + ) + + version: Union[ + Literal[ + "RN101", + "RN50", + "RN50x16", + "RN50x4", + "RN50x64", + "ViT-B-16", + "ViT-B-32", + "ViT-L-14-336px", + "ViT-L-14", + ], + Selector(kind=[STRING_KIND]), + ] = Field( + default="ViT-B-32", + description="Variant of CLIP model", + examples=["ViT-B-16", "$inputs.variant"], + ) + + @classmethod + def get_parameters_accepting_batches(cls) -> List[str]: + return ["data"] + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [ + OutputDefinition(name="embedding", kind=[EMBEDDING_KIND]) + ] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.3.0,<2.0.0" + + +class ClipModelBlockV1(WorkflowBlock): + + def __init__( + self, + model_manager: ModelManager, + api_key: Optional[str], + step_execution_mode: StepExecutionMode, + ): + self._model_manager = model_manager + self._api_key = api_key + self._step_execution_mode = step_execution_mode + + @classmethod + def get_init_parameters(cls) -> List[str]: + return ["model_manager", "api_key", "step_execution_mode"] + + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + def run( + self, + data: Batch[Union[WorkflowImageData, str]], + version: str, + ) -> BlockResult: + if self._step_execution_mode is StepExecutionMode.LOCAL: + return self.run_locally(data=data, version=version) + elif self._step_execution_mode is StepExecutionMode.REMOTE: + return self.run_remotely(data=data, version=version) + else: + raise ValueError( + f"Unknown step execution mode: {self._step_execution_mode}" + ) + + def run_locally( + self, + data: Batch[Union[WorkflowImageData, str]], + version: str, + ) -> BlockResult: + predictions = [] + if len(data) == 0: + return [] + + if isinstance(data[0], str): + inference_request = ClipTextEmbeddingRequest( + clip_version_id=version, + text=data, + api_key=self._api_key, + ) + + clip_model_id = load_core_model( + model_manager=self._model_manager, + inference_request=inference_request, + core_model="clip", + ) + + predictions = self._model_manager.infer_from_request_sync( + clip_model_id, inference_request + ) + + return [ {"embedding": p} for p in predictions.embeddings ] + else: + inference_request = ClipImageEmbeddingRequest( + clip_version_id=version, + image=[single_image.to_inference_format(numpy_preferred=True) for single_image in data], + api_key=self._api_key, + ) + + clip_model_id = load_core_model( + model_manager=self._model_manager, + inference_request=inference_request, + core_model="clip", + ) + + predictions = self._model_manager.infer_from_request_sync( + clip_model_id, inference_request + ) + + return [ {"embedding": p} for p in predictions.embeddings ] + + def run_remotely( + self, + data: Batch[Union[WorkflowImageData, str]], + version: str, + ) -> BlockResult: + api_url = ( + LOCAL_INFERENCE_API_URL + if WORKFLOWS_REMOTE_API_TARGET != "hosted" + else 
HOSTED_CORE_MODEL_URL + ) + client = InferenceHTTPClient( + api_url=api_url, + api_key=self._api_key, + ) + if WORKFLOWS_REMOTE_API_TARGET == "hosted": + client.select_api_v0() + + tasks = [] + for d in data: + if isinstance(d, str): + tasks.append( + partial( + client.get_clip_text_embeddings, + text=d, + clip_version=version, + ) + ) + else: + tasks.append( + partial( + client.get_clip_image_embeddings, + image=d.base64_image, + clip_version=version, + ) + ) + + predictions = run_in_parallel( + tasks=tasks, + max_workers=WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS, + ) + + return [ {"embedding": p} for p in predictions ] + \ No newline at end of file diff --git a/inference/core/workflows/execution_engine/entities/types.py b/inference/core/workflows/execution_engine/entities/types.py index d70948857..c51e99147 100644 --- a/inference/core/workflows/execution_engine/entities/types.py +++ b/inference/core/workflows/execution_engine/entities/types.py @@ -210,6 +210,22 @@ def __hash__(self) -> int: internal_data_type="List[Any]", ) +EMBEDDING_KIND_DOCS = """ +This kind represents a vector embedding. It is a list of floating point numbers. + +Embeddings are used in various machine learning tasks like clustering, classification, +and similarity search. They are used to represent data in a continuous, low-dimensional space. + +Typically, vectors that are close to each other in the embedding space are considered similar. +""" +EMBEDDING_KIND = Kind( + name="embedding", + description="A list of floating point numbers representing a vector embedding.", + docs=EMBEDDING_KIND_DOCS, + serialised_data_type="List[float]", + internal_data_type="List[float]", +) + RGB_COLOR_KIND_DOCS = """ This kind represents RGB color as a tuple (R, G, B). From a47c8c082d337ddbec8e3421ce582f3ff8c10027 Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Wed, 11 Dec 2024 15:56:00 -0600 Subject: [PATCH 02/10] Make it work for non-batch oriented inputs also --- .../core_steps/models/foundation/clip/v1.py | 18 ++++++++++++++---- .../v1/compiler/graph_constructor.py | 15 --------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py index c3f817171..d3d5810de 100644 --- a/inference/core/workflows/core_steps/models/foundation/clip/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -62,7 +62,7 @@ class BlockManifest(WorkflowBlockManifest): ) type: Literal["roboflow_core/clip@v1"] name: str = Field(description="Unique name of step in workflows") - data: Selector(kind=[IMAGE_KIND, STRING_KIND]) = Field( + data: Union[Selector(kind=[IMAGE_KIND, STRING_KIND]), str] = Field( title="Data", description="The string or image to generate an embedding for.", examples=["$inputs.image", "$steps.cropping.crops"] @@ -141,10 +141,14 @@ def run_locally( data: Batch[Union[WorkflowImageData, str]], version: str, ) -> BlockResult: - predictions = [] - if len(data) == 0: - return [] + # if not array, wrap + convert_to_singleton = False + if not isinstance(data, Batch): + data = [data] + convert_to_singleton = True + print("CONVERT TO SINGLETON", convert_to_singleton, type(data), data) + if isinstance(data[0], str): inference_request = ClipTextEmbeddingRequest( clip_version_id=version, @@ -161,6 +165,9 @@ def run_locally( predictions = self._model_manager.infer_from_request_sync( clip_model_id, inference_request ) + + if convert_to_singleton: + return {"embedding": 
predictions.embeddings[0]} return [ {"embedding": p} for p in predictions.embeddings ] else: @@ -180,6 +187,9 @@ def run_locally( clip_model_id, inference_request ) + if convert_to_singleton: + return {"embedding": predictions.embeddings[0]} + return [ {"embedding": p} for p in predictions.embeddings ] def run_remotely( diff --git a/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py b/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py index f1a253b41..fcded3073 100644 --- a/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py +++ b/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py @@ -710,21 +710,6 @@ def denote_data_flow_for_step( f"step inputs are filled with outputs of batch-oriented steps or batch-oriented inputs.", context="workflow_compilation | execution_graph_construction", ) - if ( - step_accepts_batch_input - and batch_input_expected == {True} - and False in actual_input_is_batch - ): - raise ExecutionGraphStructureError( - public_message=f"Detected invalid reference plugged " - f"into property `{property_name}` of step `{node}` - the step " - f"property strictly requires batch-oriented inputs, yet the input selector " - f"holds non-batch oriented input - this indicates the " - f"problem with construction of your Workflow - usually the problem occurs when " - f"non-batch oriented step inputs are filled with outputs of non batch-oriented " - f"steps or non batch-oriented inputs.", - context="workflow_compilation | execution_graph_construction", - ) if not parameters_with_batch_inputs: data_lineage = [] else: From 9cf28712bf18445412b3a064bd089415961a20e2 Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Thu, 12 Dec 2024 10:40:46 -0600 Subject: [PATCH 03/10] Make style --- .../core_steps/models/foundation/clip/v1.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py index d3d5810de..6d68cd07a 100644 --- a/inference/core/workflows/core_steps/models/foundation/clip/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -4,7 +4,10 @@ import numpy as np from pydantic import ConfigDict, Field -from inference.core.entities.requests.clip import ClipImageEmbeddingRequest, ClipTextEmbeddingRequest +from inference.core.entities.requests.clip import ( + ClipImageEmbeddingRequest, + ClipTextEmbeddingRequest, +) from inference.core.env import ( HOSTED_CORE_MODEL_URL, LOCAL_INFERENCE_API_URL, @@ -65,9 +68,9 @@ class BlockManifest(WorkflowBlockManifest): data: Union[Selector(kind=[IMAGE_KIND, STRING_KIND]), str] = Field( title="Data", description="The string or image to generate an embedding for.", - examples=["$inputs.image", "$steps.cropping.crops"] + examples=["$inputs.image", "$steps.cropping.crops"], ) - + version: Union[ Literal[ "RN101", @@ -93,9 +96,7 @@ def get_parameters_accepting_batches(cls) -> List[str]: @classmethod def describe_outputs(cls) -> List[OutputDefinition]: - return [ - OutputDefinition(name="embedding", kind=[EMBEDDING_KIND]) - ] + return [OutputDefinition(name="embedding", kind=[EMBEDDING_KIND])] @classmethod def get_execution_engine_compatibility(cls) -> Optional[str]: @@ -146,7 +147,7 @@ def run_locally( if not isinstance(data, Batch): data = [data] convert_to_singleton = True - + print("CONVERT TO SINGLETON", convert_to_singleton, type(data), data) if isinstance(data[0], str): @@ 
-155,25 +156,28 @@ def run_locally( text=data, api_key=self._api_key, ) - + clip_model_id = load_core_model( model_manager=self._model_manager, inference_request=inference_request, core_model="clip", ) - + predictions = self._model_manager.infer_from_request_sync( clip_model_id, inference_request ) if convert_to_singleton: return {"embedding": predictions.embeddings[0]} - - return [ {"embedding": p} for p in predictions.embeddings ] + + return [{"embedding": p} for p in predictions.embeddings] else: inference_request = ClipImageEmbeddingRequest( clip_version_id=version, - image=[single_image.to_inference_format(numpy_preferred=True) for single_image in data], + image=[ + single_image.to_inference_format(numpy_preferred=True) + for single_image in data + ], api_key=self._api_key, ) @@ -190,7 +194,7 @@ def run_locally( if convert_to_singleton: return {"embedding": predictions.embeddings[0]} - return [ {"embedding": p} for p in predictions.embeddings ] + return [{"embedding": p} for p in predictions.embeddings] def run_remotely( self, @@ -227,11 +231,10 @@ def run_remotely( clip_version=version, ) ) - + predictions = run_in_parallel( tasks=tasks, max_workers=WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS, ) - return [ {"embedding": p} for p in predictions ] - \ No newline at end of file + return [{"embedding": p} for p in predictions] From ace6afc5f6d7ab78b7e7c567eeb14bf24faaacc1 Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Thu, 12 Dec 2024 10:55:20 -0600 Subject: [PATCH 04/10] Update tests --- ...st_workflow_with_arbitrary_batch_inputs.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_arbitrary_batch_inputs.py b/tests/workflows/integration_tests/execution/test_workflow_with_arbitrary_batch_inputs.py index 51802efef..ddb7c113e 100644 --- a/tests/workflows/integration_tests/execution/test_workflow_with_arbitrary_batch_inputs.py +++ b/tests/workflows/integration_tests/execution/test_workflow_with_arbitrary_batch_inputs.py @@ -794,13 +794,12 @@ def test_workflow_when_non_batch_oriented_step_feeds_batch_oriented_step_operati "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, } - # when - with pytest.raises(ExecutionGraphStructureError): - _ = ExecutionEngine.init( - workflow_definition=WORKFLOW_WITH_NON_BATCH_ORIENTED_STEP_FEEDING_BATCH_ORIENTED_STEP_OPERATING_BATCH_WISE, - init_parameters=workflow_init_parameters, - max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, - ) + # should not throw an exception + _ = ExecutionEngine.init( + workflow_definition=WORKFLOW_WITH_NON_BATCH_ORIENTED_STEP_FEEDING_BATCH_ORIENTED_STEP_OPERATING_BATCH_WISE, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) WORKFLOW_WITH_NON_BATCH_ORIENTED_STEP_FEEDING_MIXED_INPUT_STEP = { @@ -1327,13 +1326,12 @@ def test_workflow_when_non_batch_oriented_step_feeds_compound_strictly_batch_ori "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, } - # then - with pytest.raises(ExecutionGraphStructureError): - _ = ExecutionEngine.init( - workflow_definition=WORKFLOW_WITH_NON_BATCH_ORIENTED_STEP_FEEDING_COMPOUND_STRICTLY_BATCH_ORIENTED_STEP, - init_parameters=workflow_init_parameters, - max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, - ) + # should not throw an exception + _ = ExecutionEngine.init( + workflow_definition=WORKFLOW_WITH_NON_BATCH_ORIENTED_STEP_FEEDING_COMPOUND_STRICTLY_BATCH_ORIENTED_STEP, + 
init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) WORKFLOW_WITH_BATCH_ORIENTED_STEP_FEEDING_COMPOUND_NON_BATCH_ORIENTED_STEP = { @@ -1741,13 +1739,12 @@ def test_workflow_when_non_batch_oriented_input_feeds_compound_strictly_batch_or "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, } - # then - with pytest.raises(ExecutionGraphStructureError): - _ = ExecutionEngine.init( - workflow_definition=WORKFLOW_WITH_NON_BATCH_ORIENTED_INPUT_FEEDING_COMPOUND_STRICTLY_BATCH_ORIENTED_STEP, - init_parameters=workflow_init_parameters, - max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, - ) + # should not throw an exception + _ = ExecutionEngine.init( + workflow_definition=WORKFLOW_WITH_NON_BATCH_ORIENTED_INPUT_FEEDING_COMPOUND_STRICTLY_BATCH_ORIENTED_STEP, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) WORKFLOW_WITH_BATCH_ORIENTED_INPUT_FEEDING_COMPOUND_NON_BATCH_ORIENTED_STEP = { From a044fa82123e59f86a22082598f78cab4236713c Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Thu, 12 Dec 2024 14:52:11 -0600 Subject: [PATCH 05/10] Add cosine similarity and unit tests --- inference/core/workflows/core_steps/loader.py | 4 + .../math/cosine_similarity/__init__.py | 0 .../core_steps/math/cosine_similarity/v1.py | 71 +++++++++ .../core_steps/models/foundation/clip/v1.py | 6 - .../core_steps/math/test_cosine_similarity.py | 79 ++++++++++ .../core_steps/models/foundation/test_clip.py | 143 ++++++++++++++++++ 6 files changed, 297 insertions(+), 6 deletions(-) create mode 100644 inference/core/workflows/core_steps/math/cosine_similarity/__init__.py create mode 100644 inference/core/workflows/core_steps/math/cosine_similarity/v1.py create mode 100644 tests/workflows/unit_tests/core_steps/math/test_cosine_similarity.py create mode 100644 tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py index a6297eecf..80d578057 100644 --- a/inference/core/workflows/core_steps/loader.py +++ b/inference/core/workflows/core_steps/loader.py @@ -138,6 +138,9 @@ from inference.core.workflows.core_steps.fusion.dimension_collapse.v1 import ( DimensionCollapseBlockV1, ) +from inference.core.workflows.core_steps.math.cosine_similarity.v1 import ( + CosineSimilarityBlockV1, +) from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import ( AntropicClaudeBlockV1, ) @@ -477,6 +480,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]: PropertyDefinitionBlockV1, DimensionCollapseBlockV1, FirstNonEmptyOrDefaultBlockV1, + CosineSimilarityBlockV1, AntropicClaudeBlockV1, BackgroundColorVisualizationBlockV1, BarcodeDetectorBlockV1, diff --git a/inference/core/workflows/core_steps/math/cosine_similarity/__init__.py b/inference/core/workflows/core_steps/math/cosine_similarity/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/inference/core/workflows/core_steps/math/cosine_similarity/v1.py b/inference/core/workflows/core_steps/math/cosine_similarity/v1.py new file mode 100644 index 000000000..331301f9f --- /dev/null +++ b/inference/core/workflows/core_steps/math/cosine_similarity/v1.py @@ -0,0 +1,71 @@ +from typing import List, Literal, Optional, Type + +from pydantic import ConfigDict, Field + +from inference.core.workflows.execution_engine.entities.base import ( + OutputDefinition, +) +from inference.core.workflows.execution_engine.entities.types import ( + 
EMBEDDING_KIND, + FLOAT_KIND, + Selector, +) +from inference.core.workflows.prototypes.block import ( + BlockResult, + WorkflowBlock, + WorkflowBlockManifest, +) +from inference.core.utils.postprocess import cosine_similarity + +LONG_DESCRIPTION = """ +Calculate the cosine similarity between two embeddings. + +A cosine similarity of 1 means the two embeddings are identical, +while a cosine similarity of 0 means the two embeddings are orthogonal. +Greater values indicate greater similarity. +""" + + +class BlockManifest(WorkflowBlockManifest): + model_config = ConfigDict( + json_schema_extra={ + "name": "Cosine Similarity", + "version": "v1", + "short_description": "Calculate the cosine similarity between two embeddings.", + "long_description": LONG_DESCRIPTION, + "license": "MIT", + "block_type": "model", + } + ) + type: Literal["roboflow_core/cosine_similarity@v1"] + name: str = Field(description="Unique name of step in workflows") + embedding_1: Selector(kind=[EMBEDDING_KIND]) = Field( + description="Embedding 1", + examples=["$steps.clip_image.embedding"], + ) + embedding_2: Selector(kind=[EMBEDDING_KIND]) = Field( + description="Embedding 2", + examples=["$steps.clip_text.embedding"], + ) + + @classmethod + def describe_outputs(cls) -> List[OutputDefinition]: + return [OutputDefinition(name="similarity", kind=[FLOAT_KIND])] + + @classmethod + def get_execution_engine_compatibility(cls) -> Optional[str]: + return ">=1.3.0,<2.0.0" + + +class CosineSimilarityBlockV1(WorkflowBlock): + @classmethod + def get_manifest(cls) -> Type[WorkflowBlockManifest]: + return BlockManifest + + def run( + self, + embedding_1: List[float], + embedding_2: List[float] + ) -> BlockResult: + similarity = cosine_similarity(embedding_1, embedding_2) + return { "similarity": similarity } \ No newline at end of file diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py index 6d68cd07a..cc4b33c9e 100644 --- a/inference/core/workflows/core_steps/models/foundation/clip/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -1,7 +1,6 @@ from functools import partial from typing import List, Literal, Optional, Type, Union -import numpy as np from pydantic import ConfigDict, Field from inference.core.entities.requests.clip import ( @@ -20,10 +19,6 @@ load_core_model, run_in_parallel, ) -from inference.core.workflows.execution_engine.constants import ( - PARENT_ID_KEY, - ROOT_PARENT_ID_KEY, -) from inference.core.workflows.execution_engine.entities.base import ( Batch, OutputDefinition, @@ -33,7 +28,6 @@ EMBEDDING_KIND, IMAGE_KIND, STRING_KIND, - ImageInputField, Selector, ) from inference.core.workflows.prototypes.block import ( diff --git a/tests/workflows/unit_tests/core_steps/math/test_cosine_similarity.py b/tests/workflows/unit_tests/core_steps/math/test_cosine_similarity.py new file mode 100644 index 000000000..8df6059d2 --- /dev/null +++ b/tests/workflows/unit_tests/core_steps/math/test_cosine_similarity.py @@ -0,0 +1,79 @@ +import pytest +from pydantic import ValidationError + +from inference.core.workflows.core_steps.math.cosine_similarity.v1 import BlockManifest, CosineSimilarityBlockV1 + + +def test_manifest_parsing_when_data_is_valid(): + # Given + data = { + "type": "roboflow_core/cosine_similarity@v1", + "name": "cosine_step", + "embedding_1": "$steps.clip_image.embedding", + "embedding_2": "$steps.clip_text.embedding", + } + + # When + result = BlockManifest.model_validate(data) + + # Then + 
assert result.type == "roboflow_core/cosine_similarity@v1" + assert result.name == "cosine_step" + assert result.embedding_1 == "$steps.clip_image.embedding" + assert result.embedding_2 == "$steps.clip_text.embedding" + + +def test_manifest_parsing_when_data_is_invalid(): + # Given invalid data (not a valid embedding selector) + data = { + "type": "roboflow_core/cosine_similarity@v1", + "name": "cosine_step", + "embedding_1": "invalid_data", + "embedding_2": "invalid_data", + } + + # When / Then + with pytest.raises(ValidationError): + BlockManifest.model_validate(data) + + +def test_cosine_similarity_block_run_identical_embeddings(): + # Given identical embeddings + block = CosineSimilarityBlockV1() + embedding_1 = [0.1, 0.3, 0.5] + embedding_2 = [0.1, 0.3, 0.5] + + # When + result = block.run(embedding_1=embedding_1, embedding_2=embedding_2) + + # Then + # Cosine similarity should be close to 1.0 for identical vectors + assert pytest.approx(result["similarity"], 0.0001) == 1.0 + + +def test_cosine_similarity_block_run_orthogonal_embeddings(): + # Given orthogonal embeddings + block = CosineSimilarityBlockV1() + embedding_1 = [1.0, 0.0, 0.0] + embedding_2 = [0.0, 1.0, 0.0] + + # When + result = block.run(embedding_1=embedding_1, embedding_2=embedding_2) + + # Then + # Cosine similarity should be close to 0.0 for orthogonal vectors + assert pytest.approx(result["similarity"], 0.0001) == 0.0 + + +def test_cosine_similarity_block_run_negative_correlation(): + # Given inversely correlated embeddings + block = CosineSimilarityBlockV1() + embedding_1 = [1.0, 1.0, 1.0] + embedding_2 = [-1.0, -1.0, -1.0] + + # When + result = block.run(embedding_1=embedding_1, embedding_2=embedding_2) + + # Then + # Cosine similarity should be close to -1.0 for perfectly negatively correlated vectors + assert pytest.approx(result["similarity"], 0.0001) == -1.0 diff --git a/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py b/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py new file mode 100644 index 000000000..d6f4e9ba0 --- /dev/null +++ b/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py @@ -0,0 +1,143 @@ +import pytest +from unittest.mock import MagicMock, patch +from pydantic import ValidationError + +import numpy as np + +from inference.core.workflows.core_steps.models.foundation.clip.v1 import ( + BlockManifest, + ClipModelBlockV1 +) +from inference.core.workflows.core_steps.common.entities import StepExecutionMode +from inference.core.workflows.execution_engine.entities.base import ( + ImageParentMetadata, + WorkflowImageData +) + + +@pytest.fixture +def mock_model_manager(): + # Mock a model manager that returns a predictable embedding + mock = MagicMock() + mock.infer_from_request_sync.return_value = MagicMock( + embeddings=[[0.1, 0.2, 0.3]] # Sample embedding + ) + return mock + + +@pytest.fixture +def mock_workflow_image_data(): + # Create a mock WorkflowImageData instance + start_image = np.random.randint(0, 255, (1000, 1000, 3), dtype=np.uint8) + return WorkflowImageData( + parent_metadata=ImageParentMetadata(parent_id="some"), + numpy_image=start_image, + ) + + +def test_manifest_parsing_valid(): + data = { + "type": "roboflow_core/clip@v1", + "name": "my_clip_step", + "data": "$inputs.image", + "version": "RN50", + } + + result = BlockManifest.model_validate(data) + assert result.type == "roboflow_core/clip@v1" + assert result.name == "my_clip_step" + assert result.data == "$inputs.image" + assert result.version == "RN50" + + +def 
test_manifest_parsing_invalid_missing_type(): + data = { + "name": "my_clip_step", + "data": "$inputs.image", + "version": "RN50", + } + with pytest.raises(ValidationError): + BlockManifest.model_validate(data) + + +def test_manifest_parsing_invalid_data_type(): + data = { + "type": "roboflow_core/clip@v1", + "name": "my_clip_step", + "data": 123, # invalid type + "version": "RN50", + } + with pytest.raises(ValidationError): + BlockManifest.model_validate(data) + + +def test_run_locally_with_text(mock_model_manager): + block = ClipModelBlockV1( + model_manager=mock_model_manager, + api_key=None, + step_execution_mode=StepExecutionMode.LOCAL, + ) + + # Run with text input + result = block.run(data="Hello world", version="RN50") + + assert isinstance(result, dict) + assert len(result["embedding"]) == 3 + assert result["embedding"] == [0.1, 0.2, 0.3] + mock_model_manager.infer_from_request_sync.assert_called_once() + + +def test_run_locally_with_image(mock_model_manager, mock_workflow_image_data): + block = ClipModelBlockV1( + model_manager=mock_model_manager, + api_key=None, + step_execution_mode=StepExecutionMode.LOCAL, + ) + + result = block.run(data=mock_workflow_image_data, version="RN50") + + assert isinstance(result, dict) + assert len(result["embedding"]) == 3 + assert result["embedding"] == [0.1, 0.2, 0.3] + mock_model_manager.infer_from_request_sync.assert_called_once() + + +@patch("inference.core.workflows.core_steps.models.foundation.clip.v1.InferenceHTTPClient") +def test_run_remotely_with_text(mock_client_cls, mock_model_manager): + # Mock the remote client and its return value + mock_client = MagicMock() + mock_client.get_clip_text_embeddings.return_value = [0.1, 0.2, 0.3] + mock_client_cls.return_value = mock_client + + block = ClipModelBlockV1( + model_manager=mock_model_manager, + api_key=None, + step_execution_mode=StepExecutionMode.REMOTE, + ) + + result = block.run(data=["Hello world"], version="RN50") + + assert isinstance(result, list) + assert len(result) == 1 + assert result[0]["embedding"] == [0.1, 0.2, 0.3] + mock_client.get_clip_text_embeddings.assert_called_once() + + +@patch("inference.core.workflows.core_steps.models.foundation.clip.v1.InferenceHTTPClient") +def test_run_remotely_with_image(mock_client_cls, mock_model_manager, mock_workflow_image_data): + mock_client = MagicMock() + mock_client.get_clip_image_embeddings.return_value = [0.1, 0.2, 0.3] + mock_client_cls.return_value = mock_client + + block = ClipModelBlockV1( + model_manager=mock_model_manager, + api_key=None, + step_execution_mode=StepExecutionMode.REMOTE, + ) + + result = block.run(data=[mock_workflow_image_data], version="RN50") + + assert isinstance(result, list) + assert len(result) == 1 + assert result[0]["embedding"] == [0.1, 0.2, 0.3] + mock_client.get_clip_image_embeddings.assert_called_once() From b7b5d1cbf517cd3baebc7ff22814a8f9b93882ce Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Thu, 12 Dec 2024 15:19:24 -0600 Subject: [PATCH 06/10] Add integration tests --- .../execution/test_workflow_with_clip.py | 165 +++++++++++++++++- 1 file changed, 161 insertions(+), 4 deletions(-) diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_clip.py b/tests/workflows/integration_tests/execution/test_workflow_with_clip.py index 944f50d0c..af306cce9 100644 --- a/tests/workflows/integration_tests/execution/test_workflow_with_clip.py +++ b/tests/workflows/integration_tests/execution/test_workflow_with_clip.py @@ -11,6 +11,163 @@ ) CLIP_WORKFLOW = { + "version": "1.0", + "inputs": 
[ + {"type": "InferenceImage", "name": "image_1"}, + {"type": "InferenceImage", "name": "image_2"}, + ], + "steps": [ + { + "type": "roboflow_core/clip@v1", + "name": "embedding_1", + "data": "$inputs.image_1", + "version": "RN50", + }, + { + "type": "roboflow_core/clip@v1", + "name": "embedding_2", + "data": "$inputs.image_2", + "version": "RN50", + }, + { + "type": "roboflow_core/cosine_similarity@v1", + "name": "cosine_similarity", + "embedding_1": "$steps.embedding_1.embedding", + "embedding_2": "$steps.embedding_2.embedding", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "similarity", + "coordinates_system": "own", + "selector": "$steps.cosine_similarity.similarity", + }, + { + "type": "JsonField", + "name": "image_embeddings", + "coordinates_system": "own", + "selector": "$steps.embedding_1.embedding", + }, + ], +} + + +@add_to_workflows_gallery( + category="Basic Workflows", + use_case_title="Workflow with Embeddings", + use_case_description=""" +This Workflow shows how to use an embedding model to compare the +similarity of two images with each other. + """, + workflow_definition=CLIP_WORKFLOW, + workflow_name_in_app="clip", +) +def test_clip_embedding_model( + model_manager: ModelManager, + license_plate_image: np.ndarray, + crowd_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.api_key": None, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLIP_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={"image_1": license_plate_image, "image_2": crowd_image} + ) + + # then + assert isinstance(result, list), "Expected list to be delivered" + assert len(result) == 1, "Expected 1 element in the output" + assert set(result[0].keys()) == { + "similarity", + "image_embeddings", + }, "Expected all declared outputs to be delivered" + assert ( + pytest.approx(result[0]["similarity"], 0.01) == 0.444 + ), "Expected similarity to be approximately the defined value" + assert ( + len(result[0]["image_embeddings"]) == 1024 + ), "Expected image embedding to be of dimension 1024 for RN50 model" + +CLIP_TEXT_WORKFLOW = { + "version": "1.0", + "inputs": [ + {"type": "WorkflowParameter", "name": "prompt"}, + ], + "steps": [ + { + "type": "roboflow_core/clip@v1", + "name": "embedding", + "data": "$inputs.prompt", + "version": "RN50", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "text_embeddings", + "coordinates_system": "own", + "selector": "$steps.embedding.embedding", + }, + ], +} + + +def test_clip_text_embedding_model( + model_manager: ModelManager, + license_plate_image: np.ndarray, + crowd_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.api_key": None, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLIP_TEXT_WORKFLOW, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={"prompt": "Foo Bar"} + ) + + # then + assert isinstance(result, list), "Expected list to be delivered" + assert len(result) == 1, "Expected 1 element in the output" + assert set(result[0].keys()) == { + 
"text_embeddings", + }, "Expected all declared outputs to be delivered" + assert ( + len(result[0]["text_embeddings"]) == 1024 + ), "Expected text embedding to be of dimension 1024 for RN50 model" + assert ( + pytest.approx(np.mean(result[0]["text_embeddings"]), 0.0001) == -0.016772 + ), "Expected embedding to have a value similar to during testing" + assert ( + pytest.approx(np.max(result[0]["text_embeddings"]), 0.0001) == 1.65736556 + ), "Expected embedding to have a value similar to during testing" + assert ( + pytest.approx(np.min(result[0]["text_embeddings"]), 0.0001) == -10.109556 + ), "Expected embedding to have a value similar to during testing" + assert ( + pytest.approx(np.std(result[0]["text_embeddings"]), 0.0001) == 0.39733439 + ), "Expected embedding to have a value similar to during testing" + +CLIP_COMPARISON_WORKFLOW = { "version": "1.0", "inputs": [ {"type": "WorkflowImage", "name": "image"}, @@ -36,16 +193,16 @@ @add_to_workflows_gallery( category="Basic Workflows", - use_case_title="Workflow with CLIP model", + use_case_title="Workflow with CLIP Comparison", use_case_description=""" -This is the basic workflow that only contains a single CLIP model block. +This is the basic workflow that only contains a single CLIP Comparison block. Please take a look at how batch-oriented WorkflowImage data is plugged to detection step via input selector (`$inputs.image`) and how non-batch parameters (reference set of texts that the each image in batch will be compared to) is dynamically specified - via `$inputs.reference` selector. """, - workflow_definition=CLIP_WORKFLOW, + workflow_definition=CLIP_COMPARISON_WORKFLOW, workflow_name_in_app="clip", ) def test_clip_workflow_when_minimal_valid_input_provided( @@ -60,7 +217,7 @@ def test_clip_workflow_when_minimal_valid_input_provided( "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, } execution_engine = ExecutionEngine.init( - workflow_definition=CLIP_WORKFLOW, + workflow_definition=CLIP_COMPARISON_WORKFLOW, init_parameters=workflow_init_parameters, max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, ) From 09716608257fa44c45e811868b2b8c3f36c0507f Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Thu, 12 Dec 2024 15:32:05 -0600 Subject: [PATCH 07/10] Fix style --- .../core_steps/math/cosine_similarity/v1.py | 21 +++++++++---------- .../core_steps/models/foundation/clip/v1.py | 5 +++++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/inference/core/workflows/core_steps/math/cosine_similarity/v1.py b/inference/core/workflows/core_steps/math/cosine_similarity/v1.py index 331301f9f..b0c4e032d 100644 --- a/inference/core/workflows/core_steps/math/cosine_similarity/v1.py +++ b/inference/core/workflows/core_steps/math/cosine_similarity/v1.py @@ -2,9 +2,8 @@ from pydantic import ConfigDict, Field -from inference.core.workflows.execution_engine.entities.base import ( - OutputDefinition, -) +from inference.core.utils.postprocess import cosine_similarity +from inference.core.workflows.execution_engine.entities.base import OutputDefinition from inference.core.workflows.execution_engine.entities.types import ( EMBEDDING_KIND, FLOAT_KIND, @@ -15,7 +14,6 @@ WorkflowBlock, WorkflowBlockManifest, ) -from inference.core.utils.postprocess import cosine_similarity LONG_DESCRIPTION = """ Calculate the cosine similarity between two embeddings. 
@@ -34,7 +32,12 @@ class BlockManifest(WorkflowBlockManifest): "short_description": "Calculate the cosine similarity between two embeddings.", "long_description": LONG_DESCRIPTION, "license": "MIT", - "block_type": "model", + "block_type": "math", + "ui_manifest": { + "section": "advanced", + "icon": "far fa-calculator-simple", + "blockPriority": 3, + }, } ) type: Literal["roboflow_core/cosine_similarity@v1"] @@ -62,10 +65,6 @@ class CosineSimilarityBlockV1(WorkflowBlock): def get_manifest(cls) -> Type[WorkflowBlockManifest]: return BlockManifest - def run( - self, - embedding_1: List[float], - embedding_2: List[float] - ) -> BlockResult: + def run(self, embedding_1: List[float], embedding_2: List[float]) -> BlockResult: similarity = cosine_similarity(embedding_1, embedding_2) - return { "similarity": similarity } \ No newline at end of file + return {"similarity": similarity} diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py index cc4b33c9e..0896c1962 100644 --- a/inference/core/workflows/core_steps/models/foundation/clip/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -55,6 +55,11 @@ class BlockManifest(WorkflowBlockManifest): "long_description": LONG_DESCRIPTION, "license": "MIT", "block_type": "model", + "ui_manifest": { + "section": "model", + "icon": "far fa-paperclip", + "blockPriority": 2, + }, } ) type: Literal["roboflow_core/clip@v1"] From a893049e67a6739f553d32fbc20ebed26d70fae2 Mon Sep 17 00:00:00 2001 From: Brad Dwyer Date: Thu, 12 Dec 2024 15:33:10 -0600 Subject: [PATCH 08/10] Remove stray print --- .../core/workflows/core_steps/models/foundation/clip/v1.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py index 0896c1962..615ef5471 100644 --- a/inference/core/workflows/core_steps/models/foundation/clip/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -147,8 +147,6 @@ def run_locally( data = [data] convert_to_singleton = True - print("CONVERT TO SINGLETON", convert_to_singleton, type(data), data) - if isinstance(data[0], str): inference_request = ClipTextEmbeddingRequest( clip_version_id=version, From 1b43e9ad08129f549d5d9d746ae19cccad9a8694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Fri, 13 Dec 2024 17:24:56 +0100 Subject: [PATCH 09/10] Make linters happy --- .../v1/compiler/graph_constructor.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py b/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py index bc61d1d94..f1a253b41 100644 --- a/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py +++ b/inference/core/workflows/execution_engine/v1/compiler/graph_constructor.py @@ -711,18 +711,18 @@ def denote_data_flow_for_step( context="workflow_compilation | execution_graph_construction", ) if ( - step_accepts_batch_input - and batch_input_expected == {True} - and False in actual_input_is_batch + step_accepts_batch_input + and batch_input_expected == {True} + and False in actual_input_is_batch ): raise ExecutionGraphStructureError( public_message=f"Detected invalid reference plugged " - f"into property `{property_name}` of step `{node}` - the step " - f"property strictly requires batch-oriented inputs, yet the 
input selector " - f"holds non-batch oriented input - this indicates the " - f"problem with construction of your Workflow - usually the problem occurs when " - f"non-batch oriented step inputs are filled with outputs of non batch-oriented " - f"steps or non batch-oriented inputs.", + f"into property `{property_name}` of step `{node}` - the step " + f"property strictly requires batch-oriented inputs, yet the input selector " + f"holds non-batch oriented input - this indicates the " + f"problem with construction of your Workflow - usually the problem occurs when " + f"non-batch oriented step inputs are filled with outputs of non batch-oriented " + f"steps or non batch-oriented inputs.", context="workflow_compilation | execution_graph_construction", ) if not parameters_with_batch_inputs: From 55d3a95f2f63b95ab3e849b4ed09cbf9bbb9e7bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Fri, 13 Dec 2024 18:02:04 +0100 Subject: [PATCH 10/10] Fix issues and add tests --- inference/core/version.py | 2 +- .../core_steps/math/cosine_similarity/v1.py | 5 + .../core_steps/models/foundation/clip/v1.py | 76 ++------ .../execution/test_workflow_with_clip.py | 171 +++++++++++++++++- .../core_steps/models/foundation/test_clip.py | 20 +- 5 files changed, 205 insertions(+), 69 deletions(-) diff --git a/inference/core/version.py b/inference/core/version.py index 00d2732ad..4a5f53a40 100644 --- a/inference/core/version.py +++ b/inference/core/version.py @@ -1,4 +1,4 @@ -__version__ = "0.30.0" +__version__ = "0.31.0" if __name__ == "__main__": diff --git a/inference/core/workflows/core_steps/math/cosine_similarity/v1.py b/inference/core/workflows/core_steps/math/cosine_similarity/v1.py index b0c4e032d..fcf5892b3 100644 --- a/inference/core/workflows/core_steps/math/cosine_similarity/v1.py +++ b/inference/core/workflows/core_steps/math/cosine_similarity/v1.py @@ -66,5 +66,10 @@ def get_manifest(cls) -> Type[WorkflowBlockManifest]: return BlockManifest def run(self, embedding_1: List[float], embedding_2: List[float]) -> BlockResult: + if len(embedding_1) != len(embedding_2): + raise RuntimeError( + f"roboflow_core/cosine_similarity@v1 block feed with different shape of embeddings. 
" + f"`embedding_1`: (N, {len(embedding_1)}), `embedding_2`: (N, {len(embedding_2)})" + ) similarity = cosine_similarity(embedding_1, embedding_2) return {"similarity": similarity} diff --git a/inference/core/workflows/core_steps/models/foundation/clip/v1.py b/inference/core/workflows/core_steps/models/foundation/clip/v1.py index 615ef5471..9e92b1ce5 100644 --- a/inference/core/workflows/core_steps/models/foundation/clip/v1.py +++ b/inference/core/workflows/core_steps/models/foundation/clip/v1.py @@ -89,10 +89,6 @@ class BlockManifest(WorkflowBlockManifest): examples=["ViT-B-16", "$inputs.variant"], ) - @classmethod - def get_parameters_accepting_batches(cls) -> List[str]: - return ["data"] - @classmethod def describe_outputs(cls) -> List[OutputDefinition]: return [OutputDefinition(name="embedding", kind=[EMBEDDING_KIND])] @@ -124,7 +120,7 @@ def get_manifest(cls) -> Type[WorkflowBlockManifest]: def run( self, - data: Batch[Union[WorkflowImageData, str]], + data: Union[WorkflowImageData, str], version: str, ) -> BlockResult: if self._step_execution_mode is StepExecutionMode.LOCAL: @@ -138,64 +134,43 @@ def run( def run_locally( self, - data: Batch[Union[WorkflowImageData, str]], + data: Union[WorkflowImageData, str], version: str, ) -> BlockResult: - # if not array, wrap - convert_to_singleton = False - if not isinstance(data, Batch): - data = [data] - convert_to_singleton = True - - if isinstance(data[0], str): + if isinstance(data, str): inference_request = ClipTextEmbeddingRequest( clip_version_id=version, - text=data, + text=[data], api_key=self._api_key, ) - clip_model_id = load_core_model( model_manager=self._model_manager, inference_request=inference_request, core_model="clip", ) - predictions = self._model_manager.infer_from_request_sync( clip_model_id, inference_request ) - - if convert_to_singleton: - return {"embedding": predictions.embeddings[0]} - - return [{"embedding": p} for p in predictions.embeddings] + return {"embedding": predictions.embeddings[0]} else: inference_request = ClipImageEmbeddingRequest( clip_version_id=version, - image=[ - single_image.to_inference_format(numpy_preferred=True) - for single_image in data - ], + image=[data.to_inference_format(numpy_preferred=True)], api_key=self._api_key, ) - clip_model_id = load_core_model( model_manager=self._model_manager, inference_request=inference_request, core_model="clip", ) - predictions = self._model_manager.infer_from_request_sync( clip_model_id, inference_request ) - - if convert_to_singleton: - return {"embedding": predictions.embeddings[0]} - - return [{"embedding": p} for p in predictions.embeddings] + return {"embedding": predictions.embeddings[0]} def run_remotely( self, - data: Batch[Union[WorkflowImageData, str]], + data: Union[WorkflowImageData, str], version: str, ) -> BlockResult: api_url = ( @@ -210,28 +185,15 @@ def run_remotely( if WORKFLOWS_REMOTE_API_TARGET == "hosted": client.select_api_v0() - tasks = [] - for d in data: - if isinstance(d, str): - tasks.append( - partial( - client.get_clip_text_embeddings, - text=d, - clip_version=version, - ) - ) - else: - tasks.append( - partial( - client.get_clip_image_embeddings, - image=d.base64_image, - clip_version=version, - ) - ) - - predictions = run_in_parallel( - tasks=tasks, - max_workers=WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS, - ) + if isinstance(data, str): + result = client.get_clip_text_embeddings( + text=data, + clip_version=version, + ) + else: + result = client.get_clip_image_embeddings( + inference_input=data.base64_image, + 
clip_version=version, + ) - return [{"embedding": p} for p in predictions] + return {"embedding": result["embeddings"][0]} diff --git a/tests/workflows/integration_tests/execution/test_workflow_with_clip.py b/tests/workflows/integration_tests/execution/test_workflow_with_clip.py index f7055203d..0ffd9b150 100644 --- a/tests/workflows/integration_tests/execution/test_workflow_with_clip.py +++ b/tests/workflows/integration_tests/execution/test_workflow_with_clip.py @@ -4,7 +4,7 @@ from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS from inference.core.managers.base import ModelManager from inference.core.workflows.core_steps.common.entities import StepExecutionMode -from inference.core.workflows.errors import RuntimeInputError +from inference.core.workflows.errors import RuntimeInputError, StepExecutionError from inference.core.workflows.execution_engine.core import ExecutionEngine from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import ( add_to_workflows_gallery, @@ -100,6 +100,175 @@ def test_clip_embedding_model( ), "Expected image embedding to be of dimension 1024 for RN50 model" +CLIP_WORKFLOW_COSINE_SIMILARITY_CROSS_DATA_TYPE = { + "version": "1.0", + "inputs": [ + {"type": "InferenceImage", "name": "image_1"}, + {"type": "WorkflowParameter", "name": "reference"}, + ], + "steps": [ + { + "type": "roboflow_core/clip@v1", + "name": "embedding_1", + "data": "$inputs.image_1", + "version": "RN50", + }, + { + "type": "roboflow_core/clip@v1", + "name": "embedding_2", + "data": "$inputs.reference", + "version": "RN50", + }, + { + "type": "roboflow_core/cosine_similarity@v1", + "name": "cosine_similarity", + "embedding_1": "$steps.embedding_1.embedding", + "embedding_2": "$steps.embedding_2.embedding", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "similarity", + "coordinates_system": "own", + "selector": "$steps.cosine_similarity.similarity", + }, + { + "type": "JsonField", + "name": "image_embeddings", + "coordinates_system": "own", + "selector": "$steps.embedding_1.embedding", + }, + ], +} + + +def test_clip_embedding_model_on_batches_of_cross_type_data( + model_manager: ModelManager, + license_plate_image: np.ndarray, + crowd_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.api_key": None, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLIP_WORKFLOW_COSINE_SIMILARITY_CROSS_DATA_TYPE, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + result = execution_engine.run( + runtime_parameters={ + "image_1": [license_plate_image, crowd_image], + "reference": "people", + } + ) + + # then + assert isinstance(result, list), "Expected list to be delivered" + assert len(result) == 2, "Expected 2 elements in the output" + assert set(result[0].keys()) == { + "similarity", + "image_embeddings", + }, "Expected all declared outputs to be delivered" + assert set(result[1].keys()) == { + "similarity", + "image_embeddings", + }, "Expected all declared outputs to be delivered" + assert ( + abs(result[0]["similarity"] - 0.13) < 0.02 + ), "Expected similarity to be approximately the defined value" + assert ( + len(result[0]["image_embeddings"]) == 1024 + ), "Expected image embedding to be of dimension 1024 for RN50 model" + assert ( + abs(result[1]["similarity"] - 0.15) < 0.02 + ), "Expected similarity to be 
approximately the defined value" + assert ( + len(result[1]["image_embeddings"]) == 1024 + ), "Expected image embedding to be of dimension 1024 for RN50 model" + + +CLIP_WORKFLOW_COSINE_SIMILARITY_CROSS_DATA_TYPE_WITH_INVALID_LENGTH_OF_EMBEDDINGS = { + "version": "1.0", + "inputs": [ + {"type": "InferenceImage", "name": "image_1"}, + {"type": "WorkflowParameter", "name": "reference"}, + ], + "steps": [ + { + "type": "roboflow_core/clip@v1", + "name": "embedding_1", + "data": "$inputs.image_1", + "version": "RN50", + }, + { + "type": "roboflow_core/clip@v1", + "name": "embedding_2", + "data": "$inputs.reference", + "version": "RN50x4", + }, + { + "type": "roboflow_core/cosine_similarity@v1", + "name": "cosine_similarity", + "embedding_1": "$steps.embedding_1.embedding", + "embedding_2": "$steps.embedding_2.embedding", + }, + ], + "outputs": [ + { + "type": "JsonField", + "name": "similarity", + "coordinates_system": "own", + "selector": "$steps.cosine_similarity.similarity", + }, + { + "type": "JsonField", + "name": "image_embeddings", + "coordinates_system": "own", + "selector": "$steps.embedding_1.embedding", + }, + ], +} + + +def test_clip_embedding_model_on_batches_of_cross_type_data_with_different_embeddings_length( + model_manager: ModelManager, + license_plate_image: np.ndarray, + crowd_image: np.ndarray, +) -> None: + # given + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.api_key": None, + "workflows_core.step_execution_mode": StepExecutionMode.LOCAL, + } + execution_engine = ExecutionEngine.init( + workflow_definition=CLIP_WORKFLOW_COSINE_SIMILARITY_CROSS_DATA_TYPE_WITH_INVALID_LENGTH_OF_EMBEDDINGS, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + ) + + # when + with pytest.raises(StepExecutionError) as error: + _ = execution_engine.run( + runtime_parameters={ + "image_1": [license_plate_image, crowd_image], + "reference": "people", + } + ) + + # then + assert ( + "roboflow_core/cosine_similarity@v1 block feed with different shape of embeddings" + in str(error.value) + ) + + CLIP_TEXT_WORKFLOW = { "version": "1.0", "inputs": [ diff --git a/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py b/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py index 873b27b28..6ee576a7f 100644 --- a/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py +++ b/tests/workflows/unit_tests/core_steps/models/foundation/test_clip.py @@ -108,7 +108,9 @@ def test_run_locally_with_image(mock_model_manager, mock_workflow_image_data): def test_run_remotely_with_text(mock_client_cls, mock_model_manager): # Mock the remote client and its return value mock_client = MagicMock() - mock_client.get_clip_text_embeddings.return_value = [0.1, 0.2, 0.3] + mock_client.get_clip_text_embeddings.return_value = { + "embeddings": [[0.1, 0.2, 0.3]] + } mock_client_cls.return_value = mock_client block = ClipModelBlockV1( @@ -117,11 +119,9 @@ def test_run_remotely_with_text(mock_client_cls, mock_model_manager): step_execution_mode=StepExecutionMode.REMOTE, ) - result = block.run(data=["Hello world"], version="RN50") + result = block.run(data="Hello world", version="RN50") - assert isinstance(result, list) - assert len(result) == 1 - assert result[0]["embedding"] == [0.1, 0.2, 0.3] + assert result["embedding"] == [0.1, 0.2, 0.3] mock_client.get_clip_text_embeddings.assert_called_once() @@ -132,7 +132,9 @@ def test_run_remotely_with_image( mock_client_cls, mock_model_manager, 
mock_workflow_image_data ): mock_client = MagicMock() - mock_client.get_clip_image_embeddings.return_value = [0.1, 0.2, 0.3] + mock_client.get_clip_image_embeddings.return_value = { + "embeddings": [[0.1, 0.2, 0.3]] + } mock_client_cls.return_value = mock_client block = ClipModelBlockV1( @@ -141,9 +143,7 @@ def test_run_remotely_with_image( step_execution_mode=StepExecutionMode.REMOTE, ) - result = block.run(data=[mock_workflow_image_data], version="RN50") + result = block.run(data=mock_workflow_image_data, version="RN50") - assert isinstance(result, list) - assert len(result) == 1 - assert result[0]["embedding"] == [0.1, 0.2, 0.3] + assert result["embedding"] == [0.1, 0.2, 0.3] mock_client.get_clip_image_embeddings.assert_called_once()
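
Usage note: the sketch below condenses the integration tests added in this series into a single illustrative workflow that embeds an image and a text prompt with the new roboflow_core/clip@v1 block and scores them with roboflow_core/cosine_similarity@v1. It is an illustrative sketch, not part of the patch: it assumes a ModelManager instance named `model_manager` is already constructed (as in the test fixtures), and the placeholder image and prompt are arbitrary.

    import numpy as np

    from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
    from inference.core.workflows.core_steps.common.entities import StepExecutionMode
    from inference.core.workflows.execution_engine.core import ExecutionEngine

    WORKFLOW = {
        "version": "1.0",
        "inputs": [
            {"type": "InferenceImage", "name": "image"},
            {"type": "WorkflowParameter", "name": "prompt"},
        ],
        "steps": [
            # Image branch: CLIP embedding of the input image.
            {
                "type": "roboflow_core/clip@v1",
                "name": "image_embedding",
                "data": "$inputs.image",
                "version": "RN50",
            },
            # Text branch: CLIP embedding of the prompt string.
            {
                "type": "roboflow_core/clip@v1",
                "name": "text_embedding",
                "data": "$inputs.prompt",
                "version": "RN50",
            },
            # Compare the two embeddings.
            {
                "type": "roboflow_core/cosine_similarity@v1",
                "name": "similarity",
                "embedding_1": "$steps.image_embedding.embedding",
                "embedding_2": "$steps.text_embedding.embedding",
            },
        ],
        "outputs": [
            {
                "type": "JsonField",
                "name": "similarity",
                "coordinates_system": "own",
                "selector": "$steps.similarity.similarity",
            },
        ],
    }

    execution_engine = ExecutionEngine.init(
        workflow_definition=WORKFLOW,
        init_parameters={
            # `model_manager` is assumed to exist, as in the test fixtures above.
            "workflows_core.model_manager": model_manager,
            "workflows_core.api_key": None,
            "workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
        },
        max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
    )

    # Placeholder inputs; any HxWx3 uint8 image and any string prompt work here.
    result = execution_engine.run(
        runtime_parameters={
            "image": np.zeros((224, 224, 3), dtype=np.uint8),
            "prompt": "a crowd of people",
        }
    )
    print(result[0]["similarity"])

Because both embeddings come from the same RN50 CLIP variant they have matching dimensionality; mixing variants (for example RN50 with RN50x4) raises the dimension-mismatch error exercised by the integration tests above.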