Commit
Merge pull request #877 from roboflow/workflow-embeddings
CLIP Embeddings and Cosine Similarity Workflow Blocks
Showing 10 changed files with 861 additions and 6 deletions.
@@ -1,4 +1,4 @@
-__version__ = "0.30.0"
+__version__ = "0.31.0"
 
 
 if __name__ == "__main__":
Empty file.
inference/core/workflows/core_steps/math/cosine_similarity/v1.py (75 additions, 0 deletions)
@@ -0,0 +1,75 @@
from typing import List, Literal, Optional, Type

from pydantic import ConfigDict, Field

from inference.core.utils.postprocess import cosine_similarity
from inference.core.workflows.execution_engine.entities.base import OutputDefinition
from inference.core.workflows.execution_engine.entities.types import (
    EMBEDDING_KIND,
    FLOAT_KIND,
    Selector,
)
from inference.core.workflows.prototypes.block import (
    BlockResult,
    WorkflowBlock,
    WorkflowBlockManifest,
)

LONG_DESCRIPTION = """
Calculate the cosine similarity between two embeddings.
A cosine similarity of 1 means the two embeddings are identical,
while a cosine similarity of 0 means the two embeddings are orthogonal.
Greater values indicate greater similarity.
"""


class BlockManifest(WorkflowBlockManifest):
    model_config = ConfigDict(
        json_schema_extra={
            "name": "Cosine Similarity",
            "version": "v1",
            "short_description": "Calculate the cosine similarity between two embeddings.",
            "long_description": LONG_DESCRIPTION,
            "license": "MIT",
            "block_type": "math",
            "ui_manifest": {
                "section": "advanced",
                "icon": "far fa-calculator-simple",
                "blockPriority": 3,
            },
        }
    )
    type: Literal["roboflow_core/cosine_similarity@v1"]
    name: str = Field(description="Unique name of step in workflows")
    embedding_1: Selector(kind=[EMBEDDING_KIND]) = Field(
        description="Embedding 1",
        examples=["$steps.clip_image.embedding"],
    )
    embedding_2: Selector(kind=[EMBEDDING_KIND]) = Field(
        description="Embedding 2",
        examples=["$steps.clip_text.embedding"],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        return [OutputDefinition(name="similarity", kind=[FLOAT_KIND])]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        return ">=1.3.0,<2.0.0"


class CosineSimilarityBlockV1(WorkflowBlock):
    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        return BlockManifest

    def run(self, embedding_1: List[float], embedding_2: List[float]) -> BlockResult:
        # Both embeddings must have the same dimensionality for the dot product
        # inside cosine_similarity to be defined.
        if len(embedding_1) != len(embedding_2):
            raise RuntimeError(
                f"roboflow_core/cosine_similarity@v1 block fed with embeddings of different shapes. "
                f"`embedding_1`: (N, {len(embedding_1)}), `embedding_2`: (N, {len(embedding_2)})"
            )
        similarity = cosine_similarity(embedding_1, embedding_2)
        return {"similarity": similarity}
Empty file.
inference/core/workflows/core_steps/models/foundation/clip/v1.py (199 additions, 0 deletions)
@@ -0,0 +1,199 @@
from functools import partial
from typing import List, Literal, Optional, Type, Union

from pydantic import ConfigDict, Field

from inference.core.entities.requests.clip import (
    ClipImageEmbeddingRequest,
    ClipTextEmbeddingRequest,
)
from inference.core.env import (
    HOSTED_CORE_MODEL_URL,
    LOCAL_INFERENCE_API_URL,
    WORKFLOWS_REMOTE_API_TARGET,
    WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS,
)
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
from inference.core.workflows.core_steps.common.utils import (
    load_core_model,
    run_in_parallel,
)
from inference.core.workflows.execution_engine.entities.base import (
    Batch,
    OutputDefinition,
    WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
    EMBEDDING_KIND,
    IMAGE_KIND,
    STRING_KIND,
    Selector,
)
from inference.core.workflows.prototypes.block import (
    BlockResult,
    WorkflowBlock,
    WorkflowBlockManifest,
)
from inference_sdk import InferenceHTTPClient

LONG_DESCRIPTION = """
Use a CLIP model to create semantic embeddings of text and images.
This block accepts an image or string and returns an embedding.
The embedding can be used to compare the similarity between different
images or between images and text.
"""


class BlockManifest(WorkflowBlockManifest):
    model_config = ConfigDict(
        json_schema_extra={
            "name": "CLIP Embedding Model",
            "version": "v1",
            "short_description": "Generate an embedding of an image or string.",
            "long_description": LONG_DESCRIPTION,
            "license": "MIT",
            "block_type": "model",
            "ui_manifest": {
                "section": "model",
                "icon": "far fa-paperclip",
                "blockPriority": 2,
            },
        }
    )
    type: Literal["roboflow_core/clip@v1"]
    name: str = Field(description="Unique name of step in workflows")
    data: Union[Selector(kind=[IMAGE_KIND, STRING_KIND]), str] = Field(
        title="Data",
        description="The string or image to generate an embedding for.",
        examples=["$inputs.image", "$steps.cropping.crops"],
    )

    version: Union[
        Literal[
            "RN101",
            "RN50",
            "RN50x16",
            "RN50x4",
            "RN50x64",
            "ViT-B-16",
            "ViT-B-32",
            "ViT-L-14-336px",
            "ViT-L-14",
        ],
        Selector(kind=[STRING_KIND]),
    ] = Field(
        default="ViT-B-32",
        description="Variant of CLIP model",
        examples=["ViT-B-16", "$inputs.variant"],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        return [OutputDefinition(name="embedding", kind=[EMBEDDING_KIND])]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        return ">=1.3.0,<2.0.0"


class ClipModelBlockV1(WorkflowBlock):

    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        return BlockManifest

    def run(
        self,
        data: Union[WorkflowImageData, str],
        version: str,
    ) -> BlockResult:
        # Dispatch on execution mode: embed in-process or call an inference API.
        if self._step_execution_mode is StepExecutionMode.LOCAL:
            return self.run_locally(data=data, version=version)
        elif self._step_execution_mode is StepExecutionMode.REMOTE:
            return self.run_remotely(data=data, version=version)
        else:
            raise ValueError(
                f"Unknown step execution mode: {self._step_execution_mode}"
            )

    def run_locally(
        self,
        data: Union[WorkflowImageData, str],
        version: str,
    ) -> BlockResult:
        # Strings go to the text encoder; anything else is treated as an image.
        if isinstance(data, str):
            inference_request = ClipTextEmbeddingRequest(
                clip_version_id=version,
                text=[data],
                api_key=self._api_key,
            )
            clip_model_id = load_core_model(
                model_manager=self._model_manager,
                inference_request=inference_request,
                core_model="clip",
            )
            predictions = self._model_manager.infer_from_request_sync(
                clip_model_id, inference_request
            )
            return {"embedding": predictions.embeddings[0]}
        else:
            inference_request = ClipImageEmbeddingRequest(
                clip_version_id=version,
                image=[data.to_inference_format(numpy_preferred=True)],
                api_key=self._api_key,
            )
            clip_model_id = load_core_model(
                model_manager=self._model_manager,
                inference_request=inference_request,
                core_model="clip",
            )
            predictions = self._model_manager.infer_from_request_sync(
                clip_model_id, inference_request
            )
            return {"embedding": predictions.embeddings[0]}

    def run_remotely(
        self,
        data: Union[WorkflowImageData, str],
        version: str,
    ) -> BlockResult:
        api_url = (
            LOCAL_INFERENCE_API_URL
            if WORKFLOWS_REMOTE_API_TARGET != "hosted"
            else HOSTED_CORE_MODEL_URL
        )
        client = InferenceHTTPClient(
            api_url=api_url,
            api_key=self._api_key,
        )
        if WORKFLOWS_REMOTE_API_TARGET == "hosted":
            client.select_api_v0()

        if isinstance(data, str):
            result = client.get_clip_text_embeddings(
                text=data,
                clip_version=version,
            )
        else:
            result = client.get_clip_image_embeddings(
                inference_input=data.base64_image,
                clip_version=version,
            )

        return {"embedding": result["embeddings"][0]}