Merge pull request #877 from roboflow/workflow-embeddings
CLIP Embeddings and Cosine Similarity Workflow Blocks
PawelPeczek-Roboflow authored Dec 13, 2024
2 parents d24d6f6 + 173c71c commit ac4787d
Showing 10 changed files with 861 additions and 6 deletions.
2 changes: 1 addition & 1 deletion inference/core/version.py
@@ -1,4 +1,4 @@
__version__ = "0.30.0"
__version__ = "0.31.0"


if __name__ == "__main__":
8 changes: 8 additions & 0 deletions inference/core/workflows/core_steps/loader.py
@@ -141,9 +141,15 @@
from inference.core.workflows.core_steps.fusion.dimension_collapse.v1 import (
DimensionCollapseBlockV1,
)
from inference.core.workflows.core_steps.math.cosine_similarity.v1 import (
CosineSimilarityBlockV1,
)
from inference.core.workflows.core_steps.models.foundation.anthropic_claude.v1 import (
AnthropicClaudeBlockV1,
)
from inference.core.workflows.core_steps.models.foundation.clip.v1 import (
ClipModelBlockV1,
)
from inference.core.workflows.core_steps.models.foundation.clip_comparison.v1 import (
ClipComparisonBlockV1,
)
@@ -479,6 +485,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
DimensionCollapseBlockV1,
FirstNonEmptyOrDefaultBlockV1,
AnthropicClaudeBlockV1,
CosineSimilarityBlockV1,
BackgroundColorVisualizationBlockV1,
BarcodeDetectorBlockV1,
BlurVisualizationBlockV1,
@@ -489,6 +496,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
CircleVisualizationBlockV1,
ClipComparisonBlockV1,
ClipComparisonBlockV2,
ClipModelBlockV1,
CogVLMBlockV1,
ColorVisualizationBlockV1,
ConvertGrayscaleBlockV1,
Empty file.
75 changes: 75 additions & 0 deletions inference/core/workflows/core_steps/math/cosine_similarity/v1.py
@@ -0,0 +1,75 @@
from typing import List, Literal, Optional, Type

from pydantic import ConfigDict, Field

from inference.core.utils.postprocess import cosine_similarity
from inference.core.workflows.execution_engine.entities.base import OutputDefinition
from inference.core.workflows.execution_engine.entities.types import (
EMBEDDING_KIND,
FLOAT_KIND,
Selector,
)
from inference.core.workflows.prototypes.block import (
BlockResult,
WorkflowBlock,
WorkflowBlockManifest,
)

LONG_DESCRIPTION = """
Calculate the cosine similarity between two embeddings.
A cosine similarity of 1 means the two embeddings are identical,
while a cosine similarity of 0 means the two embeddings are orthogonal.
Greater values indicate greater similarity.
"""


class BlockManifest(WorkflowBlockManifest):
model_config = ConfigDict(
json_schema_extra={
"name": "Cosine Similarity",
"version": "v1",
"short_description": "Calculate the cosine similarity between two embeddings.",
"long_description": LONG_DESCRIPTION,
"license": "MIT",
"block_type": "math",
"ui_manifest": {
"section": "advanced",
"icon": "far fa-calculator-simple",
"blockPriority": 3,
},
}
)
type: Literal["roboflow_core/cosine_similarity@v1"]
name: str = Field(description="Unique name of step in workflows")
embedding_1: Selector(kind=[EMBEDDING_KIND]) = Field(
description="Embedding 1",
examples=["$steps.clip_image.embedding"],
)
embedding_2: Selector(kind=[EMBEDDING_KIND]) = Field(
description="Embedding 2",
examples=["$steps.clip_text.embedding"],
)

@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [OutputDefinition(name="similarity", kind=[FLOAT_KIND])]

@classmethod
def get_execution_engine_compatibility(cls) -> Optional[str]:
return ">=1.3.0,<2.0.0"


class CosineSimilarityBlockV1(WorkflowBlock):
@classmethod
def get_manifest(cls) -> Type[WorkflowBlockManifest]:
return BlockManifest

def run(self, embedding_1: List[float], embedding_2: List[float]) -> BlockResult:
if len(embedding_1) != len(embedding_2):
raise RuntimeError(
f"roboflow_core/cosine_similarity@v1 block feed with different shape of embeddings. "
f"`embedding_1`: (N, {len(embedding_1)}), `embedding_2`: (N, {len(embedding_2)})"
)
similarity = cosine_similarity(embedding_1, embedding_2)
return {"similarity": similarity}
Empty file.
199 changes: 199 additions & 0 deletions inference/core/workflows/core_steps/models/foundation/clip/v1.py
@@ -0,0 +1,199 @@
from functools import partial
from typing import List, Literal, Optional, Type, Union

from pydantic import ConfigDict, Field

from inference.core.entities.requests.clip import (
ClipImageEmbeddingRequest,
ClipTextEmbeddingRequest,
)
from inference.core.env import (
HOSTED_CORE_MODEL_URL,
LOCAL_INFERENCE_API_URL,
WORKFLOWS_REMOTE_API_TARGET,
WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS,
)
from inference.core.managers.base import ModelManager
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
from inference.core.workflows.core_steps.common.utils import (
load_core_model,
run_in_parallel,
)
from inference.core.workflows.execution_engine.entities.base import (
Batch,
OutputDefinition,
WorkflowImageData,
)
from inference.core.workflows.execution_engine.entities.types import (
EMBEDDING_KIND,
IMAGE_KIND,
STRING_KIND,
Selector,
)
from inference.core.workflows.prototypes.block import (
BlockResult,
WorkflowBlock,
WorkflowBlockManifest,
)
from inference_sdk import InferenceHTTPClient

LONG_DESCRIPTION = """
Use a CLIP model to create semantic embeddings of text and images.
This block accepts an image or string and returns an embedding.
The embedding can be used to compare the similarity between different
images or between images and text.
"""


class BlockManifest(WorkflowBlockManifest):
model_config = ConfigDict(
json_schema_extra={
"name": "CLIP Embedding Model",
"version": "v1",
"short_description": "Generate an embedding of an image or string.",
"long_description": LONG_DESCRIPTION,
"license": "MIT",
"block_type": "model",
"ui_manifest": {
"section": "model",
"icon": "far fa-paperclip",
"blockPriority": 2,
},
}
)
type: Literal["roboflow_core/clip@v1"]
name: str = Field(description="Unique name of step in workflows")
data: Union[Selector(kind=[IMAGE_KIND, STRING_KIND]), str] = Field(
title="Data",
description="The string or image to generate an embedding for.",
examples=["$inputs.image", "$steps.cropping.crops"],
)

version: Union[
Literal[
"RN101",
"RN50",
"RN50x16",
"RN50x4",
"RN50x64",
"ViT-B-16",
"ViT-B-32",
"ViT-L-14-336px",
"ViT-L-14",
],
Selector(kind=[STRING_KIND]),
] = Field(
default="ViT-B-32",
description="Variant of CLIP model",
examples=["ViT-B-16", "$inputs.variant"],
)

@classmethod
def describe_outputs(cls) -> List[OutputDefinition]:
return [OutputDefinition(name="embedding", kind=[EMBEDDING_KIND])]

@classmethod
def get_execution_engine_compatibility(cls) -> Optional[str]:
return ">=1.3.0,<2.0.0"


class ClipModelBlockV1(WorkflowBlock):

def __init__(
self,
model_manager: ModelManager,
api_key: Optional[str],
step_execution_mode: StepExecutionMode,
):
self._model_manager = model_manager
self._api_key = api_key
self._step_execution_mode = step_execution_mode

@classmethod
def get_init_parameters(cls) -> List[str]:
return ["model_manager", "api_key", "step_execution_mode"]

@classmethod
def get_manifest(cls) -> Type[WorkflowBlockManifest]:
return BlockManifest

def run(
self,
data: Union[WorkflowImageData, str],
version: str,
) -> BlockResult:
if self._step_execution_mode is StepExecutionMode.LOCAL:
return self.run_locally(data=data, version=version)
elif self._step_execution_mode is StepExecutionMode.REMOTE:
return self.run_remotely(data=data, version=version)
else:
raise ValueError(
f"Unknown step execution mode: {self._step_execution_mode}"
)

def run_locally(
self,
data: Union[WorkflowImageData, str],
version: str,
) -> BlockResult:
if isinstance(data, str):
inference_request = ClipTextEmbeddingRequest(
clip_version_id=version,
text=[data],
api_key=self._api_key,
)
clip_model_id = load_core_model(
model_manager=self._model_manager,
inference_request=inference_request,
core_model="clip",
)
predictions = self._model_manager.infer_from_request_sync(
clip_model_id, inference_request
)
return {"embedding": predictions.embeddings[0]}
else:
inference_request = ClipImageEmbeddingRequest(
clip_version_id=version,
image=[data.to_inference_format(numpy_preferred=True)],
api_key=self._api_key,
)
clip_model_id = load_core_model(
model_manager=self._model_manager,
inference_request=inference_request,
core_model="clip",
)
predictions = self._model_manager.infer_from_request_sync(
clip_model_id, inference_request
)
return {"embedding": predictions.embeddings[0]}

def run_remotely(
self,
data: Union[WorkflowImageData, str],
version: str,
) -> BlockResult:
api_url = (
LOCAL_INFERENCE_API_URL
if WORKFLOWS_REMOTE_API_TARGET != "hosted"
else HOSTED_CORE_MODEL_URL
)
client = InferenceHTTPClient(
api_url=api_url,
api_key=self._api_key,
)
if WORKFLOWS_REMOTE_API_TARGET == "hosted":
client.select_api_v0()

if isinstance(data, str):
result = client.get_clip_text_embeddings(
text=data,
clip_version=version,
)
else:
result = client.get_clip_image_embeddings(
inference_input=data.base64_image,
clip_version=version,
)

return {"embedding": result["embeddings"][0]}
16 changes: 16 additions & 0 deletions inference/core/workflows/execution_engine/entities/types.py
@@ -210,6 +210,22 @@ def __hash__(self) -> int:
internal_data_type="List[Any]",
)

EMBEDDING_KIND_DOCS = """
This kind represents a vector embedding. It is a list of floating point numbers.
Embeddings are used in various machine learning tasks like clustering, classification,
and similarity search. They are used to represent data in a continuous, low-dimensional space.
Typically, vectors that are close to each other in the embedding space are considered similar.
"""
EMBEDDING_KIND = Kind(
name="embedding",
description="A list of floating point numbers representing a vector embedding.",
docs=EMBEDDING_KIND_DOCS,
serialised_data_type="List[float]",
internal_data_type="List[float]",
)

RGB_COLOR_KIND_DOCS = """
This kind represents RGB color as a tuple (R, G, B).
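
Because the new kind serializes as a plain `List[float]`, embeddings produced by these blocks pass through workflow outputs as ordinary JSON arrays. An assumed, simplified example of what a response containing both new kinds might look like:

```python
# Assumed shape only: a simplified response carrying an EMBEDDING_KIND value
# and the FLOAT_KIND similarity derived from it.
import json

response = {
    "embedding": [0.12, -0.07, 0.91],  # EMBEDDING_KIND -> List[float]
    "similarity": 0.83,                # FLOAT_KIND
}
print(json.dumps(response))
```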