[PoC] Improve TRTLLM deployment UX #650

Draft · wants to merge 3 commits into main
2 changes: 1 addition & 1 deletion all_models/inflight_batcher_llm/ensemble/config.pbtxt
@@ -26,7 +26,7 @@

 name: "ensemble"
 platform: "ensemble"
-max_batch_size: ${triton_max_batch_size}
+max_batch_size: 256
 input [
   {
     name: "text_input"
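For context on the change above: the ${triton_max_batch_size}-style placeholders have to be substituted into each config.pbtxt (typically with the repo's template-filling tooling) before Triton can load the model, and hardcoding 256 removes that step for this field. The sketch below only illustrates what such a substitution pass does; it is not the actual tool, and the function name and paths are made up.

from pathlib import Path
from string import Template

def fill_placeholders(config_path, values):
    # Replace ${name} placeholders in a config.pbtxt, writing the file back.
    text = Path(config_path).read_text()
    Path(config_path).write_text(Template(text).safe_substitute(values))

# Hypothetical usage for the ensemble config shown above:
fill_placeholders("all_models/inflight_batcher_llm/ensemble/config.pbtxt",
                  {"triton_max_batch_size": "256"})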
35 changes: 30 additions & 5 deletions all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import json
+import os

 import numpy as np
 import triton_python_backend_utils as pb_utils
@@ -52,11 +53,11 @@ def initialize(self, args):
           * model_name: Model name
         """
         # Parse model configs
-        model_config = json.loads(args['model_config'])
-        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-            'string_value']
+        self.model_config = json.loads(args['model_config'])
+        # Support tokenizer dir from env var for central location
+        tokenizer_dir = self.get_tokenizer_dir()

-        skip_special_tokens = model_config['parameters'].get(
+        skip_special_tokens = self.model_config['parameters'].get(
             'skip_special_tokens')
         if skip_special_tokens is not None:
             skip_special_tokens_str = skip_special_tokens[
@@ -87,12 +88,36 @@ def initialize(self, args):

         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
-            model_config, "OUTPUT")
+            self.model_config, "OUTPUT")

         # Convert Triton types to numpy types
         self.output_dtype = pb_utils.triton_string_to_numpy(
             output_config['data_type'])

+    def get_tokenizer_dir(self):
+        # Manual override of tokenizer. This is to support common case/models
+        # when engine/tokenizer are downloaded on demand at model load time.
+        tokenizer_dir = os.environ.get("TRTLLM_TOKENIZER")
+
+        # If no override, use tokenizer co-located with engine
+        if not tokenizer_dir:
+            tokenizer_dir = os.environ.get("TRTLLM_ENGINE_DIR")
+
+        # If no env var used at all, use tokenizer dir defined in config.pbtxt
+        # This is for backwards compatibility but is the most tedious to set
+        # and keep aligned in each location.
+        if not tokenizer_dir:
+            tokenizer_dir = self.model_config['parameters']['tokenizer_dir'][
+                'string_value']
+
+        # If no method of setting tokenizer worked, fail.
+        if not tokenizer_dir:
+            raise pb_utils.TritonModelException(
+                f"No tokenizer directory set. Please set TRTLLM_ENGINE_DIR env var or 'tokenizer_dir' config field to the directory containing engines and tokenizers."
+            )
+
+        return tokenizer_dir
+
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
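The get_tokenizer_dir() method added above resolves the tokenizer directory in this order: the TRTLLM_TOKENIZER env var, then TRTLLM_ENGINE_DIR, then the tokenizer_dir parameter from config.pbtxt, and it fails if all three are empty. A standalone sketch of that precedence (the directory paths are placeholders, not real defaults):

import os

def resolve_tokenizer_dir(config_value):
    tokenizer_dir = (os.environ.get("TRTLLM_TOKENIZER")
                     or os.environ.get("TRTLLM_ENGINE_DIR")
                     or config_value)
    if not tokenizer_dir:
        raise ValueError("No tokenizer directory set")
    return tokenizer_dir

os.environ.pop("TRTLLM_TOKENIZER", None)
os.environ["TRTLLM_ENGINE_DIR"] = "/models/llama/1/engine"
# The engine-dir env var wins over an empty config value...
assert resolve_tokenizer_dir("") == "/models/llama/1/engine"
# ...and an explicit tokenizer override wins over the engine directory.
os.environ["TRTLLM_TOKENIZER"] = "/tokenizers/llama"
assert resolve_tokenizer_dir("/old/config/path") == "/tokenizers/llama"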
8 changes: 5 additions & 3 deletions all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -26,7 +26,7 @@

 name: "postprocessing"
 backend: "python"
-max_batch_size: ${triton_max_batch_size}
+max_batch_size: 256
 dynamic_batching {}
 input [
   {
@@ -48,13 +48,15 @@ output [
   }
 ]

+# TODO: env var
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: ""
   }
 }

+# TODO: Look up how it's filled today
 parameters {
   key: "skip_special_tokens"
   value: {
@@ -64,7 +66,7 @@ parameters {

 instance_group [
   {
-    count: ${postprocessing_instance_count}
+    count: 8
     kind: KIND_CPU
   }
 ]
49 changes: 36 additions & 13 deletions all_models/inflight_batcher_llm/preprocessing/1/model.py
@@ -54,15 +54,15 @@ def initialize(self, args):
           * model_name: Model name
         """
         # Parse model configs
-        model_config = json.loads(args['model_config'])
-        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-            'string_value']
+        self.model_config = json.loads(args['model_config'])
+        # Support tokenizer dir from env var for central location
+        tokenizer_dir = self.get_tokenizer_dir()

-        add_special_tokens = model_config['parameters'].get(
+        add_special_tokens = self.model_config['parameters'].get(
             'add_special_tokens')
-        visual_model_path = model_config['parameters']['visual_model_path'][
-            'string_value']
-        max_num_images = model_config['parameters'].get('max_num_images')
+        visual_model_path = self.model_config['parameters'][
+            'visual_model_path']['string_value']
+        max_num_images = self.model_config['parameters'].get('max_num_images')

         if max_num_images is not None:
             max_num_images_str = max_num_images['string_value']
@@ -133,7 +133,7 @@ def initialize(self, args):
                 'llava', 'blip2-opt', 'vila', 'mllama'
             ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila and mllama. Got {self.model_type}."

-            llm_model_path = model_config['parameters']['gpt_model_path'][
+            llm_model_path = self.model_config['parameters']['gpt_model_path'][
                 'string_value']
             llm_model_path = os.path.join(llm_model_path, 'config.json')
             with open(llm_model_path, 'r') as f:
@@ -144,7 +144,7 @@ def initialize(self, args):

             self.vision_preprocessor = VisionPreProcessor(
                 self.model_type, AutoProcessor.from_pretrained(tokenizer_dir),
-                model_config)
+                self.model_config)

         # Parse model output configs and convert Triton types to numpy types
         output_names = [
@@ -159,15 +159,39 @@ def initialize(self, args):
                 input_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
                     pb_utils.get_input_config_by_name(
-                        model_config, input_name)['data_type']))
+                        self.model_config, input_name)['data_type']))

         for output_name in output_names:
             setattr(
                 self,
                 output_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
                     pb_utils.get_output_config_by_name(
-                        model_config, output_name)['data_type']))
+                        self.model_config, output_name)['data_type']))

+    def get_tokenizer_dir(self):
+        # Manual override of tokenizer. This is to support common case/models
+        # when engine/tokenizer are downloaded on demand at model load time.
+        tokenizer_dir = os.environ.get("TRTLLM_TOKENIZER")
+
+        # If no override, use tokenizer co-located with engine
+        if not tokenizer_dir:
+            tokenizer_dir = os.environ.get("TRTLLM_ENGINE_DIR")
+
+        # If no env var used at all, use tokenizer dir defined in config.pbtxt
+        # This is for backwards compatibility but is the most tedious to set
+        # and keep aligned in each location.
+        if not tokenizer_dir:
+            tokenizer_dir = self.model_config['parameters']['tokenizer_dir'][
+                'string_value']
+
+        # If no method of setting tokenizer worked, fail.
+        if not tokenizer_dir:
+            raise pb_utils.TritonModelException(
+                f"No tokenizer directory set. Please set TRTLLM_ENGINE_DIR env var or 'tokenizer_dir' config field to the directory containing engines and tokenizers."
+            )
+
+        return tokenizer_dir
+
     def _setup_ptable_shape(self, llm_model_config):
         max_prompt_embedding_table_size = llm_model_config['build_config'][
@@ -662,9 +686,8 @@ def __init__(self,
         import requests
         import torch
         from PIL import Image
-        from torch.utils.dlpack import from_dlpack

-        from tensorrt_llm._utils import str_dtype_to_torch
+        from torch.utils.dlpack import from_dlpack

         # create method for loading image from urls
         self.load_images_from_urls = lambda img_urls: [
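get_tokenizer_dir() is duplicated verbatim in the preprocessing and postprocessing models. If this approach moves past the PoC stage, a small shared helper that both model.py files import would avoid the copy; the module name and signature below are hypothetical, not part of this PR:

# tokenizer_utils.py (hypothetical shared module)
import os

def get_tokenizer_dir(model_config, exception_cls=RuntimeError):
    # Resolve the tokenizer directory from env vars first, config.pbtxt second.
    tokenizer_dir = (
        os.environ.get("TRTLLM_TOKENIZER")
        or os.environ.get("TRTLLM_ENGINE_DIR")
        or model_config['parameters']['tokenizer_dir']['string_value'])
    if not tokenizer_dir:
        raise exception_cls(
            "No tokenizer directory set. Set TRTLLM_TOKENIZER or "
            "TRTLLM_ENGINE_DIR, or fill the 'tokenizer_dir' config field.")
    return tokenizer_dir

# In each model.py, initialize() could then call:
# tokenizer_dir = get_tokenizer_dir(self.model_config,
#                                   pb_utils.TritonModelException)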
10 changes: 6 additions & 4 deletions all_models/inflight_batcher_llm/preprocessing/config.pbtxt
@@ -26,7 +26,7 @@

 name: "preprocessing"
 backend: "python"
-max_batch_size: ${triton_max_batch_size}
+max_batch_size: 256
 input [
   {
     name: "QUERY"
@@ -177,10 +177,11 @@ output [
   }
 ]

+# TODO: Use shared env var
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: ""
   }
 }

@@ -198,10 +199,11 @@ parameters {
   }
 }

+# TODO: Shared env var
parameters: {
   key: "gpt_model_path"
   value: {
-    string_value: "${engine_dir}"
+    string_value: ""
   }
 }

@@ -214,7 +216,7 @@ parameters: {

 instance_group [
   {
-    count: ${preprocessing_instance_count}
+    count: 8
     kind: KIND_CPU
   }
 ]
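The "# TODO: Shared env var" above suggests gpt_model_path could follow the same pattern as the tokenizer: leave string_value empty in config.pbtxt and read a central engine location at model load time. A sketch of what that could look like on the Python side, assuming the same TRTLLM_ENGINE_DIR variable the model.py changes in this PR already use (the helper itself is not part of the PR):

import os

def get_engine_dir(model_config):
    # Prefer the central env var; fall back to the config.pbtxt parameter
    # for backwards compatibility, mirroring get_tokenizer_dir() above.
    engine_dir = os.environ.get("TRTLLM_ENGINE_DIR")
    if not engine_dir:
        engine_dir = model_config['parameters']['gpt_model_path']['string_value']
    if not engine_dir:
        raise RuntimeError("No engine directory set. Set TRTLLM_ENGINE_DIR or "
                           "the 'gpt_model_path' config field.")
    return engine_dir

# The multimodal path lookup in preprocessing/1/model.py could then become:
# llm_model_path = os.path.join(get_engine_dir(self.model_config), 'config.json')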