[PoC] Improve TRTLLM deployment UX #650

Draft · wants to merge 3 commits into main
2 changes: 1 addition & 1 deletion all_models/inflight_batcher_llm/ensemble/config.pbtxt
@@ -26,7 +26,7 @@

 name: "ensemble"
 platform: "ensemble"
-max_batch_size: ${triton_max_batch_size}
+max_batch_size: 256
 input [
   {
     name: "text_input"
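For context on the change above: the ${triton_max_batch_size}-style placeholders have to be substituted into each config.pbtxt (typically with the repo's template-filling tooling) before Triton can load the model, and hardcoding 256 removes that step for this field. The sketch below only illustrates what such a substitution pass does; it is not the actual tool, and the function name and paths are made up.

from pathlib import Path
from string import Template

def fill_placeholders(config_path, values):
    # Replace ${name} placeholders in a config.pbtxt, writing the file back.
    text = Path(config_path).read_text()
    Path(config_path).write_text(Template(text).safe_substitute(values))

# Hypothetical usage for the ensemble config shown above:
fill_placeholders("all_models/inflight_batcher_llm/ensemble/config.pbtxt",
                  {"triton_max_batch_size": "256"})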
35 changes: 30 additions & 5 deletions all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import json
+import os

 import numpy as np
 import triton_python_backend_utils as pb_utils
@@ -52,11 +53,11 @@ def initialize(self, args):
           * model_name: Model name
         """
         # Parse model configs
-        model_config = json.loads(args['model_config'])
-        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-            'string_value']
+        self.model_config = json.loads(args['model_config'])
+        # Support tokenizer dir from env var for central location
+        tokenizer_dir = self.get_tokenizer_dir()

-        skip_special_tokens = model_config['parameters'].get(
+        skip_special_tokens = self.model_config['parameters'].get(
             'skip_special_tokens')
         if skip_special_tokens is not None:
             skip_special_tokens_str = skip_special_tokens[
@@ -87,12 +88,36 @@ def initialize(self, args):

         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
-            model_config, "OUTPUT")
+            self.model_config, "OUTPUT")

         # Convert Triton types to numpy types
         self.output_dtype = pb_utils.triton_string_to_numpy(
             output_config['data_type'])

+    def get_tokenizer_dir(self):
+        # Manual override of tokenizer. This is to support common case/models
+        # when engine/tokenizer are downloaded on demand at model load time.
+        tokenizer_dir = os.environ.get("TRTLLM_TOKENIZER")
+
+        # If no override, use tokenizer co-located with engine
+        if not tokenizer_dir:
+            tokenizer_dir = os.environ.get("TRTLLM_ENGINE_DIR")
+
+        # If no env var used at all, use tokenizer dir defined in config.pbtxt
+        # This is for backwards compatibility but is the most tedious to set
+        # and keep aligned in each location.
+        if not tokenizer_dir:
+            tokenizer_dir = self.model_config['parameters']['tokenizer_dir'][
+                'string_value']
+
+        # If no method of setting tokenizer worked, fail.
+        if not tokenizer_dir:
+            raise pb_utils.TritonModelException(
+                f"No tokenizer directory set. Please set TRTLLM_ENGINE_DIR env var or 'tokenizer_dir' config field to the directory containing engines and tokenizers."
+            )
+
+        return tokenizer_dir
+
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
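The get_tokenizer_dir() method added above resolves the tokenizer directory in this order: the TRTLLM_TOKENIZER env var, then TRTLLM_ENGINE_DIR, then the tokenizer_dir parameter from config.pbtxt, and it fails if all three are empty. A standalone sketch of that precedence (the directory paths are placeholders, not real defaults):

import os

def resolve_tokenizer_dir(config_value):
    tokenizer_dir = (os.environ.get("TRTLLM_TOKENIZER")
                     or os.environ.get("TRTLLM_ENGINE_DIR")
                     or config_value)
    if not tokenizer_dir:
        raise ValueError("No tokenizer directory set")
    return tokenizer_dir

os.environ.pop("TRTLLM_TOKENIZER", None)
os.environ["TRTLLM_ENGINE_DIR"] = "/models/llama/1/engine"
# The engine-dir env var wins over an empty config value...
assert resolve_tokenizer_dir("") == "/models/llama/1/engine"
# ...and an explicit tokenizer override wins over the engine directory.
os.environ["TRTLLM_TOKENIZER"] = "/tokenizers/llama"
assert resolve_tokenizer_dir("/old/config/path") == "/tokenizers/llama"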
8 changes: 5 additions & 3 deletions all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -26,7 +26,7 @@

 name: "postprocessing"
 backend: "python"
-max_batch_size: ${triton_max_batch_size}
+max_batch_size: 256
 dynamic_batching {}
 input [
   {
@@ -48,13 +48,15 @@ output [
   }
 ]

+# TODO: env var
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: ""
   }
 }

+# TODO: Look up how it's filled today
 parameters {
   key: "skip_special_tokens"
   value: {
@@ -64,7 +66,7 @@ parameters {

 instance_group [
   {
-    count: ${postprocessing_instance_count}
+    count: 8
     kind: KIND_CPU
   }
 ]
49 changes: 36 additions & 13 deletions all_models/inflight_batcher_llm/preprocessing/1/model.py
@@ -54,15 +54,15 @@ def initialize(self, args):
           * model_name: Model name
         """
         # Parse model configs
-        model_config = json.loads(args['model_config'])
-        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-            'string_value']
+        self.model_config = json.loads(args['model_config'])
+        # Support tokenizer dir from env var for central location
+        tokenizer_dir = self.get_tokenizer_dir()

-        add_special_tokens = model_config['parameters'].get(
+        add_special_tokens = self.model_config['parameters'].get(
             'add_special_tokens')
-        visual_model_path = model_config['parameters']['visual_model_path'][
-            'string_value']
-        max_num_images = model_config['parameters'].get('max_num_images')
+        visual_model_path = self.model_config['parameters'][
+            'visual_model_path']['string_value']
+        max_num_images = self.model_config['parameters'].get('max_num_images')

         if max_num_images is not None:
             max_num_images_str = max_num_images['string_value']
@@ -133,7 +133,7 @@ def initialize(self, args):
                 'llava', 'blip2-opt', 'vila', 'mllama'
             ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila and mllama. Got {self.model_type}."

-            llm_model_path = model_config['parameters']['gpt_model_path'][
+            llm_model_path = self.model_config['parameters']['gpt_model_path'][
                 'string_value']
             llm_model_path = os.path.join(llm_model_path, 'config.json')
             with open(llm_model_path, 'r') as f:
@@ -144,7 +144,7 @@ def initialize(self, args):

             self.vision_preprocessor = VisionPreProcessor(
                 self.model_type, AutoProcessor.from_pretrained(tokenizer_dir),
-                model_config)
+                self.model_config)

         # Parse model output configs and convert Triton types to numpy types
         output_names = [
@@ -159,15 +159,39 @@ def initialize(self, args):
                 input_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
                     pb_utils.get_input_config_by_name(
-                        model_config, input_name)['data_type']))
+                        self.model_config, input_name)['data_type']))

         for output_name in output_names:
             setattr(
                 self,
                 output_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
                     pb_utils.get_output_config_by_name(
-                        model_config, output_name)['data_type']))
+                        self.model_config, output_name)['data_type']))

+    def get_tokenizer_dir(self):
+        # Manual override of tokenizer. This is to support common case/models
+        # when engine/tokenizer are downloaded on demand at model load time.
+        tokenizer_dir = os.environ.get("TRTLLM_TOKENIZER")
+
+        # If no override, use tokenizer co-located with engine
+        if not tokenizer_dir:
+            tokenizer_dir = os.environ.get("TRTLLM_ENGINE_DIR")
+
+        # If no env var used at all, use tokenizer dir defined in config.pbtxt
+        # This is for backwards compatibility but is the most tedious to set
+        # and keep aligned in each location.
+        if not tokenizer_dir:
+            tokenizer_dir = self.model_config['parameters']['tokenizer_dir'][
+                'string_value']
+
+        # If no method of setting tokenizer worked, fail.
+        if not tokenizer_dir:
+            raise pb_utils.TritonModelException(
+                f"No tokenizer directory set. Please set TRTLLM_ENGINE_DIR env var or 'tokenizer_dir' config field to the directory containing engines and tokenizers."
+            )
+
+        return tokenizer_dir
+
     def _setup_ptable_shape(self, llm_model_config):
         max_prompt_embedding_table_size = llm_model_config['build_config'][
@@ -662,9 +686,8 @@ def __init__(self,
         import requests
         import torch
         from PIL import Image
-        from torch.utils.dlpack import from_dlpack

-        from tensorrt_llm._utils import str_dtype_to_torch
+        from torch.utils.dlpack import from_dlpack

         # create method for loading image from urls
         self.load_images_from_urls = lambda img_urls: [
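get_tokenizer_dir() is duplicated verbatim in the preprocessing and postprocessing models. If this approach moves past the PoC stage, a small shared helper that both model.py files import would avoid the copy; the module name and signature below are hypothetical, not part of this PR:

# tokenizer_utils.py (hypothetical shared module)
import os

def get_tokenizer_dir(model_config, exception_cls=RuntimeError):
    # Resolve the tokenizer directory from env vars first, config.pbtxt second.
    tokenizer_dir = (
        os.environ.get("TRTLLM_TOKENIZER")
        or os.environ.get("TRTLLM_ENGINE_DIR")
        or model_config['parameters']['tokenizer_dir']['string_value'])
    if not tokenizer_dir:
        raise exception_cls(
            "No tokenizer directory set. Set TRTLLM_TOKENIZER or "
            "TRTLLM_ENGINE_DIR, or fill the 'tokenizer_dir' config field.")
    return tokenizer_dir

# In each model.py, initialize() could then call:
# tokenizer_dir = get_tokenizer_dir(self.model_config,
#                                   pb_utils.TritonModelException)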
10 changes: 6 additions & 4 deletions all_models/inflight_batcher_llm/preprocessing/config.pbtxt
@@ -26,7 +26,7 @@

 name: "preprocessing"
 backend: "python"
-max_batch_size: ${triton_max_batch_size}
+max_batch_size: 256
 input [
   {
     name: "QUERY"
@@ -177,10 +177,11 @@ output [
   }
 ]

+# TODO: Use shared env var
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: ""
   }
 }

@@ -198,10 +199,11 @@ parameters {
   }
 }

+# TODO: Shared env var
parameters: {
   key: "gpt_model_path"
   value: {
-    string_value: "${engine_dir}"
+    string_value: ""
   }
 }

@@ -214,7 +216,7 @@ parameters: {

 instance_group [
   {
-    count: ${preprocessing_instance_count}
+    count: 8
     kind: KIND_CPU
   }
 ]
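The "# TODO: Shared env var" above suggests gpt_model_path could follow the same pattern as the tokenizer: leave string_value empty in config.pbtxt and read a central engine location at model load time. A sketch of what that could look like on the Python side, assuming the same TRTLLM_ENGINE_DIR variable the model.py changes in this PR already use (the helper itself is not part of the PR):

import os

def get_engine_dir(model_config):
    # Prefer the central env var; fall back to the config.pbtxt parameter
    # for backwards compatibility, mirroring get_tokenizer_dir() above.
    engine_dir = os.environ.get("TRTLLM_ENGINE_DIR")
    if not engine_dir:
        engine_dir = model_config['parameters']['gpt_model_path']['string_value']
    if not engine_dir:
        raise RuntimeError("No engine directory set. Set TRTLLM_ENGINE_DIR or "
                           "the 'gpt_model_path' config field.")
    return engine_dir

# The multimodal path lookup in preprocessing/1/model.py could then become:
# llm_model_path = os.path.join(get_engine_dir(self.model_config), 'config.json')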