# qwen2-vl-7b.yaml — forked from skypilot-org/skypilot (36 lines, 961 bytes)
# SkyPilot spec: serve Qwen/Qwen2-VL-7B-Instruct with vLLM's
# OpenAI-compatible API server on port 8000, managed as a SkyServe
# service with 2 replicas.
#
# NOTE(review): the pasted source had lost all indentation; the nesting
# below is reconstructed to match SkyPilot's task/service schema.
envs:
  MODEL_NAME: Qwen/Qwen2-VL-7B-Instruct

service:
  # Specifying the path to the endpoint to check the readiness of the replicas.
  readiness_probe:
    path: /v1/chat/completions
    # Minimal one-token chat request sent as the probe body;
    # $MODEL_NAME is presumably substituted by SkyPilot from `envs`
    # above — verify against the SkyServe spec.
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
    # Allow up to 1200 s (20 min) before readiness probing starts —
    # covers model download and vLLM startup time.
    initial_delay_seconds: 1200
  # How many replicas to manage.
  replicas: 2

resources:
  # Set of acceptable GPU types; SkyPilot chooses among them.
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
  disk_tier: best
  ports: 8000

setup: |
  # Install a later transformers version for qwen2_vl support
  # (pinned to a specific commit for reproducibility).
  pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
  pip install vllm==0.6.1.post2
  pip install vllm-flash-attn

run: |
  export PATH=$PATH:/sbin
  vllm serve $MODEL_NAME \
    --host 0.0.0.0 \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-model-len 2048 | tee ~/openai_api_server.log