# qwen2-vl-7b.yaml — forked from skypilot-org/skypilot (36 lines, 961 bytes)
# SkyPilot spec: serve Qwen/Qwen2-VL-7B-Instruct with vLLM's
# OpenAI-compatible API server on port 8000, managed as a SkyServe
# service with 2 replicas.
#
# NOTE(review): the pasted source had lost all indentation; the nesting
# below is reconstructed to match SkyPilot's task/service schema.
envs:
  MODEL_NAME: Qwen/Qwen2-VL-7B-Instruct

service:
  # Specifying the path to the endpoint to check the readiness of the replicas.
  readiness_probe:
    path: /v1/chat/completions
    # Minimal one-token chat request sent as the probe body;
    # $MODEL_NAME is presumably substituted by SkyPilot from `envs`
    # above — verify against the SkyServe spec.
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
    # Allow up to 1200 s (20 min) before readiness probing starts —
    # covers model download and vLLM startup time.
    initial_delay_seconds: 1200
  # How many replicas to manage.
  replicas: 2

resources:
  # Set of acceptable GPU types; SkyPilot chooses among them.
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
  disk_tier: best
  ports: 8000

setup: |
  # Install a later transformers version for qwen2_vl support
  # (pinned to a specific commit for reproducibility).
  pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
  pip install vllm==0.6.1.post2
  pip install vllm-flash-attn

run: |
  export PATH=$PATH:/sbin
  vllm serve $MODEL_NAME \
    --host 0.0.0.0 \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-model-len 2048 | tee ~/openai_api_server.log