# Repository file-viewer residue (not part of the Compose document):
# Files · 85 lines · 2.1 KiB · YAML

# vLLM model servers, one Compose service per model.
#
# Every service is assigned to its own profile, so a service only starts when
# its profile is selected, e.g.:
#   docker compose --profile qwen-dense up -d
#
# All services use the same image, mount the host Hugging Face cache, reserve
# all NVIDIA GPUs, and publish container port 8000 on a distinct host port
# (8000, 8001, 8002).
#
# NOTE(review): each service passes --gpu-memory-utilization 0.40 while
# reserving all GPUs. Presumably this lets two models share the same GPUs;
# running all three profiles at once would request 120% — confirm before
# enabling them together.
services:
  # Qwen dense FP8 model, served as "qwen-3.6-27b-dense" on host port 8000.
  qwen-dense:
    image: nvcr.io/nvidia/vllm:26.04-py3
    container_name: vllm-qwen-dense
    profiles: ["qwen-dense"]
    restart: unless-stopped
    # Share the host IPC namespace with the container.
    ipc: host
    ports:
      - "8000:8000"
    volumes:
      # Reuse the host's Hugging Face cache directory inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined with spaces into one command
    # line; ">-" strips the trailing newline.
    command: >-
      vllm serve Qwen/Qwen3.6-27B-FP8
      --quantization fp8
      --gpu-memory-utilization 0.40
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-27b-dense

  # Qwen MoE FP8 model, served as "qwen-3.6-35b-moe" on host port 8001.
  qwen-moe:
    image: nvcr.io/nvidia/vllm:26.04-py3
    container_name: vllm-qwen-moe
    profiles: ["qwen-moe"]
    restart: unless-stopped
    # Share the host IPC namespace with the container.
    ipc: host
    ports:
      - "8001:8000"
    volumes:
      # Reuse the host's Hugging Face cache directory inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined with spaces into one command
    # line; ">-" strips the trailing newline.
    command: >-
      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-35b-moe

  # Gemma MoE model, served as "gemma-26b-moe" on host port 8002.
  # The service key is "gemma" while its profile is "gemma-moe".
  gemma:
    image: nvcr.io/nvidia/vllm:26.04-py3
    container_name: vllm-gemma-moe
    profiles: ["gemma-moe"]
    restart: unless-stopped
    # Share the host IPC namespace with the container.
    ipc: host
    ports:
      - "8002:8000"
    volumes:
      # Reuse the host's Hugging Face cache directory inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    environment:
      # Key-only entry: the value is taken from the environment Compose runs
      # in, so the secret never lives in this file. Export HF_TOKEN before
      # starting this profile.
      # SECURITY: a literal token used to be committed on this line; treat it
      # as leaked and revoke it in the Hugging Face account settings.
      - HF_TOKEN
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined with spaces into one command
    # line; ">-" strips the trailing newline.
    command: >-
      vllm serve google/gemma-4-26B-A4B-it
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 8192
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name gemma-26b-moe