Files
llm-setup/vLLM/NVIDIA-SPARK/old/docker-compose.yml_00
T
2026-05-04 16:59:06 +00:00

38 lines
1.0 KiB
Plaintext

services:
  # Qwen 3.6 35B-A3B (FP8 weights + FP8 KV cache), served from a local
  # HuggingFace snapshot. Host port 8001 -> container port 8000.
  qwen-36:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-qwen
    restart: always
    runtime: nvidia
    ipc: host  # vLLM needs a large shared-memory segment for tensor transfer
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    ports:
      - "8001:8000"  # quoted: unquoted colon-separated digits hit YAML 1.1 sexagesimal parsing
    volumes:
      # Share the host HF cache so model weights are not re-downloaded
      - /root/.cache/huggingface:/root/.cache/huggingface
    # >- folds the flags onto one command line and strips the trailing newline
    command: >-
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name qwen-3.6-blackwell

  # OpenAI gpt-oss-20b pulled by model ID (downloads into the shared HF cache).
  # Host port 8000 -> container port 8000.
  gpt-oss:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-gpt-oss
    restart: always
    runtime: nvidia
    ipc: host
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    ports:
      - "8000:8000"
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    # Both services cap GPU memory at 0.40 so they can share one GPU
    command: >-
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40