# llm-setup/vLLM/NVIDIA-SPARK/docker-compose.yml

services:
  # vLLM server for the Qwen/Qwen3.6-27B-FP8 checkpoint.
  # Only started when the "qwen-dense" profile is enabled.
  qwen-dense:
    container_name: vllm-qwen-dense
    image: nvcr.io/nvidia/vllm:26.04-py3
    profiles:
      - qwen-dense
    # Share the host IPC namespace with the container.
    ipc: host
    restart: unless-stopped
    # Host port 8000 -> container port 8000.
    ports:
      - '8000:8000'
    # Bind-mount the host Hugging Face cache at the same path in the container.
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    # Reserve every NVIDIA GPU on the host for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. The model is exposed to API clients under the name given
    # by --served-model-name.
    command: >
      vllm serve Qwen/Qwen3.6-27B-FP8
      --quantization fp8
      --gpu-memory-utilization 0.40
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-27b-dense

  # vLLM server for the Qwen/Qwen3.6-35B-A3B-FP8 checkpoint.
  # Only started when the "qwen-moe" profile is enabled.
  qwen-moe:
    container_name: vllm-qwen-moe
    image: nvcr.io/nvidia/vllm:26.04-py3
    profiles:
      - qwen-moe
    # Share the host IPC namespace with the container.
    ipc: host
    restart: unless-stopped
    # Host port 8001 -> container port 8000.
    ports:
      - '8001:8000'
    # Bind-mount the host Hugging Face cache at the same path in the container.
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    # Reserve every NVIDIA GPU on the host for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. The model is exposed to API clients under the name given
    # by --served-model-name.
    command: >
      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-35b-moe

  # vLLM server for google/gemma-4-26B-A4B-it.
  # Only started when the "gemma-moe" profile is enabled.
  gemma:
    image: nvcr.io/nvidia/vllm:26.04-py3
    container_name: vllm-gemma-moe
    profiles:
      - gemma-moe
    restart: unless-stopped
    # Share the host IPC namespace with the container.
    ipc: host
    # Host port 8002 -> container port 8000.
    ports:
      - "8002:8000"
    # Bind-mount the host Hugging Face cache at the same path in the container.
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    # The Hugging Face access token is injected by Compose variable
    # interpolation: export HF_TOKEN in the shell, or define it in an untracked
    # .env file next to this compose file. It resolves to an empty string when
    # unset, so the other profiles still start without it.
    # SECURITY: never commit a literal token here. A token was previously
    # hard-coded in this file; treat it as leaked and revoke it in the
    # Hugging Face account settings.
    environment:
      - HF_TOKEN=${HF_TOKEN:-}
    # Reserve every NVIDIA GPU on the host for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. The model is exposed to API clients under the name given
    # by --served-model-name.
    command: >
      vllm serve google/gemma-4-26B-A4B-it
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 8192
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name gemma-26b-moe