# Docker Compose stack running two vLLM OpenAI-compatible inference servers
# on one GPU host. Both containers share the host HuggingFace cache and each
# reserves 40% of GPU memory so they can coexist on the same device.
services:

  # Qwen3.6-35B-A3B served from a pre-downloaded local snapshot, FP8 weights
  # and FP8 KV cache to fit alongside the second model.
  qwen-36:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-qwen
    restart: always
    runtime: nvidia
    ipc: host  # vLLM needs large shared memory for tensor transfer
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - HF_HUB_ENABLE_HF_TRANSFER=1  # speeds up loading if the model must be downloaded
    ports:
      - "8001:8000"
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    command: >
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name qwen-3.6-blackwell

  # GPT-OSS-20B pulled by model ID (downloads into the shared cache volume
  # on first start), exposed on the default port 8000.
  gpt-oss:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-gpt-oss
    restart: always
    runtime: nvidia
    ipc: host
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    ports:
      - "8000:8000"
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40
      --served-model-name gpt-oss-20b