diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_2test b/vLLM/NVIDIA-SPARK/docker-compose.yml_2test
new file mode 100644
index 0000000..f3b9e5a
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/docker-compose.yml_2test
@@ -0,0 +1,102 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    restart: always
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.32
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+    # gemma4-moe below gates on this service via condition: service_healthy;
+    # a healthcheck is mandatory here or that dependency can never be satisfied.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+
+  gemma4-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gemma4-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-moe:
+        condition: service_healthy
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name gemma-4-26b-moe
+
diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_ok b/vLLM/NVIDIA-SPARK/docker-compose.yml_ok
new file mode 100644
index 0000000..3eac093
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/docker-compose.yml_ok
@@ -0,0 +1,64 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    restart: always
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.45
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.45
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+
diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_profile b/vLLM/NVIDIA-SPARK/docker-compose.yml_profile
new file mode 100644
index 0000000..08d2032
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/docker-compose.yml_profile
@@ -0,0 +1,93 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    profiles: ["qwen", "dense"]
+    restart: unless-stopped
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.44
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    profiles: ["qwen", "moe"]
+    restart: unless-stopped
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+        # qwen-dense is not in the "moe" profile; without this, --profile moe fails
+        required: false
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+
+  gemma:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gemma-moe
+    profiles: ["gemma", "moe"]
+    ipc: host
+    restart: unless-stopped
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --served-model-name gemma-26b-moe