Completo aggiunta

2026-05-05 14:24:04 +00:00
parent d5aa7c168b
commit aa5ccd7eee
3 changed files with 249 additions and 0 deletions
@@ -0,0 +1,94 @@
+services:
+  # Dense Qwen model served by `vllm serve`; container port 8000 is
+  # published on host port 8000.
+  qwen-dense:
+    container_name: vllm-qwen-dense
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    restart: always
+    ipc: host
+    ports:
+      - '8000:8000'
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container
+      # (presumably so downloaded weights survive restarts; confirm).
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+    healthcheck:
+      # Probe the server's /health endpoint from inside the container.
+      # qwen-moe waits for this check to pass before it starts.
+      test:
+        - CMD
+        - curl
+        - "-f"
+        - http://localhost:8000/health
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # Failed probes in the first 300s do not count against `retries`.
+      start_period: 300s
+    # NOTE(review): every service in this file sets
+    # --gpu-memory-utilization 0.32, presumably so all three servers fit
+    # on the same GPU(s); confirm against the target hardware.
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.32
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+
+  # Qwen mixture-of-experts model served by `vllm serve`; container port
+  # 8000 is published on host port 8001.
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      # Start only after qwen-dense reports healthy.
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+    # gemma4-moe waits on this service with `condition: service_healthy`,
+    # which can only be satisfied if a healthcheck is defined here.
+    # The probe mirrors the one on qwen-dense (same image, same port).
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # Failed probes in the first 300s do not count against `retries`.
+      start_period: 300s
+
+  # Gemma mixture-of-experts model served by `vllm serve`; container port
+  # 8000 is published on host port 8002.
+  gemma4-moe:
+    container_name: vllm-gemma4-moe
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    restart: always
+    ipc: host
+    depends_on:
+      # `service_healthy` requires qwen-moe to define a healthcheck;
+      # without one this dependency can never be satisfied.
+      qwen-moe:
+        condition: service_healthy
+    ports:
+      - '8002:8000'
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+    # NOTE(review): `--tool-call-parser hermes` is the same parser the
+    # Qwen services use; confirm it matches Gemma's tool-call format.
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name gemma-4-26b-moe
+
@@ -0,0 +1,64 @@
+services:
+  # Dense Qwen model served by `vllm serve`; container port 8000 is
+  # published on host port 8000.
+  qwen-dense:
+    container_name: vllm-qwen-dense
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    restart: always
+    ipc: host
+    ports:
+      - '8000:8000'
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+    healthcheck:
+      # Probe the server's /health endpoint from inside the container.
+      # qwen-moe waits for this check to pass before it starts.
+      test:
+        - CMD
+        - curl
+        - "-f"
+        - http://localhost:8000/health
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # Failed probes in the first 300s do not count against `retries`.
+      start_period: 300s
+    # NOTE(review): both services in this file set
+    # --gpu-memory-utilization 0.45, presumably so the two servers fit on
+    # the same GPU(s); confirm against the target hardware.
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.45
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+
+  # Qwen mixture-of-experts model served by `vllm serve`; container port
+  # 8000 is published on host port 8001.
+  qwen-moe:
+    container_name: vllm-qwen-moe
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    restart: always
+    ipc: host
+    depends_on:
+      # Start only after qwen-dense's healthcheck passes.
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - '8001:8000'
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.45
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+
@@ -0,0 +1,91 @@
+services:
+  # Dense Qwen model served by `vllm serve`; container port 8000 is
+  # published on host port 8000. Enabled by the `qwen` or `dense` profile.
+  qwen-dense:
+    container_name: vllm-qwen-dense
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    profiles:
+      - qwen
+      - dense
+    restart: unless-stopped
+    ipc: host
+    ports:
+      - '8000:8000'
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+    healthcheck:
+      # Probe the server's /health endpoint from inside the container.
+      # qwen-moe waits for this check to pass before it starts.
+      test:
+        - CMD
+        - curl
+        - "-f"
+        - http://localhost:8000/health
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # Failed probes in the first 300s do not count against `retries`.
+      start_period: 300s
+    # NOTE(review): every service in this file sets
+    # --gpu-memory-utilization 0.44, so presumably at most two of them are
+    # meant to run at once (selected via profiles); confirm.
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.44
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+
+  # Qwen mixture-of-experts model served by `vllm serve`; container port
+  # 8000 is published on host port 8001. Enabled by the `qwen` or `moe`
+  # profile.
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    profiles: ["qwen", "moe"]
+    restart: unless-stopped
+    ipc: host
+    depends_on:
+      qwen-dense:
+        # When qwen-dense is part of the run (profile `qwen`), wait until
+        # its healthcheck passes before starting this service.
+        condition: service_healthy
+        # qwen-dense is not in the `moe` profile, so with only
+        # `--profile moe` enabled a mandatory dependency on it makes the
+        # project invalid. Marking it optional lets this service start
+        # without it. Requires Docker Compose 2.20.0 or newer.
+        required: false
+    ports:
+      - "8001:8000"
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+
+  # Gemma mixture-of-experts model served by `vllm serve`; container port
+  # 8000 is published on host port 8002. Enabled by the `gemma` or `moe`
+  # profile.
+  gemma:
+    container_name: vllm-gemma-moe
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    profiles:
+      - gemma
+      - moe
+    restart: unless-stopped
+    ipc: host
+    ports:
+      - '8002:8000'
+    volumes:
+      # Host Hugging Face cache mounted at the same path in the container.
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # Reserve every NVIDIA GPU on the host for this container.
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+    # Unlike the Qwen services in this file, no tool-calling flags are
+    # passed and no depends_on ordering is declared.
+    # NOTE(review): under the `moe` profile this starts at the same time
+    # as qwen-moe; confirm that concurrent startup on shared GPUs is OK.
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --served-model-name gemma-26b-moe