Compare commits

5 Commits

Author SHA1 Message Date
Samuele E. Locatelli aa5ccd7eee Completed the addition 2026-05-05 14:24:04 +00:00
root d5aa7c168b Update for the current vLLM version with container 26.04-py3 2026-05-05 14:23:21 +00:00
Samuele E. Locatelli 4ce92d6698 Added tool-calling options to the dense model 2026-05-05 06:02:39 +00:00
Samuele E. Locatelli 977124445a Change: removed gpt-oss, added Qwen 3.6 27B dense 2026-05-05 05:30:25 +00:00
Samuele Locatelli 1e7a10a147 Merge branch 'main' into develop 2026-05-04 19:00:14 +02:00
4 changed files with 295 additions and 22 deletions
+46 -22
@@ -1,13 +1,14 @@
 services:
-  gpt-oss:
-    image: nvcr.io/nvidia/vllm:26.03.post1-py3
-    container_name: vllm-gpt-oss
-    restart: always
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.04-py3
+    container_name: vllm-qwen-dense
+    profiles: ["qwen-dense"]
+    restart: unless-stopped
     ipc: host
     ports:
       - "8000:8000"
     volumes:
-      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+      - /root/.cache/huggingface:/root/.cache/huggingface
     deploy:
       resources:
         reservations:
@@ -16,26 +17,22 @@ services:
               count: all
               capabilities: [gpu]
     command: >
-      vllm serve openai/gpt-oss-20b
+      vllm serve Qwen/Qwen3.6-27B-FP8
       --quantization fp8
       --gpu-memory-utilization 0.40
       --kv-cache-dtype fp8
       --max-model-len 32768
       --trust-remote-code
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-      start_period: 300s
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
-  qwen-blackwell:
-    image: nvcr.io/nvidia/vllm:26.03.post1-py3
-    container_name: vllm-qwen-blackwell
-    restart: always
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.04-py3
+    container_name: vllm-qwen-moe
+    profiles: ["qwen-moe"]
+    restart: unless-stopped
     ipc: host
-    depends_on:
-      gpt-oss:
-        condition: service_healthy  # Wait for the first service to be healthy
     ports:
       - "8001:8000"
     volumes:
@@ -48,13 +45,40 @@ services:
               count: all
               capabilities: [gpu]
     command: >
-      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
       --quantization fp8
       --kv-cache-dtype fp8
       --max-model-len 32768
       --gpu-memory-utilization 0.40
       --trust-remote-code
-      --enable-auto-tool-choice
+      --enable-auto-tool-choice
       --tool-call-parser hermes
-      --served-model-name qwen-3.6-blackwell
+      --served-model-name qwen-3.6-35b-moe
+  gemma:
+    image: nvcr.io/nvidia/vllm:26.04-py3
+    container_name: vllm-gemma-moe
+    profiles: ["gemma-moe"]
+    ipc: host
+    restart: unless-stopped
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    environment:
+      - HF_TOKEN=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
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 8192
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --served-model-name gemma-26b-moe
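
Note on the profiles introduced above: a service that declares a profiles key no longer starts on a plain docker compose up, so each one has to be enabled explicitly. A minimal sketch, assuming this is the active compose file in the working directory:

# Start only the dense model (profile name as declared above)
docker compose --profile qwen-dense up -d

# Start the MoE and Gemma services as well
docker compose --profile qwen-dense --profile qwen-moe --profile gemma-moe up -d

# Tear down; pass the same profiles so all services are matched
docker compose --profile qwen-dense --profile qwen-moe --profile gemma-moe down
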
+94

@@ -0,0 +1,94 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    restart: always
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.32
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+  gemma4-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gemma4-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-moe:
+        condition: service_healthy
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name gemma-4-26b-moe
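
The depends_on / service_healthy chain above brings the services up strictly in sequence: qwen-moe waits for qwen-dense to pass its healthcheck, and gemma4-moe waits for qwen-moe (which declares no healthcheck of its own here, so Compose cannot satisfy that condition as written). The same endpoints can be probed by hand; both /health and /v1/models are part of vLLM's OpenAI-compatible server, assuming the host ports above:

# Readiness probe used by the healthcheck (returns 200 once weights are loaded)
curl -f http://localhost:8000/health

# OpenAI-compatible model list; shows the --served-model-name alias
curl http://localhost:8000/v1/models
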
+64
@@ -0,0 +1,64 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    restart: always
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.45
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.45
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
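
--gpu-memory-utilization caps the fraction of each GPU that a vLLM instance pre-allocates for weights and KV cache, which is what lets these services share one card. A rough budget for the variants in this compare, assuming all services land on the same GPU:

# two services  at 0.45 each: 0.45 + 0.45 = 0.90 of VRAM reserved, ~10% headroom
# three services at 0.32 each: 3 x 0.32 = 0.96 of VRAM reserved
# three services at 0.44 each: 3 x 0.44 = 1.32, viable only because the
#   profiles in the file below keep them from all running at once
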
+91

@@ -0,0 +1,91 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    profiles: ["qwen", "dense"]
+    restart: unless-stopped
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.44
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    profiles: ["qwen", "moe"]
+    restart: unless-stopped
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+  gemma:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gemma-moe
+    profiles: ["gemma", "moe"]
+    ipc: host
+    restart: unless-stopped
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --served-model-name gemma-26b-moe
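
--enable-auto-tool-choice together with --tool-call-parser hermes exposes OpenAI-style function calling on the chat endpoint (the gemma service omits both flags, so it will not emit tool calls). A sketch of a request against the dense model on port 8000, with a hypothetical get_weather tool:

# Hypothetical tool-call request; "get_weather" is illustrative only
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen-3.6-27b-dense",
        "messages": [{"role": "user", "content": "What is the weather in Milan?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }]
      }'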