Completo aggiunta

Update per versione corrente VLLM con container 26.04-py3
Aggiunta opzioni x tool calling in modello dense
2026-05-05 14:24:04 +00:00 · 2026-05-05 14:23:21 +00:00 · 2026-05-05 06:02:39 +00:00 · 2026-05-05 05:30:25 +00:00 · 2026-05-04 19:00:14 +02:00
4 changed files with 295 additions and 22 deletions
@@ -1,13 +1,14 @@
 services:
-  gpt-oss:
+  qwen-dense:
-    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    image: nvcr.io/nvidia/vllm:26.04-py3
-    container_name: vllm-gpt-oss
+    container_name: vllm-qwen-dense
-    restart: always
+    profiles: ["qwen-dense"]
    restart: unless-stopped
    ipc: host
    ports:
      - "8000:8000"
    volumes:
-      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+      - /root/.cache/huggingface:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
@@ -16,26 +17,22 @@ services:
              count: all
              capabilities: [gpu]
    command: >
-      vllm serve openai/gpt-oss-20b
+      vllm serve Qwen/Qwen3.6-27B-FP8
      --quantization fp8
      --gpu-memory-utilization 0.40
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    # healthcheck is a sibling key of command; it must not sit at the
+    # folded scalar's indentation or its options are passed to vllm serve.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s
-  qwen-blackwell:
+  qwen-moe:
-    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    image: nvcr.io/nvidia/vllm:26.04-py3
-    container_name: vllm-qwen-blackwell
+    container_name: vllm-qwen-moe
-    restart: always
+    profiles: ["qwen-moe"]
    restart: unless-stopped
    ipc: host
    depends_on:
      gpt-oss:
        condition: service_healthy  # Aspetta che il primo sia "Healthy"
    ports:
      - "8001:8000"
    volumes:
@@ -48,13 +45,40 @@ services:
              count: all
              capabilities: [gpu]
    command: >
-      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
-      --enable-auto-tool-choice 
+      --enable-auto-tool-choice
      --tool-call-parser hermes
-      --served-model-name qwen-3.6-blackwell
+      --served-model-name qwen-3.6-35b-moe
  # Gemma 4 26B MoE served by vLLM; enabled with the "gemma-moe" profile and
  # published on host port 8002.
  gemma:
    image: nvcr.io/nvidia/vllm:26.04-py3
    container_name: vllm-gemma-moe
    profiles: ["gemma-moe"]
    ipc: host
    restart: unless-stopped
    ports:
      - "8002:8000"
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    environment:
      # Never commit the Hugging Face token. It is interpolated from the shell
      # environment or an untracked .env file; the default is empty so other
      # profiles still load when it is unset. Any token that was previously
      # committed here must be treated as leaked and revoked.
      - HF_TOKEN=${HF_TOKEN:-}
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve google/gemma-4-26B-A4B-it
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 8192
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name gemma-26b-moe
@@ -0,0 +1,94 @@
 services:
  # Qwen/Qwen3.6-27B-FP8 served by vLLM, published on host port 8000.
  qwen-dense:
    container_name: vllm-qwen-dense
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    ipc: host
    restart: always
    ports:
      # host port 8000 -> container port 8000
      - '8000:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve Qwen/Qwen3.6-27B-FP8
      --quantization fp8
      --gpu-memory-utilization 0.32
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-27b-dense
    # Health is determined by curl against the server's /health endpoint.
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://localhost:8000/health'
      start_period: 300s
      interval: 30s
      timeout: 10s
      retries: 5
  # Qwen/Qwen3.6-35B-A3B-FP8 served by vLLM, published on host port 8001.
  # Started only after qwen-dense reports healthy.
  qwen-moe:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-qwen-moe
    restart: always
    ipc: host
    depends_on:
      qwen-dense:
        condition: service_healthy
    ports:
      - "8001:8000"
    volumes:
      - /root/.cache/huggingface:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.32
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-35b-moe
    # gemma4-moe waits for this service with condition: service_healthy, so a
    # healthcheck must exist here; same probe as the one used by qwen-dense.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s
  # google/gemma-4-26B-A4B-it served by vLLM, published on host port 8002.
  # Started only after qwen-moe reports healthy.
  gemma4-moe:
    container_name: vllm-gemma4-moe
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    ipc: host
    restart: always
    depends_on:
      qwen-moe:
        condition: service_healthy
    ports:
      # host port 8002 -> container port 8000
      - '8002:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    # NOTE(review): the hermes tool-call parser is selected for a Gemma model;
    # confirm it matches this model's tool-call output format.
    command: >
      vllm serve google/gemma-4-26B-A4B-it
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.32
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name gemma-4-26b-moe
@@ -0,0 +1,64 @@
 services:
  # Qwen/Qwen3.6-27B-FP8 served by vLLM, published on host port 8000.
  qwen-dense:
    container_name: vllm-qwen-dense
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    ipc: host
    restart: always
    ports:
      # host port 8000 -> container port 8000
      - '8000:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve Qwen/Qwen3.6-27B-FP8
      --quantization fp8
      --gpu-memory-utilization 0.45
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-27b-dense
    # Health is determined by curl against the server's /health endpoint.
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://localhost:8000/health'
      start_period: 300s
      interval: 30s
      timeout: 10s
      retries: 5
  # Qwen/Qwen3.6-35B-A3B-FP8 served by vLLM, published on host port 8001.
  # Started only after qwen-dense reports healthy.
  qwen-moe:
    container_name: vllm-qwen-moe
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    ipc: host
    restart: always
    depends_on:
      qwen-dense:
        condition: service_healthy
    ports:
      # host port 8001 -> container port 8000
      - '8001:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.45
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-35b-moe
@@ -0,0 +1,91 @@
 services:
  # Qwen/Qwen3.6-27B-FP8 served by vLLM, published on host port 8000.
  # Enabled by either the "qwen" or the "dense" profile.
  qwen-dense:
    container_name: vllm-qwen-dense
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    profiles:
      - qwen
      - dense
    ipc: host
    restart: unless-stopped
    ports:
      # host port 8000 -> container port 8000
      - '8000:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve Qwen/Qwen3.6-27B-FP8
      --quantization fp8
      --gpu-memory-utilization 0.44
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-27b-dense
    # Health is determined by curl against the server's /health endpoint.
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://localhost:8000/health'
      start_period: 300s
      interval: 30s
      timeout: 10s
      retries: 5
  # Qwen/Qwen3.6-35B-A3B-FP8 served by vLLM, published on host port 8001.
  # Enabled by either the "qwen" or the "moe" profile.
  qwen-moe:
    container_name: vllm-qwen-moe
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    profiles:
      - qwen
      - moe
    ipc: host
    restart: unless-stopped
    # Started only after qwen-dense reports healthy.
    # NOTE(review): qwen-dense is in profiles qwen/dense only; when just the
    # "moe" profile is active the dependency is not enabled — confirm usage.
    depends_on:
      qwen-dense:
        condition: service_healthy
    ports:
      # host port 8001 -> container port 8000
      - '8001:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.44
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-35b-moe
  # google/gemma-4-26B-A4B-it served by vLLM, published on host port 8002.
  # Enabled by either the "gemma" or the "moe" profile.
  gemma:
    container_name: vllm-gemma-moe
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    profiles:
      - gemma
      - moe
    restart: unless-stopped
    ipc: host
    ports:
      # host port 8002 -> container port 8000
      - '8002:8000'
    volumes:
      # Bind-mount the host Hugging Face cache at the same path in the container.
      - '/root/.cache/huggingface:/root/.cache/huggingface'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Folded scalar: the lines below are joined into a single command line.
    command: >
      vllm serve google/gemma-4-26B-A4B-it
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.44
      --trust-remote-code
      --served-model-name gemma-26b-moe
Author	SHA1	Message	Date
Samuele E. Locatelli	aa5ccd7eee	Completo aggiunta	2026-05-05 14:24:04 +00:00
root	d5aa7c168b	Update per versione corrente VLLM con container 26.04-py3	2026-05-05 14:23:21 +00:00
Samuele E. Locatelli	4ce92d6698	Aggiunta opzioni x tool calling in modello dense	2026-05-05 06:02:39 +00:00
Samuele E. Locatelli	977124445a	Modifica: tolto gpt-oss, messo Qwen 3.6 27b dense	2026-05-05 05:30:25 +00:00
Samuele Locatelli	1e7a10a147	Merge branch 'main' into develop	2026-05-04 19:00:14 +02:00