Compare commits

5 Commits

Author SHA1 Message Date
Samuele E. Locatelli aa5ccd7eee Completed the addition 2026-05-05 14:24:04 +00:00
root d5aa7c168b Update for the current vLLM version with container 26.04-py3 2026-05-05 14:23:21 +00:00
Samuele E. Locatelli 4ce92d6698 Added tool-calling options to the dense model 2026-05-05 06:02:39 +00:00
Samuele E. Locatelli 977124445a Change: removed gpt-oss, added Qwen 3.6 27B dense 2026-05-05 05:30:25 +00:00
Samuele Locatelli 1e7a10a147 Merge branch 'main' into develop 2026-05-04 19:00:14 +02:00
4 changed files with 295 additions and 22 deletions
+46 -22
@@ -1,13 +1,14 @@
 services:
-  gpt-oss:
-    image: nvcr.io/nvidia/vllm:26.03.post1-py3
-    container_name: vllm-gpt-oss
-    restart: always
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.04-py3
+    container_name: vllm-qwen-dense
+    profiles: ["qwen-dense"]
+    restart: unless-stopped
     ipc: host
     ports:
       - "8000:8000"
     volumes:
-      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+      - /root/.cache/huggingface:/root/.cache/huggingface
     deploy:
       resources:
         reservations:
@@ -16,26 +17,22 @@ services:
               count: all
               capabilities: [gpu]
     command: >
-      vllm serve openai/gpt-oss-20b
+      vllm serve Qwen/Qwen3.6-27B-FP8
       --quantization fp8
       --gpu-memory-utilization 0.40
       --kv-cache-dtype fp8
       --max-model-len 32768
       --trust-remote-code
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-      start_period: 300s
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
-  qwen-blackwell:
-    image: nvcr.io/nvidia/vllm:26.03.post1-py3
-    container_name: vllm-qwen-blackwell
-    restart: always
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.04-py3
+    container_name: vllm-qwen-moe
+    profiles: ["qwen-moe"]
+    restart: unless-stopped
     ipc: host
-    depends_on:
-      gpt-oss:
-        condition: service_healthy  # Wait for the first service to be healthy
     ports:
       - "8001:8000"
     volumes:
@@ -48,13 +45,40 @@ services:
               count: all
               capabilities: [gpu]
     command: >
-      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
       --quantization fp8
       --kv-cache-dtype fp8
       --max-model-len 32768
       --gpu-memory-utilization 0.40
       --trust-remote-code
-      --enable-auto-tool-choice
+      --enable-auto-tool-choice
       --tool-call-parser hermes
-      --served-model-name qwen-3.6-blackwell
+      --served-model-name qwen-3.6-35b-moe
+  gemma:
+    image: nvcr.io/nvidia/vllm:26.04-py3
+    container_name: vllm-gemma-moe
+    profiles: ["gemma-moe"]
+    ipc: host
+    restart: unless-stopped
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    environment:
+      - HF_TOKEN=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
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 8192
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --served-model-name gemma-26b-moe
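
Note on the profiles introduced above: a service that declares a profiles key no longer starts on a plain docker compose up, so each one has to be enabled explicitly. A minimal sketch, assuming this is the active compose file in the working directory:

# Start only the dense model (profile name as declared above)
docker compose --profile qwen-dense up -d

# Start the MoE and Gemma services as well
docker compose --profile qwen-dense --profile qwen-moe --profile gemma-moe up -d

# Tear down; pass the same profiles so all services are matched
docker compose --profile qwen-dense --profile qwen-moe --profile gemma-moe down
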
+94

@@ -0,0 +1,94 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    restart: always
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.32
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+  gemma4-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gemma4-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-moe:
+        condition: service_healthy
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.32
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name gemma-4-26b-moe
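
The depends_on / service_healthy chain above brings the services up strictly in sequence: qwen-moe waits for qwen-dense to pass its healthcheck, and gemma4-moe waits for qwen-moe (which declares no healthcheck of its own here, so Compose cannot satisfy that condition as written). The same endpoints can be probed by hand; both /health and /v1/models are part of vLLM's OpenAI-compatible server, assuming the host ports above:

# Readiness probe used by the healthcheck (returns 200 once weights are loaded)
curl -f http://localhost:8000/health

# OpenAI-compatible model list; shows the --served-model-name alias
curl http://localhost:8000/v1/models
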
+64
@@ -0,0 +1,64 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    restart: always
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.45
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    restart: always
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.45
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
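
--gpu-memory-utilization caps the fraction of each GPU that a vLLM instance pre-allocates for weights and KV cache, which is what lets these services share one card. A rough budget for the variants in this compare, assuming all services land on the same GPU:

# two services  at 0.45 each: 0.45 + 0.45 = 0.90 of VRAM reserved, ~10% headroom
# three services at 0.32 each: 3 x 0.32 = 0.96 of VRAM reserved
# three services at 0.44 each: 3 x 0.44 = 1.32, viable only because the
#   profiles in the file below keep them from all running at once
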
+91

@@ -0,0 +1,91 @@
+services:
+  qwen-dense:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-dense
+    profiles: ["qwen", "dense"]
+    restart: unless-stopped
+    ipc: host
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-27B-FP8
+      --quantization fp8
+      --gpu-memory-utilization 0.44
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-27b-dense
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+  qwen-moe:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-moe
+    profiles: ["qwen", "moe"]
+    restart: unless-stopped
+    ipc: host
+    depends_on:
+      qwen-dense:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve Qwen/Qwen3.6-35B-A3B-FP8
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-35b-moe
+  gemma:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gemma-moe
+    profiles: ["gemma", "moe"]
+    ipc: host
+    restart: unless-stopped
+    ports:
+      - "8002:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve google/gemma-4-26B-A4B-it
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.44
+      --trust-remote-code
+      --served-model-name gemma-26b-moe
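
--enable-auto-tool-choice together with --tool-call-parser hermes exposes OpenAI-style function calling on the chat endpoint (the gemma service omits both flags, so it will not emit tool calls). A sketch of a request against the dense model on port 8000, with a hypothetical get_weather tool:

# Hypothetical tool-call request; "get_weather" is illustrative only
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen-3.6-27b-dense",
        "messages": [{"role": "user", "content": "What is the weather in Milan?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }]
      }'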