Aggiunta folder old con vecchi file

This commit is contained in:
Samuele E. Locatelli
2026-05-04 16:59:06 +00:00
parent a289a66c1c
commit e7a9fba1a3
6 changed files with 299 additions and 0 deletions
@@ -0,0 +1,37 @@
services:
qwen-36:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-qwen
restart: always
runtime: nvidia
ipc: host
environment:
- NVIDIA_VISIBLE_DEVICES=all
ports:
- "8001:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
--quantization fp8
--kv-cache-dtype fp8
--max-model-len 32768
--gpu-memory-utilization 0.40
--trust-remote-code
--served-model-name qwen-3.6-blackwell
gpt-oss:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-gpt-oss
restart: always
runtime: nvidia
ipc: host
environment:
- NVIDIA_VISIBLE_DEVICES=all
ports:
- "8000:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve openai/gpt-oss-20b
--gpu-memory-utilization 0.40
@@ -0,0 +1,39 @@
services:
qwen-36:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-qwen
restart: always
runtime: nvidia
ipc: host
environment:
- NVIDIA_VISIBLE_DEVICES=all
- HF_HUB_ENABLE_HF_TRANSFER=1 # Accelera il caricamento se scarica
ports:
- "8001:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
--quantization fp8
--kv-cache-dtype fp8
--max-model-len 32768
--gpu-memory-utilization 0.40
--trust-remote-code
--served-model-name qwen-3.6-blackwell
gpt-oss:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-gpt-oss
restart: always
runtime: nvidia
ipc: host
environment:
- NVIDIA_VISIBLE_DEVICES=all
ports:
- "8000:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve openai/gpt-oss-20b
--gpu-memory-utilization 0.40
--served-model-name gpt-oss-20b
@@ -0,0 +1,50 @@
services:
# Primo modello: GPT-OSS (più veloce da caricare)
gpt-oss:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-gpt-oss
restart: always
runtime: nvidia
ipc: host
environment:
- NVIDIA_VISIBLE_DEVICES=all
ports:
- "8000:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve openai/gpt-oss-20b
--gpu-memory-utilization 0.40
--served-model-name gpt-oss-20b
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
interval: 30s
timeout: 10s
retries: 10
start_period: 60s
# Secondo modello: Qwen (parte solo quando GPT-OSS è pronto)
qwen-36:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-qwen
restart: always
runtime: nvidia
ipc: host
environment:
- NVIDIA_VISIBLE_DEVICES=all
- HF_HUB_ENABLE_HF_TRANSFER=1 # Accelera il caricamento se scarica
depends_on:
gpt-oss:
condition: service_healthy
ports:
- "8001:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
--quantization fp8
--kv-cache-dtype fp8
--max-model-len 32768
--gpu-memory-utilization 0.40
--trust-remote-code
--served-model-name qwen-3.6-blackwell
@@ -0,0 +1,49 @@
services:
gpt-oss:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-gpt-oss
restart: always
ipc: host
shm_size: '16gb'
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
ports:
- "8000:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
command: >
vllm serve openai/gpt-oss-20b
--gpu-memory-utilization 0.35
--trust-remote-code
qwen-36:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-qwen
restart: always
ipc: host
shm_size: '16gb'
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
depends_on:
- gpt-oss
environment:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
ports:
- "8001:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
entrypoint: /bin/sh -c "sleep 300 && vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 --quantization fp8 --kv-cache-dtype fp8 --max-model-len 32768 --gpu-memory-utilization 0.40 --trust-remote-code --served-model-name qwen-3.6-blackwell"
@@ -0,0 +1,62 @@
services:
gpt-oss:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-gpt-oss
restart: always
ipc: host
environment:
- HF_MODEL_HANDLE=openai/gpt-oss-20b
ports:
- "8000:8000"
volumes:
- /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command: >
vllm serve openai/gpt-oss-20b
--gpu-memory-utilization 0.40
--kv-cache-dtype fp8
--max-model-len 32768
--trust-remote-code
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 300s
qwen-blackwell:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-qwen-blackwell
restart: always
ipc: host
depends_on:
gpt-oss:
condition: service_healthy # Aspetta che il primo sia "Healthy"
ports:
- "8001:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command: >
vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
--quantization fp8
--kv-cache-dtype fp8
--max-model-len 32768
--gpu-memory-utilization 0.40
--trust-remote-code
--enable-auto-tool-choice
--tool-call-parser hermes
--served-model-name qwen-3.6-blackwell
@@ -0,0 +1,62 @@
services:
gpt-oss:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-gpt-oss
restart: always
ipc: host
environment:
- HF_MODEL_HANDLE=openai/gpt-oss-20b
ports:
- "8000:8000"
volumes:
- /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command: >
vllm serve openai/gpt-oss-20b
--gpu-memory-utilization 0.40
--kv-cache-dtype fp8
--max-model-len 32768
--trust-remote-code
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 300s
qwen-blackwell:
image: nvcr.io/nvidia/vllm:26.03.post1-py3
container_name: vllm-qwen-blackwell
restart: always
ipc: host
depends_on:
gpt-oss:
condition: service_healthy # Aspetta che il primo sia "Healthy"
ports:
- "8001:8000"
volumes:
- /root/.cache/huggingface:/root/.cache/huggingface
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
command: >
vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
--quantization fp8
--kv-cache-dtype fp8
--max-model-len 32768
--gpu-memory-utilization 0.40
--trust-remote-code
--enable-auto-tool-choice
--tool-call-parser hermes
--served-model-name qwen-3.6-blackwell