diff --git a/vLLM/NVIDIA-SPARK/old/docker-compose.yml_00 b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_00
new file mode 100644
index 0000000..851d7c8
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_00
@@ -0,0 +1,37 @@
+services:
+  qwen-36:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen
+    restart: always
+    runtime: nvidia
+    ipc: host
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --served-model-name qwen-3.6-blackwell
+
+  gpt-oss:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gpt-oss
+    restart: always
+    runtime: nvidia
+    ipc: host
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve openai/gpt-oss-20b
+      --gpu-memory-utilization 0.40
diff --git a/vLLM/NVIDIA-SPARK/old/docker-compose.yml_01 b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_01
new file mode 100644
index 0000000..56a03f0
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_01
@@ -0,0 +1,39 @@
+services:
+  qwen-36:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen
+    restart: always
+    runtime: nvidia
+    ipc: host
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - HF_HUB_ENABLE_HF_TRANSFER=1  # Speeds up loading if the model has to be downloaded
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --served-model-name qwen-3.6-blackwell
+
+  gpt-oss:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gpt-oss
+    restart: always
+    runtime: nvidia
+    ipc: host
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve openai/gpt-oss-20b
+      --gpu-memory-utilization 0.40
+      --served-model-name gpt-oss-20b
diff --git a/vLLM/NVIDIA-SPARK/old/docker-compose.yml_02 b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_02
new file mode 100644
index 0000000..05ae3c1
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_02
@@ -0,0 +1,50 @@
+services:
+  # First model: GPT-OSS (faster to load)
+  gpt-oss:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gpt-oss
+    restart: always
+    runtime: nvidia
+    ipc: host
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve openai/gpt-oss-20b
+      --gpu-memory-utilization 0.40
+      --served-model-name gpt-oss-20b
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 10
+      start_period: 60s
+
+  # Second model: Qwen (starts only once GPT-OSS is ready)
+  qwen-36:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen
+    restart: always
+    runtime: nvidia
+    ipc: host
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - HF_HUB_ENABLE_HF_TRANSFER=1  # Speeds up loading if the model has to be downloaded
+    depends_on:
+      gpt-oss:
+        condition: service_healthy
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --served-model-name qwen-3.6-blackwell
diff --git a/vLLM/NVIDIA-SPARK/old/docker-compose.yml_04 b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_04
new file mode 100644
index 0000000..2b04d5b
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_04
@@ -0,0 +1,49 @@
+services:
+  gpt-oss:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gpt-oss
+    restart: always
+    ipc: host
+    shm_size: '16gb'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    command: >
+      vllm serve openai/gpt-oss-20b
+      --gpu-memory-utilization 0.35
+      --trust-remote-code
+
+  qwen-36:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen
+    restart: always
+    ipc: host
+    shm_size: '16gb'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    depends_on:
+      - gpt-oss
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    entrypoint: /bin/sh -c "sleep 300 && vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 --quantization fp8 --kv-cache-dtype fp8 --max-model-len 32768 --gpu-memory-utilization 0.40 --trust-remote-code --served-model-name qwen-3.6-blackwell"
diff --git a/vLLM/NVIDIA-SPARK/old/docker-compose.yml_05 b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_05
new file mode 100644
index 0000000..b06cde4
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_05
@@ -0,0 +1,62 @@
+services:
+  gpt-oss:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gpt-oss
+    restart: always
+    ipc: host
+    environment:
+      - HF_MODEL_HANDLE=openai/gpt-oss-20b
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve openai/gpt-oss-20b
+      --gpu-memory-utilization 0.40
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+
+  qwen-blackwell:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-blackwell
+    restart: always
+    ipc: host
+    depends_on:
+      gpt-oss:
+        condition: service_healthy  # Wait for the first service to report "healthy"
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-blackwell
+
diff --git a/vLLM/NVIDIA-SPARK/old/docker-compose.yml_ok b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_ok
new file mode 100644
index 0000000..b06cde4
--- /dev/null
+++ b/vLLM/NVIDIA-SPARK/old/docker-compose.yml_ok
@@ -0,0 +1,62 @@
+services:
+  gpt-oss:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-gpt-oss
+    restart: always
+    ipc: host
+    environment:
+      - HF_MODEL_HANDLE=openai/gpt-oss-20b
+    ports:
+      - "8000:8000"
+    volumes:
+      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve openai/gpt-oss-20b
+      --gpu-memory-utilization 0.40
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --trust-remote-code
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 300s
+
+  qwen-blackwell:
+    image: nvcr.io/nvidia/vllm:26.03.post1-py3
+    container_name: vllm-qwen-blackwell
+    restart: always
+    ipc: host
+    depends_on:
+      gpt-oss:
+        condition: service_healthy  # Wait for the first service to report "healthy"
+    ports:
+      - "8001:8000"
+    volumes:
+      - /root/.cache/huggingface:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    command: >
+      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
+      --quantization fp8
+      --kv-cache-dtype fp8
+      --max-model-len 32768
+      --gpu-memory-utilization 0.40
+      --trust-remote-code
+      --enable-auto-tool-choice
+      --tool-call-parser hermes
+      --served-model-name qwen-3.6-blackwell
+
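
Usage note: every variant above exposes two OpenAI-compatible vLLM endpoints on the host, gpt-oss on port 8000 and Qwen on port 8001. The snippet below is a minimal smoke-test sketch, not part of the diff, assuming the docker-compose.yml_ok stack is running on localhost and the third-party requests package is installed; in that file gpt-oss is served without --served-model-name, so it keeps its Hugging Face ID, while Qwen is served as qwen-3.6-blackwell.

# Minimal smoke test for the two vLLM endpoints defined above (sketch,
# assumes the docker-compose.yml_ok stack is up on localhost).
import requests

ENDPOINTS = {
    "openai/gpt-oss-20b": "http://localhost:8000",   # gpt-oss service
    "qwen-3.6-blackwell": "http://localhost:8001",   # qwen-blackwell service
}

for model, base_url in ENDPOINTS.items():
    # Same endpoint the compose healthchecks poll with curl.
    requests.get(f"{base_url}/health", timeout=10).raise_for_status()

    # Minimal OpenAI-compatible chat completion request.
    resp = requests.post(
        f"{base_url}/v1/chat/completions",
        json={
            "model": model,
            "messages": [{"role": "user", "content": "Reply with one word."}],
            "max_tokens": 16,
        },
        timeout=120,
    )
    resp.raise_for_status()
    print(model, "->", resp.json()["choices"][0]["message"]["content"])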