diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_00 b/vLLM/NVIDIA-SPARK/docker-compose.yml_00 deleted file mode 100644 index 851d7c8..0000000 --- a/vLLM/NVIDIA-SPARK/docker-compose.yml_00 +++ /dev/null @@ -1,37 +0,0 @@ -services: - qwen-36: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-qwen - restart: always - runtime: nvidia - ipc: host - environment: - - NVIDIA_VISIBLE_DEVICES=all - ports: - - "8001:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 - --quantization fp8 - --kv-cache-dtype fp8 - --max-model-len 32768 - --gpu-memory-utilization 0.40 - --trust-remote-code - --served-model-name qwen-3.6-blackwell - - gpt-oss: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-gpt-oss - restart: always - runtime: nvidia - ipc: host - environment: - - NVIDIA_VISIBLE_DEVICES=all - ports: - - "8000:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve openai/gpt-oss-20b - --gpu-memory-utilization 0.40 diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_01 b/vLLM/NVIDIA-SPARK/docker-compose.yml_01 deleted file mode 100644 index 56a03f0..0000000 --- a/vLLM/NVIDIA-SPARK/docker-compose.yml_01 +++ /dev/null @@ -1,39 +0,0 @@ -services: - qwen-36: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-qwen - restart: always - runtime: nvidia - ipc: host - environment: - - NVIDIA_VISIBLE_DEVICES=all - - HF_HUB_ENABLE_HF_TRANSFER=1 # Accelera il caricamento se scarica - ports: - - "8001:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 - --quantization fp8 - --kv-cache-dtype fp8 - --max-model-len 32768 - --gpu-memory-utilization 0.40 - --trust-remote-code - --served-model-name qwen-3.6-blackwell - - gpt-oss: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-gpt-oss - restart: always - runtime: nvidia - ipc: host - environment: - - NVIDIA_VISIBLE_DEVICES=all - ports: - - "8000:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve openai/gpt-oss-20b - --gpu-memory-utilization 0.40 - --served-model-name gpt-oss-20b diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_02 b/vLLM/NVIDIA-SPARK/docker-compose.yml_02 deleted file mode 100644 index 05ae3c1..0000000 --- a/vLLM/NVIDIA-SPARK/docker-compose.yml_02 +++ /dev/null @@ -1,50 +0,0 @@ -services: - # Primo modello: GPT-OSS (più veloce da caricare) - gpt-oss: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-gpt-oss - restart: always - runtime: nvidia - ipc: host - environment: - - NVIDIA_VISIBLE_DEVICES=all - ports: - - "8000:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve openai/gpt-oss-20b - --gpu-memory-utilization 0.40 - --served-model-name gpt-oss-20b - healthcheck: - test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] - interval: 30s - timeout: 10s - retries: 10 - start_period: 60s - - # Secondo modello: Qwen (parte solo quando GPT-OSS è pronto) - qwen-36: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-qwen - restart: always - runtime: nvidia - ipc: host - environment: - - NVIDIA_VISIBLE_DEVICES=all - - HF_HUB_ENABLE_HF_TRANSFER=1 # Accelera il caricamento se scarica - depends_on: - gpt-oss: - condition: service_healthy - ports: - - "8001:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 - --quantization fp8 - --kv-cache-dtype fp8 - --max-model-len 32768 - --gpu-memory-utilization 0.40 - --trust-remote-code - --served-model-name qwen-3.6-blackwell diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_04 b/vLLM/NVIDIA-SPARK/docker-compose.yml_04 deleted file mode 100644 index 2b04d5b..0000000 --- a/vLLM/NVIDIA-SPARK/docker-compose.yml_04 +++ /dev/null @@ -1,49 +0,0 @@ -services: - gpt-oss: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-gpt-oss - restart: always - ipc: host - shm_size: '16gb' - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - environment: - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=compute,utility - ports: - - "8000:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - command: > - vllm serve openai/gpt-oss-20b - --gpu-memory-utilization 0.35 - --trust-remote-code - - qwen-36: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-qwen - restart: always - ipc: host - shm_size: '16gb' - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - depends_on: - - gpt-oss - environment: - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=compute,utility - ports: - - "8001:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - entrypoint: /bin/sh -c "sleep 300 && vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 --quantization fp8 --kv-cache-dtype fp8 --max-model-len 32768 --gpu-memory-utilization 0.40 --trust-remote-code --served-model-name qwen-3.6-blackwell" diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_05 b/vLLM/NVIDIA-SPARK/docker-compose.yml_05 deleted file mode 100644 index b06cde4..0000000 --- a/vLLM/NVIDIA-SPARK/docker-compose.yml_05 +++ /dev/null @@ -1,62 +0,0 @@ -services: - gpt-oss: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-gpt-oss - restart: always - ipc: host - environment: - - HF_MODEL_HANDLE=openai/gpt-oss-20b - ports: - - "8000:8000" - volumes: - - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - command: > - vllm serve openai/gpt-oss-20b - --gpu-memory-utilization 0.40 - --kv-cache-dtype fp8 - --max-model-len 32768 - --trust-remote-code - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] - interval: 30s - timeout: 10s - retries: 5 - start_period: 300s - - qwen-blackwell: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-qwen-blackwell - restart: always - ipc: host - depends_on: - gpt-oss: - condition: service_healthy # Aspetta che il primo sia "Healthy" - ports: - - "8001:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - command: > - vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 - --quantization fp8 - --kv-cache-dtype fp8 - --max-model-len 32768 - --gpu-memory-utilization 0.40 - --trust-remote-code - --enable-auto-tool-choice - --tool-call-parser hermes - --served-model-name qwen-3.6-blackwell - diff --git a/vLLM/NVIDIA-SPARK/docker-compose.yml_ok b/vLLM/NVIDIA-SPARK/docker-compose.yml_ok deleted file mode 100644 index b06cde4..0000000 --- a/vLLM/NVIDIA-SPARK/docker-compose.yml_ok +++ /dev/null @@ -1,62 +0,0 @@ -services: - gpt-oss: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-gpt-oss - restart: always - ipc: host - environment: - - HF_MODEL_HANDLE=openai/gpt-oss-20b - ports: - - "8000:8000" - volumes: - - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - command: > - vllm serve openai/gpt-oss-20b - --gpu-memory-utilization 0.40 - --kv-cache-dtype fp8 - --max-model-len 32768 - --trust-remote-code - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] - interval: 30s - timeout: 10s - retries: 5 - start_period: 300s - - qwen-blackwell: - image: nvcr.io/nvidia/vllm:26.03.post1-py3 - container_name: vllm-qwen-blackwell - restart: always - ipc: host - depends_on: - gpt-oss: - condition: service_healthy # Aspetta che il primo sia "Healthy" - ports: - - "8001:8000" - volumes: - - /root/.cache/huggingface:/root/.cache/huggingface - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - command: > - vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 - --quantization fp8 - --kv-cache-dtype fp8 - --max-model-len 32768 - --gpu-memory-utilization 0.40 - --trust-remote-code - --enable-auto-tool-choice - --tool-call-parser hermes - --served-model-name qwen-3.6-blackwell -