Aggiunta folder old con vecchi files

2026-05-04 16:59:06 +00:00
parent a289a66c1c
commit e7a9fba1a3
6 changed files with 299 additions and 0 deletions
@@ -0,0 +1,37 @@
 services:
  # vLLM server for a local Qwen3.6-35B-A3B FP8 snapshot, exposed to the host
  # on port 8001 under the model name "qwen-3.6-blackwell".
  qwen-36:
    container_name: vllm-qwen
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    runtime: nvidia
    ipc: host
    restart: always
    environment:
      NVIDIA_VISIBLE_DEVICES: all
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container;
      # the snapshot path passed to `vllm serve` lives under this mount.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8001 -> container 8000
      - "8001:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name qwen-3.6-blackwell
  # vLLM server for openai/gpt-oss-20b, exposed to the host on port 8000.
  gpt-oss:
    container_name: vllm-gpt-oss
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    runtime: nvidia
    ipc: host
    restart: always
    environment:
      NVIDIA_VISIBLE_DEVICES: all
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8000 -> container 8000
      - "8000:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    # No --served-model-name is given here.
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40
@@ -0,0 +1,39 @@
 services:
  # vLLM server for a local Qwen3.6-35B-A3B FP8 snapshot, exposed to the host
  # on port 8001 under the model name "qwen-3.6-blackwell".
  qwen-36:
    container_name: vllm-qwen
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    runtime: nvidia
    ipc: host
    restart: always
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      # Speeds up loading when a download is needed. The serve command below
      # points at a local snapshot path.
      HF_HUB_ENABLE_HF_TRANSFER: "1"
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8001 -> container 8000
      - "8001:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name qwen-3.6-blackwell
  # vLLM server for openai/gpt-oss-20b, exposed to the host on port 8000
  # under the model name "gpt-oss-20b".
  gpt-oss:
    container_name: vllm-gpt-oss
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    runtime: nvidia
    ipc: host
    restart: always
    environment:
      NVIDIA_VISIBLE_DEVICES: all
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8000 -> container 8000
      - "8000:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40
      --served-model-name gpt-oss-20b
@@ -0,0 +1,50 @@
 services:
  # First model: GPT-OSS (faster to load). Its healthcheck gates the start of
  # the qwen-36 service, which depends on this one being healthy.
  gpt-oss:
    container_name: vllm-gpt-oss
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    runtime: nvidia
    ipc: host
    restart: always
    environment:
      NVIDIA_VISIBLE_DEVICES: all
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8000 -> container 8000
      - "8000:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40
      --served-model-name gpt-oss-20b
    healthcheck:
      # Probes the /health endpoint on the container's own port 8000.
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      # 60s start period, then a probe every 30s with a 10s timeout; 10 retries.
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 10
  # Second model: Qwen (starts only once GPT-OSS is ready).
  qwen-36:
    container_name: vllm-qwen
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    runtime: nvidia
    ipc: host
    restart: always
    depends_on:
      gpt-oss:
        # Start only after the gpt-oss healthcheck reports healthy.
        condition: service_healthy
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      # Speeds up loading when a download is needed. The serve command below
      # points at a local snapshot path.
      HF_HUB_ENABLE_HF_TRANSFER: "1"
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8001 -> container 8000
      - "8001:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --served-model-name qwen-3.6-blackwell
@@ -0,0 +1,49 @@
 services:
  # vLLM server for openai/gpt-oss-20b, exposed to the host on port 8000.
  # GPU access is requested through a deploy-level device reservation.
  gpt-oss:
    container_name: vllm-gpt-oss
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    restart: always
    ipc: host
    # NOTE(review): shm_size is set alongside `ipc: host`; confirm whether it
    # has any effect when the host IPC namespace is shared.
    shm_size: '16gb'
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8000 -> container 8000
      - "8000:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.35
      --trust-remote-code
  # vLLM server for a local Qwen3.6-35B-A3B FP8 snapshot, exposed to the host
  # on port 8001. Started after gpt-oss, with a fixed 300 second delay.
  qwen-36:
    container_name: vllm-qwen
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    restart: always
    ipc: host
    # NOTE(review): shm_size is set alongside `ipc: host`; confirm whether it
    # has any effect when the host IPC namespace is shared.
    shm_size: '16gb'
    # Short-form dependency: only start order is expressed here, there is no
    # health condition on gpt-oss.
    depends_on:
      - gpt-oss
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container.
      - /root/.cache/huggingface:/root/.cache/huggingface
    ports:
      # host 8001 -> container 8000
      - "8001:8000"
    # Replaces the image entrypoint with a shell that sleeps 300 seconds and
    # then runs `vllm serve`; presumably this gives gpt-oss time to finish
    # loading first -- TODO confirm.
    entrypoint: /bin/sh -c "sleep 300 && vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2 --quantization fp8 --kv-cache-dtype fp8 --max-model-len 32768 --gpu-memory-utilization 0.40 --trust-remote-code --served-model-name qwen-3.6-blackwell"
@@ -0,0 +1,62 @@
 services:
  # vLLM server for openai/gpt-oss-20b, exposed to the host on port 8000.
  # Its healthcheck gates the start of the qwen-blackwell service.
  gpt-oss:
    container_name: vllm-gpt-oss
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    ipc: host
    restart: always
    environment:
      # NOTE(review): nothing visible in this file reads HF_MODEL_HANDLE;
      # confirm it is consumed by the image.
      HF_MODEL_HANDLE: openai/gpt-oss-20b
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # Only the `hub` subdirectory of the host Hugging Face cache is mounted.
      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
    ports:
      # host 8000 -> container 8000
      - "8000:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
    healthcheck:
      # Probes the /health endpoint on the container's own port 8000.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      # 300s start period, then a probe every 30s with a 10s timeout; 5 retries.
      start_period: 300s
      interval: 30s
      timeout: 10s
      retries: 5
  # vLLM server for a local Qwen3.6-35B-A3B FP8 snapshot with automatic tool
  # choice enabled (hermes tool-call parser); exposed to the host on port 8001
  # under the model name "qwen-3.6-blackwell".
  qwen-blackwell:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-qwen-blackwell
    restart: always
    ipc: host
    depends_on:
      gpt-oss:
        # Wait until the gpt-oss service reports "healthy" before starting.
        condition: service_healthy
    ports:
      # host 8001 -> container 8000
      - "8001:8000"
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container;
      # the snapshot path passed to `vllm serve` lives under this mount.
      - /root/.cache/huggingface:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined with spaces into one command.
    # Keep the lines free of trailing whitespace, which would otherwise become
    # part of the command string.
    command: >
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-blackwell
@@ -0,0 +1,62 @@
 services:
  # vLLM server for openai/gpt-oss-20b, exposed to the host on port 8000.
  # Its healthcheck gates the start of the qwen-blackwell service.
  gpt-oss:
    container_name: vllm-gpt-oss
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    ipc: host
    restart: always
    environment:
      # NOTE(review): nothing visible in this file reads HF_MODEL_HANDLE;
      # confirm it is consumed by the image.
      HF_MODEL_HANDLE: openai/gpt-oss-20b
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    volumes:
      # Only the `hub` subdirectory of the host Hugging Face cache is mounted.
      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
    ports:
      # host 8000 -> container 8000
      - "8000:8000"
    # Folded scalar: the lines below are joined with spaces into one command.
    command: >
      vllm serve openai/gpt-oss-20b
      --gpu-memory-utilization 0.40
      --kv-cache-dtype fp8
      --max-model-len 32768
      --trust-remote-code
    healthcheck:
      # Probes the /health endpoint on the container's own port 8000.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      # 300s start period, then a probe every 30s with a 10s timeout; 5 retries.
      start_period: 300s
      interval: 30s
      timeout: 10s
      retries: 5
  # vLLM server for a local Qwen3.6-35B-A3B FP8 snapshot with automatic tool
  # choice enabled (hermes tool-call parser); exposed to the host on port 8001
  # under the model name "qwen-3.6-blackwell".
  qwen-blackwell:
    image: nvcr.io/nvidia/vllm:26.03.post1-py3
    container_name: vllm-qwen-blackwell
    restart: always
    ipc: host
    depends_on:
      gpt-oss:
        # Wait until the gpt-oss service reports "healthy" before starting.
        condition: service_healthy
    ports:
      # host 8001 -> container 8000
      - "8001:8000"
    volumes:
      # Host Hugging Face cache mounted at the same path inside the container;
      # the snapshot path passed to `vllm serve` lives under this mount.
      - /root/.cache/huggingface:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Folded scalar: the lines below are joined with spaces into one command.
    # Keep the lines free of trailing whitespace, which would otherwise become
    # part of the command string.
    command: >
      vllm serve /root/.cache/huggingface/hub/models--Qwen--Qwen3.6-35B-A3B-FP8/snapshots/61a5771f218894aaacf97551e24a25b866750fc2
      --quantization fp8
      --kv-cache-dtype fp8
      --max-model-len 32768
      --gpu-memory-utilization 0.40
      --trust-remote-code
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --served-model-name qwen-3.6-blackwell