version: "3.8"

services:
  ollama:
    build: .
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ./.ollama:/root/.ollama
    environment:
      OLLAMA_HOST: 0.0.0.0
      OLLAMA_MAX_LOADED_MODELS: 2
      OLLAMA_NUM_PARALLEL: 8
      OLLAMA_MAX_QUEUE: 96
      OLLAMA_KEEP_ALIVE: 24h
      OLLAMA_FLASH_ATTENTION: 1
      OLLAMA_KV_CACHE_TYPE: q4_0
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always    
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/version"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 20s