# flaskAPI / Dockerfile
# Uploaded via huggingface_hub by TruVlad (commit 0e2d813, verified)
# Pin the base image so builds are reproducible; bump this tag deliberately
# instead of tracking :latest (hadolint DL3007).
FROM ollama/ollama:0.12.3

# Refresh the package index and apply security updates in a single layer.
# Use apt-get (stable, scriptable CLI) rather than apt (hadolint DL3027),
# and drop the index lists in the same layer so they don't bloat the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && rm -rf /var/lib/apt/lists/*
# Reference: Ollama server tunables (settable via ENV below or at `docker run`):
#   OLLAMA_DEBUG            Show additional debug information (e.g. OLLAMA_DEBUG=1)
#   OLLAMA_HOST             IP address for the ollama server (default 127.0.0.1:11434)
#   OLLAMA_CONTEXT_LENGTH   Context length to use unless otherwise specified (default: 4096)
#   OLLAMA_KEEP_ALIVE       The duration that models stay loaded in memory (default "5m")
#   OLLAMA_MAX_LOADED_MODELS  Maximum number of loaded models per GPU
#   OLLAMA_MAX_QUEUE        Maximum number of queued requests
#   OLLAMA_MODELS           The path to the models directory
#   OLLAMA_NUM_PARALLEL     Maximum number of parallel requests
#   OLLAMA_NOPRUNE          Do not prune model blobs on startup
#   OLLAMA_ORIGINS          A comma separated list of allowed origins
#   OLLAMA_SCHED_SPREAD     Always schedule model across all GPUs
#   OLLAMA_FLASH_ATTENTION  Enabled flash attention
#   OLLAMA_KV_CACHE_TYPE    Quantization type for the K/V cache (default: f16)
#   OLLAMA_LLM_LIBRARY      Set LLM library to bypass autodetection
#   OLLAMA_GPU_OVERHEAD     Reserve a portion of VRAM per GPU (bytes)
#   OLLAMA_LOAD_TIMEOUT

# Runtime configuration, grouped in one instruction: keep models resident for
# a full day, listen on all interfaces on port 7861 (the port this deployment
# serves on), and allow up to 24h for slow model loads.
ENV OLLAMA_KEEP_ALIVE="24h" \
    OLLAMA_HOST=0.0.0.0:7861 \
    OLLAMA_LOAD_TIMEOUT="24h"
# Toolchain for the Flask API: git + g++ for any source builds during pip
# install, python3/python3-pip for the app itself. `update` and `install`
# share one layer so a stale cached index can never be used (hadolint
# DL3009); `--no-install-recommends` and list cleanup keep the layer small.
# (The blanket `apt-get upgrade` that used to sit here was redundant — the
# earlier upgrade layer already applies it.)
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        g++ \
        git \
        python3 \
        python3-pip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy only the dependency manifest before installing, so the pip layer stays
# cached until requirements.txt itself changes (editing a pull script or app
# file no longer reinstalls every package).
COPY requirements.txt requirements.txt
# --break-system-packages is needed on Debian bookworm-based images (PEP 668)
# because packages go into the system interpreter rather than a venv.
RUN pip install --no-cache-dir -r requirements.txt --break-system-packages

# Model-pull helper scripts. The disabled RUN lines are kept so a model can be
# baked into the image at build time by uncommenting the matching line.
COPY pull06.sh pull06.sh
COPY pull17.sh pull17.sh
COPY pull4.sh pull4.sh
COPY pull8.sh pull8.sh
COPY pull14.sh pull14.sh
# RUN /bin/bash -x pull06.sh
# RUN /bin/bash -x pull8.sh
# RUN /bin/bash -x pull14.sh
# Data volumes, declared with explicit absolute paths in JSON form. The
# original relative names resolved against the default WORKDIR "/", so these
# are the same mount points (/vol1, /vol2) made explicit.
VOLUME ["/vol1", "/vol2"]

# Application code and launcher (land in / since no WORKDIR is set).
COPY main.py main.py
COPY util.py util.py
COPY start.sh start.sh

# Exec-form entrypoint (proper PID 1 / signal handling). The script path is
# absolute so startup does not depend on the container's working directory.
# NOTE(review): start.sh presumably launches `ollama serve` plus the Flask
# app — confirm it ends by exec'ing the long-running process.
#ENTRYPOINT ["/usr/bin/ollama", "serve"]
ENTRYPOINT ["/bin/bash", "-x", "/start.sh"]