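# Docker Compose stack: llama.cpp inference servers (GGUF models) plus the redaction app.
# Exactly one model backend is enabled per run via Compose profiles, e.g.:
#
#   docker compose --profile 27b_36 up        # Qwen3.6-27B on host port 8000
#   docker compose --profile gemma4-26b up    # gemma-4-26B-A4B on host port 8002
#
# Host ports per profile: 8000 (27b, 27b_36), 8001 (35b), 8002 (gemma4-*),
# 8003 (9b), 8005 (35b_36). The app itself is served on http://localhost:7861.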
services:

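  # All model services follow the same pattern: weights are pulled from Hugging Face
  # on first start (cached in the named volumes below), the server listens on 8080
  # inside the container, and the healthcheck polls /v1/models with a long
  # start_period to allow for the initial download. A3B (MoE) variants pass
  # --n-cpu-moe 0 to keep all expert weights on the GPU; the q8_0 K/V cache
  # quantization roughly halves KV-cache VRAM versus f16.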
  qwen36-27b_q4_gguf:
    profiles: ["27b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-27B-GGUF
      - --hf-file
      - Qwen3.6-27B-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-27B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

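  # Example request against the active backend (a sketch: llama-server exposes an
  # OpenAI-compatible API; adjust the host port to match the chosen profile):
  #
  #   curl http://localhost:8000/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"messages": [{"role": "user", "content": "Hello"}]}'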
  qwen36-35b_q4_gguf:
    profiles: ["35b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8005:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  qwen35-35b_q4_gguf:
    profiles: ["35b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.5-35B-A3B-GGUF
      - --hf-file
      - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8001:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  qwen35-27b_q4_gguf:
    profiles: ["27b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.5-27B-GGUF
      - --hf-file
      - Qwen3.5-27B-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  qwen9b_q4_gguf:
    profiles: ["9b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.5-9B-GGUF
      - --hf-file
      - Qwen3.5-9B-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "16384"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --image-min-tokens
      - "300"
    ports:
      - "8003:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen9b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen9b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

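  # The Gemma services below use the sampler settings published for Gemma models
  # (temp 1.0, top-k 64, top-p 0.95) rather than the Qwen settings above.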
  gemma4-31b_q4_gguf:
    profiles: ["gemma4-31b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/gemma-4-31B-it-GGUF
      - --hf-file
      - gemma-4-31B-it-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "16384"
      - --fit
      - "off"
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --image-min-tokens
      - "300"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-31b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  gemma4-26b_q4_gguf:
    profiles: ["gemma4-26b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/gemma-4-26B-A4B-it-GGUF
      - --hf-file
      - gemma-4-26B-A4B-it-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "65536"
      - --fit
      - "off"
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --image-min-tokens
      - "300"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-26b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

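  # The app declares an optional dependency on every model service; with
  # `required: false`, only the service enabled by the active profile must
  # report healthy before the app starts.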
  redaction-app-llama:
    profiles: ["35b_36", "27b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"]
    image: redaction-app-main
    build:
      context: .
      dockerfile: Dockerfile
      target: gradio
      args:
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    depends_on:
      qwen36-35b_q4_gguf:
        condition: service_healthy
        required: false
      qwen36-27b_q4_gguf:
        condition: service_healthy
        required: false
      qwen35-35b_q4_gguf:
        condition: service_healthy
        required: false
      qwen35-27b_q4_gguf:
        condition: service_healthy
        required: false
      qwen9b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-31b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-26b_q4_gguf:
        condition: service_healthy
        required: false
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - INFERENCE_SERVER_API_URL=http://llama-inference:8080
      # Left empty on purpose; in list-form environment entries, quotes would be
      # taken literally as part of the value.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=16384
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7861:7860"
    networks:
      - redaction-net-llama

networks:
  redaction-net-llama:
    driver: bridge

volumes:
  hf-llama-cache-qwen36-35b:
  hf-llama-cache-qwen36-27b:
  hf-llama-cache-qwen35-35b:
  hf-llama-cache-qwen35-27b:
  hf-llama-cache-qwen9b:
  hf-llama-cache-gemma4-31b:
  hf-llama-cache-gemma4-26b:
  hf-hub-cache-qwen36-35b:
  hf-hub-cache-qwen36-27b:
  hf-hub-cache-qwen35-35b:
  hf-hub-cache-qwen35-27b:
  hf-hub-cache-qwen9b:
  hf-hub-cache-gemma4-31b:
  hf-hub-cache-gemma4-26b: