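# Docker Compose stack: llama.cpp inference servers (GGUF models) plus the redaction app.
# Exactly one model backend is enabled per run via Compose profiles, e.g.:
#
#   docker compose --profile 27b_36 up        # Qwen3.6-27B on host port 8000
#   docker compose --profile gemma4-26b up    # gemma-4-26B-A4B on host port 8002
#
# Host ports per profile: 8000 (27b, 27b_36), 8001 (35b), 8002 (gemma4-*),
# 8003 (9b), 8005 (35b_36). The app itself is served on http://localhost:7861.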
services:

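  # All model services follow the same pattern: weights are pulled from Hugging Face
  # on first start (cached in the named volumes below), the server listens on 8080
  # inside the container, and the healthcheck polls /v1/models with a long
  # start_period to allow for the initial download. A3B (MoE) variants pass
  # --n-cpu-moe 0 to keep all expert weights on the GPU; the q8_0 K/V cache
  # quantization roughly halves KV-cache VRAM versus f16.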
  qwen36-27b_q4_gguf:
    profiles: ["27b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-27B-GGUF
      - --hf-file
      - Qwen3.6-27B-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-27B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

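  # Example request against the active backend (a sketch: llama-server exposes an
  # OpenAI-compatible API; adjust the host port to match the chosen profile):
  #
  #   curl http://localhost:8000/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"messages": [{"role": "user", "content": "Hello"}]}'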
  qwen36-35b_q4_gguf:
    profiles: ["35b_36"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.6-35B-A3B-GGUF
      - --hf-file
      - Qwen3.6-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8005:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen36-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen36-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  qwen35-35b_q4_gguf:
    profiles: ["35b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.5-35B-A3B-GGUF
      - --hf-file
      - Qwen3.5-35B-A3B-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8001:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-35b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-35b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  qwen35-27b_q4_gguf:
    profiles: ["27b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.5-27B-GGUF
      - --hf-file
      - Qwen3.5-27B-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "32768"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --image-min-tokens
      - "300"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
    ports:
      - "8000:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen35-27b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen35-27b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  qwen9b_q4_gguf:
    profiles: ["9b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/Qwen3.5-9B-GGUF
      - --hf-file
      - Qwen3.5-9B-UD-Q4_K_XL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "16384"
      - --fit
      - "off"
      - --temp
      - "0.7"
      - --top-k
      - "20"
      - --top-p
      - "0.8"
      - --min-p
      - "0.0"
      - --frequency-penalty
      - "1"
      - --presence-penalty
      - "1"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --n-cpu-moe
      - "0"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --image-min-tokens
      - "300"
    ports:
      - "8003:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-qwen9b:/root/.cache/llama.cpp
      - hf-hub-cache-qwen9b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

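  # The Gemma services below use the sampler settings published for Gemma models
  # (temp 1.0, top-k 64, top-p 0.95) rather than the Qwen settings above.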
  gemma4-31b_q4_gguf:
    profiles: ["gemma4-31b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/gemma-4-31B-it-GGUF
      - --hf-file
      - gemma-4-31B-it-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "16384"
      - --fit
      - "off"
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --image-min-tokens
      - "300"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-31b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-31b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

  gemma4-26b_q4_gguf:
    profiles: ["gemma4-26b"]
    image: ghcr.io/ggml-org/llama.cpp:server-cuda12
    command:
      - -hf
      - unsloth/gemma-4-26B-A4B-it-GGUF
      - --hf-file
      - gemma-4-26B-A4B-it-UD-IQ4_NL.gguf
      - --mmproj-url
      - https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF/resolve/main/mmproj-F16.gguf
      - --n-gpu-layers
      - "-1"
      - --ctx-size
      - "65536"
      - --fit
      - "off"
      - --temp
      - "1.0"
      - --top-k
      - "64"
      - --top-p
      - "0.95"
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --no-warmup
      - --seed
      - "42"
      - --cache-type-k
      - "q8_0"
      - --cache-type-v
      - "q8_0"
      - --image-min-tokens
      - "300"
    ports:
      - "8002:8080"
    volumes:
      - ./models:/models
      - hf-llama-cache-gemma4-26b:/root/.cache/llama.cpp
      - hf-hub-cache-gemma4-26b:/root/.cache/huggingface
    pull_policy: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8080/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      start_period: 1200s
    networks:
      redaction-net-llama:
        aliases:
          - llama-inference

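  # The app declares an optional dependency on every model service; with
  # `required: false`, only the service enabled by the active profile must
  # report healthy before the app starts.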
  redaction-app-llama:
    profiles: ["35b_36", "27b_36", "35b", "27b", "9b", "gemma4-31b", "gemma4-26b"]
    image: redaction-app-main
    build:
      context: .
      dockerfile: Dockerfile
      target: gradio
      args:
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    depends_on:
      qwen36-35b_q4_gguf:
        condition: service_healthy
        required: false
      qwen36-27b_q4_gguf:
        condition: service_healthy
        required: false
      qwen35-35b_q4_gguf:
        condition: service_healthy
        required: false
      qwen35-27b_q4_gguf:
        condition: service_healthy
        required: false
      qwen9b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-31b_q4_gguf:
        condition: service_healthy
        required: false
      gemma4-26b_q4_gguf:
        condition: service_healthy
        required: false
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_LOCAL_PII_DETECTION_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - INFERENCE_SERVER_API_URL=http://llama-inference:8080
      # Left empty on purpose; in list-form environment entries, quotes would be
      # taken literally as part of the value.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1
      - LOAD_PADDLE_AT_STARTUP=False
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=16384
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7861:7860"
    networks:
      - redaction-net-llama

networks:
  redaction-net-llama:
    driver: bridge

volumes:
  hf-llama-cache-qwen36-35b:
  hf-llama-cache-qwen36-27b:
  hf-llama-cache-qwen35-35b:
  hf-llama-cache-qwen35-27b:
  hf-llama-cache-qwen9b:
  hf-llama-cache-gemma4-31b:
  hf-llama-cache-gemma4-26b:
  hf-hub-cache-qwen36-35b:
  hf-hub-cache-qwen36-27b:
  hf-hub-cache-qwen35-35b:
  hf-hub-cache-qwen35-27b:
  hf-hub-cache-qwen9b:
  hf-hub-cache-gemma4-31b:
  hf-hub-cache-gemma4-26b: