# Main startup logic (run as: bash /app/scripts/entrypoint_body.sh). # entrypoint.sh is generated in the Dockerfile so HF can exec it without CRLF/BOM issues.
set -euo pipefail
cd /app

# Same-container vLLM: PyTorch may call getpass.getuser() before USER is set in some runtimes.
export USER="${USER:-huggingface}"
export LOGNAME="${LOGNAME:-$USER}"
export TORCHINDUCTOR_CACHE_DIR="${TORCHINDUCTOR_CACHE_DIR:-/tmp/torch_inductor_cache}"
export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-/tmp/triton_cache}"
# Prepend the vendored rllm checkout. Use ${PYTHONPATH:+:...} so an unset/empty
# PYTHONPATH does not leave a trailing ':' — an empty PYTHONPATH component makes
# Python add the current working directory to sys.path, which is not intended here.
export PYTHONPATH="/app/vendor/rllm${PYTHONPATH:+:$PYTHONPATH}"

# Optional: load Space secrets copied to this path.
# set -a exports every variable assigned while sourcing the env file.
if [[ -f /app/.env.gen_image ]]; then
  set -a
  # shellcheck source=/dev/null
  source /app/.env.gen_image
  set +a
fi

# Misconfiguration hints: if no local GenSearcher vLLM will be started, warn when
# OPENAI_BASE_URL points at loopback (nothing will listen there) or is missing.
if [[ "${START_VLLM_GENSEARCHER:-0}" != "1" ]]; then
  case "${OPENAI_BASE_URL:-}" in
    *127.0.0.1*|*localhost*)
      echo "[entrypoint] WARNING: OPENAI_BASE_URL points to loopback but START_VLLM_GENSEARCHER is not 1."
      echo "[entrypoint] The GenSearcher agent will get 'Connection error' unless a server listens here,"
      echo "[entrypoint] or you set OPENAI_BASE_URL to an external OpenAI-compatible URL (ending in /v1)."
      ;;
  esac
  if [[ -z "${OPENAI_BASE_URL:-}" ]]; then
    echo "[entrypoint] OPENAI_BASE_URL is unset. For GenSearcher **inside this Space only**, set Space variable"
    echo "[entrypoint] START_VLLM_GENSEARCHER=1 (entrypoint will start vLLM here and set OPENAI_BASE_URL to loopback)."
  fi
fi

#######################################
# Poll an HTTP endpoint until it answers 2xx or attempts run out.
# Arguments:
#   $1 - URL to poll
#   $2 - human-readable service name (used in log lines)
#   $3 - max attempts, polled 2s apart (default 90 => ~3 minutes)
# Outputs:
#   progress to stdout; timeout error to stderr
# Exits:
#   1 on timeout (intentionally aborts the whole entrypoint)
#######################################
wait_http() {
  local url=$1
  local name=$2
  local max_attempts=${3:-90}
  local i=0
  echo "[entrypoint] Waiting for ${name} (${url})..."
  until curl -sf "$url" >/dev/null 2>&1; do
    i=$((i + 1))
    if [[ $i -ge $max_attempts ]]; then
      echo "[entrypoint] Timeout waiting for ${name}" >&2
      exit 1
    fi
    sleep 2
  done
  echo "[entrypoint] ${name} is up."
}

# Defaults: only FireRed + Gradio in-container. Point OPENAI_BASE_URL / BROWSE_SUMMARY_BASE_URL
# to your vLLM (or other OpenAI-compatible) endpoints via Space secrets.
# --- Optional local vLLM: GenSearcher-8B (OpenAI-compatible) ---
if [[ "${START_VLLM_GENSEARCHER:-0}" == "1" ]]; then
  # Collect flags in an array so the launch line stays readable; every value is
  # overridable via Space variables, with sane single-GPU defaults.
  gensearcher_args=(
    --host 0.0.0.0
    --port 8002
    --tensor-parallel-size "${GENSEARCHER_TP:-1}"
    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}"
    --served-model-name "${GEN_EVAL_MODEL:-Gen-Searcher-8B}"
    --max-model-len "${GENSEARCHER_MAX_MODEL_LEN:-65536}"
    --no-enable-prefix-caching
  )
  CUDA_VISIBLE_DEVICES="${GENSEARCHER_CUDA_VISIBLE_DEVICES:-0}" \
    vllm serve "${GENSEARCHER_MODEL_ID:-GenSearcher/Gen-Searcher-8B}" "${gensearcher_args[@]}" &
  wait_http "http://127.0.0.1:8002/v1/models" "GenSearcher vLLM"
  # Only fill in the loopback URL when the caller did not supply an external one.
  export OPENAI_BASE_URL="${OPENAI_BASE_URL:-http://127.0.0.1:8002/v1}"
fi

# --- Optional local vLLM: browse summarization (Qwen3-VL) ---
if [[ "${START_VLLM_BROWSE:-0}" == "1" ]]; then
  export BROWSE_GENERATE_ENGINE=vllm
  browse_args=(
    --host 0.0.0.0
    --port 8003
    --tensor-parallel-size "${BROWSE_TP:-1}"
    --gpu-memory-utilization "${VLLM_GPU_MEMORY_UTIL:-0.85}"
    --served-model-name "${BROWSE_SUMMARY_MODEL:-Qwen3-VL-30B-A3B-Instruct}"
    --max-model-len "${BROWSE_MAX_MODEL_LEN:-65536}"
    --mm-processor-cache-gb 0
    --no-enable-prefix-caching
  )
  CUDA_VISIBLE_DEVICES="${BROWSE_CUDA_VISIBLE_DEVICES:-1}" \
    vllm serve "${BROWSE_MODEL_ID:-Qwen/Qwen3-VL-30B-A3B-Instruct}" "${browse_args[@]}" &
  wait_http "http://127.0.0.1:8003/v1/models" "Browse-summary vLLM"
  export BROWSE_SUMMARY_BASE_URL="${BROWSE_SUMMARY_BASE_URL:-http://127.0.0.1:8003/v1}"
fi

# --- FireRed adapter (GenSearcher /generate contract) ---
if [[ "${START_FIRERED_API:-1}" == "1" ]]; then
  CUDA_VISIBLE_DEVICES="${FIRERED_CUDA_VISIBLE_DEVICES:-0}" \
    python -m uvicorn services.firered_generate:app --host 0.0.0.0 --port 8765 &
  # Longer budget (120 attempts) — model load behind /health can be slow.
  wait_http "http://127.0.0.1:8765/health" "FireRed API" 120
  export QWEN_EDIT_APP_URL="${QWEN_EDIT_APP_URL:-http://127.0.0.1:8765}"
else
  echo "[entrypoint] START_FIRERED_API=0 — use external QWEN_EDIT_APP_URL for generation."
fi

# Hand off PID 1 to the Gradio app; backgrounded servers stay in this container.
exec python app.py