""" Build ChromaDB offline - chay tren local, push len HF de khong bi timeout. """ import sys import os # Fix encoding cho Windows console if sys.platform == "win32": sys.stdout.reconfigure(encoding="utf-8", errors="replace") sys.stderr.reconfigure(encoding="utf-8", errors="replace") import logging logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(asctime)s - %(levelname)s - %(message)s") # Add backend to path sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "backend"))) # Target paths DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "backend", "data")) CHROMA_DB_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "backend", "data", "chroma_db")) # Override module-level constants BEFORE importing the function import app.core.config as cfg cfg.CHROMA_DB_PATH = CHROMA_DB_PATH cfg.DATA_DIR = DATA_DIR import app.services.chat_service as svc from app.services.chat_service import _build_retrievers svc.FORCE_REBUILD_DB = True print("=" * 60) print(f"DATA_DIR : {DATA_DIR}") print(f"CHROMA_DB_PATH: {CHROMA_DB_PATH}") print("=" * 60) # Run the build fast_ret, deep_ret, splits = _build_retrievers(DATA_DIR, CHROMA_DB_PATH) # Verify if os.path.exists(CHROMA_DB_PATH): total_files = sum(len(files) for _, _, files in os.walk(CHROMA_DB_PATH)) total_size_mb = sum(os.path.getsize(os.path.join(r, f)) for r, _, files in os.walk(CHROMA_DB_PATH) for f in files) / (1024*1024) print("=" * 60) print(f"SUCCESS! ChromaDB created at: {CHROMA_DB_PATH}") print(f" Files: {total_files}") print(f" Size : {total_size_mb:.1f} MB") print(f" Chunks indexed: {len(splits)}") print("=" * 60) else: print("=" * 60) print(f"ERROR: ChromaDB directory NOT found at {CHROMA_DB_PATH}") print("=" * 60) sys.exit(1)