"""
Build the ChromaDB index offline.

Run this script locally and push the resulting database to HF (presumably
Hugging Face - confirm), so the build does not hit a timeout there.
"""
import sys
import os

# On Windows, switch the standard streams to UTF-8 and replace characters
# that cannot be encoded instead of raising.
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        # A stream may be None (e.g. under pythonw) or a replacement object
        # without reconfigure(); skip it rather than crash before the build.
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(encoding="utf-8", errors="replace")

import logging

logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Make the "backend" directory next to this script importable, so the
# "app.*" imports below resolve.
_BACKEND_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "backend"))
sys.path.insert(0, _BACKEND_DIR)

# Absolute locations of the source data and of the ChromaDB to be written.
DATA_DIR = os.path.join(_BACKEND_DIR, "data")
CHROMA_DB_PATH = os.path.join(DATA_DIR, "chroma_db")

# Override the configured paths BEFORE importing chat_service.
# NOTE(review): presumably chat_service reads these values from the config
# module at import time - confirm before reordering these statements.
import app.core.config as cfg  # noqa: E402  (must follow the sys.path change)

cfg.CHROMA_DB_PATH = CHROMA_DB_PATH
cfg.DATA_DIR = DATA_DIR

import app.services.chat_service as svc  # noqa: E402
from app.services.chat_service import _build_retrievers  # noqa: E402

# NOTE(review): presumably makes _build_retrievers rebuild the database even
# if one already exists - verify against chat_service.
svc.FORCE_REBUILD_DB = True

print("=" * 60)
print(f"DATA_DIR : {DATA_DIR}")
print(f"CHROMA_DB_PATH: {CHROMA_DB_PATH}")
print("=" * 60)

# Run the build. Only `splits` is used below (for its length).
fast_ret, deep_ret, splits = _build_retrievers(DATA_DIR, CHROMA_DB_PATH)

# Report what was written. isdir() (rather than exists()) ensures a stray
# regular file at CHROMA_DB_PATH is not reported as a successful build.
if os.path.isdir(CHROMA_DB_PATH):
    # Collect the file count and the total size in one traversal.
    total_files = 0
    total_bytes = 0
    for root, _dirs, files in os.walk(CHROMA_DB_PATH):
        total_files += len(files)
        total_bytes += sum(
            os.path.getsize(os.path.join(root, name)) for name in files
        )
    total_size_mb = total_bytes / (1024 * 1024)

    print("=" * 60)
    print(f"SUCCESS! ChromaDB created at: {CHROMA_DB_PATH}")
    print(f" Files: {total_files}")
    print(f" Size : {total_size_mb:.1f} MB")
    print(f" Chunks indexed: {len(splits)}")
    print("=" * 60)
else:
    print("=" * 60)
    print(f"ERROR: ChromaDB directory NOT found at {CHROMA_DB_PATH}")
    print("=" * 60)
    # Non-zero exit status so callers can detect the failed build.
    sys.exit(1)