DeepMedAI / build_db_offline.py
PBThuong's picture
Thiết lập lại thư viện y khoa sạch và cập nhật chroma_db
8eaa451
"""
Build ChromaDB offline - run locally, then push the result to HF to avoid timeouts.
"""
import sys
import os
# Windows console fix: switch both standard streams to UTF-8 and substitute
# any character that cannot be encoded rather than failing on it.
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding="utf-8", errors="replace")
import logging
# Root logger: INFO and above, written to stdout with a timestamped format.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Anchor every path at the directory that contains this script.
_SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
_BACKEND_DIR = os.path.join(_SCRIPT_DIR, "backend")

# Add backend to path (front of the list so it is searched first).
sys.path.insert(0, _BACKEND_DIR)

# Target paths
DATA_DIR = os.path.join(_BACKEND_DIR, "data")
CHROMA_DB_PATH = os.path.join(DATA_DIR, "chroma_db")
# Override module-level constants BEFORE importing the function.
# The config module is patched first so that modules imported afterwards see
# the paths computed by this script (presumably chat_service reads them from
# app.core.config at import time - TODO confirm). Do not reorder these lines.
import app.core.config as cfg
cfg.CHROMA_DB_PATH = CHROMA_DB_PATH
cfg.DATA_DIR = DATA_DIR
import app.services.chat_service as svc
from app.services.chat_service import _build_retrievers
# Set the flag on the service module itself. Presumably this makes
# _build_retrievers rebuild the database instead of reusing an existing one -
# verify against chat_service.
svc.FORCE_REBUILD_DB = True
# Echo the resolved locations before the build starts.
banner = "=" * 60
print(
    banner,
    f"DATA_DIR : {DATA_DIR}",
    f"CHROMA_DB_PATH: {CHROMA_DB_PATH}",
    banner,
    sep="\n",
)

# Run the build; the call returns three values, of which `splits` is the
# collection whose length is reported as the number of indexed chunks.
fast_ret, deep_ret, splits = _build_retrievers(DATA_DIR, CHROMA_DB_PATH)
# Verify that the build actually left files on disk. A bare existence check
# is not enough: an empty directory, or a regular file at that path, would
# otherwise be reported as a successful build.
total_files = 0
total_size_bytes = 0
chroma_dir_present = os.path.isdir(CHROMA_DB_PATH)
if chroma_dir_present:
    # One pass over the tree collects both the file count and the total size.
    for root, _dirs, files in os.walk(CHROMA_DB_PATH):
        for filename in files:
            total_files += 1
            total_size_bytes += os.path.getsize(os.path.join(root, filename))

if total_files:
    total_size_mb = total_size_bytes / (1024 * 1024)
    print("=" * 60)
    print(f"SUCCESS! ChromaDB created at: {CHROMA_DB_PATH}")
    print(f" Files: {total_files}")
    print(f" Size : {total_size_mb:.1f} MB")
    print(f" Chunks indexed: {len(splits)}")
    print("=" * 60)
else:
    print("=" * 60)
    if chroma_dir_present:
        print(f"ERROR: ChromaDB directory is EMPTY at {CHROMA_DB_PATH}")
    else:
        print(f"ERROR: ChromaDB directory NOT found at {CHROMA_DB_PATH}")
    print("=" * 60)
    # Non-zero exit status so callers and CI can detect the failed build.
    sys.exit(1)