File size: 1,336 Bytes
4fdc679
 
 
 
2aa7bf4
4fdc679
33b550a
4fdc679
 
 
 
 
33b550a
 
 
 
 
 
 
 
 
 
4fdc679
 
 
 
 
 
 
 
 
2aa7bf4
33b550a
 
4fdc679
 
33b550a
 
 
6aaa57e
33b550a
 
 
4fdc679
 
 
33b550a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# rag/db/initializer.py
import faiss
import numpy as np
from huggingface_hub import hf_hub_download
from config import HF_DS_REPO_ID, HF_INDEX_FILE, HF_IDS_FILE
from modules.retriever import set_index
from modules.corpus import prepare_corpus, _get_datasets, set_id_to_row

_vector_ids = None

def _load_index_in_memory():
    """Fetch the FAISS index and vector-ID mapping from the HF Hub and load both into memory.

    Side effects: registers the index via ``set_index`` and stores the ID
    array in the module-level ``_vector_ids``.
    """
    # Both artifacts live in the same HF dataset repo.
    index_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_INDEX_FILE, repo_type="dataset"
    )
    ids_path = hf_hub_download(
        repo_id=HF_DS_REPO_ID, filename=HF_IDS_FILE, repo_type="dataset"
    )
    # Hand the deserialized index to the retriever module.
    set_index(faiss.read_index(ids_path if False else index_path))
    global _vector_ids
    # allow_pickle: the ID file may contain object-dtype entries.
    _vector_ids = np.load(ids_path, allow_pickle=True)

def get_vector_ids():
    """Return the vector-ID array loaded by ``_load_index_in_memory``.

    Returns ``None`` if the index has not been loaded yet.
    """
    # No `global` statement needed: reading a module-level name works without it.
    return _vector_ids

def initialize_dbs():
    """Initialize all in-memory stores used by the RAG pipeline.

    Steps:
      1) Prepare the corpus (downloads parquet files on first run).
      2) Load the FAISS index and vector-ID mapping into memory.
      3) Build the page_id -> row lookup table from every dataset subset.
    """
    prepare_corpus()
    _load_index_in_memory()
    datasets = _get_datasets()
    # Subset keys are irrelevant here, so iterate values() only; later
    # subsets overwrite earlier ones on duplicate page_id (same as before).
    id_to_row = {r["page_id"]: r for ds in datasets.values() for r in ds}
    set_id_to_row(id_to_row)

def force_update():
    """Reload the index/ID mapping, replacing the in-memory copies.

    NOTE(review): ``hf_hub_download`` may serve a locally cached file, so this
    only picks up new remote artifacts if the hub cache resolves to a newer
    revision — confirm the intended refresh semantics.
    """
    _load_index_in_memory()