| import torch.distributed |
| import faiss |
| import pandas as pd |
| import faiss |
| import numpy as np |
| import jsonlines, json |
| from transformers import AutoModel |
| import os |
| import torch |
| ''' |
| data format: |
| { |
| "bibkey": "some_bibkey", |
| "text": "The abstract or text of the paper." |
| } |
| example: |
| { |
| "bibkey": "arxivid1234.5678", |
| "text": "Title: A Study on Something\nAbstract: This paper discusses the findings of a study on something important in the field of research.\nAuthors: John Doe" |
| } |
| ''' |
|
|
# Hugging Face checkpoint used to embed the corpus passages.
model_name = "openbmb/MiniCPM-Embedding-Light"

# Load the checkpoint (it ships its own modelling code, hence
# trust_remote_code) in half precision with FlashAttention-2, then move the
# weights onto the GPU.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
)
model = model.to("cuda")
|
|
# JSON Lines corpus: one object per line carrying a "bibkey" and a "text".
input_path = "./data/arxiv.jsonl"

# Materialise the whole corpus in memory; the reader is closed as soon as the
# `with` block is left.
with jsonlines.open(input_path) as reader:
    survey_data = list(reader)

# Two parallel lists sharing the corpus order: position i of `xids` is the
# bibkey of the paper whose text sits at position i of `passages`.
xids = [record["bibkey"] for record in survey_data]
passages = [record["text"] for record in survey_data]
|
|
# Embed every passage. encode_corpus returns a pair; only its first element
# (the dense embeddings, going by the name) is kept.
# NOTE(review): max_length=1024 presumably caps the tokens read per passage --
# confirm against the model's remote code.
embeddings_doc_dense, _ = model.encode_corpus(passages, max_length=1024)

# The second axis of the embedding matrix fixes the index dimensionality.
embedding_dim = embeddings_doc_dense.shape[1]

# Flat inner-product index, wrapped in an ID map so vectors can later be added
# under caller-chosen integer ids.
flat_index = faiss.IndexFlatIP(embedding_dim)
id_map_index = faiss.IndexIDMap(flat_index)

# Clone the CPU index onto all available GPUs; `id_map_index` itself stays on
# the CPU.
index = faiss.index_cpu_to_all_gpus(id_map_index)
|
|
# Faiss identifies vectors by 64-bit integers, so each passage is given its
# position in the corpus as id. The dtype is pinned because NumPy's default
# integer width depends on the platform.
x_ids_int = np.arange(len(xids), dtype=np.int64)

# bibkey -> integer id lookup. A bibkey that occurs more than once resolves to
# its last position.
str_int_ids = dict(zip(xids, x_ids_int))

# Mapping table with exactly one row per indexed vector, built directly from
# the two parallel sequences. Going through the dict instead would drop rows
# for duplicate bibkeys, leaving some vector ids without a string id, and
# building a one-row/N-column frame to transpose it is needlessly expensive
# for a large corpus.
str_int_ids_df = pd.DataFrame({"str_id": xids, "int_id": x_ids_int})

# Make sure the output directory exists so the run does not fail at the first
# write after the embeddings have already been computed.
os.makedirs("./index", exist_ok=True)
str_int_ids_df.to_csv("./index/str_int_ids_abstract.csv", index=False)

# Insert every embedding under its integer id.
index.add_with_ids(embeddings_doc_dense, x_ids_int)

# The index was cloned onto the GPUs earlier; copy it back to the CPU before
# serialising it to disk.
index = faiss.index_gpu_to_cpu(index)
faiss.write_index(index, "./index/index_abstract.faiss")
|
|