| from tqdm import tqdm |
| import joblib |
| import numpy as np |
| from sentence_transformers import SentenceTransformer, util |
|
|
# Shared SentenceTransformer used by every function in this module;
# loaded once at import time (loading is expensive, so it is not
# re-created per call).
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
def get_documents_from_scores(scores):
    """Extract the document index from each (index, score) pair.

    Parameters
    ----------
    scores : iterable of tuple
        Pairs whose first element is a document index.

    Returns
    -------
    list
        The first element of every pair, in the order given.
    """
    return [pair[0] for pair in scores]
|
|
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    v1, v2 : array-like of float
        Vectors of equal length.

    Returns
    -------
    float
        Cosine similarity in [-1, 1]; 0.0 when either vector has zero
        norm (the similarity is undefined there, and returning 0.0
        avoids a division by zero).
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    # Hoist the norms so each is computed once instead of twice
    # (the original evaluated np.linalg.norm four times per call).
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0
    # Cast to a plain Python float so both branches return the same type.
    return float(np.dot(v1, v2) / (n1 * n2))
|
|
def get_open_source_embeddings(documents):
    """Encode every document with the module-level SentenceTransformer.

    Parameters
    ----------
    documents : iterable of str
        Texts to embed; progress is shown via tqdm.

    Returns
    -------
    list
        One embedding (as returned by ``model.encode``) per document,
        in input order.
    """
    return [model.encode(doc) for doc in tqdm(documents)]
| |
def open_source_rankings(query, document_embeddings, k):
    """Rank documents by cosine similarity to the query embedding.

    Parameters
    ----------
    query : str
        Query text, encoded with the module-level model.
    document_embeddings : sequence
        Precomputed document embedding vectors.
    k : int
        Number of top results to keep.

    Returns
    -------
    tuple(list, list)
        ``(rankings, scores)`` where ``rankings`` holds the indices of
        the top-k documents and ``scores`` the corresponding
        ``(index, similarity)`` pairs, best first.
    """
    query_embedding = model.encode(query)
    scored = [
        (position, cosine_similarity(query_embedding, doc_embedding))
        for position, doc_embedding in enumerate(document_embeddings)
    ]
    # Highest similarity first, then truncate to the top k.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:k]
    return get_documents_from_scores(top), top
|
|
|
|
def open_source_pipeline(query, documents_embeddings_path="Retrieval/savedModels/open_source_embeddings.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """Run the full retrieval pipeline for one query.

    Loads the cached document embeddings and their ids from disk, ranks
    the documents against the query, and maps the ranked positions back
    to document ids.

    Parameters
    ----------
    query : str
        Query text.
    documents_embeddings_path : str
        Path to the joblib-pickled embedding matrix.
    ids_path : str
        Path to the joblib-pickled id list aligned with the embeddings.
    k : int
        Number of results to return (default 100).

    Returns
    -------
    list
        The ids of the top-k documents, best first.
    """
    document_embeddings = joblib.load(documents_embeddings_path)
    ids = joblib.load(ids_path)
    rankings, scores = open_source_rankings(query, document_embeddings, k)
    # Translate embedding positions into document ids.
    return [ids[position] for position in tqdm(rankings)]