| |
|
|
| |
| |
|
|
# Third-party dependencies: sentence embeddings, serialization, Excel loading,
# and NLTK stemming/tokenization.
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import pandas as pd
import nltk
from nltk.stem import *
# The punkt tokenizer models are required by nltk.sent_tokenize used below.
nltk.download("punkt_tab")
|
|
|
|
| print("Loading SentenceTransformer model...") |
| model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') |
| print("Model loaded.") |
|
|
def load_technologies():
    """Load the technology database from the bundled Excel workbook.

    Returns:
        pandas.DataFrame read from 'technologies_database.xlsx' in the
        current working directory.
    """
    return pd.read_excel('technologies_database.xlsx')
|
|
def tech_to_dict(technologies):
    """Parse raw multi-line technology descriptions into field dicts.

    Each accepted entry is split into lines; the first and last lines are
    treated as header/footer and dropped, and the next five lines are read
    as "label: value" pairs in a fixed order.

    Args:
        technologies: iterable of raw multi-line description strings.

    Returns:
        List of dicts with keys "title", "purpose", "key_components",
        "advantages", "limitations" and "id" (the entry's index in the
        input iterable).
    """
    field_names = ("title", "purpose", "key_components", "advantages", "limitations")
    parsed = []
    for index, raw in enumerate(technologies):
        # Entries whose "<title>" marker sits beyond index 1 are skipped
        # (equivalent to the original `not raw.find("<title>") > 1` guard).
        if raw.find("<title>") > 1:
            continue
        # Drop the header line and the footer line around the payload.
        lines = raw.split("\n")
        lines.pop(0)
        lines.pop()
        # Everything after the first ": " on each line is the field value.
        entry = {
            name: lines[pos][lines[pos].find(": ") + 2:]
            for pos, name in enumerate(field_names)
        }
        entry["id"] = index
        parsed.append(entry)
    return parsed
|
|
def stem(data, data_type):
    """Run the English Snowball stemmer over the text fields of the input.

    Args:
        data: when data_type == "technologies", a list of dicts carrying the
            keys "title", "purpose", "key_components", "advantages",
            "limitations" and "id"; otherwise a mapping of title -> description
            strings (iterated by key).
        data_type: selects which of the two input layouts to process.

    Returns:
        A list of dicts mirroring the input layout with every text field
        passed through SnowballStemmer("english").stem().

    NOTE(review): stemmer.stem() lowercases its input and treats the whole
    string as one token, so multi-word fields are only stemmed at the tail —
    this preserves the original behavior; confirm it is intended.
    """
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        text_keys = ("title", "purpose", "key_components", "advantages", "limitations")
        for item in data:
            stemmed = {key: stemmer.stem(item[key]) for key in text_keys}
            stemmed["id"] = item["id"]
            processed_data.append(stemmed)
    else:
        # `data` is a mapping here: stem each key and its associated value.
        # (Removed a leftover debug print of every item.)
        for title in data:
            processed_data.append({
                "title": stemmer.stem(title),
                "description": stemmer.stem(data[title]),
            })
    return processed_data
|
|
def preprocess_tech_data(_df):
    """Parse, filter, stem and tokenize the technology descriptions in *_df*.

    Args:
        _df: DataFrame expected to have a "description" column of raw
            multi-line technology strings (see tech_to_dict).

    Returns:
        A 3-tuple (processed_tech, keys, original_tech):
            processed_tech: stemmed dicts with "key_components" re-joined
                from its sentence tokens;
            keys: key list of a processed entry ([] when nothing survived);
            original_tech: the unstemmed dicts, aligned one-to-one with
                processed_tech for display purposes.
    """
    if _df is None or "description" not in _df.columns:
        # BUG FIX: previously returned only two values here while the success
        # path returns three — the 3-way unpacking at the call site would
        # raise ValueError on empty/invalid input.
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    # Drop entries whose core text fields are too short to be meaningful.
    tech_dict_filtered = [
        t for t in tech_dict_raw
        if len(t.get("title", "")) >= 5
        and len(t.get("advantages", "")) >= 5
        and len(t.get("key_components", "")) >= 5
    ]

    if not tech_dict_filtered:
        # Same fix as above: keep the return arity consistent.
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered, "technologies")

    # Normalize key_components: concatenate its sentences without a separator
    # (preserving the original ''.join behavior) or blank it when non-string.
    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    # Keep the originals aligned one-to-one with the processed entries.
    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]

    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display
|
|
|
|
# ---------------------------------------------------------------------------
# Script body: build the technology embedding index and persist it to disk.
# ---------------------------------------------------------------------------

# Load the Excel database and preprocess it into stemmed/filtered records.
df = load_technologies()
global_tech,keys,original_tech = preprocess_tech_data(df)
# One "purpose" string per technology — this is the text that gets embedded.
global_tech_purposes = [t["purpose"] for t in global_tech]

# Encode every purpose into a dense vector with the MiniLM model loaded above.
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
print("Global tech embeddings created.")

# Persist the records and their embeddings together so a consumer can
# restore both with a single pickle.load call.
output_filename = 'global_tech_embeddings.pkl'

data_to_save = {
    'global_tech': global_tech,
    'global_tech_embeddings': global_tech_embeddings
}

print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")

# Usage reminder for whoever loads the pickle downstream.
print(f"\nTo load this file later in your API, use: \n"
      f"with open('{output_filename}', 'rb') as f:\n"
      f" loaded_data = pickle.load(f)\n"
      f"global_tech = loaded_data['global_tech']\n"
      f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")
|
|
|
|