| |
|
|
| import numpy as np |
| import torch |
| from torch import nn |
| from torch.utils.data import DataLoader |
| from sklearn.model_selection import KFold |
| from transformers import Trainer, TrainingArguments |
| from sklearn.metrics import ndcg_score |
| import json |
|
|
| from data_processing import load_data, EmbeddingGenerator, prepare_labels |
| from utils import compute_ndcg |
| import numpy as np |
| from sklearn.metrics import ndcg_score, mean_squared_error |
|
|
| |
def generate_random_predictions(y_true):
    """Draw a uniform-random baseline prediction for every entry of *y_true*.

    Samples from U[min(y_true), max(y_true)] via NumPy's global RNG,
    producing an array with the same shape as the true labels.
    """
    low, high = y_true.min(), y_true.max()
    return np.random.uniform(low, high, size=y_true.shape)
|
|
| |
def calculate_relative_lift(y_true, model_predictions, metric="ndcg"):
    """Score model predictions against a uniform-random baseline.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth target values for a single sample.
    model_predictions : np.ndarray
        Model outputs aligned with *y_true*.
    metric : str
        "ndcg" (higher is better) or "mse" (lower is better).

    Returns
    -------
    tuple
        ``(lift, model_score, random_score)`` where *lift* is the relative
        improvement of the model over the random baseline.

    Raises
    ------
    ValueError
        If *metric* is neither "ndcg" nor "mse".
    """
    random_predictions = generate_random_predictions(y_true)

    if metric == "ndcg":
        # Higher NDCG is better: lift = (model - random) / random.
        model_score = ndcg_score([y_true], [model_predictions])
        random_score = ndcg_score([y_true], [random_predictions])
        return (model_score - random_score) / random_score, model_score, random_score

    if metric == "mse":
        # Lower MSE is better, so the lift direction is flipped.
        model_score = mean_squared_error(y_true, model_predictions)
        random_score = mean_squared_error(y_true, random_predictions)
        return (random_score - model_score) / random_score, model_score, random_score

    raise ValueError("Unsupported metric")
|
|
| |
class MultiOutputRegressor(nn.Module):
    """A single linear layer mapping embeddings to multiple regression targets."""

    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        # One output unit per label dimension; no hidden layers.
        self.regressor_head = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids):
        """Project a (batch, hidden_size) embedding batch to (batch, num_outputs)."""
        return self.regressor_head(input_ids)
|
|
| |
class EmbeddingDataset(torch.utils.data.Dataset):
    """Pairs precomputed embedding tensors with their label tensors."""

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        """Number of (embedding, label) pairs available."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return one sample as the dict shape the collator expects."""
        sample = {
            "input_ids": self.embeddings[idx],
            "label": self.labels[idx],
        }
        return sample
|
|
| |
class CustomDataCollator:
    """Stacks per-sample embedding/label tensors into batched tensors."""

    def __call__(self, features):
        """Collate a list of ``{"input_ids", "label"}`` dicts into one batch dict."""
        return {
            "input_ids": torch.stack([item["input_ids"] for item in features]),
            "label": torch.stack([item["label"] for item in features]),
        }
|
|
| |
class CustomTrainer(Trainer):
    """Hugging Face Trainer specialized for multi-output MSE regression."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute mean-squared error between model outputs and batch labels.

        *inputs* is the dict produced by CustomDataCollator; tensors are
        moved to the trainer's configured device before the forward pass.
        """
        embeddings = inputs["input_ids"].to(self.args.device)
        targets = inputs["label"].to(self.args.device)
        predictions = model(embeddings)
        loss = nn.MSELoss()(predictions, targets)
        if return_outputs:
            return loss, predictions
        return loss
|
|
def main():
    """End-to-end pipeline.

    Embeds user descriptions, cross-validates a linear multi-output
    regressor with NDCG/lift metrics, writes per-record out-of-fold
    predictions, retrains on all data, saves the final weights, and
    runs a demo prediction via load_and_predict().
    """
    # --- Load records and build sentence embeddings --------------------
    outdata = load_data("labeled_users.json")
    descriptions = [record['description'] for record in outdata]

    embedder = EmbeddingGenerator()
    X_embeddings = embedder.generate_embeddings(descriptions)

    # --- Build the multi-label target matrix and persist mappings ------
    y_matrix, label2id, id2label = prepare_labels(outdata)
    mappings = {'label2id': label2id, 'id2label': id2label}
    with open('label_mappings.json', 'w') as f:
        json.dump(mappings, f)

    train_embeddings = torch.tensor(X_embeddings, dtype=torch.float)
    train_labels = torch.tensor(y_matrix, dtype=torch.float)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_collator = CustomDataCollator()

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    hidden_size = train_embeddings.shape[1]
    num_outputs = train_labels.shape[1]
    fold_ndcg_scores = []
    # BUG FIX: the original appended fold predictions with
    # all_preds.extend(y_pred) and later indexed them by record position.
    # KFold(shuffle=True) visits rows out of order, so predictions were
    # attached to the WRONG records. Pre-allocate an out-of-fold matrix
    # and scatter each fold's predictions back to the original row indices.
    oof_preds = np.zeros((len(train_embeddings), num_outputs), dtype=np.float32)

    for fold, (train_index, val_index) in enumerate(kf.split(train_embeddings)):
        print(f"Fold {fold + 1}/{n_splits}")

        X_train_fold = train_embeddings[train_index]
        y_train_fold = train_labels[train_index]
        X_val_fold = train_embeddings[val_index]
        y_val_fold = train_labels[val_index]

        train_dataset = EmbeddingDataset(X_train_fold, y_train_fold)
        val_dataset = EmbeddingDataset(X_val_fold, y_val_fold)

        # Fresh model per fold so folds do not leak into each other.
        model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
        model.to(device)

        training_args = TrainingArguments(
            output_dir=f"./results_fold_{fold+1}",
            num_train_epochs=10,
            per_device_train_batch_size=64,
            logging_dir=f"./logs_fold_{fold+1}",
            evaluation_strategy="no",
            save_strategy="no",
            disable_tqdm=True,
            learning_rate=1e-5,
            weight_decay=0.01,
            max_grad_norm=1.0
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )
        trainer.train()

        # --- Evaluate this fold's model on its held-out split ----------
        val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

        fold_preds = []
        fold_labels = []
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids)
                fold_preds.append(outputs.cpu().numpy())
                fold_labels.append(labels.cpu().numpy())

        y_pred = np.concatenate(fold_preds, axis=0)
        y_true = np.concatenate(fold_labels, axis=0)

        # Scatter predictions back to the rows they belong to (the fix).
        oof_preds[val_index] = y_pred

        # Per-sample ranking quality and lift over the random baseline.
        all_ndcgs = []
        lifts = []
        for i in range(len(y_true)):
            actual_weights = y_true[i]
            predicted_weights = y_pred[i]
            all_ndcgs.append(ndcg_score([actual_weights], [predicted_weights]))
            lift, _, _ = calculate_relative_lift(actual_weights, predicted_weights, metric="ndcg")
            lifts.append(lift)

        # Guard against an empty validation fold.
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        avg_lift = np.mean(lifts) if lifts else 0.0
        print(f"Average NDCG for fold {fold + 1}: {avg_ndcg:.4f}")
        print(f"Average Lift for fold {fold + 1}: {avg_lift:.4f}")
        fold_ndcg_scores.append(avg_ndcg)

    overall_avg_ndcg = np.mean(fold_ndcg_scores)
    print(f"\nOverall Average NDCG across all folds: {overall_avg_ndcg:.4f}")

    # --- Attach embeddings and correctly aligned predictions -----------
    for idx, record in enumerate(outdata):
        record['embedding'] = X_embeddings[idx].tolist()
        pred = oof_preds[idx]
        record['predictions'] = {id2label[i]: float(pred[i]) for i in range(len(pred))}

    # One JSON object per line (JSONL-style output).
    with open("enriched_data.json", "w") as f:
        for row in outdata:
            f.write(json.dumps(row) + '\n')

    # --- Retrain on the full dataset and save the final weights --------
    model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    model.to(device)
    train_dataset = EmbeddingDataset(train_embeddings, train_labels)

    training_args = TrainingArguments(
        output_dir="./final_model",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        logging_dir="./logs_final",
        evaluation_strategy="no",
        save_strategy="no",
        disable_tqdm=False,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )
    trainer.train()

    model_save_path = 'multioutput_regressor.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    # Demo: reload the saved artifacts and predict on fresh sentences.
    load_and_predict(embedder, hidden_size, num_outputs, device)
|
|
def load_and_predict(embedder, hidden_size, num_outputs, device):
    """
    Load the saved model and label mappings, make predictions on new data,
    and map the predictions to labels.
    """
    # Mappings were serialized by main(); JSON round-trip turns the integer
    # id keys into strings, hence the str(i) lookups below.
    with open('label_mappings.json', 'r') as f:
        id2label = json.load(f)['id2label']

    # Rebuild the architecture, then restore the trained weights.
    model_save_path = 'multioutput_regressor.pth'
    loaded_model = MultiOutputRegressor(hidden_size=hidden_size, num_outputs=num_outputs)
    loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
    loaded_model.to(device)
    loaded_model.eval()

    new_sentences = [
        "This is a test sentence.",
        "Another example of a sentence to predict."
    ]

    new_embeddings = embedder.generate_embeddings(new_sentences)
    new_embeddings_tensor = torch.tensor(new_embeddings, dtype=torch.float).to(device)

    with torch.no_grad():
        predictions = loaded_model(new_embeddings_tensor).cpu().numpy()

    # Print a label -> score breakdown for each input sentence.
    for sentence, pred in zip(new_sentences, predictions):
        label_pred_dict = {id2label[str(i)]: float(pred[i]) for i in range(len(pred))}
        print(f"Sentence: {sentence}")
        print("Predictions:")
        for label, value in label_pred_dict.items():
            print(f"  {label}: {value}")
        print()
|
|
# Script entry point: run the full train/evaluate/save pipeline.
if __name__ == "__main__":
    main()
|
|
| |
| |
| |
| |
|
|
|
|