| |
| |
|
|
| |
|
|
|
|
| import os |
| from datetime import datetime |
|
|
| from pathlib import Path |
|
|
| import polars as pl |
| import torch |
| from transformers import AutoModel, AutoTokenizer |
| from transformers import Trainer, TrainingArguments |
| from accelerate import Accelerator, DistributedType |
| from torch.optim import AdamW |
| from torch.utils.data import DataLoader |
|
|
| from utils._constants import * |
| from utils._nlp import get_transformers_word_embeddings |
| from utils._polars import concat_str_columns, slice_join_dataframes |
| from utils._articles import ( |
| convert_text2encoding_with_transformers, |
| create_article_id_to_value_mapping |
| ) |
| from utils._python import make_lookup_objects |
| from utils._behaviors import ( |
| create_binary_labels_column, |
| sampling_strategy_wu2019, |
| truncate_history, |
| ) |
| from utils._articles_behaviors import map_list_article_id_to_value |
| from dataset.pytorch_dataloader import ( |
| ebnerd_from_path, |
| NRMSDataset, |
| NewsrecDataset, |
| ) |
| from evaluation import ( |
| MetricEvaluator, |
| AucScore, |
| NdcgScore, |
| MrrScore, |
| F1Score, |
| LogLossScore, |
| RootMeanSquaredError, |
| AccuracyScore |
| ) |
| from models.nrms import NRMSModel |
|
|
# Switch off parallelism in the HuggingFace `tokenizers` backend for the
# preprocessing steps of this script.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Parquet file containing the behaviour rows that will be scored.
TEST_DATA_PATH = "merged_0412_final.parquet"

# A one-element Series holding an empty list is attached as the `labels`
# column. Presumably the dataset loader requires that column to exist even
# though no ground truth is used at inference time -- verify against
# NewsrecDataset.load_data.
_placeholder_labels = pl.Series("labels", [[]])
test_df = pl.read_parquet(TEST_DATA_PATH).with_columns(_placeholder_labels)
|
|
|
|
| |
|
|
|
|
from transformers import AutoModel, AutoTokenizer

# Name of the pretrained checkpoint passed to `from_pretrained`; it supplies
# both the tokenizer and the embedding weights.
model_name = "Maltehb/danish-bert-botxo"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# NOTE: the variable name is misspelled ("embeddimg") but is read under this
# exact name when the NRMS model is built further down, so it is kept.
word2vec_embeddimg = get_transformers_word_embeddings(model)
|
|
|
|
| |
|
|
|
|
# Maximum token length handed to the text-encoding helper.
TEXT_MAX_LENGTH = 30
# Article text columns that are joined into a single string before encoding.
ARTICLE_COLUMNS = [DEFAULT_TITLE_COL, DEFAULT_SUBTITLE_COL]
# Location of the article table for the test split.
ARTICLES_DATA_PATH = "/work/Blue/ebnerd/ebnerd_testset/articles.parquet"

articles_df = pl.read_parquet(ARTICLES_DATA_PATH)

# Step 1: concatenate the selected text columns; the helper returns the
# updated frame together with the name of the new combined column.
df_articles, cat_col = concat_str_columns(articles_df, columns=ARTICLE_COLUMNS)

# Step 2: encode the combined text with the tokenizer; the helper returns the
# updated frame together with the name of the column holding the encoding.
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles,
    tokenizer,
    cat_col,
    max_length=TEXT_MAX_LENGTH,
)

# Step 3: build the article-id -> encoded-text mapping consumed by the dataset.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles,
    value_col=token_col_title,
)
|
|
|
|
| |
|
|
|
|
| from dataclasses import dataclass, field |
| import numpy as np |
|
|
@dataclass
class NRMSTestDataset(NewsrecDataset):
    """Dataset variant used for scoring rows that have no ground-truth labels.

    Each item pairs a user's history with the in-view candidate articles and
    carries the user id and the first candidate's article id along, so that
    predictions can be written out next to their identifiers.
    """

    def __post_init__(self):
        """Build the article lookup tables and load the behaviour data.

        Row 0 of ``lookup_article_matrix`` is an all-zero vector used for
        article ids missing from ``article_dict``; known articles occupy
        rows 1..N in the iteration order of ``article_dict``.
        """
        # Known articles are numbered from 1 so that index 0 stays free for
        # the "unknown article" row.
        self.lookup_article_index = {
            article_id: row
            for row, article_id in enumerate(self.article_dict, start=1)
        }
        known_matrix = np.array(list(self.article_dict.values()))
        unknown_row = np.zeros(known_matrix.shape[1], dtype=known_matrix.dtype)
        self.lookup_article_matrix = np.vstack([unknown_row, known_matrix])

        self.unknown_index = [0]
        self.X, self.y = self.load_data()
        if self.kwargs is not None:
            self.set_kwargs(self.kwargs)

    def __getitem__(self, idx) -> dict:
        """Return the model inputs and identifiers for behaviour row ``idx``.

        Returns a dict with:
            user_id: value of the ``user_id`` column for this row.
            history_input_tensor: (history_size, document_dimension) array of
                encoded history articles.
            candidate_article_id: id of the first in-view article.
            candidate_input_title: (n_inview, document_dimension) array of
                encoded in-view articles.
            labels: constant ``np.int32(0)`` placeholder.
        """
        # Slice the row once and reuse it; every access below reads the same
        # single-row selection.
        row = self.X[idx]
        # NOTE(review): the history column name is hard-coded here rather than
        # taken from the `history_column` constructor argument -- confirm they
        # always agree.
        history_ids = row["article_id_fixed"].to_list()[0]
        inview_ids = row["article_ids_inview"].to_list()[0]

        # Ids absent from the lookup map to row 0, the all-zero vector.
        to_row = self.lookup_article_index.get
        history_rows = [to_row(article_id, 0) for article_id in history_ids]
        inview_rows = [to_row(article_id, 0) for article_id in inview_ids]

        return {
            "user_id": row["user_id"][0],
            "history_input_tensor": self.lookup_article_matrix[history_rows],
            "candidate_article_id": inview_ids[0],
            "candidate_input_title": self.lookup_article_matrix[inview_rows],
            "labels": np.int32(0),
        }
|
|
|
|
| |
|
|
|
|
# Wrap the test behaviours in the inference dataset defined in this script.
# Its __post_init__ always builds an all-zero row for unknown articles and
# does not read `unknown_representation`; the argument is still passed along.
test_dataset = NRMSTestDataset(
    behaviors=test_df,
    article_dict=article_mapping,
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    unknown_representation="zeros",
)
|
|
|
|
| |
|
|
|
|
# Build the NRMS network with the pretrained word-embedding table as its
# embedding weights.
nrms_model = NRMSModel(
    pretrained_weight=torch.tensor(word2vec_embeddimg),
    emb_dim=768,
    num_heads=16,
    hidden_dim=128,
    item_dim=64,
)

# Deserialize onto the CPU first: without `map_location`, torch.load restores
# every tensor onto the device it was saved from, which fails (or grabs memory
# on the wrong GPU) when that device is unavailable. The weights are moved to
# the target GPU after they have been copied into the model.
state_dict = torch.load("nrms_model.epoch0.step20001.pth", map_location="cpu")

# The model is compiled BEFORE the weights are loaded. Presumably the
# checkpoint was saved from a compiled model, whose parameter names carry the
# wrapper prefix -- keep this order unless the checkpoint format changes.
nrms_model = torch.compile(nrms_model)
nrms_model.load_state_dict(state_dict["model"])
nrms_model.to("cuda:1")
|
|
|
|
| |
|
|
|
|
| import torch._dynamo |
| from tqdm import tqdm |
| import os |
| from torch.utils.data import DataLoader |
|
|
BATCH_SIZE = 256
# Never request more loader processes than the machine has CPUs; the previous
# fixed value of 60 is kept as the upper bound.
NUM_WORKERS = min(60, os.cpu_count() or 1)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keep output rows in the same order as the input file
    num_workers=NUM_WORKERS,
)
# NOTE(review): this re-enables tokenizer parallelism, reversing the "false"
# set near the top of the script, right before the DataLoader workers start.
# No tokenizer call is visible after this point -- confirm it is intended.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Let TorchDynamo fall back to eager execution instead of raising when it
# cannot compile part of the model.
torch._dynamo.config.suppress_errors = True

nrms_model.eval()
# Send inputs to whichever device the model's parameters live on, so the
# device name is not duplicated here.
device = next(nrms_model.parameters()).device

# Output format: one "user_id,candidate_article_id,score" line per test row.
with open("test_set.txt", "w") as f, torch.no_grad():
    for batch in tqdm(test_dataloader):
        user_id = batch["user_id"].cpu().tolist()
        candidate_article_id = batch["candidate_article_id"].cpu().tolist()
        history_input_tensor = batch["history_input_tensor"].to(device)
        candidate_input_title = batch["candidate_input_title"].to(device)

        # Only column 0 of the model output is kept, matching the single
        # `candidate_article_id` emitted per row by the dataset.
        output_logits = (
            nrms_model(history_input_tensor, candidate_input_title, None)[:, 0]
            .cpu()
            .tolist()
        )

        # One buffered write per batch instead of one per row.
        f.writelines(
            f"{uid},{aid},{score}\n"
            for uid, aid, score in zip(user_id, candidate_article_id, output_logits)
        )
|
|
|
|
| |
|
|
|
|
|
|
|
|
|
|