| | """
|
| | Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/
|
| |
|
| | Jon Reifschneider
|
| | Brinnae Bent
|
| |
|
| | """
|
| |
|
| | import os
|
| | import pandas as pd
|
| | import time
|
| | import torch
|
| | import numpy as np
|
| | import pandas as pd
|
| | import torch.nn as nn
|
| | import torch.nn.functional as F
|
| | import torch.optim as optim
|
| | from torch.utils.data import TensorDataset
|
| | from sklearn.model_selection import train_test_split
|
| |
|
| |
|
class NNColabFiltering(nn.Module):
    """Neural collaborative-filtering model over (playlist, artist) id pairs.

    Each playlist and artist id is mapped to a learned dense embedding; the two
    embeddings are concatenated and passed through a small MLP whose sigmoid
    output is rescaled into ``rating_range``.
    """

    def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):
        super().__init__()
        # Embedding tables: one row per playlist id / artist id.
        self.user_embeddings = nn.Embedding(num_embeddings=n_playlists, embedding_dim=embedding_dim_users)
        self.item_embeddings = nn.Embedding(num_embeddings=n_artists, embedding_dim=embedding_dim_items)
        # Two-layer MLP on the concatenated embeddings.
        self.fc1 = nn.Linear(embedding_dim_users + embedding_dim_items, n_activations)
        self.fc2 = nn.Linear(n_activations, 1)
        # (min, max) bounds the sigmoid output is stretched to in forward().
        self.rating_range = rating_range

    def forward(self, X):
        """Predict a rating for each row of X.

        X is a long tensor of shape (batch, 2): column 0 holds playlist ids,
        column 1 holds artist ids. Returns a (batch, 1) float tensor whose
        values lie inside ``rating_range``.
        """
        users = self.user_embeddings(X[:, 0])
        items = self.item_embeddings(X[:, 1])
        hidden = F.relu(self.fc1(torch.cat([users, items], dim=1)))
        raw = self.fc2(hidden)
        # Squash to (0, 1) then stretch/shift into the configured rating range.
        low, high = self.rating_range[0], self.rating_range[1]
        return torch.sigmoid(raw) * (high - low) + low
|
| |
|
| | def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
|
| | '''
|
| | Loads the prefetched data from the output dir
|
| |
|
| | Inputs:
|
| | X_train: training data features
|
| | y_train: training data target
|
| | X_val: validation data features
|
| | y_val: validation data targets
|
| | batch_size: the batch size to use
|
| |
|
| | Returns:
|
| | trainloader: training dataloader
|
| | valloader: validation dataloader
|
| | '''
|
| |
|
| | trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
|
| | torch.from_numpy(np.array(y_train)).float())
|
| | valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
|
| | torch.from_numpy(np.array(y_val)).float())
|
| |
|
| |
|
| | trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
|
| | valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
|
| |
|
| | return trainloader, valloader
|
| |
|
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
    '''
    Trains a model, alternating a training and a validation pass each epoch.

    Gradients are enabled only during the training phase; the model is
    switched between train() and eval() modes accordingly. Progress is
    printed to stdout after each batch and each phase.

    Inputs:
        model: the model to train
        criterion: the loss function (e.g. nn.MSELoss) to optimize
        optimizer: the optimizer to use to train
        dataloaders: dict with 'train' and 'val' DataLoaders
        device: the torch device (cpu/gpu) to run on
        num_epochs: number of epochs to train for
        scheduler: optional LR scheduler, stepped once per training phase

    Returns:
        costpaths: dict {'train': [...], 'val': [...]} with the per-epoch
            loss (sample-weighted sqrt of the batch loss, averaged over the
            dataset) for each phase
    '''
    model = model.to(device)
    since = time.time()

    costpaths = {'train': [], 'val': []}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0

            for index, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Only track gradients during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    # Call the model, not model.forward(), so nn.Module
                    # hooks run as intended.
                    outputs = model(inputs).view(-1)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # sqrt of the batch loss, weighted by batch size (an
                # RMSE-style running total when criterion is MSE).
                running_loss += np.sqrt(loss.item()) * labels.size(0)
                print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')

            # Step the scheduler once per epoch, after the training phase.
            if (phase == 'train') and (scheduler is not None):
                scheduler.step()

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            costpaths[phase].append(epoch_loss)
            print('\n{} loss: {:.4f}'.format(phase, epoch_loss))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return costpaths
|
| |
|
| |
|
def _main():
    """Script entry point: load playlist data, train the recommender, save it."""
    # Interaction data produced by the preprocessing step.
    artists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv')
    X = artists.loc[:, ['playlist_id', 'artist_album_id', ]]
    y = artists.loc[:, 'song_percent']

    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)
    trainloader, valloader = prep_dataloaders(X_train, y_train, X_val, y_val, 64)
    dataloaders = {'train': trainloader, 'val': valloader}

    # NOTE(review): sizing the embedding tables as max id + 1 assumes ids are
    # dense and 0-based — confirm against the preprocessing step.
    n_users = X.loc[:, 'playlist_id'].max() + 1
    n_items = X.loc[:, 'artist_album_id'].max() + 1
    model = NNColabFiltering(n_users, n_items, embedding_dim_users=50, embedding_dim_items=50, n_activations=100, rating_range=[0., 1.])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)
    train_model(model, nn.MSELoss(), optimizer, dataloaders, device, 10, scheduler=None)

    # Persist the whole trained module for later inference.
    torch.save(model, os.getcwd() + '/models/recommender.pt')


if __name__ == '__main__':
    _main()
|
| |
|
| |
|