Added Naive model and comments
- main.py +16 -18
- notebooks/dbscan.ipynb +169 -192
- notebooks/naive.ipynb +416 -0
- notebooks/nn_collab_filter.ipynb +21 -69
- scripts/build_features.py +71 -78
- scripts/make_dataset.py +92 -56
- scripts/model.py +46 -38
main.py
CHANGED
@@ -13,25 +13,15 @@ import os
 import numpy as np
 import pandas as pd
 import pandas as pd
-import json
-import matplotlib.pyplot as plt
-
 import os
-import urllib.request
-import zipfile
 import json
 import pandas as pd
-import time
 import torch
 import numpy as np
 import pandas as pd
 import torch.nn as nn
 import torch.nn.functional as F
-import torch.optim as optim
-from torch.utils.data import DataLoader, TensorDataset
-from sklearn.model_selection import train_test_split
 import matplotlib.pyplot as plt
-from sklearn.preprocessing import LabelEncoder
 
 class NNColabFiltering(nn.Module):
 
@@ -58,16 +48,29 @@ class NNColabFiltering(nn.Module):
         return preds
 
 def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):
-
+    '''
+    Generates the top-N album recommendations for a given playlist.
 
+    Inputs:
+        artist_album: the dataframe containing the mappings for the artists and albums
+        playlists: the dataframe containing the playlists' contents
+        model: the trained model
+        playlist_id: the playlist id to generate recommendations for
+        device: the GPU or CPU device defined by torch
+        top_n: the number of recommendations to generate
+        batch_size: the batch size to use
+
+    Returns:
+        album: the recommended albums
+        playlists: the recommended artists
+    '''
+    model.eval()
 
     all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)
     user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)
 
-    # Initialize tensor to store all predictions
    all_predictions = torch.zeros(len(all_movie_ids), device=device)
 
-    # Generate predictions in batches
    with torch.no_grad():
        for i in range(0, len(all_movie_ids), batch_size):
            batch_user_ids = user_ids[i:i+batch_size]
@@ -77,14 +80,10 @@ def generate_recommendations(artist_album, playlists, model, playlist_id, device
            batch_predictions = model(input_tensor).squeeze()
            all_predictions[i:i+batch_size] = batch_predictions
 
-    # Convert to numpy for easier handling
    predictions = all_predictions.cpu().numpy()
-
    albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())
-
    unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)
 
-    # Get top N recommendations
    top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]
    recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]
 
@@ -126,7 +125,6 @@ if __name__ == '__main__':
    trumpet = Image.open('assets/trumpet.png')
    img2.image(trumpet, use_column_width=True)
 
-    # Using "with" notation
    with st.sidebar:
        playlist_name = st.selectbox(
            "Playlist Selection",
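For orientation, a minimal usage sketch for the updated generate_recommendations follows. The column names and the function signature come from the diff above; the file paths, the checkpoint, the chosen playlist_id, and the assumption that the function returns the recommendations it computes are illustrative, not part of this commit.

# Hedged usage sketch; paths, checkpoint name, and playlist_id are hypothetical.
import pandas as pd
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# artist_album needs an 'artist_album_id' column; playlists needs
# 'playlist_id' and 'artist_album_id' columns (as used in the function body).
artist_album = pd.read_csv('data/processed/artist_album.csv')  # hypothetical path
playlists = pd.read_csv('data/processed/playlist.csv')         # hypothetical path

model = torch.load('models/nn_colab_filtering.pt', map_location=device)  # hypothetical checkpoint
model.to(device)

# Assumes the function returns the `recs` array computed in its body.
recs = generate_recommendations(artist_album, playlists, model, playlist_id=0,
                                device=device, top_n=10, batch_size=1024)
print(recs)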
notebooks/dbscan.ipynb
CHANGED
@@ -1,22 +1,12 @@
 {
-"nbformat": 4,
-"nbformat_minor": 0,
-"metadata": {
-"colab": {
-"provenance": [],
-"machine_shape": "hm"
-},
-"kernelspec": {
-"name": "python3",
-"display_name": "Python 3"
-},
-"language_info": {
-"name": "python"
-}
-},
 "cells": [
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "KHnddFeW5hwh"
+},
+"outputs": [],
 "source": [
 "import os\n",
 "import urllib.request\n",
@@ -34,74 +24,63 @@
 "from sklearn.model_selection import train_test_split\n",
 "import matplotlib.pyplot as plt\n",
 "from sklearn.preprocessing import LabelEncoder"
-],
-"metadata": {
-"id": "KHnddFeW5hwh"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
-"source": [
-"from google.colab import drive\n",
-"drive.mount('/content/drive')"
-],
+"execution_count": null,
 "metadata": {
 "id": "l7pGG_d85lzH"
 },
-"execution_count": null,
-"outputs": []
+"outputs": [],
+"source": [
+"from google.colab import drive\n",
+"drive.mount('/content/drive')"
+]
 },
 {
 "cell_type": "code",
+"execution_count": 3,
+"metadata": {
+"id": "dL8TIlH55qSc"
+},
+"outputs": [],
 "source": [
-"# prompt: copy a file from another directory to current directory in python code and create folders if needed\n",
-"\n",
 "import shutil\n",
 "import os\n",
 "\n",
 "def copy_file(src, dst):\n",
-" \"\"\"\n",
-" Copies a file from src to dst, creating any necessary directories.\n",
-"\n",
-" Args:\n",
-" src: The path to the source file.\n",
-" dst: The path to the destination file.\n",
-" \"\"\"\n",
-" # Create the destination directory if it doesn't exist.\n",
 " dst_dir = os.path.dirname(dst)\n",
 " if not os.path.exists(dst_dir):\n",
 " os.makedirs(dst_dir)\n",
 "\n",
-" # Copy the file.\n",
 " shutil.copy2(src, dst)\n",
 "\n",
 "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
-],
-"metadata": {
-"id": "dL8TIlH55qSc"
-},
-"execution_count": 3,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "LLy-YA775snY"
+},
+"outputs": [],
 "source": [
 "def unzip_archive(filepath, dir_path):\n",
 " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
 " zip_ref.extractall(dir_path)\n",
 "\n",
 "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
-],
-"metadata": {
-"id": "LLy-YA775snY"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "YtO0seclE1Pb"
+},
+"outputs": [],
 "source": [
 "import shutil\n",
 "\n",
@@ -111,29 +90,27 @@
 " os.makedirs(directory)\n",
 " else:\n",
 " os.makedirs(directory)"
-],
-"metadata": {
-"id": "YtO0seclE1Pb"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
-"source": [
-"\n",
-"\n",
-"directory = os.getcwd() + '/data/raw/data'\n",
-"make_dir(directory)"
-],
+"execution_count": null,
 "metadata": {
 "id": "UeqDk3_65vTt"
 },
-"execution_count": null,
-"outputs": []
+"outputs": [],
+"source": [
+"directory = os.getcwd() + '/data/raw/data'\n",
+"make_dir(directory)"
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "zMTup29b5wtO"
+},
+"outputs": [],
 "source": [
 "cols = [\n",
 " 'name',\n",
@@ -144,15 +121,27 @@
 " 'track_name',\n",
 " 'album_name'\n",
 "]"
-],
-"metadata": {
-"id": "zMTup29b5wtO"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "h6jQO9HT5zsG",
+"outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
+},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"mpd.slice.727000-727999.json\t100/1000\t10.0%"
+]
+}
+],
 "source": [
 "directory = os.getcwd() + '/data/raw/playlists/data'\n",
 "df = pd.DataFrame()\n",
@@ -192,27 +181,15 @@
 " df = pd.DataFrame()\n",
 " if index % 100 == 0:\n",
 " break"
-],
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "h6jQO9HT5zsG",
-"outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
-},
-"execution_count": null,
-"outputs": [
-{
-"output_type": "stream",
-"name": "stdout",
-"text": [
-"mpd.slice.727000-727999.json\t100/1000\t10.0%"
-]
-}
 ]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "PngL0QHq516u"
+},
+"outputs": [],
 "source": [
 "import pyarrow.parquet as pq\n",
 "\n",
@@ -228,27 +205,27 @@
 "\n",
 "folder_path = os.getcwd() + '/data/raw/data'\n",
 "df = read_parquet_folder(folder_path)"
-],
-"metadata": {
-"id": "PngL0QHq516u"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
-"source": [
-"directory = os.getcwd() + '/data/raw/mappings'\n",
-"make_dir(directory)"
-],
+"execution_count": null,
 "metadata": {
 "id": "hdLpjr2153b_"
 },
-"execution_count": null,
-"outputs": []
+"outputs": [],
+"source": [
+"directory = os.getcwd() + '/data/raw/mappings'\n",
+"make_dir(directory)"
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "peZyue6t57Mz"
+},
+"outputs": [],
 "source": [
 "def create_ids(df, col, name):\n",
 " # Create a dictionary mapping unique values to IDs\n",
@@ -259,43 +236,42 @@
 " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n",
 "\n",
 " return df"
-],
-"metadata": {
-"id": "peZyue6t57Mz"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "p68WNyaf58rS"
+},
+"outputs": [],
 "source": [
 "df = create_ids(df, 'artist_name', 'artist')\n",
 "df = create_ids(df, 'pid', 'playlist')\n",
-"# df = create_ids(df, 'track_name', 'track')\n",
 "df = create_ids(df, 'album_name', 'album')"
-],
-"metadata": {
-"id": "p68WNyaf58rS"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "aSBKxRFa5-O_"
+},
+"outputs": [],
 "source": [
 "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
 "\n",
 "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
 "df['playlist_songs'] += 1"
-],
-"metadata": {
-"id": "aSBKxRFa5-O_"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "4WqHH-pn5_nL"
+},
+"outputs": [],
 "source": [
 "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
 "\n",
@@ -306,67 +282,50 @@
 "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
 "\n",
 "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n"
-],
-"metadata": {
-"id": "4WqHH-pn5_nL"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "V1bhU5rW6BSY"
+},
+"outputs": [],
 "source": [
-"# df = df.groupby(['playlist_id','artist_album','artist_album_id','playlist_songs']).agg({\n",
-"# 'song_count': 'sum',\n",
-"# 'track_name': '|'.join,\n",
-"# 'track_name': '|'.join,\n",
-"# }).reset_index()\n",
 "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
 "\n",
-"# Encode the genres data\n",
 "encoder = LabelEncoder()\n",
 "encoder.fit(df['track_name'])\n",
 "df['track_id'] = encoder.transform(df['track_name'])"
-],
-"metadata": {
-"id": "V1bhU5rW6BSY"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
-"source": [
-"# df['artist_percent'] = df['artist_count'] / df['playlist_songs']\n",
-"df['song_percent'] = df['song_count'] / df['playlist_songs']\n",
-"# df['album_percent'] = df['album_count'] / df['playlist_songs']"
-],
+"execution_count": null,
 "metadata": {
 "id": "l6sUWKYC6DCw"
 },
-"execution_count": null,
-"outputs": []
+"outputs": [],
+"source": [
+"df['song_percent'] = df['song_count'] / df['playlist_songs']"
+]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "XxC0WnlL6EWz"
+},
+"outputs": [],
 "source": [
 "import numpy as np\n",
 "\n",
-"# Assuming you have a DataFrame 'df' with a column 'column_name'\n",
 "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
-],
-"metadata": {
-"id": "XxC0WnlL6EWz"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
-"source": [
-"artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n",
-"artists.head()"
-],
+"execution_count": null,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/",
@@ -375,19 +334,13 @@
 "id": "kbxBcQiX6F2v",
 "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9"
 },
-"execution_count": null,
 "outputs": [
 {
-"output_type": "execute_result",
 "data": {
-"text/plain": [
-" playlist_id artist_id album_id\n",
-"0 0 0 0\n",
-"1 0 1 1\n",
-"2 0 2 2\n",
-"3 0 3 3\n",
-"4 0 4 4"
-],
+"application/vnd.google.colaboratory.intrinsic+json": {
+"type": "dataframe",
+"variable_name": "artists"
+},
 "text/html": [
 "\n",
 " <div id=\"df-cedfd0c3-1f93-4a45-b95c-5d58bbf23f45\" class=\"colab-df-container\">\n",
@@ -658,30 +611,39 @@
 " </div>\n",
 " </div>\n"
 ],
-"application/vnd.google.colaboratory.intrinsic+json": {
-"type": "dataframe",
-"variable_name": "artists"
-}
+"text/plain": [
+" playlist_id artist_id album_id\n",
+"0 0 0 0\n",
+"1 0 1 1\n",
+"2 0 2 2\n",
+"3 0 3 3\n",
+"4 0 4 4"
+]
 },
+"execution_count": 18,
 "metadata": {},
-"execution_count": 18
+"output_type": "execute_result"
 }
+],
+"source": [
+"artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n",
+"artists.head()"
 ]
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "5HLSc9z36Izn"
+},
+"outputs": [],
 "source": [
 "X = artists.loc[:,['artist_id','album_id',]]\n",
 "y = artists.loc[:,'playlist_id',]\n",
 "\n",
 "# Split our data into training and test sets\n",
 "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
-],
-"metadata": {
-"id": "5HLSc9z36Izn"
-},
-"execution_count": null,
-"outputs": []
+]
 },
 {
 "cell_type": "code",
@@ -698,17 +660,7 @@
 },
 {
 "cell_type": "code",
-"source": [
-"from sklearn.metrics import precision_score, recall_score\n",
-"y_no_noise = y[labels_db != -1]\n",
-"labels_db_no_noise = labels_db[labels_db != -1]\n",
-"\n",
-"precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
-"recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
-"\n",
-"print(f'Precision: {precision}')\n",
-"print(f'Recall: {recall}')"
-],
+"execution_count": 27,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -716,33 +668,58 @@
 "id": "Osq-NpGu9V2k",
 "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
 },
-"execution_count": 27,
 "outputs": [
 {
-"output_type": "stream",
 "name": "stderr",
+"output_type": "stream",
 "text": [
 "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
 " _warn_prf(average, modifier, msg_start, len(result))\n"
 ]
 },
 {
-"output_type": "stream",
 "name": "stdout",
+"output_type": "stream",
 "text": [
 "Precision: 1.589262536579764e-05\n",
 "Recall: 9.606273770069471e-06\n"
 ]
 },
 {
-"output_type": "stream",
 "name": "stderr",
+"output_type": "stream",
 "text": [
 "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
 " _warn_prf(average, modifier, msg_start, len(result))\n"
 ]
 }
+],
+"source": [
+"from sklearn.metrics import precision_score, recall_score\n",
+"y_no_noise = y[labels_db != -1]\n",
+"labels_db_no_noise = labels_db[labels_db != -1]\n",
+"\n",
+"precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
+"recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
+"\n",
+"print(f'Precision: {precision}')\n",
+"print(f'Recall: {recall}')"
 ]
 }
-]
-}
+],
+"metadata": {
+"colab": {
+"machine_shape": "hm",
+"provenance": []
+},
+"kernelspec": {
+"display_name": "Python 3",
+"name": "python3"
+},
+"language_info": {
+"name": "python"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 0
+}
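To make the feature engineering in the cells above easier to follow, here is a small self-contained sketch of the song_count / playlist_songs / song_percent computation on a toy playlist frame. The toy rows are invented for illustration; the transformations mirror the notebook cells.

import numpy as np
import pandas as pd

# Toy stand-in for the exploded playlist data (invented rows).
df = pd.DataFrame({
    'pid':         [0, 0, 0, 1, 1],
    'pos':         [0, 1, 2, 0, 1],
    'artist_name': ['A', 'A', 'B', 'A', 'C'],
    'album_name':  ['X', 'X', 'Y', 'X', 'Z'],
    'track_name':  ['t1', 't2', 't3', 't1', 't4'],
})

# Unique tracks per (playlist, artist, album), as in the notebook.
df['song_count'] = df.groupby(['pid', 'artist_name', 'album_name'])['track_name'].transform('nunique')

# Playlist length from the max position (positions are 0-based, hence the +1).
df['playlist_songs'] = df.groupby('pid')['pos'].transform('max') + 1

# Share of the playlist covered by the album, then squashed with a sigmoid.
df['song_percent'] = df['song_count'] / df['playlist_songs']
df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

print(df[['pid', 'artist_name', 'album_name', 'song_percent']].drop_duplicates())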
notebooks/naive.ipynb
ADDED
@@ -0,0 +1,416 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": 1,
+"metadata": {
+"id": "KHnddFeW5hwh"
+},
+"outputs": [],
+"source": [
+"import os\n",
+"import urllib.request\n",
+"import zipfile\n",
+"import json\n",
+"import pandas as pd\n",
+"import time\n",
+"import torch\n",
+"import numpy as np\n",
+"import pandas as pd\n",
+"import torch.nn as nn\n",
+"import torch.nn.functional as F\n",
+"import torch.optim as optim\n",
+"from torch.utils.data import DataLoader, TensorDataset\n",
+"from sklearn.model_selection import train_test_split\n",
+"import matplotlib.pyplot as plt\n",
+"from sklearn.preprocessing import LabelEncoder"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "l7pGG_d85lzH"
+},
+"outputs": [],
+"source": [
+"from google.colab import drive\n",
+"drive.mount('/content/drive')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"metadata": {
+"id": "dL8TIlH55qSc"
+},
+"outputs": [],
+"source": [
+"import shutil\n",
+"import os\n",
+"\n",
+"def copy_file(src, dst):\n",
+" dst_dir = os.path.dirname(dst)\n",
+" if not os.path.exists(dst_dir):\n",
+" os.makedirs(dst_dir)\n",
+"\n",
+" shutil.copy2(src, dst)\n",
+"\n",
+"copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "LLy-YA775snY"
+},
+"outputs": [],
+"source": [
+"def unzip_archive(filepath, dir_path):\n",
+" with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
+" zip_ref.extractall(dir_path)\n",
+"\n",
+"unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "YtO0seclE1Pb"
+},
+"outputs": [],
+"source": [
+"import shutil\n",
+"\n",
+"def make_dir(directory):\n",
+" if os.path.exists(directory):\n",
+" shutil.rmtree(directory)\n",
+" os.makedirs(directory)\n",
+" else:\n",
+" os.makedirs(directory)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "UeqDk3_65vTt"
+},
+"outputs": [],
+"source": [
+"directory = os.getcwd() + '/data/raw/data'\n",
+"make_dir(directory)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "zMTup29b5wtO"
+},
+"outputs": [],
+"source": [
+"cols = [\n",
+" 'name',\n",
+" 'pid',\n",
+" 'num_followers',\n",
+" 'pos',\n",
+" 'artist_name',\n",
+" 'track_name',\n",
+" 'album_name'\n",
+"]"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "h6jQO9HT5zsG",
+"outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
+},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"mpd.slice.727000-727999.json\t100/1000\t10.0%"
+]
+}
+],
+"source": [
+"directory = os.getcwd() + '/data/raw/playlists/data'\n",
+"df = pd.DataFrame()\n",
+"index = 0\n",
+"\n",
+"for filename in os.listdir(directory):\n",
+" if os.path.isfile(os.path.join(directory, filename)):\n",
+" if filename.find('.json') != -1 :\n",
+" index += 1\n",
+"\n",
+" print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
+"\n",
+" full_path = os.path.join(directory, filename)\n",
+"\n",
+" with open(full_path, 'r') as file:\n",
+" json_data = json.load(file)\n",
+"\n",
+" temp = pd.DataFrame(json_data['playlists'])\n",
+" expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
+"\n",
+" json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
+" result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
+" result = result[cols]\n",
+"\n",
+" df = pd.concat([df, result], axis=0, ignore_index=True)\n",
+"\n",
+" if index % 50 == 0:\n",
+" df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n",
+" del df\n",
+" df = pd.DataFrame()\n",
+" if index % 100 == 0:\n",
+" break"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"metadata": {
+"id": "PngL0QHq516u"
+},
+"outputs": [],
+"source": [
+"import pyarrow.parquet as pq\n",
+"\n",
+"def read_parquet_folder(folder_path):\n",
+" dataframes = []\n",
+" for file in os.listdir(folder_path):\n",
+" if file.endswith('.parquet'):\n",
+" file_path = os.path.join(folder_path, file)\n",
+" df = pd.read_parquet(file_path)\n",
+" dataframes.append(df)\n",
+"\n",
+" return pd.concat(dataframes, ignore_index=True)\n",
+"\n",
+"folder_path = os.getcwd() + '/../data/raw/data'\n",
+"df = read_parquet_folder(folder_path)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"metadata": {
+"id": "peZyue6t57Mz"
+},
+"outputs": [],
+"source": [
+"def create_ids(df, col, name):\n",
+" value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
+" df[f'{name}_id'] = df[col].map(value_to_id)\n",
+"\n",
+" return df"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {
+"id": "p68WNyaf58rS"
+},
+"outputs": [],
+"source": [
+"df = create_ids(df, 'artist_name', 'artist')\n",
+"df = create_ids(df, 'pid', 'playlist')\n",
+"df = create_ids(df, 'album_name', 'album')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"metadata": {
+"id": "aSBKxRFa5-O_"
+},
+"outputs": [],
+"source": [
+"df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
+"\n",
+"df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
+"df['playlist_songs'] += 1"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 7,
+"metadata": {},
+"outputs": [],
+"source": [
+"df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
+"\n",
+"value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
+"df['artist_album_id'] = df['artist_album'].map(value_to_id)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 8,
+"metadata": {
+"id": "V1bhU5rW6BSY"
+},
+"outputs": [],
+"source": [
+"df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
+"\n",
+"encoder = LabelEncoder()\n",
+"encoder.fit(df['track_name'])\n",
+"df['track_id'] = encoder.transform(df['track_name'])"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"metadata": {
+"id": "l6sUWKYC6DCw"
+},
+"outputs": [],
+"source": [
+"df['song_percent'] = df['song_count'] / df['playlist_songs']"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 10,
+"metadata": {
+"id": "XxC0WnlL6EWz"
+},
+"outputs": [],
+"source": [
+"import numpy as np\n",
+"\n",
+"df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 16,
+"metadata": {
+"id": "5HLSc9z36Izn"
+},
+"outputs": [],
+"source": [
+"X = df.loc[:,['artist_id','album_id',]]\n",
+"y = df.loc[:,'song_percent',]\n",
+"\n",
+"# Split our data into training and test sets\n",
+"X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 17,
+"metadata": {},
+"outputs": [],
+"source": [
+"from sklearn.metrics import precision_score, recall_score"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 30,
+"metadata": {
+"id": "k47MaxR65Nq4"
+},
+"outputs": [],
+"source": [
+"class NaiveModel:\n",
+" def __init__(self, k=10):\n",
+" self.k = k\n",
+" self.top_k_items = None\n",
+"\n",
+" def fit(self, X, y):\n",
+" df = pd.DataFrame({'album_id': X['album_id'], 'song_percent': y})\n",
+" avg_ratings = df.groupby('album_id')['song_percent'].mean()\n",
+" self.top_k_items = avg_ratings.nlargest(self.k).index.tolist()\n",
+"\n",
+" def predict(self, X):\n",
+" return [self.top_k_items] * len(X)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 36,
+"metadata": {},
+"outputs": [],
+"source": [
+"def precision_recall(actual,pred, k):\n",
+" actuals = set(actual)\n",
+" preds = set(pred[:k])\n",
+" true_positives = len(actuals & preds)\n",
+" precision = true_positives / k\n",
+" recall = true_positives / len(actuals)\n",
+" return precision, recall"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 38,
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
+},
+"id": "Osq-NpGu9V2k",
+"outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
+},
+"outputs": [],
+"source": [
+"model = NaiveModel()\n",
+"model.fit(X_train, y_train)\n",
+"\n",
+"y_pred = model.predict(X_val)\n",
+"\n",
+"y_test_binary = (y_val >= 0.5).astype(int)\n",
+"y_test_items = X_val['album_id'][y_test_binary == 1].tolist()\n",
+"\n",
+"precisions = []\n",
+"recalls = []\n",
+"for i in range(len(X_val)):\n",
+" precision, recall = precision_recall(y_test_items, y_pred[i], k=10)\n",
+" precisions.append(precision)\n",
+" recalls.append(recall)\n",
+"\n",
+"precision = sum(precisions) / len(precisions)\n",
+"recall = sum(recalls) / len(recalls)\n",
+"\n",
+"print(f\"Precision: {precision}\")\n",
+"print(f\"Recall: {recall}\")"
+]
+}
+],
+"metadata": {
+"colab": {
+"machine_shape": "hm",
+"provenance": []
+},
+"kernelspec": {
+"display_name": "Python 3",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.6.15"
+},
+"nbformat": 4,
+"nbformat_minor": 0
+}
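As a quick illustration of the popularity baseline that naive.ipynb adds, the sketch below runs NaiveModel and precision_recall on a tiny invented dataset. The class and the metric function are copied from the cells above; only the toy data and the calls at the bottom are assumptions.

import pandas as pd

class NaiveModel:
    # Popularity baseline: recommend the k albums with the highest mean rating.
    def __init__(self, k=10):
        self.k = k
        self.top_k_items = None

    def fit(self, X, y):
        df = pd.DataFrame({'album_id': X['album_id'], 'song_percent': y})
        avg_ratings = df.groupby('album_id')['song_percent'].mean()
        self.top_k_items = avg_ratings.nlargest(self.k).index.tolist()

    def predict(self, X):
        # The same top-k list is returned for every row, regardless of the playlist.
        return [self.top_k_items] * len(X)

def precision_recall(actual, pred, k):
    actuals = set(actual)
    preds = set(pred[:k])
    true_positives = len(actuals & preds)
    return true_positives / k, true_positives / len(actuals)

# Invented toy data: two feature columns and a relevance score per row.
X = pd.DataFrame({'artist_id': [0, 0, 1, 2, 2, 3], 'album_id': [10, 10, 11, 12, 12, 13]})
y = pd.Series([0.9, 0.8, 0.4, 0.7, 0.6, 0.2], name='song_percent')

model = NaiveModel(k=2)
model.fit(X, y)
top_k = model.predict(X)[0]                 # same list for every row
relevant = X['album_id'][y >= 0.5].tolist()
print(top_k, precision_recall(relevant, top_k, k=2))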
notebooks/nn_collab_filter.ipynb
CHANGED
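The NNColabFiltering model this notebook trains is only partially visible in the hunks below (several lines are cut off in this view), so here is a generic, hedged sketch of a two-embedding plus MLP collaborative-filtering module of the same shape: playlist and item embeddings concatenated and passed through two linear layers. The class name, layer sizes, argument names, and the output scaling are assumptions for illustration, not the exact definition from this repository.

import torch
import torch.nn as nn
import torch.nn.functional as F

class CollabFilteringSketch(nn.Module):
    # Hypothetical stand-in for NNColabFiltering; dimensions and names are assumed.
    def __init__(self, n_playlists, n_items, embedding_dim=64, n_activations=128, rating_range=(0.0, 1.0)):
        super().__init__()
        self.playlist_embeddings = nn.Embedding(n_playlists, embedding_dim)
        self.item_embeddings = nn.Embedding(n_items, embedding_dim)
        self.fc1 = nn.Linear(2 * embedding_dim, n_activations)
        self.fc2 = nn.Linear(n_activations, 1)
        self.rating_range = rating_range

    def forward(self, X):
        # X[:, 0] holds playlist ids, X[:, 1] holds item ids.
        embedded_playlists = self.playlist_embeddings(X[:, 0])
        embedded_items = self.item_embeddings(X[:, 1])
        embeddings = torch.cat([embedded_playlists, embedded_items], dim=1)
        preds = F.relu(self.fc1(embeddings))
        preds = torch.sigmoid(self.fc2(preds))
        low, high = self.rating_range
        return low + (high - low) * preds

# Toy forward pass: a batch of (playlist_id, item_id) pairs.
model = CollabFilteringSketch(n_playlists=100, n_items=500)
batch = torch.tensor([[0, 3], [1, 42]], dtype=torch.long)
print(model(batch).shape)  # torch.Size([2, 1])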
|
@@ -48,25 +48,14 @@
|
|
| 48 |
},
|
| 49 |
"outputs": [],
|
| 50 |
"source": [
|
| 51 |
-
"# prompt: copy a file from another directory to current directory in python code and create folders if needed\n",
|
| 52 |
-
"\n",
|
| 53 |
"import shutil\n",
|
| 54 |
"import os\n",
|
| 55 |
"\n",
|
| 56 |
"def copy_file(src, dst):\n",
|
| 57 |
-
" \"\"\"\n",
|
| 58 |
-
" Copies a file from src to dst, creating any necessary directories.\n",
|
| 59 |
-
"\n",
|
| 60 |
-
" Args:\n",
|
| 61 |
-
" src: The path to the source file.\n",
|
| 62 |
-
" dst: The path to the destination file.\n",
|
| 63 |
-
" \"\"\"\n",
|
| 64 |
-
" # Create the destination directory if it doesn't exist.\n",
|
| 65 |
" dst_dir = os.path.dirname(dst)\n",
|
| 66 |
" if not os.path.exists(dst_dir):\n",
|
| 67 |
" os.makedirs(dst_dir)\n",
|
| 68 |
"\n",
|
| 69 |
-
" # Copy the file.\n",
|
| 70 |
" shutil.copy2(src, dst)\n",
|
| 71 |
"\n",
|
| 72 |
"# copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
|
|
@@ -84,7 +73,7 @@
|
|
| 84 |
" with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
|
| 85 |
" zip_ref.extractall(dir_path)\n",
|
| 86 |
"\n",
|
| 87 |
-
"unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
|
| 88 |
]
|
| 89 |
},
|
| 90 |
{
|
|
@@ -152,17 +141,14 @@
|
|
| 152 |
"directory = os.getcwd() + '/data/raw/playlists/data'\n",
|
| 153 |
"df = pd.DataFrame()\n",
|
| 154 |
"index = 0\n",
|
| 155 |
-
"
|
| 156 |
"for filename in os.listdir(directory):\n",
|
| 157 |
-
" # Check if the item is a file (not a subdirectory)\n",
|
| 158 |
" if os.path.isfile(os.path.join(directory, filename)):\n",
|
| 159 |
" if filename.find('.json') != -1 :\n",
|
| 160 |
" index += 1\n",
|
| 161 |
"\n",
|
| 162 |
-
" # Print the filename or perform operations on the file\n",
|
| 163 |
" print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
|
| 164 |
"\n",
|
| 165 |
-
" # If you need the full file path, you can use:\n",
|
| 166 |
" full_path = os.path.join(directory, filename)\n",
|
| 167 |
"\n",
|
| 168 |
" with open(full_path, 'r') as file:\n",
|
|
@@ -171,12 +157,9 @@
|
|
| 171 |
" temp = pd.DataFrame(json_data['playlists'])\n",
|
| 172 |
" expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
|
| 173 |
"\n",
|
| 174 |
-
" # Normalize the JSON data\n",
|
| 175 |
" json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
|
| 176 |
"\n",
|
| 177 |
-
" # Concatenate the original DataFrame with the normalized JSON data\n",
|
| 178 |
" result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
|
| 179 |
-
"\n",
|
| 180 |
" result = result[cols]\n",
|
| 181 |
"\n",
|
| 182 |
" df = pd.concat([df, result], axis=0, ignore_index=True)\n",
|
|
@@ -234,10 +217,8 @@
|
|
| 234 |
"outputs": [],
|
| 235 |
"source": [
|
| 236 |
"def create_ids(df, col, name):\n",
|
| 237 |
-
" # Create a dictionary mapping unique values to IDs\n",
|
| 238 |
" value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
|
| 239 |
"\n",
|
| 240 |
-
" # Create a new column with the IDs\n",
|
| 241 |
" df[f'{name}_id'] = df[col].map(value_to_id)\n",
|
| 242 |
" df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')\n",
|
| 243 |
"\n",
|
|
@@ -252,10 +233,10 @@
|
|
| 252 |
},
|
| 253 |
"outputs": [],
|
| 254 |
"source": [
|
| 255 |
-
"
|
| 256 |
"df = create_ids(df, 'pid', 'playlist')\n",
|
| 257 |
-
"
|
| 258 |
-
"
|
| 259 |
]
|
| 260 |
},
|
| 261 |
{
|
|
@@ -282,10 +263,8 @@
|
|
| 282 |
"source": [
|
| 283 |
"df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
|
| 284 |
"\n",
|
| 285 |
-
"# Step 2: Create a dictionary mapping unique combined values to IDs\n",
|
| 286 |
"value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
|
| 287 |
"\n",
|
| 288 |
-
"# Step 3: Map these IDs back to the DataFrame\n",
|
| 289 |
"df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
|
| 290 |
"\n",
|
| 291 |
"df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')\n",
|
|
@@ -300,32 +279,13 @@
|
|
| 300 |
},
|
| 301 |
"outputs": [],
|
| 302 |
"source": [
|
| 303 |
-
"# df = df.groupby(['playlist_id','artist_album','artist_album_id','playlist_songs']).agg({\n",
|
| 304 |
-
"# 'song_count': 'sum',\n",
|
| 305 |
-
"# 'track_name': '|'.join,\n",
|
| 306 |
-
"# 'track_name': '|'.join,\n",
|
| 307 |
-
"# }).reset_index()\n",
|
| 308 |
"df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
|
| 309 |
"\n",
|
| 310 |
-
"# Encode the genres data\n",
|
| 311 |
"encoder = LabelEncoder()\n",
|
| 312 |
"encoder.fit(df['track_name'])\n",
|
| 313 |
"df['track_id'] = encoder.transform(df['track_name'])"
|
| 314 |
]
|
| 315 |
},
|
| 316 |
-
{
|
| 317 |
-
"cell_type": "code",
|
| 318 |
-
"execution_count": null,
|
| 319 |
-
"metadata": {
|
| 320 |
-
"id": "r0YprWVe_LJ0"
|
| 321 |
-
},
|
| 322 |
-
"outputs": [],
|
| 323 |
-
"source": [
|
| 324 |
-
"# df['artist_count'] = df.groupby(['playlist_id','artist_id'])['song_id'].transform('nunique')\n",
|
| 325 |
-
"# df['album_count'] = df.groupby(['playlist_id','artist_id','album_id'])['song_id'].transform('nunique')\n",
|
| 326 |
-
"# df['song_count'] = df.groupby(['artist_id'])['song_id'].transform('count')"
|
| 327 |
-
]
|
| 328 |
-
},
|
| 329 |
{
|
| 330 |
"cell_type": "code",
|
| 331 |
"execution_count": null,
|
|
@@ -334,9 +294,7 @@
|
|
| 334 |
},
|
| 335 |
"outputs": [],
|
| 336 |
"source": [
|
| 337 |
-
"
|
| 338 |
-
"df['song_percent'] = df['song_count'] / df['playlist_songs']\n",
|
| 339 |
-
"# df['album_percent'] = df['album_count'] / df['playlist_songs']"
|
| 340 |
]
|
| 341 |
},
|
| 342 |
{
|
|
@@ -349,7 +307,6 @@
|
|
| 349 |
"source": [
|
| 350 |
"import numpy as np\n",
|
| 351 |
"\n",
|
| 352 |
-
"# Assuming you have a DataFrame 'df' with a column 'column_name'\n",
|
| 353 |
"df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
|
| 354 |
]
|
| 355 |
},
|
|
@@ -429,20 +386,20 @@
|
|
| 429 |
"source": [
|
| 430 |
"class NNColabFiltering(nn.Module):\n",
|
| 431 |
"\n",
|
| 432 |
-
" def __init__(self, n_playlists, n_artists,
|
| 433 |
" super().__init__()\n",
|
| 434 |
-
" self.
|
| 435 |
" self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)\n",
|
| 436 |
-
" self.fc1 = nn.Linear(
|
| 437 |
" self.fc2 = nn.Linear(n_activations,1)\n",
|
| 438 |
" self.rating_range = rating_range\n",
|
| 439 |
"\n",
|
| 440 |
" def forward(self, X):\n",
|
| 441 |
" # Get embeddings for minibatch\n",
|
| 442 |
-
"
|
| 443 |
" embedded_items = self.item_embeddings(X[:,1])\n",
|
| 444 |
-
" # Concatenate
|
| 445 |
-
" embeddings = torch.cat([
|
| 446 |
" # Pass embeddings through network\n",
|
| 447 |
" preds = self.fc1(embeddings)\n",
|
| 448 |
" preds = F.relu(preds)\n",
|
|
@@ -547,9 +504,9 @@
|
|
| 547 |
"source": [
|
| 548 |
"# Train the model\n",
|
| 549 |
"dataloaders = {'train':trainloader, 'val':valloader}\n",
|
| 550 |
-
"
|
| 551 |
"n_items = X.loc[:,'artist_album_id'].max()+1\n",
|
| 552 |
-
"model = NNColabFiltering(
|
| 553 |
"criterion = nn.MSELoss()\n",
|
| 554 |
"lr=0.001\n",
|
| 555 |
"n_epochs=10\n",
|
|
@@ -678,31 +635,26 @@
|
|
| 678 |
"def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):\n",
|
| 679 |
" model.eval()\n",
|
| 680 |
"\n",
|
|
|
|
|
|
|
| 681 |
"\n",
|
| 682 |
-
"
|
| 683 |
-
" user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)\n",
|
| 684 |
-
"\n",
|
| 685 |
-
" # Initialize tensor to store all predictions\n",
|
| 686 |
-
" all_predictions = torch.zeros(len(all_movie_ids), device=device)\n",
|
| 687 |
"\n",
|
| 688 |
-
" # Generate predictions in batches\n",
|
| 689 |
" with torch.no_grad():\n",
|
| 690 |
-
" for i in range(0, len(
|
| 691 |
-
"
|
| 692 |
-
"
|
| 693 |
"\n",
|
| 694 |
-
" input_tensor = torch.stack([
|
| 695 |
" batch_predictions = model(input_tensor).squeeze()\n",
|
| 696 |
" all_predictions[i:i+batch_size] = batch_predictions\n",
|
| 697 |
"\n",
|
| 698 |
-
" # Convert to numpy for easier handling\n",
|
| 699 |
" predictions = all_predictions.cpu().numpy()\n",
|
| 700 |
"\n",
|
| 701 |
" albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())\n",
|
| 702 |
"\n",
|
| 703 |
" unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)\n",
|
| 704 |
"\n",
|
| 705 |
-
" # Get top N recommendations\n",
|
| 706 |
" top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]\n",
|
| 707 |
" recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]\n",
|
| 708 |
"\n",
|
|
|
|
| 48 |
},
|
| 49 |
"outputs": [],
|
| 50 |
"source": [
|
|
|
| 51 |
"import shutil\n",
|
| 52 |
"import os\n",
|
| 53 |
"\n",
|
| 54 |
"def copy_file(src, dst):\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
" dst_dir = os.path.dirname(dst)\n",
|
| 56 |
" if not os.path.exists(dst_dir):\n",
|
| 57 |
" os.makedirs(dst_dir)\n",
|
| 58 |
"\n",
|
|
|
|
| 59 |
" shutil.copy2(src, dst)\n",
|
| 60 |
"\n",
|
| 61 |
"# copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
|
|
|
|
| 73 |
" with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
|
| 74 |
" zip_ref.extractall(dir_path)\n",
|
| 75 |
"\n",
|
| 76 |
+
"unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
|
| 77 |
]
|
| 78 |
},
|
| 79 |
{
|
|
|
|
| 141 |
"directory = os.getcwd() + '/data/raw/playlists/data'\n",
|
| 142 |
"df = pd.DataFrame()\n",
|
| 143 |
"index = 0\n",
|
| 144 |
+
"\n",
|
| 145 |
"for filename in os.listdir(directory):\n",
|
|
|
|
| 146 |
" if os.path.isfile(os.path.join(directory, filename)):\n",
|
| 147 |
" if filename.find('.json') != -1 :\n",
|
| 148 |
" index += 1\n",
|
| 149 |
"\n",
|
|
|
|
| 150 |
" print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
|
| 151 |
"\n",
|
|
|
|
| 152 |
" full_path = os.path.join(directory, filename)\n",
|
| 153 |
"\n",
|
| 154 |
" with open(full_path, 'r') as file:\n",
|
|
|
|
| 157 |
" temp = pd.DataFrame(json_data['playlists'])\n",
|
| 158 |
" expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
|
| 159 |
"\n",
|
|
|
|
| 160 |
" json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
|
| 161 |
"\n",
|
|
|
|
| 162 |
" result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
|
|
|
|
| 163 |
" result = result[cols]\n",
|
| 164 |
"\n",
|
| 165 |
" df = pd.concat([df, result], axis=0, ignore_index=True)\n",
|
|
|
|
| 217 |
"outputs": [],
|
| 218 |
"source": [
|
| 219 |
"def create_ids(df, col, name):\n",
|
|
|
|
| 220 |
" value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
|
| 221 |
"\n",
|
|
|
|
| 222 |
" df[f'{name}_id'] = df[col].map(value_to_id)\n",
|
| 223 |
" df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')\n",
|
| 224 |
"\n",
|
|
|
|
| 233 |
},
|
| 234 |
"outputs": [],
|
| 235 |
"source": [
|
| 236 |
+
"df = create_ids(df, 'artist_name', 'artist')\n",
|
| 237 |
"df = create_ids(df, 'pid', 'playlist')\n",
|
| 238 |
+
"df = create_ids(df, 'track_name', 'track')\n",
|
| 239 |
+
"df = create_ids(df, 'album_name', 'album')"
|
| 240 |
]
|
| 241 |
},
|
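create_ids above enumerates the unique values of a column, maps them to integer ids, and writes the id-to-value mapping to a CSV. A minimal, self-contained sketch of the same idea (the CSV write is pointed at a temporary directory here so the snippet runs anywhere):

import os
import tempfile
import pandas as pd

def create_ids_sketch(df, col, name, out_dir):
    # Enumerate unique values in first-seen order and map them to integer ids
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    df[f'{name}_id'] = df[col].map(value_to_id)
    # Persist the id <-> value mapping for later lookups
    df[[f'{name}_id', col]].drop_duplicates().to_csv(os.path.join(out_dir, f'{name}.csv'))
    return df

toy = pd.DataFrame({'artist_name': ['Artist A', 'Artist B', 'Artist A'], 'pid': [0, 0, 1]})
with tempfile.TemporaryDirectory() as out_dir:
    toy = create_ids_sketch(toy, 'artist_name', 'artist', out_dir)
print(toy)  # repeated artists share the same artist_id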
| 242 |
{
|
|
|
|
| 263 |
"source": [
|
| 264 |
"df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
|
| 265 |
"\n",
|
|
|
|
| 266 |
"value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
|
| 267 |
"\n",
|
|
|
|
| 268 |
"df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
|
| 269 |
"\n",
|
| 270 |
"df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')\n",
|
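The '::' join above only builds a composite artist/album key before it is mapped to an integer id; a quick illustration with made-up names:

import pandas as pd

df = pd.DataFrame({'artist_name': ['Artist 1', 'Artist 2'],
                   'album_name': ['Album X', 'Album Y']})

# Same composite-key construction as the cell above
df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
print(df['artist_album'].tolist())  # ['Artist 1::Album X', 'Artist 2::Album Y']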
|
|
|
| 279 |
},
|
| 280 |
"outputs": [],
|
| 281 |
"source": [
|
|
|
| 282 |
"df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
|
| 283 |
"\n",
|
|
|
|
| 284 |
"encoder = LabelEncoder()\n",
|
| 285 |
"encoder.fit(df['track_name'])\n",
|
| 286 |
"df['track_id'] = encoder.transform(df['track_name'])"
|
| 287 |
]
|
| 288 |
},
|
|
|
| 289 |
{
|
| 290 |
"cell_type": "code",
|
| 291 |
"execution_count": null,
|
|
|
|
| 294 |
},
|
| 295 |
"outputs": [],
|
| 296 |
"source": [
|
| 297 |
+
"df['song_percent'] = df['song_count'] / df['playlist_songs']"
|
|
|
|
|
|
|
| 298 |
]
|
| 299 |
},
|
| 300 |
{
|
|
|
|
| 307 |
"source": [
|
| 308 |
"import numpy as np\n",
|
| 309 |
"\n",
|
|
|
|
| 310 |
"df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
|
| 311 |
]
|
| 312 |
},
|
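A quick numeric check of the transform above: song_percent starts in (0, 1], and the logistic squash maps it onto roughly (0.502, 0.731], which becomes the implicit rating the model is later trained to predict.

import numpy as np

# Fraction of a playlist taken up by one artist/album, before the squash
raw = np.array([0.01, 0.25, 0.5, 1.0])

# Same transform as the cell above
squashed = 1 / (1 + np.exp(-raw))
print(squashed)  # ~[0.502, 0.562, 0.622, 0.731]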
|
|
|
| 386 |
"source": [
|
| 387 |
"class NNColabFiltering(nn.Module):\n",
|
| 388 |
"\n",
|
| 389 |
+
" def __init__(self, n_playlists, n_artists, embedding_dim_playlists, embedding_dim_items, n_activations, rating_range):\n",
|
| 390 |
" super().__init__()\n",
|
| 391 |
+
" self.playlist_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_playlists)\n",
|
| 392 |
" self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)\n",
|
| 393 |
+
" self.fc1 = nn.Linear(embedding_dim_playlists+embedding_dim_items,n_activations)\n",
|
| 394 |
" self.fc2 = nn.Linear(n_activations,1)\n",
|
| 395 |
" self.rating_range = rating_range\n",
|
| 396 |
"\n",
|
| 397 |
" def forward(self, X):\n",
|
| 398 |
" # Get embeddings for minibatch\n",
|
| 399 |
+
" embedded_playlists = self.playlist_embeddings(X[:,0])\n",
|
| 400 |
" embedded_items = self.item_embeddings(X[:,1])\n",
|
| 401 |
+
" # Concatenate playlist and item embeddings\n",
|
| 402 |
+
" embeddings = torch.cat([embedded_playlists,embedded_items],dim=1)\n",
|
| 403 |
" # Pass embeddings through network\n",
|
| 404 |
" preds = self.fc1(embeddings)\n",
|
| 405 |
" preds = F.relu(preds)\n",
|
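To make the shapes concrete: each input row is a (playlist_id, artist_album_id) pair, the two embeddings are looked up and concatenated, and the two linear layers reduce the result to one score inside rating_range. A minimal forward pass with toy sizes and untrained weights, assuming the NNColabFiltering class defined in the cell above:

import torch

# Toy dimensions; the real values come from the id counts further down
model = NNColabFiltering(n_playlists=10, n_artists=20,
                         embedding_dim_playlists=4, embedding_dim_items=4,
                         n_activations=8, rating_range=[0., 1.])

# A batch of two (playlist_id, artist_album_id) pairs
X = torch.tensor([[3, 7],
                  [5, 12]], dtype=torch.long)

with torch.no_grad():
    scores = model(X)  # shape (2, 1); the sigmoid keeps values inside rating_range
print(scores)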
|
|
|
| 504 |
"source": [
|
| 505 |
"# Train the model\n",
|
| 506 |
"dataloaders = {'train':trainloader, 'val':valloader}\n",
|
| 507 |
+
"n_playlists = X.loc[:,'playlist_id'].max()+1\n",
|
| 508 |
"n_items = X.loc[:,'artist_album_id'].max()+1\n",
|
| 509 |
+
"model = NNColabFiltering(n_playlists,n_items,embedding_dim_playlists=50, embedding_dim_items=50, n_activations = 100,rating_range=[0.,1.])\n",
|
| 510 |
"criterion = nn.MSELoss()\n",
|
| 511 |
"lr=0.001\n",
|
| 512 |
"n_epochs=10\n",
|
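The cell presumably continues by building an optimizer and running the training loop; a sketch of a continuation consistent with the hyperparameters above (the optimizer choice and the train_model helper, mirroring scripts/model.py, are assumptions):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# train_model as defined in scripts/model.py: returns per-epoch train/val loss
cost_paths = train_model(model, criterion, optimizer, dataloaders, device,
                         n_epochs, scheduler=None)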
|
|
|
| 635 |
"def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):\n",
|
| 636 |
" model.eval()\n",
|
| 637 |
"\n",
|
| 638 |
+
" all_album_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)\n",
|
| 639 |
+
" playlist_ids = torch.full((len(all_album_ids),), playlist_id, dtype=torch.long, device=device)\n",
|
| 640 |
"\n",
|
| 641 |
+
" all_predictions = torch.zeros(len(all_album_ids), device=device)\n",
|
|
|
|
| 642 |
"\n",
|
|
|
|
| 643 |
" with torch.no_grad():\n",
|
| 644 |
+
" for i in range(0, len(all_album_ids), batch_size):\n",
|
| 645 |
+
" batch_playlist_ids = playlist_ids[i:i+batch_size]\n",
|
| 646 |
+
" batch_album_ids = all_album_ids[i:i+batch_size]\n",
|
| 647 |
"\n",
|
| 648 |
+
" input_tensor = torch.stack([batch_playlist_ids, batch_album_ids], dim=1)\n",
|
| 649 |
" batch_predictions = model(input_tensor).squeeze()\n",
|
| 650 |
" all_predictions[i:i+batch_size] = batch_predictions\n",
|
| 651 |
"\n",
|
|
|
|
| 652 |
" predictions = all_predictions.cpu().numpy()\n",
|
| 653 |
"\n",
|
| 654 |
" albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())\n",
|
| 655 |
"\n",
|
| 656 |
" unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)\n",
|
| 657 |
"\n",
|
|
|
|
| 658 |
" top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]\n",
|
| 659 |
" recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]\n",
|
| 660 |
"\n",
|
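generate_recommendations returns raw artist_album_id values, so in practice they still have to be joined back to readable artist/album names through the artist_album.csv mapping written during feature building. A usage sketch (the paths follow the scripts below; the playlist id and the already-trained model are assumptions):

import os
import pandas as pd
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
artist_album = pd.read_csv(os.getcwd() + '/data/processed/artist_album.csv')
playlists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv')

# model: a trained NNColabFiltering network loaded or trained beforehand
recs = generate_recommendations(artist_album, playlists, model,
                                playlist_id=42, device=device, top_n=10)

# Join the recommended ids back to artist/album names for display
rec_names = artist_album.loc[artist_album['artist_album_id'].isin(recs),
                             ['artist_name', 'album_name']].drop_duplicates()
print(rec_names)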
scripts/build_features.py
CHANGED
|
@@ -1,49 +1,21 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
-
import urllib.request
|
| 3 |
-
import zipfile
|
| 4 |
-
import json
|
| 5 |
import pandas as pd
|
| 6 |
-
import time
|
| 7 |
-
import torch
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
-
import torch.nn as nn
|
| 11 |
-
import torch.nn.functional as F
|
| 12 |
-
import torch.optim as optim
|
| 13 |
-
from torch.utils.data import DataLoader, TensorDataset
|
| 14 |
-
from sklearn.model_selection import train_test_split
|
| 15 |
-
import matplotlib.pyplot as plt
|
| 16 |
from sklearn.preprocessing import LabelEncoder
|
| 17 |
import shutil
|
| 18 |
import os
|
| 19 |
-
import pyarrow.parquet as pq
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
cols = [
|
| 23 |
-
'name',
|
| 24 |
-
'pid',
|
| 25 |
-
'num_followers',
|
| 26 |
-
'pos',
|
| 27 |
-
'artist_name',
|
| 28 |
-
'track_name',
|
| 29 |
-
'album_name'
|
| 30 |
-
]
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def copy_file(src, dst):
|
| 34 |
-
|
| 35 |
-
dst_dir = os.path.dirname(dst)
|
| 36 |
-
if not os.path.exists(dst_dir):
|
| 37 |
-
os.makedirs(dst_dir)
|
| 38 |
-
|
| 39 |
-
shutil.copy2(src, dst)
|
| 40 |
-
|
| 41 |
-
def unzip_archive(filepath, dir_path):
|
| 42 |
-
with zipfile.ZipFile(f"{filepath}", 'r') as zip_ref:
|
| 43 |
-
zip_ref.extractall(dir_path)
|
| 44 |
-
|
| 45 |
|
| 46 |
def make_dir(directory):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
if os.path.exists(directory):
|
| 48 |
shutil.rmtree(directory)
|
| 49 |
os.makedirs(directory)
|
|
@@ -51,52 +23,73 @@ def make_dir(directory):
|
|
| 51 |
os.makedirs(directory)
|
| 52 |
|
| 53 |
|
| 54 |
-
def
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
with open(full_path, 'r') as file:
|
| 72 |
-
json_data = json.load(file)
|
| 73 |
-
|
| 74 |
-
temp = pd.DataFrame(json_data['playlists'])
|
| 75 |
-
expanded_df = temp.explode('tracks').reset_index(drop=True)
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
|
|
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
|
| 89 |
-
del df
|
| 90 |
-
df = pd.DataFrame()
|
| 91 |
-
if index % 200 == 0:
|
| 92 |
-
break
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
make_dataset()
|
| 102 |
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
| 3 |
import pandas as pd
|
|
|
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from sklearn.preprocessing import LabelEncoder
|
| 7 |
import shutil
|
| 8 |
import os
|
|
|
|
|
| 9 |
|
| 10 |
def make_dir(directory):
|
| 11 |
+
'''
|
| 12 |
+
Creates a new blank directory
|
| 13 |
+
|
| 14 |
+
Inputs:
|
| 15 |
+
directory: path to create a new directory at
|
| 16 |
+
Returns:
|
| 17 |
+
|
| 18 |
+
'''
|
| 19 |
if os.path.exists(directory):
|
| 20 |
shutil.rmtree(directory)
|
| 21 |
os.makedirs(directory)
|
|
|
|
| 23 |
os.makedirs(directory)
|
| 24 |
|
| 25 |
|
| 26 |
+
def read_parquet_folder(folder_path):
|
| 27 |
+
'''
|
| 28 |
+
Reads every parquet file in a folder and concatenates them into a single dataframe
|
| 29 |
+
|
| 30 |
+
Inputs:
|
| 31 |
+
folder_path: the folder path for the parquet files
|
| 32 |
+
Returns:
|
| 33 |
+
|
| 34 |
+
'''
|
| 35 |
+
dataframes = []
|
| 36 |
+
for file in os.listdir(folder_path):
|
| 37 |
+
if file.endswith('.parquet'):
|
| 38 |
+
file_path = os.path.join(folder_path, file)
|
| 39 |
+
df = pd.read_parquet(file_path)
|
| 40 |
+
dataframes.append(df)
|
| 41 |
+
|
| 42 |
+
return pd.concat(dataframes, ignore_index=True)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def create_ids(df, col, name):
|
| 46 |
+
'''
|
| 47 |
+
Creates unique ids for the features and creates mapping documents
|
| 48 |
+
|
| 49 |
+
Inputs:
|
| 50 |
+
df: dataframe with the features
|
| 51 |
+
col: column to create ids on
|
| 52 |
+
name: name of the newly created id
|
| 53 |
+
Returns:
|
| 54 |
+
df: dataframe with the mapped ids
|
| 55 |
+
|
| 56 |
+
'''
|
| 57 |
+
value_to_id = {val: i for i, val in enumerate(df[col].unique())}
|
| 58 |
+
|
| 59 |
+
df[f'{name}_id'] = df[col].map(value_to_id)
|
| 60 |
+
df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
|
| 61 |
+
|
| 62 |
+
return df
|
| 63 |
|
| 64 |
+
if __name__ == '__main__':
|
| 65 |
+
folder_path = os.getcwd() + '/data/raw/data'
|
| 66 |
+
df = read_parquet_folder(folder_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
+
directory = os.getcwd() + '/data/processed'
|
| 69 |
+
make_dir(directory)
|
| 70 |
|
| 71 |
+
df = create_ids(df, 'artist_name', 'artist')
|
| 72 |
+
df = create_ids(df, 'pid', 'playlist')
|
| 73 |
+
df = create_ids(df, 'album_name', 'album')
|
| 74 |
|
| 75 |
+
df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')
|
| 76 |
+
df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
|
| 77 |
+
df['playlist_songs'] += 1
|
| 78 |
|
| 79 |
+
df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
|
| 80 |
+
value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
|
| 81 |
+
df['artist_album_id'] = df['artist_album'].map(value_to_id)
|
| 82 |
+
|
| 83 |
+
df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')
|
| 84 |
|
| 85 |
+
df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
+
encoder = LabelEncoder()
|
| 88 |
+
encoder.fit(df['track_name'])
|
| 89 |
+
|
| 90 |
+
df['track_id'] = encoder.transform(df['track_name'])
|
| 91 |
+
df['song_percent'] = df['song_count'] / df['playlist_songs']
|
| 92 |
+
df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))
|
|
|
|
| 93 |
|
| 94 |
+
artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
|
| 95 |
+
artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
|
scripts/make_dataset.py
CHANGED
|
@@ -1,25 +1,63 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
import os
|
| 3 |
-
import urllib.request
|
| 4 |
import zipfile
|
| 5 |
import json
|
| 6 |
import pandas as pd
|
| 7 |
-
import time
|
| 8 |
-
import torch
|
| 9 |
-
import numpy as np
|
| 10 |
import pandas as pd
|
| 11 |
-
import torch.nn as nn
|
| 12 |
-
import torch.nn.functional as F
|
| 13 |
-
import torch.optim as optim
|
| 14 |
-
from torch.utils.data import DataLoader, TensorDataset
|
| 15 |
-
from sklearn.model_selection import train_test_split
|
| 16 |
-
import matplotlib.pyplot as plt
|
| 17 |
-
from sklearn.preprocessing import LabelEncoder
|
| 18 |
import shutil
|
| 19 |
import os
|
| 20 |
-
|
|
|
|
|
| 21 |
|
| 22 |
def make_dir(directory):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
if os.path.exists(directory):
|
| 24 |
shutil.rmtree(directory)
|
| 25 |
os.makedirs(directory)
|
|
@@ -27,56 +65,54 @@ def make_dir(directory):
|
|
| 27 |
os.makedirs(directory)
|
| 28 |
|
| 29 |
|
| 30 |
-
def
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
value_to_id = {val: i for i, val in enumerate(df[col].unique())}
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
-
folder_path = os.getcwd() + '/data/raw/data'
|
| 53 |
-
df = read_parquet_folder(folder_path)
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
df['playlist_songs'] += 1
|
| 65 |
|
| 66 |
-
|
| 67 |
-
value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
|
| 68 |
-
df['artist_album_id'] = df['artist_album'].map(value_to_id)
|
| 69 |
-
|
| 70 |
-
df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')
|
| 71 |
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
| 80 |
|
| 81 |
-
artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
|
| 82 |
-
artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import zipfile
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
import shutil
|
| 7 |
import os
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
cols = [
|
| 11 |
+
'name',
|
| 12 |
+
'pid',
|
| 13 |
+
'num_followers',
|
| 14 |
+
'pos',
|
| 15 |
+
'artist_name',
|
| 16 |
+
'track_name',
|
| 17 |
+
'album_name'
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def copy_file(src, dst):
|
| 22 |
+
'''
|
| 23 |
+
Copies a file from one dir to another
|
| 24 |
+
|
| 25 |
+
Inputs:
|
| 26 |
+
src: filepath to use as the source
|
| 27 |
+
dst: filepath to copy the file to
|
| 28 |
+
|
| 29 |
+
Returns:
|
| 30 |
+
|
| 31 |
+
'''
|
| 32 |
+
dst_dir = os.path.dirname(dst)
|
| 33 |
+
if not os.path.exists(dst_dir):
|
| 34 |
+
os.makedirs(dst_dir)
|
| 35 |
+
|
| 36 |
+
shutil.copy2(src, dst)
|
| 37 |
+
|
| 38 |
+
def unzip_archive(filepath, dir_path):
|
| 39 |
+
'''
|
| 40 |
+
Unzips a zipfile to the dir_path
|
| 41 |
+
|
| 42 |
+
Inputs:
|
| 43 |
+
filepath: filepath of the zip file
|
| 44 |
+
dir_path: path to extract the zip file contents to
|
| 45 |
+
Returns:
|
| 46 |
+
|
| 47 |
+
'''
|
| 48 |
+
with zipfile.ZipFile(f"{filepath}", 'r') as zip_ref:
|
| 49 |
+
zip_ref.extractall(dir_path)
|
| 50 |
+
|
| 51 |
|
| 52 |
def make_dir(directory):
|
| 53 |
+
'''
|
| 54 |
+
Creates a new blank directory
|
| 55 |
+
|
| 56 |
+
Inputs:
|
| 57 |
+
directory: path to create a new directory at
|
| 58 |
+
Returns:
|
| 59 |
+
|
| 60 |
+
'''
|
| 61 |
if os.path.exists(directory):
|
| 62 |
shutil.rmtree(directory)
|
| 63 |
os.makedirs(directory)
|
|
|
|
| 65 |
os.makedirs(directory)
|
| 66 |
|
| 67 |
|
| 68 |
+
def make_dataset():
|
| 69 |
+
'''
|
| 70 |
+
Creates the directory of parquet files used to build the
|
| 71 |
+
dataset; parquet is used to reduce the memory load
|
| 72 |
+
|
| 73 |
+
Inputs:
|
| 74 |
+
|
| 75 |
+
Returns:
|
| 76 |
+
|
| 77 |
+
'''
|
| 78 |
+
directory = os.getcwd() + '/data/raw/playlists/data'
|
| 79 |
+
df = pd.DataFrame()
|
| 80 |
+
index = 0
|
|
|
|
| 81 |
|
| 82 |
+
for filename in os.listdir(directory):
|
| 83 |
+
if os.path.isfile(os.path.join(directory, filename)):
|
| 84 |
+
if filename.find('.json') != -1 :
|
| 85 |
+
index += 1
|
| 86 |
|
| 87 |
+
print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
|
| 88 |
|
| 89 |
+
full_path = os.path.join(directory, filename)
|
|
|
|
|
|
|
| 90 |
|
| 91 |
+
with open(full_path, 'r') as file:
|
| 92 |
+
json_data = json.load(file)
|
| 93 |
|
| 94 |
+
temp = pd.DataFrame(json_data['playlists'])
|
| 95 |
+
expanded_df = temp.explode('tracks').reset_index(drop=True)
|
| 96 |
+
json_normalized = pd.json_normalize(expanded_df['tracks'])
|
| 97 |
|
| 98 |
+
result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
|
| 99 |
+
result = result[cols]
|
|
|
|
| 100 |
|
| 101 |
+
df = pd.concat([df, result], axis=0, ignore_index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
+
if index % 50 == 0:
|
| 104 |
+
df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
|
| 105 |
+
del df
|
| 106 |
+
df = pd.DataFrame()
|
| 107 |
+
if index % 200 == 0:
|
| 108 |
+
break
|
| 109 |
+
|
| 110 |
|
| 111 |
+
if __name__ == '__main__':
|
| 112 |
+
unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
|
| 113 |
+
directory = os.getcwd() + '/data/raw/data'
|
| 114 |
+
make_dir(directory)
|
| 115 |
+
directory = os.getcwd() + '/data/processed'
|
| 116 |
+
make_dir(directory)
|
| 117 |
+
make_dataset()
|
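The heart of make_dataset() is flattening each playlist JSON: explode turns the per-playlist track list into one row per track, and json_normalize expands the track dicts into columns. A self-contained sketch on a tiny made-up playlist:

import pandas as pd

# Minimal stand-in for one slice of the Million Playlist JSON
json_data = {'playlists': [
    {'name': 'road trip', 'pid': 0, 'num_followers': 3,
     'tracks': [
         {'pos': 0, 'artist_name': 'Artist 1', 'track_name': 'Song A', 'album_name': 'Album X'},
         {'pos': 1, 'artist_name': 'Artist 2', 'track_name': 'Song B', 'album_name': 'Album Y'},
     ]},
]}

temp = pd.DataFrame(json_data['playlists'])
expanded_df = temp.explode('tracks').reset_index(drop=True)   # one row per track
json_normalized = pd.json_normalize(expanded_df['tracks'])    # track dicts -> columns
result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
print(result[['name', 'pid', 'num_followers', 'pos', 'artist_name', 'track_name', 'album_name']])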
| 118 |
|
|
|
|
|
|
scripts/model.py
CHANGED
|
@@ -7,9 +7,6 @@ Brinnae Bent
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
| 10 |
-
import urllib.request
|
| 11 |
-
import zipfile
|
| 12 |
-
import json
|
| 13 |
import pandas as pd
|
| 14 |
import time
|
| 15 |
import torch
|
|
@@ -18,26 +15,8 @@ import pandas as pd
|
|
| 18 |
import torch.nn as nn
|
| 19 |
import torch.nn.functional as F
|
| 20 |
import torch.optim as optim
|
| 21 |
-
from torch.utils.data import
|
| 22 |
from sklearn.model_selection import train_test_split
|
| 23 |
-
import matplotlib.pyplot as plt
|
| 24 |
-
from sklearn.preprocessing import LabelEncoder
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
|
| 30 |
-
# Convert training and test data to TensorDatasets
|
| 31 |
-
trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
|
| 32 |
-
torch.from_numpy(np.array(y_train)).float())
|
| 33 |
-
valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
|
| 34 |
-
torch.from_numpy(np.array(y_val)).float())
|
| 35 |
-
|
| 36 |
-
# Create Dataloaders for our training and test data to allow us to iterate over minibatches
|
| 37 |
-
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
|
| 38 |
-
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
|
| 39 |
-
|
| 40 |
-
return trainloader, valloader
|
| 41 |
|
| 42 |
|
| 43 |
class NNColabFiltering(nn.Module):
|
|
@@ -64,9 +43,50 @@ class NNColabFiltering(nn.Module):
|
|
| 64 |
preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
|
| 65 |
return preds
|
| 66 |
|
| 67 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
since = time.time()
|
| 71 |
|
| 72 |
costpaths = {'train':[],'val':[]}
|
|
@@ -75,47 +95,36 @@ def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5,
|
|
| 75 |
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
|
| 76 |
print('-' * 10)
|
| 77 |
|
| 78 |
-
# Each epoch has a training and validation phase
|
| 79 |
for phase in ['train', 'val']:
|
| 80 |
if phase == 'train':
|
| 81 |
-
model.train()
|
| 82 |
else:
|
| 83 |
-
model.eval()
|
| 84 |
|
| 85 |
running_loss = 0.0
|
| 86 |
|
| 87 |
-
# Get the inputs and labels, and send to GPU if available
|
| 88 |
index = 0
|
| 89 |
for (inputs,labels) in dataloaders[phase]:
|
| 90 |
inputs = inputs.to(device)
|
| 91 |
labels = labels.to(device)
|
| 92 |
|
| 93 |
-
# Zero the weight gradients
|
| 94 |
optimizer.zero_grad()
|
| 95 |
|
| 96 |
-
# Forward pass to get outputs and calculate loss
|
| 97 |
-
# Track gradient only for training data
|
| 98 |
with torch.set_grad_enabled(phase == 'train'):
|
| 99 |
outputs = model.forward(inputs).view(-1)
|
| 100 |
loss = criterion(outputs, labels)
|
| 101 |
|
| 102 |
-
# Backpropagation to get the gradients with respect to each weight
|
| 103 |
-
# Only if in train
|
| 104 |
if phase == 'train':
|
| 105 |
loss.backward()
|
| 106 |
-
# Update the weights
|
| 107 |
optimizer.step()
|
| 108 |
|
| 109 |
-
# Convert loss into a scalar and add it to running_loss
|
| 110 |
running_loss += np.sqrt(loss.item()) * labels.size(0)
|
| 111 |
print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
|
| 112 |
index +=1
|
| 113 |
|
| 114 |
-
# Step along learning rate scheduler when in train
|
| 115 |
if (phase == 'train') and (scheduler is not None):
|
| 116 |
scheduler.step()
|
| 117 |
|
| 118 |
-
# Calculate and display average loss and accuracy for the epoch
|
| 119 |
epoch_loss = running_loss / len(dataloaders[phase].dataset)
|
| 120 |
costpaths[phase].append(epoch_loss)
|
| 121 |
print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
|
|
@@ -150,7 +159,6 @@ if __name__ == '__main__':
|
|
| 150 |
|
| 151 |
cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
|
| 152 |
|
| 153 |
-
|
| 154 |
# Save the entire model
|
| 155 |
torch.save(model, os.getcwd() + '/models/recommender.pt')
|
| 156 |
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
|
|
|
|
|
|
|
|
|
| 10 |
import pandas as pd
|
| 11 |
import time
|
| 12 |
import torch
|
|
|
|
| 15 |
import torch.nn as nn
|
| 16 |
import torch.nn.functional as F
|
| 17 |
import torch.optim as optim
|
| 18 |
+
from torch.utils.data import TensorDataset
|
| 19 |
from sklearn.model_selection import train_test_split
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
class NNColabFiltering(nn.Module):
|
|
|
|
| 43 |
preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
|
| 44 |
return preds
|
| 45 |
|
| 46 |
+
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
|
| 47 |
+
'''
|
| 48 |
+
Wraps the training and validation arrays in TensorDatasets and minibatch DataLoaders
|
| 49 |
+
|
| 50 |
+
Inputs:
|
| 51 |
+
X_train: training data features
|
| 52 |
+
y_train: training data target
|
| 53 |
+
X_val: validation data features
|
| 54 |
+
y_val: validation data targets
|
| 55 |
+
batch_size: the batch size to use
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
trainloader: training dataloader
|
| 59 |
+
valloader: validation dataloader
|
| 60 |
+
'''
|
| 61 |
+
# Convert training and test data to TensorDatasets
|
| 62 |
+
trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
|
| 63 |
+
torch.from_numpy(np.array(y_train)).float())
|
| 64 |
+
valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
|
| 65 |
+
torch.from_numpy(np.array(y_val)).float())
|
| 66 |
|
| 67 |
+
# Create Dataloaders for our training and test data to allow us to iterate over minibatches
|
| 68 |
+
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
|
| 69 |
+
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
|
| 70 |
+
|
| 71 |
+
return trainloader, valloader
|
| 72 |
+
|
| 73 |
+
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
|
| 74 |
+
'''
|
| 75 |
+
Runs the training loop over the train and validation dataloaders, tracking the loss per epoch
|
| 76 |
+
|
| 77 |
+
Inputs:
|
| 78 |
+
model: the model to train
|
| 79 |
+
criterion: the criterion to use to train
|
| 80 |
+
optimizer: the optimizer to use during training
|
| 81 |
+
dataloaders: the dict of dataloaders to use for training and validation
|
| 82 |
+
device: the torch defined cpu/gpu
|
| 83 |
+
num_epochs: number of epochs to use for training
|
| 84 |
+
scheduler: optional learning rate scheduler, stepped after each training epoch
|
| 85 |
+
|
| 86 |
+
Returns:
|
| 87 |
+
costpaths: the loss for each epoch for validation and training
|
| 88 |
+
'''
|
| 89 |
+
model = model.to(device)
|
| 90 |
since = time.time()
|
| 91 |
|
| 92 |
costpaths = {'train':[],'val':[]}
|
|
|
|
| 95 |
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
|
| 96 |
print('-' * 10)
|
| 97 |
|
|
|
|
| 98 |
for phase in ['train', 'val']:
|
| 99 |
if phase == 'train':
|
| 100 |
+
model.train()
|
| 101 |
else:
|
| 102 |
+
model.eval()
|
| 103 |
|
| 104 |
running_loss = 0.0
|
| 105 |
|
|
|
|
| 106 |
index = 0
|
| 107 |
for (inputs,labels) in dataloaders[phase]:
|
| 108 |
inputs = inputs.to(device)
|
| 109 |
labels = labels.to(device)
|
| 110 |
|
|
|
|
| 111 |
optimizer.zero_grad()
|
| 112 |
|
|
|
|
|
|
|
| 113 |
with torch.set_grad_enabled(phase == 'train'):
|
| 114 |
outputs = model.forward(inputs).view(-1)
|
| 115 |
loss = criterion(outputs, labels)
|
| 116 |
|
|
|
|
|
|
|
| 117 |
if phase == 'train':
|
| 118 |
loss.backward()
|
|
|
|
| 119 |
optimizer.step()
|
| 120 |
|
|
|
|
| 121 |
running_loss += np.sqrt(loss.item()) * labels.size(0)
|
| 122 |
print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
|
| 123 |
index +=1
|
| 124 |
|
|
|
|
| 125 |
if (phase == 'train') and (scheduler is not None):
|
| 126 |
scheduler.step()
|
| 127 |
|
|
|
|
| 128 |
epoch_loss = running_loss / len(dataloaders[phase].dataset)
|
| 129 |
costpaths[phase].append(epoch_loss)
|
| 130 |
print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
|
|
|
|
| 159 |
|
| 160 |
cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
|
| 161 |
|
|
|
|
| 162 |
# Save the entire model
|
| 163 |
torch.save(model, os.getcwd() + '/models/recommender.pt')
|
| 164 |
|
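For a quick smoke test of the utilities above, the whole loop can be run on synthetic data; everything below is made up and only shows how prep_dataloaders, NNColabFiltering, and train_model fit together (the import path is an assumption, adjust it to your layout):

import numpy as np
import torch
import torch.nn as nn

# Assumed import path for scripts/model.py
from model import NNColabFiltering, prep_dataloaders, train_model

# Synthetic (playlist_id, item_id) pairs with targets in (0.5, 0.731]
rng = np.random.default_rng(0)
X = rng.integers(0, 50, size=(1000, 2))
y = 1 / (1 + np.exp(-rng.uniform(0, 1, size=1000)))

trainloader, valloader = prep_dataloaders(X[:800], y[:800], X[800:], y[800:], batch_size=64)
dataloaders = {'train': trainloader, 'val': valloader}

net = NNColabFiltering(n_playlists=50, n_artists=50,
                       embedding_dim_playlists=8, embedding_dim_items=8,
                       n_activations=16, rating_range=[0., 1.])
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

costs = train_model(net, nn.MSELoss(), optimizer, dataloaders,
                    torch.device('cpu'), num_epochs=2)
print(costs)  # per-epoch train/val loss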