keesephillips committed on
Commit
6ce6b56
·
verified ·
1 Parent(s): 3284fa6

Added Naive model and comments

Browse files
main.py CHANGED
@@ -13,25 +13,15 @@ import os
13
  import numpy as np
14
  import pandas as pd
15
  import pandas as pd
16
- import json
17
- import matplotlib.pyplot as plt
18
-
19
  import os
20
- import urllib.request
21
- import zipfile
22
  import json
23
  import pandas as pd
24
- import time
25
  import torch
26
  import numpy as np
27
  import pandas as pd
28
  import torch.nn as nn
29
  import torch.nn.functional as F
30
- import torch.optim as optim
31
- from torch.utils.data import DataLoader, TensorDataset
32
- from sklearn.model_selection import train_test_split
33
  import matplotlib.pyplot as plt
34
- from sklearn.preprocessing import LabelEncoder
35
 
36
  class NNColabFiltering(nn.Module):
37
 
@@ -58,16 +48,29 @@ class NNColabFiltering(nn.Module):
58
  return preds
59
 
60
  def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):
61
- model.eval()
 
62
63
 
64
  all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)
65
  user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)
66
 
67
- # Initialize tensor to store all predictions
68
  all_predictions = torch.zeros(len(all_movie_ids), device=device)
69
 
70
- # Generate predictions in batches
71
  with torch.no_grad():
72
  for i in range(0, len(all_movie_ids), batch_size):
73
  batch_user_ids = user_ids[i:i+batch_size]
@@ -77,14 +80,10 @@ def generate_recommendations(artist_album, playlists, model, playlist_id, device
77
  batch_predictions = model(input_tensor).squeeze()
78
  all_predictions[i:i+batch_size] = batch_predictions
79
 
80
- # Convert to numpy for easier handling
81
  predictions = all_predictions.cpu().numpy()
82
-
83
  albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())
84
-
85
  unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)
86
 
87
- # Get top N recommendations
88
  top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]
89
  recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]
90
 
@@ -126,7 +125,6 @@ if __name__ == '__main__':
126
  trumpet = Image.open('assets/trumpet.png')
127
  img2.image(trumpet, use_column_width=True)
128
 
129
- # Using "with" notation
130
  with st.sidebar:
131
  playlist_name = st.selectbox(
132
  "Playlist Selection",
 
13
  import numpy as np
14
  import pandas as pd
15
  import pandas as pd
 
 
 
16
  import os
 
 
17
  import json
18
  import pandas as pd
 
19
  import torch
20
  import numpy as np
21
  import pandas as pd
22
  import torch.nn as nn
23
  import torch.nn.functional as F
 
 
 
24
  import matplotlib.pyplot as plt
 
25
 
26
  class NNColabFiltering(nn.Module):
27
 
 
48
  return preds
49
 
50
  def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):
51
+ '''
52
+ Generates the top-N album recommendations for a given playlist
53
 
54
+ Inputs:
55
+ artist_album: the dataframe containing the artist and album mappings
56
+ playlists: the dataframe containing the playlists' contents
57
+ model: the trained model
58
+ playlist_id: the playlist id to generate recommendations for
59
+ device: the gpu or cpu device defined by torch
60
+ top_n: the number of recommendations to generate
61
+ batch_size: the batch size to use
62
+
63
+ Returns:
64
+ album: the recommended albums
65
+ playlists: the recommended artists
66
+ '''
67
+ model.eval()
68
 
69
  all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)
70
  user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)
71
 
 
72
  all_predictions = torch.zeros(len(all_movie_ids), device=device)
73
 
 
74
  with torch.no_grad():
75
  for i in range(0, len(all_movie_ids), batch_size):
76
  batch_user_ids = user_ids[i:i+batch_size]
 
80
  batch_predictions = model(input_tensor).squeeze()
81
  all_predictions[i:i+batch_size] = batch_predictions
82
 
 
83
  predictions = all_predictions.cpu().numpy()
 
84
  albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())
 
85
  unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)
86
 
 
87
  top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]
88
  recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]
89
 
 
125
  trumpet = Image.open('assets/trumpet.png')
126
  img2.image(trumpet, use_column_width=True)
127
 
 
128
  with st.sidebar:
129
  playlist_name = st.selectbox(
130
  "Playlist Selection",
notebooks/dbscan.ipynb CHANGED
@@ -1,22 +1,12 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": [],
7
- "machine_shape": "hm"
8
- },
9
- "kernelspec": {
10
- "name": "python3",
11
- "display_name": "Python 3"
12
- },
13
- "language_info": {
14
- "name": "python"
15
- }
16
- },
17
  "cells": [
18
  {
19
  "cell_type": "code",
 
 
 
 
 
20
  "source": [
21
  "import os\n",
22
  "import urllib.request\n",
@@ -34,74 +24,63 @@
34
  "from sklearn.model_selection import train_test_split\n",
35
  "import matplotlib.pyplot as plt\n",
36
  "from sklearn.preprocessing import LabelEncoder"
37
- ],
38
- "metadata": {
39
- "id": "KHnddFeW5hwh"
40
- },
41
- "execution_count": null,
42
- "outputs": []
43
  },
44
  {
45
  "cell_type": "code",
46
- "source": [
47
- "from google.colab import drive\n",
48
- "drive.mount('/content/drive')"
49
- ],
50
  "metadata": {
51
  "id": "l7pGG_d85lzH"
52
  },
53
- "execution_count": null,
54
- "outputs": []
 
 
 
55
  },
56
  {
57
  "cell_type": "code",
 
 
 
 
 
58
  "source": [
59
- "# prompt: copy a file from another directory to current directory in python code and create folders if needed\n",
60
- "\n",
61
  "import shutil\n",
62
  "import os\n",
63
  "\n",
64
  "def copy_file(src, dst):\n",
65
- " \"\"\"\n",
66
- " Copies a file from src to dst, creating any necessary directories.\n",
67
- "\n",
68
- " Args:\n",
69
- " src: The path to the source file.\n",
70
- " dst: The path to the destination file.\n",
71
- " \"\"\"\n",
72
- " # Create the destination directory if it doesn't exist.\n",
73
  " dst_dir = os.path.dirname(dst)\n",
74
  " if not os.path.exists(dst_dir):\n",
75
  " os.makedirs(dst_dir)\n",
76
  "\n",
77
- " # Copy the file.\n",
78
  " shutil.copy2(src, dst)\n",
79
  "\n",
80
  "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
81
- ],
82
- "metadata": {
83
- "id": "dL8TIlH55qSc"
84
- },
85
- "execution_count": 3,
86
- "outputs": []
87
  },
88
  {
89
  "cell_type": "code",
 
 
 
 
 
90
  "source": [
91
  "def unzip_archive(filepath, dir_path):\n",
92
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
93
  " zip_ref.extractall(dir_path)\n",
94
  "\n",
95
  "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
96
- ],
97
- "metadata": {
98
- "id": "LLy-YA775snY"
99
- },
100
- "execution_count": null,
101
- "outputs": []
102
  },
103
  {
104
  "cell_type": "code",
 
 
 
 
 
105
  "source": [
106
  "import shutil\n",
107
  "\n",
@@ -111,29 +90,27 @@
111
  " os.makedirs(directory)\n",
112
  " else:\n",
113
  " os.makedirs(directory)"
114
- ],
115
- "metadata": {
116
- "id": "YtO0seclE1Pb"
117
- },
118
- "execution_count": null,
119
- "outputs": []
120
  },
121
  {
122
  "cell_type": "code",
123
- "source": [
124
- "\n",
125
- "\n",
126
- "directory = os.getcwd() + '/data/raw/data'\n",
127
- "make_dir(directory)"
128
- ],
129
  "metadata": {
130
  "id": "UeqDk3_65vTt"
131
  },
132
- "execution_count": null,
133
- "outputs": []
 
 
 
134
  },
135
  {
136
  "cell_type": "code",
 
 
 
 
 
137
  "source": [
138
  "cols = [\n",
139
  " 'name',\n",
@@ -144,15 +121,27 @@
144
  " 'track_name',\n",
145
  " 'album_name'\n",
146
  "]"
147
- ],
148
- "metadata": {
149
- "id": "zMTup29b5wtO"
150
- },
151
- "execution_count": null,
152
- "outputs": []
153
  },
154
  {
155
  "cell_type": "code",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "source": [
157
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
158
  "df = pd.DataFrame()\n",
@@ -192,27 +181,15 @@
192
  " df = pd.DataFrame()\n",
193
  " if index % 100 == 0:\n",
194
  " break"
195
- ],
196
- "metadata": {
197
- "colab": {
198
- "base_uri": "https://localhost:8080/"
199
- },
200
- "id": "h6jQO9HT5zsG",
201
- "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
202
- },
203
- "execution_count": null,
204
- "outputs": [
205
- {
206
- "output_type": "stream",
207
- "name": "stdout",
208
- "text": [
209
- "mpd.slice.727000-727999.json\t100/1000\t10.0%"
210
- ]
211
- }
212
  ]
213
  },
214
  {
215
  "cell_type": "code",
 
 
 
 
 
216
  "source": [
217
  "import pyarrow.parquet as pq\n",
218
  "\n",
@@ -228,27 +205,27 @@
228
  "\n",
229
  "folder_path = os.getcwd() + '/data/raw/data'\n",
230
  "df = read_parquet_folder(folder_path)"
231
- ],
232
- "metadata": {
233
- "id": "PngL0QHq516u"
234
- },
235
- "execution_count": null,
236
- "outputs": []
237
  },
238
  {
239
  "cell_type": "code",
240
- "source": [
241
- "directory = os.getcwd() + '/data/raw/mappings'\n",
242
- "make_dir(directory)"
243
- ],
244
  "metadata": {
245
  "id": "hdLpjr2153b_"
246
  },
247
- "execution_count": null,
248
- "outputs": []
 
 
 
249
  },
250
  {
251
  "cell_type": "code",
 
 
 
 
 
252
  "source": [
253
  "def create_ids(df, col, name):\n",
254
  " # Create a dictionary mapping unique values to IDs\n",
@@ -259,43 +236,42 @@
259
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n",
260
  "\n",
261
  " return df"
262
- ],
263
- "metadata": {
264
- "id": "peZyue6t57Mz"
265
- },
266
- "execution_count": null,
267
- "outputs": []
268
  },
269
  {
270
  "cell_type": "code",
 
 
 
 
 
271
  "source": [
272
  "df = create_ids(df, 'artist_name', 'artist')\n",
273
  "df = create_ids(df, 'pid', 'playlist')\n",
274
- "# df = create_ids(df, 'track_name', 'track')\n",
275
  "df = create_ids(df, 'album_name', 'album')"
276
- ],
277
- "metadata": {
278
- "id": "p68WNyaf58rS"
279
- },
280
- "execution_count": null,
281
- "outputs": []
282
  },
283
  {
284
  "cell_type": "code",
 
 
 
 
 
285
  "source": [
286
  "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
287
  "\n",
288
  "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
289
  "df['playlist_songs'] += 1"
290
- ],
291
- "metadata": {
292
- "id": "aSBKxRFa5-O_"
293
- },
294
- "execution_count": null,
295
- "outputs": []
296
  },
297
  {
298
  "cell_type": "code",
 
 
 
 
 
299
  "source": [
300
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
301
  "\n",
@@ -306,67 +282,50 @@
306
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
307
  "\n",
308
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n"
309
- ],
310
- "metadata": {
311
- "id": "4WqHH-pn5_nL"
312
- },
313
- "execution_count": null,
314
- "outputs": []
315
  },
316
  {
317
  "cell_type": "code",
 
 
 
 
 
318
  "source": [
319
- "# df = df.groupby(['playlist_id','artist_album','artist_album_id','playlist_songs']).agg({\n",
320
- "# 'song_count': 'sum',\n",
321
- "# 'track_name': '|'.join,\n",
322
- "# 'track_name': '|'.join,\n",
323
- "# }).reset_index()\n",
324
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
325
  "\n",
326
- "# Encode the genres data\n",
327
  "encoder = LabelEncoder()\n",
328
  "encoder.fit(df['track_name'])\n",
329
  "df['track_id'] = encoder.transform(df['track_name'])"
330
- ],
331
- "metadata": {
332
- "id": "V1bhU5rW6BSY"
333
- },
334
- "execution_count": null,
335
- "outputs": []
336
  },
337
  {
338
  "cell_type": "code",
339
- "source": [
340
- "# df['artist_percent'] = df['artist_count'] / df['playlist_songs']\n",
341
- "df['song_percent'] = df['song_count'] / df['playlist_songs']\n",
342
- "# df['album_percent'] = df['album_count'] / df['playlist_songs']"
343
- ],
344
  "metadata": {
345
  "id": "l6sUWKYC6DCw"
346
  },
347
- "execution_count": null,
348
- "outputs": []
 
 
349
  },
350
  {
351
  "cell_type": "code",
 
 
 
 
 
352
  "source": [
353
  "import numpy as np\n",
354
  "\n",
355
- "# Assuming you have a DataFrame 'df' with a column 'column_name'\n",
356
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
357
- ],
358
- "metadata": {
359
- "id": "XxC0WnlL6EWz"
360
- },
361
- "execution_count": null,
362
- "outputs": []
363
  },
364
  {
365
  "cell_type": "code",
366
- "source": [
367
- "artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n",
368
- "artists.head()"
369
- ],
370
  "metadata": {
371
  "colab": {
372
  "base_uri": "https://localhost:8080/",
@@ -375,19 +334,13 @@
375
  "id": "kbxBcQiX6F2v",
376
  "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9"
377
  },
378
- "execution_count": null,
379
  "outputs": [
380
  {
381
- "output_type": "execute_result",
382
  "data": {
383
- "text/plain": [
384
- " playlist_id artist_id album_id\n",
385
- "0 0 0 0\n",
386
- "1 0 1 1\n",
387
- "2 0 2 2\n",
388
- "3 0 3 3\n",
389
- "4 0 4 4"
390
- ],
391
  "text/html": [
392
  "\n",
393
  " <div id=\"df-cedfd0c3-1f93-4a45-b95c-5d58bbf23f45\" class=\"colab-df-container\">\n",
@@ -658,30 +611,39 @@
658
  " </div>\n",
659
  " </div>\n"
660
  ],
661
- "application/vnd.google.colaboratory.intrinsic+json": {
662
- "type": "dataframe",
663
- "variable_name": "artists"
664
- }
 
 
 
 
665
  },
 
666
  "metadata": {},
667
- "execution_count": 18
668
  }
 
 
 
 
669
  ]
670
  },
671
  {
672
  "cell_type": "code",
 
 
 
 
 
673
  "source": [
674
  "X = artists.loc[:,['artist_id','album_id',]]\n",
675
  "y = artists.loc[:,'playlist_id',]\n",
676
  "\n",
677
  "# Split our data into training and test sets\n",
678
  "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
679
- ],
680
- "metadata": {
681
- "id": "5HLSc9z36Izn"
682
- },
683
- "execution_count": null,
684
- "outputs": []
685
  },
686
  {
687
  "cell_type": "code",
@@ -698,17 +660,7 @@
698
  },
699
  {
700
  "cell_type": "code",
701
- "source": [
702
- "from sklearn.metrics import precision_score, recall_score\n",
703
- "y_no_noise = y[labels_db != -1]\n",
704
- "labels_db_no_noise = labels_db[labels_db != -1]\n",
705
- "\n",
706
- "precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
707
- "recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
708
- "\n",
709
- "print(f'Precision: {precision}')\n",
710
- "print(f'Recall: {recall}')"
711
- ],
712
  "metadata": {
713
  "colab": {
714
  "base_uri": "https://localhost:8080/"
@@ -716,33 +668,58 @@
716
  "id": "Osq-NpGu9V2k",
717
  "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
718
  },
719
- "execution_count": 27,
720
  "outputs": [
721
  {
722
- "output_type": "stream",
723
  "name": "stderr",
 
724
  "text": [
725
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
726
  " _warn_prf(average, modifier, msg_start, len(result))\n"
727
  ]
728
  },
729
  {
730
- "output_type": "stream",
731
  "name": "stdout",
 
732
  "text": [
733
  "Precision: 1.589262536579764e-05\n",
734
  "Recall: 9.606273770069471e-06\n"
735
  ]
736
  },
737
  {
738
- "output_type": "stream",
739
  "name": "stderr",
 
740
  "text": [
741
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
742
  " _warn_prf(average, modifier, msg_start, len(result))\n"
743
  ]
744
  }
 
 
 
745
  ]
746
  }
747
- ]
748
- }
 
 
1
  {
 
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "id": "KHnddFeW5hwh"
8
+ },
9
+ "outputs": [],
10
  "source": [
11
  "import os\n",
12
  "import urllib.request\n",
 
24
  "from sklearn.model_selection import train_test_split\n",
25
  "import matplotlib.pyplot as plt\n",
26
  "from sklearn.preprocessing import LabelEncoder"
27
+ ]
 
 
 
 
 
28
  },
29
  {
30
  "cell_type": "code",
31
+ "execution_count": null,
 
 
 
32
  "metadata": {
33
  "id": "l7pGG_d85lzH"
34
  },
35
+ "outputs": [],
36
+ "source": [
37
+ "from google.colab import drive\n",
38
+ "drive.mount('/content/drive')"
39
+ ]
40
  },
41
  {
42
  "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {
45
+ "id": "dL8TIlH55qSc"
46
+ },
47
+ "outputs": [],
48
  "source": [
 
 
49
  "import shutil\n",
50
  "import os\n",
51
  "\n",
52
  "def copy_file(src, dst):\n",
 
 
 
 
 
 
 
 
53
  " dst_dir = os.path.dirname(dst)\n",
54
  " if not os.path.exists(dst_dir):\n",
55
  " os.makedirs(dst_dir)\n",
56
  "\n",
 
57
  " shutil.copy2(src, dst)\n",
58
  "\n",
59
  "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
60
+ ]
 
 
 
 
 
61
  },
62
  {
63
  "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {
66
+ "id": "LLy-YA775snY"
67
+ },
68
+ "outputs": [],
69
  "source": [
70
  "def unzip_archive(filepath, dir_path):\n",
71
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
72
  " zip_ref.extractall(dir_path)\n",
73
  "\n",
74
  "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
75
+ ]
 
 
 
 
 
76
  },
77
  {
78
  "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {
81
+ "id": "YtO0seclE1Pb"
82
+ },
83
+ "outputs": [],
84
  "source": [
85
  "import shutil\n",
86
  "\n",
 
90
  " os.makedirs(directory)\n",
91
  " else:\n",
92
  " os.makedirs(directory)"
93
+ ]
 
 
 
 
 
94
  },
95
  {
96
  "cell_type": "code",
97
+ "execution_count": null,
 
 
 
 
 
98
  "metadata": {
99
  "id": "UeqDk3_65vTt"
100
  },
101
+ "outputs": [],
102
+ "source": [
103
+ "directory = os.getcwd() + '/data/raw/data'\n",
104
+ "make_dir(directory)"
105
+ ]
106
  },
107
  {
108
  "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {
111
+ "id": "zMTup29b5wtO"
112
+ },
113
+ "outputs": [],
114
  "source": [
115
  "cols = [\n",
116
  " 'name',\n",
 
121
  " 'track_name',\n",
122
  " 'album_name'\n",
123
  "]"
124
+ ]
 
 
 
 
 
125
  },
126
  {
127
  "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "colab": {
131
+ "base_uri": "https://localhost:8080/"
132
+ },
133
+ "id": "h6jQO9HT5zsG",
134
+ "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
135
+ },
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "mpd.slice.727000-727999.json\t100/1000\t10.0%"
142
+ ]
143
+ }
144
+ ],
145
  "source": [
146
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
147
  "df = pd.DataFrame()\n",
 
181
  " df = pd.DataFrame()\n",
182
  " if index % 100 == 0:\n",
183
  " break"
 
 
184
  ]
185
  },
186
  {
187
  "cell_type": "code",
188
+ "execution_count": null,
189
+ "metadata": {
190
+ "id": "PngL0QHq516u"
191
+ },
192
+ "outputs": [],
193
  "source": [
194
  "import pyarrow.parquet as pq\n",
195
  "\n",
 
205
  "\n",
206
  "folder_path = os.getcwd() + '/data/raw/data'\n",
207
  "df = read_parquet_folder(folder_path)"
208
+ ]
 
 
 
 
 
209
  },
210
  {
211
  "cell_type": "code",
212
+ "execution_count": null,
 
 
 
213
  "metadata": {
214
  "id": "hdLpjr2153b_"
215
  },
216
+ "outputs": [],
217
+ "source": [
218
+ "directory = os.getcwd() + '/data/raw/mappings'\n",
219
+ "make_dir(directory)"
220
+ ]
221
  },
222
  {
223
  "cell_type": "code",
224
+ "execution_count": null,
225
+ "metadata": {
226
+ "id": "peZyue6t57Mz"
227
+ },
228
+ "outputs": [],
229
  "source": [
230
  "def create_ids(df, col, name):\n",
231
  " # Create a dictionary mapping unique values to IDs\n",
 
236
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n",
237
  "\n",
238
  " return df"
239
+ ]
 
 
 
 
 
240
  },
241
  {
242
  "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {
245
+ "id": "p68WNyaf58rS"
246
+ },
247
+ "outputs": [],
248
  "source": [
249
  "df = create_ids(df, 'artist_name', 'artist')\n",
250
  "df = create_ids(df, 'pid', 'playlist')\n",
 
251
  "df = create_ids(df, 'album_name', 'album')"
252
+ ]
 
 
 
 
 
253
  },
254
  {
255
  "cell_type": "code",
256
+ "execution_count": null,
257
+ "metadata": {
258
+ "id": "aSBKxRFa5-O_"
259
+ },
260
+ "outputs": [],
261
  "source": [
262
  "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
263
  "\n",
264
  "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
265
  "df['playlist_songs'] += 1"
266
+ ]
 
 
 
 
 
267
  },
268
  {
269
  "cell_type": "code",
270
+ "execution_count": null,
271
+ "metadata": {
272
+ "id": "4WqHH-pn5_nL"
273
+ },
274
+ "outputs": [],
275
  "source": [
276
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
277
  "\n",
 
282
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
283
  "\n",
284
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n"
285
+ ]
 
 
 
 
 
286
  },
287
  {
288
  "cell_type": "code",
289
+ "execution_count": null,
290
+ "metadata": {
291
+ "id": "V1bhU5rW6BSY"
292
+ },
293
+ "outputs": [],
294
  "source": [
 
 
 
 
 
295
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
296
  "\n",
 
297
  "encoder = LabelEncoder()\n",
298
  "encoder.fit(df['track_name'])\n",
299
  "df['track_id'] = encoder.transform(df['track_name'])"
300
+ ]
 
 
 
 
 
301
  },
302
  {
303
  "cell_type": "code",
304
+ "execution_count": null,
 
 
 
 
305
  "metadata": {
306
  "id": "l6sUWKYC6DCw"
307
  },
308
+ "outputs": [],
309
+ "source": [
310
+ "df['song_percent'] = df['song_count'] / df['playlist_songs']"
311
+ ]
312
  },
313
  {
314
  "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {
317
+ "id": "XxC0WnlL6EWz"
318
+ },
319
+ "outputs": [],
320
  "source": [
321
  "import numpy as np\n",
322
  "\n",
 
323
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
324
+ ]
 
 
 
 
 
325
  },
326
  {
327
  "cell_type": "code",
328
+ "execution_count": null,
 
 
 
329
  "metadata": {
330
  "colab": {
331
  "base_uri": "https://localhost:8080/",
 
334
  "id": "kbxBcQiX6F2v",
335
  "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9"
336
  },
 
337
  "outputs": [
338
  {
 
339
  "data": {
340
+ "application/vnd.google.colaboratory.intrinsic+json": {
341
+ "type": "dataframe",
342
+ "variable_name": "artists"
343
+ },
 
 
 
 
344
  "text/html": [
345
  "\n",
346
  " <div id=\"df-cedfd0c3-1f93-4a45-b95c-5d58bbf23f45\" class=\"colab-df-container\">\n",
 
611
  " </div>\n",
612
  " </div>\n"
613
  ],
614
+ "text/plain": [
615
+ " playlist_id artist_id album_id\n",
616
+ "0 0 0 0\n",
617
+ "1 0 1 1\n",
618
+ "2 0 2 2\n",
619
+ "3 0 3 3\n",
620
+ "4 0 4 4"
621
+ ]
622
  },
623
+ "execution_count": 18,
624
  "metadata": {},
625
+ "output_type": "execute_result"
626
  }
627
+ ],
628
+ "source": [
629
+ "artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n",
630
+ "artists.head()"
631
  ]
632
  },
633
  {
634
  "cell_type": "code",
635
+ "execution_count": null,
636
+ "metadata": {
637
+ "id": "5HLSc9z36Izn"
638
+ },
639
+ "outputs": [],
640
  "source": [
641
  "X = artists.loc[:,['artist_id','album_id',]]\n",
642
  "y = artists.loc[:,'playlist_id',]\n",
643
  "\n",
644
  "# Split our data into training and test sets\n",
645
  "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
646
+ ]
 
 
 
 
 
647
  },
648
  {
649
  "cell_type": "code",
 
660
  },
661
  {
662
  "cell_type": "code",
663
+ "execution_count": 27,
 
 
664
  "metadata": {
665
  "colab": {
666
  "base_uri": "https://localhost:8080/"
 
668
  "id": "Osq-NpGu9V2k",
669
  "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
670
  },
 
671
  "outputs": [
672
  {
 
673
  "name": "stderr",
674
+ "output_type": "stream",
675
  "text": [
676
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
677
  " _warn_prf(average, modifier, msg_start, len(result))\n"
678
  ]
679
  },
680
  {
 
681
  "name": "stdout",
682
+ "output_type": "stream",
683
  "text": [
684
  "Precision: 1.589262536579764e-05\n",
685
  "Recall: 9.606273770069471e-06\n"
686
  ]
687
  },
688
  {
 
689
  "name": "stderr",
690
+ "output_type": "stream",
691
  "text": [
692
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
693
  " _warn_prf(average, modifier, msg_start, len(result))\n"
694
  ]
695
  }
696
+ ],
697
+ "source": [
698
+ "from sklearn.metrics import precision_score, recall_score\n",
699
+ "y_no_noise = y[labels_db != -1]\n",
700
+ "labels_db_no_noise = labels_db[labels_db != -1]\n",
701
+ "\n",
702
+ "precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
703
+ "recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
704
+ "\n",
705
+ "print(f'Precision: {precision}')\n",
706
+ "print(f'Recall: {recall}')"
707
  ]
708
  }
709
+ ],
710
+ "metadata": {
711
+ "colab": {
712
+ "machine_shape": "hm",
713
+ "provenance": []
714
+ },
715
+ "kernelspec": {
716
+ "display_name": "Python 3",
717
+ "name": "python3"
718
+ },
719
+ "language_info": {
720
+ "name": "python"
721
+ }
722
+ },
723
+ "nbformat": 4,
724
+ "nbformat_minor": 0
725
+ }
notebooks/naive.ipynb ADDED
@@ -0,0 +1,416 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "id": "KHnddFeW5hwh"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "import os\n",
12
+ "import urllib.request\n",
13
+ "import zipfile\n",
14
+ "import json\n",
15
+ "import pandas as pd\n",
16
+ "import time\n",
17
+ "import torch\n",
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import torch.nn as nn\n",
21
+ "import torch.nn.functional as F\n",
22
+ "import torch.optim as optim\n",
23
+ "from torch.utils.data import DataLoader, TensorDataset\n",
24
+ "from sklearn.model_selection import train_test_split\n",
25
+ "import matplotlib.pyplot as plt\n",
26
+ "from sklearn.preprocessing import LabelEncoder"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {
33
+ "id": "l7pGG_d85lzH"
34
+ },
35
+ "outputs": [],
36
+ "source": [
37
+ "from google.colab import drive\n",
38
+ "drive.mount('/content/drive')"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {
45
+ "id": "dL8TIlH55qSc"
46
+ },
47
+ "outputs": [],
48
+ "source": [
49
+ "import shutil\n",
50
+ "import os\n",
51
+ "\n",
52
+ "def copy_file(src, dst):\n",
53
+ " dst_dir = os.path.dirname(dst)\n",
54
+ " if not os.path.exists(dst_dir):\n",
55
+ " os.makedirs(dst_dir)\n",
56
+ "\n",
57
+ " shutil.copy2(src, dst)\n",
58
+ "\n",
59
+ "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {
66
+ "id": "LLy-YA775snY"
67
+ },
68
+ "outputs": [],
69
+ "source": [
70
+ "def unzip_archive(filepath, dir_path):\n",
71
+ " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
72
+ " zip_ref.extractall(dir_path)\n",
73
+ "\n",
74
+ "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {
81
+ "id": "YtO0seclE1Pb"
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "import shutil\n",
86
+ "\n",
87
+ "def make_dir(directory):\n",
88
+ " if os.path.exists(directory):\n",
89
+ " shutil.rmtree(directory)\n",
90
+ " os.makedirs(directory)\n",
91
+ " else:\n",
92
+ " os.makedirs(directory)"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": null,
98
+ "metadata": {
99
+ "id": "UeqDk3_65vTt"
100
+ },
101
+ "outputs": [],
102
+ "source": [
103
+ "directory = os.getcwd() + '/data/raw/data'\n",
104
+ "make_dir(directory)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {
111
+ "id": "zMTup29b5wtO"
112
+ },
113
+ "outputs": [],
114
+ "source": [
115
+ "cols = [\n",
116
+ " 'name',\n",
117
+ " 'pid',\n",
118
+ " 'num_followers',\n",
119
+ " 'pos',\n",
120
+ " 'artist_name',\n",
121
+ " 'track_name',\n",
122
+ " 'album_name'\n",
123
+ "]"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "colab": {
131
+ "base_uri": "https://localhost:8080/"
132
+ },
133
+ "id": "h6jQO9HT5zsG",
134
+ "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
135
+ },
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "mpd.slice.727000-727999.json\t100/1000\t10.0%"
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "directory = os.getcwd() + '/data/raw/playlists/data'\n",
147
+ "df = pd.DataFrame()\n",
148
+ "index = 0\n",
149
+ "\n",
150
+ "for filename in os.listdir(directory):\n",
151
+ " if os.path.isfile(os.path.join(directory, filename)):\n",
152
+ " if filename.find('.json') != -1 :\n",
153
+ " index += 1\n",
154
+ "\n",
155
+ " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
156
+ "\n",
157
+ " full_path = os.path.join(directory, filename)\n",
158
+ "\n",
159
+ " with open(full_path, 'r') as file:\n",
160
+ " json_data = json.load(file)\n",
161
+ "\n",
162
+ " temp = pd.DataFrame(json_data['playlists'])\n",
163
+ " expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
164
+ "\n",
165
+ " json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
166
+ " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
167
+ " result = result[cols]\n",
168
+ "\n",
169
+ " df = pd.concat([df, result], axis=0, ignore_index=True)\n",
170
+ "\n",
171
+ " if index % 50 == 0:\n",
172
+ " df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n",
173
+ " del df\n",
174
+ " df = pd.DataFrame()\n",
175
+ " if index % 100 == 0:\n",
176
+ " break"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 3,
182
+ "metadata": {
183
+ "id": "PngL0QHq516u"
184
+ },
185
+ "outputs": [],
186
+ "source": [
187
+ "import pyarrow.parquet as pq\n",
188
+ "\n",
189
+ "def read_parquet_folder(folder_path):\n",
190
+ " dataframes = []\n",
191
+ " for file in os.listdir(folder_path):\n",
192
+ " if file.endswith('.parquet'):\n",
193
+ " file_path = os.path.join(folder_path, file)\n",
194
+ " df = pd.read_parquet(file_path)\n",
195
+ " dataframes.append(df)\n",
196
+ "\n",
197
+ " return pd.concat(dataframes, ignore_index=True)\n",
198
+ "\n",
199
+ "folder_path = os.getcwd() + '/../data/raw/data'\n",
200
+ "df = read_parquet_folder(folder_path)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 4,
206
+ "metadata": {
207
+ "id": "peZyue6t57Mz"
208
+ },
209
+ "outputs": [],
210
+ "source": [
211
+ "def create_ids(df, col, name):\n",
212
+ " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
213
+ " df[f'{name}_id'] = df[col].map(value_to_id)\n",
214
+ "\n",
215
+ " return df"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 5,
221
+ "metadata": {
222
+ "id": "p68WNyaf58rS"
223
+ },
224
+ "outputs": [],
225
+ "source": [
226
+ "df = create_ids(df, 'artist_name', 'artist')\n",
227
+ "df = create_ids(df, 'pid', 'playlist')\n",
228
+ "df = create_ids(df, 'album_name', 'album')"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 6,
234
+ "metadata": {
235
+ "id": "aSBKxRFa5-O_"
236
+ },
237
+ "outputs": [],
238
+ "source": [
239
+ "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
240
+ "\n",
241
+ "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
242
+ "df['playlist_songs'] += 1"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 7,
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
252
+ "\n",
253
+ "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
254
+ "df['artist_album_id'] = df['artist_album'].map(value_to_id)"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 8,
260
+ "metadata": {
261
+ "id": "V1bhU5rW6BSY"
262
+ },
263
+ "outputs": [],
264
+ "source": [
265
+ "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
266
+ "\n",
267
+ "encoder = LabelEncoder()\n",
268
+ "encoder.fit(df['track_name'])\n",
269
+ "df['track_id'] = encoder.transform(df['track_name'])"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 9,
275
+ "metadata": {
276
+ "id": "l6sUWKYC6DCw"
277
+ },
278
+ "outputs": [],
279
+ "source": [
280
+ "df['song_percent'] = df['song_count'] / df['playlist_songs']"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": 10,
286
+ "metadata": {
287
+ "id": "XxC0WnlL6EWz"
288
+ },
289
+ "outputs": [],
290
+ "source": [
291
+ "import numpy as np\n",
292
+ "\n",
293
+ "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": 16,
299
+ "metadata": {
300
+ "id": "5HLSc9z36Izn"
301
+ },
302
+ "outputs": [],
303
+ "source": [
304
+ "X = df.loc[:,['artist_id','album_id',]]\n",
305
+ "y = df.loc[:,'song_percent',]\n",
306
+ "\n",
307
+ "# Split our data into training and test sets\n",
308
+ "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": 17,
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "from sklearn.metrics import precision_score, recall_score"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 30,
323
+ "metadata": {
324
+ "id": "k47MaxR65Nq4"
325
+ },
326
+ "outputs": [],
327
+ "source": [
328
+ "class NaiveModel:\n",
329
+ " def __init__(self, k=10):\n",
330
+ " self.k = k\n",
331
+ " self.top_k_items = None\n",
332
+ "\n",
333
+ " def fit(self, X, y):\n",
334
+ " df = pd.DataFrame({'album_id': X['album_id'], 'song_percent': y})\n",
335
+ " avg_ratings = df.groupby('album_id')['song_percent'].mean()\n",
336
+ " self.top_k_items = avg_ratings.nlargest(self.k).index.tolist()\n",
337
+ "\n",
338
+ " def predict(self, X):\n",
339
+ " return [self.top_k_items] * len(X)"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": 36,
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "def precision_recall(actual,pred, k):\n",
349
+ " actuals = set(actual)\n",
350
+ " preds = set(pred[:k])\n",
351
+ " true_positives = len(actuals & preds)\n",
352
+ " precision = true_positives / k\n",
353
+ " recall = true_positives / len(actuals)\n",
354
+ " return precision, recall"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 38,
360
+ "metadata": {
361
+ "colab": {
362
+ "base_uri": "https://localhost:8080/"
363
+ },
364
+ "id": "Osq-NpGu9V2k",
365
+ "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
366
+ },
367
+ "outputs": [],
368
+ "source": [
369
+ "model = NaiveModel()\n",
370
+ "model.fit(X_train, y_train)\n",
371
+ "\n",
372
+ "y_pred = model.predict(X_val)\n",
373
+ "\n",
374
+ "y_test_binary = (y_val >= 0.5).astype(int)\n",
375
+ "y_test_items = X_val['album_id'][y_test_binary == 1].tolist()\n",
376
+ "\n",
377
+ "precisions = []\n",
378
+ "recalls = []\n",
379
+ "for i in range(len(X_val)):\n",
380
+ " precision, recall = precision_recall(y_test_items, y_pred[i], k=10)\n",
381
+ " precisions.append(precision)\n",
382
+ " recalls.append(recall)\n",
383
+ "\n",
384
+ "precision = sum(precisions) / len(precisions)\n",
385
+ "recall = sum(recalls) / len(recalls)\n",
386
+ "\n",
387
+ "print(f\"Precision: {precision}\")\n",
388
+ "print(f\"Recall: {recall}\")"
389
+ ]
390
+ }
391
+ ],
392
+ "metadata": {
393
+ "colab": {
394
+ "machine_shape": "hm",
395
+ "provenance": []
396
+ },
397
+ "kernelspec": {
398
+ "display_name": "Python 3",
399
+ "name": "python3"
400
+ },
401
+ "language_info": {
402
+ "codemirror_mode": {
403
+ "name": "ipython",
404
+ "version": 3
405
+ },
406
+ "file_extension": ".py",
407
+ "mimetype": "text/x-python",
408
+ "name": "python",
409
+ "nbconvert_exporter": "python",
410
+ "pygments_lexer": "ipython3",
411
+ "version": "3.6.15"
412
+ }
413
+ },
414
+ "nbformat": 4,
415
+ "nbformat_minor": 0
416
+ }
notebooks/nn_collab_filter.ipynb CHANGED
@@ -48,25 +48,14 @@
48
  },
49
  "outputs": [],
50
  "source": [
51
- "# prompt: copy a file from another directory to current directory in python code and create folders if needed\n",
52
- "\n",
53
  "import shutil\n",
54
  "import os\n",
55
  "\n",
56
  "def copy_file(src, dst):\n",
57
- " \"\"\"\n",
58
- " Copies a file from src to dst, creating any necessary directories.\n",
59
- "\n",
60
- " Args:\n",
61
- " src: The path to the source file.\n",
62
- " dst: The path to the destination file.\n",
63
- " \"\"\"\n",
64
- " # Create the destination directory if it doesn't exist.\n",
65
  " dst_dir = os.path.dirname(dst)\n",
66
  " if not os.path.exists(dst_dir):\n",
67
  " os.makedirs(dst_dir)\n",
68
  "\n",
69
- " # Copy the file.\n",
70
  " shutil.copy2(src, dst)\n",
71
  "\n",
72
  "# copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
@@ -84,7 +73,7 @@
84
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
85
  " zip_ref.extractall(dir_path)\n",
86
  "\n",
87
- "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
88
  ]
89
  },
90
  {
@@ -152,17 +141,14 @@
152
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
153
  "df = pd.DataFrame()\n",
154
  "index = 0\n",
155
- "# Loop through all files in the directory\n",
156
  "for filename in os.listdir(directory):\n",
157
- " # Check if the item is a file (not a subdirectory)\n",
158
  " if os.path.isfile(os.path.join(directory, filename)):\n",
159
  " if filename.find('.json') != -1 :\n",
160
  " index += 1\n",
161
  "\n",
162
- " # Print the filename or perform operations on the file\n",
163
  " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
164
  "\n",
165
- " # If you need the full file path, you can use:\n",
166
  " full_path = os.path.join(directory, filename)\n",
167
  "\n",
168
  " with open(full_path, 'r') as file:\n",
@@ -171,12 +157,9 @@
171
  " temp = pd.DataFrame(json_data['playlists'])\n",
172
  " expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
173
  "\n",
174
- " # Normalize the JSON data\n",
175
  " json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
176
  "\n",
177
- " # Concatenate the original DataFrame with the normalized JSON data\n",
178
  " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
179
- "\n",
180
  " result = result[cols]\n",
181
  "\n",
182
  " df = pd.concat([df, result], axis=0, ignore_index=True)\n",
@@ -234,10 +217,8 @@
234
  "outputs": [],
235
  "source": [
236
  "def create_ids(df, col, name):\n",
237
- " # Create a dictionary mapping unique values to IDs\n",
238
  " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
239
  "\n",
240
- " # Create a new column with the IDs\n",
241
  " df[f'{name}_id'] = df[col].map(value_to_id)\n",
242
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')\n",
243
  "\n",
@@ -252,10 +233,10 @@
252
  },
253
  "outputs": [],
254
  "source": [
255
- "# df = create_ids(df, 'artist_name', 'artist')\n",
256
  "df = create_ids(df, 'pid', 'playlist')\n",
257
- "# df = create_ids(df, 'track_name', 'track')\n",
258
- "# df = create_ids(df, 'album_name', 'album')"
259
  ]
260
  },
261
  {
@@ -282,10 +263,8 @@
282
  "source": [
283
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
284
  "\n",
285
- "# Step 2: Create a dictionary mapping unique combined values to IDs\n",
286
  "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
287
  "\n",
288
- "# Step 3: Map these IDs back to the DataFrame\n",
289
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
290
  "\n",
291
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')\n",
@@ -300,32 +279,13 @@
300
  },
301
  "outputs": [],
302
  "source": [
303
- "# df = df.groupby(['playlist_id','artist_album','artist_album_id','playlist_songs']).agg({\n",
304
- "# 'song_count': 'sum',\n",
305
- "# 'track_name': '|'.join,\n",
306
- "# 'track_name': '|'.join,\n",
307
- "# }).reset_index()\n",
308
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
309
  "\n",
310
- "# Encode the genres data\n",
311
  "encoder = LabelEncoder()\n",
312
  "encoder.fit(df['track_name'])\n",
313
  "df['track_id'] = encoder.transform(df['track_name'])"
314
  ]
315
  },
316
- {
317
- "cell_type": "code",
318
- "execution_count": null,
319
- "metadata": {
320
- "id": "r0YprWVe_LJ0"
321
- },
322
- "outputs": [],
323
- "source": [
324
- "# df['artist_count'] = df.groupby(['playlist_id','artist_id'])['song_id'].transform('nunique')\n",
325
- "# df['album_count'] = df.groupby(['playlist_id','artist_id','album_id'])['song_id'].transform('nunique')\n",
326
- "# df['song_count'] = df.groupby(['artist_id'])['song_id'].transform('count')"
327
- ]
328
- },
329
  {
330
  "cell_type": "code",
331
  "execution_count": null,
@@ -334,9 +294,7 @@
334
  },
335
  "outputs": [],
336
  "source": [
337
- "# df['artist_percent'] = df['artist_count'] / df['playlist_songs']\n",
338
- "df['song_percent'] = df['song_count'] / df['playlist_songs']\n",
339
- "# df['album_percent'] = df['album_count'] / df['playlist_songs']"
340
  ]
341
  },
342
  {
@@ -349,7 +307,6 @@
349
  "source": [
350
  "import numpy as np\n",
351
  "\n",
352
- "# Assuming you have a DataFrame 'df' with a column 'column_name'\n",
353
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
354
  ]
355
  },
@@ -429,20 +386,20 @@
429
  "source": [
430
  "class NNColabFiltering(nn.Module):\n",
431
  "\n",
432
- " def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):\n",
433
  " super().__init__()\n",
434
- " self.user_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_users)\n",
435
  " self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)\n",
436
- " self.fc1 = nn.Linear(embedding_dim_users+embedding_dim_items,n_activations)\n",
437
  " self.fc2 = nn.Linear(n_activations,1)\n",
438
  " self.rating_range = rating_range\n",
439
  "\n",
440
  " def forward(self, X):\n",
441
  " # Get embeddings for minibatch\n",
442
- " embedded_users = self.user_embeddings(X[:,0])\n",
443
  " embedded_items = self.item_embeddings(X[:,1])\n",
444
- " # Concatenate user and item embeddings\n",
445
- " embeddings = torch.cat([embedded_users,embedded_items],dim=1)\n",
446
  " # Pass embeddings through network\n",
447
  " preds = self.fc1(embeddings)\n",
448
  " preds = F.relu(preds)\n",
@@ -547,9 +504,9 @@
547
  "source": [
548
  "# Train the model\n",
549
  "dataloaders = {'train':trainloader, 'val':valloader}\n",
550
- "n_users = X.loc[:,'playlist_id'].max()+1\n",
551
  "n_items = X.loc[:,'artist_album_id'].max()+1\n",
552
- "model = NNColabFiltering(n_users,n_items,embedding_dim_users=50, embedding_dim_items=50, n_activations = 100,rating_range=[0.,1.])\n",
553
  "criterion = nn.MSELoss()\n",
554
  "lr=0.001\n",
555
  "n_epochs=10\n",
@@ -678,31 +635,26 @@
678
  "def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):\n",
679
  " model.eval()\n",
680
  "\n",
 
 
681
  "\n",
682
- " all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)\n",
683
- " user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)\n",
684
- "\n",
685
- " # Initialize tensor to store all predictions\n",
686
- " all_predictions = torch.zeros(len(all_movie_ids), device=device)\n",
687
  "\n",
688
- " # Generate predictions in batches\n",
689
  " with torch.no_grad():\n",
690
- " for i in range(0, len(all_movie_ids), batch_size):\n",
691
- " batch_user_ids = user_ids[i:i+batch_size]\n",
692
- " batch_movie_ids = all_movie_ids[i:i+batch_size]\n",
693
  "\n",
694
- " input_tensor = torch.stack([batch_user_ids, batch_movie_ids], dim=1)\n",
695
  " batch_predictions = model(input_tensor).squeeze()\n",
696
  " all_predictions[i:i+batch_size] = batch_predictions\n",
697
  "\n",
698
- " # Convert to numpy for easier handling\n",
699
  " predictions = all_predictions.cpu().numpy()\n",
700
  "\n",
701
  " albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())\n",
702
  "\n",
703
  " unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)\n",
704
  "\n",
705
- " # Get top N recommendations\n",
706
  " top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]\n",
707
  " recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]\n",
708
  "\n",
 
48
  },
49
  "outputs": [],
50
  "source": [
 
 
51
  "import shutil\n",
52
  "import os\n",
53
  "\n",
54
  "def copy_file(src, dst):\n",
 
 
 
 
 
 
 
 
55
  " dst_dir = os.path.dirname(dst)\n",
56
  " if not os.path.exists(dst_dir):\n",
57
  " os.makedirs(dst_dir)\n",
58
  "\n",
 
59
  " shutil.copy2(src, dst)\n",
60
  "\n",
61
  "# copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
 
73
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
74
  " zip_ref.extractall(dir_path)\n",
75
  "\n",
76
+ "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
77
  ]
78
  },
79
  {
 
141
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
142
  "df = pd.DataFrame()\n",
143
  "index = 0\n",
144
+ "\n",
145
  "for filename in os.listdir(directory):\n",
 
146
  " if os.path.isfile(os.path.join(directory, filename)):\n",
147
  " if filename.find('.json') != -1 :\n",
148
  " index += 1\n",
149
  "\n",
 
150
  " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
151
  "\n",
 
152
  " full_path = os.path.join(directory, filename)\n",
153
  "\n",
154
  " with open(full_path, 'r') as file:\n",
 
157
  " temp = pd.DataFrame(json_data['playlists'])\n",
158
  " expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
159
  "\n",
 
160
  " json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
161
  "\n",
 
162
  " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
 
163
  " result = result[cols]\n",
164
  "\n",
165
  " df = pd.concat([df, result], axis=0, ignore_index=True)\n",
 
217
  "outputs": [],
218
  "source": [
219
  "def create_ids(df, col, name):\n",
 
220
  " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
221
  "\n",
 
222
  " df[f'{name}_id'] = df[col].map(value_to_id)\n",
223
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')\n",
224
  "\n",
 
233
  },
234
  "outputs": [],
235
  "source": [
236
+ "df = create_ids(df, 'artist_name', 'artist')\n",
237
  "df = create_ids(df, 'pid', 'playlist')\n",
238
+ "df = create_ids(df, 'track_name', 'track')\n",
239
+ "df = create_ids(df, 'album_name', 'album')"
240
  ]
241
  },
242
  {
 
263
  "source": [
264
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
265
  "\n",
 
266
  "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
267
  "\n",
 
268
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
269
  "\n",
270
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')\n",
 
279
  },
280
  "outputs": [],
281
  "source": [
 
 
 
 
 
282
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
283
  "\n",
 
284
  "encoder = LabelEncoder()\n",
285
  "encoder.fit(df['track_name'])\n",
286
  "df['track_id'] = encoder.transform(df['track_name'])"
287
  ]
288
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  {
290
  "cell_type": "code",
291
  "execution_count": null,
 
294
  },
295
  "outputs": [],
296
  "source": [
297
+ "df['song_percent'] = df['song_count'] / df['playlist_songs']"
 
 
298
  ]
299
  },
300
  {
 
307
  "source": [
308
  "import numpy as np\n",
309
  "\n",
 
310
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
311
  ]
312
  },
 
386
  "source": [
387
  "class NNColabFiltering(nn.Module):\n",
388
  "\n",
389
+ " def __init__(self, n_playlists, n_artists, embedding_dim_playlists, embedding_dim_items, n_activations, rating_range):\n",
390
  " super().__init__()\n",
391
+ " self.playlist_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_playlists)\n",
392
  " self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)\n",
393
+ " self.fc1 = nn.Linear(embedding_dim_playlists+embedding_dim_items,n_activations)\n",
394
  " self.fc2 = nn.Linear(n_activations,1)\n",
395
  " self.rating_range = rating_range\n",
396
  "\n",
397
  " def forward(self, X):\n",
398
  " # Get embeddings for minibatch\n",
399
+ " embedded_playlists = self.playlist_embeddings(X[:,0])\n",
400
  " embedded_items = self.item_embeddings(X[:,1])\n",
401
+ " # Concatenate playlist and item embeddings\n",
402
+ " embeddings = torch.cat([embedded_playlists,embedded_items],dim=1)\n",
403
  " # Pass embeddings through network\n",
404
  " preds = self.fc1(embeddings)\n",
405
  " preds = F.relu(preds)\n",
 
504
  "source": [
505
  "# Train the model\n",
506
  "dataloaders = {'train':trainloader, 'val':valloader}\n",
507
+ "n_playlists = X.loc[:,'playlist_id'].max()+1\n",
508
  "n_items = X.loc[:,'artist_album_id'].max()+1\n",
509
+ "model = NNColabFiltering(n_playlists,n_items,embedding_dim_playlists=50, embedding_dim_items=50, n_activations = 100,rating_range=[0.,1.])\n",
510
  "criterion = nn.MSELoss()\n",
511
  "lr=0.001\n",
512
  "n_epochs=10\n",
 
635
  "def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):\n",
636
  " model.eval()\n",
637
  "\n",
638
+ " all_album_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)\n",
639
+ " playlist_ids = torch.full((len(all_album_ids),), playlist_id, dtype=torch.long, device=device)\n",
640
  "\n",
641
+ " all_predictions = torch.zeros(len(all_album_ids), device=device)\n",
 
 
 
 
642
  "\n",
 
643
  " with torch.no_grad():\n",
644
+ " for i in range(0, len(all_album_ids), batch_size):\n",
645
+ " batch_playlist_ids = playlist_ids[i:i+batch_size]\n",
646
+ " batch_album_ids = all_album_ids[i:i+batch_size]\n",
647
  "\n",
648
+ " input_tensor = torch.stack([batch_playlist_ids, batch_album_ids], dim=1)\n",
649
  " batch_predictions = model(input_tensor).squeeze()\n",
650
  " all_predictions[i:i+batch_size] = batch_predictions\n",
651
  "\n",
 
652
  " predictions = all_predictions.cpu().numpy()\n",
653
  "\n",
654
  " albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())\n",
655
  "\n",
656
  " unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)\n",
657
  "\n",
 
658
  " top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]\n",
659
  " recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]\n",
660
  "\n",
scripts/build_features.py CHANGED
@@ -1,49 +1,21 @@
 
1
  import os
2
- import urllib.request
3
- import zipfile
4
- import json
5
  import pandas as pd
6
- import time
7
- import torch
8
  import numpy as np
9
  import pandas as pd
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- import torch.optim as optim
13
- from torch.utils.data import DataLoader, TensorDataset
14
- from sklearn.model_selection import train_test_split
15
- import matplotlib.pyplot as plt
16
  from sklearn.preprocessing import LabelEncoder
17
  import shutil
18
  import os
19
- import pyarrow.parquet as pq
20
-
21
-
22
- cols = [
23
- 'name',
24
- 'pid',
25
- 'num_followers',
26
- 'pos',
27
- 'artist_name',
28
- 'track_name',
29
- 'album_name'
30
- ]
31
-
32
-
33
- def copy_file(src, dst):
34
-
35
- dst_dir = os.path.dirname(dst)
36
- if not os.path.exists(dst_dir):
37
- os.makedirs(dst_dir)
38
-
39
- shutil.copy2(src, dst)
40
-
41
- def unzip_archive(filepath, dir_path):
42
- with zipfile.ZipFile(f"{filepath}", 'r') as zip_ref:
43
- zip_ref.extractall(dir_path)
44
-
45
 
46
  def make_dir(directory):
 
 
 
 
 
 
 
 
47
  if os.path.exists(directory):
48
  shutil.rmtree(directory)
49
  os.makedirs(directory)
@@ -51,52 +23,73 @@ def make_dir(directory):
51
  os.makedirs(directory)
52
 
53
 
54
- def make_dataset():
55
- directory = os.getcwd() + '/data/raw/playlists/data'
56
- df = pd.DataFrame()
57
- index = 0
58
- # Loop through all files in the directory
59
- for filename in os.listdir(directory):
60
- # Check if the item is a file (not a subdirectory)
61
- if os.path.isfile(os.path.join(directory, filename)):
62
- if filename.find('.json') != -1 :
63
- index += 1
64
-
65
- # Print the filename or perform operations on the file
66
- print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
 
 
 
 
67
 
68
- # If you need the full file path, you can use:
69
- full_path = os.path.join(directory, filename)
70
-
71
- with open(full_path, 'r') as file:
72
- json_data = json.load(file)
73
-
74
- temp = pd.DataFrame(json_data['playlists'])
75
- expanded_df = temp.explode('tracks').reset_index(drop=True)
76
 
77
- # Normalize the JSON data
78
- json_normalized = pd.json_normalize(expanded_df['tracks'])
79
 
80
- # Concatenate the original DataFrame with the normalized JSON data
81
- result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
 
82
 
83
- result = result[cols]
 
 
84
 
85
- df = pd.concat([df, result], axis=0, ignore_index=True)
86
 
87
- if index % 50 == 0:
88
- df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
89
- del df
90
- df = pd.DataFrame()
91
- if index % 200 == 0:
92
- break
93
-
94
 
95
- if __name__ == '__main__':
96
- unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
97
- directory = os.getcwd() + '/data/raw/data'
98
- make_dir(directory)
99
- directory = os.getcwd() + '/data/processed'
100
- make_dir(directory)
101
- make_dataset()
102
 
 
 
 
1
+ import numpy as np
2
  import os
 
 
 
3
  import pandas as pd
 
 
4
  import numpy as np
5
  import pandas as pd
6
  from sklearn.preprocessing import LabelEncoder
7
  import shutil
8
  import os
9
 
10
  def make_dir(directory):
11
+ '''
12
+ Creates a new blank directory
13
+
14
+ Inputs:
15
+ directory: path to create a new directory at
16
+ Returns:
17
+
18
+ '''
19
  if os.path.exists(directory):
20
  shutil.rmtree(directory)
21
  os.makedirs(directory)
 
23
  os.makedirs(directory)
24
 
25
 
26
+ def read_parquet_folder(folder_path):
27
+ '''
28
+ Creates the pandas dataframe from a folder of parquet files
29
+
30
+ Inputs:
31
+ folder_path: the folder path for the parquet files
32
+ Returns:
33
+ the concatenated dataframe of all parquet files in the folder
34
+ '''
35
+ dataframes = []
36
+ for file in os.listdir(folder_path):
37
+ if file.endswith('.parquet'):
38
+ file_path = os.path.join(folder_path, file)
39
+ df = pd.read_parquet(file_path)
40
+ dataframes.append(df)
41
+
42
+ return pd.concat(dataframes, ignore_index=True)
43
+
44
+
45
+ def create_ids(df, col, name):
46
+ '''
47
+ Creates unique ids for the features and creates mapping documents
48
+
49
+ Inputs:
50
+ df: dataframe with the features
51
+ col: column to create ids on
52
+ name: name of the newly created id
53
+ Returns:
54
+ df: dataframe with the mapped ids
55
+
56
+ '''
57
+ value_to_id = {val: i for i, val in enumerate(df[col].unique())}
58
+
59
+ df[f'{name}_id'] = df[col].map(value_to_id)
60
+ df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
61
+
62
+ return df
63
 
64
+ if __name__ == '__main__':
65
+ folder_path = os.getcwd() + '/data/raw/data'
66
+ df = read_parquet_folder(folder_path)
67
 
68
+ directory = os.getcwd() + '/data/processed'
69
+ make_dir(directory)
70
 
71
+ df = create_ids(df, 'artist_name', 'artist')
72
+ df = create_ids(df, 'pid', 'playlist')
73
+ df = create_ids(df, 'album_name', 'album')
74
 
75
+ df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')
76
+ df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
77
+ df['playlist_songs'] += 1
78
 
79
+ df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
80
+ value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
81
+ df['artist_album_id'] = df['artist_album'].map(value_to_id)
82
+
83
+ df[['artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + '/data/processed/artist_album.csv')
84
 
85
+ df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')
86
 
87
+ encoder = LabelEncoder()
88
+ encoder.fit(df['track_name'])
89
+
90
+ df['track_id'] = encoder.transform(df['track_name'])
91
+ df['song_percent'] = df['song_count'] / df['playlist_songs']
92
+ df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))
 
93
 
94
+ artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
95
+ artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
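As a quick sanity check on the implicit-rating transform above: a playlist with 40 songs holding 4 tracks from one artist/album pair gets song_percent = 4/40 = 0.10, which the logistic step maps to 1/(1+exp(-0.10)) ≈ 0.525, so every rating lands in the upper half of (0, 1). A toy version of the same two lines (made-up numbers, real column names):

import numpy as np
import pandas as pd

# Two artist/album pairs inside one 40-song playlist.
toy = pd.DataFrame({'playlist_id': [0, 0], 'artist_album_id': [1, 2],
                    'song_count': [4, 1], 'playlist_songs': [40, 40]})
toy['song_percent'] = toy['song_count'] / toy['playlist_songs']   # 0.100, 0.025
toy['song_percent'] = 1 / (1 + np.exp(-toy['song_percent']))      # ~0.525, ~0.506
print(toy)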
scripts/make_dataset.py CHANGED
@@ -1,25 +1,63 @@
1
- import numpy as np
2
  import os
3
- import urllib.request
4
  import zipfile
5
  import json
6
  import pandas as pd
7
- import time
8
- import torch
9
- import numpy as np
10
  import pandas as pd
11
- import torch.nn as nn
12
- import torch.nn.functional as F
13
- import torch.optim as optim
14
- from torch.utils.data import DataLoader, TensorDataset
15
- from sklearn.model_selection import train_test_split
16
- import matplotlib.pyplot as plt
17
- from sklearn.preprocessing import LabelEncoder
18
  import shutil
19
  import os
20
- import pyarrow.parquet as pq
21
 
22
  def make_dir(directory):
23
  if os.path.exists(directory):
24
  shutil.rmtree(directory)
25
  os.makedirs(directory)
@@ -27,56 +65,54 @@ def make_dir(directory):
27
  os.makedirs(directory)
28
 
29
 
30
- def read_parquet_folder(folder_path):
31
- dataframes = []
32
- for file in os.listdir(folder_path):
33
- if file.endswith('.parquet'):
34
- file_path = os.path.join(folder_path, file)
35
- df = pd.read_parquet(file_path)
36
- dataframes.append(df)
37
-
38
- return pd.concat(dataframes, ignore_index=True)
39
-
40
-
41
- def create_ids(df, col, name):
42
- # Create a dictionary mapping unique values to IDs
43
- value_to_id = {val: i for i, val in enumerate(df[col].unique())}
44
 
45
- # Create a new column with the IDs
46
- df[f'{name}_id'] = df[col].map(value_to_id)
47
- df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
 
48
 
49
- return df
50
 
51
- if __name__ == '__main__':
52
- folder_path = os.getcwd() + '/data/raw/data'
53
- df = read_parquet_folder(folder_path)
54
 
55
- directory = os.getcwd() + '/data/processed'
56
- make_dir(directory)
57
 
58
- df = create_ids(df, 'artist_name', 'artist')
59
- df = create_ids(df, 'pid', 'playlist')
60
- df = create_ids(df, 'album_name', 'album')
61
 
62
- df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')
63
- df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
64
- df['playlist_songs'] += 1
65
 
66
- df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
67
- value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
68
- df['artist_album_id'] = df['artist_album'].map(value_to_id)
69
-
70
- df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')
71
 
72
- df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')
73
 
74
- encoder = LabelEncoder()
75
- encoder.fit(df['track_name'])
76
-
77
- df['track_id'] = encoder.transform(df['track_name'])
78
- df['song_percent'] = df['song_count'] / df['playlist_songs']
79
- df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))
 
80
 
81
- artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
82
- artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
 
 
1
  import os
 
2
  import zipfile
3
  import json
4
  import pandas as pd
 
 
 
5
  import pandas as pd
6
  import shutil
7
  import os
8
+
9
+
10
+ cols = [
11
+ 'name',
12
+ 'pid',
13
+ 'num_followers',
14
+ 'pos',
15
+ 'artist_name',
16
+ 'track_name',
17
+ 'album_name'
18
+ ]
19
+
20
+
21
+ def copy_file(src, dst):
22
+ '''
23
+ Copies a file from one directory to another, creating the destination directory if needed
24
+
25
+ Inputs:
26
+ src: filepath to use as the source
27
+ dst: filepath to copy the file to
28
+
29
+ Returns:
30
+
31
+ '''
32
+ dst_dir = os.path.dirname(dst)
33
+ if not os.path.exists(dst_dir):
34
+ os.makedirs(dst_dir)
35
+
36
+ shutil.copy2(src, dst)
37
+
38
+ def unzip_archive(filepath, dir_path):
39
+ '''
40
+ Unzips a zipfile to the dir_path
41
+
42
+ Inputs:
43
+ filepath: filepath of the zip file
44
+ dir_path: path to extract the zip file contents to
45
+ Returns:
46
+
47
+ '''
48
+ with zipfile.ZipFile(f"{filepath}", 'r') as zip_ref:
49
+ zip_ref.extractall(dir_path)
50
+
51
 
52
  def make_dir(directory):
53
+ '''
54
+ Creates a new blank directory
55
+
56
+ Inputs:
57
+ directory: path to create a new directory at
58
+ Returns:
59
+
60
+ '''
61
  if os.path.exists(directory):
62
  shutil.rmtree(directory)
63
  os.makedirs(directory)
 
65
  os.makedirs(directory)
66
 
67
 
68
+ def make_dataset():
69
+ '''
70
+ Creates the directory of parquet files to create the
71
+ dataset with, used parquet to reduce memory load
72
+
73
+ Inputs:
74
+
75
+ Returns:
76
+
77
+ '''
78
+ directory = os.getcwd() + '/data/raw/playlists/data'
79
+ df = pd.DataFrame()
80
+ index = 0
 
81
 
82
+ for filename in os.listdir(directory):
83
+ if os.path.isfile(os.path.join(directory, filename)):
84
+ if filename.find('.json') != -1 :
85
+ index += 1
86
 
87
+ print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
88
 
89
+ full_path = os.path.join(directory, filename)
 
 
90
 
91
+ with open(full_path, 'r') as file:
92
+ json_data = json.load(file)
93
 
94
+ temp = pd.DataFrame(json_data['playlists'])
95
+ expanded_df = temp.explode('tracks').reset_index(drop=True)
96
+ json_normalized = pd.json_normalize(expanded_df['tracks'])
97
 
98
+ result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
99
+ result = result[cols]
 
100
 
101
+ df = pd.concat([df, result], axis=0, ignore_index=True)
102
 
103
+ if index % 50 == 0:
104
+ df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
105
+ del df
106
+ df = pd.DataFrame()
107
+ if index % 200 == 0:
108
+ break
109
+
110
 
111
+ if __name__ == '__main__':
112
+ unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
113
+ directory = os.getcwd() + '/data/raw/data'
114
+ make_dir(directory)
115
+ directory = os.getcwd() + '/data/processed'
116
+ make_dir(directory)
117
+ make_dataset()
118
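For reference, the explode/json_normalize flattening used in make_dataset() on a toy payload; the dict below only mimics the shape the code expects from each slice file, it is not real Spotify data:

import pandas as pd

json_data = {'playlists': [{'name': 'road trip', 'pid': 0, 'num_followers': 1,
                            'tracks': [{'pos': 0, 'artist_name': 'A',
                                        'track_name': 'T', 'album_name': 'X'},
                                       {'pos': 1, 'artist_name': 'B',
                                        'track_name': 'U', 'album_name': 'Y'}]}]}
temp = pd.DataFrame(json_data['playlists'])
expanded_df = temp.explode('tracks').reset_index(drop=True)    # one row per track
json_normalized = pd.json_normalize(expanded_df['tracks'])     # track dicts -> columns
result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
print(result[['name', 'pid', 'num_followers', 'pos',
              'artist_name', 'track_name', 'album_name']])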
 
 
 
scripts/model.py CHANGED
@@ -7,9 +7,6 @@ Brinnae Bent
7
  """
8
 
9
  import os
10
- import urllib.request
11
- import zipfile
12
- import json
13
  import pandas as pd
14
  import time
15
  import torch
@@ -18,26 +15,8 @@ import pandas as pd
18
  import torch.nn as nn
19
  import torch.nn.functional as F
20
  import torch.optim as optim
21
- from torch.utils.data import DataLoader, TensorDataset
22
  from sklearn.model_selection import train_test_split
23
- import matplotlib.pyplot as plt
24
- from sklearn.preprocessing import LabelEncoder
25
-
26
-
27
-
28
-
29
- def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
30
- # Convert training and test data to TensorDatasets
31
- trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
32
- torch.from_numpy(np.array(y_train)).float())
33
- valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
34
- torch.from_numpy(np.array(y_val)).float())
35
-
36
- # Create Dataloaders for our training and test data to allow us to iterate over minibatches
37
- trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
38
- valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
39
-
40
- return trainloader, valloader
41
 
42
 
43
  class NNColabFiltering(nn.Module):
@@ -64,9 +43,50 @@ class NNColabFiltering(nn.Module):
64
  preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
65
  return preds
66
 
67
- def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
68
 
69
- model = model.to(device) # Send model to GPU if available
70
  since = time.time()
71
 
72
  costpaths = {'train':[],'val':[]}
@@ -75,47 +95,36 @@ def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5,
75
  print('Epoch {}/{}'.format(epoch, num_epochs - 1))
76
  print('-' * 10)
77
 
78
- # Each epoch has a training and validation phase
79
  for phase in ['train', 'val']:
80
  if phase == 'train':
81
- model.train() # Set model to training mode
82
  else:
83
- model.eval() # Set model to evaluate mode
84
 
85
  running_loss = 0.0
86
 
87
- # Get the inputs and labels, and send to GPU if available
88
  index = 0
89
  for (inputs,labels) in dataloaders[phase]:
90
  inputs = inputs.to(device)
91
  labels = labels.to(device)
92
 
93
- # Zero the weight gradients
94
  optimizer.zero_grad()
95
 
96
- # Forward pass to get outputs and calculate loss
97
- # Track gradient only for training data
98
  with torch.set_grad_enabled(phase == 'train'):
99
  outputs = model.forward(inputs).view(-1)
100
  loss = criterion(outputs, labels)
101
 
102
- # Backpropagation to get the gradients with respect to each weight
103
- # Only if in train
104
  if phase == 'train':
105
  loss.backward()
106
- # Update the weights
107
  optimizer.step()
108
 
109
- # Convert loss into a scalar and add it to running_loss
110
  running_loss += np.sqrt(loss.item()) * labels.size(0)
111
  print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
112
  index +=1
113
 
114
- # Step along learning rate scheduler when in train
115
  if (phase == 'train') and (scheduler is not None):
116
  scheduler.step()
117
 
118
- # Calculate and display average loss and accuracy for the epoch
119
  epoch_loss = running_loss / len(dataloaders[phase].dataset)
120
  costpaths[phase].append(epoch_loss)
121
  print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
@@ -150,7 +159,6 @@ if __name__ == '__main__':
150
 
151
  cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
152
 
153
-
154
  # Save the entire model
155
  torch.save(model, os.getcwd() + '/models/recommender.pt')
156
 
 
7
  """
8
 
9
  import os
 
 
 
10
  import pandas as pd
11
  import time
12
  import torch
 
15
  import torch.nn as nn
16
  import torch.nn.functional as F
17
  import torch.optim as optim
18
+ from torch.utils.data import TensorDataset
19
  from sklearn.model_selection import train_test_split
20
 
21
 
22
  class NNColabFiltering(nn.Module):
 
43
  preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
44
  return preds
45
 
46
+ def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
47
+ '''
48
+ Converts the training and validation data into TensorDatasets and wraps them in DataLoaders
49
+
50
+ Inputs:
51
+ X_train: training data features
52
+ y_train: training data target
53
+ X_val: validation data features
54
+ y_val: validation data targets
55
+ batch_size: the batch size to use
56
+
57
+ Returns:
58
+ trainloader: training dataloader
59
+ valloader: validation dataloader
60
+ '''
61
+ # Convert training and validation data to TensorDatasets
62
+ trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
63
+ torch.from_numpy(np.array(y_train)).float())
64
+ valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
65
+ torch.from_numpy(np.array(y_val)).float())
66
 
67
+ # Create DataLoaders for the training and validation data so we can iterate over minibatches
68
+ trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
69
+ valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
70
+
71
+ return trainloader, valloader
72
+
73
+ def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
74
+ '''
75
+ Trains and validates the model for the given number of epochs
76
+
77
+ Inputs:
78
+ model: the model to train
79
+ criterion: the criterion to use to train
80
+ optimizer: the optimizer to use to train
81
+ dataloaders: the dict of dataloaders to use in training and validation
82
+ device: the torch defined cpu/gpu
83
+ num_epochs: number of epochs to use for training
84
+ scheduler: the learning rate scheduler to step after each training epoch (optional)
85
+
86
+ Returns:
87
+ costpaths: the loss for each epoch for validation and training
88
+ '''
89
+ model = model.to(device)
90
  since = time.time()
91
 
92
  costpaths = {'train':[],'val':[]}
 
95
  print('Epoch {}/{}'.format(epoch, num_epochs - 1))
96
  print('-' * 10)
97
 
 
98
  for phase in ['train', 'val']:
99
  if phase == 'train':
100
+ model.train()
101
  else:
102
+ model.eval()
103
 
104
  running_loss = 0.0
105
 
 
106
  index = 0
107
  for (inputs,labels) in dataloaders[phase]:
108
  inputs = inputs.to(device)
109
  labels = labels.to(device)
110
 
 
111
  optimizer.zero_grad()
112
 
 
 
113
  with torch.set_grad_enabled(phase == 'train'):
114
  outputs = model.forward(inputs).view(-1)
115
  loss = criterion(outputs, labels)
116
 
 
 
117
  if phase == 'train':
118
  loss.backward()
 
119
  optimizer.step()
120
 
 
121
  running_loss += np.sqrt(loss.item()) * labels.size(0)
122
  print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
123
  index +=1
124
 
 
125
  if (phase == 'train') and (scheduler is not None):
126
  scheduler.step()
127
 
 
128
  epoch_loss = running_loss / len(dataloaders[phase].dataset)
129
  costpaths[phase].append(epoch_loss)
130
  print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
 
159
 
160
  cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
161
 
 
162
  # Save the entire model
163
  torch.save(model, os.getcwd() + '/models/recommender.pt')
164
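A minimal sketch of how prep_dataloaders and train_model fit together, using synthetic (playlist_id, artist_album_id) pairs and a throwaway stand-in module so the snippet runs on its own; it is not the NNColabFiltering setup from the script's __main__ block:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

X = np.random.randint(0, 50, size=(200, 2))   # fake (playlist_id, artist_album_id) pairs
y = np.random.rand(200).astype(np.float32)    # fake song_percent targets in (0, 1)
trainloader, valloader = prep_dataloaders(X[:160], y[:160], X[160:], y[160:], batch_size=32)
dataloaders = {'train': trainloader, 'val': valloader}

class StandIn(nn.Module):
    """Dot product of two small embeddings; a placeholder, not the repo's model."""
    def __init__(self, n_ids=50, dim=8):
        super().__init__()
        self.emb_playlist = nn.Embedding(n_ids, dim)
        self.emb_album = nn.Embedding(n_ids, dim)
    def forward(self, x):
        return (self.emb_playlist(x[:, 0]) * self.emb_album(x[:, 1])).sum(dim=1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StandIn()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
cost_paths = train_model(model, criterion, optimizer, dataloaders, device, num_epochs=2)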