keesephillips committed on
Commit
6ce6b56
·
verified ·
1 Parent(s): 3284fa6

Added Naive model and comments

Browse files
main.py CHANGED
@@ -13,25 +13,15 @@ import os
13
  import numpy as np
14
  import pandas as pd
15
  import pandas as pd
16
- import json
17
- import matplotlib.pyplot as plt
18
-
19
  import os
20
- import urllib.request
21
- import zipfile
22
  import json
23
  import pandas as pd
24
- import time
25
  import torch
26
  import numpy as np
27
  import pandas as pd
28
  import torch.nn as nn
29
  import torch.nn.functional as F
30
- import torch.optim as optim
31
- from torch.utils.data import DataLoader, TensorDataset
32
- from sklearn.model_selection import train_test_split
33
  import matplotlib.pyplot as plt
34
- from sklearn.preprocessing import LabelEncoder
35
 
36
  class NNColabFiltering(nn.Module):
37
 
@@ -58,16 +48,29 @@ class NNColabFiltering(nn.Module):
58
  return preds
59
 
60
  def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):
61
- model.eval()
 
62
63
 
64
  all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)
65
  user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)
66
 
67
- # Initialize tensor to store all predictions
68
  all_predictions = torch.zeros(len(all_movie_ids), device=device)
69
 
70
- # Generate predictions in batches
71
  with torch.no_grad():
72
  for i in range(0, len(all_movie_ids), batch_size):
73
  batch_user_ids = user_ids[i:i+batch_size]
@@ -77,14 +80,10 @@ def generate_recommendations(artist_album, playlists, model, playlist_id, device
77
  batch_predictions = model(input_tensor).squeeze()
78
  all_predictions[i:i+batch_size] = batch_predictions
79
 
80
- # Convert to numpy for easier handling
81
  predictions = all_predictions.cpu().numpy()
82
-
83
  albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())
84
-
85
  unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)
86
 
87
- # Get top N recommendations
88
  top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]
89
  recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]
90
 
@@ -126,7 +125,6 @@ if __name__ == '__main__':
126
  trumpet = Image.open('assets/trumpet.png')
127
  img2.image(trumpet, use_column_width=True)
128
 
129
- # Using "with" notation
130
  with st.sidebar:
131
  playlist_name = st.selectbox(
132
  "Playlist Selection",
 
13
  import numpy as np
14
  import pandas as pd
15
  import pandas as pd
 
 
 
16
  import os
 
 
17
  import json
18
  import pandas as pd
 
19
  import torch
20
  import numpy as np
21
  import pandas as pd
22
  import torch.nn as nn
23
  import torch.nn.functional as F
 
 
 
24
  import matplotlib.pyplot as plt
 
25
 
26
  class NNColabFiltering(nn.Module):
27
 
 
48
  return preds
49
 
50
  def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):
51
+ '''
52
+ Generates the top-N album recommendations for a given playlist
53
 
54
+ Inputs:
55
+ artist_album: the dataframe containing the artist and album mappings
56
+ playlists: the dataframe containing the playlists' contents
57
+ model: the trained model
58
+ playlist_id: the playlist id to generate recommendations for
59
+ device: the gpu or cpu device defined by torch
60
+ top_n: the number of recommendations to generate
61
+ batch_size: the batch size to use
62
+
63
+ Returns:
64
+ album: the recommended albums
65
+ playlists: the recommended artists
66
+ '''
67
+ model.eval()
68
 
69
  all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)
70
  user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)
71
 
 
72
  all_predictions = torch.zeros(len(all_movie_ids), device=device)
73
 
 
74
  with torch.no_grad():
75
  for i in range(0, len(all_movie_ids), batch_size):
76
  batch_user_ids = user_ids[i:i+batch_size]
 
80
  batch_predictions = model(input_tensor).squeeze()
81
  all_predictions[i:i+batch_size] = batch_predictions
82
 
 
83
  predictions = all_predictions.cpu().numpy()
 
84
  albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())
 
85
  unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)
86
 
 
87
  top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]
88
  recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]
89
 
 
125
  trumpet = Image.open('assets/trumpet.png')
126
  img2.image(trumpet, use_column_width=True)
127
 
 
128
  with st.sidebar:
129
  playlist_name = st.selectbox(
130
  "Playlist Selection",
notebooks/dbscan.ipynb CHANGED
@@ -1,22 +1,12 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": [],
7
- "machine_shape": "hm"
8
- },
9
- "kernelspec": {
10
- "name": "python3",
11
- "display_name": "Python 3"
12
- },
13
- "language_info": {
14
- "name": "python"
15
- }
16
- },
17
  "cells": [
18
  {
19
  "cell_type": "code",
 
 
 
 
 
20
  "source": [
21
  "import os\n",
22
  "import urllib.request\n",
@@ -34,74 +24,63 @@
34
  "from sklearn.model_selection import train_test_split\n",
35
  "import matplotlib.pyplot as plt\n",
36
  "from sklearn.preprocessing import LabelEncoder"
37
- ],
38
- "metadata": {
39
- "id": "KHnddFeW5hwh"
40
- },
41
- "execution_count": null,
42
- "outputs": []
43
  },
44
  {
45
  "cell_type": "code",
46
- "source": [
47
- "from google.colab import drive\n",
48
- "drive.mount('/content/drive')"
49
- ],
50
  "metadata": {
51
  "id": "l7pGG_d85lzH"
52
  },
53
- "execution_count": null,
54
- "outputs": []
 
 
 
55
  },
56
  {
57
  "cell_type": "code",
 
 
 
 
 
58
  "source": [
59
- "# prompt: copy a file from another directory to current directory in python code and create folders if needed\n",
60
- "\n",
61
  "import shutil\n",
62
  "import os\n",
63
  "\n",
64
  "def copy_file(src, dst):\n",
65
- " \"\"\"\n",
66
- " Copies a file from src to dst, creating any necessary directories.\n",
67
- "\n",
68
- " Args:\n",
69
- " src: The path to the source file.\n",
70
- " dst: The path to the destination file.\n",
71
- " \"\"\"\n",
72
- " # Create the destination directory if it doesn't exist.\n",
73
  " dst_dir = os.path.dirname(dst)\n",
74
  " if not os.path.exists(dst_dir):\n",
75
  " os.makedirs(dst_dir)\n",
76
  "\n",
77
- " # Copy the file.\n",
78
  " shutil.copy2(src, dst)\n",
79
  "\n",
80
  "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
81
- ],
82
- "metadata": {
83
- "id": "dL8TIlH55qSc"
84
- },
85
- "execution_count": 3,
86
- "outputs": []
87
  },
88
  {
89
  "cell_type": "code",
 
 
 
 
 
90
  "source": [
91
  "def unzip_archive(filepath, dir_path):\n",
92
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
93
  " zip_ref.extractall(dir_path)\n",
94
  "\n",
95
  "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
96
- ],
97
- "metadata": {
98
- "id": "LLy-YA775snY"
99
- },
100
- "execution_count": null,
101
- "outputs": []
102
  },
103
  {
104
  "cell_type": "code",
 
 
 
 
 
105
  "source": [
106
  "import shutil\n",
107
  "\n",
@@ -111,29 +90,27 @@
111
  " os.makedirs(directory)\n",
112
  " else:\n",
113
  " os.makedirs(directory)"
114
- ],
115
- "metadata": {
116
- "id": "YtO0seclE1Pb"
117
- },
118
- "execution_count": null,
119
- "outputs": []
120
  },
121
  {
122
  "cell_type": "code",
123
- "source": [
124
- "\n",
125
- "\n",
126
- "directory = os.getcwd() + '/data/raw/data'\n",
127
- "make_dir(directory)"
128
- ],
129
  "metadata": {
130
  "id": "UeqDk3_65vTt"
131
  },
132
- "execution_count": null,
133
- "outputs": []
 
 
 
134
  },
135
  {
136
  "cell_type": "code",
 
 
 
 
 
137
  "source": [
138
  "cols = [\n",
139
  " 'name',\n",
@@ -144,15 +121,27 @@
144
  " 'track_name',\n",
145
  " 'album_name'\n",
146
  "]"
147
- ],
148
- "metadata": {
149
- "id": "zMTup29b5wtO"
150
- },
151
- "execution_count": null,
152
- "outputs": []
153
  },
154
  {
155
  "cell_type": "code",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "source": [
157
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
158
  "df = pd.DataFrame()\n",
@@ -192,27 +181,15 @@
192
  " df = pd.DataFrame()\n",
193
  " if index % 100 == 0:\n",
194
  " break"
195
- ],
196
- "metadata": {
197
- "colab": {
198
- "base_uri": "https://localhost:8080/"
199
- },
200
- "id": "h6jQO9HT5zsG",
201
- "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
202
- },
203
- "execution_count": null,
204
- "outputs": [
205
- {
206
- "output_type": "stream",
207
- "name": "stdout",
208
- "text": [
209
- "mpd.slice.727000-727999.json\t100/1000\t10.0%"
210
- ]
211
- }
212
  ]
213
  },
214
  {
215
  "cell_type": "code",
 
 
 
 
 
216
  "source": [
217
  "import pyarrow.parquet as pq\n",
218
  "\n",
@@ -228,27 +205,27 @@
228
  "\n",
229
  "folder_path = os.getcwd() + '/data/raw/data'\n",
230
  "df = read_parquet_folder(folder_path)"
231
- ],
232
- "metadata": {
233
- "id": "PngL0QHq516u"
234
- },
235
- "execution_count": null,
236
- "outputs": []
237
  },
238
  {
239
  "cell_type": "code",
240
- "source": [
241
- "directory = os.getcwd() + '/data/raw/mappings'\n",
242
- "make_dir(directory)"
243
- ],
244
  "metadata": {
245
  "id": "hdLpjr2153b_"
246
  },
247
- "execution_count": null,
248
- "outputs": []
 
 
 
249
  },
250
  {
251
  "cell_type": "code",
 
 
 
 
 
252
  "source": [
253
  "def create_ids(df, col, name):\n",
254
  " # Create a dictionary mapping unique values to IDs\n",
@@ -259,43 +236,42 @@
259
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n",
260
  "\n",
261
  " return df"
262
- ],
263
- "metadata": {
264
- "id": "peZyue6t57Mz"
265
- },
266
- "execution_count": null,
267
- "outputs": []
268
  },
269
  {
270
  "cell_type": "code",
 
 
 
 
 
271
  "source": [
272
  "df = create_ids(df, 'artist_name', 'artist')\n",
273
  "df = create_ids(df, 'pid', 'playlist')\n",
274
- "# df = create_ids(df, 'track_name', 'track')\n",
275
  "df = create_ids(df, 'album_name', 'album')"
276
- ],
277
- "metadata": {
278
- "id": "p68WNyaf58rS"
279
- },
280
- "execution_count": null,
281
- "outputs": []
282
  },
283
  {
284
  "cell_type": "code",
 
 
 
 
 
285
  "source": [
286
  "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
287
  "\n",
288
  "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
289
  "df['playlist_songs'] += 1"
290
- ],
291
- "metadata": {
292
- "id": "aSBKxRFa5-O_"
293
- },
294
- "execution_count": null,
295
- "outputs": []
296
  },
297
  {
298
  "cell_type": "code",
 
 
 
 
 
299
  "source": [
300
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
301
  "\n",
@@ -306,67 +282,50 @@
306
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
307
  "\n",
308
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n"
309
- ],
310
- "metadata": {
311
- "id": "4WqHH-pn5_nL"
312
- },
313
- "execution_count": null,
314
- "outputs": []
315
  },
316
  {
317
  "cell_type": "code",
 
 
 
 
 
318
  "source": [
319
- "# df = df.groupby(['playlist_id','artist_album','artist_album_id','playlist_songs']).agg({\n",
320
- "# 'song_count': 'sum',\n",
321
- "# 'track_name': '|'.join,\n",
322
- "# 'track_name': '|'.join,\n",
323
- "# }).reset_index()\n",
324
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
325
  "\n",
326
- "# Encode the genres data\n",
327
  "encoder = LabelEncoder()\n",
328
  "encoder.fit(df['track_name'])\n",
329
  "df['track_id'] = encoder.transform(df['track_name'])"
330
- ],
331
- "metadata": {
332
- "id": "V1bhU5rW6BSY"
333
- },
334
- "execution_count": null,
335
- "outputs": []
336
  },
337
  {
338
  "cell_type": "code",
339
- "source": [
340
- "# df['artist_percent'] = df['artist_count'] / df['playlist_songs']\n",
341
- "df['song_percent'] = df['song_count'] / df['playlist_songs']\n",
342
- "# df['album_percent'] = df['album_count'] / df['playlist_songs']"
343
- ],
344
  "metadata": {
345
  "id": "l6sUWKYC6DCw"
346
  },
347
- "execution_count": null,
348
- "outputs": []
 
 
349
  },
350
  {
351
  "cell_type": "code",
 
 
 
 
 
352
  "source": [
353
  "import numpy as np\n",
354
  "\n",
355
- "# Assuming you have a DataFrame 'df' with a column 'column_name'\n",
356
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
357
- ],
358
- "metadata": {
359
- "id": "XxC0WnlL6EWz"
360
- },
361
- "execution_count": null,
362
- "outputs": []
363
  },
364
  {
365
  "cell_type": "code",
366
- "source": [
367
- "artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n",
368
- "artists.head()"
369
- ],
370
  "metadata": {
371
  "colab": {
372
  "base_uri": "https://localhost:8080/",
@@ -375,19 +334,13 @@
375
  "id": "kbxBcQiX6F2v",
376
  "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9"
377
  },
378
- "execution_count": null,
379
  "outputs": [
380
  {
381
- "output_type": "execute_result",
382
  "data": {
383
- "text/plain": [
384
- " playlist_id artist_id album_id\n",
385
- "0 0 0 0\n",
386
- "1 0 1 1\n",
387
- "2 0 2 2\n",
388
- "3 0 3 3\n",
389
- "4 0 4 4"
390
- ],
391
  "text/html": [
392
  "\n",
393
  " <div id=\"df-cedfd0c3-1f93-4a45-b95c-5d58bbf23f45\" class=\"colab-df-container\">\n",
@@ -658,30 +611,39 @@
658
  " </div>\n",
659
  " </div>\n"
660
  ],
661
- "application/vnd.google.colaboratory.intrinsic+json": {
662
- "type": "dataframe",
663
- "variable_name": "artists"
664
- }
 
 
 
 
665
  },
 
666
  "metadata": {},
667
- "execution_count": 18
668
  }
 
 
 
 
669
  ]
670
  },
671
  {
672
  "cell_type": "code",
 
 
 
 
 
673
  "source": [
674
  "X = artists.loc[:,['artist_id','album_id',]]\n",
675
  "y = artists.loc[:,'playlist_id',]\n",
676
  "\n",
677
  "# Split our data into training and test sets\n",
678
  "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
679
- ],
680
- "metadata": {
681
- "id": "5HLSc9z36Izn"
682
- },
683
- "execution_count": null,
684
- "outputs": []
685
  },
686
  {
687
  "cell_type": "code",
@@ -698,17 +660,7 @@
698
  },
699
  {
700
  "cell_type": "code",
701
- "source": [
702
- "from sklearn.metrics import precision_score, recall_score\n",
703
- "y_no_noise = y[labels_db != -1]\n",
704
- "labels_db_no_noise = labels_db[labels_db != -1]\n",
705
- "\n",
706
- "precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
707
- "recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
708
- "\n",
709
- "print(f'Precision: {precision}')\n",
710
- "print(f'Recall: {recall}')"
711
- ],
712
  "metadata": {
713
  "colab": {
714
  "base_uri": "https://localhost:8080/"
@@ -716,33 +668,58 @@
716
  "id": "Osq-NpGu9V2k",
717
  "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
718
  },
719
- "execution_count": 27,
720
  "outputs": [
721
  {
722
- "output_type": "stream",
723
  "name": "stderr",
 
724
  "text": [
725
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
726
  " _warn_prf(average, modifier, msg_start, len(result))\n"
727
  ]
728
  },
729
  {
730
- "output_type": "stream",
731
  "name": "stdout",
 
732
  "text": [
733
  "Precision: 1.589262536579764e-05\n",
734
  "Recall: 9.606273770069471e-06\n"
735
  ]
736
  },
737
  {
738
- "output_type": "stream",
739
  "name": "stderr",
 
740
  "text": [
741
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
742
  " _warn_prf(average, modifier, msg_start, len(result))\n"
743
  ]
744
  }
 
 
 
745
  ]
746
  }
747
- ]
748
- }
 
 
1
  {
 
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "id": "KHnddFeW5hwh"
8
+ },
9
+ "outputs": [],
10
  "source": [
11
  "import os\n",
12
  "import urllib.request\n",
 
24
  "from sklearn.model_selection import train_test_split\n",
25
  "import matplotlib.pyplot as plt\n",
26
  "from sklearn.preprocessing import LabelEncoder"
27
+ ]
 
 
 
 
 
28
  },
29
  {
30
  "cell_type": "code",
31
+ "execution_count": null,
 
 
 
32
  "metadata": {
33
  "id": "l7pGG_d85lzH"
34
  },
35
+ "outputs": [],
36
+ "source": [
37
+ "from google.colab import drive\n",
38
+ "drive.mount('/content/drive')"
39
+ ]
40
  },
41
  {
42
  "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {
45
+ "id": "dL8TIlH55qSc"
46
+ },
47
+ "outputs": [],
48
  "source": [
 
 
49
  "import shutil\n",
50
  "import os\n",
51
  "\n",
52
  "def copy_file(src, dst):\n",
 
 
 
 
 
 
 
 
53
  " dst_dir = os.path.dirname(dst)\n",
54
  " if not os.path.exists(dst_dir):\n",
55
  " os.makedirs(dst_dir)\n",
56
  "\n",
 
57
  " shutil.copy2(src, dst)\n",
58
  "\n",
59
  "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
60
+ ]
 
 
 
 
 
61
  },
62
  {
63
  "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {
66
+ "id": "LLy-YA775snY"
67
+ },
68
+ "outputs": [],
69
  "source": [
70
  "def unzip_archive(filepath, dir_path):\n",
71
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
72
  " zip_ref.extractall(dir_path)\n",
73
  "\n",
74
  "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
75
+ ]
 
 
 
 
 
76
  },
77
  {
78
  "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {
81
+ "id": "YtO0seclE1Pb"
82
+ },
83
+ "outputs": [],
84
  "source": [
85
  "import shutil\n",
86
  "\n",
 
90
  " os.makedirs(directory)\n",
91
  " else:\n",
92
  " os.makedirs(directory)"
93
+ ]
 
 
 
 
 
94
  },
95
  {
96
  "cell_type": "code",
97
+ "execution_count": null,
 
 
 
 
 
98
  "metadata": {
99
  "id": "UeqDk3_65vTt"
100
  },
101
+ "outputs": [],
102
+ "source": [
103
+ "directory = os.getcwd() + '/data/raw/data'\n",
104
+ "make_dir(directory)"
105
+ ]
106
  },
107
  {
108
  "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {
111
+ "id": "zMTup29b5wtO"
112
+ },
113
+ "outputs": [],
114
  "source": [
115
  "cols = [\n",
116
  " 'name',\n",
 
121
  " 'track_name',\n",
122
  " 'album_name'\n",
123
  "]"
124
+ ]
 
 
 
 
 
125
  },
126
  {
127
  "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "colab": {
131
+ "base_uri": "https://localhost:8080/"
132
+ },
133
+ "id": "h6jQO9HT5zsG",
134
+ "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
135
+ },
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "mpd.slice.727000-727999.json\t100/1000\t10.0%"
142
+ ]
143
+ }
144
+ ],
145
  "source": [
146
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
147
  "df = pd.DataFrame()\n",
 
181
  " df = pd.DataFrame()\n",
182
  " if index % 100 == 0:\n",
183
  " break"
 
 
184
  ]
185
  },
186
  {
187
  "cell_type": "code",
188
+ "execution_count": null,
189
+ "metadata": {
190
+ "id": "PngL0QHq516u"
191
+ },
192
+ "outputs": [],
193
  "source": [
194
  "import pyarrow.parquet as pq\n",
195
  "\n",
 
205
  "\n",
206
  "folder_path = os.getcwd() + '/data/raw/data'\n",
207
  "df = read_parquet_folder(folder_path)"
208
+ ]
 
 
 
 
 
209
  },
210
  {
211
  "cell_type": "code",
212
+ "execution_count": null,
 
 
 
213
  "metadata": {
214
  "id": "hdLpjr2153b_"
215
  },
216
+ "outputs": [],
217
+ "source": [
218
+ "directory = os.getcwd() + '/data/raw/mappings'\n",
219
+ "make_dir(directory)"
220
+ ]
221
  },
222
  {
223
  "cell_type": "code",
224
+ "execution_count": null,
225
+ "metadata": {
226
+ "id": "peZyue6t57Mz"
227
+ },
228
+ "outputs": [],
229
  "source": [
230
  "def create_ids(df, col, name):\n",
231
  " # Create a dictionary mapping unique values to IDs\n",
 
236
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n",
237
  "\n",
238
  " return df"
239
+ ]
 
 
 
 
 
240
  },
241
  {
242
  "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {
245
+ "id": "p68WNyaf58rS"
246
+ },
247
+ "outputs": [],
248
  "source": [
249
  "df = create_ids(df, 'artist_name', 'artist')\n",
250
  "df = create_ids(df, 'pid', 'playlist')\n",
 
251
  "df = create_ids(df, 'album_name', 'album')"
252
+ ]
 
 
 
 
 
253
  },
254
  {
255
  "cell_type": "code",
256
+ "execution_count": null,
257
+ "metadata": {
258
+ "id": "aSBKxRFa5-O_"
259
+ },
260
+ "outputs": [],
261
  "source": [
262
  "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
263
  "\n",
264
  "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
265
  "df['playlist_songs'] += 1"
266
+ ]
 
 
 
 
 
267
  },
268
  {
269
  "cell_type": "code",
270
+ "execution_count": null,
271
+ "metadata": {
272
+ "id": "4WqHH-pn5_nL"
273
+ },
274
+ "outputs": [],
275
  "source": [
276
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
277
  "\n",
 
282
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
283
  "\n",
284
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n"
285
+ ]
 
 
 
 
 
286
  },
287
  {
288
  "cell_type": "code",
289
+ "execution_count": null,
290
+ "metadata": {
291
+ "id": "V1bhU5rW6BSY"
292
+ },
293
+ "outputs": [],
294
  "source": [
 
 
 
 
 
295
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
296
  "\n",
 
297
  "encoder = LabelEncoder()\n",
298
  "encoder.fit(df['track_name'])\n",
299
  "df['track_id'] = encoder.transform(df['track_name'])"
300
+ ]
 
 
 
 
 
301
  },
302
  {
303
  "cell_type": "code",
304
+ "execution_count": null,
 
 
 
 
305
  "metadata": {
306
  "id": "l6sUWKYC6DCw"
307
  },
308
+ "outputs": [],
309
+ "source": [
310
+ "df['song_percent'] = df['song_count'] / df['playlist_songs']"
311
+ ]
312
  },
313
  {
314
  "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {
317
+ "id": "XxC0WnlL6EWz"
318
+ },
319
+ "outputs": [],
320
  "source": [
321
  "import numpy as np\n",
322
  "\n",
 
323
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
324
+ ]
 
 
 
 
 
325
  },
326
  {
327
  "cell_type": "code",
328
+ "execution_count": null,
 
 
 
329
  "metadata": {
330
  "colab": {
331
  "base_uri": "https://localhost:8080/",
 
334
  "id": "kbxBcQiX6F2v",
335
  "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9"
336
  },
 
337
  "outputs": [
338
  {
 
339
  "data": {
340
+ "application/vnd.google.colaboratory.intrinsic+json": {
341
+ "type": "dataframe",
342
+ "variable_name": "artists"
343
+ },
 
 
 
 
344
  "text/html": [
345
  "\n",
346
  " <div id=\"df-cedfd0c3-1f93-4a45-b95c-5d58bbf23f45\" class=\"colab-df-container\">\n",
 
611
  " </div>\n",
612
  " </div>\n"
613
  ],
614
+ "text/plain": [
615
+ " playlist_id artist_id album_id\n",
616
+ "0 0 0 0\n",
617
+ "1 0 1 1\n",
618
+ "2 0 2 2\n",
619
+ "3 0 3 3\n",
620
+ "4 0 4 4"
621
+ ]
622
  },
623
+ "execution_count": 18,
624
  "metadata": {},
625
+ "output_type": "execute_result"
626
  }
627
+ ],
628
+ "source": [
629
+ "artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n",
630
+ "artists.head()"
631
  ]
632
  },
633
  {
634
  "cell_type": "code",
635
+ "execution_count": null,
636
+ "metadata": {
637
+ "id": "5HLSc9z36Izn"
638
+ },
639
+ "outputs": [],
640
  "source": [
641
  "X = artists.loc[:,['artist_id','album_id',]]\n",
642
  "y = artists.loc[:,'playlist_id',]\n",
643
  "\n",
644
  "# Split our data into training and test sets\n",
645
  "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
646
+ ]
 
 
 
 
 
647
  },
648
  {
649
  "cell_type": "code",
 
660
  },
661
  {
662
  "cell_type": "code",
663
+ "execution_count": 27,
 
 
664
  "metadata": {
665
  "colab": {
666
  "base_uri": "https://localhost:8080/"
 
668
  "id": "Osq-NpGu9V2k",
669
  "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
670
  },
 
671
  "outputs": [
672
  {
 
673
  "name": "stderr",
674
+ "output_type": "stream",
675
  "text": [
676
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
677
  " _warn_prf(average, modifier, msg_start, len(result))\n"
678
  ]
679
  },
680
  {
 
681
  "name": "stdout",
682
+ "output_type": "stream",
683
  "text": [
684
  "Precision: 1.589262536579764e-05\n",
685
  "Recall: 9.606273770069471e-06\n"
686
  ]
687
  },
688
  {
 
689
  "name": "stderr",
690
+ "output_type": "stream",
691
  "text": [
692
  "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
693
  " _warn_prf(average, modifier, msg_start, len(result))\n"
694
  ]
695
  }
696
+ ],
697
+ "source": [
698
+ "from sklearn.metrics import precision_score, recall_score\n",
699
+ "y_no_noise = y[labels_db != -1]\n",
700
+ "labels_db_no_noise = labels_db[labels_db != -1]\n",
701
+ "\n",
702
+ "precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
703
+ "recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted')\n",
704
+ "\n",
705
+ "print(f'Precision: {precision}')\n",
706
+ "print(f'Recall: {recall}')"
707
  ]
708
  }
709
+ ],
710
+ "metadata": {
711
+ "colab": {
712
+ "machine_shape": "hm",
713
+ "provenance": []
714
+ },
715
+ "kernelspec": {
716
+ "display_name": "Python 3",
717
+ "name": "python3"
718
+ },
719
+ "language_info": {
720
+ "name": "python"
721
+ }
722
+ },
723
+ "nbformat": 4,
724
+ "nbformat_minor": 0
725
+ }
notebooks/naive.ipynb ADDED
@@ -0,0 +1,416 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "id": "KHnddFeW5hwh"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "import os\n",
12
+ "import urllib.request\n",
13
+ "import zipfile\n",
14
+ "import json\n",
15
+ "import pandas as pd\n",
16
+ "import time\n",
17
+ "import torch\n",
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import torch.nn as nn\n",
21
+ "import torch.nn.functional as F\n",
22
+ "import torch.optim as optim\n",
23
+ "from torch.utils.data import DataLoader, TensorDataset\n",
24
+ "from sklearn.model_selection import train_test_split\n",
25
+ "import matplotlib.pyplot as plt\n",
26
+ "from sklearn.preprocessing import LabelEncoder"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {
33
+ "id": "l7pGG_d85lzH"
34
+ },
35
+ "outputs": [],
36
+ "source": [
37
+ "from google.colab import drive\n",
38
+ "drive.mount('/content/drive')"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {
45
+ "id": "dL8TIlH55qSc"
46
+ },
47
+ "outputs": [],
48
+ "source": [
49
+ "import shutil\n",
50
+ "import os\n",
51
+ "\n",
52
+ "def copy_file(src, dst):\n",
53
+ " dst_dir = os.path.dirname(dst)\n",
54
+ " if not os.path.exists(dst_dir):\n",
55
+ " os.makedirs(dst_dir)\n",
56
+ "\n",
57
+ " shutil.copy2(src, dst)\n",
58
+ "\n",
59
+ "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {
66
+ "id": "LLy-YA775snY"
67
+ },
68
+ "outputs": [],
69
+ "source": [
70
+ "def unzip_archive(filepath, dir_path):\n",
71
+ " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
72
+ " zip_ref.extractall(dir_path)\n",
73
+ "\n",
74
+ "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {
81
+ "id": "YtO0seclE1Pb"
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "import shutil\n",
86
+ "\n",
87
+ "def make_dir(directory):\n",
88
+ " if os.path.exists(directory):\n",
89
+ " shutil.rmtree(directory)\n",
90
+ " os.makedirs(directory)\n",
91
+ " else:\n",
92
+ " os.makedirs(directory)"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": null,
98
+ "metadata": {
99
+ "id": "UeqDk3_65vTt"
100
+ },
101
+ "outputs": [],
102
+ "source": [
103
+ "directory = os.getcwd() + '/data/raw/data'\n",
104
+ "make_dir(directory)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {
111
+ "id": "zMTup29b5wtO"
112
+ },
113
+ "outputs": [],
114
+ "source": [
115
+ "cols = [\n",
116
+ " 'name',\n",
117
+ " 'pid',\n",
118
+ " 'num_followers',\n",
119
+ " 'pos',\n",
120
+ " 'artist_name',\n",
121
+ " 'track_name',\n",
122
+ " 'album_name'\n",
123
+ "]"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "colab": {
131
+ "base_uri": "https://localhost:8080/"
132
+ },
133
+ "id": "h6jQO9HT5zsG",
134
+ "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
135
+ },
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "mpd.slice.727000-727999.json\t100/1000\t10.0%"
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "directory = os.getcwd() + '/data/raw/playlists/data'\n",
147
+ "df = pd.DataFrame()\n",
148
+ "index = 0\n",
149
+ "\n",
150
+ "for filename in os.listdir(directory):\n",
151
+ " if os.path.isfile(os.path.join(directory, filename)):\n",
152
+ " if filename.find('.json') != -1 :\n",
153
+ " index += 1\n",
154
+ "\n",
155
+ " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
156
+ "\n",
157
+ " full_path = os.path.join(directory, filename)\n",
158
+ "\n",
159
+ " with open(full_path, 'r') as file:\n",
160
+ " json_data = json.load(file)\n",
161
+ "\n",
162
+ " temp = pd.DataFrame(json_data['playlists'])\n",
163
+ " expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
164
+ "\n",
165
+ " json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
166
+ " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
167
+ " result = result[cols]\n",
168
+ "\n",
169
+ " df = pd.concat([df, result], axis=0, ignore_index=True)\n",
170
+ "\n",
171
+ " if index % 50 == 0:\n",
172
+ " df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n",
173
+ " del df\n",
174
+ " df = pd.DataFrame()\n",
175
+ " if index % 100 == 0:\n",
176
+ " break"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 3,
182
+ "metadata": {
183
+ "id": "PngL0QHq516u"
184
+ },
185
+ "outputs": [],
186
+ "source": [
187
+ "import pyarrow.parquet as pq\n",
188
+ "\n",
189
+ "def read_parquet_folder(folder_path):\n",
190
+ " dataframes = []\n",
191
+ " for file in os.listdir(folder_path):\n",
192
+ " if file.endswith('.parquet'):\n",
193
+ " file_path = os.path.join(folder_path, file)\n",
194
+ " df = pd.read_parquet(file_path)\n",
195
+ " dataframes.append(df)\n",
196
+ "\n",
197
+ " return pd.concat(dataframes, ignore_index=True)\n",
198
+ "\n",
199
+ "folder_path = os.getcwd() + '/../data/raw/data'\n",
200
+ "df = read_parquet_folder(folder_path)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 4,
206
+ "metadata": {
207
+ "id": "peZyue6t57Mz"
208
+ },
209
+ "outputs": [],
210
+ "source": [
211
+ "def create_ids(df, col, name):\n",
212
+ " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
213
+ " df[f'{name}_id'] = df[col].map(value_to_id)\n",
214
+ "\n",
215
+ " return df"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 5,
221
+ "metadata": {
222
+ "id": "p68WNyaf58rS"
223
+ },
224
+ "outputs": [],
225
+ "source": [
226
+ "df = create_ids(df, 'artist_name', 'artist')\n",
227
+ "df = create_ids(df, 'pid', 'playlist')\n",
228
+ "df = create_ids(df, 'album_name', 'album')"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 6,
234
+ "metadata": {
235
+ "id": "aSBKxRFa5-O_"
236
+ },
237
+ "outputs": [],
238
+ "source": [
239
+ "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
240
+ "\n",
241
+ "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
242
+ "df['playlist_songs'] += 1"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 7,
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
252
+ "\n",
253
+ "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
254
+ "df['artist_album_id'] = df['artist_album'].map(value_to_id)"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 8,
260
+ "metadata": {
261
+ "id": "V1bhU5rW6BSY"
262
+ },
263
+ "outputs": [],
264
+ "source": [
265
+ "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
266
+ "\n",
267
+ "encoder = LabelEncoder()\n",
268
+ "encoder.fit(df['track_name'])\n",
269
+ "df['track_id'] = encoder.transform(df['track_name'])"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 9,
275
+ "metadata": {
276
+ "id": "l6sUWKYC6DCw"
277
+ },
278
+ "outputs": [],
279
+ "source": [
280
+ "df['song_percent'] = df['song_count'] / df['playlist_songs']"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": 10,
286
+ "metadata": {
287
+ "id": "XxC0WnlL6EWz"
288
+ },
289
+ "outputs": [],
290
+ "source": [
291
+ "import numpy as np\n",
292
+ "\n",
293
+ "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": 16,
299
+ "metadata": {
300
+ "id": "5HLSc9z36Izn"
301
+ },
302
+ "outputs": [],
303
+ "source": [
304
+ "X = df.loc[:,['artist_id','album_id',]]\n",
305
+ "y = df.loc[:,'song_percent',]\n",
306
+ "\n",
307
+ "# Split our data into training and test sets\n",
308
+ "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": 17,
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "from sklearn.metrics import precision_score, recall_score"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 30,
323
+ "metadata": {
324
+ "id": "k47MaxR65Nq4"
325
+ },
326
+ "outputs": [],
327
+ "source": [
328
+ "class NaiveModel:\n",
329
+ " def __init__(self, k=10):\n",
330
+ " self.k = k\n",
331
+ " self.top_k_items = None\n",
332
+ "\n",
333
+ " def fit(self, X, y):\n",
334
+ " df = pd.DataFrame({'album_id': X['album_id'], 'song_percent': y})\n",
335
+ " avg_ratings = df.groupby('album_id')['song_percent'].mean()\n",
336
+ " self.top_k_items = avg_ratings.nlargest(self.k).index.tolist()\n",
337
+ "\n",
338
+ " def predict(self, X):\n",
339
+ " return [self.top_k_items] * len(X)"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": 36,
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "def precision_recall(actual,pred, k):\n",
349
+ " actuals = set(actual)\n",
350
+ " preds = set(pred[:k])\n",
351
+ " true_positives = len(actuals & preds)\n",
352
+ " precision = true_positives / k\n",
353
+ " recall = true_positives / len(actuals)\n",
354
+ " return precision, recall"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 38,
360
+ "metadata": {
361
+ "colab": {
362
+ "base_uri": "https://localhost:8080/"
363
+ },
364
+ "id": "Osq-NpGu9V2k",
365
+ "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
366
+ },
367
+ "outputs": [],
368
+ "source": [
369
+ "model = NaiveModel()\n",
370
+ "model.fit(X_train, y_train)\n",
371
+ "\n",
372
+ "y_pred = model.predict(X_val)\n",
373
+ "\n",
374
+ "y_test_binary = (y_val >= 0.5).astype(int)\n",
375
+ "y_test_items = X_val['album_id'][y_test_binary == 1].tolist()\n",
376
+ "\n",
377
+ "precisions = []\n",
378
+ "recalls = []\n",
379
+ "for i in range(len(X_val)):\n",
380
+ " precision, recall = precision_recall(y_test_items, y_pred[i], k=10)\n",
381
+ " precisions.append(precision)\n",
382
+ " recalls.append(recall)\n",
383
+ "\n",
384
+ "precision = sum(precisions) / len(precisions)\n",
385
+ "recall = sum(recalls) / len(recalls)\n",
386
+ "\n",
387
+ "print(f\"Precision: {precision}\")\n",
388
+ "print(f\"Recall: {recall}\")"
389
+ ]
390
+ }
391
+ ],
392
+ "metadata": {
393
+ "colab": {
394
+ "machine_shape": "hm",
395
+ "provenance": []
396
+ },
397
+ "kernelspec": {
398
+ "display_name": "Python 3",
399
+ "name": "python3"
400
+ },
401
+ "language_info": {
402
+ "codemirror_mode": {
403
+ "name": "ipython",
404
+ "version": 3
405
+ },
406
+ "file_extension": ".py",
407
+ "mimetype": "text/x-python",
408
+ "name": "python",
409
+ "nbconvert_exporter": "python",
410
+ "pygments_lexer": "ipython3",
411
+ "version": "3.6.15"
412
+ }
413
+ },
414
+ "nbformat": 4,
415
+ "nbformat_minor": 0
416
+ }
notebooks/nn_collab_filter.ipynb CHANGED
@@ -48,25 +48,14 @@
48
  },
49
  "outputs": [],
50
  "source": [
51
- "# prompt: copy a file from another directory to current directory in python code and create folders if needed\n",
52
- "\n",
53
  "import shutil\n",
54
  "import os\n",
55
  "\n",
56
  "def copy_file(src, dst):\n",
57
- " \"\"\"\n",
58
- " Copies a file from src to dst, creating any necessary directories.\n",
59
- "\n",
60
- " Args:\n",
61
- " src: The path to the source file.\n",
62
- " dst: The path to the destination file.\n",
63
- " \"\"\"\n",
64
- " # Create the destination directory if it doesn't exist.\n",
65
  " dst_dir = os.path.dirname(dst)\n",
66
  " if not os.path.exists(dst_dir):\n",
67
  " os.makedirs(dst_dir)\n",
68
  "\n",
69
- " # Copy the file.\n",
70
  " shutil.copy2(src, dst)\n",
71
  "\n",
72
  "# copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
@@ -84,7 +73,7 @@
84
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
85
  " zip_ref.extractall(dir_path)\n",
86
  "\n",
87
- "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n"
88
  ]
89
  },
90
  {
@@ -152,17 +141,14 @@
152
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
153
  "df = pd.DataFrame()\n",
154
  "index = 0\n",
155
- "# Loop through all files in the directory\n",
156
  "for filename in os.listdir(directory):\n",
157
- " # Check if the item is a file (not a subdirectory)\n",
158
  " if os.path.isfile(os.path.join(directory, filename)):\n",
159
  " if filename.find('.json') != -1 :\n",
160
  " index += 1\n",
161
  "\n",
162
- " # Print the filename or perform operations on the file\n",
163
  " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
164
  "\n",
165
- " # If you need the full file path, you can use:\n",
166
  " full_path = os.path.join(directory, filename)\n",
167
  "\n",
168
  " with open(full_path, 'r') as file:\n",
@@ -171,12 +157,9 @@
171
  " temp = pd.DataFrame(json_data['playlists'])\n",
172
  " expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
173
  "\n",
174
- " # Normalize the JSON data\n",
175
  " json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
176
  "\n",
177
- " # Concatenate the original DataFrame with the normalized JSON data\n",
178
  " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
179
- "\n",
180
  " result = result[cols]\n",
181
  "\n",
182
  " df = pd.concat([df, result], axis=0, ignore_index=True)\n",
@@ -234,10 +217,8 @@
234
  "outputs": [],
235
  "source": [
236
  "def create_ids(df, col, name):\n",
237
- " # Create a dictionary mapping unique values to IDs\n",
238
  " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
239
  "\n",
240
- " # Create a new column with the IDs\n",
241
  " df[f'{name}_id'] = df[col].map(value_to_id)\n",
242
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')\n",
243
  "\n",
@@ -252,10 +233,10 @@
252
  },
253
  "outputs": [],
254
  "source": [
255
- "# df = create_ids(df, 'artist_name', 'artist')\n",
256
  "df = create_ids(df, 'pid', 'playlist')\n",
257
- "# df = create_ids(df, 'track_name', 'track')\n",
258
- "# df = create_ids(df, 'album_name', 'album')"
259
  ]
260
  },
261
  {
@@ -282,10 +263,8 @@
282
  "source": [
283
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
284
  "\n",
285
- "# Step 2: Create a dictionary mapping unique combined values to IDs\n",
286
  "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
287
  "\n",
288
- "# Step 3: Map these IDs back to the DataFrame\n",
289
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
290
  "\n",
291
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')\n",
@@ -300,32 +279,13 @@
300
  },
301
  "outputs": [],
302
  "source": [
303
- "# df = df.groupby(['playlist_id','artist_album','artist_album_id','playlist_songs']).agg({\n",
304
- "# 'song_count': 'sum',\n",
305
- "# 'track_name': '|'.join,\n",
306
- "# 'track_name': '|'.join,\n",
307
- "# }).reset_index()\n",
308
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
309
  "\n",
310
- "# Encode the genres data\n",
311
  "encoder = LabelEncoder()\n",
312
  "encoder.fit(df['track_name'])\n",
313
  "df['track_id'] = encoder.transform(df['track_name'])"
314
  ]
315
  },
316
- {
317
- "cell_type": "code",
318
- "execution_count": null,
319
- "metadata": {
320
- "id": "r0YprWVe_LJ0"
321
- },
322
- "outputs": [],
323
- "source": [
324
- "# df['artist_count'] = df.groupby(['playlist_id','artist_id'])['song_id'].transform('nunique')\n",
325
- "# df['album_count'] = df.groupby(['playlist_id','artist_id','album_id'])['song_id'].transform('nunique')\n",
326
- "# df['song_count'] = df.groupby(['artist_id'])['song_id'].transform('count')"
327
- ]
328
- },
329
  {
330
  "cell_type": "code",
331
  "execution_count": null,
@@ -334,9 +294,7 @@
334
  },
335
  "outputs": [],
336
  "source": [
337
- "# df['artist_percent'] = df['artist_count'] / df['playlist_songs']\n",
338
- "df['song_percent'] = df['song_count'] / df['playlist_songs']\n",
339
- "# df['album_percent'] = df['album_count'] / df['playlist_songs']"
340
  ]
341
  },
342
  {
@@ -349,7 +307,6 @@
349
  "source": [
350
  "import numpy as np\n",
351
  "\n",
352
- "# Assuming you have a DataFrame 'df' with a column 'column_name'\n",
353
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
354
  ]
355
  },
@@ -429,20 +386,20 @@
429
  "source": [
430
  "class NNColabFiltering(nn.Module):\n",
431
  "\n",
432
- " def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):\n",
433
  " super().__init__()\n",
434
- " self.user_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_users)\n",
435
  " self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)\n",
436
- " self.fc1 = nn.Linear(embedding_dim_users+embedding_dim_items,n_activations)\n",
437
  " self.fc2 = nn.Linear(n_activations,1)\n",
438
  " self.rating_range = rating_range\n",
439
  "\n",
440
  " def forward(self, X):\n",
441
  " # Get embeddings for minibatch\n",
442
- " embedded_users = self.user_embeddings(X[:,0])\n",
443
  " embedded_items = self.item_embeddings(X[:,1])\n",
444
- " # Concatenate user and item embeddings\n",
445
- " embeddings = torch.cat([embedded_users,embedded_items],dim=1)\n",
446
  " # Pass embeddings through network\n",
447
  " preds = self.fc1(embeddings)\n",
448
  " preds = F.relu(preds)\n",
@@ -547,9 +504,9 @@
547
  "source": [
548
  "# Train the model\n",
549
  "dataloaders = {'train':trainloader, 'val':valloader}\n",
550
- "n_users = X.loc[:,'playlist_id'].max()+1\n",
551
  "n_items = X.loc[:,'artist_album_id'].max()+1\n",
552
- "model = NNColabFiltering(n_users,n_items,embedding_dim_users=50, embedding_dim_items=50, n_activations = 100,rating_range=[0.,1.])\n",
553
  "criterion = nn.MSELoss()\n",
554
  "lr=0.001\n",
555
  "n_epochs=10\n",
@@ -678,31 +635,26 @@
678
  "def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):\n",
679
  " model.eval()\n",
680
  "\n",
 
 
681
  "\n",
682
- " all_movie_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)\n",
683
- " user_ids = torch.full((len(all_movie_ids),), playlist_id, dtype=torch.long, device=device)\n",
684
- "\n",
685
- " # Initialize tensor to store all predictions\n",
686
- " all_predictions = torch.zeros(len(all_movie_ids), device=device)\n",
687
  "\n",
688
- " # Generate predictions in batches\n",
689
  " with torch.no_grad():\n",
690
- " for i in range(0, len(all_movie_ids), batch_size):\n",
691
- " batch_user_ids = user_ids[i:i+batch_size]\n",
692
- " batch_movie_ids = all_movie_ids[i:i+batch_size]\n",
693
  "\n",
694
- " input_tensor = torch.stack([batch_user_ids, batch_movie_ids], dim=1)\n",
695
  " batch_predictions = model(input_tensor).squeeze()\n",
696
  " all_predictions[i:i+batch_size] = batch_predictions\n",
697
  "\n",
698
- " # Convert to numpy for easier handling\n",
699
  " predictions = all_predictions.cpu().numpy()\n",
700
  "\n",
701
  " albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())\n",
702
  "\n",
703
  " unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)\n",
704
  "\n",
705
- " # Get top N recommendations\n",
706
  " top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]\n",
707
  " recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]\n",
708
  "\n",
 
48
  },
49
  "outputs": [],
50
  "source": [
 
 
51
  "import shutil\n",
52
  "import os\n",
53
  "\n",
54
  "def copy_file(src, dst):\n",
 
 
 
 
 
 
 
 
55
  " dst_dir = os.path.dirname(dst)\n",
56
  " if not os.path.exists(dst_dir):\n",
57
  " os.makedirs(dst_dir)\n",
58
  "\n",
 
59
  " shutil.copy2(src, dst)\n",
60
  "\n",
61
  "# copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
 
73
  " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
74
  " zip_ref.extractall(dir_path)\n",
75
  "\n",
76
+ "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
77
  ]
78
  },
79
  {
 
141
  "directory = os.getcwd() + '/data/raw/playlists/data'\n",
142
  "df = pd.DataFrame()\n",
143
  "index = 0\n",
144
+ "\n",
145
  "for filename in os.listdir(directory):\n",
 
146
  " if os.path.isfile(os.path.join(directory, filename)):\n",
147
  " if filename.find('.json') != -1 :\n",
148
  " index += 1\n",
149
  "\n",
 
150
  " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
151
  "\n",
 
152
  " full_path = os.path.join(directory, filename)\n",
153
  "\n",
154
  " with open(full_path, 'r') as file:\n",
 
157
  " temp = pd.DataFrame(json_data['playlists'])\n",
158
  " expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
159
  "\n",
 
160
  " json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
161
  "\n",
 
162
  " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
 
163
  " result = result[cols]\n",
164
  "\n",
165
  " df = pd.concat([df, result], axis=0, ignore_index=True)\n",
 
217
  "outputs": [],
218
  "source": [
219
  "def create_ids(df, col, name):\n",
 
220
  " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
221
  "\n",
 
222
  " df[f'{name}_id'] = df[col].map(value_to_id)\n",
223
  " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')\n",
224
  "\n",
 
233
  },
234
  "outputs": [],
235
  "source": [
236
+ "df = create_ids(df, 'artist_name', 'artist')\n",
237
  "df = create_ids(df, 'pid', 'playlist')\n",
238
+ "df = create_ids(df, 'track_name', 'track')\n",
239
+ "df = create_ids(df, 'album_name', 'album')"
240
  ]
241
  },
242
  {
 
263
  "source": [
264
  "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
265
  "\n",
 
266
  "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
267
  "\n",
 
268
  "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n",
269
  "\n",
270
  "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')\n",
 
279
  },
280
  "outputs": [],
281
  "source": [
 
 
 
 
 
282
  "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
283
  "\n",
 
284
  "encoder = LabelEncoder()\n",
285
  "encoder.fit(df['track_name'])\n",
286
  "df['track_id'] = encoder.transform(df['track_name'])"
287
  ]
288
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  {
290
  "cell_type": "code",
291
  "execution_count": null,
 
294
  },
295
  "outputs": [],
296
  "source": [
297
+ "df['song_percent'] = df['song_count'] / df['playlist_songs']"
 
 
298
  ]
299
  },
300
  {
 
307
  "source": [
308
  "import numpy as np\n",
309
  "\n",
 
310
  "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
311
  ]
312
  },
 
386
  "source": [
387
  "class NNColabFiltering(nn.Module):\n",
388
  "\n",
389
+ " def __init__(self, n_playlists, n_artists, embedding_dim_playlists, embedding_dim_items, n_activations, rating_range):\n",
390
  " super().__init__()\n",
391
+ " self.playlist_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_playlists)\n",
392
  " self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)\n",
393
+ " self.fc1 = nn.Linear(embedding_dim_playlists+embedding_dim_items,n_activations)\n",
394
  " self.fc2 = nn.Linear(n_activations,1)\n",
395
  " self.rating_range = rating_range\n",
396
  "\n",
397
  " def forward(self, X):\n",
398
  " # Get embeddings for minibatch\n",
399
+ " embedded_playlists = self.playlist_embeddings(X[:,0])\n",
400
  " embedded_items = self.item_embeddings(X[:,1])\n",
401
+ " # Concatenate playlist and item embeddings\n",
402
+ " embeddings = torch.cat([embedded_playlists,embedded_items],dim=1)\n",
403
  " # Pass embeddings through network\n",
404
  " preds = self.fc1(embeddings)\n",
405
  " preds = F.relu(preds)\n",
 
504
  "source": [
505
  "# Train the model\n",
506
  "dataloaders = {'train':trainloader, 'val':valloader}\n",
507
+ "n_playlists = X.loc[:,'playlist_id'].max()+1\n",
508
  "n_items = X.loc[:,'artist_album_id'].max()+1\n",
509
+ "model = NNColabFiltering(n_playlists,n_items,embedding_dim_playlists=50, embedding_dim_items=50, n_activations = 100,rating_range=[0.,1.])\n",
510
  "criterion = nn.MSELoss()\n",
511
  "lr=0.001\n",
512
  "n_epochs=10\n",
 
635
  "def generate_recommendations(artist_album, playlists, model, playlist_id, device, top_n=10, batch_size=1024):\n",
636
  " model.eval()\n",
637
  "\n",
638
+ " all_album_ids = torch.tensor(artist_album['artist_album_id'].values, dtype=torch.long, device=device)\n",
639
+ " playlist_ids = torch.full((len(all_album_ids),), playlist_id, dtype=torch.long, device=device)\n",
640
  "\n",
641
+ " all_predictions = torch.zeros(len(all_album_ids), device=device)\n",
 
 
 
 
642
  "\n",
 
643
  " with torch.no_grad():\n",
644
+ " for i in range(0, len(all_album_ids), batch_size):\n",
645
+ " batch_playlist_ids = playlist_ids[i:i+batch_size]\n",
646
+ " batch_album_ids = all_album_ids[i:i+batch_size]\n",
647
  "\n",
648
+ " input_tensor = torch.stack([batch_playlist_ids, batch_album_ids], dim=1)\n",
649
  " batch_predictions = model(input_tensor).squeeze()\n",
650
  " all_predictions[i:i+batch_size] = batch_predictions\n",
651
  "\n",
 
652
  " predictions = all_predictions.cpu().numpy()\n",
653
  "\n",
654
  " albums_listened = set(playlists.loc[playlists['playlist_id'] == playlist_id, 'artist_album_id'].tolist())\n",
655
  "\n",
656
  " unlistened_mask = np.isin(artist_album['artist_album_id'].values, list(albums_listened), invert=True)\n",
657
  "\n",
 
658
  " top_indices = np.argsort(predictions[unlistened_mask])[-top_n:][::-1]\n",
659
  " recs = artist_album['artist_album_id'].values[unlistened_mask][top_indices]\n",
660
  "\n",
scripts/build_features.py CHANGED
@@ -1,49 +1,21 @@
 
1
  import os
2
- import urllib.request
3
- import zipfile
4
- import json
5
  import pandas as pd
6
- import time
7
- import torch
8
  import numpy as np
9
  import pandas as pd
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- import torch.optim as optim
13
- from torch.utils.data import DataLoader, TensorDataset
14
- from sklearn.model_selection import train_test_split
15
- import matplotlib.pyplot as plt
16
  from sklearn.preprocessing import LabelEncoder
17
  import shutil
18
  import os
19
- import pyarrow.parquet as pq
20
-
21
-
22
- cols = [
23
- 'name',
24
- 'pid',
25
- 'num_followers',
26
- 'pos',
27
- 'artist_name',
28
- 'track_name',
29
- 'album_name'
30
- ]
31
-
32
-
33
- def copy_file(src, dst):
34
-
35
- dst_dir = os.path.dirname(dst)
36
- if not os.path.exists(dst_dir):
37
- os.makedirs(dst_dir)
38
-
39
- shutil.copy2(src, dst)
40
-
41
- def unzip_archive(filepath, dir_path):
42
- with zipfile.ZipFile(f"{filepath}", 'r') as zip_ref:
43
- zip_ref.extractall(dir_path)
44
-
45
 
46
  def make_dir(directory):
 
 
 
 
 
 
 
 
47
  if os.path.exists(directory):
48
  shutil.rmtree(directory)
49
  os.makedirs(directory)
@@ -51,52 +23,73 @@ def make_dir(directory):
51
  os.makedirs(directory)
52
 
53
 
54
- def make_dataset():
55
- directory = os.getcwd() + '/data/raw/playlists/data'
56
- df = pd.DataFrame()
57
- index = 0
58
- # Loop through all files in the directory
59
- for filename in os.listdir(directory):
60
- # Check if the item is a file (not a subdirectory)
61
- if os.path.isfile(os.path.join(directory, filename)):
62
- if filename.find('.json') != -1 :
63
- index += 1
64
-
65
- # Print the filename or perform operations on the file
66
- print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
 
 
 
 
67
 
68
- # If you need the full file path, you can use:
69
- full_path = os.path.join(directory, filename)
70
-
71
- with open(full_path, 'r') as file:
72
- json_data = json.load(file)
73
-
74
- temp = pd.DataFrame(json_data['playlists'])
75
- expanded_df = temp.explode('tracks').reset_index(drop=True)
76
 
77
- # Normalize the JSON data
78
- json_normalized = pd.json_normalize(expanded_df['tracks'])
79
 
80
- # Concatenate the original DataFrame with the normalized JSON data
81
- result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
 
82
 
83
- result = result[cols]
 
 
84
 
85
- df = pd.concat([df, result], axis=0, ignore_index=True)
86
 
87
- if index % 50 == 0:
88
- df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
89
- del df
90
- df = pd.DataFrame()
91
- if index % 200 == 0:
92
- break
93
-
94
 
95
- if __name__ == '__main__':
96
- unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
97
- directory = os.getcwd() + '/data/raw/data'
98
- make_dir(directory)
99
- directory = os.getcwd() + '/data/processed'
100
- make_dir(directory)
101
- make_dataset()
102
 
 
 
 
1
+ import numpy as np
2
  import os
 
 
 
3
  import pandas as pd
 
 
4
  import numpy as np
5
  import pandas as pd
6
  from sklearn.preprocessing import LabelEncoder
7
  import shutil
8
  import os
9
 
10
  def make_dir(directory):
11
+ '''
12
+ Creates a new blank directory
13
+
14
+ Inputs:
15
+ directory: path to create a new directory at
16
+ Returns:
17
+
18
+ '''
19
  if os.path.exists(directory):
20
  shutil.rmtree(directory)
21
  os.makedirs(directory)
 
23
  os.makedirs(directory)
24
 
25
 
26
+ def read_parquet_folder(folder_path):
27
+ '''
28
+ Creates the pandas dataframe from a folder of parquet files
29
+
30
+ Inputs:
31
+ folder_path: the folder path for the parquet files
32
+ Returns:
33
+ the concatenated dataframe of all parquet files in the folder
34
+ '''
35
+ dataframes = []
36
+ for file in os.listdir(folder_path):
37
+ if file.endswith('.parquet'):
38
+ file_path = os.path.join(folder_path, file)
39
+ df = pd.read_parquet(file_path)
40
+ dataframes.append(df)
41
+
42
+ return pd.concat(dataframes, ignore_index=True)
43
+
44
+
45
+ def create_ids(df, col, name):
46
+ '''
47
+ Creates unique ids for the features and creates mapping documents
48
+
49
+ Inputs:
50
+ df: dataframe with the features
51
+ col: column to create ids on
52
+ name: name of the newly created id
53
+ Returns:
54
+ df: dataframe with the mapped ids
55
+
56
+ '''
57
+ value_to_id = {val: i for i, val in enumerate(df[col].unique())}
58
+
59
+ df[f'{name}_id'] = df[col].map(value_to_id)
60
+ df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
61
+
62
+ return df
63
 
64
+ if __name__ == '__main__':
65
+ folder_path = os.getcwd() + '/data/raw/data'
66
+ df = read_parquet_folder(folder_path)
67
 
68
+ directory = os.getcwd() + '/data/processed'
69
+ make_dir(directory)
70
 
71
+ df = create_ids(df, 'artist_name', 'artist')
72
+ df = create_ids(df, 'pid', 'playlist')
73
+ df = create_ids(df, 'album_name', 'album')
74
 
75
+ df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')
76
+ df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
77
+ df['playlist_songs'] += 1
78
 
79
+ df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
80
+ value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
81
+ df['artist_album_id'] = df['artist_album'].map(value_to_id)
82
+
83
+ df[['artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + '/data/processed/artist_album.csv')
84
 
85
+ df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')
86
 
87
+ encoder = LabelEncoder()
88
+ encoder.fit(df['track_name'])
89
+
90
+ df['track_id'] = encoder.transform(df['track_name'])
91
+ df['song_percent'] = df['song_count'] / df['playlist_songs']
92
+ df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))
 
93
 
94
+ artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
95
+ artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
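As a quick sanity check on the implicit-rating transform above: a playlist with 40 songs holding 4 tracks from one artist/album pair gets song_percent = 4/40 = 0.10, which the logistic step maps to 1/(1+exp(-0.10)) ≈ 0.525, so every rating lands in the upper half of (0, 1). A toy version of the same two lines (made-up numbers, real column names):

import numpy as np
import pandas as pd

# Two artist/album pairs inside one 40-song playlist.
toy = pd.DataFrame({'playlist_id': [0, 0], 'artist_album_id': [1, 2],
                    'song_count': [4, 1], 'playlist_songs': [40, 40]})
toy['song_percent'] = toy['song_count'] / toy['playlist_songs']   # 0.100, 0.025
toy['song_percent'] = 1 / (1 + np.exp(-toy['song_percent']))      # ~0.525, ~0.506
print(toy)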
scripts/make_dataset.py CHANGED
@@ -1,25 +1,63 @@
1
- import numpy as np
2
  import os
3
- import urllib.request
4
  import zipfile
5
  import json
6
  import pandas as pd
7
- import time
8
- import torch
9
- import numpy as np
10
  import pandas as pd
11
- import torch.nn as nn
12
- import torch.nn.functional as F
13
- import torch.optim as optim
14
- from torch.utils.data import DataLoader, TensorDataset
15
- from sklearn.model_selection import train_test_split
16
- import matplotlib.pyplot as plt
17
- from sklearn.preprocessing import LabelEncoder
18
  import shutil
19
  import os
20
- import pyarrow.parquet as pq
21
 
22
  def make_dir(directory):
23
  if os.path.exists(directory):
24
  shutil.rmtree(directory)
25
  os.makedirs(directory)
@@ -27,56 +65,54 @@ def make_dir(directory):
27
  os.makedirs(directory)
28
 
29
 
30
- def read_parquet_folder(folder_path):
31
- dataframes = []
32
- for file in os.listdir(folder_path):
33
- if file.endswith('.parquet'):
34
- file_path = os.path.join(folder_path, file)
35
- df = pd.read_parquet(file_path)
36
- dataframes.append(df)
37
-
38
- return pd.concat(dataframes, ignore_index=True)
39
-
40
-
41
- def create_ids(df, col, name):
42
- # Create a dictionary mapping unique values to IDs
43
- value_to_id = {val: i for i, val in enumerate(df[col].unique())}
44
 
45
- # Create a new column with the IDs
46
- df[f'{name}_id'] = df[col].map(value_to_id)
47
- df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
 
48
 
49
- return df
50
 
51
- if __name__ == '__main__':
52
- folder_path = os.getcwd() + '/data/raw/data'
53
- df = read_parquet_folder(folder_path)
54
 
55
- directory = os.getcwd() + '/data/processed'
56
- make_dir(directory)
57
 
58
- df = create_ids(df, 'artist_name', 'artist')
59
- df = create_ids(df, 'pid', 'playlist')
60
- df = create_ids(df, 'album_name', 'album')
61
 
62
- df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')
63
- df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
64
- df['playlist_songs'] += 1
65
 
66
- df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
67
- value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
68
- df['artist_album_id'] = df['artist_album'].map(value_to_id)
69
-
70
- df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')
71
 
72
- df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')
73
 
74
- encoder = LabelEncoder()
75
- encoder.fit(df['track_name'])
76
-
77
- df['track_id'] = encoder.transform(df['track_name'])
78
- df['song_percent'] = df['song_count'] / df['playlist_songs']
79
- df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))
 
80
 
81
- artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
82
- artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
 
 
1
  import os
 
2
  import zipfile
3
  import json
4
  import pandas as pd
 
 
 
5
  import pandas as pd
6
  import shutil
7
  import os
8
+
9
+
10
+ cols = [
11
+ 'name',
12
+ 'pid',
13
+ 'num_followers',
14
+ 'pos',
15
+ 'artist_name',
16
+ 'track_name',
17
+ 'album_name'
18
+ ]
19
+
20
+
21
+ def copy_file(src, dst):
22
+ '''
23
+ Copies a file from one directory to another, creating the destination directory if needed
24
+
25
+ Inputs:
26
+ src: filepath to use as the source
27
+ dst: filepath to copy the file to
28
+
29
+ Returns:
30
+
31
+ '''
32
+ dst_dir = os.path.dirname(dst)
33
+ if not os.path.exists(dst_dir):
34
+ os.makedirs(dst_dir)
35
+
36
+ shutil.copy2(src, dst)
37
+
38
+ def unzip_archive(filepath, dir_path):
39
+ '''
40
+ Unzips a zipfile to the dir_path
41
+
42
+ Inputs:
43
+ filepath: filepath of the zip file
44
+ dir_path: path to extract the zip file contents to
45
+ Returns:
46
+
47
+ '''
48
+ with zipfile.ZipFile(f"{filepath}", 'r') as zip_ref:
49
+ zip_ref.extractall(dir_path)
50
+
51
 
52
  def make_dir(directory):
53
+ '''
54
+ Creates a new blank directory
55
+
56
+ Inputs:
57
+ directory: path to create a new directory at
58
+ Returns:
59
+
60
+ '''
61
  if os.path.exists(directory):
62
  shutil.rmtree(directory)
63
  os.makedirs(directory)
 
65
  os.makedirs(directory)
66
 
67
 
68
+ def make_dataset():
69
+ '''
70
+ Creates the directory of parquet files to create the
71
+ dataset with, used parquet to reduce memory load
72
+
73
+ Inputs:
74
+
75
+ Returns:
76
+
77
+ '''
78
+ directory = os.getcwd() + '/data/raw/playlists/data'
79
+ df = pd.DataFrame()
80
+ index = 0
 
81
 
82
+ for filename in os.listdir(directory):
83
+ if os.path.isfile(os.path.join(directory, filename)):
84
+ if filename.find('.json') != -1 :
85
+ index += 1
86
 
87
+ print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
88
 
89
+ full_path = os.path.join(directory, filename)
 
 
90
 
91
+ with open(full_path, 'r') as file:
92
+ json_data = json.load(file)
93
 
94
+ temp = pd.DataFrame(json_data['playlists'])
95
+ expanded_df = temp.explode('tracks').reset_index(drop=True)
96
+ json_normalized = pd.json_normalize(expanded_df['tracks'])
97
 
98
+ result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
99
+ result = result[cols]
 
100
 
101
+ df = pd.concat([df, result], axis=0, ignore_index=True)
102
 
103
+ if index % 50 == 0:
104
+ df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
105
+ del df
106
+ df = pd.DataFrame()
107
+ if index % 200 == 0:
108
+ break
109
+
110
 
111
+ if __name__ == '__main__':
112
+ unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
113
+ directory = os.getcwd() + '/data/raw/data'
114
+ make_dir(directory)
115
+ directory = os.getcwd() + '/data/processed'
116
+ make_dir(directory)
117
+ make_dataset()
118
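For reference, the explode/json_normalize flattening used in make_dataset() on a toy payload; the dict below only mimics the shape the code expects from each slice file, it is not real Spotify data:

import pandas as pd

json_data = {'playlists': [{'name': 'road trip', 'pid': 0, 'num_followers': 1,
                            'tracks': [{'pos': 0, 'artist_name': 'A',
                                        'track_name': 'T', 'album_name': 'X'},
                                       {'pos': 1, 'artist_name': 'B',
                                        'track_name': 'U', 'album_name': 'Y'}]}]}
temp = pd.DataFrame(json_data['playlists'])
expanded_df = temp.explode('tracks').reset_index(drop=True)    # one row per track
json_normalized = pd.json_normalize(expanded_df['tracks'])     # track dicts -> columns
result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
print(result[['name', 'pid', 'num_followers', 'pos',
              'artist_name', 'track_name', 'album_name']])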
 
 
 
scripts/model.py CHANGED
@@ -7,9 +7,6 @@ Brinnae Bent
7
  """
8
 
9
  import os
10
- import urllib.request
11
- import zipfile
12
- import json
13
  import pandas as pd
14
  import time
15
  import torch
@@ -18,26 +15,8 @@ import pandas as pd
18
  import torch.nn as nn
19
  import torch.nn.functional as F
20
  import torch.optim as optim
21
- from torch.utils.data import DataLoader, TensorDataset
22
  from sklearn.model_selection import train_test_split
23
- import matplotlib.pyplot as plt
24
- from sklearn.preprocessing import LabelEncoder
25
-
26
-
27
-
28
-
29
- def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
30
- # Convert training and test data to TensorDatasets
31
- trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
32
- torch.from_numpy(np.array(y_train)).float())
33
- valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
34
- torch.from_numpy(np.array(y_val)).float())
35
-
36
- # Create Dataloaders for our training and test data to allow us to iterate over minibatches
37
- trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
38
- valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
39
-
40
- return trainloader, valloader
41
 
42
 
43
  class NNColabFiltering(nn.Module):
@@ -64,9 +43,50 @@ class NNColabFiltering(nn.Module):
64
  preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
65
  return preds
66
 
67
- def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
68
 
69
- model = model.to(device) # Send model to GPU if available
70
  since = time.time()
71
 
72
  costpaths = {'train':[],'val':[]}
@@ -75,47 +95,36 @@ def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5,
75
  print('Epoch {}/{}'.format(epoch, num_epochs - 1))
76
  print('-' * 10)
77
 
78
- # Each epoch has a training and validation phase
79
  for phase in ['train', 'val']:
80
  if phase == 'train':
81
- model.train() # Set model to training mode
82
  else:
83
- model.eval() # Set model to evaluate mode
84
 
85
  running_loss = 0.0
86
 
87
- # Get the inputs and labels, and send to GPU if available
88
  index = 0
89
  for (inputs,labels) in dataloaders[phase]:
90
  inputs = inputs.to(device)
91
  labels = labels.to(device)
92
 
93
- # Zero the weight gradients
94
  optimizer.zero_grad()
95
 
96
- # Forward pass to get outputs and calculate loss
97
- # Track gradient only for training data
98
  with torch.set_grad_enabled(phase == 'train'):
99
  outputs = model.forward(inputs).view(-1)
100
  loss = criterion(outputs, labels)
101
 
102
- # Backpropagation to get the gradients with respect to each weight
103
- # Only if in train
104
  if phase == 'train':
105
  loss.backward()
106
- # Update the weights
107
  optimizer.step()
108
 
109
- # Convert loss into a scalar and add it to running_loss
110
  running_loss += np.sqrt(loss.item()) * labels.size(0)
111
  print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
112
  index +=1
113
 
114
- # Step along learning rate scheduler when in train
115
  if (phase == 'train') and (scheduler is not None):
116
  scheduler.step()
117
 
118
- # Calculate and display average loss and accuracy for the epoch
119
  epoch_loss = running_loss / len(dataloaders[phase].dataset)
120
  costpaths[phase].append(epoch_loss)
121
  print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
@@ -150,7 +159,6 @@ if __name__ == '__main__':
150
 
151
  cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
152
 
153
-
154
  # Save the entire model
155
  torch.save(model, os.getcwd() + '/models/recommender.pt')
156
 
 
7
  """
8
 
9
  import os
 
 
 
10
  import pandas as pd
11
  import time
12
  import torch
 
15
  import torch.nn as nn
16
  import torch.nn.functional as F
17
  import torch.optim as optim
18
+ from torch.utils.data import TensorDataset
19
  from sklearn.model_selection import train_test_split
20
 
21
 
22
  class NNColabFiltering(nn.Module):
 
43
  preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
44
  return preds
45
 
46
+ def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
47
+ '''
48
+ Converts the training and validation data into TensorDatasets and wraps them in DataLoaders
49
+
50
+ Inputs:
51
+ X_train: training data features
52
+ y_train: training data target
53
+ X_val: validation data features
54
+ y_val: validation data targets
55
+ batch_size: the batch size to use
56
+
57
+ Returns:
58
+ trainloader: training dataloader
59
+ valloader: validation dataloader
60
+ '''
61
+ # Convert training and validation data to TensorDatasets
62
+ trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
63
+ torch.from_numpy(np.array(y_train)).float())
64
+ valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
65
+ torch.from_numpy(np.array(y_val)).float())
66
 
67
+ # Create DataLoaders for the training and validation data so we can iterate over minibatches
68
+ trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
69
+ valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
70
+
71
+ return trainloader, valloader
72
+
73
+ def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
74
+ '''
75
+ Trains and validates the model for the given number of epochs
76
+
77
+ Inputs:
78
+ model: the model to train
79
+ criterion: the criterion to use to train
80
+ optimizer: the optimizer to use to train
81
+ dataloaders: the dict of dataloaders to use in training and validation
82
+ device: the torch defined cpu/gpu
83
+ num_epochs: number of epochs to use for training
84
+ scheduler: the learning rate scheduler to step after each training epoch (optional)
85
+
86
+ Returns:
87
+ costpaths: the loss for each epoch for validation and training
88
+ '''
89
+ model = model.to(device)
90
  since = time.time()
91
 
92
  costpaths = {'train':[],'val':[]}
 
95
  print('Epoch {}/{}'.format(epoch, num_epochs - 1))
96
  print('-' * 10)
97
 
 
98
  for phase in ['train', 'val']:
99
  if phase == 'train':
100
+ model.train()
101
  else:
102
+ model.eval()
103
 
104
  running_loss = 0.0
105
 
 
106
  index = 0
107
  for (inputs,labels) in dataloaders[phase]:
108
  inputs = inputs.to(device)
109
  labels = labels.to(device)
110
 
 
111
  optimizer.zero_grad()
112
 
 
 
113
  with torch.set_grad_enabled(phase == 'train'):
114
  outputs = model.forward(inputs).view(-1)
115
  loss = criterion(outputs, labels)
116
 
 
 
117
  if phase == 'train':
118
  loss.backward()
 
119
  optimizer.step()
120
 
 
121
  running_loss += np.sqrt(loss.item()) * labels.size(0)
122
  print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
123
  index +=1
124
 
 
125
  if (phase == 'train') and (scheduler is not None):
126
  scheduler.step()
127
 
 
128
  epoch_loss = running_loss / len(dataloaders[phase].dataset)
129
  costpaths[phase].append(epoch_loss)
130
  print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
 
159
 
160
  cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
161
 
 
162
  # Save the entire model
163
  torch.save(model, os.getcwd() + '/models/recommender.pt')
164
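A minimal sketch of how prep_dataloaders and train_model fit together, using synthetic (playlist_id, artist_album_id) pairs and a throwaway stand-in module so the snippet runs on its own; it is not the NNColabFiltering setup from the script's __main__ block:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

X = np.random.randint(0, 50, size=(200, 2))   # fake (playlist_id, artist_album_id) pairs
y = np.random.rand(200).astype(np.float32)    # fake song_percent targets in (0, 1)
trainloader, valloader = prep_dataloaders(X[:160], y[:160], X[160:], y[160:], batch_size=32)
dataloaders = {'train': trainloader, 'val': valloader}

class StandIn(nn.Module):
    """Dot product of two small embeddings; a placeholder, not the repo's model."""
    def __init__(self, n_ids=50, dim=8):
        super().__init__()
        self.emb_playlist = nn.Embedding(n_ids, dim)
        self.emb_album = nn.Embedding(n_ids, dim)
    def forward(self, x):
        return (self.emb_playlist(x[:, 0]) * self.emb_album(x[:, 1])).sum(dim=1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StandIn()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
cost_paths = train_model(model, criterion, optimizer, dataloaders, device, num_epochs=2)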