library(readr)
spotify_songs <- read_csv("/Users/crystalmickelsen/Downloads/spotify_songs.csv")
## Rows: 32833 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): track_id, track_name, track_artist, track_album_id, track_album_na...
## dbl (13): track_popularity, danceability, energy, key, loudness, mode, speec...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## track_id track_name track_artist track_popularity
## Length:32833 Length:32833 Length:32833 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 24.00
## Mode :character Mode :character Mode :character Median : 45.00
## Mean : 42.48
## 3rd Qu.: 62.00
## Max. :100.00
## track_album_id track_album_name track_album_release_date
## Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## playlist_name playlist_id playlist_genre playlist_subgenre
## Length:32833 Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## danceability energy key loudness
## Min. :0.0000 Min. :0.000175 Min. : 0.000 Min. :-46.448
## 1st Qu.:0.5630 1st Qu.:0.581000 1st Qu.: 2.000 1st Qu.: -8.171
## Median :0.6720 Median :0.721000 Median : 6.000 Median : -6.166
## Mean :0.6548 Mean :0.698619 Mean : 5.374 Mean : -6.720
## 3rd Qu.:0.7610 3rd Qu.:0.840000 3rd Qu.: 9.000 3rd Qu.: -4.645
## Max. :0.9830 Max. :1.000000 Max. :11.000 Max. : 1.275
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.0410 1st Qu.:0.0151 1st Qu.:0.0000000
## Median :1.0000 Median :0.0625 Median :0.0804 Median :0.0000161
## Mean :0.5657 Mean :0.1071 Mean :0.1753 Mean :0.0847472
## 3rd Qu.:1.0000 3rd Qu.:0.1320 3rd Qu.:0.2550 3rd Qu.:0.0048300
## Max. :1.0000 Max. :0.9180 Max. :0.9940 Max. :0.9940000
## liveness valence tempo duration_ms
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 4000
## 1st Qu.:0.0927 1st Qu.:0.3310 1st Qu.: 99.96 1st Qu.:187819
## Median :0.1270 Median :0.5120 Median :121.98 Median :216000
## Mean :0.1902 Mean :0.5106 Mean :120.88 Mean :225800
## 3rd Qu.:0.2480 3rd Qu.:0.6930 3rd Qu.:133.92 3rd Qu.:253585
## Max. :0.9960 Max. :0.9910 Max. :239.44 Max. :517810
### Check for outliers, and truncate where needed #### Danceability
#### #Truncate danceability to .27
#### Energy
#### #Truncate energy
#### Truncate loudness
#### Acousticness
#### Truncate acousticness
#### Instrumentalness
#Truncate instrumentalness
#### Liveness
#### Truncate liveness
#### Duration in milliseconds
#### Truncate duration_ms
#### Track Popularity - no truncation needed
#### Correlation matrix - loudness and acousticness is highly correlated
to energy
## energy danceability loudness speechiness
## energy 1.00000000 -0.090952941 0.675069403 -0.033256178
## danceability -0.09095294 1.000000000 0.007506094 0.182138203
## loudness 0.67506940 0.007506094 1.000000000 0.006090685
## speechiness -0.03325618 0.182138203 0.006090685 1.000000000
## acousticness -0.51612165 0.002849648 -0.324549507 0.040334465
## instrumentalness 0.07839311 -0.047060773 -0.152070238 -0.154644818
## liveness 0.17276038 -0.123692791 0.094874764 0.059496206
## track_popularity -0.10978060 0.065151338 0.061378290 0.006819442
## duration_ms 0.01028517 -0.101070363 -0.127525512 -0.088314547
## acousticness instrumentalness liveness track_popularity
## energy -0.516121648 0.0783931055 0.1727603801 -0.109780603
## danceability 0.002849648 -0.0470607728 -0.1236927910 0.065151338
## loudness -0.324549507 -0.1520702381 0.0948747639 0.061378290
## speechiness 0.040334465 -0.1546448181 0.0594962064 0.006819442
## acousticness 1.000000000 -0.0881446167 -0.0868127674 0.091731289
## instrumentalness -0.088144617 1.0000000000 0.0004948589 -0.177014077
## liveness -0.086812767 0.0004948589 1.0000000000 -0.050879375
## track_popularity 0.091731289 -0.1770140771 -0.0508793749 1.000000000
## duration_ms -0.075351565 0.0906037288 -0.0071091512 -0.141520092
## duration_ms
## energy 0.010285166
## danceability -0.101070363
## loudness -0.127525512
## speechiness -0.088314547
## acousticness -0.075351565
## instrumentalness 0.090603729
## liveness -0.007109151
## track_popularity -0.141520092
## duration_ms 1.000000000
## Data Preprocessing
## track_id track_name track_artist
## 0 5 5
## track_popularity track_album_id track_album_name
## 0 0 5
## track_album_release_date playlist_name playlist_id
## 0 0 0
## playlist_genre playlist_subgenre danceability
## 0 0 0
## energy key loudness
## 0 0 0
## mode speechiness acousticness
## 0 0 0
## instrumentalness liveness valence
## 0 0 0
## tempo duration_ms
## 0 0
# Set the seed for reproducibility
set.seed(3)
# Randomly sample row indices for the training set
train_indices <- sample(1:NROW(spotify_songs),NROW(spotify_songs)*0.70)
# Create the training set
train_data <- spotify_songs[train_indices, ]
# Create the testing set
test_data <- spotify_songs[-train_indices, ]
##
## Call:
## lm(formula = energy ~ playlist_genre + loudness + danceability +
## acousticness + liveness + instrumentalness + duration_ms +
## track_popularity, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46960 -0.07063 0.00365 0.07126 0.60173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.011e+00 5.805e-03 174.235 < 2e-16 ***
## playlist_genrelatin -7.826e-03 2.669e-03 -2.932 0.00337 **
## playlist_genrepop -2.773e-02 2.556e-03 -10.849 < 2e-16 ***
## playlist_genrer&b -6.036e-02 2.725e-03 -22.152 < 2e-16 ***
## playlist_genrerap -4.240e-02 2.608e-03 -16.258 < 2e-16 ***
## playlist_genrerock 3.005e-02 2.779e-03 10.813 < 2e-16 ***
## loudness 3.910e-02 3.075e-04 127.144 < 2e-16 ***
## danceability -3.192e-02 5.720e-03 -5.580 2.43e-08 ***
## acousticness -2.571e-01 4.259e-03 -60.371 < 2e-16 ***
## liveness 1.123e-01 6.047e-03 18.577 < 2e-16 ***
## instrumentalness 4.147e+00 1.912e-01 21.688 < 2e-16 ***
## duration_ms 1.148e-07 1.432e-08 8.015 1.16e-15 ***
## track_popularity -6.527e-04 3.019e-05 -21.616 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1103 on 22970 degrees of freedom
## Multiple R-squared: 0.6204, Adjusted R-squared: 0.6202
## F-statistic: 3129 on 12 and 22970 DF, p-value: < 2.2e-16
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3715 0.5257 0.4900 0.6427 1.0000
##
## Call:
## lm(formula = energy ~ playlist_genre + loudness + danceability +
## acousticness + liveness + instrumentalness + duration_ms +
## track_popularity, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46960 -0.07063 0.00365 0.07126 0.60173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.011e+00 5.805e-03 174.235 < 2e-16 ***
## playlist_genrelatin -7.826e-03 2.669e-03 -2.932 0.00337 **
## playlist_genrepop -2.773e-02 2.556e-03 -10.849 < 2e-16 ***
## playlist_genrer&b -6.036e-02 2.725e-03 -22.152 < 2e-16 ***
## playlist_genrerap -4.240e-02 2.608e-03 -16.258 < 2e-16 ***
## playlist_genrerock 3.005e-02 2.779e-03 10.813 < 2e-16 ***
## loudness 3.910e-02 3.075e-04 127.144 < 2e-16 ***
## danceability -3.192e-02 5.720e-03 -5.580 2.43e-08 ***
## acousticness -2.571e-01 4.259e-03 -60.371 < 2e-16 ***
## liveness 1.123e-01 6.047e-03 18.577 < 2e-16 ***
## instrumentalness 4.147e+00 1.912e-01 21.688 < 2e-16 ***
## duration_ms 1.148e-07 1.432e-08 8.015 1.16e-15 ***
## track_popularity -6.527e-04 3.019e-05 -21.616 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1103 on 22970 degrees of freedom
## Multiple R-squared: 0.6204, Adjusted R-squared: 0.6202
## F-statistic: 3129 on 12 and 22970 DF, p-value: < 2.2e-16
lm_mse_train <- mean((spotify_lm$fitted.values - train_data$energy)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 0.01"
# Predict on testing data
lm_test_pred <- predict(spotify_lm, newdata = test_data)
# Cal
lm_mse_train <- mean ((lm_test_pred - test_data$energy)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Testing MSE for Linear Model: 0.01"
## Interactions added Linear Regression
##
## Call:
## lm(formula = energy ~ playlist_genre + loudness + danceability +
## acousticness + liveness + instrumentalness + duration_ms +
## track_popularity + acousticness * liveness, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50200 -0.07033 0.00358 0.07136 0.56794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.018e+00 5.888e-03 172.929 < 2e-16 ***
## playlist_genrelatin -8.388e-03 2.668e-03 -3.144 0.00167 **
## playlist_genrepop -2.784e-02 2.553e-03 -10.904 < 2e-16 ***
## playlist_genrer&b -6.061e-02 2.722e-03 -22.264 < 2e-16 ***
## playlist_genrerap -4.263e-02 2.606e-03 -16.359 < 2e-16 ***
## playlist_genrerock 2.988e-02 2.776e-03 10.760 < 2e-16 ***
## loudness 3.909e-02 3.073e-04 127.240 < 2e-16 ***
## danceability -3.236e-02 5.715e-03 -5.662 1.51e-08 ***
## acousticness -2.958e-01 7.152e-03 -41.355 < 2e-16 ***
## liveness 8.082e-02 7.649e-03 10.566 < 2e-16 ***
## instrumentalness 4.188e+00 1.911e-01 21.913 < 2e-16 ***
## duration_ms 1.108e-07 1.432e-08 7.741 1.02e-14 ***
## track_popularity -6.510e-04 3.017e-05 -21.579 < 2e-16 ***
## acousticness:liveness 2.290e-01 3.407e-02 6.721 1.85e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1102 on 22969 degrees of freedom
## Multiple R-squared: 0.6212, Adjusted R-squared: 0.621
## F-statistic: 2897 on 13 and 22969 DF, p-value: < 2.2e-16
library(kknn)
k <- 5
knn_fit_std <- kknn(energy~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, spotify_songs, spotify_songs, k=k)
knn_std_mse <- mean((knn_fit_std$fitted.values - spotify_songs$energy)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse, 2)))
## [1] "Mean Squared Error for stdKNN: 0"
knn_fit <- kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, spotify_songs, spotify_songs, k=k, scale = FALSE)
knn_mse <- mean((knn_fit$fitted.values - spotify_songs$energy)^2)
print(paste("Mean Squared Error for KNN:", round(knn_mse, 2)))
## [1] "Mean Squared Error for KNN: 0.01"
knn_model <- kknn(energy~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, train = train_data, test = train_data, k = 5)
# Predict on training data
knn_train_pred <- fitted.values(knn_model)
# Calculate in-sample MSE manually
knn_train_mse <- mean((train_data$energy - knn_train_pred)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse))
## [1] "In-Sample MSE for KNN: 0.00397292841398528"
# Predict on testing data
knn_model_test <- kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, train = train_data, test = test_data, k = 5)
knn_test_pred <- fitted.values(knn_model_test)
# Calculate out-of-sample MSE manually
knn_test_mse <- mean((test_data$energy - knn_test_pred)^2)
print(paste("Out-of-Sample MSE for KNN: ", knn_test_mse))
## [1] "Out-of-Sample MSE for KNN: 0.0139423967010956"
# Initialize a data frame to store MSE for each k
mse_df <- data.frame(k = integer(), MSE_train = numeric(), MSE_test = numeric())
for (k in c(1, 3, 5, 7, 9, 11)) {
# Fit the k-NN model using training data
knn_model_train <- kknn::kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, train = train_data, test = train_data, k = k)
# Calculate the training MSE
mse_train <- mean((knn_model_train$fitted.values - train_data$energy)^2)
# Test the k-NN with testing data
knn_model_test <- kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, train = train_data, test = test_data, k = k)
mse_test <- mean((knn_model_test$fitted.values - test_data$energy)^2)
# save the results
mse_df <- rbind(mse_df, data.frame(k = k, MSE_train = mse_train, MSE_test = mse_test))
}
# Show the MSE data frame
print(mse_df)
## k MSE_train MSE_test
## 1 1 0.000000000 0.02013862
## 2 3 0.001969737 0.01559235
## 3 5 0.003972928 0.01394240
## 4 7 0.005260572 0.01317560
## 5 9 0.006152549 0.01275864
## 6 11 0.006808594 0.01250487
# Initialize a data frame to store MSE for each k
mse_df <- data.frame(k = integer(), MSE_train = numeric(), MSE_test = numeric())
for (k in c(1, 3, 5, 7, 9, 11)) {
# Fit the k-NN model using training data
knn_model_train <- kknn::kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, train = train_data, test = train_data, k = k)
# Calculate the training MSE
mse_train <- mean((knn_model_train$fitted.values - train_data$energy)^2)
# Test the k-NN with testing data
knn_model_test <- kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, train = train_data, test = test_data, k = k)
mse_test <- mean((knn_model_test$fitted.values - test_data$energy)^2)
# save the results
mse_df <- rbind(mse_df, data.frame(k = k, MSE_train = mse_train, MSE_test = mse_test))
}
# Show the MSE data frame
print(mse_df)
## k MSE_train MSE_test
## 1 1 0.000000000 0.02013862
## 2 3 0.001969737 0.01559235
## 3 5 0.003972928 0.01394240
## 4 7 0.005260572 0.01317560
## 5 9 0.006152549 0.01275864
## 6 11 0.006808594 0.01250487
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(spotify_lm)
spotify_lm_int <- lm(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity + acousticness*liveness , data = train_data)
summary(spotify_lm_int)
##
## Call:
## lm(formula = energy ~ playlist_genre + loudness + danceability +
## acousticness + liveness + instrumentalness + duration_ms +
## track_popularity + acousticness * liveness, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50200 -0.07033 0.00358 0.07136 0.56794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.018e+00 5.888e-03 172.929 < 2e-16 ***
## playlist_genrelatin -8.388e-03 2.668e-03 -3.144 0.00167 **
## playlist_genrepop -2.784e-02 2.553e-03 -10.904 < 2e-16 ***
## playlist_genrer&b -6.061e-02 2.722e-03 -22.264 < 2e-16 ***
## playlist_genrerap -4.263e-02 2.606e-03 -16.359 < 2e-16 ***
## playlist_genrerock 2.988e-02 2.776e-03 10.760 < 2e-16 ***
## loudness 3.909e-02 3.073e-04 127.240 < 2e-16 ***
## danceability -3.236e-02 5.715e-03 -5.662 1.51e-08 ***
## acousticness -2.958e-01 7.152e-03 -41.355 < 2e-16 ***
## liveness 8.082e-02 7.649e-03 10.566 < 2e-16 ***
## instrumentalness 4.188e+00 1.911e-01 21.913 < 2e-16 ***
## duration_ms 1.108e-07 1.432e-08 7.741 1.02e-14 ***
## track_popularity -6.510e-04 3.017e-05 -21.579 < 2e-16 ***
## acousticness:liveness 2.290e-01 3.407e-02 6.721 1.85e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1102 on 22969 degrees of freedom
## Multiple R-squared: 0.6212, Adjusted R-squared: 0.621
## F-statistic: 2897 on 13 and 22969 DF, p-value: < 2.2e-16
k <- 5
knn_fit_std <- kknn(energy~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, spotify_songs, spotify_songs, k=k)
knn_std_mse <- mean((knn_fit_std$fitted.values - spotify_songs$energy)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse, 2)))
## [1] "Mean Squared Error for stdKNN: 0"
knn_fit <- kknn(energy ~ playlist_genre + loudness + danceability + acousticness + liveness + instrumentalness + duration_ms + track_popularity, spotify_songs, spotify_songs, k=k, scale = FALSE)
knn_mse <- mean((knn_fit$fitted.values - spotify_songs$energy)^2)
print(paste("Mean Squared Error for KNN:", round(knn_mse, 2)))
## [1] "Mean Squared Error for KNN: 0.01"
## k MSE_train MSE_test
## 1 1 0.000000000 0.02013862
## 2 3 0.001969737 0.01559235
## 3 5 0.003972928 0.01394240
## 4 7 0.005260572 0.01317560
## 5 9 0.006152549 0.01275864
## 6 11 0.006808594 0.01250487
## [1] "Training MSE for Linear Model: 0.01"
## [1] "Testing MSE for Linear Model: 0.01"