# Install and load required library packages
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(kknn)
library(corrplot)
## corrplot 0.92 loaded
library(readr)
Data Exploration spotify dataset

Load the data:

spotify <- read_csv("spotify.csv")
## Rows: 32833 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): track_id, track_name, track_artist, track_album_id, track_album_na...
## dbl (13): track_popularity, danceability, energy, key, loudness, mode, speec...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(spotify)
## spc_tbl_ [32,833 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ track_id                : chr [1:32833] "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
##  $ track_name              : chr [1:32833] "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
##  $ track_artist            : chr [1:32833] "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
##  $ track_popularity        : num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
##  $ track_album_id          : chr [1:32833] "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
##  $ track_album_name        : chr [1:32833] "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
##  $ track_album_release_date: chr [1:32833] "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
##  $ playlist_name           : chr [1:32833] "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
##  $ playlist_id             : chr [1:32833] "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
##  $ playlist_genre          : chr [1:32833] "pop" "pop" "pop" "pop" ...
##  $ playlist_subgenre       : chr [1:32833] "dance pop" "dance pop" "dance pop" "dance pop" ...
##  $ danceability            : num [1:32833] 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
##  $ energy                  : num [1:32833] 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
##  $ key                     : num [1:32833] 6 11 1 7 1 8 5 4 8 2 ...
##  $ loudness                : num [1:32833] -2.63 -4.97 -3.43 -3.78 -4.67 ...
##  $ mode                    : num [1:32833] 1 1 0 1 1 1 0 0 1 1 ...
##  $ speechiness             : num [1:32833] 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
##  $ acousticness            : num [1:32833] 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
##  $ instrumentalness        : num [1:32833] 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
##  $ liveness                : num [1:32833] 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
##  $ valence                 : num [1:32833] 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
##  $ tempo                   : num [1:32833] 122 100 124 122 124 ...
##  $ duration_ms             : num [1:32833] 194754 162600 176616 169093 189052 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   track_id = col_character(),
##   ..   track_name = col_character(),
##   ..   track_artist = col_character(),
##   ..   track_popularity = col_double(),
##   ..   track_album_id = col_character(),
##   ..   track_album_name = col_character(),
##   ..   track_album_release_date = col_character(),
##   ..   playlist_name = col_character(),
##   ..   playlist_id = col_character(),
##   ..   playlist_genre = col_character(),
##   ..   playlist_subgenre = col_character(),
##   ..   danceability = col_double(),
##   ..   energy = col_double(),
##   ..   key = col_double(),
##   ..   loudness = col_double(),
##   ..   mode = col_double(),
##   ..   speechiness = col_double(),
##   ..   acousticness = col_double(),
##   ..   instrumentalness = col_double(),
##   ..   liveness = col_double(),
##   ..   valence = col_double(),
##   ..   tempo = col_double(),
##   ..   duration_ms = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
Examine data
#first 5 rows
head(spotify)
## # A tibble: 6 × 23
##   track_id               track_name track_artist track_popularity track_album_id
##   <chr>                  <chr>      <chr>                   <dbl> <chr>         
## 1 6f807x0ima9a1j3VPbc7VN I Don't C… Ed Sheeran                 66 2oCs0DGTsRO98…
## 2 0r7CVbZTWZgbTCYdfa2P31 Memories … Maroon 5                   67 63rPSO264uRjW…
## 3 1z1Hg7Vb0AhHDiEmnDE79l All the T… Zara Larsson               70 1HoSmj2eLcsrR…
## 4 75FpbthrwQmzHlBJLuGdC7 Call You … The Chainsm…               60 1nqYsOef1yKKu…
## 5 1e8PAfcKUYoKkxPhrHqw4x Someone Y… Lewis Capal…               69 7m7vv9wlQ4i0L…
## 6 7fvUMiyapMsRRxr07cU8Ef Beautiful… Ed Sheeran                 67 2yiy9cd2QktrN…
## # ℹ 18 more variables: track_album_name <chr>, track_album_release_date <chr>,
## #   playlist_name <chr>, playlist_id <chr>, playlist_genre <chr>,
## #   playlist_subgenre <chr>, danceability <dbl>, energy <dbl>, key <dbl>,
## #   loudness <dbl>, mode <dbl>, speechiness <dbl>, acousticness <dbl>,
## #   instrumentalness <dbl>, liveness <dbl>, valence <dbl>, tempo <dbl>,
## #   duration_ms <dbl>
#structure
str(spotify)
## spc_tbl_ [32,833 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ track_id                : chr [1:32833] "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
##  $ track_name              : chr [1:32833] "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
##  $ track_artist            : chr [1:32833] "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
##  $ track_popularity        : num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
##  $ track_album_id          : chr [1:32833] "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
##  $ track_album_name        : chr [1:32833] "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
##  $ track_album_release_date: chr [1:32833] "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
##  $ playlist_name           : chr [1:32833] "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
##  $ playlist_id             : chr [1:32833] "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
##  $ playlist_genre          : chr [1:32833] "pop" "pop" "pop" "pop" ...
##  $ playlist_subgenre       : chr [1:32833] "dance pop" "dance pop" "dance pop" "dance pop" ...
##  $ danceability            : num [1:32833] 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
##  $ energy                  : num [1:32833] 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
##  $ key                     : num [1:32833] 6 11 1 7 1 8 5 4 8 2 ...
##  $ loudness                : num [1:32833] -2.63 -4.97 -3.43 -3.78 -4.67 ...
##  $ mode                    : num [1:32833] 1 1 0 1 1 1 0 0 1 1 ...
##  $ speechiness             : num [1:32833] 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
##  $ acousticness            : num [1:32833] 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
##  $ instrumentalness        : num [1:32833] 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
##  $ liveness                : num [1:32833] 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
##  $ valence                 : num [1:32833] 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
##  $ tempo                   : num [1:32833] 122 100 124 122 124 ...
##  $ duration_ms             : num [1:32833] 194754 162600 176616 169093 189052 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   track_id = col_character(),
##   ..   track_name = col_character(),
##   ..   track_artist = col_character(),
##   ..   track_popularity = col_double(),
##   ..   track_album_id = col_character(),
##   ..   track_album_name = col_character(),
##   ..   track_album_release_date = col_character(),
##   ..   playlist_name = col_character(),
##   ..   playlist_id = col_character(),
##   ..   playlist_genre = col_character(),
##   ..   playlist_subgenre = col_character(),
##   ..   danceability = col_double(),
##   ..   energy = col_double(),
##   ..   key = col_double(),
##   ..   loudness = col_double(),
##   ..   mode = col_double(),
##   ..   speechiness = col_double(),
##   ..   acousticness = col_double(),
##   ..   instrumentalness = col_double(),
##   ..   liveness = col_double(),
##   ..   valence = col_double(),
##   ..   tempo = col_double(),
##   ..   duration_ms = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Everything but track_album_release_date is good. We need to change track_album_release_date to date variable. There are 32833 observations and 23 variables

Modifyig data types
spotify$track_album_release_date<- as.Date(spotify$track_album_release_date)

spotify$playlist_genre<-as.factor(spotify$playlist_genre)
Summary statistics and looking at some tables
summary(spotify)
##    track_id          track_name        track_artist       track_popularity
##  Length:32833       Length:32833       Length:32833       Min.   :  0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 24.00  
##  Mode  :character   Mode  :character   Mode  :character   Median : 45.00  
##                                                           Mean   : 42.48  
##                                                           3rd Qu.: 62.00  
##                                                           Max.   :100.00  
##                                                                           
##  track_album_id     track_album_name   track_album_release_date
##  Length:32833       Length:32833       Min.   :1957-01-01      
##  Class :character   Class :character   1st Qu.:2010-12-04      
##  Mode  :character   Mode  :character   Median :2017-01-27      
##                                        Mean   :2012-09-09      
##                                        3rd Qu.:2019-05-16      
##                                        Max.   :2020-01-29      
##                                        NA's   :1886            
##  playlist_name      playlist_id        playlist_genre playlist_subgenre 
##  Length:32833       Length:32833       edm  :6043     Length:32833      
##  Class :character   Class :character   latin:5155     Class :character  
##  Mode  :character   Mode  :character   pop  :5507     Mode  :character  
##                                        r&b  :5431                       
##                                        rap  :5746                       
##                                        rock :4951                       
##                                                                         
##   danceability        energy              key            loudness      
##  Min.   :0.0000   Min.   :0.000175   Min.   : 0.000   Min.   :-46.448  
##  1st Qu.:0.5630   1st Qu.:0.581000   1st Qu.: 2.000   1st Qu.: -8.171  
##  Median :0.6720   Median :0.721000   Median : 6.000   Median : -6.166  
##  Mean   :0.6548   Mean   :0.698619   Mean   : 5.374   Mean   : -6.720  
##  3rd Qu.:0.7610   3rd Qu.:0.840000   3rd Qu.: 9.000   3rd Qu.: -4.645  
##  Max.   :0.9830   Max.   :1.000000   Max.   :11.000   Max.   :  1.275  
##                                                                        
##       mode         speechiness      acousticness    instrumentalness   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000000  
##  1st Qu.:0.0000   1st Qu.:0.0410   1st Qu.:0.0151   1st Qu.:0.0000000  
##  Median :1.0000   Median :0.0625   Median :0.0804   Median :0.0000161  
##  Mean   :0.5657   Mean   :0.1071   Mean   :0.1753   Mean   :0.0847472  
##  3rd Qu.:1.0000   3rd Qu.:0.1320   3rd Qu.:0.2550   3rd Qu.:0.0048300  
##  Max.   :1.0000   Max.   :0.9180   Max.   :0.9940   Max.   :0.9940000  
##                                                                        
##     liveness         valence           tempo         duration_ms    
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  4000  
##  1st Qu.:0.0927   1st Qu.:0.3310   1st Qu.: 99.96   1st Qu.:187819  
##  Median :0.1270   Median :0.5120   Median :121.98   Median :216000  
##  Mean   :0.1902   Mean   :0.5106   Mean   :120.88   Mean   :225800  
##  3rd Qu.:0.2480   3rd Qu.:0.6930   3rd Qu.:133.92   3rd Qu.:253585  
##  Max.   :0.9960   Max.   :0.9910   Max.   :239.44   Max.   :517810  
## 
table(spotify$track_popularity)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2703  575  387  321  240  240  192  189  201  195  174  172  161  207  201  190 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  219  206  242  205  209  228  207  228  243  242  272  271  266  277  345  323 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##  351  377  388  433  431  435  483  459  486  442  428  464  472  505  430  496 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##  465  497  498  514  506  472  514  492  497  541  503  467  514  492  470  483 
##   64   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79 
##  424  462  441  468  425  443  410  408  339  357  353  306  334  326  224  265 
##   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95 
##  172  167  126  183  122  120   91   89  104   27   59   58   27   44   37   15 
##   96   97   98   99  100 
##    7   22   36    4    2
table(spotify$playlist_genre)
## 
##   edm latin   pop   r&b   rap  rock 
##  6043  5155  5507  5431  5746  4951
table(spotify$playlist_subgenre)
## 
##                album rock                  big room              classic rock 
##                      1065                      1206                      1296 
##                 dance pop             electro house                electropop 
##                      1298                      1511                      1408 
##              gangster rap                 hard rock                   hip hop 
##                      1458                      1485                      1322 
##                   hip pop           indie poptimism             latin hip hop 
##                      1256                      1672                      1656 
##                 latin pop                  neo soul            new jack swing 
##                      1262                      1637                      1133 
##            permanent wave                   pop edm             post-teen pop 
##                      1105                      1517                      1129 
## progressive electro house                 reggaeton          southern hip hop 
##                      1809                       949                      1675 
##                      trap                  tropical        urban contemporary 
##                      1291                      1288                      1405
table(spotify$key)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11 
## 3454 4010 2827  913 2201 2680 2670 3352 2430 3027 2273 2996
table(spotify$mode)
## 
##     0     1 
## 14259 18574

From the data we can see that the first variables such as name, artist, album_id, release date are character variables. Then popuarity is a numeric variable which ranges from 0 to 100. Playlist genre and subrenge is categorical. Variables from danceability to valnece ususally range between 0 and 1. tempo and duration_ms have big max values like 239.44 and 517810.Loudness has a min value of -46.448. So we might see if there are any outliers in further exploration. We can see in some variables that the mean id not very close to the median, which would indicate skweness.

data types and class
str(spotify$track_popularity)
##  num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
class(spotify$track_popularity)
## [1] "numeric"
Duplicate data
dups_id <- sum(duplicated(spotify$track_id))
print(dups_id)
## [1] 4477

We can see that a lot of songs have been duplicated in this dataset. They have the same track_id. Therefoer we will remove them, for further analysis.

spotify_dups = spotify[duplicated(spotify$track_id),]
spotify = spotify[!duplicated(spotify$track_id),]

We removed the duplicate songs to another dataset called spotify_dups and removed duplicates from current dataset.

Visualization histograms

Visual exploration to understand the data set.

ggplot(spotify, aes(x= track_popularity)) +
  geom_histogram(binwidth=5, color="darkblue", fill="lightblue") +
  ggtitle("Popularity Distribution") +
  xlab("Popularity") +
  ylab("Frequency")

hist(spotify$danceability, col = 'blue', border = "black", xlab = 'danceability', ylab = 'Frequency', main = 'danceability Distribution')

hist(spotify$energy, col = 'blue', border = "black", xlab = 'energy', ylab = 'Frequency', main = 'energy Distribution')

hist(spotify$key, col = 'blue', border = "black", xlab = 'key', ylab = 'Frequency', main = 'key Distribution')

hist(spotify$loudness, col = 'blue', border = "black", xlab = 'loudness', ylab = 'Frequency', main = 'loudness distribution')

hist(spotify$mode, col = 'blue', border = "black", xlab = 'mode', ylab = 'Frequency', main = 'mode distribution')

hist(spotify$valence, col = 'blue', border = "black", xlab = 'valence', ylab = 'Frequency', main = 'valence Distribution')

hist(spotify$speechiness, col = 'blue', border = "black", xlab = 'speechiness', ylab = 'Frequency', main = 'speechiness Distribution')

hist(spotify$acousticness, col = 'blue', border = "black", xlab = 'acousticness', ylab = 'Frequency', main = 'acousticness Distribution')

hist(spotify$liveness, col = 'blue', border = "black", xlab = 'liveness', ylab = 'Frequency', main = 'liveness Distribution')

hist(spotify$instrumentalness, col = 'blue', border = "black", xlab = 'instrumentalness', ylab = 'Frequency', main = 'instrumentalness Distribution')

hist(spotify$tempo, col = 'blue', border = "black", xlab = 'tempo', ylab = 'Frequency', main = 'tempo Distribution')

hist(spotify$duration_ms, col = 'blue', border = "black", xlab = 'duration_ms', ylab = 'Frequency', main = 'duration Distribution')

plot(spotify$playlist_genre, xlab = 'Genre' , ylab = "Frequencies")

After plotting the histograms we can observe the following distribution:

Duration, tempo and Valence are normally distributed Danceability, Enery and Loudness is left-skewed Acousticness, Speechiness and Liveness is right-skewed By genre, most of the songs are edm.

Visualitazion scatterplots based on genre
ggplot(spotify, aes(x=tempo, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) + ggtitle("tempo and popularity") 

ggplot(spotify, aes(x=danceability, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("danceability and popularity") 

ggplot(spotify, aes(x=energy, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("energy and popularity") 

ggplot(spotify, aes(x=loudness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("loudness and popularity")  

ggplot(spotify, aes(x=speechiness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("speechiness and popularity") 

ggplot(spotify, aes(x=acousticness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("acousticness and popularity") 

ggplot(spotify, aes(x=instrumentalness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("instrumentalness and popularity") 

ggplot(spotify, aes(x=liveness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("liveness and popularity") 

ggplot(spotify, aes(x=valence, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("valence and popularity") 

ggplot(spotify, aes(x=tempo, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("tempo and popularity") 

ggplot(spotify, aes(x=duration_ms, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
  ggtitle("duration and popularity") 

Boxplots
boxplot(track_popularity ~ playlist_genre , data = spotify,
  main = "Popular genre",
  ylab = "Popularity",
  xlab = "Genre",
  col = "yellow")

boxplot( track_popularity ~ mode, data = spotify,
  main = "Popular Mode",
  ylab = "Popularity",
  xlab = "mode",
  col = "yellow")

boxplot(spotify$danceability,
  main = "Boxplot distribution of Danceability",
  col = "yellow")

boxplot(spotify$energy,
  main = "Boxplot distribution of energy",
  col = "yellow")

boxplot(spotify$key,
  main = "Boxplot distribution of key",
  col = "yellow")

boxplot(spotify$loudness,
  main = "Boxplot distribution of loudness",
  col = "yellow")

boxplot(spotify$mode,
  main = "Boxplot distribution of mode",
  col = "yellow")

boxplot(spotify$speechiness,
  main = "Boxplot distribution of speechiness",
  col = "yellow")

boxplot(spotify$acousticness,
  main = "Boxplot distribution of acousticness",
  col = "yellow")

boxplot(spotify$instrumentalness,
  main = "Boxplot distribution of instrumentalness",
  col = "yellow")

boxplot(spotify$liveness,
  main = "Boxplot distribution of liveness",
  col = "yellow")

boxplot(spotify$valence,
  main = "Boxplot distribution of valence",
  col = "yellow")

boxplot(spotify$tempo,
  main = "Boxplot distribution of tempo",
  col = "yellow")

boxplot(spotify$duration_ms,
  main = "Boxplot distribution of duration",
  col = "yellow")

From the boxplots we can observe that a lot of variables (danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, duration) have outliers. Removing them would influence the analysis a lot.

Correlation Matrix visualization with popularity

1.What variables are most strongly correlated with `track popularity?

Since a lot of the variables are character, we cannot to a correlation in all of them. However, we will select the numeric variables to see which ones correlate the most.

matrix <- spotify %>%  select(track_popularity, danceability, energy, key, loudness, mode, speechiness,acousticness, instrumentalness, liveness, valence, tempo, duration_ms)

cor_matrix <- cor(matrix)
print(cor_matrix)
##                  track_popularity danceability      energy           key
## track_popularity      1.000000000  0.046596507 -0.10362202 -0.0081442988
## danceability          0.046596507  1.000000000 -0.08143602  0.0070203133
## energy               -0.103622023 -0.081436016  1.00000000  0.0128766164
## key                  -0.008144299  0.007020313  0.01287662  1.0000000000
## loudness              0.037285154  0.015297214  0.68213774 -0.0005315332
## mode                  0.016275225 -0.055211662 -0.00431074 -0.1760953351
## speechiness           0.005205509  0.183453077 -0.02900840  0.0231150971
## acousticness          0.091725358 -0.028878237 -0.54588619  0.0041975410
## instrumentalness     -0.124431070 -0.002266695  0.02381792  0.0073927120
## liveness             -0.052773029 -0.127002362  0.16370922  0.0021365502
## valence               0.022581329  0.333728627  0.14971046  0.0216180479
## tempo                 0.004446180 -0.184575167  0.15154451 -0.0103554421
## duration_ms          -0.139681783 -0.087560915  0.01758400  0.0188095544
##                       loudness          mode  speechiness acousticness
## track_popularity  0.0372851540  0.0162752246  0.005205509  0.091725358
## danceability      0.0152972137 -0.0552116618  0.183453077 -0.028878237
## energy            0.6821377425 -0.0043107400 -0.029008396 -0.545886191
## key              -0.0005315332 -0.1760953351  0.023115097  0.004197541
## loudness          1.0000000000 -0.0176877795  0.012981015 -0.371602357
## mode             -0.0176877795  1.0000000000 -0.059650521  0.006760044
## speechiness       0.0129810147 -0.0596505208  1.000000000  0.024945425
## acousticness     -0.3716023568  0.0067600443  0.024945425  1.000000000
## instrumentalness -0.1543090413 -0.0057660749 -0.107959332 -0.003100502
## liveness          0.0819270381 -0.0002748805  0.059342894 -0.074539622
## valence           0.0495101626 -0.0030881898  0.064700922 -0.018998564
## tempo             0.0967061101  0.0167097090  0.032728605 -0.114335210
## duration_ms      -0.1045846289  0.0128129792 -0.098435680 -0.094143218
##                  instrumentalness      liveness     valence        tempo
## track_popularity     -0.124431070 -0.0527730291  0.02258133  0.004446180
## danceability         -0.002266695 -0.1270023619  0.33372863 -0.184575167
## energy                0.023817924  0.1637092244  0.14971046  0.151544510
## key                   0.007392712  0.0021365502  0.02161805 -0.010355442
## loudness             -0.154309041  0.0819270381  0.04951016  0.096706110
## mode                 -0.005766075 -0.0002748805 -0.00308819  0.016709709
## speechiness          -0.107959332  0.0593428937  0.06470092  0.032728605
## acousticness         -0.003100502 -0.0745396224 -0.01899856 -0.114335210
## instrumentalness      1.000000000 -0.0085048515 -0.17416318  0.021484050
## liveness             -0.008504851  1.0000000000 -0.01992507  0.022026642
## valence              -0.174163182 -0.0199250679  1.00000000 -0.025135288
## tempo                 0.021484050  0.0220266420 -0.02513529  1.000000000
## duration_ms           0.059077441  0.0075705280 -0.03332403 -0.001695701
##                   duration_ms
## track_popularity -0.139681783
## danceability     -0.087560915
## energy            0.017584005
## key               0.018809554
## loudness         -0.104584629
## mode              0.012812979
## speechiness      -0.098435680
## acousticness     -0.094143218
## instrumentalness  0.059077441
## liveness          0.007570528
## valence          -0.033324030
## tempo            -0.001695701
## duration_ms       1.000000000
corrplot(cor(cor_matrix), method="color", type="upper", order="hclust")

##### Answer: The strongest positive correlations to track_popularity are acousticness with 0.092 and loudness with 0.037. However they have almost no correlation.

The strongest negative correlation to track_popularity are duration_ms with -0.140 and instrumentalness with -0.124. None have them have a relative correlation with track_popularity. These are all results before truncation of data that happens in question.

if we look at other variables we can see that there exist a high positive correlation between energy & loudness with 0.682. There is a negative correlation exists between acousticness & energy with -0.545.

Data Preprocessing

Looking for missing or inconsistent values
colSums(is.na(spotify))
##                 track_id               track_name             track_artist 
##                        0                        4                        4 
##         track_popularity           track_album_id         track_album_name 
##                        0                        0                        4 
## track_album_release_date            playlist_name              playlist_id 
##                     1681                        0                        0 
##           playlist_genre        playlist_subgenre             danceability 
##                        0                        0                        0 
##                   energy                      key                 loudness 
##                        0                        0                        0 
##                     mode              speechiness             acousticness 
##                        0                        0                        0 
##         instrumentalness                 liveness                  valence 
##                        0                        0                        0 
##                    tempo              duration_ms 
##                        0                        0

We can observe that there is no missing data in the numeric variables. There is only missing values in character variables. We can keep them because they will not impact our analysis.

Outliers, Data Truncation/Winsorization

As we saw before from the boxplots we can observe that a lot of variables (danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, duration) have outliers. Removing them would influence the analysis a lot.

Lets create a new dataset with all winsorized and truncated variables to reduce outliers and apply regression models (to see the differences)

spotify_copy <- spotify
boxplot(spotify_copy$danceability,
  main = "Boxplot distribution of Danceability",
  col = "yellow")

summary(spotify_copy$danceability)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5610  0.6700  0.6534  0.7600  0.9830
Truncation of danceability
spotify_copy$danceability[spotify_copy$danceability <= 0.28] <- 0.28

summary(spotify_copy$danceability)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.280   0.561   0.670   0.654   0.760   0.983
boxplot(spotify_copy$danceability,
  main = "Boxplot distribution of Danceability",
  col = "yellow")

summary(spotify_copy$energy)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000175 0.579000 0.722000 0.698388 0.843000 1.000000
boxplot(spotify_copy$energy,
  main = "Boxplot distribution of Energy")

spotify_copy$energy[spotify_copy$energy <= 0.2] <- 0.2
boxplot(spotify_copy$energy,
  main = "Boxplot distribution of energy")

summary(spotify_copy$loudness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -46.448  -8.309  -6.261  -6.818  -4.709   1.275
boxplot(spotify_copy$loudness,
  main = "Boxplot distribution of loudness")

##### Winsorization

# Calculate the 5th and 95th percentiles for 'loudness'
lower_bound_loudness <- quantile(spotify_copy$loudness, 0.05, na.rm = TRUE)
upper_bound_loudness <- quantile(spotify_copy$loudness, 0.95, na.rm = TRUE)

# Winsorize the data
spotify_copy$loudness[spotify_copy$loudness < lower_bound_loudness] <- lower_bound_loudness
spotify_copy$loudness[spotify_copy$loudness > upper_bound_loudness] <- upper_bound_loudness


summary(spotify_copy$loudness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -12.592  -8.309  -6.261  -6.734  -4.709  -2.999
boxplot(spotify_copy$loudness,
  main = "Boxplot distribution of loudness")

spotify_copy$speechiness[spotify_copy$speechiness >= 0.27] <- 0.27


summary(spotify_copy$speechiness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04100 0.06260 0.09934 0.13300 0.27000
boxplot(spotify_copy$speechiness,
  main = "Boxplot distribution of speechiness")

boxplot(spotify_copy$acousticness,
  main = "Boxplot distribution of speechiness")

spotify_copy$acousticness[spotify_copy$acousticness >= 0.6] <- 0.6

summary(spotify_copy$acousticness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.01438 0.07970 0.16519 0.26000 0.60000
boxplot(spotify_copy$acousticness,
  main = "Boxplot distribution of acousticness")

Trucation of instrumentalness
spotify_copy$instrumentalness[spotify_copy$instrumentalness >= 0.015] <- 0.015
boxplot(spotify_copy$instrumentalness,
  main = "Boxplot distribution of instrumentalness")

##### Trucation of instrumentalness

spotify_copy$instrumentalness[spotify_copy$instrumentalness >= 0.015] <- 0.015
boxplot(spotify_copy$instrumentalness,
  main = "Boxplot distribution of instrumentalness")

boxplot(spotify_copy$liveness,
  main = "Boxplot distribution of liveness")

Trucation of liveness
spotify_copy$liveness[spotify_copy$liveness >= 0.4] <- 0.4
boxplot(spotify_copy$liveness,
  main = "Boxplot distribution of liveness")

boxplot(spotify_copy$tempo,
  main = "Boxplot distribution of tempo")

# Calculate the 5th and 95th percentiles for 'tempo'
lower_bound_tempo <- quantile(spotify_copy$tempo, 0.05, na.rm = TRUE)
upper_bound_tempo <- quantile(spotify_copy$tempo, 0.95, na.rm = TRUE)

# Winsorize the data
spotify_copy$tempo[spotify_copy$tempo < lower_bound_tempo] <- lower_bound_tempo
spotify_copy$tempo[spotify_copy$tempo > upper_bound_tempo] <- upper_bound_tempo
boxplot(spotify_copy$tempo,
  main = "Boxplot distribution of tempo")

boxplot(spotify_copy$duration_ms,
  main = "Boxplot distribution of duration_ms")

# Calculate the 5th and 95th percentiles for 'duration'
lower_bound_duration_ms <- quantile(spotify_copy$duration_ms, 0.05, na.rm = TRUE)
upper_bound_duration_ms <- quantile(spotify_copy$duration_ms, 0.95, na.rm = TRUE)

# Winsorize the data
spotify_copy$duration_ms[spotify_copy$duration_ms < lower_bound_duration_ms] <- lower_bound_duration_ms
spotify_copy$duration_ms[spotify_copy$duration_ms > upper_bound_duration_ms] <- upper_bound_duration_ms
boxplot(spotify_copy$duration_ms,
  main = "Boxplot distribution of duration_ms")

Now , all data are truncated or winsorized. Lets start by splittig the data in order to do lm models and knn.

Data Spliting 1
# Set the seed for reproducibility
set.seed(2023)

# Randomly sample row indices for the training set split in 70% and 30%
train_indices <- sample(1:NROW(spotify),NROW(spotify)*0.70)

# Create the training set
train_data <- spotify[train_indices, ] #everything before comma is row selector and after comma is column selesctor

# Create the testing set
test_data <- spotify[-train_indices, ]
Linear Regression Model on spotify Data - Training set
# Train the linear regression model, compring popularity to rest of numeric parameters
lm_model_train <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = train_data)
summary(lm_model_train)
## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + key + 
##     loudness + mode + speechiness + acousticness + instrumentalness + 
##     liveness + valence + tempo + duration_ms, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.757 -17.360   2.935  18.119  60.604 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.684e+01  2.045e+00  32.682  < 2e-16 ***
## danceability      4.170e+00  1.288e+00   3.237  0.00121 ** 
## energy           -2.318e+01  1.461e+00 -15.864  < 2e-16 ***
## key               1.398e-02  4.595e-02   0.304  0.76088    
## loudness          1.132e+00  7.802e-02  14.512  < 2e-16 ***
## mode              7.590e-01  3.357e-01   2.261  0.02378 *  
## speechiness      -7.397e+00  1.657e+00  -4.464 8.09e-06 ***
## acousticness      4.934e+00  8.895e-01   5.547 2.95e-08 ***
## instrumentalness -9.426e+00  7.437e-01 -12.676  < 2e-16 ***
## liveness         -4.420e+00  1.077e+00  -4.104 4.08e-05 ***
## valence           1.997e+00  7.861e-01   2.540  0.01110 *  
## tempo             2.808e-02  6.261e-03   4.485 7.33e-06 ***
## duration_ms      -4.277e-05  2.726e-06 -15.689  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.03 on 19836 degrees of freedom
## Multiple R-squared:  0.06003,    Adjusted R-squared:  0.05946 
## F-statistic: 105.6 on 12 and 19836 DF,  p-value: < 2.2e-16

The model suggests that all the variables are significant except key, since it is the only variable with a p-value higher than 0.5. This model has an R^2 of 0.06003 which means that only 6% of the of the data’s variability can be explained by the regression model, which indicated that this is not a good model.

# Train the linear regression model, comparing popularity to rest of numeric parameters with test data
lm_model_test <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = test_data)
summary(lm_model_test)
## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + key + 
##     loudness + mode + speechiness + acousticness + instrumentalness + 
##     liveness + valence + tempo + duration_ms, data = test_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53.83 -16.86   2.96  18.20  58.35 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.004e+01  3.085e+00  22.703  < 2e-16 ***
## danceability      2.674e+00  1.937e+00   1.381   0.1674    
## energy           -2.328e+01  2.217e+00 -10.504  < 2e-16 ***
## key              -1.690e-02  7.029e-02  -0.240   0.8100    
## loudness          1.212e+00  1.193e-01  10.158  < 2e-16 ***
## mode              1.122e+00  5.136e-01   2.185   0.0289 *  
## speechiness      -3.935e+00  2.494e+00  -1.578   0.1146    
## acousticness      2.943e+00  1.376e+00   2.140   0.0324 *  
## instrumentalness -8.982e+00  1.158e+00  -7.754 9.95e-15 ***
## liveness         -3.929e+00  1.634e+00  -2.404   0.0162 *  
## valence           1.299e+00  1.195e+00   1.087   0.2769    
## tempo             2.195e-02  9.581e-03   2.291   0.0220 *  
## duration_ms      -4.477e-05  4.256e-06 -10.520  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.97 on 8494 degrees of freedom
## Multiple R-squared:  0.05409,    Adjusted R-squared:  0.05275 
## F-statistic: 40.47 on 12 and 8494 DF,  p-value: < 2.2e-16

Doing the lm mode on the test data, even decreases the R^2 to 5.4%.

# Train the linear regression model, comparing popularity to other character parameters
lm_model_ch <- lm(track_popularity ~ playlist_genre + playlist_subgenre, data = train_data)

summary(lm_model_ch)
## 
## Call:
## lm(formula = track_popularity ~ playlist_genre + playlist_subgenre, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.418 -16.339   2.905  16.931  60.098 
## 
## Coefficients: (5 not defined because of singularities)
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                 23.9894     0.6853  35.005  < 2e-16
## playlist_genrelatin                         18.0795     1.0338  17.489  < 2e-16
## playlist_genrepop                           31.4286     1.0654  29.501  < 2e-16
## playlist_genrer&b                           22.2155     1.0313  21.541  < 2e-16
## playlist_genrerap                           24.4461     1.0227  23.902  < 2e-16
## playlist_genrerock                          13.5668     1.0741  12.631  < 2e-16
## playlist_subgenrebig room                    4.4078     1.0705   4.117 3.85e-05
## playlist_subgenreclassic rock               -0.2592     1.1543  -0.225 0.822309
## playlist_subgenredance pop                  -3.0791     1.1009  -2.797 0.005164
## playlist_subgenreelectro house               9.7948     0.9814   9.981  < 2e-16
## playlist_subgenreelectropop                -16.3501     1.0989 -14.878  < 2e-16
## playlist_subgenregangster rap              -15.9361     1.0535 -15.127  < 2e-16
## playlist_subgenrehard rock                  -4.2979     1.1230  -3.827 0.000130
## playlist_subgenrehip hop                     4.6597     1.0591   4.400 1.09e-05
## playlist_subgenrehip pop                    -1.8091     1.2060  -1.500 0.133599
## playlist_subgenreindie poptimism           -14.2180     1.0637 -13.367  < 2e-16
## playlist_subgenrelatin hip hop              -9.5706     1.0786  -8.873  < 2e-16
## playlist_subgenrelatin pop                   4.3781     1.1083   3.950 7.83e-05
## playlist_subgenreneo soul                  -16.3026     1.0344 -15.760  < 2e-16
## playlist_subgenrenew jack swing            -19.9120     1.1214 -17.757  < 2e-16
## playlist_subgenrepermanent wave             14.2161     1.1811  12.036  < 2e-16
## playlist_subgenrepop edm                    12.5587     1.0861  11.563  < 2e-16
## playlist_subgenrepost-teen pop                   NA         NA      NA       NA
## playlist_subgenreprogressive electro house       NA         NA      NA       NA
## playlist_subgenrereggaeton                   4.2245     1.2535   3.370 0.000752
## playlist_subgenresouthern hip hop          -13.6192     1.0064 -13.533  < 2e-16
## playlist_subgenretrap                            NA         NA      NA       NA
## playlist_subgenretropical                        NA         NA      NA       NA
## playlist_subgenreurban contemporary              NA         NA      NA       NA
##                                               
## (Intercept)                                ***
## playlist_genrelatin                        ***
## playlist_genrepop                          ***
## playlist_genrer&b                          ***
## playlist_genrerap                          ***
## playlist_genrerock                         ***
## playlist_subgenrebig room                  ***
## playlist_subgenreclassic rock                 
## playlist_subgenredance pop                 ** 
## playlist_subgenreelectro house             ***
## playlist_subgenreelectropop                ***
## playlist_subgenregangster rap              ***
## playlist_subgenrehard rock                 ***
## playlist_subgenrehip hop                   ***
## playlist_subgenrehip pop                      
## playlist_subgenreindie poptimism           ***
## playlist_subgenrelatin hip hop             ***
## playlist_subgenrelatin pop                 ***
## playlist_subgenreneo soul                  ***
## playlist_subgenrenew jack swing            ***
## playlist_subgenrepermanent wave            ***
## playlist_subgenrepop edm                   ***
## playlist_subgenrepost-teen pop                
## playlist_subgenreprogressive electro house    
## playlist_subgenrereggaeton                 ***
## playlist_subgenresouthern hip hop          ***
## playlist_subgenretrap                         
## playlist_subgenretropical                     
## playlist_subgenreurban contemporary           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.07 on 19825 degrees of freedom
## Multiple R-squared:  0.1372, Adjusted R-squared:  0.1362 
## F-statistic: 137.1 on 23 and 19825 DF,  p-value: < 2.2e-16

This explain that some of the subgroups might predict popularity of the songs. The R^2 is bigger menaing that 13,62% of the variance can be explained by this model. It is stil too low.

Manually check the results
In-sample (training) MSE numeric values
lm_mse_train <- mean((lm_model_train$fitted.values - train_data$track_popularity)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 529.94"
In-sample (training) MSE character values
lm_mse_train_ch <- mean((lm_model_ch$fitted.values - train_data$track_popularity)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train_ch, 2)))
## [1] "Training MSE for Linear Model: 486.43"

MSE for character variables is also very high. However, it is less than with the numeric variables.

Out-of-sample (testing) MSE
# Predict on testing data (numeric)
lm_test_pred <- predict(lm_model_test, newdata = test_data)
# Cal
lm_mse_test <- mean((lm_test_pred - test_data$track_popularity)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_test, 2)))
## [1] "Testing MSE for Linear Model: 526.88"

We can see that the MSE is lower on the testing data. However it is very big (MSE:526.88), so I would not recommend using this model.

# Predict on testing data (character)
lm_test_pred_ch <- predict(lm_model_ch, newdata = test_data)
## Warning in predict.lm(lm_model_ch, newdata = test_data): prediction from a
## rank-deficient fit may be misleading
# Cal
lm_mse_test_ch <- mean((lm_test_pred_ch - test_data$track_popularity)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_test_ch, 2)))
## [1] "Testing MSE for Linear Model: 485.8"

Also very high MSE. Now it is 485.5, only 1 lower than with the training set.

Lets try using it on the data with the truncated and winsorized data.

Data Spliting 2
# Set the seed for reproducibility
set.seed(2023)

# Randomly sample row indices for the training set
train_indices_copy <- sample(1:NROW(spotify_copy),NROW(spotify_copy)*0.70)

# Create the training set
train_data_copy <- spotify_copy[train_indices_copy, ] #everything before comma is row selector and after comma is column selesctor

# Create the testing set
test_data_copy <- spotify_copy[-train_indices_copy, ]
Linear Regression Model on spotify_copy Data - Training set
# Train the linear regression model
lm_model_copy <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = train_data_copy)
summary(lm_model_copy)
## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + key + 
##     loudness + mode + speechiness + acousticness + instrumentalness + 
##     liveness + valence + tempo + duration_ms, data = train_data_copy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.800 -17.508   3.015  17.930  63.353 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.015e+01  2.130e+00  32.928  < 2e-16 ***
## danceability      4.227e+00  1.301e+00   3.249 0.001161 ** 
## energy           -2.328e+01  1.475e+00 -15.785  < 2e-16 ***
## key               2.206e-02  4.584e-02   0.481 0.630366    
## loudness          1.296e+00  9.057e-02  14.313  < 2e-16 ***
## mode              7.174e-01  3.351e-01   2.141 0.032273 *  
## speechiness      -1.111e+01  2.173e+00  -5.112 3.23e-07 ***
## acousticness      5.495e+00  1.014e+00   5.417 6.14e-08 ***
## instrumentalness -4.047e+02  2.853e+01 -14.184  < 2e-16 ***
## liveness         -5.299e+00  1.517e+00  -3.493 0.000479 ***
## valence           2.141e+00  7.831e-01   2.734 0.006262 ** 
## tempo             3.179e-02  6.651e-03   4.779 1.77e-06 ***
## duration_ms      -5.077e-05  3.263e-06 -15.559  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.97 on 19836 degrees of freedom
## Multiple R-squared:  0.06491,    Adjusted R-squared:  0.06434 
## F-statistic: 114.7 on 12 and 19836 DF,  p-value: < 2.2e-16

Same as the other model, all values are significant except key. This model has an R^2 of 0.06328 which means that only 6.3% of the of the data’s variability can be explained by the regression model, which indicated that this is not a good model.

Manually check the results 2
# Create a data frame to compare actual and predicted values
comparison_df_copy <- data.frame(Actual =train_data_copy$track_popularity, lm_predicted =lm_model_copy$fitted.values)
head(comparison_df_copy)
##   Actual lm_predicted
## 1      0     34.13595
## 2     24     41.44849
## 3     39     43.20203
## 4     63     39.96939
## 5     40     34.15089
## 6     30     30.09493
dim(comparison_df_copy)
## [1] 19849     2
In-sample (training) MSE 2
lm_mse_train_copy <- mean((lm_model_copy$fitted.values - train_data_copy$track_popularity)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train_copy, 2)))
## [1] "Training MSE for Linear Model: 527.19"
Out-of-sample (testing) MSE 2
# Predict on testing data
lm_test_pred_copy <- predict(lm_model_copy, newdata = test_data_copy)
# Cal
lm_mse_test_copy <- mean((lm_test_pred_copy - test_data_copy$track_popularity)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_test_copy, 2)))
## [1] "Testing MSE for Linear Model: 524.25"

Now, the MSE is still super high, but the model on the testing set (525.9) is lower than the model on the training set (528.11).

Overall both MSE with testing and training data on spotify and spotify_copy (with winsorized/truncated data) are very high. For truncated data the testing set MSE is 525.9 and the training set MSE is 528.11. For the normal data Testing MSE for Linear Model is 527.51 and Training MSE for Linear Model is 529.94

Model assumption check spotify
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(lm_model_test)

###### Model assumption check spotify_copy

# Diagnostic Plots
par(mfrow = c(2, 2))
plot(lm_model_copy)

3. What assumptions are being made when we use linear regression? Are they met in this dataset? Just describe what you observe from the diagnostic plots.
Answer

There are four assumptions when looking at a linear regression model. These include: - Linearity—relationship between X and Y is linear. - Homoscedasticity—constancy in variance. - Independence –observations are independent of each other. - Normality—observing if the model is normally distributed (bell shape).

I think that when looking at the plots in both data sets (spotify and spotfy_copy) the assumptions are not met. When looking at the Residuals-leverage graph, the x values do not follow a linear line. It is more like clusters. Also the model is not normally distributed. The residual points are not equally scattered on the plot indicating that the variance is not equal or constant. Overall, this model might be good to predict which variables are significant, but not for further analysis.

4. Try adding interaction terms to your linear regression model. At least try to find out one interaction term that has a statistically significant coefficient. Report the interaction term and check how these interaction terms influence the model’s performance in terms of R^2 and how do you interpret your new model?
lm_model_int <- lm(track_popularity ~ danceability + loudness + danceability*loudness, data = spotify)
summary(lm_model_int)
## 
## Call:
## lm(formula = track_popularity ~ danceability + loudness + danceability * 
##     loudness, data = spotify)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.912 -18.249   3.069  18.643  60.571 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            28.4207     1.4221  19.985  < 2e-16 ***
## danceability           20.2423     2.1859   9.261  < 2e-16 ***
## loudness               -0.8238     0.1768  -4.660 3.18e-06 ***
## danceability:loudness   1.7837     0.2743   6.502 8.07e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.64 on 28352 degrees of freedom
## Multiple R-squared:  0.004993,   Adjusted R-squared:  0.004887 
## F-statistic: 47.42 on 3 and 28352 DF,  p-value: < 2.2e-16
Answer:

As we saw before, almost every variables is significant towards track populariry, so it is very easy to find interactions. We performed an interaction which was significant. However the R^2 is too low. Now it is 0.5%. The new coefficient to account for this interaction is 1.784 indicating that if there was an increase of 1 in the danceability variable that 1.784 would be multiplied by the value of loudnessvariable in the regression equation.

lm_model_int <- lm(track_popularity ~ danceability + loudness , data = spotify)
summary(lm_model_int)
## 
## Call:
## lm(formula = track_popularity ~ danceability + loudness, data = spotify)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.803 -18.306   2.989  18.645  59.569 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.38628    0.72256  50.357  < 2e-16 ***
## danceability  7.48487    0.96398   7.765 8.47e-15 ***
## loudness      0.28557    0.04629   6.170 6.93e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.66 on 28353 degrees of freedom
## Multiple R-squared:  0.003509,   Adjusted R-squared:  0.003439 
## F-statistic: 49.92 on 2 and 28353 DF,  p-value: < 2.2e-16

If we perform the same model with no interaction the R^2 is 0.035%. This means that 0.035% of variance can be explained by tgis model, indicating that it is not a good model.

Train the KNN Model with training dataset on spotify data
## Train the KNN Model with training dataset


# Train the KNN model
knn_model <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 5)
Evaluate the model
# Predict on training data
knn_train_pred <- fitted.values(knn_model)

# Calculate in-sample MSE manually
knn_train_mse <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse))
## [1] "In-Sample MSE for KNN:  194.903869196156"

We can see that the MSE is very big, however lower than lm. It is 195 for the training set and spotify dataset.

# Predict on testing data
knn_model_test <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 5)

knn_test_pred <- fitted.values(knn_model_test)

# Calculate out-of-sample MSE manually
knn_test_mse <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Out-of-Sample MSE for KNN: ", knn_test_mse))
## [1] "Out-of-Sample MSE for KNN:  687.024025645934"

Now, doing a KNN on the testing data, increases the MSE to 687.

Train the KNN Model with training dataset on spotify_copy data (truncated variables)
# Train the KNN model spotify_copy
knn_model_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 5)
Manually check the results
# Create a data frame to compare actual and predicted values
comparison_df <- data.frame(Actual =  train_data$track_popularity, lm_predicted =knn_model_copy$fitted.values)
head(comparison_df)
##   Actual lm_predicted
## 1      0     18.33174
## 2     24     43.54473
## 3     39     39.76225
## 4     63     44.50652
## 5     40     42.82548
## 6     30     34.46911
dim(comparison_df)
## [1] 19849     2
# Create a data frame to compare actual and predicted values
comparison_df$knn_std_predicted_copy <- knn_model_copy$fitted.values
head(comparison_df)
##   Actual lm_predicted knn_std_predicted_copy
## 1      0     18.33174               18.33174
## 2     24     43.54473               43.54473
## 3     39     39.76225               39.76225
## 4     63     44.50652               44.50652
## 5     40     42.82548               42.82548
## 6     30     34.46911               34.46911
Evaluate the model
# Predict on training data
knn_train_pred_copy <- fitted.values(knn_model_copy)

# Calculate in-sample MSE manually
knn_train_mse_copy <- mean((train_data_copy$track_popularity - knn_train_pred_copy)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse_copy))
## [1] "In-Sample MSE for KNN:  194.882087277794"

For the training set in spotify_copy , now it is 194. Lets do it on the testing set with the truncated variables.

# Predict on testing data
knn_model_test_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 5)

knn_test_pred_copy <- fitted.values(knn_model_test_copy)

# Calculate out-of-sample MSE manually
knn_test_mse_copy <- mean((test_data_copy$track_popularity - knn_test_pred_copy)^2)
print(paste("Out-of-Sample MSE for KNN: ", knn_test_mse_copy))
## [1] "Out-of-Sample MSE for KNN:  686.330902362197"

Now the values for MSE in knn model for training and testing set on spotify and spotify_copy are the following. spotify has a MSE of 194.903 for training set and a MSE of 687.024 in the testing set. spotify_copy has a MSE of 194.882 on training set and a MSE of 686.331 on testing set.

Therefore the truncation and winsorization of the variables do not influence a lot the overall model, but it lowers a bit the MSE.

For different K, fit model for each senarios and compare.

First spotify data

# Predict on training data
knn_model_spotify_7 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 7)

knn_train_pred <- fitted.values(knn_model_spotify_7)

# Calculate out-of-sample MSE manually
knn_train_mse_7 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_7))
## [1] "Traning MSE for KNN:  252.314205153333"
# Predict on training data
knn_model_spotify_20 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 20)

knn_train_pred <- fitted.values(knn_model_spotify_20)

# Calculate out-of-sample MSE manually
knn_train_mse_20 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_20))
## [1] "Traning MSE for KNN:  389.061411631355"
# Predict on training data
knn_model_spotify_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 50)

knn_train_pred <- fitted.values(knn_model_spotify_50)

# Calculate out-of-sample MSE manually
knn_train_mse_50 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_50))
## [1] "Traning MSE for KNN:  456.16237397779"
# Predict on training data
knn_model_spotify_500 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 500)

knn_train_pred <- fitted.values(knn_model_spotify_500)

# Calculate out-of-sample MSE manually
knn_train_mse_500 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_500))
## [1] "Traning MSE for KNN:  517.746609308476"

We tried with different k’s and saw that (with train data on spotify):

7=252.3142 20=389.06 50= 456.1624 500= 517.746 As always the smaller the K, less MSE.

Now lets try it on testing set:

# Predict on testing data
knn_model_spotify_test <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 7)

knn_test_pred <- fitted.values(knn_model_spotify_test)

# Calculate out-of-sample MSE manually
knn_test_mse <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse))
## [1] "Traning MSE for KNN:  639.188169931466"
# Predict on testing data
knn_model_spotify_test_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 50)

knn_test_pred <- fitted.values(knn_model_spotify_test_50)

# Calculate out-of-sample MSE manually
knn_test_mse_50 <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_50))
## [1] "Traning MSE for KNN:  531.741262685869"
# Predict on testing data
knn_model_spotify_test_200 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 200)

knn_test_pred <- fitted.values(knn_model_spotify_test_200)

# Calculate out-of-sample MSE manually
knn_test_mse_200 <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_200))
## [1] "Traning MSE for KNN:  522.46584494462"

When trying different k’s with the testing model, we get the following MSE values: 7=639.18 50 = 531.741 200 = 522.465
This means that the MSE in the training set was smaller that in the testing set. Lets try it with spotify_copy

Now it is the other way. On the testing set the smaller the k, the bigger MSE, and viceversa.

For different K, fit model for each senarios and compare.

First spotify_copy data with training set.

# Predict on training data -> k=7
knn_model_train_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 7)

knn_train_pred <- fitted.values(knn_model_train_copy)

# Calculate out-of-sample MSE manually
knn_train_mse_copy <- mean((train_data_copy$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_copy))
## [1] "Traning MSE for KNN:  252.96950348016"
# Predict on training data -> k=50
knn_model_train_copy_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 50)

knn_train_pred <- fitted.values(knn_model_train_copy_50)

# Calculate out-of-sample MSE manually
knn_train_mse_copy_50 <- mean((train_data_copy$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_copy_50))
## [1] "Traning MSE for KNN:  454.470364045298"
# Predict on training data -> k=200
knn_model_train_copy_200 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 200)

knn_train_pred <- fitted.values(knn_model_train_copy_200)

# Calculate out-of-sample MSE manually
knn_train_mse_copy_200 <- mean((train_data_copy$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_copy_200))
## [1] "Traning MSE for KNN:  501.613661177734"

Now with spotify_copy we see the following. On the training set we get these results: 7= 252.969 50= 454.470 200= 501.613

On the testing set we get these:

# Predict on testing data
knn_model_test_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 7)

knn_test_pred <- fitted.values(knn_model_test_copy)

# Calculate out-of-sample MSE manually
knn_test_mse_copy <- mean((test_data_copy$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_copy))
## [1] "Traning MSE for KNN:  639.716566097559"
# Predict on testing data
knn_model_test_copy_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 50)

knn_test_pred <- fitted.values(knn_model_test_copy_50)

# Calculate out-of-sample MSE manually
knn_test_mse_copy_50 <- mean((test_data_copy$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_copy_50))
## [1] "Traning MSE for KNN:  532.925151847685"
# Predict on testing data
knn_model_test_copy_200 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 200)

knn_test_pred <- fitted.values(knn_model_test_copy_200)

# Calculate out-of-sample MSE manually
knn_test_mse_copy_200 <- mean((test_data_copy$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_copy_200))
## [1] "Traning MSE for KNN:  522.191267489406"
2. How does the value of k in KNN affect the model’s performance (in terms of training MSE and testing MSE)?
Answer:

With spotify and training set we get the following results for MSE:

7=252.3142 20=389.06 50= 456.1624 500= 517.746

Now with spotify_copy and training set we get the following results for MSE: 7= 252.969 50= 454.470 200= 501.613

A: if we compare the data with winsorized variables, we see that the MSE might be samller, but there is not a big difference.

With spotify and testing set we get the following results for MSE:

7=639.18 50 = 531.741 200 = 522.465

With spotify_copy and testing set we get the following results for MSE:

k=7: 639.716 k=50: 532.925 k=200: 522.191

A: Here again, it is almost the same. spotify_copy might be a better fit,, but the difference is minimal. Something that I did not expected was that for the training sets, if the k is smaller the MSE is smaller (usually that is what happens). However, on the testing set it is the opposite (if k is smaller, MSE is bigger.) Since what we want is a high k and low MSE, I think that implementing the KNN model on the testing set might not be a bad idea. It is true that MSE is still very high, but at least it is getting lower when increasing k.

5. Is there any outliers in the dataset? If yes, apply truncation or winsorization techniques to handle outliers. Compare the performance of the models before and after applying these techniques. What differences do you observe?
Answer:

As we saw before in the analysis a lot of variables have outliers. These variables are danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, duration. We have created a new dataset called spotify_new with all truncated and winosrized variables. In the analysis we performed both lm and knn to both datasets with training and testing sets and we got the following: In knn the training set with spotify_copy, the MSE is a bit smaller than with spotify. However, the difference is minimal. In the testing set, in knn the MSE is also smaller with spotify_copy. However, only some ecimals smaller. NOthing that gets your attention.

For linear models, overall both MSE with testing and training data on spotify and spotify_copy (with winsorized/truncated data) are very high. For truncated data the testing set MSE is 524.25 and the training set MSE is 527.19. For the normal data Testing MSE for Linear Model is 527.51 and Training MSE for Linear Model is 529.94. This means that there is less error when the outliers are removed.

# Train the linear regression model, compring popularity to rest of numeric parameters on spotify data
lm_model_spotify <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = spotify)
summary(lm_model_spotify)
## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + key + 
##     loudness + mode + speechiness + acousticness + instrumentalness + 
##     liveness + valence + tempo + duration_ms, data = spotify)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54.62 -17.22   2.95  18.10  60.54 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.783e+01  1.704e+00  39.805  < 2e-16 ***
## danceability      3.721e+00  1.072e+00   3.470 0.000522 ***
## energy           -2.321e+01  1.220e+00 -19.028  < 2e-16 ***
## key               3.190e-03  3.844e-02   0.083 0.933870    
## loudness          1.156e+00  6.527e-02  17.711  < 2e-16 ***
## mode              8.616e-01  2.809e-01   3.067 0.002161 ** 
## speechiness      -6.328e+00  1.380e+00  -4.587 4.52e-06 ***
## acousticness      4.331e+00  7.466e-01   5.801 6.67e-09 ***
## instrumentalness -9.292e+00  6.255e-01 -14.856  < 2e-16 ***
## liveness         -4.280e+00  8.990e-01  -4.761 1.93e-06 ***
## valence           1.788e+00  6.565e-01   2.724 0.006458 ** 
## tempo             2.609e-02  5.239e-03   4.979 6.42e-07 ***
## duration_ms      -4.342e-05  2.294e-06 -18.925  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.01 on 28343 degrees of freedom
## Multiple R-squared:  0.05808,    Adjusted R-squared:  0.05768 
## F-statistic: 145.6 on 12 and 28343 DF,  p-value: < 2.2e-16
# Train the linear regression model, compring popularity to rest of numeric parameters on spotify data
lm_model_spotify_copy <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = spotify_copy)
summary(lm_model_spotify_copy)
## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + key + 
##     loudness + mode + speechiness + acousticness + instrumentalness + 
##     liveness + valence + tempo + duration_ms, data = spotify_copy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.471 -17.326   2.985  17.960  62.817 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.123e+01  1.774e+00  40.166  < 2e-16 ***
## danceability      3.819e+00  1.083e+00   3.527 0.000421 ***
## energy           -2.323e+01  1.228e+00 -18.913  < 2e-16 ***
## key               1.029e-02  3.834e-02   0.268 0.788384    
## loudness          1.325e+00  7.559e-02  17.527  < 2e-16 ***
## mode              8.320e-01  2.803e-01   2.968 0.002996 ** 
## speechiness      -9.644e+00  1.811e+00  -5.324 1.02e-07 ***
## acousticness      4.762e+00  8.501e-01   5.602 2.14e-08 ***
## instrumentalness -4.055e+02  2.383e+01 -17.015  < 2e-16 ***
## liveness         -5.189e+00  1.266e+00  -4.099 4.16e-05 ***
## valence           1.846e+00  6.541e-01   2.822 0.004778 ** 
## tempo             2.810e-02  5.562e-03   5.051 4.41e-07 ***
## duration_ms      -5.090e-05  2.737e-06 -18.595  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.95 on 28343 degrees of freedom
## Multiple R-squared:  0.06326,    Adjusted R-squared:  0.06287 
## F-statistic: 159.5 on 12 and 28343 DF,  p-value: < 2.2e-16

For R^2, The R^2 for the spotify data was 0.05409 and for the sppotify_copy data it was 0.05409. It was the same, menaing that 5.4% of variance can be explained by the model.

6. How could feature scaling (standardization) affect the KNN model?

Lets create a copy of the data set for standardized variables.

spotify_stand <- spotify

As we saw before in the analysis a lot of variables have outliers. These variables are danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, duration. Lets standardize these variables in spotify_stand to look at the knn model.

min_danceability <- min(spotify_stand$danceability)
max_danceability <- max(spotify_stand$danceability)
spotify_stand$danceability <- (spotify_stand$danceability - min_danceability) / (max_danceability - min_danceability)
summary(spotify_stand$danceability)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5707  0.6816  0.6647  0.7731  1.0000
min_energy <- min(spotify_stand$energy)
max_energy <- max(spotify_stand$energy)
spotify_stand$energy <- (spotify_stand$energy - min_energy) / (max_energy - min_energy)
summary(spotify_stand$energy)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5789  0.7220  0.6983  0.8430  1.0000
min_loudness <- min(spotify_stand$loudness)
max_loudness <- max(spotify_stand$loudness)
spotify_stand$loudness <- (spotify_stand$loudness - min_loudness) / (max_loudness - min_loudness)
min_speechiness <- min(spotify_stand$speechiness)
max_speechiness <- max(spotify_stand$speechiness)
spotify_stand$speechiness <- (spotify_stand$speechiness - min_speechiness) / (max_speechiness - min_speechiness)
min_acousticness <- min(spotify_stand$acousticness)
max_acousticness <- max(spotify_stand$acousticness)
spotify_stand$acousticness <- (spotify_stand$acousticness - min_acousticness) / (max_acousticness - min_acousticness)
min_instrumentalness <- min(spotify_stand$instrumentalness)
max_instrumentalness <- max(spotify_stand$instrumentalness)
spotify_stand$instrumentalness <- (spotify_stand$instrumentalness - min_instrumentalness) / (max_instrumentalness - min_instrumentalness)
min_liveness <- min(spotify_stand$liveness)
max_liveness <- max(spotify_stand$liveness)
spotify_stand$liveness <- (spotify_stand$liveness - min_liveness) / (max_liveness - min_liveness)
min_duration <- min(spotify_stand$duration_ms)
max_duration <- max(spotify_stand$duration_ms)
spotify_stand$duration_ms <- (spotify_stand$duration_ms - min_duration) / (max_duration - min_duration)
KNN Model without standardized data
k <- 5
knn_spotify <- kknn(track_popularity ~ danceability + energy + key + 
    loudness + mode + speechiness + acousticness + instrumentalness + 
    liveness + valence + tempo + duration_ms, spotify, spotify, k=k)
Evaluate:
# Create a data frame to compare actual and predicted values
comparison_spotify <- data.frame(Actual = spotify$track_popularity, knn_predicted = knn_spotify$fitted.values)

head(comparison_spotify)
##   Actual knn_predicted
## 1     66      44.32066
## 2     67      63.32553
## 3     70      59.90562
## 4     60      55.16832
## 5     69      53.92104
## 6     67      40.56705
knn_spotify_mse <- mean((knn_spotify$fitted.values - spotify$track_popularity)^2)
print(paste("Mean Squared Error for spotifyKNN:", round(knn_spotify_mse, 2)))
## [1] "Mean Squared Error for spotifyKNN: 197.96"

Without standardize data the KNN MSE is 197.96. Lets look at the KNN with standardized data.

k <- 5
knn_spotify_stand <- kknn(track_popularity ~ danceability + energy + key + 
    loudness + mode + speechiness + acousticness + instrumentalness + 
    liveness + valence + tempo + duration_ms, spotify_stand, spotify_stand, k=k)
knn_spotify_stand_mse <- mean((knn_spotify_stand$fitted.values - spotify_stand$track_popularity)^2)
print(paste("Mean Squared Error for spotify_standKNN:", round(knn_spotify_stand_mse, 2)))
## [1] "Mean Squared Error for spotify_standKNN: 197.96"
Answer:

The KNN is the same, which means that standardized data do not influence whether MSE is going ti be smaller on the KNN.

7. What insights can you derive from comparing the linear regression and KNN models? Which model would you recommend the most and Why?
Answer

After we created 2 new datasets, one with standardized data, one with truncated and winsorized data, we could conclude the following after doing lm and knn and splitting the data into training and testing set. As far as the linear regression models, we observed that it mighnt not be a good model for this data. I think that when looking at the plots in both data sets (spotify and spotfy_copy) the 4 assumptions for linear models are not met. When looking at the Residuals-leverage graph, the x values do not follow a linear line. It is more like clusters. Also the model is not normally distributed. The residual points are not equally scattered on the plot indicating that the variance is not equal or constant. Overall both MSE with testing and training data on spotify and spotify_copy (with winsorized/truncated data) are very high. For truncated data the testing set MSE is 525.9 and the training set MSE is 528.11.For the normal data Testing MSE for Linear Model is 527.51 and Training MSE for Linear Model is 529.94. The R^2 is also too low and only around 4% of variance in the variable could be explained by lm. For the data with truncated variables, the MSE improves only a bit. Not enough for it to be a good model.

knn model might be better than lm. It is one of the simplest yet powerful algorithms used for classification and regression tasks in machine learning and statistics. It is used for large datasets. There does not have to be a linear line, it predicts based on neighbours and distance. When looking at the plots for this dataset we can see that th points are plotted as clusters, they do not fit a line. Knn is probably better for prediction than lm.

After realizing knn on both datasets we got the following results: spotify_copy might be a better fit, but the difference is minimal. Something that I did not expected was that for the training sets, if the k is smaller the MSE is smaller (usually that is what happens). However, on the testing set it is the opposite (if k is smaller, MSE is bigger.) Since what we want is a high k and low MSE, I think that implementing the KNN model on the testing set might not be a bad idea. It is true that MSE is still very high, but at least it is getting lower when increasing k. We could have a larger mount of k’s and decrease MSE. However, as we saw inthe reults MSE was also super high. The lowest mSE we found was 197. It is a better model than lm, but still it is not a good enough model. This might be becasue the variables we chose for the analysis were only the numeric variables, not the rest. This could mean that danceablility, loudness, valence etc. might not be as stromg predictors to predict song popularitty. However, from the results we can see that knn is a better model than lm for this dataset. Standardization of variables also leads to same MSE, beacuse when performing knn the variables are automatically standardized.