# Install and load required library packages
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(kknn)
library(corrplot)
## corrplot 0.92 loaded
library(readr)
Load the data:
spotify <- read_csv("spotify.csv")
## Rows: 32833 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): track_id, track_name, track_artist, track_album_id, track_album_na...
## dbl (13): track_popularity, danceability, energy, key, loudness, mode, speec...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(spotify)
## spc_tbl_ [32,833 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ track_id : chr [1:32833] "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
## $ track_name : chr [1:32833] "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
## $ track_artist : chr [1:32833] "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
## $ track_popularity : num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
## $ track_album_id : chr [1:32833] "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
## $ track_album_name : chr [1:32833] "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
## $ track_album_release_date: chr [1:32833] "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
## $ playlist_name : chr [1:32833] "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
## $ playlist_id : chr [1:32833] "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
## $ playlist_genre : chr [1:32833] "pop" "pop" "pop" "pop" ...
## $ playlist_subgenre : chr [1:32833] "dance pop" "dance pop" "dance pop" "dance pop" ...
## $ danceability : num [1:32833] 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
## $ energy : num [1:32833] 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
## $ key : num [1:32833] 6 11 1 7 1 8 5 4 8 2 ...
## $ loudness : num [1:32833] -2.63 -4.97 -3.43 -3.78 -4.67 ...
## $ mode : num [1:32833] 1 1 0 1 1 1 0 0 1 1 ...
## $ speechiness : num [1:32833] 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
## $ acousticness : num [1:32833] 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
## $ instrumentalness : num [1:32833] 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
## $ liveness : num [1:32833] 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
## $ valence : num [1:32833] 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
## $ tempo : num [1:32833] 122 100 124 122 124 ...
## $ duration_ms : num [1:32833] 194754 162600 176616 169093 189052 ...
## - attr(*, "spec")=
## .. cols(
## .. track_id = col_character(),
## .. track_name = col_character(),
## .. track_artist = col_character(),
## .. track_popularity = col_double(),
## .. track_album_id = col_character(),
## .. track_album_name = col_character(),
## .. track_album_release_date = col_character(),
## .. playlist_name = col_character(),
## .. playlist_id = col_character(),
## .. playlist_genre = col_character(),
## .. playlist_subgenre = col_character(),
## .. danceability = col_double(),
## .. energy = col_double(),
## .. key = col_double(),
## .. loudness = col_double(),
## .. mode = col_double(),
## .. speechiness = col_double(),
## .. acousticness = col_double(),
## .. instrumentalness = col_double(),
## .. liveness = col_double(),
## .. valence = col_double(),
## .. tempo = col_double(),
## .. duration_ms = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#first 5 rows
head(spotify)
## # A tibble: 6 × 23
## track_id track_name track_artist track_popularity track_album_id
## <chr> <chr> <chr> <dbl> <chr>
## 1 6f807x0ima9a1j3VPbc7VN I Don't C… Ed Sheeran 66 2oCs0DGTsRO98…
## 2 0r7CVbZTWZgbTCYdfa2P31 Memories … Maroon 5 67 63rPSO264uRjW…
## 3 1z1Hg7Vb0AhHDiEmnDE79l All the T… Zara Larsson 70 1HoSmj2eLcsrR…
## 4 75FpbthrwQmzHlBJLuGdC7 Call You … The Chainsm… 60 1nqYsOef1yKKu…
## 5 1e8PAfcKUYoKkxPhrHqw4x Someone Y… Lewis Capal… 69 7m7vv9wlQ4i0L…
## 6 7fvUMiyapMsRRxr07cU8Ef Beautiful… Ed Sheeran 67 2yiy9cd2QktrN…
## # ℹ 18 more variables: track_album_name <chr>, track_album_release_date <chr>,
## # playlist_name <chr>, playlist_id <chr>, playlist_genre <chr>,
## # playlist_subgenre <chr>, danceability <dbl>, energy <dbl>, key <dbl>,
## # loudness <dbl>, mode <dbl>, speechiness <dbl>, acousticness <dbl>,
## # instrumentalness <dbl>, liveness <dbl>, valence <dbl>, tempo <dbl>,
## # duration_ms <dbl>
#structure
str(spotify)
## spc_tbl_ [32,833 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ track_id : chr [1:32833] "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
## $ track_name : chr [1:32833] "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
## $ track_artist : chr [1:32833] "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
## $ track_popularity : num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
## $ track_album_id : chr [1:32833] "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
## $ track_album_name : chr [1:32833] "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
## $ track_album_release_date: chr [1:32833] "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
## $ playlist_name : chr [1:32833] "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
## $ playlist_id : chr [1:32833] "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
## $ playlist_genre : chr [1:32833] "pop" "pop" "pop" "pop" ...
## $ playlist_subgenre : chr [1:32833] "dance pop" "dance pop" "dance pop" "dance pop" ...
## $ danceability : num [1:32833] 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
## $ energy : num [1:32833] 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
## $ key : num [1:32833] 6 11 1 7 1 8 5 4 8 2 ...
## $ loudness : num [1:32833] -2.63 -4.97 -3.43 -3.78 -4.67 ...
## $ mode : num [1:32833] 1 1 0 1 1 1 0 0 1 1 ...
## $ speechiness : num [1:32833] 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
## $ acousticness : num [1:32833] 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
## $ instrumentalness : num [1:32833] 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
## $ liveness : num [1:32833] 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
## $ valence : num [1:32833] 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
## $ tempo : num [1:32833] 122 100 124 122 124 ...
## $ duration_ms : num [1:32833] 194754 162600 176616 169093 189052 ...
## - attr(*, "spec")=
## .. cols(
## .. track_id = col_character(),
## .. track_name = col_character(),
## .. track_artist = col_character(),
## .. track_popularity = col_double(),
## .. track_album_id = col_character(),
## .. track_album_name = col_character(),
## .. track_album_release_date = col_character(),
## .. playlist_name = col_character(),
## .. playlist_id = col_character(),
## .. playlist_genre = col_character(),
## .. playlist_subgenre = col_character(),
## .. danceability = col_double(),
## .. energy = col_double(),
## .. key = col_double(),
## .. loudness = col_double(),
## .. mode = col_double(),
## .. speechiness = col_double(),
## .. acousticness = col_double(),
## .. instrumentalness = col_double(),
## .. liveness = col_double(),
## .. valence = col_double(),
## .. tempo = col_double(),
## .. duration_ms = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Everything but track_album_release_date is good. We need
to change track_album_release_date to date variable. There
are 32833 observations and 23 variables
spotify$track_album_release_date<- as.Date(spotify$track_album_release_date)
spotify$playlist_genre<-as.factor(spotify$playlist_genre)
summary(spotify)
## track_id track_name track_artist track_popularity
## Length:32833 Length:32833 Length:32833 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 24.00
## Mode :character Mode :character Mode :character Median : 45.00
## Mean : 42.48
## 3rd Qu.: 62.00
## Max. :100.00
##
## track_album_id track_album_name track_album_release_date
## Length:32833 Length:32833 Min. :1957-01-01
## Class :character Class :character 1st Qu.:2010-12-04
## Mode :character Mode :character Median :2017-01-27
## Mean :2012-09-09
## 3rd Qu.:2019-05-16
## Max. :2020-01-29
## NA's :1886
## playlist_name playlist_id playlist_genre playlist_subgenre
## Length:32833 Length:32833 edm :6043 Length:32833
## Class :character Class :character latin:5155 Class :character
## Mode :character Mode :character pop :5507 Mode :character
## r&b :5431
## rap :5746
## rock :4951
##
## danceability energy key loudness
## Min. :0.0000 Min. :0.000175 Min. : 0.000 Min. :-46.448
## 1st Qu.:0.5630 1st Qu.:0.581000 1st Qu.: 2.000 1st Qu.: -8.171
## Median :0.6720 Median :0.721000 Median : 6.000 Median : -6.166
## Mean :0.6548 Mean :0.698619 Mean : 5.374 Mean : -6.720
## 3rd Qu.:0.7610 3rd Qu.:0.840000 3rd Qu.: 9.000 3rd Qu.: -4.645
## Max. :0.9830 Max. :1.000000 Max. :11.000 Max. : 1.275
##
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.0410 1st Qu.:0.0151 1st Qu.:0.0000000
## Median :1.0000 Median :0.0625 Median :0.0804 Median :0.0000161
## Mean :0.5657 Mean :0.1071 Mean :0.1753 Mean :0.0847472
## 3rd Qu.:1.0000 3rd Qu.:0.1320 3rd Qu.:0.2550 3rd Qu.:0.0048300
## Max. :1.0000 Max. :0.9180 Max. :0.9940 Max. :0.9940000
##
## liveness valence tempo duration_ms
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 4000
## 1st Qu.:0.0927 1st Qu.:0.3310 1st Qu.: 99.96 1st Qu.:187819
## Median :0.1270 Median :0.5120 Median :121.98 Median :216000
## Mean :0.1902 Mean :0.5106 Mean :120.88 Mean :225800
## 3rd Qu.:0.2480 3rd Qu.:0.6930 3rd Qu.:133.92 3rd Qu.:253585
## Max. :0.9960 Max. :0.9910 Max. :239.44 Max. :517810
##
table(spotify$track_popularity)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2703 575 387 321 240 240 192 189 201 195 174 172 161 207 201 190
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 219 206 242 205 209 228 207 228 243 242 272 271 266 277 345 323
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 351 377 388 433 431 435 483 459 486 442 428 464 472 505 430 496
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
## 465 497 498 514 506 472 514 492 497 541 503 467 514 492 470 483
## 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
## 424 462 441 468 425 443 410 408 339 357 353 306 334 326 224 265
## 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 172 167 126 183 122 120 91 89 104 27 59 58 27 44 37 15
## 96 97 98 99 100
## 7 22 36 4 2
table(spotify$playlist_genre)
##
## edm latin pop r&b rap rock
## 6043 5155 5507 5431 5746 4951
table(spotify$playlist_subgenre)
##
## album rock big room classic rock
## 1065 1206 1296
## dance pop electro house electropop
## 1298 1511 1408
## gangster rap hard rock hip hop
## 1458 1485 1322
## hip pop indie poptimism latin hip hop
## 1256 1672 1656
## latin pop neo soul new jack swing
## 1262 1637 1133
## permanent wave pop edm post-teen pop
## 1105 1517 1129
## progressive electro house reggaeton southern hip hop
## 1809 949 1675
## trap tropical urban contemporary
## 1291 1288 1405
table(spotify$key)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 3454 4010 2827 913 2201 2680 2670 3352 2430 3027 2273 2996
table(spotify$mode)
##
## 0 1
## 14259 18574
From the data we can see that the first variables such as name, artist, album_id, release date are character variables. Then popuarity is a numeric variable which ranges from 0 to 100. Playlist genre and subrenge is categorical. Variables from danceability to valnece ususally range between 0 and 1. tempo and duration_ms have big max values like 239.44 and 517810.Loudness has a min value of -46.448. So we might see if there are any outliers in further exploration. We can see in some variables that the mean id not very close to the median, which would indicate skweness.
str(spotify$track_popularity)
## num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
class(spotify$track_popularity)
## [1] "numeric"
dups_id <- sum(duplicated(spotify$track_id))
print(dups_id)
## [1] 4477
We can see that a lot of songs have been duplicated in this dataset. They have the same track_id. Therefoer we will remove them, for further analysis.
spotify_dups = spotify[duplicated(spotify$track_id),]
spotify = spotify[!duplicated(spotify$track_id),]
We removed the duplicate songs to another dataset called
spotify_dups and removed duplicates from current
dataset.
Visual exploration to understand the data set.
ggplot(spotify, aes(x= track_popularity)) +
geom_histogram(binwidth=5, color="darkblue", fill="lightblue") +
ggtitle("Popularity Distribution") +
xlab("Popularity") +
ylab("Frequency")
hist(spotify$danceability, col = 'blue', border = "black", xlab = 'danceability', ylab = 'Frequency', main = 'danceability Distribution')
hist(spotify$energy, col = 'blue', border = "black", xlab = 'energy', ylab = 'Frequency', main = 'energy Distribution')
hist(spotify$key, col = 'blue', border = "black", xlab = 'key', ylab = 'Frequency', main = 'key Distribution')
hist(spotify$loudness, col = 'blue', border = "black", xlab = 'loudness', ylab = 'Frequency', main = 'loudness distribution')
hist(spotify$mode, col = 'blue', border = "black", xlab = 'mode', ylab = 'Frequency', main = 'mode distribution')
hist(spotify$valence, col = 'blue', border = "black", xlab = 'valence', ylab = 'Frequency', main = 'valence Distribution')
hist(spotify$speechiness, col = 'blue', border = "black", xlab = 'speechiness', ylab = 'Frequency', main = 'speechiness Distribution')
hist(spotify$acousticness, col = 'blue', border = "black", xlab = 'acousticness', ylab = 'Frequency', main = 'acousticness Distribution')
hist(spotify$liveness, col = 'blue', border = "black", xlab = 'liveness', ylab = 'Frequency', main = 'liveness Distribution')
hist(spotify$instrumentalness, col = 'blue', border = "black", xlab = 'instrumentalness', ylab = 'Frequency', main = 'instrumentalness Distribution')
hist(spotify$tempo, col = 'blue', border = "black", xlab = 'tempo', ylab = 'Frequency', main = 'tempo Distribution')
hist(spotify$duration_ms, col = 'blue', border = "black", xlab = 'duration_ms', ylab = 'Frequency', main = 'duration Distribution')
plot(spotify$playlist_genre, xlab = 'Genre' , ylab = "Frequencies")
After plotting the histograms we can observe the following
distribution:
Duration, tempo and Valence are normally distributed Danceability, Enery and Loudness is left-skewed Acousticness, Speechiness and Liveness is right-skewed By genre, most of the songs are edm.
ggplot(spotify, aes(x=tempo, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) + ggtitle("tempo and popularity")
ggplot(spotify, aes(x=danceability, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("danceability and popularity")
ggplot(spotify, aes(x=energy, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("energy and popularity")
ggplot(spotify, aes(x=loudness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("loudness and popularity")
ggplot(spotify, aes(x=speechiness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("speechiness and popularity")
ggplot(spotify, aes(x=acousticness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("acousticness and popularity")
ggplot(spotify, aes(x=instrumentalness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("instrumentalness and popularity")
ggplot(spotify, aes(x=liveness, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("liveness and popularity")
ggplot(spotify, aes(x=valence, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("valence and popularity")
ggplot(spotify, aes(x=tempo, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("tempo and popularity")
ggplot(spotify, aes(x=duration_ms, y=track_popularity)) + geom_jitter(aes(color = playlist_genre)) +
ggtitle("duration and popularity")
boxplot(track_popularity ~ playlist_genre , data = spotify,
main = "Popular genre",
ylab = "Popularity",
xlab = "Genre",
col = "yellow")
boxplot( track_popularity ~ mode, data = spotify,
main = "Popular Mode",
ylab = "Popularity",
xlab = "mode",
col = "yellow")
boxplot(spotify$danceability,
main = "Boxplot distribution of Danceability",
col = "yellow")
boxplot(spotify$energy,
main = "Boxplot distribution of energy",
col = "yellow")
boxplot(spotify$key,
main = "Boxplot distribution of key",
col = "yellow")
boxplot(spotify$loudness,
main = "Boxplot distribution of loudness",
col = "yellow")
boxplot(spotify$mode,
main = "Boxplot distribution of mode",
col = "yellow")
boxplot(spotify$speechiness,
main = "Boxplot distribution of speechiness",
col = "yellow")
boxplot(spotify$acousticness,
main = "Boxplot distribution of acousticness",
col = "yellow")
boxplot(spotify$instrumentalness,
main = "Boxplot distribution of instrumentalness",
col = "yellow")
boxplot(spotify$liveness,
main = "Boxplot distribution of liveness",
col = "yellow")
boxplot(spotify$valence,
main = "Boxplot distribution of valence",
col = "yellow")
boxplot(spotify$tempo,
main = "Boxplot distribution of tempo",
col = "yellow")
boxplot(spotify$duration_ms,
main = "Boxplot distribution of duration",
col = "yellow")
From the boxplots we can observe that a lot of variables
(danceability, energy, loudness,
speechiness, acousticness,
instrumentalness, liveness,
duration) have outliers. Removing them would influence the
analysis a lot.
colSums(is.na(spotify))
## track_id track_name track_artist
## 0 4 4
## track_popularity track_album_id track_album_name
## 0 0 4
## track_album_release_date playlist_name playlist_id
## 1681 0 0
## playlist_genre playlist_subgenre danceability
## 0 0 0
## energy key loudness
## 0 0 0
## mode speechiness acousticness
## 0 0 0
## instrumentalness liveness valence
## 0 0 0
## tempo duration_ms
## 0 0
We can observe that there is no missing data in the numeric variables. There is only missing values in character variables. We can keep them because they will not impact our analysis.
As we saw before from the boxplots we can observe that a lot of
variables (danceability, energy,
loudness, speechiness,
acousticness, instrumentalness,
liveness, duration) have outliers. Removing
them would influence the analysis a lot.
Lets create a new dataset with all winsorized and truncated variables to reduce outliers and apply regression models (to see the differences)
spotify_copy <- spotify
boxplot(spotify_copy$danceability,
main = "Boxplot distribution of Danceability",
col = "yellow")
summary(spotify_copy$danceability)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.5610 0.6700 0.6534 0.7600 0.9830
spotify_copy$danceability[spotify_copy$danceability <= 0.28] <- 0.28
summary(spotify_copy$danceability)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.280 0.561 0.670 0.654 0.760 0.983
boxplot(spotify_copy$danceability,
main = "Boxplot distribution of Danceability",
col = "yellow")
summary(spotify_copy$energy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000175 0.579000 0.722000 0.698388 0.843000 1.000000
boxplot(spotify_copy$energy,
main = "Boxplot distribution of Energy")
spotify_copy$energy[spotify_copy$energy <= 0.2] <- 0.2
boxplot(spotify_copy$energy,
main = "Boxplot distribution of energy")
summary(spotify_copy$loudness)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -46.448 -8.309 -6.261 -6.818 -4.709 1.275
boxplot(spotify_copy$loudness,
main = "Boxplot distribution of loudness")
##### Winsorization
# Calculate the 5th and 95th percentiles for 'loudness'
lower_bound_loudness <- quantile(spotify_copy$loudness, 0.05, na.rm = TRUE)
upper_bound_loudness <- quantile(spotify_copy$loudness, 0.95, na.rm = TRUE)
# Winsorize the data
spotify_copy$loudness[spotify_copy$loudness < lower_bound_loudness] <- lower_bound_loudness
spotify_copy$loudness[spotify_copy$loudness > upper_bound_loudness] <- upper_bound_loudness
summary(spotify_copy$loudness)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12.592 -8.309 -6.261 -6.734 -4.709 -2.999
boxplot(spotify_copy$loudness,
main = "Boxplot distribution of loudness")
spotify_copy$speechiness[spotify_copy$speechiness >= 0.27] <- 0.27
summary(spotify_copy$speechiness)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.04100 0.06260 0.09934 0.13300 0.27000
boxplot(spotify_copy$speechiness,
main = "Boxplot distribution of speechiness")
boxplot(spotify_copy$acousticness,
main = "Boxplot distribution of speechiness")
spotify_copy$acousticness[spotify_copy$acousticness >= 0.6] <- 0.6
summary(spotify_copy$acousticness)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.01438 0.07970 0.16519 0.26000 0.60000
boxplot(spotify_copy$acousticness,
main = "Boxplot distribution of acousticness")
spotify_copy$instrumentalness[spotify_copy$instrumentalness >= 0.015] <- 0.015
boxplot(spotify_copy$instrumentalness,
main = "Boxplot distribution of instrumentalness")
##### Trucation of instrumentalness
spotify_copy$instrumentalness[spotify_copy$instrumentalness >= 0.015] <- 0.015
boxplot(spotify_copy$instrumentalness,
main = "Boxplot distribution of instrumentalness")
boxplot(spotify_copy$liveness,
main = "Boxplot distribution of liveness")
spotify_copy$liveness[spotify_copy$liveness >= 0.4] <- 0.4
boxplot(spotify_copy$liveness,
main = "Boxplot distribution of liveness")
boxplot(spotify_copy$tempo,
main = "Boxplot distribution of tempo")
# Calculate the 5th and 95th percentiles for 'tempo'
lower_bound_tempo <- quantile(spotify_copy$tempo, 0.05, na.rm = TRUE)
upper_bound_tempo <- quantile(spotify_copy$tempo, 0.95, na.rm = TRUE)
# Winsorize the data
spotify_copy$tempo[spotify_copy$tempo < lower_bound_tempo] <- lower_bound_tempo
spotify_copy$tempo[spotify_copy$tempo > upper_bound_tempo] <- upper_bound_tempo
boxplot(spotify_copy$tempo,
main = "Boxplot distribution of tempo")
boxplot(spotify_copy$duration_ms,
main = "Boxplot distribution of duration_ms")
# Calculate the 5th and 95th percentiles for 'duration'
lower_bound_duration_ms <- quantile(spotify_copy$duration_ms, 0.05, na.rm = TRUE)
upper_bound_duration_ms <- quantile(spotify_copy$duration_ms, 0.95, na.rm = TRUE)
# Winsorize the data
spotify_copy$duration_ms[spotify_copy$duration_ms < lower_bound_duration_ms] <- lower_bound_duration_ms
spotify_copy$duration_ms[spotify_copy$duration_ms > upper_bound_duration_ms] <- upper_bound_duration_ms
boxplot(spotify_copy$duration_ms,
main = "Boxplot distribution of duration_ms")
Now , all data are truncated or winsorized. Lets start by splittig the
data in order to do lm models and knn.
# Set the seed for reproducibility
set.seed(2023)
# Randomly sample row indices for the training set split in 70% and 30%
train_indices <- sample(1:NROW(spotify),NROW(spotify)*0.70)
# Create the training set
train_data <- spotify[train_indices, ] #everything before comma is row selector and after comma is column selesctor
# Create the testing set
test_data <- spotify[-train_indices, ]
# Train the linear regression model, compring popularity to rest of numeric parameters
lm_model_train <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = train_data)
summary(lm_model_train)
##
## Call:
## lm(formula = track_popularity ~ danceability + energy + key +
## loudness + mode + speechiness + acousticness + instrumentalness +
## liveness + valence + tempo + duration_ms, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.757 -17.360 2.935 18.119 60.604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.684e+01 2.045e+00 32.682 < 2e-16 ***
## danceability 4.170e+00 1.288e+00 3.237 0.00121 **
## energy -2.318e+01 1.461e+00 -15.864 < 2e-16 ***
## key 1.398e-02 4.595e-02 0.304 0.76088
## loudness 1.132e+00 7.802e-02 14.512 < 2e-16 ***
## mode 7.590e-01 3.357e-01 2.261 0.02378 *
## speechiness -7.397e+00 1.657e+00 -4.464 8.09e-06 ***
## acousticness 4.934e+00 8.895e-01 5.547 2.95e-08 ***
## instrumentalness -9.426e+00 7.437e-01 -12.676 < 2e-16 ***
## liveness -4.420e+00 1.077e+00 -4.104 4.08e-05 ***
## valence 1.997e+00 7.861e-01 2.540 0.01110 *
## tempo 2.808e-02 6.261e-03 4.485 7.33e-06 ***
## duration_ms -4.277e-05 2.726e-06 -15.689 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.03 on 19836 degrees of freedom
## Multiple R-squared: 0.06003, Adjusted R-squared: 0.05946
## F-statistic: 105.6 on 12 and 19836 DF, p-value: < 2.2e-16
The model suggests that all the variables are significant except key, since it is the only variable with a p-value higher than 0.5. This model has an R^2 of 0.06003 which means that only 6% of the of the data’s variability can be explained by the regression model, which indicated that this is not a good model.
# Train the linear regression model, comparing popularity to rest of numeric parameters with test data
lm_model_test <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = test_data)
summary(lm_model_test)
##
## Call:
## lm(formula = track_popularity ~ danceability + energy + key +
## loudness + mode + speechiness + acousticness + instrumentalness +
## liveness + valence + tempo + duration_ms, data = test_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.83 -16.86 2.96 18.20 58.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.004e+01 3.085e+00 22.703 < 2e-16 ***
## danceability 2.674e+00 1.937e+00 1.381 0.1674
## energy -2.328e+01 2.217e+00 -10.504 < 2e-16 ***
## key -1.690e-02 7.029e-02 -0.240 0.8100
## loudness 1.212e+00 1.193e-01 10.158 < 2e-16 ***
## mode 1.122e+00 5.136e-01 2.185 0.0289 *
## speechiness -3.935e+00 2.494e+00 -1.578 0.1146
## acousticness 2.943e+00 1.376e+00 2.140 0.0324 *
## instrumentalness -8.982e+00 1.158e+00 -7.754 9.95e-15 ***
## liveness -3.929e+00 1.634e+00 -2.404 0.0162 *
## valence 1.299e+00 1.195e+00 1.087 0.2769
## tempo 2.195e-02 9.581e-03 2.291 0.0220 *
## duration_ms -4.477e-05 4.256e-06 -10.520 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.97 on 8494 degrees of freedom
## Multiple R-squared: 0.05409, Adjusted R-squared: 0.05275
## F-statistic: 40.47 on 12 and 8494 DF, p-value: < 2.2e-16
Doing the lm mode on the test data, even decreases the R^2 to 5.4%.
# Train the linear regression model, comparing popularity to other character parameters
lm_model_ch <- lm(track_popularity ~ playlist_genre + playlist_subgenre, data = train_data)
summary(lm_model_ch)
##
## Call:
## lm(formula = track_popularity ~ playlist_genre + playlist_subgenre,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.418 -16.339 2.905 16.931 60.098
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9894 0.6853 35.005 < 2e-16
## playlist_genrelatin 18.0795 1.0338 17.489 < 2e-16
## playlist_genrepop 31.4286 1.0654 29.501 < 2e-16
## playlist_genrer&b 22.2155 1.0313 21.541 < 2e-16
## playlist_genrerap 24.4461 1.0227 23.902 < 2e-16
## playlist_genrerock 13.5668 1.0741 12.631 < 2e-16
## playlist_subgenrebig room 4.4078 1.0705 4.117 3.85e-05
## playlist_subgenreclassic rock -0.2592 1.1543 -0.225 0.822309
## playlist_subgenredance pop -3.0791 1.1009 -2.797 0.005164
## playlist_subgenreelectro house 9.7948 0.9814 9.981 < 2e-16
## playlist_subgenreelectropop -16.3501 1.0989 -14.878 < 2e-16
## playlist_subgenregangster rap -15.9361 1.0535 -15.127 < 2e-16
## playlist_subgenrehard rock -4.2979 1.1230 -3.827 0.000130
## playlist_subgenrehip hop 4.6597 1.0591 4.400 1.09e-05
## playlist_subgenrehip pop -1.8091 1.2060 -1.500 0.133599
## playlist_subgenreindie poptimism -14.2180 1.0637 -13.367 < 2e-16
## playlist_subgenrelatin hip hop -9.5706 1.0786 -8.873 < 2e-16
## playlist_subgenrelatin pop 4.3781 1.1083 3.950 7.83e-05
## playlist_subgenreneo soul -16.3026 1.0344 -15.760 < 2e-16
## playlist_subgenrenew jack swing -19.9120 1.1214 -17.757 < 2e-16
## playlist_subgenrepermanent wave 14.2161 1.1811 12.036 < 2e-16
## playlist_subgenrepop edm 12.5587 1.0861 11.563 < 2e-16
## playlist_subgenrepost-teen pop NA NA NA NA
## playlist_subgenreprogressive electro house NA NA NA NA
## playlist_subgenrereggaeton 4.2245 1.2535 3.370 0.000752
## playlist_subgenresouthern hip hop -13.6192 1.0064 -13.533 < 2e-16
## playlist_subgenretrap NA NA NA NA
## playlist_subgenretropical NA NA NA NA
## playlist_subgenreurban contemporary NA NA NA NA
##
## (Intercept) ***
## playlist_genrelatin ***
## playlist_genrepop ***
## playlist_genrer&b ***
## playlist_genrerap ***
## playlist_genrerock ***
## playlist_subgenrebig room ***
## playlist_subgenreclassic rock
## playlist_subgenredance pop **
## playlist_subgenreelectro house ***
## playlist_subgenreelectropop ***
## playlist_subgenregangster rap ***
## playlist_subgenrehard rock ***
## playlist_subgenrehip hop ***
## playlist_subgenrehip pop
## playlist_subgenreindie poptimism ***
## playlist_subgenrelatin hip hop ***
## playlist_subgenrelatin pop ***
## playlist_subgenreneo soul ***
## playlist_subgenrenew jack swing ***
## playlist_subgenrepermanent wave ***
## playlist_subgenrepop edm ***
## playlist_subgenrepost-teen pop
## playlist_subgenreprogressive electro house
## playlist_subgenrereggaeton ***
## playlist_subgenresouthern hip hop ***
## playlist_subgenretrap
## playlist_subgenretropical
## playlist_subgenreurban contemporary
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.07 on 19825 degrees of freedom
## Multiple R-squared: 0.1372, Adjusted R-squared: 0.1362
## F-statistic: 137.1 on 23 and 19825 DF, p-value: < 2.2e-16
This explain that some of the subgroups might predict popularity of the songs. The R^2 is bigger menaing that 13,62% of the variance can be explained by this model. It is stil too low.
lm_mse_train <- mean((lm_model_train$fitted.values - train_data$track_popularity)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 529.94"
lm_mse_train_ch <- mean((lm_model_ch$fitted.values - train_data$track_popularity)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train_ch, 2)))
## [1] "Training MSE for Linear Model: 486.43"
MSE for character variables is also very high. However, it is less than with the numeric variables.
# Predict on testing data (numeric)
lm_test_pred <- predict(lm_model_test, newdata = test_data)
# Cal
lm_mse_test <- mean((lm_test_pred - test_data$track_popularity)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_test, 2)))
## [1] "Testing MSE for Linear Model: 526.88"
We can see that the MSE is lower on the testing data. However it is very big (MSE:526.88), so I would not recommend using this model.
# Predict on testing data (character)
lm_test_pred_ch <- predict(lm_model_ch, newdata = test_data)
## Warning in predict.lm(lm_model_ch, newdata = test_data): prediction from a
## rank-deficient fit may be misleading
# Cal
lm_mse_test_ch <- mean((lm_test_pred_ch - test_data$track_popularity)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_test_ch, 2)))
## [1] "Testing MSE for Linear Model: 485.8"
Also very high MSE. Now it is 485.5, only 1 lower than with the training set.
Lets try using it on the data with the truncated and winsorized data.
# Set the seed for reproducibility
set.seed(2023)
# Randomly sample row indices for the training set
train_indices_copy <- sample(1:NROW(spotify_copy),NROW(spotify_copy)*0.70)
# Create the training set
train_data_copy <- spotify_copy[train_indices_copy, ] #everything before comma is row selector and after comma is column selesctor
# Create the testing set
test_data_copy <- spotify_copy[-train_indices_copy, ]
# Train the linear regression model
lm_model_copy <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = train_data_copy)
summary(lm_model_copy)
##
## Call:
## lm(formula = track_popularity ~ danceability + energy + key +
## loudness + mode + speechiness + acousticness + instrumentalness +
## liveness + valence + tempo + duration_ms, data = train_data_copy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.800 -17.508 3.015 17.930 63.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.015e+01 2.130e+00 32.928 < 2e-16 ***
## danceability 4.227e+00 1.301e+00 3.249 0.001161 **
## energy -2.328e+01 1.475e+00 -15.785 < 2e-16 ***
## key 2.206e-02 4.584e-02 0.481 0.630366
## loudness 1.296e+00 9.057e-02 14.313 < 2e-16 ***
## mode 7.174e-01 3.351e-01 2.141 0.032273 *
## speechiness -1.111e+01 2.173e+00 -5.112 3.23e-07 ***
## acousticness 5.495e+00 1.014e+00 5.417 6.14e-08 ***
## instrumentalness -4.047e+02 2.853e+01 -14.184 < 2e-16 ***
## liveness -5.299e+00 1.517e+00 -3.493 0.000479 ***
## valence 2.141e+00 7.831e-01 2.734 0.006262 **
## tempo 3.179e-02 6.651e-03 4.779 1.77e-06 ***
## duration_ms -5.077e-05 3.263e-06 -15.559 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.97 on 19836 degrees of freedom
## Multiple R-squared: 0.06491, Adjusted R-squared: 0.06434
## F-statistic: 114.7 on 12 and 19836 DF, p-value: < 2.2e-16
Same as the other model, all values are significant except key. This model has an R^2 of 0.06328 which means that only 6.3% of the of the data’s variability can be explained by the regression model, which indicated that this is not a good model.
# Create a data frame to compare actual and predicted values
comparison_df_copy <- data.frame(Actual =train_data_copy$track_popularity, lm_predicted =lm_model_copy$fitted.values)
head(comparison_df_copy)
## Actual lm_predicted
## 1 0 34.13595
## 2 24 41.44849
## 3 39 43.20203
## 4 63 39.96939
## 5 40 34.15089
## 6 30 30.09493
dim(comparison_df_copy)
## [1] 19849 2
lm_mse_train_copy <- mean((lm_model_copy$fitted.values - train_data_copy$track_popularity)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train_copy, 2)))
## [1] "Training MSE for Linear Model: 527.19"
# Predict on testing data
lm_test_pred_copy <- predict(lm_model_copy, newdata = test_data_copy)
# Cal
lm_mse_test_copy <- mean((lm_test_pred_copy - test_data_copy$track_popularity)^2)
print(paste("Testing MSE for Linear Model:", round(lm_mse_test_copy, 2)))
## [1] "Testing MSE for Linear Model: 524.25"
Now, the MSE is still super high, but the model on the testing set (525.9) is lower than the model on the training set (528.11).
Overall both MSE with testing and training data on
spotify and spotify_copy (with
winsorized/truncated data) are very high. For truncated data the testing
set MSE is 525.9 and the training set MSE is
528.11. For the normal data Testing MSE for Linear
Model is 527.51 and Training MSE for Linear Model is
529.94
spotify# Diagnostic Plots
par(mfrow = c(2, 2))
plot(lm_model_test)
###### Model assumption check
spotify_copy
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(lm_model_copy)
There are four assumptions when looking at a linear regression model. These include: - Linearity—relationship between X and Y is linear. - Homoscedasticity—constancy in variance. - Independence –observations are independent of each other. - Normality—observing if the model is normally distributed (bell shape).
I think that when looking at the plots in both data sets
(spotify and spotfy_copy) the assumptions are
not met. When looking at the Residuals-leverage graph, the x values do
not follow a linear line. It is more like clusters. Also the model is
not normally distributed. The residual points are not equally scattered
on the plot indicating that the variance is not equal or constant.
Overall, this model might be good to predict which variables are
significant, but not for further analysis.
lm_model_int <- lm(track_popularity ~ danceability + loudness + danceability*loudness, data = spotify)
summary(lm_model_int)
##
## Call:
## lm(formula = track_popularity ~ danceability + loudness + danceability *
## loudness, data = spotify)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.912 -18.249 3.069 18.643 60.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.4207 1.4221 19.985 < 2e-16 ***
## danceability 20.2423 2.1859 9.261 < 2e-16 ***
## loudness -0.8238 0.1768 -4.660 3.18e-06 ***
## danceability:loudness 1.7837 0.2743 6.502 8.07e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.64 on 28352 degrees of freedom
## Multiple R-squared: 0.004993, Adjusted R-squared: 0.004887
## F-statistic: 47.42 on 3 and 28352 DF, p-value: < 2.2e-16
As we saw before, almost every variables is significant towards
track populariry, so it is very easy to find interactions.
We performed an interaction which was significant. However the R^2 is
too low. Now it is 0.5%. The new coefficient to account for this
interaction is 1.784 indicating that if there was an increase of 1 in
the danceability variable that 1.784 would be multiplied by
the value of loudnessvariable in the regression
equation.
lm_model_int <- lm(track_popularity ~ danceability + loudness , data = spotify)
summary(lm_model_int)
##
## Call:
## lm(formula = track_popularity ~ danceability + loudness, data = spotify)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.803 -18.306 2.989 18.645 59.569
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.38628 0.72256 50.357 < 2e-16 ***
## danceability 7.48487 0.96398 7.765 8.47e-15 ***
## loudness 0.28557 0.04629 6.170 6.93e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.66 on 28353 degrees of freedom
## Multiple R-squared: 0.003509, Adjusted R-squared: 0.003439
## F-statistic: 49.92 on 2 and 28353 DF, p-value: < 2.2e-16
If we perform the same model with no interaction the R^2 is 0.035%. This means that 0.035% of variance can be explained by tgis model, indicating that it is not a good model.
## Train the KNN Model with training dataset
# Train the KNN model
knn_model <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 5)
# Predict on training data
knn_train_pred <- fitted.values(knn_model)
# Calculate in-sample MSE manually
knn_train_mse <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse))
## [1] "In-Sample MSE for KNN: 194.903869196156"
We can see that the MSE is very big, however lower than lm. It is 195
for the training set and spotify dataset.
# Predict on testing data
knn_model_test <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 5)
knn_test_pred <- fitted.values(knn_model_test)
# Calculate out-of-sample MSE manually
knn_test_mse <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Out-of-Sample MSE for KNN: ", knn_test_mse))
## [1] "Out-of-Sample MSE for KNN: 687.024025645934"
Now, doing a KNN on the testing data, increases the MSE to 687.
# Train the KNN model spotify_copy
knn_model_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 5)
# Create a data frame to compare actual and predicted values
comparison_df <- data.frame(Actual = train_data$track_popularity, lm_predicted =knn_model_copy$fitted.values)
head(comparison_df)
## Actual lm_predicted
## 1 0 18.33174
## 2 24 43.54473
## 3 39 39.76225
## 4 63 44.50652
## 5 40 42.82548
## 6 30 34.46911
dim(comparison_df)
## [1] 19849 2
# Create a data frame to compare actual and predicted values
comparison_df$knn_std_predicted_copy <- knn_model_copy$fitted.values
head(comparison_df)
## Actual lm_predicted knn_std_predicted_copy
## 1 0 18.33174 18.33174
## 2 24 43.54473 43.54473
## 3 39 39.76225 39.76225
## 4 63 44.50652 44.50652
## 5 40 42.82548 42.82548
## 6 30 34.46911 34.46911
# Predict on training data
knn_train_pred_copy <- fitted.values(knn_model_copy)
# Calculate in-sample MSE manually
knn_train_mse_copy <- mean((train_data_copy$track_popularity - knn_train_pred_copy)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse_copy))
## [1] "In-Sample MSE for KNN: 194.882087277794"
For the training set in spotify_copy , now it is 194.
Lets do it on the testing set with the truncated variables.
# Predict on testing data
knn_model_test_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 5)
knn_test_pred_copy <- fitted.values(knn_model_test_copy)
# Calculate out-of-sample MSE manually
knn_test_mse_copy <- mean((test_data_copy$track_popularity - knn_test_pred_copy)^2)
print(paste("Out-of-Sample MSE for KNN: ", knn_test_mse_copy))
## [1] "Out-of-Sample MSE for KNN: 686.330902362197"
Now the values for MSE in knn model for training and testing set on
spotify and spotify_copy are the following.
spotify has a MSE of 194.903 for training
set and a MSE of 687.024 in the testing set.
spotify_copy has a MSE of 194.882 on
training set and a MSE of 686.331 on testing set.
Therefore the truncation and winsorization of the variables do not influence a lot the overall model, but it lowers a bit the MSE.
First spotify data
# Predict on training data
knn_model_spotify_7 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 7)
knn_train_pred <- fitted.values(knn_model_spotify_7)
# Calculate out-of-sample MSE manually
knn_train_mse_7 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_7))
## [1] "Traning MSE for KNN: 252.314205153333"
# Predict on training data
knn_model_spotify_20 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 20)
knn_train_pred <- fitted.values(knn_model_spotify_20)
# Calculate out-of-sample MSE manually
knn_train_mse_20 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_20))
## [1] "Traning MSE for KNN: 389.061411631355"
# Predict on training data
knn_model_spotify_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 50)
knn_train_pred <- fitted.values(knn_model_spotify_50)
# Calculate out-of-sample MSE manually
knn_train_mse_50 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_50))
## [1] "Traning MSE for KNN: 456.16237397779"
# Predict on training data
knn_model_spotify_500 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = train_data, k = 500)
knn_train_pred <- fitted.values(knn_model_spotify_500)
# Calculate out-of-sample MSE manually
knn_train_mse_500 <- mean((train_data$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_500))
## [1] "Traning MSE for KNN: 517.746609308476"
We tried with different k’s and saw that (with train data on spotify):
7=252.3142 20=389.06 50= 456.1624 500= 517.746 As always the smaller the K, less MSE.
Now lets try it on testing set:
# Predict on testing data
knn_model_spotify_test <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 7)
knn_test_pred <- fitted.values(knn_model_spotify_test)
# Calculate out-of-sample MSE manually
knn_test_mse <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse))
## [1] "Traning MSE for KNN: 639.188169931466"
# Predict on testing data
knn_model_spotify_test_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 50)
knn_test_pred <- fitted.values(knn_model_spotify_test_50)
# Calculate out-of-sample MSE manually
knn_test_mse_50 <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_50))
## [1] "Traning MSE for KNN: 531.741262685869"
# Predict on testing data
knn_model_spotify_test_200 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data, test = test_data, k = 200)
knn_test_pred <- fitted.values(knn_model_spotify_test_200)
# Calculate out-of-sample MSE manually
knn_test_mse_200 <- mean((test_data$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_200))
## [1] "Traning MSE for KNN: 522.46584494462"
When trying different k’s with the testing model, we get the
following MSE values: 7=639.18 50 = 531.741 200 = 522.465
This means that the MSE in the training set was smaller that in the
testing set. Lets try it with spotify_copy
Now it is the other way. On the testing set the smaller the k, the bigger MSE, and viceversa.
First spotify_copy data with training set.
# Predict on training data -> k=7
knn_model_train_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 7)
knn_train_pred <- fitted.values(knn_model_train_copy)
# Calculate out-of-sample MSE manually
knn_train_mse_copy <- mean((train_data_copy$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_copy))
## [1] "Traning MSE for KNN: 252.96950348016"
# Predict on training data -> k=50
knn_model_train_copy_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 50)
knn_train_pred <- fitted.values(knn_model_train_copy_50)
# Calculate out-of-sample MSE manually
knn_train_mse_copy_50 <- mean((train_data_copy$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_copy_50))
## [1] "Traning MSE for KNN: 454.470364045298"
# Predict on training data -> k=200
knn_model_train_copy_200 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = train_data_copy, k = 200)
knn_train_pred <- fitted.values(knn_model_train_copy_200)
# Calculate out-of-sample MSE manually
knn_train_mse_copy_200 <- mean((train_data_copy$track_popularity - knn_train_pred)^2)
print(paste("Traning MSE for KNN: ", knn_train_mse_copy_200))
## [1] "Traning MSE for KNN: 501.613661177734"
Now with spotify_copy we see the following. On the
training set we get these results: 7= 252.969 50= 454.470 200=
501.613
On the testing set we get these:
# Predict on testing data
knn_model_test_copy <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 7)
knn_test_pred <- fitted.values(knn_model_test_copy)
# Calculate out-of-sample MSE manually
knn_test_mse_copy <- mean((test_data_copy$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_copy))
## [1] "Traning MSE for KNN: 639.716566097559"
# Predict on testing data
knn_model_test_copy_50 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 50)
knn_test_pred <- fitted.values(knn_model_test_copy_50)
# Calculate out-of-sample MSE manually
knn_test_mse_copy_50 <- mean((test_data_copy$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_copy_50))
## [1] "Traning MSE for KNN: 532.925151847685"
# Predict on testing data
knn_model_test_copy_200 <- kknn(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, train = train_data_copy, test = test_data_copy, k = 200)
knn_test_pred <- fitted.values(knn_model_test_copy_200)
# Calculate out-of-sample MSE manually
knn_test_mse_copy_200 <- mean((test_data_copy$track_popularity - knn_test_pred)^2)
print(paste("Traning MSE for KNN: ", knn_test_mse_copy_200))
## [1] "Traning MSE for KNN: 522.191267489406"
With spotify and training set we get
the following results for MSE:
7=252.3142 20=389.06 50= 456.1624 500= 517.746
Now with spotify_copy and training set
we get the following results for MSE: 7= 252.969 50= 454.470 200=
501.613
A: if we compare the data with winsorized variables, we see that the MSE might be samller, but there is not a big difference.
With spotify and testing set we get the
following results for MSE:
7=639.18 50 = 531.741 200 = 522.465
With spotify_copy and testing set we
get the following results for MSE:
k=7: 639.716 k=50: 532.925 k=200: 522.191
A: Here again, it is almost the same. spotify_copy might
be a better fit,, but the difference is minimal. Something that I did
not expected was that for the training sets, if the k is smaller the MSE
is smaller (usually that is what happens). However, on the testing set
it is the opposite (if k is smaller, MSE is bigger.) Since what we want
is a high k and low MSE, I think that implementing the KNN model on the
testing set might not be a bad idea. It is true that MSE is still very
high, but at least it is getting lower when increasing k.
As we saw before in the analysis a lot of variables have outliers.
These variables are danceability, energy,
loudness, speechiness,
acousticness, instrumentalness,
liveness, duration. We have created a new
dataset called spotify_new with all truncated and
winosrized variables. In the analysis we performed both lm and knn to
both datasets with training and testing sets and we got the following:
In knn the training set with spotify_copy, the MSE is a bit
smaller than with spotify. However, the difference is
minimal. In the testing set, in knn the MSE is also smaller with
spotify_copy. However, only some ecimals smaller. NOthing
that gets your attention.
For linear models, overall both MSE with testing and training data on
spotify and spotify_copy (with
winsorized/truncated data) are very high. For truncated data the testing
set MSE is 524.25 and the training set MSE is
527.19. For the normal data Testing MSE for Linear
Model is 527.51 and Training MSE for Linear Model is
529.94. This means that there is less error when the
outliers are removed.
# Train the linear regression model, compring popularity to rest of numeric parameters on spotify data
lm_model_spotify <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = spotify)
summary(lm_model_spotify)
##
## Call:
## lm(formula = track_popularity ~ danceability + energy + key +
## loudness + mode + speechiness + acousticness + instrumentalness +
## liveness + valence + tempo + duration_ms, data = spotify)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.62 -17.22 2.95 18.10 60.54
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.783e+01 1.704e+00 39.805 < 2e-16 ***
## danceability 3.721e+00 1.072e+00 3.470 0.000522 ***
## energy -2.321e+01 1.220e+00 -19.028 < 2e-16 ***
## key 3.190e-03 3.844e-02 0.083 0.933870
## loudness 1.156e+00 6.527e-02 17.711 < 2e-16 ***
## mode 8.616e-01 2.809e-01 3.067 0.002161 **
## speechiness -6.328e+00 1.380e+00 -4.587 4.52e-06 ***
## acousticness 4.331e+00 7.466e-01 5.801 6.67e-09 ***
## instrumentalness -9.292e+00 6.255e-01 -14.856 < 2e-16 ***
## liveness -4.280e+00 8.990e-01 -4.761 1.93e-06 ***
## valence 1.788e+00 6.565e-01 2.724 0.006458 **
## tempo 2.609e-02 5.239e-03 4.979 6.42e-07 ***
## duration_ms -4.342e-05 2.294e-06 -18.925 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.01 on 28343 degrees of freedom
## Multiple R-squared: 0.05808, Adjusted R-squared: 0.05768
## F-statistic: 145.6 on 12 and 28343 DF, p-value: < 2.2e-16
# Train the linear regression model, compring popularity to rest of numeric parameters on spotify data
lm_model_spotify_copy <- lm(track_popularity ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo + duration_ms, data = spotify_copy)
summary(lm_model_spotify_copy)
##
## Call:
## lm(formula = track_popularity ~ danceability + energy + key +
## loudness + mode + speechiness + acousticness + instrumentalness +
## liveness + valence + tempo + duration_ms, data = spotify_copy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.471 -17.326 2.985 17.960 62.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.123e+01 1.774e+00 40.166 < 2e-16 ***
## danceability 3.819e+00 1.083e+00 3.527 0.000421 ***
## energy -2.323e+01 1.228e+00 -18.913 < 2e-16 ***
## key 1.029e-02 3.834e-02 0.268 0.788384
## loudness 1.325e+00 7.559e-02 17.527 < 2e-16 ***
## mode 8.320e-01 2.803e-01 2.968 0.002996 **
## speechiness -9.644e+00 1.811e+00 -5.324 1.02e-07 ***
## acousticness 4.762e+00 8.501e-01 5.602 2.14e-08 ***
## instrumentalness -4.055e+02 2.383e+01 -17.015 < 2e-16 ***
## liveness -5.189e+00 1.266e+00 -4.099 4.16e-05 ***
## valence 1.846e+00 6.541e-01 2.822 0.004778 **
## tempo 2.810e-02 5.562e-03 5.051 4.41e-07 ***
## duration_ms -5.090e-05 2.737e-06 -18.595 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.95 on 28343 degrees of freedom
## Multiple R-squared: 0.06326, Adjusted R-squared: 0.06287
## F-statistic: 159.5 on 12 and 28343 DF, p-value: < 2.2e-16
For R^2, The R^2 for the spotify data was 0.05409 and
for the sppotify_copy data it was 0.05409. It was the same,
menaing that 5.4% of variance can be explained by the model.
Lets create a copy of the data set for standardized variables.
spotify_stand <- spotify
As we saw before in the analysis a lot of variables have outliers.
These variables are danceability, energy,
loudness, speechiness,
acousticness, instrumentalness,
liveness, duration. Lets standardize these
variables in spotify_stand to look at the knn model.
min_danceability <- min(spotify_stand$danceability)
max_danceability <- max(spotify_stand$danceability)
spotify_stand$danceability <- (spotify_stand$danceability - min_danceability) / (max_danceability - min_danceability)
summary(spotify_stand$danceability)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.5707 0.6816 0.6647 0.7731 1.0000
min_energy <- min(spotify_stand$energy)
max_energy <- max(spotify_stand$energy)
spotify_stand$energy <- (spotify_stand$energy - min_energy) / (max_energy - min_energy)
summary(spotify_stand$energy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.5789 0.7220 0.6983 0.8430 1.0000
min_loudness <- min(spotify_stand$loudness)
max_loudness <- max(spotify_stand$loudness)
spotify_stand$loudness <- (spotify_stand$loudness - min_loudness) / (max_loudness - min_loudness)
min_speechiness <- min(spotify_stand$speechiness)
max_speechiness <- max(spotify_stand$speechiness)
spotify_stand$speechiness <- (spotify_stand$speechiness - min_speechiness) / (max_speechiness - min_speechiness)
min_acousticness <- min(spotify_stand$acousticness)
max_acousticness <- max(spotify_stand$acousticness)
spotify_stand$acousticness <- (spotify_stand$acousticness - min_acousticness) / (max_acousticness - min_acousticness)
min_instrumentalness <- min(spotify_stand$instrumentalness)
max_instrumentalness <- max(spotify_stand$instrumentalness)
spotify_stand$instrumentalness <- (spotify_stand$instrumentalness - min_instrumentalness) / (max_instrumentalness - min_instrumentalness)
min_liveness <- min(spotify_stand$liveness)
max_liveness <- max(spotify_stand$liveness)
spotify_stand$liveness <- (spotify_stand$liveness - min_liveness) / (max_liveness - min_liveness)
min_duration <- min(spotify_stand$duration_ms)
max_duration <- max(spotify_stand$duration_ms)
spotify_stand$duration_ms <- (spotify_stand$duration_ms - min_duration) / (max_duration - min_duration)
k <- 5
knn_spotify <- kknn(track_popularity ~ danceability + energy + key +
loudness + mode + speechiness + acousticness + instrumentalness +
liveness + valence + tempo + duration_ms, spotify, spotify, k=k)
# Create a data frame to compare actual and predicted values
comparison_spotify <- data.frame(Actual = spotify$track_popularity, knn_predicted = knn_spotify$fitted.values)
head(comparison_spotify)
## Actual knn_predicted
## 1 66 44.32066
## 2 67 63.32553
## 3 70 59.90562
## 4 60 55.16832
## 5 69 53.92104
## 6 67 40.56705
knn_spotify_mse <- mean((knn_spotify$fitted.values - spotify$track_popularity)^2)
print(paste("Mean Squared Error for spotifyKNN:", round(knn_spotify_mse, 2)))
## [1] "Mean Squared Error for spotifyKNN: 197.96"
Without standardize data the KNN MSE is 197.96. Lets look at the KNN with standardized data.
k <- 5
knn_spotify_stand <- kknn(track_popularity ~ danceability + energy + key +
loudness + mode + speechiness + acousticness + instrumentalness +
liveness + valence + tempo + duration_ms, spotify_stand, spotify_stand, k=k)
knn_spotify_stand_mse <- mean((knn_spotify_stand$fitted.values - spotify_stand$track_popularity)^2)
print(paste("Mean Squared Error for spotify_standKNN:", round(knn_spotify_stand_mse, 2)))
## [1] "Mean Squared Error for spotify_standKNN: 197.96"
The KNN is the same, which means that standardized data do not influence whether MSE is going ti be smaller on the KNN.
After we created 2 new datasets, one with standardized data, one with
truncated and winsorized data, we could conclude the following after
doing lm and knn and splitting the data into training and testing set.
As far as the linear regression models, we observed that it mighnt not
be a good model for this data. I think that when looking at the plots in
both data sets (spotify and spotfy_copy) the 4
assumptions for linear models are not met. When looking at the
Residuals-leverage graph, the x values do not follow a linear line. It
is more like clusters. Also the model is not normally distributed. The
residual points are not equally scattered on the plot indicating that
the variance is not equal or constant. Overall both MSE with testing and
training data on spotify and spotify_copy
(with winsorized/truncated data) are very high. For truncated data the
testing set MSE is 525.9 and the training set MSE is
528.11.For the normal data Testing MSE for Linear Model
is 527.51 and Training MSE for Linear Model is
529.94. The R^2 is also too low and only around 4% of
variance in the variable could be explained by lm. For the data with
truncated variables, the MSE improves only a bit. Not enough for it to
be a good model.
knn model might be better than lm. It is one of the
simplest yet powerful algorithms used for classification and regression
tasks in machine learning and statistics. It is used for large datasets.
There does not have to be a linear line, it predicts based on neighbours
and distance. When looking at the plots for this dataset we can see that
th points are plotted as clusters, they do not fit a line. Knn is
probably better for prediction than lm.
After realizing knn on both datasets we got the following results:
spotify_copy might be a better fit, but the difference is
minimal. Something that I did not expected was that for the training
sets, if the k is smaller the MSE is smaller (usually that is what
happens). However, on the testing set it is the opposite (if k is
smaller, MSE is bigger.) Since what we want is a high k and low MSE, I
think that implementing the KNN model on the testing set might not be a
bad idea. It is true that MSE is still very high, but at least it is
getting lower when increasing k. We could have a larger mount of k’s and
decrease MSE. However, as we saw inthe reults MSE was also super high.
The lowest mSE we found was 197. It is a better model than lm, but still
it is not a good enough model. This might be becasue the variables we
chose for the analysis were only the numeric variables, not the rest.
This could mean that danceablility, loudness, valence etc. might not be
as stromg predictors to predict song popularitty. However, from the
results we can see that knn is a better model than lm for this dataset.
Standardization of variables also leads to same MSE, beacuse when
performing knn the variables are automatically standardized.