First, I will import the necessary libraries:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.3.3
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
# Read the chart data from the working directory, then print a per-column
# summary as a first look at the variables.
spotify <- read.csv(file = "spotify.csv")
summary(spotify)
## id artist_names track_name source
## Length:6513 Length:6513 Length:6513 Length:6513
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## key mode time_signature danceability
## Length:6513 Length:6513 Length:6513 Min. :0.1500
## Class :character Class :character Class :character 1st Qu.:0.5910
## Mode :character Mode :character Mode :character Median :0.6980
## Mean :0.6817
## 3rd Qu.:0.7850
## Max. :0.9850
## energy speechiness acousticness instrumentalness
## Min. :0.0218 Min. :0.0232 Min. :0.0000075 Min. :0.0000000
## 1st Qu.:0.5340 1st Qu.:0.0440 1st Qu.:0.0444000 1st Qu.:0.0000000
## Median :0.6510 Median :0.0722 Median :0.1450000 Median :0.0000000
## Mean :0.6365 Mean :0.1219 Mean :0.2367610 Mean :0.0124687
## 3rd Qu.:0.7590 3rd Qu.:0.1630 3rd Qu.:0.3560000 3rd Qu.:0.0000406
## Max. :0.9890 Max. :0.9660 Max. :0.9940000 Max. :0.9530000
## liveness valence loudness tempo
## Min. :0.0197 Min. :0.0320 Min. :-34.475 Min. : 46.72
## 1st Qu.:0.0974 1st Qu.:0.3160 1st Qu.: -7.564 1st Qu.: 98.01
## Median :0.1240 Median :0.4890 Median : -5.983 Median :120.03
## Mean :0.1802 Mean :0.4924 Mean : -6.351 Mean :122.12
## 3rd Qu.:0.2190 3rd Qu.:0.6690 3rd Qu.: -4.673 3rd Qu.:142.03
## Max. :0.9770 Max. :0.9820 Max. : 1.509 Max. :212.12
## duration_ms weeks_on_chart streams
## Min. : 30133 Min. : 1.00 Min. :2.525e+06
## 1st Qu.:173038 1st Qu.: 1.00 1st Qu.:8.695e+06
## Median :198367 Median : 4.00 Median :2.713e+07
## Mean :202567 Mean : 13.61 Mean :1.090e+08
## 3rd Qu.:226003 3rd Qu.: 16.00 3rd Qu.:9.951e+07
## Max. :690732 Max. :367.00 Max. :3.528e+09
# Song counts per key, with one dodged bar per mode.
ggplot(data = spotify) +
  geom_bar(mapping = aes(x = key, fill = mode), position = "dodge") +
  scale_fill_manual(values = c("goldenrod", "navy")) +
  labs(title = "Prevalence of Keys in Top 200 on Spotify since 2016") +
  theme_bw()

# Song counts per time signature, with one dodged bar per mode.
ggplot(data = spotify) +
  geom_bar(
    mapping = aes(x = time_signature, fill = mode),
    position = "dodge"
  ) +
  scale_fill_manual(values = c("goldenrod", "navy")) +
  labs(
    title = "Prevalence of Time Signature in Top 200 on Spotify since 2016"
  ) +
  theme_bw()
### Song Counts by Source (Label) for Sources With More Than 50 Songs on
the List Note: Since some record labels may be represented by multiple
different subsidiaries, some counts may be underestimated.
# Attach each row's per-source row count as `count`, then keep only the rows
# whose source appears more than 50 times.
popsource <- spotify |>
  group_by(source) |>
  mutate(count = n()) |>
  ungroup() |>
  filter(count > 50)

# Bar chart of song counts for those sources; x labels are rotated 90 degrees.
ggplot(data = popsource) +
  geom_bar(mapping = aes(x = source), fill = "navy") +
  labs(
    title = "Prevalence of Sources of Song in Top 200 on Spotify since 2016"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
### Song Counts for Artists With More Than 25 Songs That Have Made the
List Note: Collaborations could not be easily considered with this, so
actual song counts for these artists may be underestimated.
# Attach each row's per-artist_names row count as `count`, then keep only the
# rows whose artist_names value appears more than 25 times.
artsource <- spotify |>
  group_by(artist_names) |>
  mutate(count = n()) |>
  ungroup() |>
  filter(count > 25)

# Bar chart of song counts for those artists; x labels are rotated 90 degrees.
ggplot(data = artsource) +
  geom_bar(mapping = aes(x = artist_names), fill = "navy") +
  labs(
    title = "Prevalence of Artists of Songs in Top 200 on Spotify since 2016"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Histogram of weeks spent on the chart with a vertical line at the mean.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The mean is computed from the bare column inside aes() instead of
#   `spotify$...`, which ggplot2 discourages.
# - The legend label is derived from the data rather than typed by hand, so it
#   cannot drift from the plotted line.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = weeks_on_chart), bins = 30, fill = "slateblue") +
  geom_vline(
    aes(
      xintercept = mean(weeks_on_chart),
      color = sprintf("%.1f Weeks", mean(weeks_on_chart))
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Time Spent in Top 200 on Spotify since 2016",
    color = "Mean Time On Chart"
  ) +
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Use of `spotify$weeks_on_chart` is discouraged.
## ℹ Use `weeks_on_chart` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Distribution of Song Length
# Histogram of song length (ms) with a vertical line at the mean.
# - The legend label is computed from the data; the previous hand-typed value
#   ("202556.7 ms") did not agree with the mean reported by summary(spotify).
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = duration_ms), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(duration_ms),
      color = sprintf("%.1f ms", mean(duration_ms))
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Song Length in Top 200 on Spotify since 2016",
    color = "Mean Length"
  ) +
  theme_bw()
## Warning: Use of `spotify$duration_ms` is discouraged.
## ℹ Use `duration_ms` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Distribution of Danceability
# Histogram of danceability with a vertical line at the mean.
# - The legend label is computed from the data; the previous hand-typed value
#   (".68117307") did not agree with the mean reported by summary(spotify).
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = danceability), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(danceability),
      color = format(mean(danceability), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Danceability in Top 200 on Spotify since 2016",
    color = "Mean Danceability"
  ) +
  theme_bw()
## Warning: Use of `spotify$danceability` is discouraged.
## ℹ Use `danceability` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of energy with a vertical line at the mean.
# - The legend label is computed from the data instead of being typed by hand.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = energy), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(energy),
      color = format(mean(energy), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Energy in Top 200 on Spotify since 2016",
    color = "Mean Energy"
  ) +
  theme_bw()
## Warning: Use of `spotify$energy` is discouraged.
## ℹ Use `energy` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of speechiness with a vertical line at the mean.
# - The legend label is computed from the data instead of being typed by hand.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = speechiness), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(speechiness),
      color = format(mean(speechiness), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Speechiness in Top 200 on Spotify since 2016",
    color = "Mean Speechiness"
  ) +
  theme_bw()
## Warning: Use of `spotify$speechiness` is discouraged.
## ℹ Use `speechiness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of acousticness with a vertical line at the mean.
# - The legend label is computed from the data instead of being typed by hand.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = acousticness), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(acousticness),
      color = format(mean(acousticness), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Acousticness in Top 200 on Spotify since 2016",
    color = "Mean Acousticness"
  ) +
  theme_bw()
## Warning: Use of `spotify$acousticness` is discouraged.
## ℹ Use `acousticness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of instrumentalness with a vertical line at the mean.
# - The legend label is computed from the data; the previous hand-typed label
#   carried a stray leading space.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(
    aes(x = instrumentalness),
    bins = 30,
    fill = "cornflowerblue"
  ) +
  geom_vline(
    aes(
      xintercept = mean(instrumentalness),
      color = format(mean(instrumentalness), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Instrumentalness in Top 200 on Spotify since 2016",
    color = "Mean Instrumentalness"
  ) +
  theme_bw()
## Warning: Use of `spotify$instrumentalness` is discouraged.
## ℹ Use `instrumentalness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of liveness with a vertical line at the mean.
# - The legend label is computed from the data; the previous hand-typed label
#   ("0.1801681sp") ended in stray characters.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = liveness), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(liveness),
      color = format(mean(liveness), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Liveness in Top 200 on Spotify since 2016",
    color = "Mean Liveness"
  ) +
  theme_bw()
## Warning: Use of `spotify$liveness` is discouraged.
## ℹ Use `liveness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of valence with a vertical line at the mean.
# - The legend label is computed from the data instead of being typed by hand.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = valence), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(valence),
      color = format(mean(valence), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Valence in Top 200 on Spotify since 2016",
    color = "Mean Valence"
  ) +
  theme_bw()
## Warning: Use of `spotify$valence` is discouraged.
## ℹ Use `valence` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of loudness with a vertical line at the mean.
# - The legend label is computed from the data instead of being typed by hand.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = loudness), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(loudness),
      color = format(mean(loudness), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Loudness in Top 200 on Spotify since 2016",
    color = "Mean Loudness"
  ) +
  theme_bw()
## Warning: Use of `spotify$loudness` is discouraged.
## ℹ Use `loudness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of tempo with a vertical line at the mean.
# - The legend label is computed from the data instead of being typed by hand.
# - `linewidth` replaces `size`, which is deprecated for lines since
#   ggplot2 3.4.0.
# - The bare column is used inside aes() instead of `spotify$...`.
# - `bins = 30` makes the previously implicit default explicit.
ggplot(data = spotify) +
  geom_histogram(aes(x = tempo), bins = 30, fill = "cornflowerblue") +
  geom_vline(
    aes(
      xintercept = mean(tempo),
      color = format(mean(tempo), digits = 4)
    ),
    linewidth = 1.5
  ) +
  labs(
    title = "Distribution of Tempo in Top 200 on Spotify since 2016",
    color = "Mean Tempo"
  ) +
  theme_bw()
## Warning: Use of `spotify$tempo` is discouraged.
## ℹ Use `tempo` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Building an inferential model to find the best linear model for predicting weeks spent on the chart:
# Numeric subset used for the pairwise plots and the correlation matrix.
# Columns are selected by name rather than by dropping positions 1-7 and 19,
# so the subset does not silently change if the CSV column order changes.
# The names and their order match the columns the positional drop kept
# (per the summary(spotify) output).
spotify2 <- spotify[c(
  "danceability", "energy", "speechiness", "acousticness",
  "instrumentalness", "liveness", "valence", "loudness", "tempo",
  "duration_ms", "weeks_on_chart"
)]

# Scatterplot matrix of every pair of numeric columns.
pairs(spotify2)

# One scatter plot of weeks_on_chart against each audio feature.
# NOTE(review): liveness is in spotify2 but has no scatter plot here; confirm
# whether that omission is intentional.
ggplot(data = spotify2) + geom_point(aes(x = danceability, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = energy, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = speechiness, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = acousticness, y = weeks_on_chart))
ggplot(data = spotify2) +
  geom_point(aes(x = instrumentalness, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = valence, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = loudness, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = tempo, y = weeks_on_chart))
ggplot(data = spotify2) + geom_point(aes(x = duration_ms, y = weeks_on_chart))

# Correlation matrix of the numeric columns, drawn as a colored grid with the
# coefficients printed in each cell.
corrs <- cor(spotify2)
corrplot(
  corrs,
  method = "color",
  addCoef.col = "black",
  number.cex = .5,
  tl.cex = .75,
  col = colorRampPalette(c("navy", "white", "darkred"))(10)
)
# Baseline model: weeks_on_chart regressed on key, mode, time signature and
# every audio feature, with raw quadratic terms for loudness and duration_ms.
# glm() is called without `family`, so the default (gaussian) is used.
# `raw = TRUE` replaces `raw = T`: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
baselm <- glm(
  weeks_on_chart ~ key + mode + time_signature + danceability + energy +
    speechiness + acousticness + instrumentalness + liveness + valence +
    poly(loudness, 2, raw = TRUE) + tempo +
    poly(duration_ms, 2, raw = TRUE),
  data = spotify
)
summary(baselm)
##
## Call:
## glm(formula = weeks_on_chart ~ key + mode + time_signature +
## danceability + energy + speechiness + acousticness + instrumentalness +
## liveness + valence + poly(loudness, 2, raw = T) + tempo +
## poly(duration_ms, 2, raw = T), data = spotify)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.079e+01 6.606e+00 1.634 0.102370
## keyA#/Bb 2.627e+00 1.528e+00 1.720 0.085527 .
## keyB 4.272e+00 1.454e+00 2.937 0.003321 **
## keyC 6.466e-01 1.386e+00 0.467 0.640831
## keyC#/Db 1.867e+00 1.314e+00 1.421 0.155285
## keyD 2.143e+00 1.486e+00 1.442 0.149337
## keyD#/Eb -1.032e+00 2.060e+00 -0.501 0.616526
## keyE 1.915e+00 1.567e+00 1.222 0.221704
## keyF 2.694e+00 1.476e+00 1.825 0.068044 .
## keyF#/Gb 1.642e+00 1.497e+00 1.096 0.272964
## keyG 4.902e-01 1.435e+00 0.341 0.732749
## keyG#/Ab 2.741e+00 1.464e+00 1.872 0.061276 .
## modeMinor -7.394e-01 6.344e-01 -1.166 0.243819
## time_signature3 beats -1.072e+00 4.979e+00 -0.215 0.829595
## time_signature4 beats 2.887e-01 4.829e+00 0.060 0.952337
## time_signature5 beats -4.200e+00 5.323e+00 -0.789 0.430155
## danceability 5.007e-01 2.493e+00 0.201 0.840787
## energy -8.508e+00 3.018e+00 -2.819 0.004834 **
## speechiness -1.617e+01 2.777e+00 -5.821 6.11e-09 ***
## acousticness 1.174e+00 1.507e+00 0.779 0.435869
## instrumentalness -1.927e+00 4.091e+00 -0.471 0.637689
## liveness -6.698e+00 2.187e+00 -3.063 0.002204 **
## valence 5.324e+00 1.562e+00 3.409 0.000655 ***
## poly(loudness, 2, raw = T)1 9.700e-01 3.663e-01 2.648 0.008123 **
## poly(loudness, 2, raw = T)2 1.858e-02 1.835e-02 1.013 0.311119
## tempo -8.093e-03 1.031e-02 -0.785 0.432459
## poly(duration_ms, 2, raw = T)1 1.002e-04 2.193e-05 4.571 4.94e-06 ***
## poly(duration_ms, 2, raw = T)2 -1.715e-10 4.376e-11 -3.919 9.00e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 569.8593)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3695537 on 6485 degrees of freedom
## AIC: 59841
##
## Number of Fisher Scoring iterations: 2
# Reduced model: key, energy, speechiness, liveness, valence and a raw
# quadratic in duration_ms. glm() is called without `family`, so the default
# (gaussian) is used.
# `raw = TRUE` replaces `raw = T`: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
lm2 <- glm(
  weeks_on_chart ~ key + energy + speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotify
)
summary(lm2)
##
## Call:
## glm(formula = weeks_on_chart ~ key + energy + speechiness + liveness +
## valence + poly(duration_ms, 2, raw = T), data = spotify)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.049e-01 3.047e+00 -0.133 0.894284
## keyA#/Bb 2.589e+00 1.524e+00 1.699 0.089432 .
## keyB 4.209e+00 1.449e+00 2.904 0.003696 **
## keyC 9.279e-01 1.380e+00 0.672 0.501368
## keyC#/Db 2.161e+00 1.305e+00 1.656 0.097778 .
## keyD 2.413e+00 1.477e+00 1.634 0.102316
## keyD#/Eb -6.991e-01 2.057e+00 -0.340 0.733966
## keyE 2.054e+00 1.563e+00 1.314 0.188755
## keyF 2.711e+00 1.473e+00 1.841 0.065624 .
## keyF#/Gb 1.752e+00 1.494e+00 1.173 0.240896
## keyG 5.358e-01 1.432e+00 0.374 0.708349
## keyG#/Ab 3.066e+00 1.457e+00 2.104 0.035434 *
## energy -1.918e+00 1.976e+00 -0.970 0.331986
## speechiness -1.785e+01 2.654e+00 -6.726 1.90e-11 ***
## liveness -7.157e+00 2.172e+00 -3.294 0.000992 ***
## valence 6.013e+00 1.438e+00 4.181 2.93e-05 ***
## poly(duration_ms, 2, raw = T)1 1.089e-04 2.155e-05 5.052 4.48e-07 ***
## poly(duration_ms, 2, raw = T)2 -1.898e-10 4.310e-11 -4.403 1.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 570.9894)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3708576 on 6495 degrees of freedom
## AIC: 59843
##
## Number of Fisher Scoring iterations: 2
# Expand `key` into indicator columns (key_B, key_D, key_F, ...) so single keys
# can enter a model on their own.
spotifydummykeys <- dummy_cols(spotify, select_columns = "key")

# Model using only the B, D and F key indicators alongside speechiness,
# liveness, valence and a raw quadratic in duration_ms.
lm3 <- glm(
  weeks_on_chart ~ key_B + key_D + key_F + speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummykeys
)
summary(lm3)
##
## Call:
## glm(formula = weeks_on_chart ~ key_B + key_D + key_F + speechiness +
## liveness + valence + poly(duration_ms, 2, raw = TRUE), data = spotifydummykeys)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.193e-01 2.825e+00 0.219 0.826488
## key_B 2.646e+00 1.062e+00 2.492 0.012726 *
## key_D 8.724e-01 1.099e+00 0.794 0.427195
## key_F 1.205e+00 1.094e+00 1.102 0.270602
## speechiness -1.722e+01 2.633e+00 -6.542 6.54e-11 ***
## liveness -7.592e+00 2.150e+00 -3.531 0.000417 ***
## valence 5.407e+00 1.324e+00 4.084 4.47e-05 ***
## poly(duration_ms, 2, raw = TRUE)1 1.055e-04 2.137e-05 4.939 8.06e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.835e-10 4.283e-11 -4.285 1.86e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 571.0831)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3714325 on 6504 degrees of freedom
## AIC: 59836
##
## Number of Fisher Scoring iterations: 2
# Same predictors as lm3 but with key_B as the only key indicator.
lm4 <- glm(
  weeks_on_chart ~ key_B + speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummykeys
)
summary(lm4)
##
## Call:
## glm(formula = weeks_on_chart ~ key_B + speechiness + liveness +
## valence + poly(duration_ms, 2, raw = TRUE), data = spotifydummykeys)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.800e-01 2.822e+00 0.276 0.782267
## key_B 2.462e+00 1.052e+00 2.340 0.019293 *
## speechiness -1.735e+01 2.630e+00 -6.599 4.46e-11 ***
## liveness -7.562e+00 2.150e+00 -3.517 0.000439 ***
## valence 5.448e+00 1.323e+00 4.119 3.85e-05 ***
## poly(duration_ms, 2, raw = TRUE)1 1.057e-04 2.136e-05 4.945 7.79e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.838e-10 4.282e-11 -4.292 1.80e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 571.056)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3715290 on 6506 degrees of freedom
## AIC: 59833
##
## Number of Fisher Scoring iterations: 2
# Model with no key terms: speechiness, liveness, valence and a raw quadratic
# in duration_ms, fitted on the full data set.
lm5 <- glm(
  weeks_on_chart ~ speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotify
)
summary(lm5)
##
## Call:
## glm(formula = weeks_on_chart ~ speechiness + liveness + valence +
## poly(duration_ms, 2, raw = TRUE), data = spotify)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.924e-01 2.822e+00 0.352 0.72506
## speechiness -1.714e+01 2.629e+00 -6.519 7.59e-11 ***
## liveness -7.590e+00 2.151e+00 -3.529 0.00042 ***
## valence 5.574e+00 1.322e+00 4.216 2.52e-05 ***
## poly(duration_ms, 2, raw = TRUE)1 1.050e-04 2.137e-05 4.915 9.10e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.827e-10 4.283e-11 -4.266 2.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 571.4489)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3718418 on 6507 degrees of freedom
## AIC: 59837
##
## Number of Fisher Scoring iterations: 2
# Artist-only model, fitted on artsource (the rows whose artist_names value
# appears more than 25 times).
lm6 <- glm(
  weeks_on_chart ~ artist_names,
  data = artsource
)
summary(lm6)
##
## Call:
## glm(formula = weeks_on_chart ~ artist_names, data = artsource)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.86207 5.88342 2.016 0.044003 *
## artist_namesAriana Grande 0.98793 7.16554 0.138 0.890364
## artist_namesBad Bunny 1.00936 6.99678 0.144 0.885319
## artist_namesBillie Eilish 9.15680 7.31811 1.251 0.211086
## artist_namesBTS -3.96066 6.98233 -0.567 0.570657
## artist_namesDrake -4.10531 6.60743 -0.621 0.534508
## artist_namesEd Sheeran 25.62682 7.54465 3.397 0.000704 ***
## artist_namesEminem 5.49904 7.90561 0.696 0.486822
## artist_namesHarry Styles 8.16293 7.72724 1.056 0.291005
## artist_namesImagine Dragons 28.97577 7.85780 3.688 0.000237 ***
## artist_namesJ. Cole -1.57635 8.39437 -0.188 0.851075
## artist_namesJuice WRLD 0.18793 7.16554 0.026 0.979081
## artist_namesJustin Bieber 4.28608 8.47309 0.506 0.613058
## artist_namesKanye West -7.63758 7.42299 -1.029 0.303730
## artist_namesLana Del Rey -5.86207 8.32041 -0.705 0.481233
## artist_namesLil Uzi Vert -2.78707 7.72724 -0.361 0.718401
## artist_namesOlivia Rodrigo 11.13793 8.25078 1.350 0.177294
## artist_namesPost Malone 4.38793 7.24845 0.605 0.545053
## artist_namesSam Smith 0.03448 8.32041 0.004 0.996694
## artist_namesShawn Mendes 6.73793 8.25078 0.817 0.414295
## artist_namesTaylor Swift -4.10778 6.35222 -0.647 0.517970
## artist_namesThe Weeknd 8.93793 7.39534 1.209 0.227058
## artist_namesTravis Scott 14.06897 8.32041 1.691 0.091116 .
## artist_namesXXXTENTACION 11.87603 7.64952 1.553 0.120801
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1003.824)
##
## Null deviance: 1298194 on 1225 degrees of freedom
## Residual deviance: 1206596 on 1202 degrees of freedom
## AIC: 11979
##
## Number of Fisher Scoring iterations: 2
# Source-only model, fitted on popsource (the rows whose source appears more
# than 50 times).
lm7 <- glm(
  weeks_on_chart ~ source,
  data = popsource
)
summary(lm7)
##
## Call:
## glm(formula = weeks_on_chart ~ source, data = popsource)
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 17.161 3.325 5.161
## sourceAtlantic Records 3.384 4.158 0.814
## sourceAtlantic Records UK 9.593 4.186 2.292
## sourceBIGHIT MUSIC -9.187 4.101 -2.240
## sourceColumbia -1.827 3.636 -0.503
## sourceDef Jam Recordings -11.643 4.826 -2.412
## sourceDisruptor Records/Columbia 1.349 4.949 0.272
## sourceGeneration Now/Atlantic -8.964 4.480 -2.001
## sourceGrade A Productions/Interscope Records -10.069 4.648 -2.167
## sourceIsland Records 4.952 4.898 1.011
## sourceOVO -9.989 4.782 -2.089
## sourcePolydor Records -3.312 4.521 -0.732
## sourceRCA Records Label -5.436 4.311 -1.261
## sourceRepublic Records -1.331 3.686 -0.361
## sourceRimas Entertainment LLC -1.841 4.067 -0.453
## sourceSony Music Latin 1.250 4.072 0.307
## sourceTaylor Swift -11.403 3.861 -2.954
## sourceUMLE - Latino 2.344 4.351 0.539
## sourceWarner Records -3.111 3.994 -0.779
## sourceWEA Latina -3.532 4.873 -0.725
## Pr(>|t|)
## (Intercept) 2.67e-07 ***
## sourceAtlantic Records 0.41575
## sourceAtlantic Records UK 0.02200 *
## sourceBIGHIT MUSIC 0.02517 *
## sourceColumbia 0.61535
## sourceDef Jam Recordings 0.01593 *
## sourceDisruptor Records/Columbia 0.78528
## sourceGeneration Now/Atlantic 0.04554 *
## sourceGrade A Productions/Interscope Records 0.03038 *
## sourceIsland Records 0.31209
## sourceOVO 0.03685 *
## sourcePolydor Records 0.46394
## sourceRCA Records Label 0.20748
## sourceRepublic Records 0.71803
## sourceRimas Entertainment LLC 0.65076
## sourceSony Music Latin 0.75890
## sourceTaylor Swift 0.00317 **
## sourceUMLE - Latino 0.59007
## sourceWarner Records 0.43605
## sourceWEA Latina 0.46870
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 685.3939)
##
## Null deviance: 1567917 on 2215 degrees of freedom
## Residual deviance: 1505125 on 2196 degrees of freedom
## AIC: 20781
##
## Number of Fisher Scoring iterations: 2
# Expand artist_names and source into indicator columns so individual artists
# and sources can be added to a model one at a time.
spotifydummynames <- dummy_cols(
  spotify,
  select_columns = c("artist_names", "source")
)

# Model combining selected artist and source indicators with speechiness,
# liveness, valence and a raw quadratic in duration_ms.
lm8 <- glm(
  weeks_on_chart ~
    `artist_names_Ed Sheeran` +
    `artist_names_Imagine Dragons` +
    `source_Taylor Swift` +
    `artist_names_Travis Scott` +
    `source_BIGHIT MUSIC` +
    `source_Atlantic Records UK` +
    `source_Def Jam Recordings` +
    `source_Generation Now/Atlantic` +
    `source_Grade A Productions/Interscope Records` +
    `source_OVO` +
    speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummynames
)
summary(lm8)
##
## Call:
## glm(formula = weeks_on_chart ~ `artist_names_Ed Sheeran` + `artist_names_Imagine Dragons` +
## `source_Taylor Swift` + `artist_names_Travis Scott` + `source_BIGHIT MUSIC` +
## `source_Atlantic Records UK` + `source_Def Jam Recordings` +
## `source_Generation Now/Atlantic` + `source_Grade A Productions/Interscope Records` +
## source_OVO + speechiness + liveness + valence + poly(duration_ms,
## 2, raw = TRUE), data = spotifydummynames)
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 3.190e-01 2.800e+00 0.114
## `artist_names_Ed Sheeran` 1.780e+01 4.579e+00 3.888
## `artist_names_Imagine Dragons` 2.577e+01 3.907e+00 6.597
## `source_Taylor Swift` -1.004e+01 1.826e+00 -5.497
## `artist_names_Travis Scott` 1.225e+01 4.411e+00 2.777
## `source_BIGHIT MUSIC` -6.921e+00 2.194e+00 -3.154
## `source_Atlantic Records UK` 3.975e+00 3.000e+00 1.325
## `source_Def Jam Recordings` -8.052e+00 3.189e+00 -2.525
## `source_Generation Now/Atlantic` -3.847e+00 2.744e+00 -1.402
## `source_Grade A Productions/Interscope Records` -6.038e+00 2.957e+00 -2.042
## source_OVO -5.649e+00 3.137e+00 -1.801
## speechiness -1.605e+01 2.638e+00 -6.085
## liveness -7.888e+00 2.132e+00 -3.701
## valence 5.212e+00 1.316e+00 3.961
## poly(duration_ms, 2, raw = TRUE)1 1.106e-04 2.122e-05 5.211
## poly(duration_ms, 2, raw = TRUE)2 -1.854e-10 4.245e-11 -4.368
## Pr(>|t|)
## (Intercept) 0.909312
## `artist_names_Ed Sheeran` 0.000102 ***
## `artist_names_Imagine Dragons` 4.53e-11 ***
## `source_Taylor Swift` 4.00e-08 ***
## `artist_names_Travis Scott` 0.005508 **
## `source_BIGHIT MUSIC` 0.001618 **
## `source_Atlantic Records UK` 0.185239
## `source_Def Jam Recordings` 0.011601 *
## `source_Generation Now/Atlantic` 0.161055
## `source_Grade A Productions/Interscope Records` 0.041225 *
## source_OVO 0.071800 .
## speechiness 1.23e-09 ***
## liveness 0.000217 ***
## valence 7.54e-05 ***
## poly(duration_ms, 2, raw = TRUE)1 1.94e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 1.28e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 559.5712)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3635534 on 6497 degrees of freedom
## AIC: 59710
##
## Number of Fisher Scoring iterations: 2
# lm8 with five of the source indicators removed (Atlantic Records UK,
# Def Jam Recordings, Generation Now/Atlantic,
# Grade A Productions/Interscope Records, OVO).
lm9 <- glm(
  weeks_on_chart ~
    `artist_names_Ed Sheeran` +
    `artist_names_Imagine Dragons` +
    `source_Taylor Swift` +
    `artist_names_Travis Scott` +
    `source_BIGHIT MUSIC` +
    speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummynames
)
summary(lm9)
##
## Call:
## glm(formula = weeks_on_chart ~ `artist_names_Ed Sheeran` + `artist_names_Imagine Dragons` +
## `source_Taylor Swift` + `artist_names_Travis Scott` + `source_BIGHIT MUSIC` +
## speechiness + liveness + valence + poly(duration_ms, 2, raw = TRUE),
## data = spotifydummynames)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.723e-01 2.797e+00 0.097 0.922462
## `artist_names_Ed Sheeran` 2.185e+01 3.551e+00 6.155 7.97e-10 ***
## `artist_names_Imagine Dragons` 2.593e+01 3.910e+00 6.632 3.57e-11 ***
## `source_Taylor Swift` -9.826e+00 1.826e+00 -5.380 7.72e-08 ***
## `artist_names_Travis Scott` 1.247e+01 4.414e+00 2.824 0.004757 **
## `source_BIGHIT MUSIC` -6.736e+00 2.195e+00 -3.068 0.002161 **
## speechiness -1.694e+01 2.620e+00 -6.466 1.08e-10 ***
## liveness -7.935e+00 2.133e+00 -3.720 0.000201 ***
## valence 5.549e+00 1.313e+00 4.226 2.41e-05 ***
## poly(duration_ms, 2, raw = TRUE)1 1.099e-04 2.122e-05 5.178 2.31e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.866e-10 4.247e-11 -4.395 1.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 560.6161)
##
## Null deviance: 3777230 on 6512 degrees of freedom
## Residual deviance: 3645126 on 6502 degrees of freedom
## AIC: 59717
##
## Number of Fisher Scoring iterations: 2
# Variance inflation factors for the lm9 predictors (vif() comes from car).
car::vif(lm9)
## GVIF Df GVIF^(1/(2*Df))
## `artist_names_Ed Sheeran` 1.004971 1 1.002482
## `artist_names_Imagine Dragons` 1.003103 1 1.001550
## `source_Taylor Swift` 1.030269 1 1.015022
## `artist_names_Travis Scott` 1.003498 1 1.001748
## `source_BIGHIT MUSIC` 1.004168 1 1.002082
## speechiness 1.026032 1 1.012933
## liveness 1.007387 1 1.003687
## valence 1.031590 1 1.015672
## poly(duration_ms, 2, raw = TRUE) 1.052358 2 1.012840
# Same formula and data as lm9, refitted with lm() so that summary() reports
# residual quantiles, R-squared and the F-statistic.
lm91 <- lm(
  weeks_on_chart ~
    `artist_names_Ed Sheeran` +
    `artist_names_Imagine Dragons` +
    `source_Taylor Swift` +
    `artist_names_Travis Scott` +
    `source_BIGHIT MUSIC` +
    speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummynames
)
summary(lm91)
##
## Call:
## lm(formula = weeks_on_chart ~ `artist_names_Ed Sheeran` + `artist_names_Imagine Dragons` +
## `source_Taylor Swift` + `artist_names_Travis Scott` + `source_BIGHIT MUSIC` +
## speechiness + liveness + valence + poly(duration_ms, 2, raw = TRUE),
## data = spotifydummynames)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.05 -11.64 -7.20 2.47 325.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.723e-01 2.797e+00 0.097 0.922462
## `artist_names_Ed Sheeran` 2.185e+01 3.551e+00 6.155 7.97e-10 ***
## `artist_names_Imagine Dragons` 2.593e+01 3.910e+00 6.632 3.57e-11 ***
## `source_Taylor Swift` -9.826e+00 1.826e+00 -5.380 7.72e-08 ***
## `artist_names_Travis Scott` 1.247e+01 4.414e+00 2.824 0.004757 **
## `source_BIGHIT MUSIC` -6.736e+00 2.195e+00 -3.068 0.002161 **
## speechiness -1.694e+01 2.620e+00 -6.466 1.08e-10 ***
## liveness -7.935e+00 2.133e+00 -3.720 0.000201 ***
## valence 5.549e+00 1.313e+00 4.226 2.41e-05 ***
## poly(duration_ms, 2, raw = TRUE)1 1.099e-04 2.122e-05 5.178 2.31e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.866e-10 4.247e-11 -4.395 1.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.68 on 6502 degrees of freedom
## Multiple R-squared: 0.03497, Adjusted R-squared: 0.03349
## F-statistic: 23.56 on 10 and 6502 DF, p-value: < 2.2e-16
# Working copy of the data for the logistic models.
spotify3 <- spotify

# Copy the artist/source indicator columns over under short names.
spotify3[["Ed"]] <- spotifydummynames[["artist_names_Ed Sheeran"]]
spotify3[["ImgDrg"]] <- spotifydummynames[["artist_names_Imagine Dragons"]]
spotify3[["TSSource"]] <- spotifydummynames[["source_Taylor Swift"]]
spotify3[["BIGHIT"]] <- spotifydummynames[["source_BIGHIT MUSIC"]]
spotify3[["TScott"]] <- spotifydummynames[["artist_names_Travis Scott"]]

# Binary response: 1 when a song's weeks_on_chart exceeds the overall mean,
# 0 otherwise.
spotify3[["longevity"]] <- ifelse(
  spotify$weeks_on_chart > mean(spotify$weeks_on_chart),
  1,
  0
)

# Raw quadratic basis for duration_ms, stored as a matrix column; it appears in
# the model output as polydur1 and polydur2.
spotify3[["polydur"]] <- poly(spotify$duration_ms, 2, raw = TRUE)

# Full logistic model for longevity.
lr1 <- glm(
  longevity ~ Ed + ImgDrg + TSSource + key + mode + time_signature +
    danceability + energy + speechiness + acousticness + instrumentalness +
    liveness + valence + loudness + tempo + polydur,
  family = "binomial",
  data = spotify3
)
summary(lr1)
##
## Call:
## glm(formula = longevity ~ Ed + ImgDrg + TSSource + key + mode +
## time_signature + danceability + energy + speechiness + acousticness +
## instrumentalness + liveness + valence + loudness + tempo +
## polydur, family = "binomial", data = spotify3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.828e+00 6.510e-01 -2.809 0.004976 **
## Ed 5.284e-02 3.192e-01 0.166 0.868522
## ImgDrg 1.232e+00 3.417e-01 3.605 0.000313 ***
## TSSource -1.465e+00 2.681e-01 -5.463 4.69e-08 ***
## keyA#/Bb 4.186e-02 1.475e-01 0.284 0.776542
## keyB 4.052e-01 1.355e-01 2.990 0.002791 **
## keyC -4.125e-02 1.340e-01 -0.308 0.758249
## keyC#/Db -6.998e-02 1.275e-01 -0.549 0.582949
## keyD 1.185e-01 1.414e-01 0.838 0.402064
## keyD#/Eb -5.291e-02 1.989e-01 -0.266 0.790277
## keyE 2.150e-02 1.511e-01 0.142 0.886861
## keyF 1.427e-01 1.397e-01 1.021 0.307127
## keyF#/Gb 3.253e-02 1.430e-01 0.228 0.820032
## keyG -9.454e-02 1.399e-01 -0.676 0.499115
## keyG#/Ab 2.006e-01 1.382e-01 1.452 0.146502
## modeMinor -7.128e-02 6.048e-02 -1.179 0.238557
## time_signature3 beats -7.661e-01 4.492e-01 -1.705 0.088109 .
## time_signature4 beats -6.988e-01 4.326e-01 -1.615 0.106225
## time_signature5 beats -1.120e+00 5.105e-01 -2.195 0.028166 *
## danceability 6.869e-01 2.433e-01 2.824 0.004750 **
## energy -8.124e-02 2.893e-01 -0.281 0.778824
## speechiness -2.033e+00 2.989e-01 -6.801 1.04e-11 ***
## acousticness 8.654e-02 1.459e-01 0.593 0.553068
## instrumentalness -7.813e-01 4.826e-01 -1.619 0.105440
## liveness -6.802e-01 2.178e-01 -3.122 0.001794 **
## valence 5.917e-01 1.482e-01 3.993 6.52e-05 ***
## loudness 5.661e-02 1.794e-02 3.156 0.001600 **
## tempo -4.754e-04 1.000e-03 -0.475 0.634586
## polydur1 1.323e-05 3.170e-06 4.174 3.00e-05 ***
## polydur2 -2.452e-11 6.911e-12 -3.547 0.000389 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7737.7 on 6512 degrees of freedom
## Residual deviance: 7465.7 on 6483 degrees of freedom
## AIC: 7525.7
##
## Number of Fisher Scoring iterations: 5
# Reduced logistic model: lr1 without Ed, mode, time_signature, energy,
# acousticness, instrumentalness and tempo.
lr2 <- glm(
  longevity ~ ImgDrg + TSSource + key + danceability + speechiness +
    liveness + valence + loudness + polydur,
  family = "binomial",
  data = spotify3
)
summary(lr2)
##
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + key + danceability +
## speechiness + liveness + valence + loudness + polydur, family = "binomial",
## data = spotify3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.705e+00 4.306e-01 -6.282 3.34e-10 ***
## ImgDrg 1.211e+00 3.406e-01 3.557 0.000375 ***
## TSSource -1.439e+00 2.674e-01 -5.381 7.43e-08 ***
## keyA#/Bb 3.395e-02 1.470e-01 0.231 0.817417
## keyB 3.976e-01 1.350e-01 2.945 0.003235 **
## keyC -1.789e-02 1.333e-01 -0.134 0.893231
## keyC#/Db -5.599e-02 1.267e-01 -0.442 0.658639
## keyD 1.471e-01 1.404e-01 1.048 0.294526
## keyD#/Eb -3.114e-02 1.983e-01 -0.157 0.875194
## keyE 2.079e-02 1.505e-01 0.138 0.890127
## keyF 1.414e-01 1.392e-01 1.015 0.309947
## keyF#/Gb 3.578e-02 1.425e-01 0.251 0.801680
## keyG -8.549e-02 1.394e-01 -0.613 0.539833
## keyG#/Ab 2.245e-01 1.373e-01 1.635 0.102056
## danceability 6.942e-01 2.274e-01 3.052 0.002272 **
## speechiness -2.102e+00 2.922e-01 -7.192 6.39e-13 ***
## liveness -6.895e-01 2.160e-01 -3.192 0.001413 **
## valence 5.929e-01 1.386e-01 4.278 1.89e-05 ***
## loudness 5.209e-02 1.294e-02 4.026 5.67e-05 ***
## polydur1 1.332e-05 3.142e-06 4.239 2.24e-05 ***
## polydur2 -2.469e-11 6.865e-12 -3.596 0.000323 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7737.7 on 6512 degrees of freedom
## Residual deviance: 7476.6 on 6492 degrees of freedom
## AIC: 7518.6
##
## Number of Fisher Scoring iterations: 5
# Swap the multi-level `key` term for a single key-B column taken from
# spotifydummykeys; keyB was the only key coefficient significant at the
# 0.05 level in the lr2 summary.
spotify3$KeyB <- spotifydummykeys$key_B

lr3 <- glm(
  longevity ~ ImgDrg + TSSource + KeyB + danceability + speechiness +
    liveness + valence + loudness + polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr3)
##
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + KeyB + danceability +
## speechiness + liveness + valence + loudness + polydur, family = "binomial",
## data = spotify3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.674e+00 4.186e-01 -6.389 1.67e-10 ***
## ImgDrg 1.201e+00 3.397e-01 3.535 0.000407 ***
## TSSource -1.452e+00 2.669e-01 -5.440 5.32e-08 ***
## KeyB 3.619e-01 9.454e-02 3.828 0.000129 ***
## danceability 6.888e-01 2.266e-01 3.040 0.002365 **
## speechiness -2.105e+00 2.900e-01 -7.260 3.87e-13 ***
## liveness -6.956e-01 2.156e-01 -3.227 0.001252 **
## valence 6.024e-01 1.380e-01 4.366 1.27e-05 ***
## loudness 5.119e-02 1.290e-02 3.970 7.20e-05 ***
## polydur1 1.333e-05 3.134e-06 4.255 2.09e-05 ***
## polydur2 -2.475e-11 6.847e-12 -3.614 0.000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7737.7 on 6512 degrees of freedom
## Residual deviance: 7486.8 on 6502 degrees of freedom
## AIC: 7508.8
##
## Number of Fisher Scoring iterations: 5
# Flag rows of the popsource and artsource subsets as long-lived (1) when
# weeks_on_chart exceeds the mean weeks_on_chart of the full spotify data,
# otherwise 0. The cut-off is taken from `spotify`, not from each subset.
popsource$longevity <- ifelse(
  popsource$weeks_on_chart > mean(spotify$weeks_on_chart),
  1,
  0
)
artsource$longevity <- ifelse(
  artsource$weeks_on_chart > mean(spotify$weeks_on_chart),
  1,
  0
)

# Logistic regression of longevity on artist name alone (artsource rows).
lr4 <- glm(
  longevity ~ artist_names,
  data = artsource,
  family = "binomial"
)
summary(lr4)
##
## Call:
## glm(formula = longevity ~ artist_names, family = "binomial",
## data = artsource)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.34373 0.45842 -2.931 0.00338 **
## artist_namesAriana Grande 0.15415 0.55074 0.280 0.77956
## artist_namesBad Bunny 0.81764 0.52090 1.570 0.11649
## artist_namesBillie Eilish 0.84296 0.53893 1.564 0.11779
## artist_namesBTS -0.71996 0.59247 -1.215 0.22430
## artist_namesDrake -0.43755 0.53214 -0.822 0.41093
## artist_namesEd Sheeran 0.74903 0.55419 1.352 0.17651
## artist_namesEminem -0.07765 0.62248 -0.125 0.90073
## artist_namesHarry Styles 0.10697 0.59457 0.180 0.85722
## artist_namesImagine Dragons 1.72673 0.56768 3.042 0.00235 **
## artist_namesJ. Cole -0.77653 0.76386 -1.017 0.30935
## artist_namesJuice WRLD -0.15019 0.56698 -0.265 0.79109
## artist_namesJustin Bieber 0.47874 0.62272 0.769 0.44202
## artist_namesKanye West -1.38629 0.75181 -1.844 0.06519 .
## artist_namesLana Del Rey -0.81575 0.76285 -1.069 0.28491
## artist_namesLil Uzi Vert -0.04256 0.60531 -0.070 0.94395
## artist_namesOlivia Rodrigo 0.93827 0.59079 1.588 0.11225
## artist_namesPost Malone -0.06503 0.56857 -0.114 0.90894
## artist_namesSam Smith 0.19860 0.63123 0.315 0.75305
## artist_namesShawn Mendes 0.65059 0.60012 1.084 0.27832
## artist_namesTaylor Swift -0.49549 0.50836 -0.975 0.32972
## artist_namesThe Weeknd 0.39927 0.55619 0.718 0.47284
## artist_namesTravis Scott 0.54523 0.60931 0.895 0.37088
## artist_namesXXXTENTACION 0.30764 0.57733 0.533 0.59412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1312.6 on 1225 degrees of freedom
## Residual deviance: 1226.1 on 1202 degrees of freedom
## AIC: 1274.1
##
## Number of Fisher Scoring iterations: 5
# Logistic regression of longevity on the `source` column alone
# (popsource rows).
lr5 <- glm(
  longevity ~ source,
  data = popsource,
  family = "binomial"
)
summary(lr5)
##
## Call:
## glm(formula = longevity ~ source, family = "binomial", data = popsource)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.816761 0.275477 -2.965
## sourceAtlantic Records 0.671049 0.335327 2.001
## sourceAtlantic Records UK 0.275630 0.341253 0.808
## sourceBIGHIT MUSIC -1.045379 0.384832 -2.716
## sourceColumbia -0.269272 0.304314 -0.885
## sourceDef Jam Recordings -1.303502 0.512386 -2.544
## sourceDisruptor Records/Columbia 0.034002 0.408606 0.083
## sourceGeneration Now/Atlantic -0.586063 0.398682 -1.470
## sourceGrade A Productions/Interscope Records -1.297772 0.485772 -2.672
## sourceIsland Records 0.395548 0.393391 1.005
## sourceOVO -1.015820 0.469986 -2.161
## sourcePolydor Records -0.453701 0.394881 -1.149
## sourceRCA Records Label 0.005831 0.357036 0.016
## sourceRepublic Records -0.276937 0.309062 -0.896
## sourceRimas Entertainment LLC 0.344157 0.331221 1.039
## sourceSony Music Latin 0.978402 0.329176 2.972
## sourceTaylor Swift -1.498246 0.380170 -3.941
## sourceUMLE - Latino 1.070995 0.350159 3.059
## sourceWarner Records -0.099530 0.332997 -0.299
## sourceWEA Latina 0.205852 0.396346 0.519
## Pr(>|z|)
## (Intercept) 0.00303 **
## sourceAtlantic Records 0.04537 *
## sourceAtlantic Records UK 0.41926
## sourceBIGHIT MUSIC 0.00660 **
## sourceColumbia 0.37624
## sourceDef Jam Recordings 0.01096 *
## sourceDisruptor Records/Columbia 0.93368
## sourceGeneration Now/Atlantic 0.14156
## sourceGrade A Productions/Interscope Records 0.00755 **
## sourceIsland Records 0.31466
## sourceOVO 0.03067 *
## sourcePolydor Records 0.25057
## sourceRCA Records Label 0.98697
## sourceRepublic Records 0.37022
## sourceRimas Entertainment LLC 0.29878
## sourceSony Music Latin 0.00296 **
## sourceTaylor Swift 8.11e-05 ***
## sourceUMLE - Latino 0.00222 **
## sourceWarner Records 0.76502
## sourceWEA Latina 0.60350
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2643.9 on 2215 degrees of freedom
## Residual deviance: 2455.5 on 2196 degrees of freedom
## AIC: 2495.5
##
## Number of Fisher Scoring iterations: 4
# Copy selected source and artist columns from spotifydummynames onto
# spotify3 under short names (presumably 0/1 dummy columns built earlier --
# confirm against the code that creates spotifydummynames).
# The original assigned AtlRec twice with the same right-hand side; the
# redundant second assignment has been removed.
spotify3$LatUMLE <- spotifydummynames$`source_UMLE - Latino`
spotify3$LatSony <- spotifydummynames$`source_Sony Music Latin`
spotify3$AtlRec <- spotifydummynames$`source_Atlantic Records`
spotify3$DefJam <- spotifydummynames$`source_Def Jam Recordings`
spotify3$GradeA <-
  spotifydummynames$`source_Grade A Productions/Interscope Records`
spotify3$OVO <- spotifydummynames$source_OVO
spotify3$Kanye <- spotifydummynames$`artist_names_Kanye West`

# Response: 1 when weeks_on_chart is above the mean of the full spotify
# data, otherwise 0.
spotify3$longevity <- ifelse(
  spotify3$weeks_on_chart > mean(spotify$weeks_on_chart),
  1,
  0
)

# Logistic model combining the lr3 predictors with the source/artist
# columns above. ImgDrg, TSSource, BIGHIT, KeyB and polydur are not created
# in this block and must already be present on spotify3.
lr6 <- glm(
  longevity ~ ImgDrg + TSSource + LatUMLE + LatSony + Kanye + BIGHIT +
    DefJam + AtlRec + GradeA + OVO + KeyB + danceability + speechiness +
    liveness + valence + loudness + polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr6)
##
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + LatUMLE + LatSony +
## Kanye + BIGHIT + DefJam + AtlRec + GradeA + OVO + KeyB +
## danceability + speechiness + liveness + valence + loudness +
## polydur, family = "binomial", data = spotify3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.561e+00 4.188e-01 -6.115 9.66e-10 ***
## ImgDrg 1.200e+00 3.392e-01 3.537 0.000405 ***
## TSSource -1.472e+00 2.669e-01 -5.515 3.49e-08 ***
## LatUMLE 9.300e-01 2.228e-01 4.174 2.99e-05 ***
## LatSony 8.117e-01 1.868e-01 4.345 1.39e-05 ***
## Kanye -1.464e+00 6.007e-01 -2.437 0.014800 *
## BIGHIT -1.091e+00 2.724e-01 -4.007 6.14e-05 ***
## DefJam -1.245e+00 4.380e-01 -2.842 0.004485 **
## AtlRec 7.963e-01 1.963e-01 4.057 4.97e-05 ***
## GradeA -1.125e+00 4.036e-01 -2.787 0.005321 **
## OVO -6.755e-01 3.863e-01 -1.749 0.080342 .
## KeyB 3.466e-01 9.576e-02 3.620 0.000295 ***
## danceability 6.156e-01 2.284e-01 2.695 0.007032 **
## speechiness -2.063e+00 2.928e-01 -7.044 1.86e-12 ***
## liveness -6.719e-01 2.171e-01 -3.095 0.001969 **
## valence 4.755e-01 1.404e-01 3.387 0.000708 ***
## loudness 4.669e-02 1.300e-02 3.591 0.000330 ***
## polydur1 1.273e-05 3.114e-06 4.088 4.35e-05 ***
## polydur2 -2.283e-11 6.771e-12 -3.371 0.000748 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7737.7 on 6512 degrees of freedom
## Residual deviance: 7380.0 on 6494 degrees of freedom
## AIC: 7418
##
## Number of Fisher Scoring iterations: 5
# Refit of lr6 without the Kanye and OVO terms, the two weakest terms in
# the lr6 summary (p = 0.0148 and p = 0.080 respectively).
lr7 <- glm(
  longevity ~ ImgDrg + TSSource + LatUMLE + LatSony + BIGHIT + DefJam +
    AtlRec + GradeA + KeyB + danceability + speechiness + liveness +
    valence + loudness + polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr7)
##
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + LatUMLE + LatSony +
## BIGHIT + DefJam + AtlRec + GradeA + KeyB + danceability +
## speechiness + liveness + valence + loudness + polydur, family = "binomial",
## data = spotify3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.620e+00 4.180e-01 -6.269 3.65e-10 ***
## ImgDrg 1.209e+00 3.393e-01 3.564 0.000365 ***
## TSSource -1.458e+00 2.668e-01 -5.463 4.67e-08 ***
## LatUMLE 9.350e-01 2.229e-01 4.195 2.73e-05 ***
## LatSony 8.162e-01 1.869e-01 4.368 1.26e-05 ***
## BIGHIT -1.082e+00 2.724e-01 -3.973 7.10e-05 ***
## DefJam -1.229e+00 4.380e-01 -2.806 0.005020 **
## AtlRec 8.075e-01 1.964e-01 4.112 3.93e-05 ***
## GradeA -1.113e+00 4.036e-01 -2.756 0.005847 **
## KeyB 3.487e-01 9.568e-02 3.644 0.000268 ***
## danceability 6.414e-01 2.277e-01 2.816 0.004855 **
## speechiness -2.125e+00 2.922e-01 -7.272 3.53e-13 ***
## liveness -6.806e-01 2.171e-01 -3.135 0.001719 **
## valence 4.963e-01 1.401e-01 3.543 0.000395 ***
## loudness 4.707e-02 1.298e-02 3.626 0.000288 ***
## polydur1 1.309e-05 3.120e-06 4.195 2.73e-05 ***
## polydur2 -2.381e-11 6.795e-12 -3.504 0.000458 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7737.7 on 6512 degrees of freedom
## Residual deviance: 7392.4 on 6496 degrees of freedom
## AIC: 7426.4
##
## Number of Fisher Scoring iterations: 4
# Variance inflation factors for the lr7 predictors (multicollinearity check).
car::vif(lr7)
## GVIF Df GVIF^(1/(2*Df))
## ImgDrg 1.007076 1 1.003532
## TSSource 1.018807 1 1.009360
## LatUMLE 1.016570 1 1.008251
## LatSony 1.032258 1 1.016001
## BIGHIT 1.005950 1 1.002971
## DefJam 1.005443 1 1.002718
## AtlRec 1.002963 1 1.001480
## GradeA 1.003122 1 1.001560
## KeyB 1.007094 1 1.003541
## danceability 1.231459 1 1.109711
## speechiness 1.060560 1 1.029835
## liveness 1.020069 1 1.009985
## valence 1.262942 1 1.123807
## loudness 1.143622 1 1.069403
## polydur 1.075281 2 1.018311
# Keep a copy of the modelling data under a new name and write it to disk.
# row.names is left at its default, so the CSV gets a leading row-name column.
spotify4 <- spotify3
write.csv(spotify4, "spotselect.csv")

# Mean of each listed predictor within the two longevity classes (0 and 1).
# NOTE(review): GradeA is a term in lr7 but is not summarised here.
spotify4 |>
  group_by(longevity) |>
  summarise(
    imgd = mean(ImgDrg),
    tssource = mean(TSSource),
    latUM = mean(LatUMLE),
    sony = mean(LatSony),
    defjam = mean(DefJam),
    atlrec = mean(AtlRec),
    keyb = mean(KeyB),
    bighit = mean(BIGHIT),
    dance = mean(danceability),
    speech = mean(speechiness),
    live = mean(liveness),
    val = mean(valence),
    loud = mean(loudness),
    dur = mean(duration_ms)
  )
## # A tibble: 2 × 15
## longevity imgd tssource latUM sony defjam atlrec keyb bighit dance
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.00320 0.0346 0.00812 0.0122 0.0107 0.0126 0.0784 0.0220 0.676
## 2 1 0.0120 0.00874 0.0268 0.0366 0.00328 0.0279 0.109 0.00874 0.696
## # ℹ 5 more variables: speech <dbl>, live <dbl>, val <dbl>, loud <dbl>,
## # dur <dbl>