First, I will import the necessary libraries:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.3.3
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
# Load the Spotify chart data from the working directory into a base data frame.
spotify <- read.csv(file = "spotify.csv")

Part 1: Characterizing Songs That Have Appeared on the Spotify Weekly Top 200 List

A Quick Summary of Each of the Columns

# Per-column overview: type/length for character columns, five-number summary
# plus mean for numeric columns.
summary(object = spotify)
##       id            artist_names        track_name           source         
##  Length:6513        Length:6513        Length:6513        Length:6513       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      key                mode           time_signature      danceability   
##  Length:6513        Length:6513        Length:6513        Min.   :0.1500  
##  Class :character   Class :character   Class :character   1st Qu.:0.5910  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.6980  
##                                                           Mean   :0.6817  
##                                                           3rd Qu.:0.7850  
##                                                           Max.   :0.9850  
##      energy        speechiness      acousticness       instrumentalness   
##  Min.   :0.0218   Min.   :0.0232   Min.   :0.0000075   Min.   :0.0000000  
##  1st Qu.:0.5340   1st Qu.:0.0440   1st Qu.:0.0444000   1st Qu.:0.0000000  
##  Median :0.6510   Median :0.0722   Median :0.1450000   Median :0.0000000  
##  Mean   :0.6365   Mean   :0.1219   Mean   :0.2367610   Mean   :0.0124687  
##  3rd Qu.:0.7590   3rd Qu.:0.1630   3rd Qu.:0.3560000   3rd Qu.:0.0000406  
##  Max.   :0.9890   Max.   :0.9660   Max.   :0.9940000   Max.   :0.9530000  
##     liveness         valence          loudness           tempo       
##  Min.   :0.0197   Min.   :0.0320   Min.   :-34.475   Min.   : 46.72  
##  1st Qu.:0.0974   1st Qu.:0.3160   1st Qu.: -7.564   1st Qu.: 98.01  
##  Median :0.1240   Median :0.4890   Median : -5.983   Median :120.03  
##  Mean   :0.1802   Mean   :0.4924   Mean   : -6.351   Mean   :122.12  
##  3rd Qu.:0.2190   3rd Qu.:0.6690   3rd Qu.: -4.673   3rd Qu.:142.03  
##  Max.   :0.9770   Max.   :0.9820   Max.   :  1.509   Max.   :212.12  
##   duration_ms     weeks_on_chart      streams         
##  Min.   : 30133   Min.   :  1.00   Min.   :2.525e+06  
##  1st Qu.:173038   1st Qu.:  1.00   1st Qu.:8.695e+06  
##  Median :198367   Median :  4.00   Median :2.713e+07  
##  Mean   :202567   Mean   : 13.61   Mean   :1.090e+08  
##  3rd Qu.:226003   3rd Qu.: 16.00   3rd Qu.:9.951e+07  
##  Max.   :690732   Max.   :367.00   Max.   :3.528e+09

Visual Distributions of the Columns

Keys Broken Down Into Major and Minor

# Song counts per key, with side-by-side bars for each mode.
ggplot(data = spotify) +
  geom_bar(mapping = aes(x = key, fill = mode), position = "dodge") +
  labs(title = "Prevalence of Keys in Top 200 on Spotify since 2016") +
  theme_bw() +
  scale_fill_manual(values = c("goldenrod", "navy"))

Time Signature Broken Down By Major and Minor Keys

# Song counts per time signature, with side-by-side bars for each mode.
ggplot(data = spotify) +
  geom_bar(
    mapping = aes(x = time_signature, fill = mode),
    position = "dodge"
  ) +
  labs(
    title = "Prevalence of Time Signature in Top 200 on Spotify since 2016"
  ) +
  theme_bw() +
  scale_fill_manual(values = c("goldenrod", "navy"))

### Song Counts by Source (Label) for Sources With More Than 50 Songs on the List

Note: Since some record labels may be represented by multiple different subsidiaries, some counts may be underestimated.

# Keep only rows whose source occurs on more than 50 rows; `count` holds the
# number of rows sharing that source.
popsource <- spotify |>
  group_by(source) |>
  mutate(count = n()) |>
  filter(count > 50) |>
  ungroup()

# One bar per remaining source; labels are rotated so long names stay legible.
ggplot(data = popsource) +
  geom_bar(mapping = aes(x = source), fill = "navy") +
  labs(
    title = "Prevalence of Sources of Song in Top 200 on Spotify since 2016"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

### Song Counts for Artists With More Than 25 Songs That Have Made the List

Note: Collaborations could not be easily considered with this, so actual song counts for these artists may be underestimated.

# Keep only rows whose artist_names value occurs on more than 25 rows; `count`
# holds the number of rows sharing that value.
artsource <- spotify |>
  group_by(artist_names) |>
  mutate(count = n()) |>
  filter(count > 25) |>
  ungroup()

# One bar per remaining artist; labels are rotated so long names stay legible.
ggplot(data = artsource) +
  geom_bar(mapping = aes(x = artist_names), fill = "navy") +
  labs(
    title = "Prevalence of Artists of Songs in Top 200 on Spotify since 2016"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Distribution of Weeks Spent on the List

# Histogram of weeks on chart with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
weeks_mean <- data.frame(value = mean(spotify$weeks_on_chart))
weeks_mean$label <- sprintf("%.1f Weeks", weeks_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = weeks_on_chart), bins = 30, fill = "slateblue") +
  labs(
    title = "Distribution of Time Spent in Top 200 on Spotify since 2016",
    color = "Mean Time On Chart"
  ) +
  theme_bw() +
  geom_vline(
    data = weeks_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Use of `spotify$weeks_on_chart` is discouraged.
## ℹ Use `weeks_on_chart` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Distribution of Song Length

# Histogram of song length (ms) with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
duration_mean <- data.frame(value = mean(spotify$duration_ms))
duration_mean$label <- sprintf("%.1f ms", duration_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = duration_ms), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Song Length in Top 200 on Spotify since 2016",
    color = "Mean Length"
  ) +
  theme_bw() +
  geom_vline(
    data = duration_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$duration_ms` is discouraged.
## ℹ Use `duration_ms` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Distribution of Danceability

# Histogram of danceability with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
danceability_mean <- data.frame(value = mean(spotify$danceability))
danceability_mean$label <- sprintf("%.4f", danceability_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = danceability), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Danceability in Top 200 on Spotify since 2016",
    color = "Mean Danceability"
  ) +
  theme_bw() +
  geom_vline(
    data = danceability_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$danceability` is discouraged.
## ℹ Use `danceability` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Energy

# Histogram of energy with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
energy_mean <- data.frame(value = mean(spotify$energy))
energy_mean$label <- sprintf("%.4f", energy_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = energy), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Energy in Top 200 on Spotify since 2016",
    color = "Mean Energy"
  ) +
  theme_bw() +
  geom_vline(
    data = energy_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$energy` is discouraged.
## ℹ Use `energy` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Speechiness

# Histogram of speechiness with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
speechiness_mean <- data.frame(value = mean(spotify$speechiness))
speechiness_mean$label <- sprintf("%.4f", speechiness_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = speechiness), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Speechiness in Top 200 on Spotify since 2016",
    color = "Mean Speechiness"
  ) +
  theme_bw() +
  geom_vline(
    data = speechiness_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$speechiness` is discouraged.
## ℹ Use `speechiness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Acousticness

# Histogram of acousticness with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
acousticness_mean <- data.frame(value = mean(spotify$acousticness))
acousticness_mean$label <- sprintf("%.4f", acousticness_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = acousticness), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Acousticness in Top 200 on Spotify since 2016",
    color = "Mean Acousticness"
  ) +
  theme_bw() +
  geom_vline(
    data = acousticness_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$acousticness` is discouraged.
## ℹ Use `acousticness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Instrumentalness

# Histogram of instrumentalness with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
instrumentalness_mean <- data.frame(value = mean(spotify$instrumentalness))
instrumentalness_mean$label <- sprintf("%.4f", instrumentalness_mean$value)
ggplot(data = spotify) +
  geom_histogram(
    aes(x = instrumentalness),
    bins = 30,
    fill = "cornflowerblue"
  ) +
  labs(
    title = "Distribution of Instrumentalness in Top 200 on Spotify since 2016",
    color = "Mean Instrumentalness"
  ) +
  theme_bw() +
  geom_vline(
    data = instrumentalness_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$instrumentalness` is discouraged.
## ℹ Use `instrumentalness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Liveness

# Histogram of liveness with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
liveness_mean <- data.frame(value = mean(spotify$liveness))
liveness_mean$label <- sprintf("%.4f", liveness_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = liveness), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Liveness in Top 200 on Spotify since 2016",
    color = "Mean Liveness"
  ) +
  theme_bw() +
  geom_vline(
    data = liveness_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$liveness` is discouraged.
## ℹ Use `liveness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Valence

# Histogram of valence with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
valence_mean <- data.frame(value = mean(spotify$valence))
valence_mean$label <- sprintf("%.4f", valence_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = valence), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Valence in Top 200 on Spotify since 2016",
    color = "Mean Valence"
  ) +
  theme_bw() +
  geom_vline(
    data = valence_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$valence` is discouraged.
## ℹ Use `valence` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Loudness

# Histogram of loudness with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
loudness_mean <- data.frame(value = mean(spotify$loudness))
loudness_mean$label <- sprintf("%.3f", loudness_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = loudness), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Loudness in Top 200 on Spotify since 2016",
    color = "Mean Loudness"
  ) +
  theme_bw() +
  geom_vline(
    data = loudness_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$loudness` is discouraged.
## ℹ Use `loudness` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Tempo

# Histogram of tempo with a vertical line at the sample mean.
# The mean and its legend label are computed from the data so the label always
# matches the plotted line; the line comes from a one-row data frame so it is
# drawn once. `bins` is set explicitly to the value stat_bin() would pick.
tempo_mean <- data.frame(value = mean(spotify$tempo))
tempo_mean$label <- sprintf("%.2f", tempo_mean$value)
ggplot(data = spotify) +
  geom_histogram(aes(x = tempo), bins = 30, fill = "cornflowerblue") +
  labs(
    title = "Distribution of Tempo in Top 200 on Spotify since 2016",
    color = "Mean Tempo"
  ) +
  theme_bw() +
  geom_vline(
    data = tempo_mean,
    aes(xintercept = value, color = label),
    linewidth = 1.5
  )
## Warning: Use of `spotify$tempo` is discouraged.
## ℹ Use `tempo` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Looking At Correlations Between Variables

Scatterplots

Building an inferential model to find the best linear model for predicting weeks spent on the chart:

# Numeric audio features plus weeks_on_chart, for the pairs and correlation
# plots. Columns are chosen by type and name rather than by position so the
# selection does not silently change if the CSV column order changes;
# `streams` is excluded explicitly.
spotify2 <- spotify |>
  dplyr::select(where(is.numeric), -streams)

# Scatterplot matrix of every pair of retained columns.
pairs(spotify2)

# One scatterplot of weeks_on_chart against each other column of `spotify2`.
# Taking the predictor list from the data frame covers every retained feature
# and avoids one hand-written call per column.
predictors <- setdiff(names(spotify2), "weeks_on_chart")
for (predictor in predictors) {
  scatter <- ggplot(data = spotify2) +
    geom_point(aes(x = .data[[predictor]], y = weeks_on_chart)) +
    # Without this the x axis would be labelled with the `.data[[...]]` call.
    labs(x = predictor)
  # ggplot objects are not auto-printed inside a loop.
  print(scatter)
}

Building a Correlation Plot

# Pairwise correlation matrix of the columns in `spotify2`.
corrs <- cor(spotify2)

# Colored-tile correlation plot with the coefficient printed in each cell,
# using a 10-step navy -> white -> dark red palette.
corrplot(
  corrs,
  method = "color",
  addCoef.col = "black",
  number.cex = .5,
  tl.cex = .75,
  col = colorRampPalette(c("navy", "white", "darkred"))(10)
)

Building the Linear Model/Determining Features to Be Used

Attempt 1

# Attempt 1: glm() with its default (gaussian) family, regressing
# weeks_on_chart on all audio features, with raw quadratic terms for loudness
# and duration_ms. `raw = TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
baselm <- glm(
  weeks_on_chart ~ key + mode + time_signature + danceability + energy +
    speechiness + acousticness + instrumentalness + liveness + valence +
    poly(loudness, 2, raw = TRUE) + tempo +
    poly(duration_ms, 2, raw = TRUE),
  data = spotify
)
summary(baselm)
## 
## Call:
## glm(formula = weeks_on_chart ~ key + mode + time_signature + 
##     danceability + energy + speechiness + acousticness + instrumentalness + 
##     liveness + valence + poly(loudness, 2, raw = T) + tempo + 
##     poly(duration_ms, 2, raw = T), data = spotify)
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     1.079e+01  6.606e+00   1.634 0.102370    
## keyA#/Bb                        2.627e+00  1.528e+00   1.720 0.085527 .  
## keyB                            4.272e+00  1.454e+00   2.937 0.003321 ** 
## keyC                            6.466e-01  1.386e+00   0.467 0.640831    
## keyC#/Db                        1.867e+00  1.314e+00   1.421 0.155285    
## keyD                            2.143e+00  1.486e+00   1.442 0.149337    
## keyD#/Eb                       -1.032e+00  2.060e+00  -0.501 0.616526    
## keyE                            1.915e+00  1.567e+00   1.222 0.221704    
## keyF                            2.694e+00  1.476e+00   1.825 0.068044 .  
## keyF#/Gb                        1.642e+00  1.497e+00   1.096 0.272964    
## keyG                            4.902e-01  1.435e+00   0.341 0.732749    
## keyG#/Ab                        2.741e+00  1.464e+00   1.872 0.061276 .  
## modeMinor                      -7.394e-01  6.344e-01  -1.166 0.243819    
## time_signature3 beats          -1.072e+00  4.979e+00  -0.215 0.829595    
## time_signature4 beats           2.887e-01  4.829e+00   0.060 0.952337    
## time_signature5 beats          -4.200e+00  5.323e+00  -0.789 0.430155    
## danceability                    5.007e-01  2.493e+00   0.201 0.840787    
## energy                         -8.508e+00  3.018e+00  -2.819 0.004834 ** 
## speechiness                    -1.617e+01  2.777e+00  -5.821 6.11e-09 ***
## acousticness                    1.174e+00  1.507e+00   0.779 0.435869    
## instrumentalness               -1.927e+00  4.091e+00  -0.471 0.637689    
## liveness                       -6.698e+00  2.187e+00  -3.063 0.002204 ** 
## valence                         5.324e+00  1.562e+00   3.409 0.000655 ***
## poly(loudness, 2, raw = T)1     9.700e-01  3.663e-01   2.648 0.008123 ** 
## poly(loudness, 2, raw = T)2     1.858e-02  1.835e-02   1.013 0.311119    
## tempo                          -8.093e-03  1.031e-02  -0.785 0.432459    
## poly(duration_ms, 2, raw = T)1  1.002e-04  2.193e-05   4.571 4.94e-06 ***
## poly(duration_ms, 2, raw = T)2 -1.715e-10  4.376e-11  -3.919 9.00e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 569.8593)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3695537  on 6485  degrees of freedom
## AIC: 59841
## 
## Number of Fisher Scoring iterations: 2

Attempt 2

# Attempt 2: reduced model with key, energy, speechiness, liveness, valence
# and a raw quadratic in duration_ms. `raw = TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
lm2 <- glm(
  weeks_on_chart ~ key + energy + speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotify
)
summary(lm2)
## 
## Call:
## glm(formula = weeks_on_chart ~ key + energy + speechiness + liveness + 
##     valence + poly(duration_ms, 2, raw = T), data = spotify)
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -4.049e-01  3.047e+00  -0.133 0.894284    
## keyA#/Bb                        2.589e+00  1.524e+00   1.699 0.089432 .  
## keyB                            4.209e+00  1.449e+00   2.904 0.003696 ** 
## keyC                            9.279e-01  1.380e+00   0.672 0.501368    
## keyC#/Db                        2.161e+00  1.305e+00   1.656 0.097778 .  
## keyD                            2.413e+00  1.477e+00   1.634 0.102316    
## keyD#/Eb                       -6.991e-01  2.057e+00  -0.340 0.733966    
## keyE                            2.054e+00  1.563e+00   1.314 0.188755    
## keyF                            2.711e+00  1.473e+00   1.841 0.065624 .  
## keyF#/Gb                        1.752e+00  1.494e+00   1.173 0.240896    
## keyG                            5.358e-01  1.432e+00   0.374 0.708349    
## keyG#/Ab                        3.066e+00  1.457e+00   2.104 0.035434 *  
## energy                         -1.918e+00  1.976e+00  -0.970 0.331986    
## speechiness                    -1.785e+01  2.654e+00  -6.726 1.90e-11 ***
## liveness                       -7.157e+00  2.172e+00  -3.294 0.000992 ***
## valence                         6.013e+00  1.438e+00   4.181 2.93e-05 ***
## poly(duration_ms, 2, raw = T)1  1.089e-04  2.155e-05   5.052 4.48e-07 ***
## poly(duration_ms, 2, raw = T)2 -1.898e-10  4.310e-11  -4.403 1.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 570.9894)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3708576  on 6495  degrees of freedom
## AIC: 59843
## 
## Number of Fisher Scoring iterations: 2

Creating Dummy Variables for Each of the Keys

# Append indicator columns for the `key` column (e.g. key_B, used by the
# models fitted on this data frame).
spotifydummykeys <- dummy_cols(
  .data = spotify,
  select_columns = "key"
)

Attempt 3

# Attempt 3: three individual key indicators (B, D, F) in place of the full
# `key` factor, plus speechiness, liveness, valence and quadratic duration.
lm3 <- glm(
  formula = weeks_on_chart ~ key_B + key_D + key_F + speechiness + liveness +
    valence + poly(duration_ms, 2, raw = TRUE),
  data = spotifydummykeys
)
summary(lm3)
## 
## Call:
## glm(formula = weeks_on_chart ~ key_B + key_D + key_F + speechiness + 
##     liveness + valence + poly(duration_ms, 2, raw = TRUE), data = spotifydummykeys)
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        6.193e-01  2.825e+00   0.219 0.826488    
## key_B                              2.646e+00  1.062e+00   2.492 0.012726 *  
## key_D                              8.724e-01  1.099e+00   0.794 0.427195    
## key_F                              1.205e+00  1.094e+00   1.102 0.270602    
## speechiness                       -1.722e+01  2.633e+00  -6.542 6.54e-11 ***
## liveness                          -7.592e+00  2.150e+00  -3.531 0.000417 ***
## valence                            5.407e+00  1.324e+00   4.084 4.47e-05 ***
## poly(duration_ms, 2, raw = TRUE)1  1.055e-04  2.137e-05   4.939 8.06e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.835e-10  4.283e-11  -4.285 1.86e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 571.0831)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3714325  on 6504  degrees of freedom
## AIC: 59836
## 
## Number of Fisher Scoring iterations: 2

Attempt 4

# Attempt 4: same as the previous attempt but keeping only the key_B indicator.
lm4 <- glm(
  formula = weeks_on_chart ~ key_B + speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummykeys
)
summary(lm4)
## 
## Call:
## glm(formula = weeks_on_chart ~ key_B + speechiness + liveness + 
##     valence + poly(duration_ms, 2, raw = TRUE), data = spotifydummykeys)
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        7.800e-01  2.822e+00   0.276 0.782267    
## key_B                              2.462e+00  1.052e+00   2.340 0.019293 *  
## speechiness                       -1.735e+01  2.630e+00  -6.599 4.46e-11 ***
## liveness                          -7.562e+00  2.150e+00  -3.517 0.000439 ***
## valence                            5.448e+00  1.323e+00   4.119 3.85e-05 ***
## poly(duration_ms, 2, raw = TRUE)1  1.057e-04  2.136e-05   4.945 7.79e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.838e-10  4.282e-11  -4.292 1.80e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 571.056)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3715290  on 6506  degrees of freedom
## AIC: 59833
## 
## Number of Fisher Scoring iterations: 2

Attempt 5

# Attempt 5: no key terms at all; speechiness, liveness, valence and a raw
# quadratic in duration_ms only.
lm5 <- glm(
  formula = weeks_on_chart ~ speechiness + liveness + valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotify
)
summary(lm5)
## 
## Call:
## glm(formula = weeks_on_chart ~ speechiness + liveness + valence + 
##     poly(duration_ms, 2, raw = TRUE), data = spotify)
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        9.924e-01  2.822e+00   0.352  0.72506    
## speechiness                       -1.714e+01  2.629e+00  -6.519 7.59e-11 ***
## liveness                          -7.590e+00  2.151e+00  -3.529  0.00042 ***
## valence                            5.574e+00  1.322e+00   4.216 2.52e-05 ***
## poly(duration_ms, 2, raw = TRUE)1  1.050e-04  2.137e-05   4.915 9.10e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.827e-10  4.283e-11  -4.266 2.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 571.4489)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3718418  on 6507  degrees of freedom
## AIC: 59837
## 
## Number of Fisher Scoring iterations: 2

Attempt 6 (Checking for Significant Artists)

# Attempt 6: weeks_on_chart explained by artist alone, fitted on the subset of
# frequently occurring artists held in `artsource`.
lm6 <- glm(
  formula = weeks_on_chart ~ artist_names,
  data = artsource
)
summary(lm6)
## 
## Call:
## glm(formula = weeks_on_chart ~ artist_names, data = artsource)
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 11.86207    5.88342   2.016 0.044003 *  
## artist_namesAriana Grande    0.98793    7.16554   0.138 0.890364    
## artist_namesBad Bunny        1.00936    6.99678   0.144 0.885319    
## artist_namesBillie Eilish    9.15680    7.31811   1.251 0.211086    
## artist_namesBTS             -3.96066    6.98233  -0.567 0.570657    
## artist_namesDrake           -4.10531    6.60743  -0.621 0.534508    
## artist_namesEd Sheeran      25.62682    7.54465   3.397 0.000704 ***
## artist_namesEminem           5.49904    7.90561   0.696 0.486822    
## artist_namesHarry Styles     8.16293    7.72724   1.056 0.291005    
## artist_namesImagine Dragons 28.97577    7.85780   3.688 0.000237 ***
## artist_namesJ. Cole         -1.57635    8.39437  -0.188 0.851075    
## artist_namesJuice WRLD       0.18793    7.16554   0.026 0.979081    
## artist_namesJustin Bieber    4.28608    8.47309   0.506 0.613058    
## artist_namesKanye West      -7.63758    7.42299  -1.029 0.303730    
## artist_namesLana Del Rey    -5.86207    8.32041  -0.705 0.481233    
## artist_namesLil Uzi Vert    -2.78707    7.72724  -0.361 0.718401    
## artist_namesOlivia Rodrigo  11.13793    8.25078   1.350 0.177294    
## artist_namesPost Malone      4.38793    7.24845   0.605 0.545053    
## artist_namesSam Smith        0.03448    8.32041   0.004 0.996694    
## artist_namesShawn Mendes     6.73793    8.25078   0.817 0.414295    
## artist_namesTaylor Swift    -4.10778    6.35222  -0.647 0.517970    
## artist_namesThe Weeknd       8.93793    7.39534   1.209 0.227058    
## artist_namesTravis Scott    14.06897    8.32041   1.691 0.091116 .  
## artist_namesXXXTENTACION    11.87603    7.64952   1.553 0.120801    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1003.824)
## 
##     Null deviance: 1298194  on 1225  degrees of freedom
## Residual deviance: 1206596  on 1202  degrees of freedom
## AIC: 11979
## 
## Number of Fisher Scoring iterations: 2

Attempt 7 (Checking for Significant Sources)

# Attempt 7: weeks_on_chart explained by source alone, fitted on the subset of
# frequently occurring sources held in `popsource`.
lm7 <- glm(
  formula = weeks_on_chart ~ source,
  data = popsource
)
summary(lm7)
## 
## Call:
## glm(formula = weeks_on_chart ~ source, data = popsource)
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                    17.161      3.325   5.161
## sourceAtlantic Records                          3.384      4.158   0.814
## sourceAtlantic Records UK                       9.593      4.186   2.292
## sourceBIGHIT MUSIC                             -9.187      4.101  -2.240
## sourceColumbia                                 -1.827      3.636  -0.503
## sourceDef Jam Recordings                      -11.643      4.826  -2.412
## sourceDisruptor Records/Columbia                1.349      4.949   0.272
## sourceGeneration Now/Atlantic                  -8.964      4.480  -2.001
## sourceGrade A Productions/Interscope Records  -10.069      4.648  -2.167
## sourceIsland Records                            4.952      4.898   1.011
## sourceOVO                                      -9.989      4.782  -2.089
## sourcePolydor Records                          -3.312      4.521  -0.732
## sourceRCA Records Label                        -5.436      4.311  -1.261
## sourceRepublic Records                         -1.331      3.686  -0.361
## sourceRimas Entertainment LLC                  -1.841      4.067  -0.453
## sourceSony Music Latin                          1.250      4.072   0.307
## sourceTaylor Swift                            -11.403      3.861  -2.954
## sourceUMLE - Latino                             2.344      4.351   0.539
## sourceWarner Records                           -3.111      3.994  -0.779
## sourceWEA Latina                               -3.532      4.873  -0.725
##                                              Pr(>|t|)    
## (Intercept)                                  2.67e-07 ***
## sourceAtlantic Records                        0.41575    
## sourceAtlantic Records UK                     0.02200 *  
## sourceBIGHIT MUSIC                            0.02517 *  
## sourceColumbia                                0.61535    
## sourceDef Jam Recordings                      0.01593 *  
## sourceDisruptor Records/Columbia              0.78528    
## sourceGeneration Now/Atlantic                 0.04554 *  
## sourceGrade A Productions/Interscope Records  0.03038 *  
## sourceIsland Records                          0.31209    
## sourceOVO                                     0.03685 *  
## sourcePolydor Records                         0.46394    
## sourceRCA Records Label                       0.20748    
## sourceRepublic Records                        0.71803    
## sourceRimas Entertainment LLC                 0.65076    
## sourceSony Music Latin                        0.75890    
## sourceTaylor Swift                            0.00317 ** 
## sourceUMLE - Latino                           0.59007    
## sourceWarner Records                          0.43605    
## sourceWEA Latina                              0.46870    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 685.3939)
## 
##     Null deviance: 1567917  on 2215  degrees of freedom
## Residual deviance: 1505125  on 2196  degrees of freedom
## AIC: 20781
## 
## Number of Fisher Scoring iterations: 2

Building Dummy Variables for Artists/Sources

# Append indicator columns for the artist_names and source columns; the later
# models reference them by names such as `artist_names_Ed Sheeran`.
spotifydummynames <- dummy_cols(
  .data = spotify,
  select_columns = c("artist_names", "source")
)

Attempt 8

# Attempt 8: audio features from attempt 5 plus selected artist and source
# indicator columns.
lm8 <- glm(
  formula = weeks_on_chart ~
    `artist_names_Ed Sheeran` +
    `artist_names_Imagine Dragons` +
    `source_Taylor Swift` +
    `artist_names_Travis Scott` +
    `source_BIGHIT MUSIC` +
    `source_Atlantic Records UK` +
    `source_Def Jam Recordings` +
    `source_Generation Now/Atlantic` +
    `source_Grade A Productions/Interscope Records` +
    `source_OVO` +
    speechiness +
    liveness +
    valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummynames
)
summary(lm8)
## 
## Call:
## glm(formula = weeks_on_chart ~ `artist_names_Ed Sheeran` + `artist_names_Imagine Dragons` + 
##     `source_Taylor Swift` + `artist_names_Travis Scott` + `source_BIGHIT MUSIC` + 
##     `source_Atlantic Records UK` + `source_Def Jam Recordings` + 
##     `source_Generation Now/Atlantic` + `source_Grade A Productions/Interscope Records` + 
##     source_OVO + speechiness + liveness + valence + poly(duration_ms, 
##     2, raw = TRUE), data = spotifydummynames)
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                      3.190e-01  2.800e+00   0.114
## `artist_names_Ed Sheeran`                        1.780e+01  4.579e+00   3.888
## `artist_names_Imagine Dragons`                   2.577e+01  3.907e+00   6.597
## `source_Taylor Swift`                           -1.004e+01  1.826e+00  -5.497
## `artist_names_Travis Scott`                      1.225e+01  4.411e+00   2.777
## `source_BIGHIT MUSIC`                           -6.921e+00  2.194e+00  -3.154
## `source_Atlantic Records UK`                     3.975e+00  3.000e+00   1.325
## `source_Def Jam Recordings`                     -8.052e+00  3.189e+00  -2.525
## `source_Generation Now/Atlantic`                -3.847e+00  2.744e+00  -1.402
## `source_Grade A Productions/Interscope Records` -6.038e+00  2.957e+00  -2.042
## source_OVO                                      -5.649e+00  3.137e+00  -1.801
## speechiness                                     -1.605e+01  2.638e+00  -6.085
## liveness                                        -7.888e+00  2.132e+00  -3.701
## valence                                          5.212e+00  1.316e+00   3.961
## poly(duration_ms, 2, raw = TRUE)1                1.106e-04  2.122e-05   5.211
## poly(duration_ms, 2, raw = TRUE)2               -1.854e-10  4.245e-11  -4.368
##                                                 Pr(>|t|)    
## (Intercept)                                     0.909312    
## `artist_names_Ed Sheeran`                       0.000102 ***
## `artist_names_Imagine Dragons`                  4.53e-11 ***
## `source_Taylor Swift`                           4.00e-08 ***
## `artist_names_Travis Scott`                     0.005508 ** 
## `source_BIGHIT MUSIC`                           0.001618 ** 
## `source_Atlantic Records UK`                    0.185239    
## `source_Def Jam Recordings`                     0.011601 *  
## `source_Generation Now/Atlantic`                0.161055    
## `source_Grade A Productions/Interscope Records` 0.041225 *  
## source_OVO                                      0.071800 .  
## speechiness                                     1.23e-09 ***
## liveness                                        0.000217 ***
## valence                                         7.54e-05 ***
## poly(duration_ms, 2, raw = TRUE)1               1.94e-07 ***
## poly(duration_ms, 2, raw = TRUE)2               1.28e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 559.5712)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3635534  on 6497  degrees of freedom
## AIC: 59710
## 
## Number of Fisher Scoring iterations: 2

Final Linear Model/Features

# Final model: five artist/source indicator columns plus speechiness,
# liveness, valence and a raw quadratic in duration_ms.
lm9 <- glm(
  formula = weeks_on_chart ~
    `artist_names_Ed Sheeran` +
    `artist_names_Imagine Dragons` +
    `source_Taylor Swift` +
    `artist_names_Travis Scott` +
    `source_BIGHIT MUSIC` +
    speechiness +
    liveness +
    valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummynames
)
summary(lm9)
## 
## Call:
## glm(formula = weeks_on_chart ~ `artist_names_Ed Sheeran` + `artist_names_Imagine Dragons` + 
##     `source_Taylor Swift` + `artist_names_Travis Scott` + `source_BIGHIT MUSIC` + 
##     speechiness + liveness + valence + poly(duration_ms, 2, raw = TRUE), 
##     data = spotifydummynames)
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.723e-01  2.797e+00   0.097 0.922462    
## `artist_names_Ed Sheeran`          2.185e+01  3.551e+00   6.155 7.97e-10 ***
## `artist_names_Imagine Dragons`     2.593e+01  3.910e+00   6.632 3.57e-11 ***
## `source_Taylor Swift`             -9.826e+00  1.826e+00  -5.380 7.72e-08 ***
## `artist_names_Travis Scott`        1.247e+01  4.414e+00   2.824 0.004757 ** 
## `source_BIGHIT MUSIC`             -6.736e+00  2.195e+00  -3.068 0.002161 ** 
## speechiness                       -1.694e+01  2.620e+00  -6.466 1.08e-10 ***
## liveness                          -7.935e+00  2.133e+00  -3.720 0.000201 ***
## valence                            5.549e+00  1.313e+00   4.226 2.41e-05 ***
## poly(duration_ms, 2, raw = TRUE)1  1.099e-04  2.122e-05   5.178 2.31e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.866e-10  4.247e-11  -4.395 1.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 560.6161)
## 
##     Null deviance: 3777230  on 6512  degrees of freedom
## Residual deviance: 3645126  on 6502  degrees of freedom
## AIC: 59717
## 
## Number of Fisher Scoring iterations: 2

Calculating VIF for Final Features

# Variance inflation factors for the final linear model (car package).
car::vif(lm9)
##                                      GVIF Df GVIF^(1/(2*Df))
## `artist_names_Ed Sheeran`        1.004971  1        1.002482
## `artist_names_Imagine Dragons`   1.003103  1        1.001550
## `source_Taylor Swift`            1.030269  1        1.015022
## `artist_names_Travis Scott`      1.003498  1        1.001748
## `source_BIGHIT MUSIC`            1.004168  1        1.002082
## speechiness                      1.026032  1        1.012933
## liveness                         1.007387  1        1.003687
## valence                          1.031590  1        1.015672
## poly(duration_ms, 2, raw = TRUE) 1.052358  2        1.012840

Getting Adjusted R-Squared Value for Final Model

# Same specification as lm9, refit with lm() so that summary() reports the
# multiple and adjusted R-squared.
lm91 <- lm(
  weeks_on_chart ~
    `artist_names_Ed Sheeran` +
    `artist_names_Imagine Dragons` +
    `source_Taylor Swift` +
    `artist_names_Travis Scott` +
    `source_BIGHIT MUSIC` +
    speechiness +
    liveness +
    valence +
    poly(duration_ms, 2, raw = TRUE),
  data = spotifydummynames
)
summary(lm91)
## 
## Call:
## lm(formula = weeks_on_chart ~ `artist_names_Ed Sheeran` + `artist_names_Imagine Dragons` + 
##     `source_Taylor Swift` + `artist_names_Travis Scott` + `source_BIGHIT MUSIC` + 
##     speechiness + liveness + valence + poly(duration_ms, 2, raw = TRUE), 
##     data = spotifydummynames)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.05 -11.64  -7.20   2.47 325.26 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.723e-01  2.797e+00   0.097 0.922462    
## `artist_names_Ed Sheeran`          2.185e+01  3.551e+00   6.155 7.97e-10 ***
## `artist_names_Imagine Dragons`     2.593e+01  3.910e+00   6.632 3.57e-11 ***
## `source_Taylor Swift`             -9.826e+00  1.826e+00  -5.380 7.72e-08 ***
## `artist_names_Travis Scott`        1.247e+01  4.414e+00   2.824 0.004757 ** 
## `source_BIGHIT MUSIC`             -6.736e+00  2.195e+00  -3.068 0.002161 ** 
## speechiness                       -1.694e+01  2.620e+00  -6.466 1.08e-10 ***
## liveness                          -7.935e+00  2.133e+00  -3.720 0.000201 ***
## valence                            5.549e+00  1.313e+00   4.226 2.41e-05 ***
## poly(duration_ms, 2, raw = TRUE)1  1.099e-04  2.122e-05   5.178 2.31e-07 ***
## poly(duration_ms, 2, raw = TRUE)2 -1.866e-10  4.247e-11  -4.395 1.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.68 on 6502 degrees of freedom
## Multiple R-squared:  0.03497,    Adjusted R-squared:  0.03349 
## F-statistic: 23.56 on 10 and 6502 DF,  p-value: < 2.2e-16

Saving Significant Features in Dataset For Future Use and Creating Longevity Binary Variable

# Work on a copy of the raw data so `spotify` itself is left unmodified.
spotify3 <- spotify

# Short-named copies of the dummy columns used in the final linear model.
spotify3$Ed       <- spotifydummynames$`artist_names_Ed Sheeran`
spotify3$ImgDrg   <- spotifydummynames$`artist_names_Imagine Dragons`
spotify3$TSSource <- spotifydummynames$`source_Taylor Swift`
spotify3$BIGHIT   <- spotifydummynames$`source_BIGHIT MUSIC`
spotify3$TScott   <- spotifydummynames$`artist_names_Travis Scott`

# Binary response: 1 when a song's weeks_on_chart exceeds the overall mean,
# 0 otherwise.
spotify3$longevity <- as.numeric(
  spotify$weeks_on_chart > mean(spotify$weeks_on_chart)
)

# Two-column matrix holding the raw linear and quadratic duration terms.
spotify3$polydur <- poly(spotify$duration_ms, 2, raw = TRUE)

Building the Logistic Regression Model

Attempt 1

# Attempt 1: binomial GLM of longevity on three artist/source dummies plus
# every audio feature and the quadratic duration term.
lr1 <- glm(
  longevity ~
    Ed + ImgDrg + TSSource +
    key + mode + time_signature +
    danceability + energy + speechiness + acousticness +
    instrumentalness + liveness + valence + loudness + tempo +
    polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr1)
## 
## Call:
## glm(formula = longevity ~ Ed + ImgDrg + TSSource + key + mode + 
##     time_signature + danceability + energy + speechiness + acousticness + 
##     instrumentalness + liveness + valence + loudness + tempo + 
##     polydur, family = "binomial", data = spotify3)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.828e+00  6.510e-01  -2.809 0.004976 ** 
## Ed                     5.284e-02  3.192e-01   0.166 0.868522    
## ImgDrg                 1.232e+00  3.417e-01   3.605 0.000313 ***
## TSSource              -1.465e+00  2.681e-01  -5.463 4.69e-08 ***
## keyA#/Bb               4.186e-02  1.475e-01   0.284 0.776542    
## keyB                   4.052e-01  1.355e-01   2.990 0.002791 ** 
## keyC                  -4.125e-02  1.340e-01  -0.308 0.758249    
## keyC#/Db              -6.998e-02  1.275e-01  -0.549 0.582949    
## keyD                   1.185e-01  1.414e-01   0.838 0.402064    
## keyD#/Eb              -5.291e-02  1.989e-01  -0.266 0.790277    
## keyE                   2.150e-02  1.511e-01   0.142 0.886861    
## keyF                   1.427e-01  1.397e-01   1.021 0.307127    
## keyF#/Gb               3.253e-02  1.430e-01   0.228 0.820032    
## keyG                  -9.454e-02  1.399e-01  -0.676 0.499115    
## keyG#/Ab               2.006e-01  1.382e-01   1.452 0.146502    
## modeMinor             -7.128e-02  6.048e-02  -1.179 0.238557    
## time_signature3 beats -7.661e-01  4.492e-01  -1.705 0.088109 .  
## time_signature4 beats -6.988e-01  4.326e-01  -1.615 0.106225    
## time_signature5 beats -1.120e+00  5.105e-01  -2.195 0.028166 *  
## danceability           6.869e-01  2.433e-01   2.824 0.004750 ** 
## energy                -8.124e-02  2.893e-01  -0.281 0.778824    
## speechiness           -2.033e+00  2.989e-01  -6.801 1.04e-11 ***
## acousticness           8.654e-02  1.459e-01   0.593 0.553068    
## instrumentalness      -7.813e-01  4.826e-01  -1.619 0.105440    
## liveness              -6.802e-01  2.178e-01  -3.122 0.001794 ** 
## valence                5.917e-01  1.482e-01   3.993 6.52e-05 ***
## loudness               5.661e-02  1.794e-02   3.156 0.001600 ** 
## tempo                 -4.754e-04  1.000e-03  -0.475 0.634586    
## polydur1               1.323e-05  3.170e-06   4.174 3.00e-05 ***
## polydur2              -2.452e-11  6.911e-12  -3.547 0.000389 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7737.7  on 6512  degrees of freedom
## Residual deviance: 7465.7  on 6483  degrees of freedom
## AIC: 7525.7
## 
## Number of Fisher Scoring iterations: 5

Attempt 2

# Attempt 2: refit with a reduced predictor set (Ed, mode, time_signature,
# energy, acousticness, instrumentalness and tempo are no longer included).
lr2 <- glm(
  longevity ~
    ImgDrg + TSSource +
    key +
    danceability + speechiness + liveness + valence + loudness +
    polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr2)
## 
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + key + danceability + 
##     speechiness + liveness + valence + loudness + polydur, family = "binomial", 
##     data = spotify3)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.705e+00  4.306e-01  -6.282 3.34e-10 ***
## ImgDrg        1.211e+00  3.406e-01   3.557 0.000375 ***
## TSSource     -1.439e+00  2.674e-01  -5.381 7.43e-08 ***
## keyA#/Bb      3.395e-02  1.470e-01   0.231 0.817417    
## keyB          3.976e-01  1.350e-01   2.945 0.003235 ** 
## keyC         -1.789e-02  1.333e-01  -0.134 0.893231    
## keyC#/Db     -5.599e-02  1.267e-01  -0.442 0.658639    
## keyD          1.471e-01  1.404e-01   1.048 0.294526    
## keyD#/Eb     -3.114e-02  1.983e-01  -0.157 0.875194    
## keyE          2.079e-02  1.505e-01   0.138 0.890127    
## keyF          1.414e-01  1.392e-01   1.015 0.309947    
## keyF#/Gb      3.578e-02  1.425e-01   0.251 0.801680    
## keyG         -8.549e-02  1.394e-01  -0.613 0.539833    
## keyG#/Ab      2.245e-01  1.373e-01   1.635 0.102056    
## danceability  6.942e-01  2.274e-01   3.052 0.002272 ** 
## speechiness  -2.102e+00  2.922e-01  -7.192 6.39e-13 ***
## liveness     -6.895e-01  2.160e-01  -3.192 0.001413 ** 
## valence       5.929e-01  1.386e-01   4.278 1.89e-05 ***
## loudness      5.209e-02  1.294e-02   4.026 5.67e-05 ***
## polydur1      1.332e-05  3.142e-06   4.239 2.24e-05 ***
## polydur2     -2.469e-11  6.865e-12  -3.596 0.000323 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7737.7  on 6512  degrees of freedom
## Residual deviance: 7476.6  on 6492  degrees of freedom
## AIC: 7518.6
## 
## Number of Fisher Scoring iterations: 5

Attempt 3

# Attempt 3: swap the full `key` factor for a single key-of-B indicator taken
# from the key dummy table.
spotify3$KeyB <- spotifydummykeys$key_B
lr3 <- glm(
  longevity ~
    ImgDrg + TSSource + KeyB +
    danceability + speechiness + liveness + valence + loudness +
    polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr3)
## 
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + KeyB + danceability + 
##     speechiness + liveness + valence + loudness + polydur, family = "binomial", 
##     data = spotify3)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.674e+00  4.186e-01  -6.389 1.67e-10 ***
## ImgDrg        1.201e+00  3.397e-01   3.535 0.000407 ***
## TSSource     -1.452e+00  2.669e-01  -5.440 5.32e-08 ***
## KeyB          3.619e-01  9.454e-02   3.828 0.000129 ***
## danceability  6.888e-01  2.266e-01   3.040 0.002365 ** 
## speechiness  -2.105e+00  2.900e-01  -7.260 3.87e-13 ***
## liveness     -6.956e-01  2.156e-01  -3.227 0.001252 ** 
## valence       6.024e-01  1.380e-01   4.366 1.27e-05 ***
## loudness      5.119e-02  1.290e-02   3.970 7.20e-05 ***
## polydur1      1.333e-05  3.134e-06   4.255 2.09e-05 ***
## polydur2     -2.475e-11  6.847e-12  -3.614 0.000301 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7737.7  on 6512  degrees of freedom
## Residual deviance: 7486.8  on 6502  degrees of freedom
## AIC: 7508.8
## 
## Number of Fisher Scoring iterations: 5

Adding Longevity to Top Artists/Sources DataFrames

# Flag longevity in both subset frames against the same threshold used for
# spotify3: the mean weeks_on_chart of the full `spotify` data.
popsource$longevity <- as.numeric(
  popsource$weeks_on_chart > mean(spotify$weeks_on_chart)
)
artsource$longevity <- as.numeric(
  artsource$weeks_on_chart > mean(spotify$weeks_on_chart)
)

Attempt 4 (Finding Significant Artists)

# Attempt 4: binomial GLM of longevity on artist alone, within `artsource`,
# to see which artists differ from the baseline level.
lr4 <- glm(
  longevity ~ artist_names,
  data = artsource,
  family = "binomial"
)
summary(lr4)
## 
## Call:
## glm(formula = longevity ~ artist_names, family = "binomial", 
##     data = artsource)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                 -1.34373    0.45842  -2.931  0.00338 **
## artist_namesAriana Grande    0.15415    0.55074   0.280  0.77956   
## artist_namesBad Bunny        0.81764    0.52090   1.570  0.11649   
## artist_namesBillie Eilish    0.84296    0.53893   1.564  0.11779   
## artist_namesBTS             -0.71996    0.59247  -1.215  0.22430   
## artist_namesDrake           -0.43755    0.53214  -0.822  0.41093   
## artist_namesEd Sheeran       0.74903    0.55419   1.352  0.17651   
## artist_namesEminem          -0.07765    0.62248  -0.125  0.90073   
## artist_namesHarry Styles     0.10697    0.59457   0.180  0.85722   
## artist_namesImagine Dragons  1.72673    0.56768   3.042  0.00235 **
## artist_namesJ. Cole         -0.77653    0.76386  -1.017  0.30935   
## artist_namesJuice WRLD      -0.15019    0.56698  -0.265  0.79109   
## artist_namesJustin Bieber    0.47874    0.62272   0.769  0.44202   
## artist_namesKanye West      -1.38629    0.75181  -1.844  0.06519 . 
## artist_namesLana Del Rey    -0.81575    0.76285  -1.069  0.28491   
## artist_namesLil Uzi Vert    -0.04256    0.60531  -0.070  0.94395   
## artist_namesOlivia Rodrigo   0.93827    0.59079   1.588  0.11225   
## artist_namesPost Malone     -0.06503    0.56857  -0.114  0.90894   
## artist_namesSam Smith        0.19860    0.63123   0.315  0.75305   
## artist_namesShawn Mendes     0.65059    0.60012   1.084  0.27832   
## artist_namesTaylor Swift    -0.49549    0.50836  -0.975  0.32972   
## artist_namesThe Weeknd       0.39927    0.55619   0.718  0.47284   
## artist_namesTravis Scott     0.54523    0.60931   0.895  0.37088   
## artist_namesXXXTENTACION     0.30764    0.57733   0.533  0.59412   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1312.6  on 1225  degrees of freedom
## Residual deviance: 1226.1  on 1202  degrees of freedom
## AIC: 1274.1
## 
## Number of Fisher Scoring iterations: 5

Attempt 5 (Finding Significant Sources)

# Attempt 5: binomial GLM of longevity on source (label) alone, within
# `popsource`, to see which sources differ from the baseline level.
lr5 <- glm(
  longevity ~ source,
  data = popsource,
  family = "binomial"
)
summary(lr5)
## 
## Call:
## glm(formula = longevity ~ source, family = "binomial", data = popsource)
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  -0.816761   0.275477  -2.965
## sourceAtlantic Records                        0.671049   0.335327   2.001
## sourceAtlantic Records UK                     0.275630   0.341253   0.808
## sourceBIGHIT MUSIC                           -1.045379   0.384832  -2.716
## sourceColumbia                               -0.269272   0.304314  -0.885
## sourceDef Jam Recordings                     -1.303502   0.512386  -2.544
## sourceDisruptor Records/Columbia              0.034002   0.408606   0.083
## sourceGeneration Now/Atlantic                -0.586063   0.398682  -1.470
## sourceGrade A Productions/Interscope Records -1.297772   0.485772  -2.672
## sourceIsland Records                          0.395548   0.393391   1.005
## sourceOVO                                    -1.015820   0.469986  -2.161
## sourcePolydor Records                        -0.453701   0.394881  -1.149
## sourceRCA Records Label                       0.005831   0.357036   0.016
## sourceRepublic Records                       -0.276937   0.309062  -0.896
## sourceRimas Entertainment LLC                 0.344157   0.331221   1.039
## sourceSony Music Latin                        0.978402   0.329176   2.972
## sourceTaylor Swift                           -1.498246   0.380170  -3.941
## sourceUMLE - Latino                           1.070995   0.350159   3.059
## sourceWarner Records                         -0.099530   0.332997  -0.299
## sourceWEA Latina                              0.205852   0.396346   0.519
##                                              Pr(>|z|)    
## (Intercept)                                   0.00303 ** 
## sourceAtlantic Records                        0.04537 *  
## sourceAtlantic Records UK                     0.41926    
## sourceBIGHIT MUSIC                            0.00660 ** 
## sourceColumbia                                0.37624    
## sourceDef Jam Recordings                      0.01096 *  
## sourceDisruptor Records/Columbia              0.93368    
## sourceGeneration Now/Atlantic                 0.14156    
## sourceGrade A Productions/Interscope Records  0.00755 ** 
## sourceIsland Records                          0.31466    
## sourceOVO                                     0.03067 *  
## sourcePolydor Records                         0.25057    
## sourceRCA Records Label                       0.98697    
## sourceRepublic Records                        0.37022    
## sourceRimas Entertainment LLC                 0.29878    
## sourceSony Music Latin                        0.00296 ** 
## sourceTaylor Swift                           8.11e-05 ***
## sourceUMLE - Latino                           0.00222 ** 
## sourceWarner Records                          0.76502    
## sourceWEA Latina                              0.60350    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2643.9  on 2215  degrees of freedom
## Residual deviance: 2455.5  on 2196  degrees of freedom
## AIC: 2495.5
## 
## Number of Fisher Scoring iterations: 4
# Copy the source/artist dummies flagged in Attempts 4 and 5 into spotify3
# under short names. AtlRec is assigned once here; the original assigned it a
# second time with the identical right-hand side.
spotify3$LatUMLE <- spotifydummynames$`source_UMLE - Latino`
spotify3$LatSony <- spotifydummynames$`source_Sony Music Latin`
spotify3$AtlRec <- spotifydummynames$`source_Atlantic Records`
spotify3$DefJam <- spotifydummynames$`source_Def Jam Recordings`
spotify3$GradeA <- spotifydummynames$`source_Grade A Productions/Interscope Records`
spotify3$OVO <- spotifydummynames$source_OVO
spotify3$Kanye <- spotifydummynames$`artist_names_Kanye West`

# Recompute the binary response (1 = weeks_on_chart above the overall mean).
# NOTE(review): this looks identical to the longevity column created when
# spotify3 was first built; kept so the column is guaranteed current.
spotify3$longevity <- ifelse(
  spotify3$weeks_on_chart > mean(spotify$weeks_on_chart), 1, 0
)

Attempt 6

# Attempt 6: Attempt 3's predictors plus the source/artist dummies added to
# spotify3 after Attempts 4 and 5.
lr6 <- glm(
  longevity ~
    ImgDrg + TSSource + LatUMLE + LatSony + Kanye +
    BIGHIT + DefJam + AtlRec + GradeA + OVO +
    KeyB +
    danceability + speechiness + liveness + valence + loudness +
    polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr6)
## 
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + LatUMLE + LatSony + 
##     Kanye + BIGHIT + DefJam + AtlRec + GradeA + OVO + KeyB + 
##     danceability + speechiness + liveness + valence + loudness + 
##     polydur, family = "binomial", data = spotify3)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.561e+00  4.188e-01  -6.115 9.66e-10 ***
## ImgDrg        1.200e+00  3.392e-01   3.537 0.000405 ***
## TSSource     -1.472e+00  2.669e-01  -5.515 3.49e-08 ***
## LatUMLE       9.300e-01  2.228e-01   4.174 2.99e-05 ***
## LatSony       8.117e-01  1.868e-01   4.345 1.39e-05 ***
## Kanye        -1.464e+00  6.007e-01  -2.437 0.014800 *  
## BIGHIT       -1.091e+00  2.724e-01  -4.007 6.14e-05 ***
## DefJam       -1.245e+00  4.380e-01  -2.842 0.004485 ** 
## AtlRec        7.963e-01  1.963e-01   4.057 4.97e-05 ***
## GradeA       -1.125e+00  4.036e-01  -2.787 0.005321 ** 
## OVO          -6.755e-01  3.863e-01  -1.749 0.080342 .  
## KeyB          3.466e-01  9.576e-02   3.620 0.000295 ***
## danceability  6.156e-01  2.284e-01   2.695 0.007032 ** 
## speechiness  -2.063e+00  2.928e-01  -7.044 1.86e-12 ***
## liveness     -6.719e-01  2.171e-01  -3.095 0.001969 ** 
## valence       4.755e-01  1.404e-01   3.387 0.000708 ***
## loudness      4.669e-02  1.300e-02   3.591 0.000330 ***
## polydur1      1.273e-05  3.114e-06   4.088 4.35e-05 ***
## polydur2     -2.283e-11  6.771e-12  -3.371 0.000748 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7737.7  on 6512  degrees of freedom
## Residual deviance: 7380.0  on 6494  degrees of freedom
## AIC: 7418
## 
## Number of Fisher Scoring iterations: 5

Final Logistic Regression Model/Features

# Final logistic model: Attempt 6 without the Kanye and OVO indicators.
lr7 <- glm(
  longevity ~
    ImgDrg + TSSource + LatUMLE + LatSony +
    BIGHIT + DefJam + AtlRec + GradeA +
    KeyB +
    danceability + speechiness + liveness + valence + loudness +
    polydur,
  data = spotify3,
  family = "binomial"
)
summary(lr7)
## 
## Call:
## glm(formula = longevity ~ ImgDrg + TSSource + LatUMLE + LatSony + 
##     BIGHIT + DefJam + AtlRec + GradeA + KeyB + danceability + 
##     speechiness + liveness + valence + loudness + polydur, family = "binomial", 
##     data = spotify3)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.620e+00  4.180e-01  -6.269 3.65e-10 ***
## ImgDrg        1.209e+00  3.393e-01   3.564 0.000365 ***
## TSSource     -1.458e+00  2.668e-01  -5.463 4.67e-08 ***
## LatUMLE       9.350e-01  2.229e-01   4.195 2.73e-05 ***
## LatSony       8.162e-01  1.869e-01   4.368 1.26e-05 ***
## BIGHIT       -1.082e+00  2.724e-01  -3.973 7.10e-05 ***
## DefJam       -1.229e+00  4.380e-01  -2.806 0.005020 ** 
## AtlRec        8.075e-01  1.964e-01   4.112 3.93e-05 ***
## GradeA       -1.113e+00  4.036e-01  -2.756 0.005847 ** 
## KeyB          3.487e-01  9.568e-02   3.644 0.000268 ***
## danceability  6.414e-01  2.277e-01   2.816 0.004855 ** 
## speechiness  -2.125e+00  2.922e-01  -7.272 3.53e-13 ***
## liveness     -6.806e-01  2.171e-01  -3.135 0.001719 ** 
## valence       4.963e-01  1.401e-01   3.543 0.000395 ***
## loudness      4.707e-02  1.298e-02   3.626 0.000288 ***
## polydur1      1.309e-05  3.120e-06   4.195 2.73e-05 ***
## polydur2     -2.381e-11  6.795e-12  -3.504 0.000458 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7737.7  on 6512  degrees of freedom
## Residual deviance: 7392.4  on 6496  degrees of freedom
## AIC: 7426.4
## 
## Number of Fisher Scoring iterations: 4

VIF for Final Logistic Regression Model

# Variance inflation factors for the final logistic model (car package).
car::vif(lr7)
##                  GVIF Df GVIF^(1/(2*Df))
## ImgDrg       1.007076  1        1.003532
## TSSource     1.018807  1        1.009360
## LatUMLE      1.016570  1        1.008251
## LatSony      1.032258  1        1.016001
## BIGHIT       1.005950  1        1.002971
## DefJam       1.005443  1        1.002718
## AtlRec       1.002963  1        1.001480
## GradeA       1.003122  1        1.001560
## KeyB         1.007094  1        1.003541
## danceability 1.231459  1        1.109711
## speechiness  1.060560  1        1.029835
## liveness     1.020069  1        1.009985
## valence      1.262942  1        1.123807
## loudness     1.143622  1        1.069403
## polydur      1.075281  2        1.018311

Creating a Copy of the DataFrame

# Separate copy of the engineered frame, used for export and the summaries.
spotify4 <- spotify3

Writing the Copied DataFrame to a CSV File for Use With Python

# Export the engineered frame to the working directory.
# NOTE(review): row names are written with write.csv's default setting, so the
# file gains a leading index column - confirm the Python reader expects that.
write.csv(x = spotify4, file = "spotselect.csv")

Looking At Differences in Means Between Songs With And Without Longevity

# Per-group means (longevity = 0 vs 1) of the indicator columns and audio
# features; for the 0/1 indicators the mean is the share of songs in the group.
spotify4 |>
  group_by(longevity) |>
  summarise(
    imgd     = mean(ImgDrg),
    tssource = mean(TSSource),
    latUM    = mean(LatUMLE),
    sony     = mean(LatSony),
    defjam   = mean(DefJam),
    atlrec   = mean(AtlRec),
    keyb     = mean(KeyB),
    bighit   = mean(BIGHIT),
    dance    = mean(danceability),
    speech   = mean(speechiness),
    live     = mean(liveness),
    val      = mean(valence),
    loud     = mean(loudness),
    dur      = mean(duration_ms)
  )
## # A tibble: 2 × 15
##   longevity    imgd tssource   latUM   sony  defjam atlrec   keyb  bighit dance
##       <dbl>   <dbl>    <dbl>   <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl> <dbl>
## 1         0 0.00320  0.0346  0.00812 0.0122 0.0107  0.0126 0.0784 0.0220  0.676
## 2         1 0.0120   0.00874 0.0268  0.0366 0.00328 0.0279 0.109  0.00874 0.696
## # ℹ 5 more variables: speech <dbl>, live <dbl>, val <dbl>, loud <dbl>,
## #   dur <dbl>