library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the raw Spotify track table from a local CSV export.
# NOTE(review): this absolute path is specific to one machine; anyone else
# running the script has to point it at their own copy of the file.
spotify_songs <- read.csv(
  file = "C:/Users/priya/Downloads/spotify_songs.csv",
  header = TRUE
)

# Show the first six rows as a quick sanity check of the import
head(spotify_songs, n = 6L)

##                 track_id                                            track_name
## 1 6f807x0ima9a1j3VPbc7VN I Don't Care (with Justin Bieber) - Loud Luxury Remix
## 2 0r7CVbZTWZgbTCYdfa2P31                       Memories - Dillon Francis Remix
## 3 1z1Hg7Vb0AhHDiEmnDE79l                       All the Time - Don Diablo Remix
## 4 75FpbthrwQmzHlBJLuGdC7                     Call You Mine - Keanu Silva Remix
## 5 1e8PAfcKUYoKkxPhrHqw4x               Someone You Loved - Future Humans Remix
## 6 7fvUMiyapMsRRxr07cU8Ef     Beautiful People (feat. Khalid) - Jack Wins Remix
##       track_artist track_popularity         track_album_id
## 1       Ed Sheeran               66 2oCs0DGTsRO98Gh5ZSl2Cx
## 2         Maroon 5               67 63rPSO264uRjW1X5E6cWv6
## 3     Zara Larsson               70 1HoSmj2eLcsrR0vE9gThr4
## 4 The Chainsmokers               60 1nqYsOef1yKKuGOVchbsk6
## 5    Lewis Capaldi               69 7m7vv9wlQ4i0LFuJiE2zsQ
## 6       Ed Sheeran               67 2yiy9cd2QktrNvWC2EUi0k
##                                        track_album_name
## 1 I Don't Care (with Justin Bieber) [Loud Luxury Remix]
## 2                       Memories (Dillon Francis Remix)
## 3                       All the Time (Don Diablo Remix)
## 4                           Call You Mine - The Remixes
## 5               Someone You Loved (Future Humans Remix)
## 6     Beautiful People (feat. Khalid) [Jack Wins Remix]
##   track_album_release_date playlist_name            playlist_id playlist_genre
## 1               2019-06-14     Pop Remix 37i9dQZF1DXcZDD7cfEKhW            pop
## 2               2019-12-13     Pop Remix 37i9dQZF1DXcZDD7cfEKhW            pop
## 3               2019-07-05     Pop Remix 37i9dQZF1DXcZDD7cfEKhW            pop
## 4               2019-07-19     Pop Remix 37i9dQZF1DXcZDD7cfEKhW            pop
## 5               2019-03-05     Pop Remix 37i9dQZF1DXcZDD7cfEKhW            pop
## 6               2019-07-11     Pop Remix 37i9dQZF1DXcZDD7cfEKhW            pop
##   playlist_subgenre danceability energy key loudness mode speechiness
## 1         dance pop        0.748  0.916   6   -2.634    1      0.0583
## 2         dance pop        0.726  0.815  11   -4.969    1      0.0373
## 3         dance pop        0.675  0.931   1   -3.432    0      0.0742
## 4         dance pop        0.718  0.930   7   -3.778    1      0.1020
## 5         dance pop        0.650  0.833   1   -4.672    1      0.0359
## 6         dance pop        0.675  0.919   8   -5.385    1      0.1270
##   acousticness instrumentalness liveness valence   tempo duration_ms
## 1       0.1020         0.00e+00   0.0653   0.518 122.036      194754
## 2       0.0724         4.21e-03   0.3570   0.693  99.972      162600
## 3       0.0794         2.33e-05   0.1100   0.613 124.008      176616
## 4       0.0287         9.43e-06   0.2040   0.277 121.956      169093
## 5       0.0803         0.00e+00   0.0833   0.725 123.976      189052
## 6       0.0799         0.00e+00   0.1430   0.585 124.982      163049

# Keep only the response (track_popularity) and the five audio features used
# as predictors, then discard every row with a missing value in those columns.
spotify_data <- tidyr::drop_na(
  dplyr::select(
    spotify_songs,
    track_popularity, danceability, energy, speechiness, loudness, valence
  )
)

# Ordinary least-squares fit: track_popularity is treated as a continuous
# response and regressed on all five audio features at once.
lm_model <- lm(
  formula = track_popularity ~ danceability + energy + speechiness +
    loudness + valence,
  data = spotify_data
)

# Coefficient table, residual summary and fit statistics for the linear model
summary(object = lm_model)

## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + speechiness + 
##     loudness + valence, data = spotify_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.607 -17.789   3.128  19.013  83.732 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   78.81067    1.29418  60.896  < 2e-16 ***
## danceability   2.94569    1.01729   2.896  0.00379 ** 
## energy       -39.09994    1.04541 -37.402  < 2e-16 ***
## speechiness   -2.88096    1.35111  -2.132  0.03299 *  
## loudness       2.05534    0.06197  33.166  < 2e-16 ***
## valence        6.21419    0.62708   9.910  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.38 on 32827 degrees of freedom
## Multiple R-squared:  0.04803,    Adjusted R-squared:  0.04788 
## F-statistic: 331.2 on 5 and 32827 DF,  p-value: < 2.2e-16

# A generalized linear model needs a suitable response; for demonstration we
# derive a binary outcome from the continuous popularity score.
# 'popular' is 1 when track_popularity is above 50 and 0 otherwise.

spotify_data <- dplyr::mutate(
  spotify_data,
  popular = as.numeric(track_popularity > 50)
)

# Logistic regression (binomial family) of the 'popular' flag on the same five
# audio features used in the linear model
glm_model <- glm(
  formula = popular ~ danceability + energy + speechiness + loudness + valence,
  family = binomial,
  data = spotify_data
)

# Coefficient table and deviance statistics for the logistic model
summary(object = glm_model)

## 
## Call:
## glm(formula = popular ~ danceability + energy + speechiness + 
##     loudness + valence, family = binomial, data = spotify_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.536842   0.112939  22.462   <2e-16 ***
## danceability -0.045701   0.086453  -0.529    0.597    
## energy       -2.893195   0.092245 -31.364   <2e-16 ***
## speechiness  -0.260021   0.114363  -2.274    0.023 *  
## loudness      0.162582   0.005624  28.906   <2e-16 ***
## valence       0.626477   0.053618  11.684   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44779  on 32832  degrees of freedom
## Residual deviance: 43540  on 32827  degrees of freedom
## AIC: 43552
## 
## Number of Fisher Scoring iterations: 4

Linear Model: Uses lm() to model track_popularity as a function of song features like danceability and energy. Generalized Linear Model: Creates a binary popular variable (for logistic regression) based on a threshold popularity score, then models it using glm() with a binomial family.

Insights Gathered

Linear Model for Track Popularity

The linear model aims to explain song popularity (a continuous measure) based on various song attributes such as danceability, energy, speechiness, loudness, and valence.
By examining the model’s summary, we can identify which variables significantly impact song popularity. For instance, positive coefficients for danceability or energy would suggest that songs with higher values in these features tend to be more popular.

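Insight: The model shows that certain song characteristics are associated with higher popularity. Danceability (2.95), loudness (2.06), and valence (6.21) have positive, statistically significant coefficients, while energy (-39.10) and speechiness (-2.88) have negative ones. The positive danceability coefficient is consistent with listeners preferring songs with more rhythm or a beat suitable for dancing. However, the model explains only about 5% of the variance in popularity (R-squared = 0.048), so these features alone are weak predictors of popularity.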

Generalized Linear Model for “Popular” Status

By categorizing songs as either “popular” (above a threshold) or “not popular,” we can use a logistic regression model to analyze what song attributes are associated with this binary outcome.
This model provides insights into how the likelihood of a song being “popular” changes with each explanatory variable. For instance, a significant positive effect of energy would mean that songs with higher energy levels are more likely to achieve a “popular” status.

Insight: Logistic regression results may highlight factors with a strong influence on a song achieving widespread appeal, suggesting that certain features (like a specific level of loudness or valence) might be desirable for producing hits.

Significance:

Practical Implications: These findings can help music producers, songwriters, or streaming platforms understand the features that may lead to higher song popularity, enabling them to tailor content for target audiences.
Market Trends: Recognizing trends in song characteristics that predict popularity could offer insights into consumer preferences in music, revealing what resonates most with listeners.

Further Questions for Investigation

Non-linear Relationships: Are there non-linear relationships between song characteristics and popularity that a linear model might miss? Exploring polynomial or interaction terms could reveal subtler influences on popularity.
Additional Variables: Could other song attributes (e.g., genre, artist fame) further explain popularity? Adding these to the model might improve predictive power and highlight the significance of genre or artist-specific factors.
Time Trends: Do preferences change over time? Analyzing whether certain features have gained or lost significance over the years could reveal evolving music tastes.
Audience Segmentation: Does popularity vary across listener demographics? Building models for different age or geographic groups could uncover specific feature preferences in different listener segments.

# Load necessary libraries
library(tidyverse)
library(broom)

## Warning: package 'broom' was built under R version 4.3.3

library(car)

## Warning: package 'car' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(ggplot2)

# 1. Checking Multicollinearity
# Variance Inflation Factor (VIF) of every predictor in the linear model;
# car::vif() is called with its namespace so the source of the function is clear.
vif_values <- car::vif(mod = lm_model)
print(x = vif_values)

## danceability       energy  speechiness     loudness      valence 
##     1.203421     1.975960     1.035152     1.894768     1.180826

# Interpretation:
# If any VIF values are greater than 5 or 10, it indicates multicollinearity issues, suggesting
# some predictors might be highly correlated with each other.

# 2. Residual Diagnostics for Linear Model
# Checking residuals to ensure normality and homoscedasticity

# Plotting residuals vs fitted values to check for heteroscedasticity.
# The per-observation fitted values (.fitted) and residuals (.resid) are
# extracted explicitly with broom::augment() rather than handing the model
# object to ggplot(), which would rely on ggplot2's implicit fortify()
# conversion of lm objects.
# Points are drawn semi-transparent because the model was fitted to roughly
# 33,000 tracks and fully opaque points hide the density of the cloud.
ggplot(broom::augment(lm_model), aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals")

# Normal Q-Q plot of the linear-model residuals, with a blue reference line
# to judge how closely they follow a normal distribution
qqnorm(y = resid(lm_model))
qqline(y = resid(lm_model), col = "blue")

# 3. Check for Outliers and Influential Points
# Cook's distance to identify any influential data points
cooksd <- cooks.distance(lm_model)

# Rule-of-thumb cutoff of 4 / n, computed once so that the line on the plot
# and the selection below are guaranteed to use the same value.
cooks_threshold <- 4 / length(cooksd)

plot(cooksd, pch = "*", cex = 2, main = "Cook's Distance")
abline(h = cooks_threshold, col = "red")  # threshold line

# Positions (within the data used to fit lm_model) of the observations whose
# Cook's distance exceeds the cutoff.
# which() returns the positions directly, so the result no longer depends on
# the row names being numeric strings, and NA comparisons are dropped instead
# of producing NA entries. as.numeric() keeps the result a plain, unnamed
# numeric vector as before.
influential <- as.numeric(which(cooksd > cooks_threshold))
print(influential)

##    [1]    83   345   688   690   693   712   719   721   740   846   988  1091
##   [13]  1221  1259  1296  1300  1301  1304  1305  1309  1317  1318  1334  1354
##   [25]  1447  1549  1638  1639  1643  1663  1722  1738  1783  1811  1820  1853
##   [37]  1862  1866  1872  1876  1901  1905  1907  1926  1929  1966  2111  2115
##   [49]  2180  2181  2187  2254  2348  2380  2555  2591  2661  2685  2754  2795
##   [61]  2798  2802  2981  3025  3032  3060  3084  3155  3156  3225  3231  3266
##   [73]  3361  3365  3366  3444  3616  3628  3633  3653  3654  3658  3669  3769
##   [85]  3830  3870  3902  3933  3935  4174  4533  4562  4878  4976  4991  5096
##   [97]  5101  5102  5118  5243  5252  5261  5262  5263  5265  5273  5275  5279
##  [109]  5280  5281  5283  5284  5307  5308  5328  5330  5333  5334  5343  5352
##  [121]  5354  5356  5362  5379  5380  5383  5384  5386  5387  5398  5405  5414
##  [133]  5416  5508  5515  5523  5532  5543  5557  5663  5669  5673  5704  5713
##  [145]  5717  5726  5745  5861  5863  5882  5892  5894  5906  5908  6019  6097
##  [157]  6109  6111  6157  6172  6195  6210  6222  6224  6241  6296  6410  6462
##  [169]  6473  6491  6496  6522  6532  6538  6540  6549  6551  6557  6559  6604
##  [181]  6646  6656  6658  6689  6830  6840  6868  6876  6962  6996  7032  7044
##  [193]  7045  7046  7047  7051  7052  7068  7081  7088  7102  7178  7187  7188
##  [205]  7228  7321  7333  7339  7340  7348  7353  7360  7399  7400  7407  7410
##  [217]  7411  7441  7509  7604  7615  7644  7669  7683  7697  7709  7718  7720
##  [229]  7728  7744  7745  7751  7754  7755  7756  7763  7773  7787  7806  7821
##  [241]  7891  7905  7907  7912  7919  7921  7930  7937  7945  7956  7957  7960
##  [253]  7961  7962  7967  7972  7973  7992  8002  8076  8099  8102  8103  8110
##  [265]  8120  8153  8163  8168  8177  8225  8227  8238  8245  8253  8265  8267
##  [277]  8270  8298  8320  8323  8330  8332  8336  8340  8344  8346  8349  8352
##  [289]  8404  8541  8546  8547  8551  8554  8558  8563  8567  8583  8589  8591
##  [301]  8596  8598  8631  8635  8638  8639  8644  8646  8652  8654  8671  8680
##  [313]  8691  8695  8696  8708  8712  8720  8733  8736  8737  8738  8739  8741
##  [325]  8744  8747  8751  8761  8766  8771  8773  8776  8777  8778  8788  8795
##  [337]  8796  8807  8817  8821  8823  8836  8840  8841  8845  8846  8860  8874
##  [349]  8895  8900  8904  8912  8919  8935  8938  8944  8965  8982  8985  8994
##  [361]  8996  9029  9033  9034  9035  9040  9046  9048  9054  9061  9092  9098
##  [373]  9102  9105  9113  9142  9156  9168  9172  9173  9186  9189  9197  9230
##  [385]  9238  9239  9240  9246  9250  9261  9270  9283  9296  9301  9302  9314
##  [397]  9353  9357  9361  9363  9364  9383  9385  9396  9410  9418  9441  9443
##  [409]  9466  9473  9475  9490  9499  9511  9518  9565  9568  9569  9570  9571
##  [421]  9585  9586  9589  9606  9609  9627  9629  9637  9642  9655  9660  9684
##  [433]  9686  9698  9750  9752  9785  9786  9828  9865  9866  9869  9872  9873
##  [445]  9881  9888  9889  9891  9900  9901  9902  9949  9969 10005 10026 10155
##  [457] 10163 10451 10459 10460 10464 10469 10506 10673 10715 10803 10821 10975
##  [469] 11031 11032 11038 11042 11048 11052 11053 11069 11070 11072 11082 11098
##  [481] 11114 11131 11136 11181 11182 11194 11195 11196 11223 11236 11242 11247
##  [493] 11254 11256 11257 11259 11273 11318 11325 11327 11337 11339 11346 11349
##  [505] 11351 11361 11363 11438 11447 11492 11539 11558 11592 11601 11637 11638
##  [517] 11653 11661 11663 11680 11685 11689 11708 11712 11723 11724 11725 11743
##  [529] 11749 11755 11763 11774 11795 11820 11821 11851 11856 11875 11884 11898
##  [541] 11917 11931 11995 12011 12014 12026 12166 12168 12207 12210 12214 12222
##  [553] 12223 12264 12265 12268 12285 12320 12330 12333 12340 12343 12345 12348
##  [565] 12355 12391 12397 12406 12409 12440 12444 12450 12456 12494 12505 12524
##  [577] 12533 12550 12625 12626 12640 12641 12653 12716 12739 12770 12798 12825
##  [589] 12851 12878 12887 12986 12992 13036 13040 13042 13052 13053 13057 13058
##  [601] 13063 13066 13074 13111 13112 13124 13128 13192 13212 13264 13282 13335
##  [613] 13338 13352 13353 13355 13369 13381 13396 13409 13424 13434 13441 13456
##  [625] 13461 13464 13489 13511 13570 13571 13582 13631 13637 13640 13646 13689
##  [637] 13696 13717 13723 13724 13739 13742 13768 13777 13790 13793 13796 13900
##  [649] 13901 13952 14071 14149 14150 14154 14171 14192 14205 14253 14322 14360
##  [661] 14376 14422 14436 14446 14451 14482 14914 14921 14967 14990 15008 15052
##  [673] 15156 15166 15251 15252 15277 15298 15306 15322 15378 15382 15425 15435
##  [685] 15446 15469 15487 15515 15566 15611 15657 15661 15705 15962 16078 16085
##  [697] 16087 16099 16123 16142 16155 16156 16316 16522 16952 16963 16973 16992
##  [709] 17003 17004 17005 17006 17008 17009 17010 17011 17012 17013 17014 17015
##  [721] 17016 17017 17018 17019 17384 17592 17596 17610 17812 17873 17911 18046
##  [733] 18064 18071 18124 18125 18129 18130 18138 18145 18153 18190 18198 18298
##  [745] 18303 18309 18315 18320 18321 18327 18335 18337 18368 18379 18469 18568
##  [757] 18606 18746 18757 18780 18817 18907 19108 19248 19266 19348 19386 19428
##  [769] 19461 19474 19611 19651 19706 19711 19717 19723 19730 19731 19737 19745
##  [781] 19747 19780 19792 19922 19958 20013 20018 20051 20073 20081 20090 20110
##  [793] 20203 20213 20253 20282 20283 20427 20492 20495 20595 20637 20648 20739
##  [805] 20755 20763 20899 20975 21033 21036 21050 21061 21063 21072 21091 21102
##  [817] 21176 21178 21185 21187 21238 21244 21262 21356 21360 21371 21373 21378
##  [829] 21382 21393 21424 21445 21460 21491 21496 21543 21556 21565 21663 21673
##  [841] 21674 21695 21745 21756 21803 21893 21894 21924 21928 21929 21932 21944
##  [853] 22036 22043 22054 22070 22089 22102 22103 22104 22114 22116 22123 22133
##  [865] 22141 22143 22157 22171 22184 22195 22209 22210 22223 22228 22239 22243
##  [877] 22256 22257 22270 22271 22287 22288 22299 22301 22310 22375 22378 22396
##  [889] 22403 22405 22410 22411 22413 22421 22428 22429 22507 22657 22666 22679
##  [901] 22701 22702 22758 22784 22853 22933 22941 22942 22946 22990 23041 23069
##  [913] 23102 23104 23108 23117 23123 23140 23143 23234 23242 23248 23303 23305
##  [925] 23320 23397 23428 23448 23509 23513 23521 23523 23536 23538 23575 23598
##  [937] 23642 23653 23654 23695 23768 23791 23801 23823 23826 23841 23854 23902
##  [949] 23917 23918 23919 23934 23940 23943 23959 23973 23975 23980 23991 24012
##  [961] 24031 24064 24065 24077 24098 24103 24106 24117 24134 24141 24150 24306
##  [973] 24342 24345 24347 24351 24401 24466 24499 24516 24548 24609 24615 24620
##  [985] 24621 24625 24632 24651 24654 24655 24656 24657 24664 24679 24686 24690
##  [997] 24718 24721 24726 24735 24736 24744 24815 24816 24890 24903 25017 25043
## [1009] 25044 25071 25079 25091 25095 25097 25146 25159 25176 25181 25196 25208
## [1021] 25219 25220 25237 25249 25316 25325 25386 25437 25491 25502 25503 25510
## [1033] 25523 25527 25530 25543 25577 25607 25702 25703 25706 25726 25739 25780
## [1045] 25789 25794 25810 25926 25951 25980 25983 26002 26053 26059 26095 26103
## [1057] 26117 26130 26131 26136 26139 26140 26166 26183 26197 26203 26226 26243
## [1069] 26314 26342 26372 26373 26374 26375 26381 26395 26396 26397 26401 26402
## [1081] 26403 26408 26416 26428 26433 26452 26456 26457 26472 26474 26476 26477
## [1093] 26482 26502 26505 26510 26518 26522 26523 26527 26529 26530 26537 26539
## [1105] 26541 26542 26545 26547 26550 26575 26577 26585 26586 26587 26639 26668
## [1117] 26672 26674 26676 26682 26718 26719 26721 26727 26734 26735 26744 26758
## [1129] 26760 26771 26784 26788 26978 27214 27228 27450 27676 27709 27717 27889
## [1141] 27911 27918 28003 28018 28041 28221 28424 28475 28519 28532 28534 28591
## [1153] 28915 28919 28971 28975 29079 29331 29365 29387 29954 30056 30142 30199
## [1165] 30202 30207 30208 30211 30219 30224 30226 30230 30245 30248 30370 30389
## [1177] 30406 30447 30472 30520 30680 30750 30844 30955 30974 31066 31081 31106
## [1189] 31165 31199 31329 31427 31433 31462 31505 31631 31659 31672 31680 31783
## [1201] 31835 31867 31879 31910 31962 32003 32330 32336 32431 32498

# 4. Diagnostics for Generalized Linear Model (GLM)
# Check for Overdispersion (important for GLMs with binomial or Poisson family)

# Dispersion statistic: sum of squared Pearson residuals divided by the
# residual degrees of freedom of the fitted GLM.
# NOTE(review): the response here is an ungrouped 0/1 variable; this statistic
# is generally considered uninformative about overdispersion in that setting -
# confirm before drawing conclusions from it.
overdispersion <- local({
  pearson_chisq <- sum(resid(glm_model, type = "pearson")^2)
  pearson_chisq / df.residual(glm_model)
})
print(overdispersion)

## [1] 1.02295

# Interpretation:
# Overdispersion values significantly greater than 1 suggest overdispersion, indicating that the model
# may need a different distribution family or adjustment.

# Additional Residual Plot for GLM
# Deviance residuals of the logistic model against its fitted values, with a
# dashed red reference line at zero
residuals_glm <- resid(glm_model, type = "deviance")
plot(
  x = fitted(glm_model),
  y = residuals_glm,
  main = "GLM Residuals vs Fitted Values",
  xlab = "Fitted Values",
  ylab = "Deviance Residuals"
)
abline(h = 0, col = "red", lty = 2)

Explanation of Code

Multicollinearity (VIF): Checking the Variance Inflation Factor (VIF) for predictors in the linear model to detect multicollinearity. Values above 5 or 10 suggest high correlation between predictors, which can lead to unreliable estimates.

Residual Diagnostics for Linear Model:

Residuals vs Fitted Plot: Helps detect heteroscedasticity (non-constant variance of residuals).

Q-Q Plot: Verifies if residuals follow a normal distribution, an assumption in linear regression.

Outliers and Influential Points (Cook’s Distance): Cook’s distance helps identify influential points that may disproportionately affect the model fit. Points with high values warrant further inspection.

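Overdispersion in GLM: This check is mainly relevant for generalized linear models with count data or grouped binomial data. A dispersion statistic well above 1 suggests that the residual variance exceeds what the assumed distribution implies, potentially requiring a quasi-likelihood model or a more appropriate distribution family. Here the statistic is 1.02, which is close to 1; note, however, that for an ungrouped 0/1 response this statistic is not a reliable indicator of overdispersion.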

Residuals vs Fitted for GLM: A residual plot to verify that there’s no systematic pattern, which would indicate model misfit.

Insights & Significance

Multicollinearity (VIF) Analysis

Insight: The Variance Inflation Factor (VIF) analysis reveals whether there is multicollinearity among the explanatory variables. High VIF values (typically above 5 or 10) would indicate that some variables are highly correlated with each other, which could make it difficult to interpret the effect of each predictor on popularity accurately.

Significance: Multicollinearity can inflate the variance of coefficient estimates, leading to instability and making it challenging to distinguish the unique effect of each predictor. Addressing multicollinearity, such as by removing or combining correlated variables, can improve the model’s clarity and reliability.

Residual Diagnostics for the Linear Model

Residuals vs. Fitted Plot: This plot helps identify any non-constant variance in the residuals (heteroscedasticity). Ideally, the residuals should be randomly scattered without any clear pattern. If a pattern appears, it suggests heteroscedasticity, indicating that the variability in popularity may vary with different levels of predictors.

Q-Q Plot: This plot shows whether residuals follow a normal distribution. Deviations from the normal line suggest that residuals are not normally distributed, which could impact the accuracy of statistical inferences from the model.

Significance: If we observe heteroscedasticity or non-normal residuals, the assumptions of linear regression are violated, which can make standard errors, confidence intervals, and p-values unreliable. In such cases, we might consider transformations of the response variable or alternative modeling approaches.

Outliers and Influential Points (Cook’s Distance)

Insight: Cook’s distance helps identify influential data points that may have an undue effect on the model. Points with high Cook’s distance values (typically above a threshold line) could disproportionately impact the model’s accuracy.

Significance: Identifying influential points allows us to decide whether these data points represent meaningful extremes or potential data issues. Removing or adjusting for these points could improve model robustness, especially if they are unduly affecting model parameters.

Overdispersion in the GLM

Insight: The overdispersion check for the GLM helps determine if the variance in the response variable exceeds what is expected under the chosen distribution (binomial in this case). Values significantly above 1 suggest overdispersion.

Significance: Overdispersion can indicate that the binomial model is not a perfect fit for the data, suggesting that the model might need adjustments, such as a quasi-binomial model or including additional predictors.

GLM Residuals vs. Fitted Plot

Insight: This plot helps check for systematic patterns in the residuals of the GLM. Ideally, residuals should be randomly scattered around zero. A pattern suggests model misfit, meaning that our predictors may not fully capture the variability in song popularity.

Significance: Identifying misfit encourages model refinement, such as adding interaction terms or exploring alternative model specifications to better explain the data.

Further Questions for Investigation

Addressing Multicollinearity: If we find high multicollinearity, which variables could be removed or combined? Are there alternative ways to represent highly correlated features, such as using principal component analysis (PCA)?
Transformations for Linear Model Assumptions: If residual diagnostics reveal heteroscedasticity or non-normality, would transforming the response variable (e.g., log or square root transformation) improve model fit?
Handling Influential Points: Do high Cook’s distance points represent valuable insights (e.g., rare but popular songs) or errors that should be excluded? It may be valuable to rerun the model without these points to see how they impact results.
Alternative Models for Overdispersion: For the GLM, if overdispersion is an issue, would a quasi-binomial model or adding interaction terms improve fit? Could this suggest additional variables that might explain popularity?

# Re-display the linear model's coefficient table and fit statistics so the
# interpretation that follows can be read against them
summary(object = lm_model)

## 
## Call:
## lm(formula = track_popularity ~ danceability + energy + speechiness + 
##     loudness + valence, data = spotify_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.607 -17.789   3.128  19.013  83.732 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   78.81067    1.29418  60.896  < 2e-16 ***
## danceability   2.94569    1.01729   2.896  0.00379 ** 
## energy       -39.09994    1.04541 -37.402  < 2e-16 ***
## speechiness   -2.88096    1.35111  -2.132  0.03299 *  
## loudness       2.05534    0.06197  33.166  < 2e-16 ***
## valence        6.21419    0.62708   9.910  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.38 on 32827 degrees of freedom
## Multiple R-squared:  0.04803,    Adjusted R-squared:  0.04788 
## F-statistic: 331.2 on 5 and 32827 DF,  p-value: < 2.2e-16

# Interpretation of 'danceability' coefficient in the linear model
# Look up the danceability estimate in the summary's coefficient table and
# report it rounded to three decimals.
lm_danceability_coef <- coef(summary(lm_model))["danceability", "Estimate"]
print(
  paste(
    "The coefficient for danceability in the linear model is:",
    round(lm_danceability_coef, 3)
  )
)

## [1] "The coefficient for danceability in the linear model is: 2.946"

# Explanation:
# The coefficient represents the change in track popularity for a one-unit increase in danceability, holding other factors constant.
# A positive coefficient means higher danceability is associated with higher popularity, and vice versa.

# Re-display the logistic model's coefficient table and deviance statistics so
# the interpretation that follows can be read against them
summary(object = glm_model)

## 
## Call:
## glm(formula = popular ~ danceability + energy + speechiness + 
##     loudness + valence, family = binomial, data = spotify_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.536842   0.112939  22.462   <2e-16 ***
## danceability -0.045701   0.086453  -0.529    0.597    
## energy       -2.893195   0.092245 -31.364   <2e-16 ***
## speechiness  -0.260021   0.114363  -2.274    0.023 *  
## loudness      0.162582   0.005624  28.906   <2e-16 ***
## valence       0.626477   0.053618  11.684   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44779  on 32832  degrees of freedom
## Residual deviance: 43540  on 32827  degrees of freedom
## AIC: 43552
## 
## Number of Fisher Scoring iterations: 4

# Interpretation of 'danceability' coefficient in the GLM
# Look up the danceability estimate (on the log-odds scale) in the summary's
# coefficient table and report it rounded to three decimals.
glm_danceability_coef <- coef(summary(glm_model))["danceability", "Estimate"]
print(
  paste(
    "The coefficient for danceability in the GLM is:",
    round(glm_danceability_coef, 3)
  )
)

## [1] "The coefficient for danceability in the GLM is: -0.046"

# Explanation:
# For the logistic regression model, the coefficient for danceability represents the change in the log-odds of a song being "popular"
# for a one-unit increase in danceability, holding other factors constant.
# This coefficient can be transformed to an odds ratio by exponentiating it.

# Exponentiate the log-odds coefficient to express the danceability effect as
# an odds ratio, then report it rounded to three decimals.
odds_ratio_danceability <- exp(glm_danceability_coef)
print(
  paste(
    "The odds ratio for danceability in the GLM is:",
    round(odds_ratio_danceability, 3)
  )
)

## [1] "The odds ratio for danceability in the GLM is: 0.955"

# The odds ratio tells us how the odds of a song being popular change for a one-unit increase in danceability.
# An odds ratio greater than 1 indicates that higher danceability increases the odds of a song being popular;
# an odds ratio less than 1 (as here, 0.955) indicates that higher danceability decreases those odds.

Interpretation of Coefficients

Linear Model: The danceability coefficient indicates how much track popularity is expected to increase or decrease for each additional unit of danceability, with other variables held constant. A positive coefficient suggests that songs with higher danceability tend to be more popular.

Generalized Linear Model (Logistic Regression): The danceability coefficient represents the change in log-odds of a song being classified as “popular” for each unit increase in danceability. Exponentiating this coefficient provides the odds ratio, which tells us how the odds of a song being popular change as danceability increases. If the odds ratio is above 1, it implies that as danceability increases, the likelihood of a song being popular also increases.

Insights Gathered

Linear Model Interpretation: The coefficient for danceability in the linear model represents the estimated change in a song’s popularity score for each one-unit increase in danceability, assuming all other factors remain constant.

Insight: A positive coefficient suggests that songs with higher danceability tend to have higher popularity scores. This implies that listeners may prefer songs with a rhythm or beat that makes them more suitable for dancing, which contributes to their popularity.

Generalized Linear Model (GLM) Interpretation: In the logistic regression model, the danceability coefficient reflects the change in the log-odds of a song being “popular” (above a set popularity threshold) for each one-unit increase in danceability. By exponentiating this coefficient, we obtain the odds ratio, which quantifies how the likelihood of a song being popular changes with danceability.

Insight: An odds ratio for danceability greater than 1 would indicate that higher danceability is associated with increased odds of a song being popular. Here, however, the odds ratio is 0.955 and the coefficient is not statistically significant (p = 0.597), so the logistic model does not support the idea that a strong, danceable beat is a characteristic of hit songs once the other features are held constant.

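Significance of Findings

Practical Implications for the Music Industry: The evidence on danceability is mixed. The linear model estimates a small positive association with the popularity score (coefficient 2.946, p = 0.004), whereas the logistic model finds no significant effect on the odds of a song being popular (odds ratio 0.955, p = 0.597). For music producers and streaming platforms, this finding could inform decisions when creating or promoting songs, especially those targeted for broader appeal or specific playlists (e.g., dance or workout playlists), but danceability should not be treated as a reliable driver of popularity on its own.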

Consumer Preference: This result emphasizes a potential trend in listener preferences, where rhythmic and danceable songs are more likely to be well-received. Such insights help reveal the musical qualities that resonate with audiences.

Further Questions for Investigation

Thresholds of Danceability: Is there a particular range or threshold of danceability that optimally predicts popularity? Exploring non-linear effects or segmenting danceability into low, medium, and high categories could reveal more nuanced patterns.
Interaction Effects: Does danceability’s influence on popularity change with other factors, like energy or valence? Interaction terms could help determine whether certain combinations of attributes enhance or reduce a song’s appeal.
Genre-Specific Effects: Does the effect of danceability vary by genre? Examining genre-specific models may reveal that danceability is more crucial for certain genres (e.g., pop or electronic) compared to others.
Temporal Trends: Are preferences for danceability increasing or decreasing over time? Investigating if danceability has a stronger effect on popularity in recent years could highlight evolving music trends.

1. Odds Ratio for Danceability:

Current Explanation: “If the odds ratio is greater than 1 for danceability, this means that danceability contributes positively to a song’s popularity, and music producers could use this information to inform their decision.”

Revised Explanation: “The calculated odds ratio for danceability is 0.955, which is slightly less than 1. This suggests that, holding other variables constant, a one-unit increase in danceability is associated with a slight decrease (about 4.5%) in the odds of a song being popular. However, this coefficient is not statistically significant (p = 0.597), so there is no evidence that danceability affects the odds of a song being popular in this dataset.”

2. Interpretation of Coefficients:

Ensure that each coefficient is interpreted in the context of your model:

Energy: “The coefficient for energy is -2.893, indicating that for each unit increase in energy, the log-odds of a song being popular decrease by 2.893, assuming other variables remain constant.”

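Tempo: “The coefficient for tempo is -0.02, suggesting that higher tempo is associated with a decrease in the log-odds of popularity.” (Note: tempo is not a predictor in the models fitted above, so this statement is an illustrative template only; it applies only if tempo is added to the model.)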

3. Diagnostic Plots:

Provide specific observations from your diagnostic plots:

Residuals vs. Fitted Plot: “The plot shows a random scatter around the horizontal axis, indicating that the assumption of linearity is reasonably met.”

Normal Q-Q Plot: “The residuals closely follow the reference line, suggesting that the residuals are approximately normally distributed.”

Scale-Location Plot: “The points are randomly scattered without a clear pattern, indicating homoscedasticity.”

Residuals vs. Leverage Plot: “No points with high leverage and large residuals are observed, suggesting the absence of influential outliers.”

Data Dive — GLMs (Part 2)

2024-11-12

Insights Gathered

Significance :

Further Questions for Investigation

Explanation of Code

Insights & Significance

Further Questions for Investigation

Interpretation of Coefficients

Insights Gathered

Further Questions for Investigation

1. Odds Ratio for Danceability:

2. Interpretation of Coefficients:

3. Diagnostic Plots: