# Load required libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(dplyr)
# Load the Spotify 2023 track data from the working directory.
spotify_data <- read.csv("spotify-2023.csv")

# Inspect the values of 'mode' before recoding it.
n_distinct(spotify_data$mode)
## [1] 2
unique(spotify_data$mode)
## [1] "Major" "Minor"

# Recode 'mode' into a binary outcome: 0 = Major, 1 = Minor.
# ifelse() alone would code every non-"Major" string (typo, blank, different
# capitalisation) as 1, so stop early if anything unexpected (or NA) is present.
stopifnot(all(spotify_data$mode %in% c("Major", "Minor")))
spotify_data$mode_binary <- ifelse(spotify_data$mode == "Major", 0, 1)
table(spotify_data$mode_binary)
##
## 0 1
## 550 403
# Build a logistic regression model for 'mode' (0 = Major, 1 = Minor)
# Select explanatory variables
explanatory_vars <- c("streams", "valence_.", "danceability_.", "energy_.")
# Build the formula from explanatory_vars so the predictor list lives in one
# place: mode_binary ~ streams + valence_. + danceability_. + energy_.
model_formula <- reformulate(explanatory_vars, response = "mode_binary")
# Build logistic regression model. bquote() splices the formula itself into
# the call, so the "Call:" line printed by summary() still shows the full
# formula rather than the name of the variable that holds it.
logit_model <- eval(bquote(
  glm(.(model_formula), family = binomial, data = spotify_data)
))
# Summary of the model
summary(logit_model)
##
## Call:
## glm(formula = mode_binary ~ streams + valence_. + danceability_. +
## energy_., family = binomial, data = spotify_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.744e+00 3.986e-01 -4.377 1.21e-05 ***
## streams -1.016e-10 1.194e-10 -0.851 0.395031
## valence_. -4.412e-06 3.242e-03 -0.001 0.998914
## danceability_. 1.851e-02 5.089e-03 3.638 0.000275 ***
## energy_. 3.700e-03 4.344e-03 0.852 0.394297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1298.4 on 952 degrees of freedom
## Residual deviance: 1278.3 on 948 degrees of freedom
## AIC: 1288.3
##
## Number of Fisher Scoring iterations: 4
Streams: This coefficient is very small and statistically insignificant (p ≈ 0.40), so there is no evidence that the number of streams helps predict whether a song is in Minor rather than Major mode.
Valence_ and Energy_: These coefficients are also statistically insignificant (p ≈ 1.00 and p ≈ 0.39, respectively), suggesting these features don’t strongly influence the model’s prediction of a song’s mode.
Danceability_: This coefficient is positive (≈ 0.0185) and statistically significant (p < 0.001). Because the outcome is coded 0 = Major and 1 = Minor, a higher danceability score increases the predicted probability that a song is in Minor mode.
# 95% confidence interval (profile-based, per the message below) for the
# 'danceability_.' coefficient of the logistic model
ci_danceability <- confint(
  logit_model,
  parm = "danceability_.",
  level = 0.95
)
## Waiting for profiling to be done...
print(ci_danceability)
## 2.5 % 97.5 %
## 0.008605308 0.028572961
The confidence interval for the coefficient of the ‘danceability_’ variable is as follows:
Lower bound (2.5%): 0.008605308
Upper bound (97.5%): 0.028572961
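This indicates that we are 95% confident that the true coefficient of ‘danceability_’ lies within the range of approximately 0.0086 to 0.0286. In other words, holding the other predictors fixed, each one-unit (one percentage point) increase in danceability is expected to change the log odds of a song being in ‘Minor’ mode by an amount within this interval. Because the interval excludes zero, the effect is statistically significant at the 5% level. On the odds scale, exponentiating the bounds gives an odds ratio of roughly 1.009 to 1.029, i.e. about a 0.9% to 2.9% increase in the odds of ‘Minor’ mode per additional point of danceability.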