#Load packages and import data
library(ggplot2)
library(dplyr)
library(skimr)
# Read the song data from CSV, then print a per-column overview of it
mar2024 <- read.csv("psichiR_mar2024.csv")
skim(mar2024)
Name | mar2024 |
Number of rows | 50 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 4 |
logical | 1 |
numeric | 14 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
artist_name | 0 | 1 | 3 | 18 | 0 | 41 | 0 |
track_name | 0 | 1 | 3 | 56 | 0 | 50 | 0 |
album_release_date | 0 | 1 | 8 | 10 | 0 | 49 | 0 |
main_genre | 0 | 1 | 3 | 25 | 0 | 17 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
is_explicit | 0 | 1 | 0.44 | FAL: 28, TRU: 22 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
danceability | 0 | 1 | 0.66 | 0.12 | 0.44 | 0.56 | 0.65 | 0.78 | 0.91 | ▆▇▇▆▃ |
valence | 0 | 1 | 0.51 | 0.21 | 0.13 | 0.34 | 0.51 | 0.65 | 0.89 | ▅▇▇▆▅ |
energy | 0 | 1 | 0.66 | 0.12 | 0.42 | 0.56 | 0.68 | 0.74 | 0.96 | ▃▆▇▅▁ |
loudness | 0 | 1 | -6.01 | 1.78 | -10.61 | -7.13 | -5.64 | -4.88 | -2.81 | ▂▂▃▇▃ |
acousticness | 0 | 1 | 0.25 | 0.22 | 0.00 | 0.09 | 0.16 | 0.41 | 0.83 | ▇▃▃▁▁ |
instrumentalness | 0 | 1 | 0.02 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 0.63 | ▇▁▁▁▁ |
liveness | 0 | 1 | 0.16 | 0.09 | 0.02 | 0.09 | 0.12 | 0.23 | 0.37 | ▂▇▁▂▂ |
speechiness | 0 | 1 | 0.08 | 0.08 | 0.03 | 0.04 | 0.05 | 0.08 | 0.33 | ▇▁▁▁▁ |
key | 0 | 1 | 5.00 | 3.52 | 0.00 | 2.00 | 5.00 | 7.00 | 11.00 | ▇▂▅▂▅ |
tempo | 0 | 1 | 124.07 | 31.40 | 67.03 | 97.96 | 124.98 | 138.06 | 203.76 | ▃▅▇▂▁ |
mode | 0 | 1 | 0.56 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
duration_ms | 0 | 1 | 200458.10 | 32580.81 | 131013.00 | 176936.75 | 199740.00 | 228219.75 | 272373.00 | ▃▇▇▇▂ |
time_signature | 0 | 1 | 3.90 | 0.30 | 3.00 | 4.00 | 4.00 | 4.00 | 4.00 | ▁▁▁▁▇ |
popularity | 0 | 1 | 88.10 | 5.69 | 72.00 | 85.00 | 89.00 | 92.75 | 99.00 | ▁▁▇▇▃ |
-Let’s get familiar with our data. Create a table that shows how many times each genre is represented in the Top 50 most popular songs on Spotify. What is the most popular genre?
# Tally the songs in each genre and list the most frequent genre first
mar2024 %>%
  group_by(main_genre) %>%
  tally(name = "Count") %>%
  arrange(desc(Count)) # Pop
-Which artist, if any, has the greatest number of Top 50 most popular songs on Spotify?
# Tally the songs per artist and list the artist with the most songs first
mar2024 %>%
  group_by(artist_name) %>%
  tally(name = "Count") %>%
  arrange(desc(Count)) # The Weeknd
-Provide the mean, standard deviation, median, and range of values for ‘danceability’
# Descriptive statistics for danceability.
# summary() drops missing values itself and ignores an na.rm argument,
# so none is passed; the other calls spell out na.rm = TRUE (T can be
# reassigned, TRUE cannot).
summary(mar2024$danceability)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4450 0.5595 0.6475 0.6626 0.7765 0.9110
mean(mar2024$danceability, na.rm = TRUE)
## [1] 0.66258
sd(mar2024$danceability, na.rm = TRUE)
## [1] 0.1230895
median(mar2024$danceability, na.rm = TRUE)
## [1] 0.6475
range(mar2024$danceability, na.rm = TRUE)
## [1] 0.445 0.911
#Mean: 0.66, standard deviation: 0.12, median: 0.65, range: 0.445 - 0.911
-Provide the mean, standard deviation, median, and range of values for ‘valence’
# Descriptive statistics for valence.
# summary() drops missing values itself and ignores an na.rm argument,
# so none is passed; the other calls spell out na.rm = TRUE (T can be
# reassigned, TRUE cannot).
summary(mar2024$valence)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1310 0.3440 0.5070 0.5107 0.6545 0.8930
mean(mar2024$valence, na.rm = TRUE)
## [1] 0.51074
sd(mar2024$valence, na.rm = TRUE)
## [1] 0.2123458
median(mar2024$valence, na.rm = TRUE)
## [1] 0.507
range(mar2024$valence, na.rm = TRUE)
## [1] 0.131 0.893
#Mean: 0.51, standard deviation: 0.21, median: 0.51, range: 0.13 - 0.89
-Provide the mean, standard deviation, median, and range of values for ‘energy’
# Descriptive statistics for energy.
# summary() drops missing values itself and ignores an na.rm argument,
# so none is passed; the other calls spell out na.rm = TRUE (T can be
# reassigned, TRUE cannot).
summary(mar2024$energy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4170 0.5567 0.6780 0.6599 0.7365 0.9650
mean(mar2024$energy, na.rm = TRUE)
## [1] 0.65988
sd(mar2024$energy, na.rm = TRUE)
## [1] 0.1214992
median(mar2024$energy, na.rm = TRUE)
## [1] 0.678
range(mar2024$energy, na.rm = TRUE)
## [1] 0.417 0.965
#Mean: 0.66, standard deviation: 0.12, median: 0.68, range: 0.42 - 0.97
-Does the ‘tempo’ variable meet the assumption of normality?
# Shapiro-Wilk test of normality for tempo
shapiro.test(mar2024$tempo)
##
## Shapiro-Wilk normality test
##
## data: mar2024$tempo
## W = 0.96591, p-value = 0.157
# Density-scaled histogram with a kernel density curve drawn on top.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
hist(mar2024$tempo, freq = FALSE, breaks = 15)
# Yes, 'tempo' meets normality: p = 0.157 is above 0.05, so it is not rejected.
lines(density(mar2024$tempo), col = "red", lwd = 2)
# ggplot2 version of the tempo histogram: bars on the density scale with a
# kernel density curve overlaid
mar2024 %>%
  ggplot(aes(x = tempo)) +
  # after_stat(density) replaces the deprecated ..density.. notation
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 15, colour = "black", fill = "lightgray"
  ) +
  # linewidth is the current argument name for line thickness
  geom_density(colour = "red", linewidth = 2) +
  theme_classic() +
  labs(title = "Histogram of mar2024$tempo") +
  theme(plot.title = element_text(hjust = .5))
-Produce a visual that will show if the ‘loudness’ variable contains any outliers.
# A boxplot is the direct outlier check: any point drawn beyond the whiskers
# lies more than 1.5 times the interquartile range away from the box.
boxplot(mar2024$loudness, horizontal = TRUE, xlab = "loudness")
# Index plot and histogram kept as supporting views of the same values
plot(mar2024$loudness)
hist(mar2024$loudness)
-Is there a significant difference in the ratings of ‘danceability’ for explicit songs, compared to songs that are not explicit? Note all key statistics.
# Split the songs into an explicit subset and a non-explicit subset;
# is_explicit is a logical column, so it can be used as the filter directly
explicit_dance <- filter(mar2024, is_explicit)
explicit.not_dance <- filter(mar2024, !is_explicit)
# Shapiro-Wilk normality check of danceability within each group
shapiro.test(explicit.not_dance$danceability)
##
## Shapiro-Wilk normality test
##
## data: explicit.not_dance$danceability
## W = 0.95148, p-value = 0.2158
shapiro.test(explicit_dance$danceability) #p-values are above 0.05 in both groups (0.2158 and 0.5672), so normality is not rejected and can be assumed
##
## Shapiro-Wilk normality test
##
## data: explicit_dance$danceability
## W = 0.96369, p-value = 0.5672
# Density-scaled histogram of danceability for each group, with a kernel
# density curve drawn on top. FALSE is spelled out because F is an ordinary
# variable that can be reassigned.
hist(explicit.not_dance$danceability, freq = FALSE)
lines(density(explicit.not_dance$danceability))
hist(explicit_dance$danceability, freq = FALSE)
lines(density(explicit_dance$danceability))
# Welch two-sample t-test (t.test's default, no equal-variance assumption)
# comparing danceability of non-explicit (x) and explicit (y) songs.
# Key statistics: t = -1.0673, df = 41.836, p = 0.292;
# 95% CI of the mean difference [-0.110, 0.034];
# group means 0.646 (non-explicit) vs 0.684 (explicit).
t.test(explicit.not_dance$danceability,explicit_dance$danceability) #No, there is no significant difference between non-explicit and explicit for 'danceability.'
##
## Welch Two Sample t-test
##
## data: explicit.not_dance$danceability and explicit_dance$danceability
## t = -1.0673, df = 41.836, p-value = 0.292
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.10988044 0.03386745
## sample estimates:
## mean of x mean of y
## 0.6458571 0.6838636
-Create a regression model to figure out which of the following is the strongest predictor of popularity: danceability, valence, energy, loudness, acousticness, instrumentalness, liveness, speechiness, or tempo. Please note all key statistics for each variable.
# Multiple linear regression of popularity on the nine audio features
reg_model <- lm(
  popularity ~ danceability + valence + energy + loudness + acousticness +
    instrumentalness + liveness + speechiness + tempo,
  data = mar2024
)
# Standard lm diagnostic plots
plot(reg_model)
# p-values: danceability 0.0761, valence 0.4283, energy 0.5354,
# loudness 0.5659, acousticness 0.1489, instrumentalness 0.4737,
# liveness 0.4424, speechiness 0.4818, tempo 0.8153.
# No predictor is significant at the 0.05 level; danceability comes closest
# (estimate = -16.90, t = -1.821). Overall model: F(9, 40) = 0.8371,
# p = 0.5867, R-squared = 0.1585 (adjusted -0.03084).
summary(reg_model)
##
## Call:
## lm(formula = popularity ~ danceability + valence + energy + loudness +
## acousticness + instrumentalness + liveness + speechiness +
## tempo, data = mar2024)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4617 -2.2794 0.9388 3.8885 7.1718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105.55792 12.54467 8.415 0.000000000219 ***
## danceability -16.89618 9.27855 -1.821 0.0761 .
## valence 4.42840 5.53381 0.800 0.4283
## energy -7.41451 11.85986 -0.625 0.5354
## loudness 0.39436 0.68123 0.579 0.5659
## acousticness -6.24072 4.23989 -1.472 0.1489
## instrumentalness -6.98729 9.65923 -0.723 0.4737
## liveness -7.39235 9.52787 -0.776 0.4424
## speechiness 9.70106 13.66191 0.710 0.4818
## tempo 0.00681 0.02896 0.235 0.8153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.777 on 40 degrees of freedom
## Multiple R-squared: 0.1585, Adjusted R-squared: -0.03084
## F-statistic: 0.8371 on 9 and 40 DF, p-value: 0.5867
library(purrr)
library(ggplot2)
# Names of the predictor columns, each to be plotted against popularity
plot_list <- c(
  "danceability", "valence", "energy", "loudness", "acousticness",
  "instrumentalness", "liveness", "speechiness", "tempo"
)
# Build a scatter plot of popularity against one predictor column.
#
# some_x: name of the column to put on the x axis, as a single string.
# data:   data frame containing that column and `popularity`; defaults to
#         mar2024, so existing one-argument calls behave as before.
# Returns the ggplot object: points coloured by popularity plus a fitted
# straight line without a confidence band.
# Stops with an error if some_x is not a single string naming a column of data.
plot_map <- function(some_x, data = mar2024) {
  if (!is.character(some_x) || length(some_x) != 1L) {
    stop("some_x must be a single column name given as a string", call. = FALSE)
  }
  if (!some_x %in% names(data)) {
    stop("some_x must name a column of data: ", some_x, call. = FALSE)
  }
  # .data[[ ]] looks the column up by its string name inside aes()
  ggplot(data, aes(x = .data[[some_x]], y = popularity)) +
    geom_point(aes(color = popularity), alpha = 0.5) +
    # Stating the formula avoids geom_smooth's "using formula" message
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste0("Popularity and ", some_x), x = some_x, y = "Popularity") +
    theme_bw() +
    theme(plot.title = element_text(hjust = .5))
}
# Apply plot_map to every name in plot_list; the resulting list of plots
# is printed one element at a time
map(.x = plot_list, .f = plot_map)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]