# Load the Spotify track export into a data frame.
# NOTE(review): the absolute path is specific to one machine; adjust it before
# running elsewhere.
data <- read.csv(
  file = "/Users/yashuvaishu/Downloads/Spotify.csv",
  header = TRUE
)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Show each column's type together with a preview of its first values.
str(object = data)
## 'data.frame': 8511 obs. of 13 variables:
## $ trackName : chr "A Better Place" "A Dangerous Thing" "A Different Way (with Lauv)" "A Drug From God" ...
## $ artistName : chr "Project AER" "AURORA" "DJ Snake" "Chris Lake" ...
## $ msPlayed : int 119999 1945555 66060 192455 97568 99339 6158627 40539 269453 60453 ...
## $ genre : chr "ambient guitar" "art pop" "edm" "bass house" ...
## $ danceability: num 0.496 0.541 0.784 0.714 0.0828 0.598 0.792 0.486 0.265 0.253 ...
## $ energy : num 0.255 0.556 0.757 0.883 0.012 0.295 0.484 0.881 0.312 0.139 ...
## $ key : int 9 11 8 9 9 1 4 2 7 6 ...
## $ loudness : num -17.98 -6.15 -3.91 -4.43 -36.05 ...
## $ speechiness : num 0.0283 0.0356 0.0384 0.0625 0.0451 0.0276 0.192 0.0474 0.0569 0.0414 ...
## $ valence : num 0.0809 0.106 0.587 0.819 0.0578 0.314 0.245 0.667 0.0998 0.102 ...
## $ tempo : num 142 106 105 126 170 ...
## $ id : chr "2oC9Ah7npALCCPW5DC1gob" "0PDlmmYkuQCUAFhMXvtlsU" "1YMBg7rOjxzbya0fPOYfNX" "4skbQNtyjy8A7mo8oqe2oD" ...
## $ duration_ms : int 120000 215573 198286 192455 97667 225680 224528 480707 269453 60453 ...
# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for character columns.
summary(object = data)
## trackName artistName msPlayed genre
## Length:8511 Length:8511 Min. : 0 Length:8511
## Class :character Class :character 1st Qu.: 139977 Class :character
## Mode :character Mode :character Median : 269850 Mode :character
## Mean : 1539795
## 3rd Qu.: 1211910
## Max. :158367130
## danceability energy key loudness
## Min. :0.0000 Min. :0.00108 Min. : 0.000 Min. :-42.044
## 1st Qu.:0.5070 1st Qu.:0.40700 1st Qu.: 2.000 1st Qu.:-10.016
## Median :0.6220 Median :0.59200 Median : 5.000 Median : -7.132
## Mean :0.6016 Mean :0.56681 Mean : 5.243 Mean : -8.580
## 3rd Qu.:0.7140 3rd Qu.:0.75400 3rd Qu.: 8.000 3rd Qu.: -5.309
## Max. :0.9760 Max. :0.99900 Max. :11.000 Max. : 3.010
## speechiness valence tempo id
## Min. :0.00000 Min. :0.0000 Min. : 0.00 Length:8511
## 1st Qu.:0.03610 1st Qu.:0.2380 1st Qu.: 97.18 Class :character
## Median :0.04790 Median :0.4100 Median :118.94 Mode :character
## Mean :0.07833 Mean :0.4353 Mean :119.10
## 3rd Qu.:0.08190 3rd Qu.:0.6180 3rd Qu.:139.32
## Max. :0.94100 Max. :0.9860 Max. :236.20
## duration_ms
## Min. : 10027
## 1st Qu.: 163173
## Median : 195989
## Mean : 203951
## 3rd Qu.: 231378
## Max. :1847210
# Set 1: danceability paired with valence
set1 <- select(data, danceability, valence)

# Scatter plot of valence against danceability
ggplot(data = set1, mapping = aes(x = danceability, y = valence)) +
  geom_point(alpha = 0.9, color = "#505168") +
  labs(
    title = "Relationship between Danceability and Valence",
    x = "Danceability",
    y = "Valence"
  )

# Correlation coefficient between the two columns (cor() default method)
cor(x = set1$danceability, y = set1$valence)
## [1] 0.4881956
# 95% confidence interval for the mean valence (one-sample t-test);
# the outer parentheses print the stored interval.
(valence_ci1_1 <- t.test(set1$valence)[["conf.int"]])
## [1] 0.4301183 0.4404433
## attr(,"conf.level")
## [1] 0.95
With 95% confidence, the mean valence lies between about 0.430 and 0.440. Note that this interval describes the mean, not the range in which most individual values fall.
# 95% confidence interval for the mean danceability (one-sample t-test);
# the outer parentheses print the stored interval.
(danceability1_1 <- t.test(set1$danceability)[["conf.int"]])
## [1] 0.5982702 0.6050095
## attr(,"conf.level")
## [1] 0.95
With 95% confidence, the mean danceability lies between about 0.598 and 0.605. Note that this interval describes the mean, not the range in which most individual values fall.
## Identifying Outliers
# Column under inspection
variable <- data[["danceability"]]

# Box plot gives a visual check of the spread and of extreme points
boxplot(variable, main = "Boxplot of danceability")

# First and third quartiles, and the interquartile range between them
q1 <- quantile(variable, probs = 0.25)
q3 <- quantile(variable, probs = 0.75)
iqr <- q3 - q1

# Tukey's fences: 1.5 IQRs below Q1 and above Q3
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Anything not lying within [lower, upper] is treated as an outlier
outliers <- variable[!(variable >= lower & variable <= upper)]

# Report the flagged values on a single line
cat("Outliers:", outliers, "\n")
## Outliers: 0.0828 0.171 0.157 0.188 0.115 0.0681 0.176 0.102 0.157 0.175 0 0.169 0.164 0.0901 0.148 0.15 0.174 0.112 0.128 0.087 0.0641 0.18 0.182 0 0.175 0.135 0.112 0.0802 0.163 0.163 0.112 0.102 0.0685 0.0972 0.187 0.148 0.0744 0.104 0.086 0.0631 0.194 0.162 0.0835 0.19 0.111 0.16 0.179 0.182 0.0717 0.196 0.162 0.0848 0.178 0.0833 0.0793 0.181 0.159 0.15 0.173 0.124 0.0863 0.0994 0.0769 0.124 0.156 0.136 0.156 0.14 0.159 0.157 0.145 0.109 0.149 0.14 0.187 0.186 0.0814 0.0828 0.171 0.157 0.188 0.115 0.0681 0.176 0.102 0.157 0.175 0 0.169 0.164 0.0901 0.148 0.15 0.174 0.112 0.128 0.087 0.0641 0.18 0.182 0 0.175 0.135 0.112 0.0802 0.163 0.163 0.112 0.102 0.0685 0.0972 0.187 0.148 0.0744 0.104 0.086 0.0631 0.194 0.162 0.0835 0.19 0.111 0.16 0.179 0.182 0.0717 0.196 0.162 0.0848 0.178 0.0833 0.0793 0.181 0.159 0.15 0.173 0.124 0.0863 0.0994 0.0769 0.124 0.156 0.136 0.156 0.14 0.159 0.157 0.145 0.109 0.149 0.14 0.187 0.186
Based on the scatter plot and the correlation coefficient (r ≈ 0.49), there appears to be a moderate positive correlation between danceability and valence, suggesting that songs that are more danceable tend to have a higher valence.
# Set 2: Loudness and Energy
set2 <- data %>%
  select(loudness, energy)

# Visualization of Set 2: scatter plot with box plots of energy overlaid.
# Loudness is continuous, so it is binned into 5-unit-wide groups with
# cut_width(). Without an explicit group, geom_boxplot() puts every point in
# a single box and warns "Continuous x aesthetic".
ggplot(set2, aes(x = loudness, y = energy)) +
  geom_point(alpha = 0.9, color = "#DCC48E") +
  geom_boxplot(
    aes(group = cut_width(loudness, 5)),
    fill = "transparent",
    outlier.shape = NA
  ) +
  labs(
    title = "Relationship between Loudness and Energy",
    x = "Loudness",
    y = "Energy"
  )

# Correlation coefficient between loudness and energy
cor(set2$loudness, set2$energy)
## [1] 0.7938352
# 95% confidence interval for the mean loudness (one-sample t-test);
# the outer parentheses print the stored interval.
(loudness1_1 <- t.test(set2$loudness)[["conf.int"]])
## [1] -8.693652 -8.467097
## attr(,"conf.level")
## [1] 0.95
With 95% confidence, the mean loudness lies between about -8.69 and -8.47. Note that this interval describes the mean, not the range in which most individual values fall.
# 95% confidence interval for the mean energy (one-sample t-test);
# the outer parentheses print the stored interval.
(energy1_1 <- t.test(set2$energy)[["conf.int"]])
## [1] 0.5616746 0.5719487
## attr(,"conf.level")
## [1] 0.95
With 95% confidence, the mean energy lies between about 0.562 and 0.572. Note that this interval describes the mean, not the range in which most individual values fall.
## Identifying Outliers
# Column under inspection
variable <- data[["loudness"]]

# Box plot gives a visual check of the spread and of extreme points
boxplot(variable, main = "Boxplot of loudness")

# First and third quartiles, and the interquartile range between them
q1 <- quantile(variable, probs = 0.25)
q3 <- quantile(variable, probs = 0.75)
iqr <- q3 - q1

# Tukey's fences: 1.5 IQRs below Q1 and above Q3
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Anything not lying within [lower, upper] is treated as an outlier
outliers <- variable[!(variable >= lower & variable <= upper)]
cat("Outliers:", outliers, "\n")
## Outliers: -17.984 -36.045 -25.382 -20.274 -22.15 -38.793 -18.064 -22.456 -20.089 -18.99 -25.501 -24.299 -20.574 -23.789 -19.535 -36.657 -18.412 -19.93 -17.282 -34.177 -19.074 -23.594 -17.413 -20.179 -24.231 -20.233 -21.081 -21.105 -18.214 -22.515 -28.613 -19.451 -17.328 -19.623 -17.212 -17.805 -18.153 -20.541 -22.331 -22.654 -22.699 -18.054 -20.989 -23.325 -29.287 -19.153 -23.934 -25.642 -29.17 -19.422 -24.402 -28.785 -25.791 -18.225 -19.198 -17.792 -25.243 -25.243 -18.221 -18.717 -17.67 -17.925 -18.366 -30.099 -33.292 -26.941 -21.835 -17.547 -21.356 -17.139 -22.779 -17.214 -19.706 -20.499 -17.576 -21.771 -29.724 -29.724 -21.538 -36.759 -18.505 -24.232 -17.331 -19.976 -19.345 -20.274 -22.383 -19.072 -27.788 -24.608 -18.163 -19.794 -17.247 -20.284 -17.864 -23.826 -22.457 -22.396 -21.348 -30.709 -37.841 -23.115 -20.87 -17.582 -35.251 -24.331 -22.628 -23.025 -22.077 -19.18 -24.711 -18.558 -17.943 -32.863 -31.042 -17.264 -24.887 -21.809 -26.011 -17.298 -19.316 -20.354 -30.769 -19.007 -17.265 -17.455 -27.749 -26.328 -23.049 -27.554 -23.581 -20.382 -30.79 -22.412 -29.715 -26.257 -23.455 -18.698 -17.165 -17.349 -18.357 -17.927 -19.783 -37.156 -33.03 -29.956 -18.433 -20.508 -31.092 -27.213 -18.585 -24.41 -27.167 -18.326 -18.624 -18.015 -20.261 -17.374 -24.894 -28.516 -18.411 -23.411 -28.587 -38.222 -30.054 -19.585 -19.673 -20.05 -18.914 -23.921 -23.796 -28.115 -29.063 -23.032 -22.898 -27.712 -22.002 -19.067 -17.944 -20.191 -28.277 -29.08 -42.044 -18.226 -31.121 -28.246 -22.32 -27.72 -30.407 -20.231 -24.698 -20.265 -25.115 -21.958 -18.859 -31.414 -28.05 -21.845 -18.599 -32.335 -29.707 -31.258 -21.859 -17.446 -21.77 -23.197 -17.216 -18.146 -17.569 -24.581 -18.781 -24.884 -32.122 -17.937 -17.487 -23.255 -26.494 -18.372 -20.472 -29.388 -29.967 -20.554 -23.269 -23.849 -23.604 -23.84 -24.684 -36.281 -36.284 -30.777 -24.708 -28.659 -27.928 -23.663 -17.331 -28.892 -27.685 -33.153 -24.718 -18.344 -17.224 -18.942 -29.631 -28.922 -19.513 -24.173 -18.041 -17.86 -24.717 -22.841 
-23.052 -21.428 -30.117 -17.501 -25.997 -30.142 -25.164 -22.644 -39.219 -19.668 -24.22 -18.756 -20.073 -25.103 -19.033 -30.932 -20.508 -29.489 -17.491 -18.531 -18.037 -28.491 -25.418 -21.585 -28.662 -27.988 -17.208 -19.304 -19.155 -19.552 -21.071 -20.969 -19.204 -19.981 -19.165 -20.872 -27.712 3.01 -17.153 -37.242 -34.559 -20.459 -26.509 -28.249 -19.031 -23.512 -28.927 -17.651 -19.499 -24.782 -17.719 -19.47 -33.085 -21.772 -17.984 -36.045 -25.382 -20.274 -22.15 -38.793 -18.064 -22.456 -20.089 -18.99 -25.501 -24.299 -20.574 -23.789 -19.535 -36.657 -18.412 -19.93 -17.282 -34.177 -19.074 -23.594 -17.413 -20.179 -24.231 -20.233 -21.081 -21.105 -18.214 -22.515 -28.613 -19.451 -17.328 -19.623 -17.212 -17.805 -18.153 -20.541 -22.331 -22.654 -22.699 -18.054 -20.989 -23.325 -29.287 -19.153 -23.934 -25.642 -29.17 -19.422 -24.402 -28.785 -25.791 -18.225 -19.198 -17.792 -25.243 -25.243 -18.221 -18.717 -17.67 -17.925 -18.366 -30.099 -33.292 -26.941 -21.835 -17.547 -21.356 -17.139 -22.779 -17.214 -19.706 -20.499 -17.576 -21.771 -29.724 -29.724 -21.538 -36.759 -18.505 -24.232 -17.331 -19.976 -19.345 -20.274 -22.383 -19.072 -27.788 -24.608 -18.163 -19.794 -17.247 -20.284 -17.864 -23.826 -22.457 -22.396 -21.348 -30.709 -37.841 -23.115 -20.87 -17.582 -35.251 -24.331 -22.628 -23.025 -22.077 -19.18 -24.711 -18.558 -17.943 -32.863 -31.042 -17.264 -24.887 -21.809 -26.011 -17.298 -19.316 -20.354 -30.769 -19.007 -17.265 -17.455 -27.749 -26.328 -23.049 -27.554 -23.581 -20.382 -30.79 -22.412 -29.715 -26.257 -23.455 -18.698 -17.165 -17.349 -18.357 -17.927 -19.783 -37.156 -33.03 -29.956 -18.433 -20.508 -31.092 -27.213 -18.585 -24.41 -27.167 -18.326 -18.624 -18.015 -20.261 -17.374 -24.894 -28.516 -18.411 -23.411 -28.587 -38.222 -30.054 -19.585 -19.673 -20.05 -18.914 -23.921 -23.796 -28.115 -29.063 -23.032 -22.898 -27.712 -22.002 -19.067 -17.944 -20.191 -28.277 -29.08 -42.044 -18.226 -31.121 -28.246 -22.32 -27.72 -30.407 -20.231 -24.698 -20.265 -25.115 -21.958 -18.859 -31.414 -28.05 -21.845 
-18.599 -32.335 -29.707 -31.258 -21.859 -17.446 -21.77 -23.197 -17.216 -18.146 -17.569 -24.581 -18.781 -24.884 -32.122 -17.937 -17.487 -23.255 -26.494 -18.372 -20.472 -29.388 -29.967 -20.554 -23.269 -23.849 -23.604 -23.84 -24.684 -36.281 -36.284 -30.777 -24.708 -28.659 -27.928 -23.663 -17.331 -28.892 -27.685 -33.153 -24.718 -18.344 -17.224 -18.942 -29.631 -28.922 -19.513 -24.173 -18.041 -17.86 -24.717 -22.841 -23.052 -21.428 -30.117 -17.501 -25.997 -30.142 -25.164 -22.644 -39.219 -19.668 -24.22 -18.756 -20.073 -25.103 -19.033 -30.932 -20.508 -29.489 -17.491 -18.531 -18.037 -28.491 -25.418 -21.585 -28.662 -27.988 -17.208 -19.304 -19.155 -19.552 -21.071 -20.969 -19.204 -19.981 -19.165 -20.872 -27.712 3.01 -17.153 -37.242 -34.559 -20.459 -26.509 -28.249 -19.031 -23.512 -28.927 -17.651 -19.499 -24.782
The scatter plot shows a strong positive correlation between loudness and energy (r ≈ 0.79), indicating that louder songs tend to have higher energy.
# Set 3: track duration paired with loudness
set3 <- select(data, duration_ms, loudness)

# Scatter plot of loudness against duration
ggplot(data = set3, mapping = aes(x = duration_ms, y = loudness)) +
  geom_point() +
  labs(
    title = "Relationship between Duration and Loudness",
    x = "Duration",
    y = "Loudness"
  )

# Correlation coefficient between the two columns (cor() default method)
cor(x = set3$duration_ms, y = set3$loudness)
## [1] 0.1407408
# Standardise both columns (centre on the mean, divide by the standard
# deviation) so they can be compared against one z-score cut-off
z_scores_x <- scale(data[["duration_ms"]])
z_scores_y <- scale(data[["loudness"]])

# A row is flagged unless BOTH of its z-scores are within 4 standard
# deviations of the mean; which() returns the flagged row positions
outliers <- which(!(abs(z_scores_x) <= 4 & abs(z_scores_y) <= 4))
print(outliers)
## [1] 5 61 181 185 214 321 896 955 959 1090 1107 1284 1291 1321 1400
## [16] 1401 1518 1576 1594 1812 1843 1844 1944 2030 2246 2247 2558 2649 2697 2744
## [31] 2799 2810 2851 2861 2870 2977 3086 3173 3183 3204 3210 3268 3486 3538 3590
## [46] 3611 3703 3827 4011 4012 4181 4240 4294 4350 4470 4474 4503 4610 5185 5244
## [61] 5248 5379 5396 5573 5581 5611 5690 5691 5808 5866 5884 6102 6133 6134 6234
## [76] 6320 6536 6537 6848 6939 6987 7034 7089 7100 7141 7151 7160 7267 7376 7463
## [91] 7473 7494 7500 7558 7776 7828 7880 7901 7993 8117 8301 8302 8471
# 95% confidence interval for the mean track duration in milliseconds
# (one-sample t-test); the outer parentheses print the stored interval.
(duration_ms1_1 <- t.test(set3$duration_ms)[["conf.int"]])
## [1] 202389.0 205512.1
## attr(,"conf.level")
## [1] 0.95
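With 95% confidence, the mean track duration lies between about 202,389 ms and 205,512 ms. Note that this interval describes the mean, not the range in which most individual values fall.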
# 95% confidence interval for the mean loudness, taken from set3
# (one-sample t-test); the outer parentheses print the stored interval.
(loudness1_3 <- t.test(set3$loudness)[["conf.int"]])
## [1] -8.693652 -8.467097
## attr(,"conf.level")
## [1] 0.95
With 95% confidence, the mean loudness lies between about -8.69 and -8.47. Note that this interval describes the mean, not the range in which most individual values fall.
## Identifying Outliers based on Tukey's fences (1.5 × IQR)
# Column under inspection
variable <- data[["duration_ms"]]

# Box plot gives a visual check of the spread and of extreme points
boxplot(variable, main = "Boxplot of duration_ms")

# First and third quartiles, and the interquartile range between them
q1 <- quantile(variable, probs = 0.25)
q3 <- quantile(variable, probs = 0.75)
iqr <- q3 - q1

# Tukey's fences: 1.5 IQRs below Q1 and above Q3
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr

# Anything not lying within [lower, upper] is treated as an outlier
outliers <- variable[!(variable >= lower & variable <= upper)]

# Report the flagged values on a single line
cat("Outliers:", outliers, "\n")
## Outliers
The scatter plot does not reveal a strong correlation between Duration and Loudness.
In this data dive, I explored various relationships between Spotify audio features using scatter plots and correlation coefficients. I also calculated confidence intervals for response variables to estimate population parameters.
The insights gathered include:
Danceability is positively correlated with Valence (r ≈ 0.49). Loudness is strongly positively correlated with Energy (r ≈ 0.79). Duration and Loudness do not exhibit a strong correlation (r ≈ 0.14).
These insights provide valuable information about the relationships between audio features in Spotify songs, which can be useful for music analysis and recommendation systems.