# Location of the Spotify CSV export. Set the SPOTIFY_CSV environment variable
# to run this analysis against a copy stored elsewhere; when it is unset or
# empty, the original download location is used.
spotify_csv <- Sys.getenv("SPOTIFY_CSV")
if (!nzchar(spotify_csv)) {
  spotify_csv <- "/Users/yashuvaishu/Downloads/Spotify.csv"
}
# Load the listening history and audio features into a data frame.
data <- read.csv(spotify_csv)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Inspect the structure of the data frame: column names, types and first values.
str(data)
## 'data.frame': 8511 obs. of 13 variables:
## $ trackName : chr "A Better Place" "A Dangerous Thing" "A Different Way (with Lauv)" "A Drug From God" ...
## $ artistName : chr "Project AER" "AURORA" "DJ Snake" "Chris Lake" ...
## $ msPlayed : int 119999 1945555 66060 192455 97568 99339 6158627 40539 269453 60453 ...
## $ genre : chr "ambient guitar" "art pop" "edm" "bass house" ...
## $ danceability: num 0.496 0.541 0.784 0.714 0.0828 0.598 0.792 0.486 0.265 0.253 ...
## $ energy : num 0.255 0.556 0.757 0.883 0.012 0.295 0.484 0.881 0.312 0.139 ...
## $ key : int 9 11 8 9 9 1 4 2 7 6 ...
## $ loudness : num -17.98 -6.15 -3.91 -4.43 -36.05 ...
## $ speechiness : num 0.0283 0.0356 0.0384 0.0625 0.0451 0.0276 0.192 0.0474 0.0569 0.0414 ...
## $ valence : num 0.0809 0.106 0.587 0.819 0.0578 0.314 0.245 0.667 0.0998 0.102 ...
## $ tempo : num 142 106 105 126 170 ...
## $ id : chr "2oC9Ah7npALCCPW5DC1gob" "0PDlmmYkuQCUAFhMXvtlsU" "1YMBg7rOjxzbya0fPOYfNX" "4skbQNtyjy8A7mo8oqe2oD" ...
## $ duration_ms : int 120000 215573 198286 192455 97667 225680 224528 480707 269453 60453 ...
# Print per-column summaries (quartiles and mean for the numeric columns).
summary(data)
## trackName artistName msPlayed genre
## Length:8511 Length:8511 Min. : 0 Length:8511
## Class :character Class :character 1st Qu.: 139977 Class :character
## Mode :character Mode :character Median : 269850 Mode :character
## Mean : 1539795
## 3rd Qu.: 1211910
## Max. :158367130
## danceability energy key loudness
## Min. :0.0000 Min. :0.00108 Min. : 0.000 Min. :-42.044
## 1st Qu.:0.5070 1st Qu.:0.40700 1st Qu.: 2.000 1st Qu.:-10.016
## Median :0.6220 Median :0.59200 Median : 5.000 Median : -7.132
## Mean :0.6016 Mean :0.56681 Mean : 5.243 Mean : -8.580
## 3rd Qu.:0.7140 3rd Qu.:0.75400 3rd Qu.: 8.000 3rd Qu.: -5.309
## Max. :0.9760 Max. :0.99900 Max. :11.000 Max. : 3.010
## speechiness valence tempo id
## Min. :0.00000 Min. :0.0000 Min. : 0.00 Length:8511
## 1st Qu.:0.03610 1st Qu.:0.2380 1st Qu.: 97.18 Class :character
## Median :0.04790 Median :0.4100 Median :118.94 Mode :character
## Mean :0.07833 Mean :0.4353 Mean :119.10
## 3rd Qu.:0.08190 3rd Qu.:0.6180 3rd Qu.:139.32
## Max. :0.94100 Max. :0.9860 Max. :236.20
## duration_ms
## Min. : 10027
## 1st Qu.: 163173
## Median : 195989
## Mean : 203951
## 3rd Qu.: 231378
## Max. :1847210
# Set 1: Danceability and Valence
# Keep only the two features compared in this section
set1 <- select(data, danceability, valence)
# Scatter plot of valence against danceability, built up layer by layer
set1_plot <- ggplot(set1, aes(x = danceability, y = valence))
set1_plot <- set1_plot + geom_point(alpha = 0.9, color = "#505168")
set1_plot <- set1_plot +
  labs(
    title = "Relationship between Danceability and Valence",
    x = "Danceability",
    y = "Valence"
  )
print(set1_plot)
# Correlation coefficient between the two features
print(cor(set1[["danceability"]], set1[["valence"]]))
## [1] 0.4881956
# 95% confidence interval for the mean valence, taken from a one-sample t-test
valence_ci1_1 <- t.test(set1[["valence"]])[["conf.int"]]
print(valence_ci1_1)
## [1] 0.4301183 0.4404433
## attr(,"conf.level")
## [1] 0.95
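The 95% confidence interval estimates the population mean of valence: we are 95% confident that the true mean valence lies between about 0.430 and 0.440. It is an interval for the mean, not a range that contains most individual values.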
# 95% confidence interval for the mean danceability, from a one-sample t-test
danceability1_1 <- t.test(set1[["danceability"]])[["conf.int"]]
print(danceability1_1)
## [1] 0.5982702 0.6050095
## attr(,"conf.level")
## [1] 0.95
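The 95% confidence interval estimates the population mean of danceability: we are 95% confident that the true mean danceability lies between about 0.598 and 0.605. It is an interval for the mean, not a range that contains most individual values.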
## Identifying Outliers
# Feature examined for outliers in this section
variable <- data[["danceability"]]
# A boxplot gives a visual check of the distribution and any extreme points
boxplot(variable, main = "Boxplot of danceability")
# First and third quartiles, and the interquartile range between them
q1 <- quantile(variable, probs = 0.25)
q3 <- quantile(variable, probs = 0.75)
iqr <- q3 - q1
# Tukey's fences sit 1.5 * IQR above Q3 and below Q1
upper <- q3 + 1.5 * iqr
lower <- q1 - 1.5 * iqr
# Values falling outside the fences are treated as outliers
outside_fences <- variable < lower | variable > upper
outliers <- variable[outside_fences]
# Print the outliers on a single line
cat("Outliers:", outliers, "\n")
## Outliers: 0.0828 0.171 0.157 0.188 0.115 0.0681 0.176 0.102 0.157 0.175 0 0.169 0.164 0.0901 0.148 0.15 0.174 0.112 0.128 0.087 0.0641 0.18 0.182 0 0.175 0.135 0.112 0.0802 0.163 0.163 0.112 0.102 0.0685 0.0972 0.187 0.148 0.0744 0.104 0.086 0.0631 0.194 0.162 0.0835 0.19 0.111 0.16 0.179 0.182 0.0717 0.196 0.162 0.0848 0.178 0.0833 0.0793 0.181 0.159 0.15 0.173 0.124 0.0863 0.0994 0.0769 0.124 0.156 0.136 0.156 0.14 0.159 0.157 0.145 0.109 0.149 0.14 0.187 0.186 0.0814 0.0828 0.171 0.157 0.188 0.115 0.0681 0.176 0.102 0.157 0.175 0 0.169 0.164 0.0901 0.148 0.15 0.174 0.112 0.128 0.087 0.0641 0.18 0.182 0 0.175 0.135 0.112 0.0802 0.163 0.163 0.112 0.102 0.0685 0.0972 0.187 0.148 0.0744 0.104 0.086 0.0631 0.194 0.162 0.0835 0.19 0.111 0.16 0.179 0.182 0.0717 0.196 0.162 0.0848 0.178 0.0833 0.0793 0.181 0.159 0.15 0.173 0.124 0.0863 0.0994 0.0769 0.124 0.156 0.136 0.156 0.14 0.159 0.157 0.145 0.109 0.149 0.14 0.187 0.186
Based on the scatter plot and the correlation coefficient (r ≈ 0.49), there is a moderate positive correlation between danceability and valence, suggesting that songs that are more danceable tend to have a higher valence.
# Set 2: Loudness and Energy
# Keep only the two features compared in this section
set2 <- data %>%
  select(loudness, energy)
# Scatter plot of energy against loudness.
# No boxplot layer is added here: both axes are continuous, so geom_boxplot()
# would need an explicit grouping variable; without one ggplot2 draws a single
# box across the whole x range and warns "Continuous x aesthetic".
ggplot(set2, aes(x = loudness, y = energy)) +
  geom_point(alpha = 0.9, color = "#DCC48E") +
  labs(
    title = "Relationship between Loudness and Energy",
    x = "Loudness",
    y = "Energy"
  )
# Correlation coefficient between the two features
cor(set2$loudness, set2$energy)
## [1] 0.7938352
# 95% confidence interval for the mean loudness, from a one-sample t-test
loudness1_1 <- t.test(set2[["loudness"]])[["conf.int"]]
print(loudness1_1)
## [1] -8.693652 -8.467097
## attr(,"conf.level")
## [1] 0.95
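The 95% confidence interval estimates the population mean of loudness: we are 95% confident that the true mean loudness lies between about -8.69 and -8.47. It is an interval for the mean, not a range that contains most individual values.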
# 95% confidence interval for the mean energy, from a one-sample t-test
energy1_1 <- t.test(set2[["energy"]])[["conf.int"]]
print(energy1_1)
## [1] 0.5616746 0.5719487
## attr(,"conf.level")
## [1] 0.95
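The 95% confidence interval estimates the population mean of energy: we are 95% confident that the true mean energy lies between about 0.562 and 0.572. It is an interval for the mean, not a range that contains most individual values.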
## Identifying Outliers
# Feature examined for outliers in this section
variable <- data[["loudness"]]
# A boxplot gives a visual check of the distribution and any extreme points
boxplot(variable, main = "Boxplot of loudness")
# First and third quartiles, and the interquartile range between them
q1 <- quantile(variable, probs = 0.25)
q3 <- quantile(variable, probs = 0.75)
iqr <- q3 - q1
# Tukey's fences sit 1.5 * IQR above Q3 and below Q1
upper <- q3 + 1.5 * iqr
lower <- q1 - 1.5 * iqr
# Values falling outside the fences are treated as outliers
outside_fences <- variable < lower | variable > upper
outliers <- variable[outside_fences]
cat("Outliers:", outliers, "\n")
## Outliers: -17.984 -36.045 -25.382 -20.274 -22.15 -38.793 -18.064 -22.456 -20.089 -18.99 -25.501 -24.299 -20.574 -23.789 -19.535 -36.657 -18.412 -19.93 -17.282 -34.177 -19.074 -23.594 -17.413 -20.179 -24.231 -20.233 -21.081 -21.105 -18.214 -22.515 -28.613 -19.451 -17.328 -19.623 -17.212 -17.805 -18.153 -20.541 -22.331 -22.654 -22.699 -18.054 -20.989 -23.325 -29.287 -19.153 -23.934 -25.642 -29.17 -19.422 -24.402 -28.785 -25.791 -18.225 -19.198 -17.792 -25.243 -25.243 -18.221 -18.717 -17.67 -17.925 -18.366 -30.099 -33.292 -26.941 -21.835 -17.547 -21.356 -17.139 -22.779 -17.214 -19.706 -20.499 -17.576 -21.771 -29.724 -29.724 -21.538 -36.759 -18.505 -24.232 -17.331 -19.976 -19.345 -20.274 -22.383 -19.072 -27.788 -24.608 -18.163 -19.794 -17.247 -20.284 -17.864 -23.826 -22.457 -22.396 -21.348 -30.709 -37.841 -23.115 -20.87 -17.582 -35.251 -24.331 -22.628 -23.025 -22.077 -19.18 -24.711 -18.558 -17.943 -32.863 -31.042 -17.264 -24.887 -21.809 -26.011 -17.298 -19.316 -20.354 -30.769 -19.007 -17.265 -17.455 -27.749 -26.328 -23.049 -27.554 -23.581 -20.382 -30.79 -22.412 -29.715 -26.257 -23.455 -18.698 -17.165 -17.349 -18.357 -17.927 -19.783 -37.156 -33.03 -29.956 -18.433 -20.508 -31.092 -27.213 -18.585 -24.41 -27.167 -18.326 -18.624 -18.015 -20.261 -17.374 -24.894 -28.516 -18.411 -23.411 -28.587 -38.222 -30.054 -19.585 -19.673 -20.05 -18.914 -23.921 -23.796 -28.115 -29.063 -23.032 -22.898 -27.712 -22.002 -19.067 -17.944 -20.191 -28.277 -29.08 -42.044 -18.226 -31.121 -28.246 -22.32 -27.72 -30.407 -20.231 -24.698 -20.265 -25.115 -21.958 -18.859 -31.414 -28.05 -21.845 -18.599 -32.335 -29.707 -31.258 -21.859 -17.446 -21.77 -23.197 -17.216 -18.146 -17.569 -24.581 -18.781 -24.884 -32.122 -17.937 -17.487 -23.255 -26.494 -18.372 -20.472 -29.388 -29.967 -20.554 -23.269 -23.849 -23.604 -23.84 -24.684 -36.281 -36.284 -30.777 -24.708 -28.659 -27.928 -23.663 -17.331 -28.892 -27.685 -33.153 -24.718 -18.344 -17.224 -18.942 -29.631 -28.922 -19.513 -24.173 -18.041 -17.86 -24.717 -22.841 
-23.052 -21.428 -30.117 -17.501 -25.997 -30.142 -25.164 -22.644 -39.219 -19.668 -24.22 -18.756 -20.073 -25.103 -19.033 -30.932 -20.508 -29.489 -17.491 -18.531 -18.037 -28.491 -25.418 -21.585 -28.662 -27.988 -17.208 -19.304 -19.155 -19.552 -21.071 -20.969 -19.204 -19.981 -19.165 -20.872 -27.712 3.01 -17.153 -37.242 -34.559 -20.459 -26.509 -28.249 -19.031 -23.512 -28.927 -17.651 -19.499 -24.782 -17.719 -19.47 -33.085 -21.772 -17.984 -36.045 -25.382 -20.274 -22.15 -38.793 -18.064 -22.456 -20.089 -18.99 -25.501 -24.299 -20.574 -23.789 -19.535 -36.657 -18.412 -19.93 -17.282 -34.177 -19.074 -23.594 -17.413 -20.179 -24.231 -20.233 -21.081 -21.105 -18.214 -22.515 -28.613 -19.451 -17.328 -19.623 -17.212 -17.805 -18.153 -20.541 -22.331 -22.654 -22.699 -18.054 -20.989 -23.325 -29.287 -19.153 -23.934 -25.642 -29.17 -19.422 -24.402 -28.785 -25.791 -18.225 -19.198 -17.792 -25.243 -25.243 -18.221 -18.717 -17.67 -17.925 -18.366 -30.099 -33.292 -26.941 -21.835 -17.547 -21.356 -17.139 -22.779 -17.214 -19.706 -20.499 -17.576 -21.771 -29.724 -29.724 -21.538 -36.759 -18.505 -24.232 -17.331 -19.976 -19.345 -20.274 -22.383 -19.072 -27.788 -24.608 -18.163 -19.794 -17.247 -20.284 -17.864 -23.826 -22.457 -22.396 -21.348 -30.709 -37.841 -23.115 -20.87 -17.582 -35.251 -24.331 -22.628 -23.025 -22.077 -19.18 -24.711 -18.558 -17.943 -32.863 -31.042 -17.264 -24.887 -21.809 -26.011 -17.298 -19.316 -20.354 -30.769 -19.007 -17.265 -17.455 -27.749 -26.328 -23.049 -27.554 -23.581 -20.382 -30.79 -22.412 -29.715 -26.257 -23.455 -18.698 -17.165 -17.349 -18.357 -17.927 -19.783 -37.156 -33.03 -29.956 -18.433 -20.508 -31.092 -27.213 -18.585 -24.41 -27.167 -18.326 -18.624 -18.015 -20.261 -17.374 -24.894 -28.516 -18.411 -23.411 -28.587 -38.222 -30.054 -19.585 -19.673 -20.05 -18.914 -23.921 -23.796 -28.115 -29.063 -23.032 -22.898 -27.712 -22.002 -19.067 -17.944 -20.191 -28.277 -29.08 -42.044 -18.226 -31.121 -28.246 -22.32 -27.72 -30.407 -20.231 -24.698 -20.265 -25.115 -21.958 -18.859 -31.414 -28.05 -21.845 
-18.599 -32.335 -29.707 -31.258 -21.859 -17.446 -21.77 -23.197 -17.216 -18.146 -17.569 -24.581 -18.781 -24.884 -32.122 -17.937 -17.487 -23.255 -26.494 -18.372 -20.472 -29.388 -29.967 -20.554 -23.269 -23.849 -23.604 -23.84 -24.684 -36.281 -36.284 -30.777 -24.708 -28.659 -27.928 -23.663 -17.331 -28.892 -27.685 -33.153 -24.718 -18.344 -17.224 -18.942 -29.631 -28.922 -19.513 -24.173 -18.041 -17.86 -24.717 -22.841 -23.052 -21.428 -30.117 -17.501 -25.997 -30.142 -25.164 -22.644 -39.219 -19.668 -24.22 -18.756 -20.073 -25.103 -19.033 -30.932 -20.508 -29.489 -17.491 -18.531 -18.037 -28.491 -25.418 -21.585 -28.662 -27.988 -17.208 -19.304 -19.155 -19.552 -21.071 -20.969 -19.204 -19.981 -19.165 -20.872 -27.712 3.01 -17.153 -37.242 -34.559 -20.459 -26.509 -28.249 -19.031 -23.512 -28.927 -17.651 -19.499 -24.782
The scatter plot shows a strong positive correlation between Loudness and Energy (r ≈ 0.79), indicating that louder songs tend to have higher energy.
# Set 3: Duration and Loudness
# Keep only the two features compared in this section
set3 <- select(data, duration_ms, loudness)
# Scatter plot of loudness against track duration, built up layer by layer
set3_plot <- ggplot(set3, aes(x = duration_ms, y = loudness))
set3_plot <- set3_plot + geom_point()
set3_plot <- set3_plot +
  labs(
    title = "Relationship between Duration and Loudness",
    x = "Duration",
    y = "Loudness"
  )
print(set3_plot)
# Correlation coefficient between the two features
print(cor(set3[["duration_ms"]], set3[["loudness"]]))
## [1] 0.1407408
# Calculating z-scores for identifying Outliers
# Standardise both features so each value is expressed as a z-score
z_scores_x <- scale(data[["duration_ms"]])
z_scores_y <- scale(data[["loudness"]])
# Flag rows where either feature has an absolute z-score above 4
extreme_duration <- abs(z_scores_x) > 4
extreme_loudness <- abs(z_scores_y) > 4
# Row positions (not values) of the flagged observations
outliers <- which(extreme_duration | extreme_loudness)
print(outliers)
## [1] 5 61 181 185 214 321 896 955 959 1090 1107 1284 1291 1321 1400
## [16] 1401 1518 1576 1594 1812 1843 1844 1944 2030 2246 2247 2558 2649 2697 2744
## [31] 2799 2810 2851 2861 2870 2977 3086 3173 3183 3204 3210 3268 3486 3538 3590
## [46] 3611 3703 3827 4011 4012 4181 4240 4294 4350 4470 4474 4503 4610 5185 5244
## [61] 5248 5379 5396 5573 5581 5611 5690 5691 5808 5866 5884 6102 6133 6134 6234
## [76] 6320 6536 6537 6848 6939 6987 7034 7089 7100 7141 7151 7160 7267 7376 7463
## [91] 7473 7494 7500 7558 7776 7828 7880 7901 7993 8117 8301 8302 8471
# 95% confidence interval for the mean duration_ms, from a one-sample t-test
duration_ms1_1 <- t.test(set3[["duration_ms"]])[["conf.int"]]
print(duration_ms1_1)
## [1] 202389.0 205512.1
## attr(,"conf.level")
## [1] 0.95
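The 95% confidence interval estimates the population mean of duration_ms: we are 95% confident that the true mean track duration lies between about 202,389 and 205,512 ms. It is an interval for the mean, not a range that contains most individual values.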
# 95% confidence interval for the mean loudness, from a one-sample t-test
loudness1_3 <- t.test(set3[["loudness"]])[["conf.int"]]
print(loudness1_3)
## [1] -8.693652 -8.467097
## attr(,"conf.level")
## [1] 0.95
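The 95% confidence interval estimates the population mean of loudness: we are 95% confident that the true mean loudness lies between about -8.69 and -8.47. It is an interval for the mean, not a range that contains most individual values.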
## Identifying Outliers based on Tukey's fences (IQR)
# Feature examined for outliers in this section
variable <- data[["duration_ms"]]
# A boxplot gives a visual check of the distribution and any extreme points
boxplot(variable, main = "Boxplot of duration_ms")
# First and third quartiles, and the interquartile range between them
q1 <- quantile(variable, probs = 0.25)
q3 <- quantile(variable, probs = 0.75)
iqr <- q3 - q1
# Tukey's fences sit 1.5 * IQR above Q3 and below Q1
upper <- q3 + 1.5 * iqr
lower <- q1 - 1.5 * iqr
# Values falling outside the fences are treated as outliers
outside_fences <- variable < lower | variable > upper
outliers <- variable[outside_fences]
# Print the outliers on a single line
cat("Outliers:", outliers, "\n")
## Outliers: 480707 60453 422713 385253 342669 364787 393220 17075 340341 460027 338587 354079 1847210 553056 352667 429251 409000 374938 336567 392827 361053 346347 446987 368852 342024 21907 10027 60333 338333 355227 350120 408133 454773 348966 613747 46737 46737 339933 58307 34667 458815 362093 31045 373133 49910 49910 366587 506876 426499 336692 417240 364987 475058 352388 402202 356827 373357 341820 354253 336118 392267 361373 341396 342880 350200 472947 18333 484613 351701 59027 395707 379733 547107 349560 377493 393813 58288 58042 339160 364800 346627 436267 351133 343032 427224 473025 352916 376169 57674 442000 384770 414896 555573 45587 465133 367818 339086 438493 483920 353907 362227 466887 345013 475187 375347 340253 470500 383013 341028 478808 341812 440100 339320 56794 52057 366853 361067 403613 364330 379227 490435 404432 364286 402655 357242 382973 353732 680779 344053 355893 29400 345240 379768 356674 535963 53442 54960 621659 405600 345307 42893 363587 394162 355175 359373 388707 405029 363521 427360 55143 342720 404174 374333 510573 383513 357433 334973 55324 424573 59245 354863 357877 465200 408063 47395 373691 437317 392427 400042 359387 411493 410213 537653 404107 412424 336519 363253 373987 366760 444369 427547 356067 427507 461435 397773 446098 369547 349547 337660 456000 373267 354267 41733 498852 454200 476960 385853 389693 347453 359253 53360 390747 380053 343253 415813 358870 461269 378160 437139 334744 389024 357187 397440 1394312 397533 359347 58018 346040 407573 480707 60453 422713 385253 342669 364787 393220 17075 340341 460027 338587 354079 1847210 553056 352667 429251 409000 374938 336567 392827 361053 346347 446987 368852 342024 21907 10027 60333 338333 355227 350120 408133 454773 348966 613747 46737 46737 339933 58307 34667 458815 362093 31045 373133 49910 49910 366587 506876 426499 336692 417240 364987 475058 352388 402202 356827 373357 341820 354253 336118 392267 361373 341396 342880 350200 472947 18333 484613 351701 59027 
395707 379733 547107 349560 377493 393813 58288 58042 339160 364800 346627 436267 351133 343032 427224 473025 352916 376169 57674 442000 384770 414896 555573 45587 465133 367818 339086 438493 483920 353907 362227 466887 345013 475187 375347 340253 470500 383013 341028 478808 341812 440100 339320 56794 52057 366853 361067 403613 364330 379227 490435 404432 364286 402655 357242 382973 353732 680779 344053 355893 29400 345240 379768 356674 535963 53442 54960 621659 405600 345307 42893 363587 394162 355175 359373 388707 405029 363521 427360 55143 342720 404174 374333 510573 383513 357433 334973 55324 424573 59245 354863 357877 465200 408063 47395 373691 437317 392427 400042 359387 411493 410213 537653 404107 412424 336519 363253 373987 366760 444369 427547 356067 427507 461435 397773 446098 369547 349547 337660 456000 373267 354267 41733 498852 454200 476960 385853 389693 347453 359253 53360 390747 380053 343253 415813 358870 461269 378160 437139 334744 389024 357187 397440 1394312 397533 359347
The scatter plot does not reveal a strong correlation between Duration and Loudness.
In this data dive, I explored various relationships between Spotify audio features using scatter plots and correlation coefficients. I also calculated confidence intervals for response variables to estimate population parameters.
The insights gathered include:
Danceability is moderately positively correlated with Valence (r ≈ 0.49). Loudness is strongly positively correlated with Energy (r ≈ 0.79). Duration and Loudness do not exhibit a strong correlation (r ≈ 0.14).
These insights provide valuable information about the relationships between audio features in Spotify songs, which can be useful for music analysis and recommendation systems.