# Load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
library(ggplot2)
library(pwr)
## Warning: package 'pwr' was built under R version 4.3.3
# Read the Spotify songs CSV (first row holds the column names) into a
# data frame, then print a compact column-by-column preview of it.
spotify_songs <- read.csv(
  file = "C:/Users/priya/Downloads/spotify_songs.csv",
  header = TRUE
)
glimpse(spotify_songs)
## Rows: 32,833
## Columns: 23
## $ track_id <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity <int> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key <int> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode <int> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms <int> 194754, 162600, 176616, 169093, 189052, 16304…
# Derive the release year and a before/after-2010 label for every track.
#
# The year is taken from the first four characters of
# track_album_release_date instead of parsing the whole string with
# as.Date(format = "%Y-%m-%d"): as.Date() returns NA for any value that is not
# a complete year-month-day string, so those tracks would lose their year and
# period. NOTE(review): the 1886 rows with an NA period in the group counts
# suggest such values exist (presumably year-only dates) -- confirm in the data.
spotify_songs <- spotify_songs %>%
  mutate(
    year = as.numeric(substr(track_album_release_date, 1, 4)),
    # Tracks released in 2010 itself are labelled "After 2010" (year >= 2010)
    release_period = ifelse(year < 2010, "Before 2010", "After 2010")
  )
# Verifying that the year and release_period columns exist
head(spotify_songs)
## track_id track_name
## 1 6f807x0ima9a1j3VPbc7VN I Don't Care (with Justin Bieber) - Loud Luxury Remix
## 2 0r7CVbZTWZgbTCYdfa2P31 Memories - Dillon Francis Remix
## 3 1z1Hg7Vb0AhHDiEmnDE79l All the Time - Don Diablo Remix
## 4 75FpbthrwQmzHlBJLuGdC7 Call You Mine - Keanu Silva Remix
## 5 1e8PAfcKUYoKkxPhrHqw4x Someone You Loved - Future Humans Remix
## 6 7fvUMiyapMsRRxr07cU8Ef Beautiful People (feat. Khalid) - Jack Wins Remix
## track_artist track_popularity track_album_id
## 1 Ed Sheeran 66 2oCs0DGTsRO98Gh5ZSl2Cx
## 2 Maroon 5 67 63rPSO264uRjW1X5E6cWv6
## 3 Zara Larsson 70 1HoSmj2eLcsrR0vE9gThr4
## 4 The Chainsmokers 60 1nqYsOef1yKKuGOVchbsk6
## 5 Lewis Capaldi 69 7m7vv9wlQ4i0LFuJiE2zsQ
## 6 Ed Sheeran 67 2yiy9cd2QktrNvWC2EUi0k
## track_album_name
## 1 I Don't Care (with Justin Bieber) [Loud Luxury Remix]
## 2 Memories (Dillon Francis Remix)
## 3 All the Time (Don Diablo Remix)
## 4 Call You Mine - The Remixes
## 5 Someone You Loved (Future Humans Remix)
## 6 Beautiful People (feat. Khalid) [Jack Wins Remix]
## track_album_release_date playlist_name playlist_id playlist_genre
## 1 2019-06-14 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop
## 2 2019-12-13 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop
## 3 2019-07-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop
## 4 2019-07-19 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop
## 5 2019-03-05 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop
## 6 2019-07-11 Pop Remix 37i9dQZF1DXcZDD7cfEKhW pop
## playlist_subgenre danceability energy key loudness mode speechiness
## 1 dance pop 0.748 0.916 6 -2.634 1 0.0583
## 2 dance pop 0.726 0.815 11 -4.969 1 0.0373
## 3 dance pop 0.675 0.931 1 -3.432 0 0.0742
## 4 dance pop 0.718 0.930 7 -3.778 1 0.1020
## 5 dance pop 0.650 0.833 1 -4.672 1 0.0359
## 6 dance pop 0.675 0.919 8 -5.385 1 0.1270
## acousticness instrumentalness liveness valence tempo duration_ms year
## 1 0.1020 0.00e+00 0.0653 0.518 122.036 194754 2019
## 2 0.0724 4.21e-03 0.3570 0.693 99.972 162600 2019
## 3 0.0794 2.33e-05 0.1100 0.613 124.008 176616 2019
## 4 0.0287 9.43e-06 0.2040 0.277 121.956 169093 2019
## 5 0.0803 0.00e+00 0.0833 0.725 123.976 189052 2019
## 6 0.0799 0.00e+00 0.1430 0.585 124.982 163049 2019
## release_period
## 1 After 2010
## 2 After 2010
## 3 After 2010
## 4 After 2010
## 5 After 2010
## 6 After 2010
# Recompute the release year and the before/after-2010 label used by the
# hypothesis test below.
#
# The year is read from the first four characters of the release-date string.
# Parsing with as.Date(format = "%Y-%m-%d") yields NA for any date that is not
# a full year-month-day value, which silently drops those tracks from both
# comparison groups. NOTE(review): presumably some dates are year-only --
# confirm against the data.
spotify_songs <- spotify_songs %>%
  mutate(
    year = as.numeric(substr(track_album_release_date, 1, 4)),
    # Tracks released in 2010 itself are labelled "After 2010" (year >= 2010)
    release_period = ifelse(year < 2010, "Before 2010", "After 2010")
  )
# A priori power analysis for a two-sample t-test: given the significance
# level, the desired power and the assumed effect size, compute how many
# observations each group needs.
alpha_level <- 0.05 # Type 1 error rate
power_level <- 0.8 # Power (1 - Type 2 error rate)
effect_size <- 0.5 # Medium effect size (Cohen's d)
# pwr.t.test() solves for the argument left unspecified (n, per group)
sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha_level,
  power = power_level,
  type = "two.sample"
)$n
# Report the requirement rounded UP: n is fractional, and rounding to the
# nearest integer could understate the number of observations needed.
print(paste("Required sample size for each group: ", ceiling(sample_size)))
## [1] "Required sample size for each group: 64"
# Count the tracks in each release period so the counts can be compared with
# the required sample size. Rows with a missing release_period are dropped
# first: they belong to neither comparison group, and counting them as a third
# "group" would let their size decide the min(count) adequacy check.
group_sizes <- spotify_songs %>%
  filter(!is.na(release_period)) %>%
  group_by(release_period) %>%
  summarise(count = n(), .groups = "drop")
print(group_sizes)
## # A tibble: 3 × 2
## release_period count
## <chr> <int>
## 1 After 2010 23770
## 2 Before 2010 7177
## 3 <NA> 1886
# Run the danceability comparison only when the smallest group reaches the
# sample size required by the power analysis; otherwise report the shortfall.
if (min(group_sizes$count) < sample_size) {
  print("Not enough data to perform the hypothesis test.")
} else {
  # Two-sample t-test of danceability across the release periods
  t_test_result <- t.test(danceability ~ release_period, data = spotify_songs)
  print(t_test_result)
  # Decision rule: compare the p-value with the pre-set alpha level
  print(
    if (t_test_result$p.value < alpha_level) {
      "Reject the null hypothesis: There is a significant difference in danceability before and after 2010."
    } else {
      "Fail to reject the null hypothesis: There is no significant difference in danceability before and after 2010."
    }
  )
}
##
## Welch Two Sample t-test
##
## data: danceability by release_period
## t = 12.33, df = 10385, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group After 2010 and group Before 2010 is not equal to 0
## 95 percent confidence interval:
## 0.02204703 0.03038180
## sample estimates:
## mean in group After 2010 mean in group Before 2010
## 0.6633384 0.6371240
##
## [1] "Reject the null hypothesis: There is a significant difference in danceability before and after 2010."
The code is structured to perform a two-sample t-test, comparing the danceability of songs released before and after 2010, using the Neyman-Pearson framework. Here’s the breakdown:
Alpha Level (Type 1 Error Rate): Set at 0.05, meaning there is a 5% risk of rejecting the null hypothesis when it is actually true (i.e., a false positive).
Power Level (1 - Type 2 Error Rate): Set at 0.8 (80% power), meaning there is an 80% chance of correctly rejecting the null hypothesis if the alternative hypothesis is true (i.e., minimizing the chance of a false negative).
Effect Size: A medium effect size (0.5) is chosen, based on Cohen’s guidelines. This is a measure of how substantial the difference between the two groups must be to be detected by the test.
Sample Size Calculation: The code calculates the required sample size using the power analysis function pwr.t.test(). The required sample size per group is calculated based on the chosen alpha level, power, and effect size. The code then compares the actual number of songs in each group (“Before 2010” and “After 2010”) with the required sample size. If the groups meet the required sample size, the test proceeds; otherwise, it stops, stating there is not enough data.
Two-Sample T-Test: If the sample size is sufficient, the two-sample t-test is performed to compare the average danceability of songs before and after 2010. The p-value from the t-test is compared to the alpha level of 0.05:
Test Interpretation: 1. Reject Null Hypothesis: If the null hypothesis is rejected, it suggests that songs released before and after 2010 have significantly different levels of danceability. 2. Fail to Reject Null Hypothesis: If the null hypothesis is not rejected, it means there is no strong evidence to suggest a difference in danceability between the two periods.
Sample Size Sufficiency: The sample size calculation indicates whether there is enough data to confidently perform the test and detect a meaningful difference if it exists. This is crucial because performing a test with too little data could lead to incorrect conclusions.
Statistical Significance of Danceability Differences: If the null hypothesis is rejected, we can conclude that songs released before and after 2010 have a significant difference in danceability. This might indicate a trend in the music industry where newer songs have evolved in terms of danceability, possibly reflecting changes in music production or consumer preferences.
No Significant Difference: If the null hypothesis is not rejected, it suggests that the danceability of songs has not changed significantly between the two periods, implying consistency in musical trends regarding this feature over time.
This test has practical implications for understanding how certain musical qualities, like danceability, have evolved over time. Music producers, streaming platforms, and artists might use this information to understand trends in music that resonate with listeners. Additionally, researchers in the music industry might be interested in studying whether certain periods in music history saw shifts in specific qualities like danceability, energy, or tempo.
Other Song Features: Are there other features, such as energy or valence, that show more significant differences across time periods? Investigating additional audio features might provide a more comprehensive understanding of how music has evolved.
Cultural or Industry Changes: What cultural or industry shifts could explain differences (or lack thereof) in danceability? For example, did streaming services, the rise of electronic music, or globalized music trends contribute to changes in the overall structure of songs?
Influence of Genres: Does the difference in danceability hold across different genres? For instance, perhaps pop or dance music has seen more change over time than rock or classical music.
# H0: energy and loudness are uncorrelated.
# H1: energy and loudness are correlated.
# Two-sided Pearson product-moment correlation test on the two columns
cor_test_result <- cor.test(
  x = spotify_songs$energy,
  y = spotify_songs$loudness,
  alternative = "two.sided",
  method = "pearson"
)
# Show the full test summary (statistic, p-value, confidence interval, r)
print(cor_test_result)
##
## Pearson's product-moment correlation
##
## data: spotify_songs$energy and spotify_songs$loudness
## t = 166.5, df = 32831, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6707167 0.6824465
## sample estimates:
## cor
## 0.6766245
# Significance-testing decision: compare the correlation test's p-value with
# the chosen significance level and print the matching conclusion.
alpha_level <- 0.05 # Significance level
print(
  if (cor_test_result$p.value < alpha_level) {
    "Reject the null hypothesis: There is a significant correlation between energy and loudness."
  } else {
    "Fail to reject the null hypothesis: There is no significant correlation between energy and loudness."
  }
)
## [1] "Reject the null hypothesis: There is a significant correlation between energy and loudness."
# Print the three-line note on why the test result is considered reliable.
# The lines are joined with newlines; no trailing newline is emitted.
cat(
  "The dataset contains a large number of observations, providing sufficient data to confidently apply the Pearson correlation test.",
  "The p-value, based on Fisher's framework, indicates whether we should reject the null hypothesis.",
  "Given the structure of the data and the robustness of the correlation test, we can be confident that the conclusion drawn from this test is reliable.",
  sep = "\n"
)
## The dataset contains a large number of observations, providing sufficient data to confidently apply the Pearson correlation test.
## The p-value, based on Fisher's framework, indicates whether we should reject the null hypothesis.
## Given the structure of the data and the robustness of the correlation test, we can be confident that the conclusion drawn from this test is reliable.
The provided code is designed to test the correlation between two variables, energy and loudness, using Pearson’s correlation test, which is appropriate for testing linear relationships between continuous variables. The test is conducted within the framework of Fisher’s Significance Testing, where we evaluate the p-value to decide whether to reject the null hypothesis.
Pearson Correlation Test: The cor.test() function performs the Pearson correlation test, which calculates both:
The correlation coefficient (r), a measure of the strength and direction of the relationship between energy and loudness. A value close to +1 or -1 indicates a strong relationship, while a value close to 0 indicates no relationship.
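The p-value, which indicates how likely it would be to observe a correlation at least as strong as the one in the sample if there were truly no correlation between the variables (i.e., by random chance alone).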
Interpretation of the p-value: The p-value is compared to the significance level (alpha = 0.05):
Reject Null Hypothesis: If the p-value is less than 0.05, we reject the null hypothesis, indicating that there is a statistically significant correlation between energy and loudness.
Fail to Reject Null Hypothesis: If the p-value is greater than or equal to 0.05, we fail to reject the null hypothesis, meaning there is no statistically significant correlation.
Confidence in the Data: The explanation mentions that the dataset contains a large number of observations, which is important because larger sample sizes increase the reliability of statistical tests. The Pearson correlation test is robust and widely used, so if the test finds a significant correlation, we can be confident that the conclusion is reliable.
1. Significant Correlation: If the null hypothesis is rejected (p-value < 0.05), it means there is a statistically significant correlation between energy and loudness. This would suggest that these two features of the songs are related, with louder songs likely having higher energy or vice versa.
Understanding Song Features: A significant correlation between energy and loudness could imply that songs with higher energy levels tend to be louder, which could be useful for music producers, DJs, or streaming platforms. This relationship might reflect general industry practices where louder songs are perceived as more energetic.
Musical Composition and Perception: If there is a significant relationship between energy and loudness, it may also shed light on how listeners perceive songs. Loudness and energy might combine to evoke stronger emotional responses or create a more intense listening experience.
Strength and Nature of Correlation: How strong is the correlation between energy and loudness? Is it positive or negative? A further examination of the correlation coefficient might reveal whether the relationship is weak, moderate, or strong, and whether louder songs are typically more energetic (positive correlation) or less energetic (negative correlation).
Other Song Features: Are there other song features, such as valence (mood) or tempo, that might also correlate with energy or loudness? Exploring additional relationships could provide a broader understanding of how different features influence each other.
Genre-Specific Correlations: Does the correlation between energy and loudness hold across different music genres? It’s possible that some genres, such as electronic dance music (EDM) or rock, might show stronger correlations between these features than others, such as classical or jazz.
# Boxplot of danceability for songs released before vs. after 2010.
# Rows without a release period are removed first so they are not drawn as a
# third, unlabeled box, and the x-axis order is set explicitly so the periods
# read chronologically instead of alphabetically.
spotify_songs %>%
  filter(!is.na(release_period)) %>%
  ggplot(aes(x = release_period, y = danceability, fill = release_period)) +
  geom_boxplot() +
  scale_x_discrete(limits = c("Before 2010", "After 2010")) +
  labs(
    title = "Danceability of Songs Before and After 2010",
    x = "Release Period",
    y = "Danceability"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c("Before 2010" = "lightblue", "After 2010" = "lightgreen")
  ) +
  # The fill colour duplicates the x-axis labels, so the legend is hidden
  theme(legend.position = "none")
This boxplot compares the danceability of songs released before and after 2010. It helps illustrate whether the distribution of danceability differs across these two periods, aligning with the hypothesis test performed.
This visualization supports the results of the t-test performed earlier. If the test indicated a significant difference, the boxplot should show a difference in the danceability distributions. If the null hypothesis was not rejected, the boxplot is likely to show considerable overlap in the distributions.
# Energy (x) against loudness (y): semi-transparent points with a fitted
# straight line drawn on top, without its confidence band.
ggplot(data = spotify_songs, mapping = aes(x = energy, y = loudness)) +
  geom_point(colour = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  theme_minimal() +
  labs(
    x = "Energy",
    y = "Loudness",
    title = "Correlation between Energy and Loudness"
  )
## `geom_smooth()` using formula = 'y ~ x'
This scatter plot shows the relationship between energy and loudness, with a regression line to highlight the linear trend. This aligns with the second hypothesis test, where we examined whether there is a significant correlation between the two variables.
This visualization complements the Pearson correlation test. If the test found a significant correlation, the scatter plot should show a clear pattern with points clustering along the regression line. If the test found no significant correlation, the scatter plot is expected to show little or no discernible pattern.