# Load the Spotify track export into a data frame, then print a compact
# summary of its columns (names, types and the first few values).
spotify_csv <- "/Users/yashuvaishu/Downloads/Spotify.csv"
data <- read.csv(file = spotify_csv)
str(object = data)
## 'data.frame': 8511 obs. of 13 variables:
## $ trackName : chr "A Better Place" "A Dangerous Thing" "A Different Way (with Lauv)" "A Drug From God" ...
## $ artistName : chr "Project AER" "AURORA" "DJ Snake" "Chris Lake" ...
## $ msPlayed : int 119999 1945555 66060 192455 97568 99339 6158627 40539 269453 60453 ...
## $ genre : chr "ambient guitar" "art pop" "edm" "bass house" ...
## $ danceability: num 0.496 0.541 0.784 0.714 0.0828 0.598 0.792 0.486 0.265 0.253 ...
## $ energy : num 0.255 0.556 0.757 0.883 0.012 0.295 0.484 0.881 0.312 0.139 ...
## $ key : int 9 11 8 9 9 1 4 2 7 6 ...
## $ loudness : num -17.98 -6.15 -3.91 -4.43 -36.05 ...
## $ speechiness : num 0.0283 0.0356 0.0384 0.0625 0.0451 0.0276 0.192 0.0474 0.0569 0.0414 ...
## $ valence : num 0.0809 0.106 0.587 0.819 0.0578 0.314 0.245 0.667 0.0998 0.102 ...
## $ tempo : num 142 106 105 126 170 ...
## $ id : chr "2oC9Ah7npALCCPW5DC1gob" "0PDlmmYkuQCUAFhMXvtlsU" "1YMBg7rOjxzbya0fPOYfNX" "4skbQNtyjy8A7mo8oqe2oD" ...
## $ duration_ms : int 120000 215573 198286 192455 97667 225680 224528 480707 269453 60453 ...
Perform a t-test to compare the mean danceability of songs with low energy and songs with high energy. Calculate the test statistic and compare it to the critical value for your chosen alpha level.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Split the tracks into two groups at an energy score of 0.5:
# strictly below 0.5 is "low energy", 0.5 and above is "high energy".
low_energy_songs <- filter(data, energy < 0.5)
high_energy_songs <- filter(data, energy >= 0.5)
# Two-sample t-test on danceability. t.test() defaults to the Welch
# (unequal-variance) form and a two-sided alternative.
t_test <- t.test(
  x = low_energy_songs$danceability,
  y = high_energy_songs$danceability
)
# Show the statistic, degrees of freedom, p-value, confidence interval
# and the two group means.
print(t_test)
##
## Welch Two Sample t-test
##
## data: low_energy_songs$danceability and high_energy_songs$danceability
## t = -22.264, df = 5087.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09054185 -0.07588711
## sample estimates:
## mean of x mean of y
## 0.5488327 0.6320472
# Box plot of danceability for low- vs high-energy tracks, using the same
# 0.5 energy cut-off as the t-test.
ggplot(data, aes(x = factor(energy >= 0.5), y = danceability)) +
  geom_boxplot() +
  # The grouping factor has the raw levels "FALSE"/"TRUE"; map them to
  # readable tick labels so the axis can be understood without the code.
  scale_x_discrete(
    labels = c("FALSE" = "Low (< 0.5)", "TRUE" = "High (>= 0.5)")
  ) +
  labs(x = "Energy Level", y = "Danceability") +
  theme_minimal()
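Perform a t-test on loudness. Note that the code below groups the tracks by tempo (speechiness is not used) and compares the mean loudness of the two groups. Calculate the test statistic and compare it to the critical value for your chosen alpha level.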
# Loudness by tempo group
# Tempo values in this data set are on the order of 100-170 (presumably
# beats per minute), so a fixed 0.5 cut-off leaves almost every track in
# the "high" group. Split at the median tempo so both groups are populated.
tempo_cutoff <- median(data$tempo, na.rm = TRUE)
# Use tempo-specific names so the energy-based subsets created earlier in
# the script are not overwritten with unrelated rows.
low_tempo_songs <- data %>% filter(tempo < tempo_cutoff)
high_tempo_songs <- data %>% filter(tempo >= tempo_cutoff)
# t.test() needs at least two observations per group; fail with a clear
# condition instead of an obscure error if the split is degenerate.
stopifnot(nrow(low_tempo_songs) > 1, nrow(high_tempo_songs) > 1)
# Welch two-sample t-test on mean loudness (t.test() default)
t_test2 <- t.test(low_tempo_songs$loudness, high_tempo_songs$loudness)
# Print test result
print(t_test2)
# NOTE(review): the output previously recorded here (t = -6.2313,
# df = 3.0026, p-value = 0.008313) came from the old `tempo < 0.5` split,
# where df of about 3 indicates one group held only a handful of rows.
# Re-run this chunk to regenerate the output for the median split.
# Create a box plot of loudness for low- vs high-tempo tracks
library(ggplot2)
# Tempo values in this data set are on the order of 100-170, so a 0.5
# cut-off would put nearly every track in one box; split at the median.
tempo_cutoff <- median(data$tempo, na.rm = TRUE)
# Drop rows without a tempo so no "NA" box is drawn, then label the groups
# explicitly instead of showing the raw FALSE/TRUE factor levels.
tempo_plot_data <- data[!is.na(data$tempo), ]
tempo_plot_data$tempo_group <- factor(
  tempo_plot_data$tempo >= tempo_cutoff,
  levels = c(FALSE, TRUE),
  labels = c("Low tempo", "High tempo")
)
# No geom_smooth() layer: the x axis is discrete, so a linear smooth has
# nothing to fit and only produced warnings. The colour scale that served
# that layer is removed with it.
ggplot(
  tempo_plot_data,
  aes(x = tempo_group, y = loudness, fill = tempo_group)
) +
  geom_boxplot(color = "blue", outlier.shape = NA) +
  labs(x = "Tempo Group", y = "Loudness", title = "Loudness by Tempo Group") +
  theme_minimal() +
  # Name the fill values by level so the mapping does not depend on level
  # order. A light fill keeps the blue outline and median line visible.
  scale_fill_manual(
    values = c("Low tempo" = "lightgray", "High tempo" = "lightblue")
  ) +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.title = element_blank(),
    legend.text = element_text(size = 10),
    plot.title = element_text(size = 14, hjust = 0.5)
  )
library(ggplot2)
# Mean and median danceability for low- vs high-energy tracks, using the
# same 0.5 energy cut-off as the t-test. Missing danceability values are
# ignored so the means agree with those reported by t.test(), which also
# drops NAs.
summary_data <- data %>%
  group_by(Energy_Level = factor(energy >= 0.5)) %>%
  summarize(
    Danceability_Mean = mean(danceability, na.rm = TRUE),
    Danceability_Median = median(danceability, na.rm = TRUE)
  )
# Bar plot of the group means, with each value printed above its bar
ggplot(summary_data, aes(x = Energy_Level, y = Danceability_Mean)) +
  # geom_col() is the direct form of geom_bar(stat = "identity")
  geom_col(fill = "lightblue") +
  geom_text(aes(label = round(Danceability_Mean, 2)), vjust = -0.3, size = 4) +
  # Energy_Level has the raw levels "FALSE"/"TRUE"; show readable labels
  scale_x_discrete(
    labels = c("FALSE" = "Low (< 0.5)", "TRUE" = "High (>= 0.5)")
  ) +
  labs(
    x = "Energy Level",
    y = "Mean Danceability",
    title = "Mean Danceability by Energy Level"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(size = 14, hjust = 0.5)
  )
Perform a t-test on the data and calculate the p-value. If the p-value is less than or equal to your chosen alpha level (0.05), reject the null hypothesis and conclude that there is a significant difference in danceability between songs with low and high energy.
# Test whether mean danceability differs between low- and high-energy
# songs with a Welch two-sample t-test, as the accompanying text describes.
#
# Fisher's exact test on table(energy, danceability) is not appropriate
# here: both columns are continuous, so the table is extremely large and
# sparse. With simulate.p.value = TRUE the value it reported,
# 0.0004997501, is exactly 1 / (2000 + 1): the smallest p-value the default
# B = 2000 Monte Carlo replicates can return, not a measured one.
is_high_energy <- data$energy >= 0.5
# which() drops rows whose energy is NA instead of indexing with NA
danceability_test <- t.test(
  data$danceability[which(!is_high_energy)],
  data$danceability[which(is_high_energy)]
)
# Extract the p-value from the test result
p_value <- danceability_test$p.value
# Setting your significance level (alpha)
alpha <- 0.05
# Reject when the p-value is less than or equal to alpha, matching the
# decision rule stated in the text
if (p_value <= alpha) {
  cat("Reject the null hypothesis")
} else {
  cat("Fail to reject the null hypothesis")
}
## Reject the null hypothesis
# Print the p-value
cat(" P-value:", p_value, "\n")
# NOTE(review): the Welch test printed earlier in this report gives
# p-value < 2.2e-16 for the same 0.5 energy split, so the conclusion above
# still holds; re-run this chunk to print the exact value.