library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the Spotify song attributes export into a data frame
data <- read.csv(
  file = "/Users/yashuvaishu/Downloads/Spotify_Song_Attributes.csv"
)

# Per-column summary of the loaded data
summary(data)
## trackName artistName msPlayed genre
## Length:8512 Length:8512 Min. : 0 Length:8512
## Class :character Class :character 1st Qu.: 139988 Class :character
## Mode :character Mode :character Median : 269911 Mode :character
## Mean : 1539769
## 3rd Qu.: 1212140
## Max. :158367130
## danceability energy key loudness
## Min. :0.0000 Min. :0.00108 Min. : 0.000 Min. :-42.044
## 1st Qu.:0.5070 1st Qu.:0.40700 1st Qu.: 2.000 1st Qu.:-10.016
## Median :0.6225 Median :0.59200 Median : 5.000 Median : -7.129
## Mean :0.6017 Mean :0.56684 Mean : 5.244 Mean : -8.580
## 3rd Qu.:0.7140 3rd Qu.:0.75400 3rd Qu.: 8.000 3rd Qu.: -5.308
## Max. :0.9760 Max. :0.99900 Max. :11.000 Max. : 3.010
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.00000 Min. :0.0000017 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.03610 1st Qu.:0.0516000 1st Qu.:0.0000000
## Median :1.0000 Median :0.04790 Median :0.2390000 Median :0.0000241
## Mean :0.6171 Mean :0.07833 Mean :0.3578286 Mean :0.1495417
## 3rd Qu.:1.0000 3rd Qu.:0.08190 3rd Qu.:0.6580000 3rd Qu.:0.0236000
## Max. :1.0000 Max. :0.94100 Max. :0.9960000 Max. :0.9930000
## liveness valence tempo type
## Min. :0.0249 Min. :0.0000 Min. : 0.00 Length:8512
## 1st Qu.:0.0960 1st Qu.:0.2380 1st Qu.: 97.21 Class :character
## Median :0.1200 Median :0.4100 Median :118.94 Mode :character
## Mean :0.1748 Mean :0.4353 Mean :119.10
## 3rd Qu.:0.2090 3rd Qu.:0.6180 3rd Qu.:139.32
## Max. :0.9640 Max. :0.9860 Max. :236.20
## id uri track_href analysis_url
## Length:8512 Length:8512 Length:8512 Length:8512
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## duration_ms time_signature
## Min. : 10027 Min. :0.000
## 1st Qu.: 163173 1st Qu.:4.000
## Median : 195989 Median :4.000
## Mean : 203948 Mean :3.915
## 3rd Qu.: 231367 3rd Qu.:4.000
## Max. :1847210 Max. :5.000
# Compact overview of every column: its type and its first few values
str(object = data)
## 'data.frame': 8512 obs. of 22 variables:
## $ trackName : chr "A Better Place" "A Dangerous Thing" "A Different Way (with Lauv)" "A Drug From God" ...
## $ artistName : chr "Project AER" "AURORA" "DJ Snake" "Chris Lake" ...
## $ msPlayed : int 119999 1945555 66060 192455 97568 99339 6158627 40539 269453 60453 ...
## $ genre : chr "ambient guitar" "art pop" "edm" "bass house" ...
## $ danceability : num 0.496 0.541 0.784 0.714 0.0828 0.598 0.792 0.486 0.265 0.253 ...
## $ energy : num 0.255 0.556 0.757 0.883 0.012 0.295 0.484 0.881 0.312 0.139 ...
## $ key : int 9 11 8 9 9 1 4 2 7 6 ...
## $ loudness : num -17.98 -6.15 -3.91 -4.43 -36.05 ...
## $ mode : int 1 0 1 1 0 1 1 0 1 1 ...
## $ speechiness : num 0.0283 0.0356 0.0384 0.0625 0.0451 0.0276 0.192 0.0474 0.0569 0.0414 ...
## $ acousticness : num 0.937 0.465 0.495 0.00819 0.9 0.735 0.477 0.0244 0.243 0.726 ...
## $ instrumentalness: num 8.45e-01 0.00 1.18e-06 8.43e-01 8.12e-01 0.00 0.00 0.00 0.00 5.01e-05 ...
## $ liveness : num 0.0909 0.116 0.142 0.231 0.0875 0.107 0.106 0.429 0.0893 0.269 ...
## $ valence : num 0.0809 0.106 0.587 0.819 0.0578 0.314 0.245 0.667 0.0998 0.102 ...
## $ tempo : num 142 106 105 126 170 ...
## $ type : chr "audio_features" "audio_features" "audio_features" "audio_features" ...
## $ id : chr "2oC9Ah7npALCCPW5DC1gob" "0PDlmmYkuQCUAFhMXvtlsU" "1YMBg7rOjxzbya0fPOYfNX" "4skbQNtyjy8A7mo8oqe2oD" ...
## $ uri : chr "spotify:track:2oC9Ah7npALCCPW5DC1gob" "spotify:track:0PDlmmYkuQCUAFhMXvtlsU" "spotify:track:1YMBg7rOjxzbya0fPOYfNX" "spotify:track:4skbQNtyjy8A7mo8oqe2oD" ...
## $ track_href : chr "https://api.spotify.com/v1/tracks/2oC9Ah7npALCCPW5DC1gob" "https://api.spotify.com/v1/tracks/0PDlmmYkuQCUAFhMXvtlsU" "https://api.spotify.com/v1/tracks/1YMBg7rOjxzbya0fPOYfNX" "https://api.spotify.com/v1/tracks/4skbQNtyjy8A7mo8oqe2oD" ...
## $ analysis_url : chr "https://api.spotify.com/v1/audio-analysis/2oC9Ah7npALCCPW5DC1gob" "https://api.spotify.com/v1/audio-analysis/0PDlmmYkuQCUAFhMXvtlsU" "https://api.spotify.com/v1/audio-analysis/1YMBg7rOjxzbya0fPOYfNX" "https://api.spotify.com/v1/audio-analysis/4skbQNtyjy8A7mo8oqe2oD" ...
## $ duration_ms : int 120000 215573 198286 192455 97667 225680 224528 480707 269453 60453 ...
## $ time_signature : int 4 4 4 4 3 4 4 4 3 3 ...
# Fix the RNG seed so the 20-track sample (reused by a later plot) is the same
# on every run of the report
set.seed(42)
data1 <- slice_sample(data, n = 20)

# Scatterplot matrix of four audio features and play time.
# NOTE(review): pairs() draws the plot as a side effect and appears to return
# NULL, so `scatter_matrix` holds no plot object; the name is kept only in
# case other code refers to it.
scatter_matrix <- pairs(
  data[, c("loudness", "danceability", "energy", "tempo", "msPlayed")]
)
This scatterplot matrix provides a visual representation of the pairwise relationships between loudness, danceability, energy, tempo, and msPlayed. The direction and tightness of the point cloud in each panel give insights into the direction and strength of the relationship between the two variables.
# Packages for the genre tally and the bar chart
library(dplyr)
library(ggplot2)

# Number of tracks per genre, most frequent genre first
genre_counts <- data |>
  group_by(genre) |>
  summarize(count = n()) |>
  arrange(desc(count))

# Keep the ten most frequent genres
top_10_genres <- slice_head(genre_counts, n = 10)

# Bar chart of the ten genres, tallest bar on the left, labels tilted 45 degrees
ggplot(
  data = top_10_genres,
  mapping = aes(x = reorder(genre, -count), y = count, fill = genre)
) +
  geom_col() +
  theme_minimal() +
  ggtitle("Top 10 Genres in the Dataset") +
  xlab("Genre") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)

# Tempo (x) against valence (y), one translucent blue point per track
scatter_plot <- ggplot(data, aes(x = tempo, y = valence)) +
  geom_point(color = "blue", alpha = 0.7) +
  ggtitle("Relationship between Tempo and Valence") +
  xlab("Tempo") +
  ylab("Valence") +
  theme_minimal()

# Render the plot
print(scatter_plot)
Tempo and Valence:
Fast Tempo (High BPM):
Often associated with energetic and upbeat music.
Higher valence values may indicate a more positive or happy emotional tone.
Slow Tempo (Low BPM):
Commonly found in slower, more contemplative music.
Lower valence values may suggest a sadder or more reflective emotional tone.
Moderate Tempo:
Strikes a balance between energy and relaxation.
Valence values may vary, offering a diverse emotional palette.
# Mean tempo per genre within the sampled tracks in `data1`.
# Plotting the raw rows with stat = "identity" draws one bar per track, so a
# genre that occurs more than once in the sample gets its bars drawn on top of
# each other; summarising to the mean gives exactly one bar per genre.
barplot_genre_tempo <- ggplot(data1, aes(x = genre, y = tempo, fill = genre)) +
  geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
  labs(
    title = "Mean Tempo Across Genres (Random Sample of Tracks)",
    x = "Genre",
    y = "Mean Tempo"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    # The x-axis already names each genre, so the fill legend is redundant
    legend.position = "none"
  )

# Show the barplot
print(barplot_genre_tempo)
This graph shows tempo across the genres in a random sample of 20 tracks.
library(ggplot2)

# Loudness (x) against energy (y), one translucent steel-blue point per track
ggplot(data = data, mapping = aes(x = loudness, y = energy)) +
  geom_point(color = "steelblue", alpha = 0.7) +
  ggtitle("Relationship Between Loudness and Energy") +
  xlab("Loudness") +
  ylab("Energy") +
  theme_minimal()
Music Production and Editing:
Scenario: Music producers may use this visualization to understand the
relationship between loudness and energy in their songs.
Producers can identify the loudness levels that contribute to higher
energy in the music. This insight can guide decisions during the editing
and mastering process to optimize the energy levels based on audience
preferences.
# Tempo against valence with half-transparent blue points
data |>
  ggplot(aes(x = tempo, y = valence)) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(
    title = "Scatter Plot: Tempo vs Valence",
    x = "Tempo",
    y = "Valence"
  ) +
  theme_minimal()
The scatter plot suggests, at most, a weak positive relationship between tempo and valence. This means that as the tempo of a piece of music increases, its valence, or emotional positivity, tends to increase only slightly. Faster tempos are often associated with excitement, happiness, and energy, but tempo alone explains very little of the variation in valence.
# Correlation between tempo and valence, returned as a 2 x 2 matrix
correlation_matrix <- cor(data[c("tempo", "valence")])

# Display the matrix
print(correlation_matrix)
## tempo valence
## tempo 1.000000 0.114693
## valence 0.114693 1.000000
The correlation coefficient between tempo and valence is 0.1147, indicating a weak positive correlation.
This suggests that, on average, as tempo increases, there is a slight increase in valence, and vice versa.
This correlation can guide the development of recommendation algorithms, allowing platforms to consider both tempo and valence preferences for a more nuanced music discovery experience.
Correlation Analysis for Each Genre:
# Group the tracks by genre so the correlation is computed within each genre
genre_groups <- data |> group_by(genre)

# Tempo-valence correlation per genre. cor() is only called when both standard
# deviations are strictly positive; genres where a standard deviation is zero
# or undefined (e.g. a single track) get NA instead.
# A plain if/else is used because the condition is a single value per group,
# and NA_real_ keeps the column numeric even if every genre were skipped.
# NOTE(review): a genre with exactly two tracks always yields +1 or -1, so
# extreme values may reflect tiny groups rather than a real relationship.
genre_correlations <- summarise(
  genre_groups,
  correlation_tempo_valence =
    if (isTRUE(sd(tempo) > 0) && isTRUE(sd(valence) > 0)) {
      cor(tempo, valence)
    } else {
      NA_real_
    }
)

# Drop the genres whose correlation could not be computed
genre_correlations <- na.omit(genre_correlations)
print(genre_correlations)
## # A tibble: 312 × 2
## genre correlation_tempo_valence
## <chr> <dbl>
## 1 acoustic pop 0.139
## 2 adult standards 0.992
## 3 aesthetic rap 0.169
## 4 afghan pop 0.438
## 5 afrobeats 0.434
## 6 afrofuturism -0.770
## 7 aggressive phonk -0.256
## 8 alabama indie -0.0430
## 9 alaska indie 1
## 10 album rock 0.339
## # ℹ 302 more rows
Column Descriptions:
genre: The genre of the
music.
correlation_tempo_valence: The
correlation coefficient between tempo and valence for each
genre.
Sample Rows:
acoustic pop: The correlation
coefficient for acoustic pop is approximately 0.14. This positive value
suggests a weak positive correlation between tempo and valence in
acoustic pop music. As tempo increases, valence tends to increase
slightly.
adult standards: The correlation
coefficient for adult standards is approximately 0.99. This very high
positive value indicates a strong positive correlation between tempo and
valence in adult standards. A faster tempo is strongly associated with
higher valence in this genre.
aesthetic rap: The correlation
coefficient for aesthetic rap is approximately 0.17. This positive value
suggests a weak positive correlation between tempo and valence in
aesthetic rap.
afghan pop: The correlation
coefficient for Afghan pop is approximately 0.44. This positive value
suggests a moderate positive correlation between tempo and valence in
Afghan pop music.
afrobeats: The correlation
coefficient for afrobeats is approximately 0.43. This positive value
suggests a moderate positive correlation between tempo and valence in
afrobeats.
afrofuturism: The correlation
coefficient for afrofuturism is approximately -0.77. This negative value
indicates a strong negative correlation between tempo and valence in
afrofuturism. As tempo increases, valence tends to decrease
significantly.
aggressive phonk: The correlation
coefficient for aggressive phonk is approximately -0.26. This negative
value suggests a weak negative correlation between tempo and valence in
aggressive phonk.
alabama indie: The correlation
coefficient for Alabama indie is approximately -0.04. This near-zero
value suggests a very weak negative correlation between tempo and
valence in Alabama indie.
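alaska indie: The correlation
coefficient for Alaska indie is 1. This perfect positive correlation (1)
indicates a strong positive relationship between tempo and valence in
Alaska indie music. Note that a genre containing only two tracks always
yields a correlation of exactly 1 or -1, so this value should be
interpreted with caution.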
album rock: The correlation
coefficient for album rock is approximately 0.34. This positive value
suggests a moderate positive correlation between tempo and valence in
album rock.
Interpretation:
Positive correlation coefficients indicate that as tempo increases, valence tends to increase as well.
Negative correlation coefficients indicate that as tempo increases, valence tends to decrease.
The strength of the correlation is indicated by the absolute value of the coefficient: close to 1 indicates a strong correlation, close to 0 indicates a weak correlation.
Considerations:
Correlation does not imply causation. These values suggest associations but do not indicate a causal relationship between tempo and valence.
Interpret the results in the context of the specific genre and the nature of the music.
Applications:
Music Recommendation Systems:
Personalized Playlists: Understanding the correlation between tempo and valence in different genres allows for the creation of more personalized playlists. For example, a recommendation system could suggest playlists with high-energy, upbeat tracks (positive valence) for users who enjoy faster tempos.
Enhanced User Experience: Recommending music that aligns with both tempo and valence preferences enhances the overall listening experience for users. For instance, a user who prefers slower tempos and positive valence may receive recommendations tailored to those preferences.
Genre-Specific Features for Platforms:
Genre-Based Features: Music streaming platforms can incorporate genre-specific features based on these correlations. For instance, they can highlight the energetic and positive aspects of genres with strong positive correlations between tempo and valence.
User Interface Customization: The platform’s user interface could adapt to showcase genres that align with the user’s preferred tempo and valence combination.
Content Creation for Artists:
Guidance for Music Creators: Artists can leverage these correlations to guide their creative process. For example, an artist creating music in a specific genre can consider the typical relationship between tempo and valence to align with audience expectations.
Targeting Audience Preferences: Understanding the preferred tempo and valence combinations within genres allows artists to target specific audience segments and enhance the marketability of their music
# Split the tracks into two groups at an energy of 0.5
low_energy_songs <- filter(data, energy < 0.5)
high_energy_songs <- filter(data, energy >= 0.5)

# Two-sample t-test comparing danceability between the two energy groups
t_test <- t.test(
  x = low_energy_songs$danceability,
  y = high_energy_songs$danceability
)

# Show the test result
print(t_test)
##
## Welch Two Sample t-test
##
## data: low_energy_songs$danceability and high_energy_songs$danceability
## t = -22.272, df = 5087, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09057155 -0.07591700
## sample estimates:
## mean of x mean of y
## 0.5488327 0.6320770
# ggplot2 provides the plotting functions used below
library(ggplot2)

# Bar chart of mean energy within ten danceability bins produced by cut();
# the x-axis labels are rotated 90 degrees so the bin ranges stay readable
ggplot(
  data = data,
  mapping = aes(x = cut(danceability, breaks = 10), y = energy)
) +
  geom_bar(stat = "summary", fun = "mean", fill = "skyblue", alpha = 0.7) +
  ggtitle("Mean Energy Across Danceability Levels") +
  xlab("Danceability Levels") +
  ylab("Mean Energy") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Scenario An event planner or DJ is organizing a party or
event.
Use Case: Knowing that there is a significant difference in
danceability, the playlist or DJ set can be curated to include
high-energy songs during peak dance moments. This ensures a dynamic and
engaging experience for attendees.
# Box plot of danceability for low- vs high-energy tracks, split at 0.5.
# The grouping gets explicit labels so the x-axis does not show the raw
# logical values FALSE/TRUE.
ggplot(
  data,
  aes(
    x = factor(
      energy >= 0.5,
      levels = c(FALSE, TRUE),
      labels = c("Low (< 0.5)", "High (>= 0.5)")
    ),
    y = danceability
  )
) +
  geom_boxplot() +
  labs(x = "Energy Level", y = "Danceability") +
  theme_minimal()
Since the p-value is less than the significance level, we reject the null hypothesis (H0) in favor of the alternative hypothesis (H1).
Response variable = danceability
Explanatory variable = genre
# Significance level (not referenced again in the visible code)
alpha <- 0.05

# One-way ANOVA of danceability across genres
anova_result <- aov(formula = danceability ~ genre, data = data)

# ANOVA table
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 522 86.77 0.16623 10.43 <2e-16 ***
## Residuals 7989 127.29 0.01593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is enough evidence to conclude that there is a significant difference in danceability scores among different music genres.
Listeners can expect distinct differences in danceability based on the genre of music they choose.
# Linear model: danceability explained by energy and tempo (main effects only)
model_two <- lm(formula = danceability ~ energy + tempo, data = data)

# Coefficient table and fit statistics
summary(model_two)
##
## Call:
## lm(formula = danceability ~ energy + tempo, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58470 -0.09469 0.01380 0.10779 0.41258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.281e-01 7.429e-03 71.086 < 2e-16 ***
## energy 1.854e-01 6.985e-03 26.548 < 2e-16 ***
## tempo -2.652e-04 5.828e-05 -4.551 5.41e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1524 on 8509 degrees of freedom
## Multiple R-squared: 0.07659, Adjusted R-squared: 0.07637
## F-statistic: 352.9 on 2 and 8509 DF, p-value: < 2.2e-16
The results show that both energy (β=0.1854, p<0.001) and `tempo` (β=-0.00027, p<0.001) are significant predictors of danceability, indicating that higher energy and lower tempo are associated with higher danceability, after controlling for each other. The adjusted R-squared value of 0.07637 suggests that the model explains about 7.6% of the variance in danceability.
Now let us consider whether the effect of energy might depend on tempo. I test this hypothesis by adding an interaction term, energy*tempo.
# Linear model with the energy and tempo main effects plus their interaction
model_three <- lm(formula = danceability ~ energy * tempo, data = data)

# Coefficient table and fit statistics
summary(model_three)
##
## Call:
## lm(formula = danceability ~ energy * tempo, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52091 -0.09403 0.01242 0.10816 0.41084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3464561 0.0153791 22.53 <2e-16 ***
## energy 0.5408797 0.0273157 19.80 <2e-16 ***
## tempo 0.0013323 0.0001320 10.09 <2e-16 ***
## energy:tempo -0.0030461 0.0002265 -13.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1508 on 8508 degrees of freedom
## Multiple R-squared: 0.09582, Adjusted R-squared: 0.0955
## F-statistic: 300.5 on 3 and 8508 DF, p-value: < 2.2e-16
Based on the output of the model, we can conclude that the interaction term between energy and tempo is significant (p-value < 0.001). This suggests that the effect of energy on danceability depends on the level of tempo. Specifically, the negative coefficient estimate for the interaction term (-0.003) indicates that the effect of energy on danceability becomes weaker at higher levels of tempo.
However, it’s important to note that the adjusted R-squared of the model is relatively low (0.095), indicating that the predictors in the model do not explain a large proportion of the variation in danceability.
Music Recommendation:
The results can inform the music recommendation system to consider the relationship between energy and danceability when suggesting songs to users. For example, the system might tailor recommendations based on users’ preferences for energetic and danceable songs.
# Simple linear regression with energy as the response and danceability as
# the single predictor
linear_model <- lm(formula = energy ~ danceability, data = data)

# Coefficient table and fit statistics
summary(linear_model)
##
## Call:
## lm(formula = energy ~ danceability, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63724 -0.16144 0.01066 0.17260 0.56315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.316757 0.009893 32.02 <2e-16 ***
## danceability 0.415651 0.015899 26.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2326 on 8510 degrees of freedom
## Multiple R-squared: 0.07434, Adjusted R-squared: 0.07423
## F-statistic: 683.5 on 1 and 8510 DF, p-value: < 2.2e-16
From Results by looking at coefficients section in the model summary.
The linear regression model output shows that both danceability and the intercept are significant predictors of energy (p < 2e-16), with an increase in danceability positively associated with an increase in energy. The coefficient estimate for danceability is 0.415651, meaning that for every unit increase in danceability, there is an expected increase in energy of 0.415651 units. However, the adjusted R-squared value of 0.07423 suggests that the model explains only a small amount of the variance and there could be other factors that influence energy which are not captured by the current model.
# Packages used by this chunk
library(ggplot2)
library(dplyr)

# Danceability (x) against energy (y) with a red straight-line fit and no
# confidence band
scatter_plot <- ggplot(data = data, mapping = aes(x = danceability, y = energy)) +
  geom_point(color = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Scatter Plot with Linear Regression: Danceability vs. Energy") +
  xlab("Danceability") +
  ylab("Energy") +
  theme_minimal()

# Render the plot
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Loudness (x) against danceability (y) with a red straight-line fit.
# The title and axis labels name the variables that are actually plotted;
# they previously read "Danceability vs. Energy" / "Danceability" / "Energy".
scatter_plot <- ggplot(data, aes(x = loudness, y = danceability)) +
  geom_point(alpha = 0.7, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Scatter Plot with Linear Regression: Loudness vs. Danceability",
    x = "Loudness",
    y = "Danceability"
  ) +
  theme_minimal()

# Show the scatter plot with regression line
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
# Tracks with mode == 1 only
mode_one_tracks <- data |>
  filter(mode == 1)

# R-squared of the straight-line fit drawn below, so the subtitle can report
# an actual value instead of ending at the "=" sign
loudness_fit <- lm(danceability ~ loudness, data = mode_one_tracks)
linear_r_squared <- summary(loudness_fit)$r.squared

# Loudness against danceability with two fits: the dashed red line is the
# linear model, the second geom_smooth() uses ggplot2's default smoother
mode_one_tracks |>
  ggplot(mapping = aes(x = loudness, y = danceability)) +
  geom_point() +
  geom_smooth(
    method = "lm", color = "red", linetype = "dashed",
    se = FALSE
  ) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Loudness vs danceability",
    subtitle = paste("Linear Fit R-Squared =", round(linear_r_squared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
The scatter plot shows a negative correlation between loudness and danceability. This means that as the loudness of a piece of music increases, its danceability, or how suitable it is for dancing, tends to decrease. This is likely because louder music is often associated with heavier genres such as rock and metal, which are not typically associated with dancing.
This information can be helpful in music recommendation systems by helping to filter out songs that are too loud or too quiet for a user’s preferences. For example, if a user indicates that they prefer danceable music, the system can avoid recommending songs that are too loud.