library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the Spotify song-attributes CSV into the `data` data frame.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# to wherever your copy of the file lives.
data <- read.csv(
  file = "/Users/yashuvaishu/Downloads/Spotify_Song_Attributes.csv"
)

# Column-by-column summary of the loaded data
summary(object = data)
##   trackName          artistName           msPlayed            genre          
##  Length:8512        Length:8512        Min.   :        0   Length:8512       
##  Class :character   Class :character   1st Qu.:   139988   Class :character  
##  Mode  :character   Mode  :character   Median :   269911   Mode  :character  
##                                        Mean   :  1539769                     
##                                        3rd Qu.:  1212140                     
##                                        Max.   :158367130                     
##   danceability        energy             key            loudness      
##  Min.   :0.0000   Min.   :0.00108   Min.   : 0.000   Min.   :-42.044  
##  1st Qu.:0.5070   1st Qu.:0.40700   1st Qu.: 2.000   1st Qu.:-10.016  
##  Median :0.6225   Median :0.59200   Median : 5.000   Median : -7.129  
##  Mean   :0.6017   Mean   :0.56684   Mean   : 5.244   Mean   : -8.580  
##  3rd Qu.:0.7140   3rd Qu.:0.75400   3rd Qu.: 8.000   3rd Qu.: -5.308  
##  Max.   :0.9760   Max.   :0.99900   Max.   :11.000   Max.   :  3.010  
##       mode         speechiness       acousticness       instrumentalness   
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000017   Min.   :0.0000000  
##  1st Qu.:0.0000   1st Qu.:0.03610   1st Qu.:0.0516000   1st Qu.:0.0000000  
##  Median :1.0000   Median :0.04790   Median :0.2390000   Median :0.0000241  
##  Mean   :0.6171   Mean   :0.07833   Mean   :0.3578286   Mean   :0.1495417  
##  3rd Qu.:1.0000   3rd Qu.:0.08190   3rd Qu.:0.6580000   3rd Qu.:0.0236000  
##  Max.   :1.0000   Max.   :0.94100   Max.   :0.9960000   Max.   :0.9930000  
##     liveness         valence           tempo            type          
##  Min.   :0.0249   Min.   :0.0000   Min.   :  0.00   Length:8512       
##  1st Qu.:0.0960   1st Qu.:0.2380   1st Qu.: 97.21   Class :character  
##  Median :0.1200   Median :0.4100   Median :118.94   Mode  :character  
##  Mean   :0.1748   Mean   :0.4353   Mean   :119.10                     
##  3rd Qu.:0.2090   3rd Qu.:0.6180   3rd Qu.:139.32                     
##  Max.   :0.9640   Max.   :0.9860   Max.   :236.20                     
##       id                uri             track_href        analysis_url      
##  Length:8512        Length:8512        Length:8512        Length:8512       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   duration_ms      time_signature 
##  Min.   :  10027   Min.   :0.000  
##  1st Qu.: 163173   1st Qu.:4.000  
##  Median : 195989   Median :4.000  
##  Mean   : 203948   Mean   :3.915  
##  3rd Qu.: 231367   3rd Qu.:4.000  
##  Max.   :1847210   Max.   :5.000
# Compact overview of the data frame: its dimensions plus each column's
# name, type and first few values
str(data)
## 'data.frame':    8512 obs. of  22 variables:
##  $ trackName       : chr  "A Better Place" "A Dangerous Thing" "A Different Way (with Lauv)" "A Drug From God" ...
##  $ artistName      : chr  "Project AER" "AURORA" "DJ Snake" "Chris Lake" ...
##  $ msPlayed        : int  119999 1945555 66060 192455 97568 99339 6158627 40539 269453 60453 ...
##  $ genre           : chr  "ambient guitar" "art pop" "edm" "bass house" ...
##  $ danceability    : num  0.496 0.541 0.784 0.714 0.0828 0.598 0.792 0.486 0.265 0.253 ...
##  $ energy          : num  0.255 0.556 0.757 0.883 0.012 0.295 0.484 0.881 0.312 0.139 ...
##  $ key             : int  9 11 8 9 9 1 4 2 7 6 ...
##  $ loudness        : num  -17.98 -6.15 -3.91 -4.43 -36.05 ...
##  $ mode            : int  1 0 1 1 0 1 1 0 1 1 ...
##  $ speechiness     : num  0.0283 0.0356 0.0384 0.0625 0.0451 0.0276 0.192 0.0474 0.0569 0.0414 ...
##  $ acousticness    : num  0.937 0.465 0.495 0.00819 0.9 0.735 0.477 0.0244 0.243 0.726 ...
##  $ instrumentalness: num  8.45e-01 0.00 1.18e-06 8.43e-01 8.12e-01 0.00 0.00 0.00 0.00 5.01e-05 ...
##  $ liveness        : num  0.0909 0.116 0.142 0.231 0.0875 0.107 0.106 0.429 0.0893 0.269 ...
##  $ valence         : num  0.0809 0.106 0.587 0.819 0.0578 0.314 0.245 0.667 0.0998 0.102 ...
##  $ tempo           : num  142 106 105 126 170 ...
##  $ type            : chr  "audio_features" "audio_features" "audio_features" "audio_features" ...
##  $ id              : chr  "2oC9Ah7npALCCPW5DC1gob" "0PDlmmYkuQCUAFhMXvtlsU" "1YMBg7rOjxzbya0fPOYfNX" "4skbQNtyjy8A7mo8oqe2oD" ...
##  $ uri             : chr  "spotify:track:2oC9Ah7npALCCPW5DC1gob" "spotify:track:0PDlmmYkuQCUAFhMXvtlsU" "spotify:track:1YMBg7rOjxzbya0fPOYfNX" "spotify:track:4skbQNtyjy8A7mo8oqe2oD" ...
##  $ track_href      : chr  "https://api.spotify.com/v1/tracks/2oC9Ah7npALCCPW5DC1gob" "https://api.spotify.com/v1/tracks/0PDlmmYkuQCUAFhMXvtlsU" "https://api.spotify.com/v1/tracks/1YMBg7rOjxzbya0fPOYfNX" "https://api.spotify.com/v1/tracks/4skbQNtyjy8A7mo8oqe2oD" ...
##  $ analysis_url    : chr  "https://api.spotify.com/v1/audio-analysis/2oC9Ah7npALCCPW5DC1gob" "https://api.spotify.com/v1/audio-analysis/0PDlmmYkuQCUAFhMXvtlsU" "https://api.spotify.com/v1/audio-analysis/1YMBg7rOjxzbya0fPOYfNX" "https://api.spotify.com/v1/audio-analysis/4skbQNtyjy8A7mo8oqe2oD" ...
##  $ duration_ms     : int  120000 215573 198286 192455 97667 225680 224528 480707 269453 60453 ...
##  $ time_signature  : int  4 4 4 4 3 4 4 4 3 3 ...
# Draw a random sample of 20 songs; `data1` feeds the tempo-by-genre bar
# chart later in the report. The RNG is seeded so that the sample (and every
# figure built from it) is the same on each run, and slice_sample() replaces
# the superseded sample_n().
set.seed(42)
data1 <- slice_sample(data, n = 20)

# Scatter-plot matrix of five numeric attributes.
# pairs() draws the figure as a side effect and returns NULL, so
# `scatter_matrix` holds NULL; the name is kept only for compatibility.
scatter_matrix <- pairs(
  data[, c("loudness", "danceability", "energy", "tempo", "msPlayed")]
)

This scatter-plot matrix gives a visual overview of the pairwise relationships among loudness, danceability, energy, tempo, and msPlayed. No regression lines are drawn in this plot; when a regression line is fitted to a pair of variables, its slope and intercept describe the direction and size of the linear relationship between those two variables.

# Packages needed for the genre tally and its chart
library(dplyr)
library(ggplot2)

# Number of tracks per genre, most frequent genre first
genre_counts <- data |>
  group_by(genre) |>
  summarise(count = n()) |>
  arrange(desc(count))

# Keep the ten most frequent genres
top_10_genres <- slice_head(genre_counts, n = 10)

# Bar chart of those ten genres, ordered from most to least frequent
ggplot(
  top_10_genres,
  aes(x = reorder(genre, -count), y = count, fill = genre)
) +
  geom_col() +
  labs(title = "Top 10 Genres in the Dataset", x = "Genre", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)


# Tempo (x) against valence (y), one semi-transparent blue point per song
scatter_plot <- data |>
  ggplot(mapping = aes(x = tempo, y = valence)) +
  geom_point(color = "blue", alpha = 0.7) +
  ggtitle("Relationship between Tempo and Valence") +
  xlab("Tempo") +
  ylab("Valence") +
  theme_minimal()

# Render the plot
print(scatter_plot)

# Tempo of each song in the sampled data frame `data1`, drawn as bars whose
# heights are the tempo values themselves, one colour per genre
barplot_genre_tempo <- data1 |>
  ggplot(mapping = aes(x = genre, y = tempo, fill = genre)) +
  geom_col(position = "dodge", alpha = 0.7) +
  ggtitle("Distribution of Tempo Across Genres") +
  xlab("Genre") +
  ylab("Tempo") +
  theme_minimal() +
  # Vertical genre labels so that they do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Render the bar chart
print(barplot_genre_tempo)

This graph shows the distribution of tempo across genres for the random sample of 20 songs.

library(ggplot2)

# Loudness (x) against energy (y), one semi-transparent point per song
data |>
  ggplot(mapping = aes(x = loudness, y = energy)) +
  geom_point(color = "steelblue", alpha = 0.7) +
  ggtitle("Relationship Between Loudness and Energy") +
  xlab("Loudness") +
  ylab("Energy") +
  theme_minimal()

Music Production and Editing:
Scenario: Music producers may use this visualization to understand the relationship between loudness and energy in their songs.

Producers can identify the loudness levels that contribute to higher energy in the music. This insight can guide decisions during the editing and mastering process to optimize the energy levels based on audience preferences.

# Tempo (x) against valence (y); points are half-transparent to reveal
# overplotting
data |>
  ggplot(mapping = aes(x = tempo, y = valence)) +
  geom_point(alpha = 0.5, color = "blue") +
  ggtitle("Scatter Plot: Tempo vs Valence") +
  xlab("Tempo") +
  ylab("Valence") +
  theme_minimal()

The scatter plot, together with the correlation coefficient computed below (r ≈ 0.11), indicates only a weak positive correlation between tempo and valence. This means that as the tempo of a piece of music increases, its valence, or emotional positivity, tends to increase slightly. A possible explanation is that faster tempos are often associated with excitement, happiness, and energy.

# 2 x 2 correlation matrix of the tempo and valence columns
correlation_matrix <- data |>
  select(tempo, valence) |>
  cor()

# Show the matrix
print(correlation_matrix)
##            tempo  valence
## tempo   1.000000 0.114693
## valence 0.114693 1.000000
  1. Correlation Analysis for Each Genre:

    • Calculate correlation coefficients for tempo and valence within each genre subgroup.
# Correlation between tempo and valence, computed separately for every genre
# in the `data` data frame.

# Group by genre
genre_groups <- data %>% group_by(genre)

# One correlation per genre. The result is NA when the correlation is
# undefined: either variable has zero spread within the genre, or sd() is
# itself NA (for example a genre with a single track). The condition is a
# single value per group, so a scalar if/else is used instead of the
# vectorised ifelse(); isTRUE() makes the NA case explicit.
genre_correlations <- summarise(
  genre_groups,
  correlation_tempo_valence =
    if (isTRUE(sd(tempo) > 0) && isTRUE(sd(valence) > 0)) {
      cor(tempo, valence)
    } else {
      NA_real_
    }
)

# Drop the genres for which no correlation could be computed (optional)
genre_correlations <- na.omit(genre_correlations)

print(genre_correlations)
## # A tibble: 312 × 2
##    genre            correlation_tempo_valence
##    <chr>                                <dbl>
##  1 acoustic pop                        0.139 
##  2 adult standards                     0.992 
##  3 aesthetic rap                       0.169 
##  4 afghan pop                          0.438 
##  5 afrobeats                           0.434 
##  6 afrofuturism                       -0.770 
##  7 aggressive phonk                   -0.256 
##  8 alabama indie                      -0.0430
##  9 alaska indie                        1     
## 10 album rock                          0.339 
## # ℹ 302 more rows
  1. Column Descriptions:

    • genre: The genre of the music.

    • correlation_tempo_valence: The correlation coefficient between tempo and valence for each genre.

  2. Sample Rows:

    • acoustic pop: The correlation coefficient for acoustic pop is approximately 0.14. This positive value suggests a weak positive correlation between tempo and valence in acoustic pop music. As tempo increases, valence tends to increase slightly.

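    • adult standards: The correlation coefficient for adult standards is approximately 0.99. This very high positive value indicates a strong positive correlation between tempo and valence in adult standards: a faster tempo is strongly associated with higher valence in this genre. A value this close to 1 may also reflect a very small number of tracks in the genre, so the group size should be checked before relying on it.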

    • aesthetic rap: The correlation coefficient for aesthetic rap is approximately 0.17. This positive value suggests a weak positive correlation between tempo and valence in aesthetic rap.

    • afghan pop: The correlation coefficient for Afghan pop is approximately 0.44. This positive value suggests a moderate positive correlation between tempo and valence in Afghan pop music.

    • afrobeats: The correlation coefficient for afrobeats is approximately 0.43. This positive value suggests a moderate positive correlation between tempo and valence in afrobeats.

    • afrofuturism: The correlation coefficient for afrofuturism is approximately -0.77. This negative value indicates a strong negative correlation between tempo and valence in afrofuturism. As tempo increases, valence tends to decrease significantly.

    • aggressive phonk: The correlation coefficient for aggressive phonk is approximately -0.26. This negative value suggests a weak negative correlation between tempo and valence in aggressive phonk.

    • alabama indie: The correlation coefficient for Alabama indie is approximately -0.04. This near-zero value suggests a very weak negative correlation between tempo and valence in Alabama indie.

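    • alaska indie: The correlation coefficient for Alaska indie is 1. Taken at face value, this perfect positive correlation (1) indicates a strong positive relationship between tempo and valence in Alaska indie music. However, a correlation of exactly 1 is what any two data points with different values produce, so this result most likely comes from a genre with only two tracks and should not be over-interpreted.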

    • album rock: The correlation coefficient for album rock is approximately 0.34. This positive value suggests a moderate positive correlation between tempo and valence in album rock.

  3. Interpretation:

    • Positive correlation coefficients indicate that as tempo increases, valence tends to increase as well.

    • Negative correlation coefficients indicate that as tempo increases, valence tends to decrease.

    • The strength of the correlation is indicated by the absolute value of the coefficient: close to 1 indicates a strong correlation, close to 0 indicates a weak correlation.

  4. Considerations:

    • Correlation does not imply causation. These values suggest associations but do not indicate a causal relationship between tempo and valence.

    • Interpret the results in the context of the specific genre and the nature of the music.

Applications:

  1. Music Recommendation Systems:

    • Personalized Playlists: Understanding the correlation between tempo and valence in different genres allows for the creation of more personalized playlists. For example, a recommendation system could suggest playlists with high-energy, upbeat tracks (positive valence) for users who enjoy faster tempos.

    • Enhanced User Experience: Recommending music that aligns with both tempo and valence preferences enhances the overall listening experience for users. For instance, a user who prefers slower tempos and positive valence may receive recommendations tailored to those preferences.

  2. Genre-Specific Features for Platforms:

    • Genre-Based Features: Music streaming platforms can incorporate genre-specific features based on these correlations. For instance, they can highlight the energetic and positive aspects of genres with strong positive correlations between tempo and valence.

    • User Interface Customization: The platform’s user interface could adapt to showcase genres that align with the user’s preferred tempo and valence combination.

  3. Content Creation for Artists:

Real-World Example: Music Recommendation System

Hypothesis Test 1: Here I am considering attributes Danceability and Energy

Null Hypothesis (H0): There is no significant difference in the danceability between songs with low energy (energy < 0.5) and songs with high energy (energy >= 0.5).

Alternative Hypothesis (H1): There is a significant difference in the danceability between songs with low energy and songs with high energy.

Alpha Level (Significance Level): α = 0.05 (5%) - A common choice for alpha.

Minimum Effect Size: You may consider a minimum effect size of 0.1 in danceability as practically significant based on domain knowledge.

Neyman-Pearson Hypothesis Test:

# Split the songs into two groups at an energy of 0.5
high_energy_songs <- filter(data, energy >= 0.5)
low_energy_songs <- filter(data, energy < 0.5)

# Two-sample t-test comparing danceability between the low-energy and
# high-energy groups
t_test <- t.test(
  x = low_energy_songs$danceability,
  y = high_energy_songs$danceability
)

# Show the test result
print(t_test)
## 
##  Welch Two Sample t-test
## 
## data:  low_energy_songs$danceability and high_energy_songs$danceability
## t = -22.272, df = 5087, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09057155 -0.07591700
## sample estimates:
## mean of x mean of y 
## 0.5488327 0.6320770
# Mean energy within ten equal-width danceability bins of the `data` frame

# Plotting package
library(ggplot2)

# Bin danceability first, then let the bar layer average energy in each bin
data |>
  mutate(danceability_level = cut(danceability, breaks = 10)) |>
  ggplot(mapping = aes(x = danceability_level, y = energy)) +
  geom_bar(stat = "summary", fun = "mean", fill = "skyblue", alpha = 0.7) +
  ggtitle("Mean Energy Across Danceability Levels") +
  xlab("Danceability Levels") +
  ylab("Mean Energy") +
  theme_minimal() +
  # Vertical bin labels so that they stay readable
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Scenario: An event planner or DJ is organizing a party or event.
Use Case: Knowing that there is a significant difference in danceability, the playlist or DJ set can be curated to include high-energy songs during peak dance moments. This ensures a dynamic and engaging experience for attendees.

# Box plots of danceability for songs with energy below 0.5 (FALSE) and with
# energy of at least 0.5 (TRUE)
data |>
  ggplot(mapping = aes(x = factor(energy >= 0.5), y = danceability)) +
  geom_boxplot() +
  xlab("Energy Level") +
  ylab("Danceability") +
  theme_minimal()

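Because the p-value (< 2.2e-16) is below the significance level of 0.05, we reject the null hypothesis (H0) in favour of the alternative hypothesis (H1). Note, however, that the estimated difference in mean danceability (0.549 vs. 0.632, about 0.08; 95% confidence interval 0.076 to 0.091) is smaller than the minimum effect size of 0.1 chosen above, so the difference is statistically significant but falls short of that practical-significance threshold.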

Anova Test

Here I am considering danceability as continuous and genre as categorical.

Response variable = danceability

Explanatory variable = genre

Hypothesis

Null Hypothesis (H0): There is no significant difference in danceability scores across different music genres.

Alternative Hypothesis (H1): There is a significant difference in danceability scores across different music genres

I am Considering alpha level here as 0.05

# Significance level chosen for the test
alpha <- 0.05

# One-way ANOVA: danceability as the response, genre as the grouping factor
anova_result <- aov(formula = danceability ~ genre, data = data)

# ANOVA table
anova_result |>
  summary()
##               Df Sum Sq Mean Sq F value Pr(>F)    
## genre        522  86.77 0.16623   10.43 <2e-16 ***
## Residuals   7989 127.29 0.01593                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Because the p-value is far below the significance level, we reject the null hypothesis (H0) and accept the alternative hypothesis (H1).

There is enough evidence to conclude that there is a significant difference in danceability scores among different music genres.

Listeners can expect distinct differences in danceability based on the genre of music they choose.

To add another variable to the model, I now consider the relationship between danceability and energy while controlling for tempo as a covariate. Tempo is measured in beats per minute, so it is plausible that tempo also helps predict danceability.

# Linear model of danceability on energy, with tempo as a covariate
model_two <- lm(
  formula = danceability ~ energy + tempo,
  data = data
)

# Coefficients and fit statistics
model_two |>
  summary()
## 
## Call:
## lm(formula = danceability ~ energy + tempo, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58470 -0.09469  0.01380  0.10779  0.41258 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.281e-01  7.429e-03  71.086  < 2e-16 ***
## energy       1.854e-01  6.985e-03  26.548  < 2e-16 ***
## tempo       -2.652e-04  5.828e-05  -4.551 5.41e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1524 on 8509 degrees of freedom
## Multiple R-squared:  0.07659,    Adjusted R-squared:  0.07637 
## F-statistic: 352.9 on 2 and 8509 DF,  p-value: < 2.2e-16

The results show that both energy (β=0.1854, p<0.001) and `tempo` (β=-0.00027, p<0.001) are significant predictors of danceability, indicating that higher energy and lower tempo are associated with higher danceability, after controlling for each other. The adjusted R-squared value of 0.07637 suggests that the model explains about 7.6% of the variance in danceability.

Next, I test whether the effect of energy on danceability depends on tempo by adding an interaction term, energy*tempo, to the model.

# Linear model of danceability on energy, tempo and their interaction
model_three <- lm(
  formula = danceability ~ energy * tempo,
  data = data
)

# Coefficients and fit statistics
model_three |>
  summary()
## 
## Call:
## lm(formula = danceability ~ energy * tempo, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52091 -0.09403  0.01242  0.10816  0.41084 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.3464561  0.0153791   22.53   <2e-16 ***
## energy        0.5408797  0.0273157   19.80   <2e-16 ***
## tempo         0.0013323  0.0001320   10.09   <2e-16 ***
## energy:tempo -0.0030461  0.0002265  -13.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1508 on 8508 degrees of freedom
## Multiple R-squared:  0.09582,    Adjusted R-squared:  0.0955 
## F-statistic: 300.5 on 3 and 8508 DF,  p-value: < 2.2e-16

Based on the output of the model, we can conclude that the interaction term between energy and tempo is significant (p-value < 0.001). This suggests that the effect of energy on danceability depends on the level of tempo. Specifically, the negative coefficient estimate for the interaction term (-0.003) indicates that the effect of energy on danceability becomes weaker at higher levels of tempo.

However, it’s important to note that the adjusted R-squared of the model is relatively low (0.095), indicating that the predictors in the model do not explain a large proportion of the variation in danceability.

  • Music Recommendation:

    The results can inform the music recommendation system to consider the relationship between energy and danceability when suggesting songs to users. For example, the system might tailor recommendations based on users’ preferences for energetic and danceable songs.

Linear Regression

# Simple linear regression with energy as the response and danceability as
# the single predictor
linear_model <- lm(
  formula = energy ~ danceability,
  data = data
)

# Coefficients and fit statistics
linear_model |>
  summary()
## 
## Call:
## lm(formula = energy ~ danceability, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63724 -0.16144  0.01066  0.17260  0.56315 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.316757   0.009893   32.02   <2e-16 ***
## danceability 0.415651   0.015899   26.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2326 on 8510 degrees of freedom
## Multiple R-squared:  0.07434,    Adjusted R-squared:  0.07423 
## F-statistic: 683.5 on 1 and 8510 DF,  p-value: < 2.2e-16

From Results by looking at coefficients section in the model summary.

The linear regression model output shows that both the intercept and danceability are significant predictors of energy (p < 2e-16), with an increase in danceability positively associated with an increase in energy. The coefficient estimate for danceability is 0.415651, meaning that for every unit increase in danceability, there is an expected increase in energy of 0.415651 units. However, the adjusted R-squared value of 0.07423 suggests that the model explains only a small amount of the variance and there could be other factors that influence energy which are not captured by the current model.

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Danceability (x) against energy (y) with a straight-line fit drawn in red
# and no confidence band
scatter_plot <- data |>
  ggplot(mapping = aes(x = danceability, y = energy)) +
  geom_point(color = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Scatter Plot with Linear Regression: Danceability vs. Energy") +
  xlab("Danceability") +
  ylab("Energy") +
  theme_minimal()

# Render the plot
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Scatter plot of loudness (x) against danceability (y) with a straight-line
# fit drawn in red and no confidence band. The title and axis labels name the
# variables that are actually mapped to each axis.
scatter_plot <- ggplot(data, aes(x = loudness,
                       y = danceability)) +
  geom_point(alpha = 0.7, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Scatter Plot with Linear Regression: Loudness vs. Danceability",
       x = "Loudness",
       y = "Danceability") +
  theme_minimal()

# Show the scatter plot with regression line
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Loudness vs danceability for the songs whose `mode` column equals 1, with
# the R-squared of the straight-line fit reported in the subtitle.

# Rows used for both the fit and the plot
mode_one_songs <- data |>
  filter(mode == 1)

# R-squared of the linear fit of danceability on loudness for those rows;
# this is the same model that the dashed red line below draws
linear_fit_r_squared <- summary(
  lm(danceability ~ loudness, data = mode_one_songs)
)$r.squared

mode_one_songs |>
  ggplot(mapping = aes(x = loudness,
                       y = danceability)) +
  geom_point() +
  # Dashed red line: straight-line fit
  geom_smooth(method = 'lm', color = 'red', linetype = 'dashed',
              se = FALSE) +
  # Second curve: geom_smooth()'s default smoother, for comparison
  geom_smooth(se = FALSE) +
  labs(title = "Loudness vs danceability",
       subtitle = paste("Linear Fit R-Squared =",
                        round(linear_fit_r_squared, 3))) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

The scatter plot shows a negative correlation between loudness and danceability. This means that as the loudness of a piece of music increases, its danceability, or how suitable it is for dancing, tends to decrease. This is likely because louder music is often associated with heavier genres such as rock and metal, which are not typically associated with dancing.

This information can be helpful in music recommendation systems by helping to filter out songs that are too loud or too quiet for a user’s preferences. For example, if a user indicates that they prefer danceable music, the system can avoid recommending songs that are too loud.