A Statistical Analysis of Multi Functioning Rap Gods

Author

Kimberly N. Reiner

Dataset: Spotify and Genius Lyrics

1. Consider the lyrics data

library(haven)
library(tidyverse)
library(fastDummies)
library(nnet)
library(dplyr)

Look at average values for each Rap Artist: name(song title), artist, popularity(song), danceability,loudness, energy, speechiness, acousticness, instrumentalness, liveness, valence, temp, duration_ms, swears, totalWords, uniqueWords, averageLength, flow_wpm, vulgarityRate

library(dplyr)




Top10_Analysis<-read.csv("~/Desktop/Top10.csv")

Top10_Analysis <- as_tibble(Top10_Analysis) %>%
  dplyr::select(name,artist, popularity, danceability, energy, speechiness, acousticness, instrumentalness,loudness, liveness, valence, tempo, duration_ms, swears, totalWords,uniqueWords, averageLength, flow_wpm, vulgarityRate )

# Grouping the data by artist
Top10_Analysis_by_artist <- Top10_Analysis %>%
  group_by(artist)

# Summary statistics for each artist
summary_by_artist <- summarise(Top10_Analysis_by_artist,
                                mean_popularity = mean(popularity, na.rm = TRUE),
                                mean_danceability = mean(danceability, na.rm = TRUE),
                                mean_energy = mean(energy, na.rm = TRUE),
                                mean_speechiness = mean(speechiness, na.rm = TRUE),
                                mean_acousticness = mean(acousticness, na.rm = TRUE),
                                mean_instrumentalness = mean(instrumentalness, na.rm = TRUE),                        mean_loudness = mean(loudness, na.rm = TRUE),
                                mean_liveness = mean(liveness, na.rm = TRUE),
                                mean_valence = mean(valence, na.rm = TRUE),
                                mean_tempo = mean(tempo, na.rm = TRUE),
                                mean_duration_ms = mean(duration_ms, na.rm = TRUE),
                                mean_swears = mean(swears, na.rm = TRUE),
                                mean_totalWords = mean(totalWords, na.rm = TRUE),
                                mean_uniqueWords = mean(uniqueWords, na.rm = TRUE),
                                mean_averageLength = mean(averageLength, na.rm = TRUE),
                                mean_flow_wpm = mean(flow_wpm, na.rm = TRUE),
                                mean_vulgarityRate = mean(vulgarityRate, na.rm = TRUE))





library(dplyr)

# Splitting columns into three groups to make it easier to create graphics.
n_cols <- ncol(summary_by_artist)
n_cols_per_table <- (n_cols - 1) %/% 3  # Calculate approximately equal number of columns for each table

# Create three separate tables with "artist" column repeated to keep track of this label.
table1 <- summary_by_artist[, c( 1, 1, 2:(n_cols_per_table + 1))]
table2 <- summary_by_artist[, c( 1, 1, (n_cols_per_table + 2):(2 * n_cols_per_table + 1))]
table3 <- summary_by_artist[, c(1, 1, (2 * n_cols_per_table + 2):n_cols)]

# Rename the repeated "artist" column in each table
colnames(table1)[2] <- "artist"
colnames(table2)[2] <- "artist"
colnames(table3)[2] <- "artist"

# Print or use these tables as needed
print(table1)
# A tibble: 10 × 7
   artist  artist mean_popularity mean_danceability mean_energy mean_speechiness
   <chr>   <chr>            <dbl>             <dbl>       <dbl>            <dbl>
 1 2Pac    2Pac              44.3             0.789       0.735            0.235
 2 Drake   Drake             57.8             0.648       0.557            0.220
 3 Eminem  Eminem            58.1             0.740       0.770            0.257
 4 JAY-Z   JAY-Z             39.9             0.680       0.767            0.303
 5 Kendri… Kendr…            50.6             0.626       0.644            0.275
 6 Lil Wa… Lil W…            34.7             0.685       0.698            0.263
 7 Nas     Nas               38.2             0.650       0.753            0.294
 8 Nicki … Nicki…            48.9             0.716       0.683            0.203
 9 Snoop … Snoop…            25.6             0.688       0.714            0.248
10 The No… The N…            22.9             0.711       0.731            0.312
# ℹ 1 more variable: mean_acousticness <dbl>
print(table2)
# A tibble: 10 × 7
   artist  artist mean_instrumentalness mean_loudness mean_liveness mean_valence
   <chr>   <chr>                  <dbl>         <dbl>         <dbl>        <dbl>
 1 2Pac    2Pac                0.00893          -6.80         0.199        0.636
 2 Drake   Drake               0.00814          -7.92         0.195        0.353
 3 Eminem  Eminem              0.000122         -4.51         0.266        0.540
 4 JAY-Z   JAY-Z               0.0116           -5.59         0.233        0.577
 5 Kendri… Kendr…              0.00237          -8.00         0.248        0.470
 6 Lil Wa… Lil W…              0.000587         -6.15         0.241        0.561
 7 Nas     Nas                 0.000193         -5.91         0.226        0.554
 8 Nicki … Nicki…              0.00860          -6.13         0.219        0.461
 9 Snoop … Snoop…              0.00298          -5.97         0.250        0.571
10 The No… The N…              0.00208          -6.32         0.212        0.598
# ℹ 1 more variable: mean_tempo <dbl>
print(table3)
# A tibble: 10 × 9
   artist   artist mean_duration_ms mean_swears mean_totalWords mean_uniqueWords
   <chr>    <chr>             <dbl>       <dbl>           <dbl>            <dbl>
 1 2Pac     2Pac            262728.        30.1            777.             292.
 2 Drake    Drake           240018.        10.2            536.             210.
 3 Eminem   Eminem          267958.        15.8            852.             344.
 4 JAY-Z    JAY-Z           246930.        15.0            648.             264.
 5 Kendric… Kendr…          261510.        13.8            681.             268.
 6 Lil Way… Lil W…          228370.        21.8            629.             237.
 7 Nas      Nas             220992.        11.2            608.             281.
 8 Nicki M… Nicki…          219241.        13.9            540.             194.
 9 Snoop D… Snoop…          226943.        20.1            547.             207.
10 The Not… The N…          253524.        24.3            674.             284.
# ℹ 3 more variables: mean_averageLength <dbl>, mean_flow_wpm <dbl>,
#   mean_vulgarityRate <dbl>
# Display summary statistics for each artist
#print(summary_by_artist)

Playing around with correlations.

library(dplyr)
library(ggplot2)

library(dplyr)
library(ggplot2)

# Calculate correlation coefficients
correlation_matrix <- cor(Top10_Analysis[, c("popularity", "danceability", "energy", "speechiness", "acousticness", "instrumentalness", "loudness", "liveness", "valence", "tempo", "duration_ms", "swears", "totalWords", "uniqueWords", "averageLength", "flow_wpm", "vulgarityRate")])

# Convert correlation matrix to tidy format for visualization
correlation_data <- as.data.frame(as.table(correlation_matrix))
colnames(correlation_data) <- c("Variable1", "Variable2", "Correlation")

# Filter out correlations with popularity (as this is the metric we would like to consider)
flow_wpm_correlation <- correlation_data %>%
  filter(Variable1 == "popularity" | Variable2 == "popularity") %>%
  filter(Variable1 != Variable2)

# Plot correlation coefficients
ggplot(flow_wpm_correlation, aes(x = Variable2, y = Correlation, fill = Variable1)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  theme_minimal() +
  labs(x = "Variable", y = "Correlation with popularity", fill = "Variable") +
  ggtitle("Correlation of Variables with popularity")

# Extract correlations with popularity
popularity_correlation <- correlation_matrix["popularity", ]

# Print correlation coefficient for popularity
print(popularity_correlation)
      popularity     danceability           energy      speechiness 
    1.0000000000     0.0213578565    -0.1043837420    -0.1014664999 
    acousticness instrumentalness         loudness         liveness 
    0.0071366723     0.0083631240    -0.0000438763    -0.0508413913 
         valence            tempo      duration_ms           swears 
   -0.1879453939     0.0742749720     0.1134398258    -0.0872883322 
      totalWords      uniqueWords    averageLength         flow_wpm 
    0.1202335440     0.1173026217     0.0159271752    -0.0081929663 
   vulgarityRate 
   -0.1380476354 
# Extract correlation coefficients with "popularity"
popularity_correlation <- correlation_matrix["popularity", ]

# Remove "popularity" from the correlation coefficients
popularity_correlation <- popularity_correlation[-which(names(popularity_correlation) == "popularity")]

# Sort correlation coefficients in descending order
top_positive_correlations <- sort(popularity_correlation, decreasing = TRUE)

# Select the top 5 positive coefficients
top_5_positive <- head(top_positive_correlations, 5)

# Print the top 5 positive coefficients
print(top_5_positive)
  totalWords  uniqueWords  duration_ms        tempo danceability 
  0.12023354   0.11730262   0.11343983   0.07427497   0.02135786 
Top10_Analysis<-read.csv("~/Desktop/Top10.csv")

Top10_Analysis <- as_tibble(Top10_Analysis) %>%
  dplyr::select(name, artist, popularity, uniqueWords, swears, energy, totalWords, flow_wpm, averageLength, duration_ms, tempo, danceability)


#Make a model with respect to popularity
m1 <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = Top10_Analysis)


summary(m1)

Call:
lm(formula = popularity ~ totalWords + uniqueWords + duration_ms + 
    tempo + danceability, data = Top10_Analysis)

Residuals:
    Min      1Q  Median      3Q     Max 
-44.195 -12.724  -1.517  11.961  53.767 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.268e+01  3.056e+00   7.421 1.65e-13 ***
totalWords   4.934e-04  3.731e-03   0.132   0.8948    
uniqueWords  1.628e-02  8.185e-03   1.989   0.0468 *  
duration_ms  2.142e-05  8.563e-06   2.502   0.0124 *  
tempo        4.850e-02  1.207e-02   4.018 6.06e-05 ***
danceability 3.594e+00  2.816e+00   1.276   0.2020    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.85 on 2233 degrees of freedom
Multiple R-squared:  0.02476,   Adjusted R-squared:  0.02257 
F-statistic: 11.34 on 5 and 2233 DF,  p-value: 8.078e-11
hist(Top10_Analysis$popularity)

almost_sas <- function(aov.results){
  aov_residuals <- residuals(aov.results)
  par(mfrow=c(2,2))
  plot(aov.results, which=1)
  hist(aov_residuals)
  plot(aov.results, which=2)
}
almost_sas(m1)

# Fit the linear regression model
m1 <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability , data = Top10_Analysis)

# Residual Analysis
# 1. Linearity Check
plot(m1, which = 1)

# 2. Independence of Errors Check
# Example: Durbin-Watson test
library(car)
durbinWatsonTest(m1)
 lag Autocorrelation D-W Statistic p-value
   1       0.4055432      1.188721       0
 Alternative hypothesis: rho != 0
# 3. Homoscedasticity Check
# Plotting residuals against fitted values
plot(m1, which = 3)

# 4. Normality of Residuals Check
# Example: QQ plot
qqnorm(resid(m1))
qqline(resid(m1))

# Example: Shapiro-Wilk test
shapiro.test(resid(m1))

    Shapiro-Wilk normality test

data:  resid(m1)
W = 0.99195, p-value = 8.429e-10
# 5. Multicollinearity Check
# Example: Variance Inflation Factor (VIF)
library(car)
vif(m1)
  totalWords  uniqueWords  duration_ms        tempo danceability 
    4.536690     3.347790     1.918308     1.025490     1.094735 
# 6. Outliers and Influential Points Check
# Example: Cook's distance
plot(m1, which = 4)

# Example: Identify influential points
influencePlot(m1)

       StudRes         Hat        CookD
155  2.7561756 0.003449488 0.0043695501
398  3.0209612 0.002204150 0.0033478115
803 -1.4503270 0.028990982 0.0104617834
864  0.1767099 0.034290579 0.0001848788
869 -1.3976496 0.031896184 0.0107220232

The most appropriate method is to utilize an lm model (Continuous and normalized) since the data is both continuous and normal. My evidence is that my Q-Q residuals are linear and my histogram shows a “roughly” normal distribution. We also can see some of our outliers in our testing. :)

hist(Top10_Analysis$popularity)

coefficients(m1)
 (Intercept)   totalWords  uniqueWords  duration_ms        tempo danceability 
2.267975e+01 4.934232e-04 1.627919e-02 2.142283e-05 4.850023e-02 3.593523e+00 

Model:

\[\hat{y}=\text{22.67975+0.0004934232totalWords+0.01627919uniqueWords+0.00002142283duration_ms+0.04850023tempo+3.593523danceability} \]

Check the relevant assumptions of the model. Are we okay to proceed with statistical inference?

almost_sas <- function(aov.results){
  aov_residuals <- residuals(aov.results)
  par(mfrow=c(2,2))
  plot(aov.results, which=1)
  hist(aov_residuals)
  plot(aov.results, which=2)
}
almost_sas(m1)

Yes. We can continue with the statistical inference. The most appropriate method is to utilize an lm model since the data is both continuous and normal. My evidence is that my Q-Q residuals are linear and my histogram shows a normal distribution albeit with a slight skew, but this is not enough to deviate from a normalized model.) :)

1b-iv. Which, if any, are significant predictors of popularity? Test at the \(\alpha=0.05\) level.

summary(m1)

Call:
lm(formula = popularity ~ totalWords + uniqueWords + duration_ms + 
    tempo + danceability, data = Top10_Analysis)

Residuals:
    Min      1Q  Median      3Q     Max 
-44.195 -12.724  -1.517  11.961  53.767 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.268e+01  3.056e+00   7.421 1.65e-13 ***
totalWords   4.934e-04  3.731e-03   0.132   0.8948    
uniqueWords  1.628e-02  8.185e-03   1.989   0.0468 *  
duration_ms  2.142e-05  8.563e-06   2.502   0.0124 *  
tempo        4.850e-02  1.207e-02   4.018 6.06e-05 ***
danceability 3.594e+00  2.816e+00   1.276   0.2020    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.85 on 2233 degrees of freedom
Multiple R-squared:  0.02476,   Adjusted R-squared:  0.02257 
F-statistic: 11.34 on 5 and 2233 DF,  p-value: 8.078e-11

Model:

\[\hat{y}=\text{22.67975+0.0004934232totalWords+0.01627919uniqueWords+0.00002142283duration_ms+0.04850023tempo+3.593523danceability} \] Hypotheses: \[H_0:\beta_\text{totalWords}=\beta_\text{uniqueWords }=\beta_\text{duration_ms }=\beta_\text{tempo }=\beta_\text{danceability}=0\] \[H_1:\text{At least one }\beta_{\text{i}}\ne 0\]

Test Statistic and p-Value for predictor (popularity): \[F_{0} = 11.34 \text{ (p} <0.001)\] Conclusion/Interpretation: \[\text{Reject } H_{0}. \text{ There is sufficient evidence to suggest that at least one slope is non-zero.}\]

Hypotheses: \[H_{0}:\beta_{totalWords}=0\]

\[H_{1}:\beta_{totalWords}\neq0\]

Test statistic and p-Value: \[t_{0}=0.132 \text{ } (p=0.8948)\]

Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]

Conclusion / Interpretation: \[\text{Fail to Reject } H_{0}. \text{There is not sufficient evidence to suggest that totalWords significantly predicts popularity.}\]

Hypotheses: \[H_{0}:\beta_{uniqueWords}=0\]

\[H_{1}:\beta_{uniqueWords}\neq0\]

Test statistic and p-Value: \[t_{0}= 1.989 \text{ } (p=0.0468 )\]

Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]

Conclusion / Interpretation: \[\text{Reject } H_{0}. \text{There is sufficient evidence to suggest that uniqueWords significantly predicts popularity.}\]

Hypotheses: \[H_{0}:\beta_{ duration_{ms} }=0\]

\[H_{1}:\beta_{ duration_{ms}}\neq0\]

Test statistic and p-Value: \[t_{0}= 2.502 \text{ } (p=0.0124 )\]

Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]

Conclusion / Interpretation: \[\text{Reject } H_{0}. \text{There is sufficient evidence to suggest that duration_ms significantly predicts popularity.}\]

Hypotheses: \[H_{0}:\beta_{ tempo}=0\]

\[H_{1}:\beta_{ tempo}\neq0\]

Test statistic and p-Value: \[t_{0}= 4.018 \text{ } (p<0.001)\]

Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]

Conclusion / Interpretation: \[\text{Reject } H_{0}. \text{There is sufficient evidence to suggest that tempo significantly predicts popularity.}\]

Hypotheses: \[H_{0}:\beta_{ danceability}=0\]

\[H_{1}:\beta_{ danceability}\neq0\]

Test statistic and p-Value: \[t_{0}= 1.276 \text{ } (p=0.2020)\]

Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]

Conclusion / Interpretation: \[\text{Fail to Reject } H_{0}. \text{There is not sufficient evidence to suggest that danceability significantly predicts popularity.}\]

set.seed(123) # for reproducibility
train_indices <- sample(1:nrow(Top10_Analysis), 0.8 * nrow(Top10_Analysis))
train_data <- Top10_Analysis[train_indices, ]
test_data <- Top10_Analysis[-train_indices, ]


m_train <- lm(flow_wpm ~ uniqueWords + swears + energy + totalWords + swears:uniqueWords, data = train_data)

predictions <- predict(m_train, newdata = test_data)

rmse <- sqrt(mean((test_data$flow_wpm - predictions)^2))
rsquared <- cor(test_data$flow_wpm, predictions)^2

print(paste("RMSE:", rmse))
[1] "RMSE: 43.9146389560486"
print(paste("R-squared:", rsquared))
[1] "R-squared: 0.350063473786371"
# Create a data frame with the artist and mean values excluding popularity
artist_means <- data.frame(
  artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
  mean_danceability = c(0.7890979, 0.6483333, 0.7399128, 0.6800664, 0.6260769, 0.6504067, 0.7164859, 0.7110244, 0.6884114, 0.6884114    ),
  mean_energy = c(0.7350280, 0.5566111, 0.7699060, 0.7667124, 0.6443462, 0.7526642, 0.6827042, 0.7314756, 0.7143483, 0.7143483),
  mean_speechiness = c(0.2354993, 0.2199174, 0.2572725, 0.3026354, 0.2745423, 0.2940384, 0.2032725, 0.3120671, 0.2479428, 0.2479428 ),
  mean_acousticness = c(0.1286715, 0.2358899, 0.1186457, 0.1303869, 0.3045322, 0.1262964, 0.1944455, 0.1858926, 0.1627993, 0.1627993        ),
  mean_instrumentalness = c(0.0089259957, 0.0081427684, 0.0001216790, 0.0116003766, 0.0023705843, 0.0001926131, 0.0085975033, 0.0020849655, 0.0029760254, 0.0029760254 )
)

# Convert data to long format
artist_means_long <- tidyr::pivot_longer(artist_means, -artist, names_to = "feature", values_to = "mean_value")

# Plot using ggplot2
ggplot(artist_means_long, aes(x = artist, y = mean_value, fill = feature)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(x = "Artist", y = "Mean Value", fill = "Feature") +
  ggtitle("Mean Values of Features for Each Artist") +
  scale_fill_manual(values = c("mean_danceability" = "blue", "mean_energy" = "green", 
                                "mean_speechiness" = "red", "mean_acousticness" = "orange", 
                                "mean_instrumentalness" = "purple")) # Optional: Customize colors

# Plot residuals vs. predicted values
plot(m1$fitted.values, resid(m1))

# Plot residuals vs. each predictor variable
par(mfrow=c(2,2))
plot(Top10_Analysis$totalWords, resid(m1))
plot(Top10_Analysis$uniqueWords, resid(m1))
plot(Top10_Analysis$duration_ms, resid(m1))
plot(Top10_Analysis$tempo, resid(m1))

plot(Top10_Analysis$danceability, resid(m1))

summary(m1)

Call:
lm(formula = popularity ~ totalWords + uniqueWords + duration_ms + 
    tempo + danceability, data = Top10_Analysis)

Residuals:
    Min      1Q  Median      3Q     Max 
-44.195 -12.724  -1.517  11.961  53.767 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.268e+01  3.056e+00   7.421 1.65e-13 ***
totalWords   4.934e-04  3.731e-03   0.132   0.8948    
uniqueWords  1.628e-02  8.185e-03   1.989   0.0468 *  
duration_ms  2.142e-05  8.563e-06   2.502   0.0124 *  
tempo        4.850e-02  1.207e-02   4.018 6.06e-05 ***
danceability 3.594e+00  2.816e+00   1.276   0.2020    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.85 on 2233 degrees of freedom
Multiple R-squared:  0.02476,   Adjusted R-squared:  0.02257 
F-statistic: 11.34 on 5 and 2233 DF,  p-value: 8.078e-11
# Split data into training and testing sets
set.seed(123) # for reproducibility
train_indices <- sample(1:nrow(Top10_Analysis), 0.8 * nrow(Top10_Analysis))
train_data <- Top10_Analysis[train_indices, ]
test_data <- Top10_Analysis[-train_indices, ]

# Fit model on training data
m1_train <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = train_data)

# Predict on testing data
predictions <- predict(m1_train, newdata = test_data)

# Calculate performance metrics (e.g., RMSE, R-squared) on testing data
rmse <- sqrt(mean((test_data$popularity - predictions)^2))
rsquared <- cor(test_data$popularity, predictions)^2

library(dplyr)

# Group the data by artist and fit a linear regression model for each group
models <- Top10_Analysis %>%
  group_by(artist) %>%
  do(model = lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = .))

# Extract coefficients or summaries for each model
model_summaries <- models %>%
  summarise(artist = first(artist),
            intercept = coef(model)[1],
            totalWords_coef = coef(model)[2],
            uniqueWords_coef = coef(model)[3],
            duration_ms_coef = coef(model)[4],
            tempo_coef = coef(model)[5],
            danceability_coef = coef(model)[6])

# View the summaries
print(model_summaries)
# A tibble: 10 × 7
   artist intercept totalWords_coef uniqueWords_coef duration_ms_coef tempo_coef
   <chr>      <dbl>           <dbl>            <dbl>            <dbl>      <dbl>
 1 2Pac        40.5        0.00943         -0.0376        0.0000228      -0.0393
 2 Drake       45.5        0.0154          -0.0191       -0.000000968     0.0457
 3 Eminem      33.2        0.0290          -0.0514       -0.0000147       0.0394
 4 JAY-Z       36.6        0.000716        -0.0400        0.0000521       0.0754
 5 Kendr…      30.5       -0.00789         -0.0313        0.0000457       0.0297
 6 Lil W…      36.8       -0.0143           0.0345        0.0000404      -0.0299
 7 Nas         30.2        0.0156           0.000412     -0.0000493       0.0355
 8 Nicki…      53.5        0.0160           0.00296      -0.0000454      -0.0599
 9 Snoop…      18.3       -0.00103          0.0211        0.00000308     -0.0456
10 The N…      29.6       -0.00579          0.0249        0.0000234      -0.0136
# ℹ 1 more variable: danceability_coef <dbl>
library(dplyr)




Top10_Analysis<-read.csv("~/Desktop/Top10.csv")

Top10_Analysis <- as_tibble(Top10_Analysis) %>%
  dplyr::select(name,artist, popularity, danceability, energy, speechiness, acousticness, instrumentalness,loudness, liveness, valence, tempo, duration_ms, swears, totalWords,uniqueWords, averageLength, flow_wpm, vulgarityRate, uniqueRate)


Top10_Analysis_by_artist <- Top10_Analysis %>%
  group_by(artist)

summary_by_artist <- summarise(Top10_Analysis_by_artist,
                              mean_uniqueRate = mean(uniqueRate, na.rm = TRUE),
                              mean_vulgarityRate = mean(vulgarityRate, na.rm = TRUE),
                                mean_swears = mean(swears, na.rm = TRUE),
                                
                                mean_uniqueWords = mean(uniqueWords, na.rm = TRUE),
                                
                                mean_flow_wpm = mean(flow_wpm, na.rm = TRUE),
                              mean_totalWords=mean(totalWords, na.rm=TRUE),
                              mean_duration_ms=mean(duration_ms, na.rm=TRUE),
                              mean_tempo=mean(tempo, na.rm=TRUE),
                              mean_danceability=mean(danceability, na.rm=TRUE),
                              
                                )


print(summary_by_artist)
# A tibble: 10 × 10
   artist        mean_uniqueRate mean_vulgarityRate mean_swears mean_uniqueWords
   <chr>                   <dbl>              <dbl>       <dbl>            <dbl>
 1 2Pac                    0.382             0.0390        30.1             292.
 2 Drake                   0.395             0.0177        10.2             210.
 3 Eminem                  0.410             0.0187        15.8             344.
 4 JAY-Z                   0.424             0.0235        15.0             264.
 5 Kendrick Lam…           0.408             0.0191        13.8             268.
 6 Lil Wayne               0.390             0.0340        21.8             237.
 7 Nas                     0.477             0.0177        11.2             281.
 8 Nicki Minaj             0.365             0.0235        13.9             194.
 9 Snoop Dogg              0.392             0.0344        20.1             207.
10 The Notoriou…           0.436             0.0352        24.3             284.
# ℹ 5 more variables: mean_flow_wpm <dbl>, mean_totalWords <dbl>,
#   mean_duration_ms <dbl>, mean_tempo <dbl>, mean_danceability <dbl>
library(ggplot2)

# Define the coefficients based on your model summary
intercept <- 22.67975
totalWords_coef <- 0.0004934232
uniqueWords_coef <- 0.01627919
duration_ms_coef <- 0.00002142283
tempo_coef <- 0.04850023
danceability_coef <- 3.593523

# Apply the model for each artist
Top10_Analysis$predicted_popularity <- with(Top10_Analysis, 
                                             intercept + 
                                             totalWords_coef * totalWords + 
                                             uniqueWords_coef * uniqueWords + 
                                             duration_ms_coef * duration_ms + 
                                             tempo_coef * tempo + 
                                             danceability_coef * danceability)

# Plot the predicted values against uniqueWords for each artist
ggplot(Top10_Analysis, aes(x = uniqueWords, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Unique Words", y = "Predicted Popularity") +
  theme_minimal()

library(ggplot2)

# Define the coefficients based on the model summary
intercept <-  22.67975
totalWords_coef <- 0.0004934232
uniqueWords_coef <- 0.01627919
duration_ms_coef <- 0.00002142283
tempo_coef <- 0.04850023
danceability_coef <- 3.593523

# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
                             predicted_popularity = intercept +
                                                     totalWords_coef * totalWords +
                                                     uniqueWords_coef * uniqueWords +
                                                     duration_ms_coef * duration_ms +
                                                     tempo_coef * tempo +
                                                     danceability_coef * danceability)

# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = uniqueWords, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Unique Words", y = "Predicted Popularity") +
  facet_wrap(~ artist, nrow = 3) +  # Separate graphs for each artist
  theme_minimal()

library(ggplot2)

# Define the coefficients based on the model summary
intercept <-  22.67975
totalWords_coef <- 0.0004934232


# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
                             predicted_popularity = intercept +
                                                     totalWords_coef * totalWords)

# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = totalWords, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Total Words", y = "Predicted Popularity") +
  facet_wrap(~ artist, nrow = 3) +  # Separate graphs for each artist
  theme_minimal()

library(ggplot2)

# Define the coefficients based on the model summary
intercept <-  22.67975

uniqueWords_coef <- 0.01627919


# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
                             predicted_popularity = intercept +
                                                     
                                                     uniqueWords_coef * uniqueWords )

# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = uniqueWords, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Unique Words", y = "Predicted Popularity") +
  facet_wrap(~ artist, nrow = 3) +  # Separate graphs for each artist
  theme_minimal()

library(ggplot2)

# Define the coefficients based on the model summary
intercept <-  22.67975

duration_ms_coef <- 0.00002142283


# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
                             predicted_popularity = intercept +
                                                     
                                                     duration_ms_coef * duration_ms )

# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = duration_ms, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Duration", y = "Predicted Popularity") +
  facet_wrap(~ artist, nrow = 3) +  # Separate graphs for each artist
  theme_minimal()

library(ggplot2)

# Define the coefficients based on the model summary
intercept <-  22.67975

tempo_coef <- 0.04850023


# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
                             predicted_popularity = intercept +
                                                     
                                                     tempo_coef * tempo )

# Plot the predicted popularity against tempo for each artist with color
ggplot(Top10_Analysis, aes(x = tempo, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Tempo", y = "Predicted Popularity") +
  facet_wrap(~ artist, nrow = 3) +  # Separate graphs for each artist
  theme_minimal()

library(ggplot2)

# Define the coefficients based on the model summary
intercept <-  22.67975

danceability_coef <- 3.593523

# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
                             predicted_popularity = intercept +
                                                     
                                                     danceability_coef * danceability)

# Plot the predicted popularity against danceability for each artist with color
ggplot(Top10_Analysis, aes(x = danceability, y = predicted_popularity, color = artist)) +
  geom_line() +
  labs(x = "Danceability", y = "Predicted Popularity") +
  facet_wrap(~ artist, nrow = 3) +  # Separate graphs for each artist
  theme_minimal()

# Fit the linear regression model
m1 <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = Top10_Analysis)

# Extract the coefficients
coefficients <- coef(m1)

print(coefficients)
 (Intercept)   totalWords  uniqueWords  duration_ms        tempo danceability 
2.267975e+01 4.934232e-04 1.627919e-02 2.142283e-05 4.850023e-02 3.593523e+00 
library(ggplot2)

# Define the coefficients
intercept <- 22.67975
uniqueWords_coef <- 0.01627919

# List of artists
artist <- c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne", "Snoop Dogg")

# Define a color palette with enough colors for each artist
colors <- c("blue", "green", "red", "purple", "orange", "yellow", "cyan", "magenta", "darkgreen", "darkblue")

# Initialize an empty data frame to store combined data
combined_data <- data.frame()

# Loop through each artist
for (i in seq_along(artist)) {
  # Filter the dataset to include only songs by the current artist
  artist_data <- Top10_Analysis[Top10_Analysis$artist == artist[i], ]
  
  # Calculate the predicted popularity for the current artist
  artist_data$predicted_popularity <- with(artist_data, intercept + uniqueWords_coef * uniqueWords)
  
  # Add a column for artist name
  artist_data$artist <- artist[i]
  
  # Combine the data for the current artist with the combined data
  combined_data <- rbind(combined_data, artist_data)
}

# Plot all artists on the same plot
ggplot(combined_data, aes(x = uniqueWords, y = predicted_popularity, color = artist, group = artist)) +
  geom_line() +
  labs(x = "Unique Words", y = "Predicted Popularity", title = "Predicted Popularity for Rap Artists") +
  theme_minimal()

library(ggplot2)


artist_data <- data.frame(
  artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
  vulgarityRate = c(0.03904857, 0.01772178, 0.0186974, 0.02353002, 0.01912823, 0.01772455, 0.02352152, 0.03519564, 0.03403338, 0.03436607),
  uniqueRate = c(0.3820612, 0.3945815, 0.4096715, 0.4238028, 0.4081528, 0.4767687, 0.3647998, 0.4358999, 0.4358999, 0.3916636)
)


library(ggplot2)
library(tidyr)


artist_data <- data.frame(
  artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
  vulgarityRate = c(0.03904857, 0.01772178, 0.0186974, 0.02353002, 0.01912823, 0.01772455, 0.02352152, 0.03519564, 0.03403338, 0.03436607),
  uniqueRate = c(0.3820612, 0.3945815, 0.4096715, 0.4238028, 0.4081528, 0.4767687, 0.3647998, 0.4358999, 0.4358999, 0.3916636)
)

# Reshape the data for plotting
melted_data <- pivot_longer(artist_data, cols = c("vulgarityRate", "uniqueRate"), names_to = "Variable")

# Bar graph
ggplot(melted_data, aes(x = artist, y = value, fill = Variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Artist", y = "Value", fill = "Variable") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)


artist_data <- data.frame(
  artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
  mean_flow_wpm = c(178.0674, 134.7107, 191.6072, 157.2335, 155.6770, 165.4653, 148.7801, 159.7788, 174.4709, 146.5127)
)

# Bar graph
ggplot(artist_data, aes(x = artist, y = mean_flow_wpm)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Artist", y = "Mean Flow WPM", title = "Mean Flow WPM for Each Artist") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))