library(haven)
library(tidyverse)
library(fastDummies)
library(nnet)
library(dplyr)A Statistical Analysis of Multi Functioning Rap Gods
Dataset: Spotify and Genius Lyrics
1. Consider the lyrics data
Look at average values for each Rap Artist: name(song title), artist, popularity(song), danceability,loudness, energy, speechiness, acousticness, instrumentalness, liveness, valence, temp, duration_ms, swears, totalWords, uniqueWords, averageLength, flow_wpm, vulgarityRate
library(dplyr)
Top10_Analysis<-read.csv("~/Desktop/Top10.csv")
Top10_Analysis <- as_tibble(Top10_Analysis) %>%
dplyr::select(name,artist, popularity, danceability, energy, speechiness, acousticness, instrumentalness,loudness, liveness, valence, tempo, duration_ms, swears, totalWords,uniqueWords, averageLength, flow_wpm, vulgarityRate )
# Grouping the data by artist
Top10_Analysis_by_artist <- Top10_Analysis %>%
group_by(artist)
# Summary statistics for each artist
summary_by_artist <- summarise(Top10_Analysis_by_artist,
mean_popularity = mean(popularity, na.rm = TRUE),
mean_danceability = mean(danceability, na.rm = TRUE),
mean_energy = mean(energy, na.rm = TRUE),
mean_speechiness = mean(speechiness, na.rm = TRUE),
mean_acousticness = mean(acousticness, na.rm = TRUE),
mean_instrumentalness = mean(instrumentalness, na.rm = TRUE), mean_loudness = mean(loudness, na.rm = TRUE),
mean_liveness = mean(liveness, na.rm = TRUE),
mean_valence = mean(valence, na.rm = TRUE),
mean_tempo = mean(tempo, na.rm = TRUE),
mean_duration_ms = mean(duration_ms, na.rm = TRUE),
mean_swears = mean(swears, na.rm = TRUE),
mean_totalWords = mean(totalWords, na.rm = TRUE),
mean_uniqueWords = mean(uniqueWords, na.rm = TRUE),
mean_averageLength = mean(averageLength, na.rm = TRUE),
mean_flow_wpm = mean(flow_wpm, na.rm = TRUE),
mean_vulgarityRate = mean(vulgarityRate, na.rm = TRUE))
library(dplyr)
# Splitting columns into three groups to make it easier to create graphics.
n_cols <- ncol(summary_by_artist)
n_cols_per_table <- (n_cols - 1) %/% 3 # Calculate approximately equal number of columns for each table
# Create three separate tables with "artist" column repeated to keep track of this label.
table1 <- summary_by_artist[, c( 1, 1, 2:(n_cols_per_table + 1))]
table2 <- summary_by_artist[, c( 1, 1, (n_cols_per_table + 2):(2 * n_cols_per_table + 1))]
table3 <- summary_by_artist[, c(1, 1, (2 * n_cols_per_table + 2):n_cols)]
# Rename the repeated "artist" column in each table
colnames(table1)[2] <- "artist"
colnames(table2)[2] <- "artist"
colnames(table3)[2] <- "artist"
# Print or use these tables as needed
print(table1)# A tibble: 10 × 7
artist artist mean_popularity mean_danceability mean_energy mean_speechiness
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2Pac 2Pac 44.3 0.789 0.735 0.235
2 Drake Drake 57.8 0.648 0.557 0.220
3 Eminem Eminem 58.1 0.740 0.770 0.257
4 JAY-Z JAY-Z 39.9 0.680 0.767 0.303
5 Kendri… Kendr… 50.6 0.626 0.644 0.275
6 Lil Wa… Lil W… 34.7 0.685 0.698 0.263
7 Nas Nas 38.2 0.650 0.753 0.294
8 Nicki … Nicki… 48.9 0.716 0.683 0.203
9 Snoop … Snoop… 25.6 0.688 0.714 0.248
10 The No… The N… 22.9 0.711 0.731 0.312
# ℹ 1 more variable: mean_acousticness <dbl>
print(table2)# A tibble: 10 × 7
artist artist mean_instrumentalness mean_loudness mean_liveness mean_valence
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2Pac 2Pac 0.00893 -6.80 0.199 0.636
2 Drake Drake 0.00814 -7.92 0.195 0.353
3 Eminem Eminem 0.000122 -4.51 0.266 0.540
4 JAY-Z JAY-Z 0.0116 -5.59 0.233 0.577
5 Kendri… Kendr… 0.00237 -8.00 0.248 0.470
6 Lil Wa… Lil W… 0.000587 -6.15 0.241 0.561
7 Nas Nas 0.000193 -5.91 0.226 0.554
8 Nicki … Nicki… 0.00860 -6.13 0.219 0.461
9 Snoop … Snoop… 0.00298 -5.97 0.250 0.571
10 The No… The N… 0.00208 -6.32 0.212 0.598
# ℹ 1 more variable: mean_tempo <dbl>
print(table3)# A tibble: 10 × 9
artist artist mean_duration_ms mean_swears mean_totalWords mean_uniqueWords
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2Pac 2Pac 262728. 30.1 777. 292.
2 Drake Drake 240018. 10.2 536. 210.
3 Eminem Eminem 267958. 15.8 852. 344.
4 JAY-Z JAY-Z 246930. 15.0 648. 264.
5 Kendric… Kendr… 261510. 13.8 681. 268.
6 Lil Way… Lil W… 228370. 21.8 629. 237.
7 Nas Nas 220992. 11.2 608. 281.
8 Nicki M… Nicki… 219241. 13.9 540. 194.
9 Snoop D… Snoop… 226943. 20.1 547. 207.
10 The Not… The N… 253524. 24.3 674. 284.
# ℹ 3 more variables: mean_averageLength <dbl>, mean_flow_wpm <dbl>,
# mean_vulgarityRate <dbl>
# Display summary statistics for each artist
#print(summary_by_artist)Playing around with correlations.
library(dplyr)
library(ggplot2)
library(dplyr)
library(ggplot2)
# Calculate correlation coefficients
correlation_matrix <- cor(Top10_Analysis[, c("popularity", "danceability", "energy", "speechiness", "acousticness", "instrumentalness", "loudness", "liveness", "valence", "tempo", "duration_ms", "swears", "totalWords", "uniqueWords", "averageLength", "flow_wpm", "vulgarityRate")])
# Convert correlation matrix to tidy format for visualization
correlation_data <- as.data.frame(as.table(correlation_matrix))
colnames(correlation_data) <- c("Variable1", "Variable2", "Correlation")
# Filter out correlations with popularity (as this is the metric we would like to consider)
flow_wpm_correlation <- correlation_data %>%
filter(Variable1 == "popularity" | Variable2 == "popularity") %>%
filter(Variable1 != Variable2)
# Plot correlation coefficients
ggplot(flow_wpm_correlation, aes(x = Variable2, y = Correlation, fill = Variable1)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
theme_minimal() +
labs(x = "Variable", y = "Correlation with popularity", fill = "Variable") +
ggtitle("Correlation of Variables with popularity")# Extract correlations with popularity
popularity_correlation <- correlation_matrix["popularity", ]
# Print correlation coefficient for popularity
print(popularity_correlation) popularity danceability energy speechiness
1.0000000000 0.0213578565 -0.1043837420 -0.1014664999
acousticness instrumentalness loudness liveness
0.0071366723 0.0083631240 -0.0000438763 -0.0508413913
valence tempo duration_ms swears
-0.1879453939 0.0742749720 0.1134398258 -0.0872883322
totalWords uniqueWords averageLength flow_wpm
0.1202335440 0.1173026217 0.0159271752 -0.0081929663
vulgarityRate
-0.1380476354
# Extract correlation coefficients with "popularity"
popularity_correlation <- correlation_matrix["popularity", ]
# Remove "popularity" from the correlation coefficients
popularity_correlation <- popularity_correlation[-which(names(popularity_correlation) == "popularity")]
# Sort correlation coefficients in descending order
top_positive_correlations <- sort(popularity_correlation, decreasing = TRUE)
# Select the top 5 positive coefficients
top_5_positive <- head(top_positive_correlations, 5)
# Print the top 5 positive coefficients
print(top_5_positive) totalWords uniqueWords duration_ms tempo danceability
0.12023354 0.11730262 0.11343983 0.07427497 0.02135786
Top10_Analysis<-read.csv("~/Desktop/Top10.csv")
Top10_Analysis <- as_tibble(Top10_Analysis) %>%
dplyr::select(name, artist, popularity, uniqueWords, swears, energy, totalWords, flow_wpm, averageLength, duration_ms, tempo, danceability)
#Make a model with respect to popularity
m1 <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = Top10_Analysis)
summary(m1)
Call:
lm(formula = popularity ~ totalWords + uniqueWords + duration_ms +
tempo + danceability, data = Top10_Analysis)
Residuals:
Min 1Q Median 3Q Max
-44.195 -12.724 -1.517 11.961 53.767
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.268e+01 3.056e+00 7.421 1.65e-13 ***
totalWords 4.934e-04 3.731e-03 0.132 0.8948
uniqueWords 1.628e-02 8.185e-03 1.989 0.0468 *
duration_ms 2.142e-05 8.563e-06 2.502 0.0124 *
tempo 4.850e-02 1.207e-02 4.018 6.06e-05 ***
danceability 3.594e+00 2.816e+00 1.276 0.2020
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 17.85 on 2233 degrees of freedom
Multiple R-squared: 0.02476, Adjusted R-squared: 0.02257
F-statistic: 11.34 on 5 and 2233 DF, p-value: 8.078e-11
hist(Top10_Analysis$popularity)almost_sas <- function(aov.results){
aov_residuals <- residuals(aov.results)
par(mfrow=c(2,2))
plot(aov.results, which=1)
hist(aov_residuals)
plot(aov.results, which=2)
}
almost_sas(m1)# Fit the linear regression model
m1 <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability , data = Top10_Analysis)
# Residual Analysis
# 1. Linearity Check
plot(m1, which = 1)# 2. Independence of Errors Check
# Example: Durbin-Watson test
library(car)
durbinWatsonTest(m1) lag Autocorrelation D-W Statistic p-value
1 0.4055432 1.188721 0
Alternative hypothesis: rho != 0
# 3. Homoscedasticity Check
# Plotting residuals against fitted values
plot(m1, which = 3)# 4. Normality of Residuals Check
# Example: QQ plot
qqnorm(resid(m1))
qqline(resid(m1))# Example: Shapiro-Wilk test
shapiro.test(resid(m1))
Shapiro-Wilk normality test
data: resid(m1)
W = 0.99195, p-value = 8.429e-10
# 5. Multicollinearity Check
# Example: Variance Inflation Factor (VIF)
library(car)
vif(m1) totalWords uniqueWords duration_ms tempo danceability
4.536690 3.347790 1.918308 1.025490 1.094735
# 6. Outliers and Influential Points Check
# Example: Cook's distance
plot(m1, which = 4)# Example: Identify influential points
influencePlot(m1) StudRes Hat CookD
155 2.7561756 0.003449488 0.0043695501
398 3.0209612 0.002204150 0.0033478115
803 -1.4503270 0.028990982 0.0104617834
864 0.1767099 0.034290579 0.0001848788
869 -1.3976496 0.031896184 0.0107220232
The most appropriate method is to utilize an lm model (Continuous and normalized) since the data is both continuous and normal. My evidence is that my Q-Q residuals are linear and my histogram shows a “roughly” normal distribution. We also can see some of our outliers in our testing. :)
hist(Top10_Analysis$popularity)coefficients(m1) (Intercept) totalWords uniqueWords duration_ms tempo danceability
2.267975e+01 4.934232e-04 1.627919e-02 2.142283e-05 4.850023e-02 3.593523e+00
Model:
\[\hat{y}=\text{22.67975+0.0004934232totalWords+0.01627919uniqueWords+0.00002142283duration_ms+0.04850023tempo+3.593523danceability} \]
Check the relevant assumptions of the model. Are we okay to proceed with statistical inference?
almost_sas <- function(aov.results){
aov_residuals <- residuals(aov.results)
par(mfrow=c(2,2))
plot(aov.results, which=1)
hist(aov_residuals)
plot(aov.results, which=2)
}
almost_sas(m1)Yes. We can continue with the statistical inference. The most appropriate method is to utilize an lm model since the data is both continuous and normal. My evidence is that my Q-Q residuals are linear and my histogram shows a normal distribution albeit with a slight skew, but this is not enough to deviate from a normalized model.) :)
1b-iv. Which, if any, are significant predictors of popularity? Test at the \(\alpha=0.05\) level.
summary(m1)
Call:
lm(formula = popularity ~ totalWords + uniqueWords + duration_ms +
tempo + danceability, data = Top10_Analysis)
Residuals:
Min 1Q Median 3Q Max
-44.195 -12.724 -1.517 11.961 53.767
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.268e+01 3.056e+00 7.421 1.65e-13 ***
totalWords 4.934e-04 3.731e-03 0.132 0.8948
uniqueWords 1.628e-02 8.185e-03 1.989 0.0468 *
duration_ms 2.142e-05 8.563e-06 2.502 0.0124 *
tempo 4.850e-02 1.207e-02 4.018 6.06e-05 ***
danceability 3.594e+00 2.816e+00 1.276 0.2020
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 17.85 on 2233 degrees of freedom
Multiple R-squared: 0.02476, Adjusted R-squared: 0.02257
F-statistic: 11.34 on 5 and 2233 DF, p-value: 8.078e-11
Model:
\[\hat{y}=\text{22.67975+0.0004934232totalWords+0.01627919uniqueWords+0.00002142283duration_ms+0.04850023tempo+3.593523danceability} \] Hypotheses: \[H_0:\beta_\text{totalWords}=\beta_\text{uniqueWords }=\beta_\text{duration_ms }=\beta_\text{tempo }=\beta_\text{danceability}=0\] \[H_1:\text{At least one }\beta_{\text{i}}\ne 0\]
Test Statistic and p-Value for predictor (popularity): \[F_{0} = 11.34 \text{ (p} <0.001)\] Conclusion/Interpretation: \[\text{Reject } H_{0}. \text{ There is sufficient evidence to suggest that at least one slope is non-zero.}\]
Hypotheses: \[H_{0}:\beta_{totalWords}=0\]
\[H_{1}:\beta_{totalWords}\neq0\]
Test statistic and p-Value: \[t_{0}=0.132 \text{ } (p=0.8948)\]
Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]
Conclusion / Interpretation: \[\text{Fail to Reject } H_{0}. \text{There is not sufficient evidence to suggest that totalWords significantly predicts popularity.}\]
Hypotheses: \[H_{0}:\beta_{uniqueWords}=0\]
\[H_{1}:\beta_{uniqueWords}\neq0\]
Test statistic and p-Value: \[t_{0}= 1.989 \text{ } (p=0.0468 )\]
Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]
Conclusion / Interpretation: \[\text{Reject } H_{0}. \text{There is sufficient evidence to suggest that uniqueWords significantly predicts popularity.}\]
Hypotheses: \[H_{0}:\beta_{ duration_{ms} }=0\]
\[H_{1}:\beta_{ duration_{ms}}\neq0\]
Test statistic and p-Value: \[t_{0}= 2.502 \text{ } (p=0.0124 )\]
Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]
Conclusion / Interpretation: \[\text{Reject } H_{0}. \text{There is sufficient evidence to suggest that duration_ms significantly predicts popularity.}\]
Hypotheses: \[H_{0}:\beta_{ tempo}=0\]
\[H_{1}:\beta_{ tempo}\neq0\]
Test statistic and p-Value: \[t_{0}= 4.018 \text{ } (p<0.001)\]
Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]
Conclusion / Interpretation: \[\text{Reject } H_{0}. \text{There is sufficient evidence to suggest that tempo significantly predicts popularity.}\]
Hypotheses: \[H_{0}:\beta_{ danceability}=0\]
\[H_{1}:\beta_{ danceability}\neq0\]
Test statistic and p-Value: \[t_{0}= 1.276 \text{ } (p=0.2020)\]
Rejection Region: \[\text{Reject } H_{0} \text{ if } p < \alpha \text{ for } \alpha=0.05.\]
Conclusion / Interpretation: \[\text{Fail to Reject } H_{0}. \text{There is not sufficient evidence to suggest that danceability significantly predicts popularity.}\]
set.seed(123) # for reproducibility
train_indices <- sample(1:nrow(Top10_Analysis), 0.8 * nrow(Top10_Analysis))
train_data <- Top10_Analysis[train_indices, ]
test_data <- Top10_Analysis[-train_indices, ]
m_train <- lm(flow_wpm ~ uniqueWords + swears + energy + totalWords + swears:uniqueWords, data = train_data)
predictions <- predict(m_train, newdata = test_data)
rmse <- sqrt(mean((test_data$flow_wpm - predictions)^2))
rsquared <- cor(test_data$flow_wpm, predictions)^2
print(paste("RMSE:", rmse))[1] "RMSE: 43.9146389560486"
print(paste("R-squared:", rsquared))[1] "R-squared: 0.350063473786371"
# Create a data frame with the artist and mean values excluding popularity
artist_means <- data.frame(
artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
mean_danceability = c(0.7890979, 0.6483333, 0.7399128, 0.6800664, 0.6260769, 0.6504067, 0.7164859, 0.7110244, 0.6884114, 0.6884114 ),
mean_energy = c(0.7350280, 0.5566111, 0.7699060, 0.7667124, 0.6443462, 0.7526642, 0.6827042, 0.7314756, 0.7143483, 0.7143483),
mean_speechiness = c(0.2354993, 0.2199174, 0.2572725, 0.3026354, 0.2745423, 0.2940384, 0.2032725, 0.3120671, 0.2479428, 0.2479428 ),
mean_acousticness = c(0.1286715, 0.2358899, 0.1186457, 0.1303869, 0.3045322, 0.1262964, 0.1944455, 0.1858926, 0.1627993, 0.1627993 ),
mean_instrumentalness = c(0.0089259957, 0.0081427684, 0.0001216790, 0.0116003766, 0.0023705843, 0.0001926131, 0.0085975033, 0.0020849655, 0.0029760254, 0.0029760254 )
)
# Convert data to long format
artist_means_long <- tidyr::pivot_longer(artist_means, -artist, names_to = "feature", values_to = "mean_value")
# Plot using ggplot2
ggplot(artist_means_long, aes(x = artist, y = mean_value, fill = feature)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(x = "Artist", y = "Mean Value", fill = "Feature") +
ggtitle("Mean Values of Features for Each Artist") +
scale_fill_manual(values = c("mean_danceability" = "blue", "mean_energy" = "green",
"mean_speechiness" = "red", "mean_acousticness" = "orange",
"mean_instrumentalness" = "purple")) # Optional: Customize colors# Plot residuals vs. predicted values
plot(m1$fitted.values, resid(m1))# Plot residuals vs. each predictor variable
par(mfrow=c(2,2))
plot(Top10_Analysis$totalWords, resid(m1))
plot(Top10_Analysis$uniqueWords, resid(m1))
plot(Top10_Analysis$duration_ms, resid(m1))
plot(Top10_Analysis$tempo, resid(m1))plot(Top10_Analysis$danceability, resid(m1))
summary(m1)
Call:
lm(formula = popularity ~ totalWords + uniqueWords + duration_ms +
tempo + danceability, data = Top10_Analysis)
Residuals:
Min 1Q Median 3Q Max
-44.195 -12.724 -1.517 11.961 53.767
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.268e+01 3.056e+00 7.421 1.65e-13 ***
totalWords 4.934e-04 3.731e-03 0.132 0.8948
uniqueWords 1.628e-02 8.185e-03 1.989 0.0468 *
duration_ms 2.142e-05 8.563e-06 2.502 0.0124 *
tempo 4.850e-02 1.207e-02 4.018 6.06e-05 ***
danceability 3.594e+00 2.816e+00 1.276 0.2020
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 17.85 on 2233 degrees of freedom
Multiple R-squared: 0.02476, Adjusted R-squared: 0.02257
F-statistic: 11.34 on 5 and 2233 DF, p-value: 8.078e-11
# Split data into training and testing sets
set.seed(123) # for reproducibility
train_indices <- sample(1:nrow(Top10_Analysis), 0.8 * nrow(Top10_Analysis))
train_data <- Top10_Analysis[train_indices, ]
test_data <- Top10_Analysis[-train_indices, ]
# Fit model on training data
m1_train <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = train_data)
# Predict on testing data
predictions <- predict(m1_train, newdata = test_data)
# Calculate performance metrics (e.g., RMSE, R-squared) on testing data
rmse <- sqrt(mean((test_data$popularity - predictions)^2))
rsquared <- cor(test_data$popularity, predictions)^2library(dplyr)
# Group the data by artist and fit a linear regression model for each group
models <- Top10_Analysis %>%
group_by(artist) %>%
do(model = lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = .))
# Extract coefficients or summaries for each model
model_summaries <- models %>%
summarise(artist = first(artist),
intercept = coef(model)[1],
totalWords_coef = coef(model)[2],
uniqueWords_coef = coef(model)[3],
duration_ms_coef = coef(model)[4],
tempo_coef = coef(model)[5],
danceability_coef = coef(model)[6])
# View the summaries
print(model_summaries)# A tibble: 10 × 7
artist intercept totalWords_coef uniqueWords_coef duration_ms_coef tempo_coef
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2Pac 40.5 0.00943 -0.0376 0.0000228 -0.0393
2 Drake 45.5 0.0154 -0.0191 -0.000000968 0.0457
3 Eminem 33.2 0.0290 -0.0514 -0.0000147 0.0394
4 JAY-Z 36.6 0.000716 -0.0400 0.0000521 0.0754
5 Kendr… 30.5 -0.00789 -0.0313 0.0000457 0.0297
6 Lil W… 36.8 -0.0143 0.0345 0.0000404 -0.0299
7 Nas 30.2 0.0156 0.000412 -0.0000493 0.0355
8 Nicki… 53.5 0.0160 0.00296 -0.0000454 -0.0599
9 Snoop… 18.3 -0.00103 0.0211 0.00000308 -0.0456
10 The N… 29.6 -0.00579 0.0249 0.0000234 -0.0136
# ℹ 1 more variable: danceability_coef <dbl>
library(dplyr)
Top10_Analysis<-read.csv("~/Desktop/Top10.csv")
Top10_Analysis <- as_tibble(Top10_Analysis) %>%
dplyr::select(name,artist, popularity, danceability, energy, speechiness, acousticness, instrumentalness,loudness, liveness, valence, tempo, duration_ms, swears, totalWords,uniqueWords, averageLength, flow_wpm, vulgarityRate, uniqueRate)
Top10_Analysis_by_artist <- Top10_Analysis %>%
group_by(artist)
summary_by_artist <- summarise(Top10_Analysis_by_artist,
mean_uniqueRate = mean(uniqueRate, na.rm = TRUE),
mean_vulgarityRate = mean(vulgarityRate, na.rm = TRUE),
mean_swears = mean(swears, na.rm = TRUE),
mean_uniqueWords = mean(uniqueWords, na.rm = TRUE),
mean_flow_wpm = mean(flow_wpm, na.rm = TRUE),
mean_totalWords=mean(totalWords, na.rm=TRUE),
mean_duration_ms=mean(duration_ms, na.rm=TRUE),
mean_tempo=mean(tempo, na.rm=TRUE),
mean_danceability=mean(danceability, na.rm=TRUE),
)
print(summary_by_artist)# A tibble: 10 × 10
artist mean_uniqueRate mean_vulgarityRate mean_swears mean_uniqueWords
<chr> <dbl> <dbl> <dbl> <dbl>
1 2Pac 0.382 0.0390 30.1 292.
2 Drake 0.395 0.0177 10.2 210.
3 Eminem 0.410 0.0187 15.8 344.
4 JAY-Z 0.424 0.0235 15.0 264.
5 Kendrick Lam… 0.408 0.0191 13.8 268.
6 Lil Wayne 0.390 0.0340 21.8 237.
7 Nas 0.477 0.0177 11.2 281.
8 Nicki Minaj 0.365 0.0235 13.9 194.
9 Snoop Dogg 0.392 0.0344 20.1 207.
10 The Notoriou… 0.436 0.0352 24.3 284.
# ℹ 5 more variables: mean_flow_wpm <dbl>, mean_totalWords <dbl>,
# mean_duration_ms <dbl>, mean_tempo <dbl>, mean_danceability <dbl>
library(ggplot2)
# Define the coefficients based on your model summary
intercept <- 22.67975
totalWords_coef <- 0.0004934232
uniqueWords_coef <- 0.01627919
duration_ms_coef <- 0.00002142283
tempo_coef <- 0.04850023
danceability_coef <- 3.593523
# Apply the model for each artist
Top10_Analysis$predicted_popularity <- with(Top10_Analysis,
intercept +
totalWords_coef * totalWords +
uniqueWords_coef * uniqueWords +
duration_ms_coef * duration_ms +
tempo_coef * tempo +
danceability_coef * danceability)
# Plot the predicted values against uniqueWords for each artist
ggplot(Top10_Analysis, aes(x = uniqueWords, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Unique Words", y = "Predicted Popularity") +
theme_minimal()library(ggplot2)
# Define the coefficients based on the model summary
intercept <- 22.67975
totalWords_coef <- 0.0004934232
uniqueWords_coef <- 0.01627919
duration_ms_coef <- 0.00002142283
tempo_coef <- 0.04850023
danceability_coef <- 3.593523
# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
predicted_popularity = intercept +
totalWords_coef * totalWords +
uniqueWords_coef * uniqueWords +
duration_ms_coef * duration_ms +
tempo_coef * tempo +
danceability_coef * danceability)
# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = uniqueWords, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Unique Words", y = "Predicted Popularity") +
facet_wrap(~ artist, nrow = 3) + # Separate graphs for each artist
theme_minimal()library(ggplot2)
# Define the coefficients based on the model summary
intercept <- 22.67975
totalWords_coef <- 0.0004934232
# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
predicted_popularity = intercept +
totalWords_coef * totalWords)
# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = totalWords, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Total Words", y = "Predicted Popularity") +
facet_wrap(~ artist, nrow = 3) + # Separate graphs for each artist
theme_minimal()library(ggplot2)
# Define the coefficients based on the model summary
intercept <- 22.67975
uniqueWords_coef <- 0.01627919
# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
predicted_popularity = intercept +
uniqueWords_coef * uniqueWords )
# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = uniqueWords, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Unique Words", y = "Predicted Popularity") +
facet_wrap(~ artist, nrow = 3) + # Separate graphs for each artist
theme_minimal()library(ggplot2)
# Define the coefficients based on the model summary
intercept <- 22.67975
duration_ms_coef <- 0.00002142283
# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
predicted_popularity = intercept +
duration_ms_coef * duration_ms )
# Plot the predicted popularity against uniqueWords for each artist with color
ggplot(Top10_Analysis, aes(x = duration_ms, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Duration", y = "Predicted Popularity") +
facet_wrap(~ artist, nrow = 3) + # Separate graphs for each artist
theme_minimal()library(ggplot2)
# Define the coefficients based on the model summary
intercept <- 22.67975
tempo_coef <- 0.04850023
# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
predicted_popularity = intercept +
tempo_coef * tempo )
# Plot the predicted popularity against tempo for each artist with color
ggplot(Top10_Analysis, aes(x = tempo, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Tempo", y = "Predicted Popularity") +
facet_wrap(~ artist, nrow = 3) + # Separate graphs for each artist
theme_minimal()library(ggplot2)
# Define the coefficients based on the model summary
intercept <- 22.67975
danceability_coef <- 3.593523
# Apply the model for each artist and add the predicted popularity to the data frame
Top10_Analysis <- transform(Top10_Analysis,
predicted_popularity = intercept +
danceability_coef * danceability)
# Plot the predicted popularity against danceability for each artist with color
ggplot(Top10_Analysis, aes(x = danceability, y = predicted_popularity, color = artist)) +
geom_line() +
labs(x = "Danceability", y = "Predicted Popularity") +
facet_wrap(~ artist, nrow = 3) + # Separate graphs for each artist
theme_minimal()# Fit the linear regression model
m1 <- lm(popularity ~ totalWords + uniqueWords + duration_ms + tempo + danceability, data = Top10_Analysis)
# Extract the coefficients
coefficients <- coef(m1)
print(coefficients) (Intercept) totalWords uniqueWords duration_ms tempo danceability
2.267975e+01 4.934232e-04 1.627919e-02 2.142283e-05 4.850023e-02 3.593523e+00
library(ggplot2)
# Define the coefficients
intercept <- 22.67975
uniqueWords_coef <- 0.01627919
# List of artists
artist <- c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne", "Snoop Dogg")
# Define a color palette with enough colors for each artist
colors <- c("blue", "green", "red", "purple", "orange", "yellow", "cyan", "magenta", "darkgreen", "darkblue")
# Initialize an empty data frame to store combined data
combined_data <- data.frame()
# Loop through each artist
for (i in seq_along(artist)) {
# Filter the dataset to include only songs by the current artist
artist_data <- Top10_Analysis[Top10_Analysis$artist == artist[i], ]
# Calculate the predicted popularity for the current artist
artist_data$predicted_popularity <- with(artist_data, intercept + uniqueWords_coef * uniqueWords)
# Add a column for artist name
artist_data$artist <- artist[i]
# Combine the data for the current artist with the combined data
combined_data <- rbind(combined_data, artist_data)
}
# Plot all artists on the same plot
ggplot(combined_data, aes(x = uniqueWords, y = predicted_popularity, color = artist, group = artist)) +
geom_line() +
labs(x = "Unique Words", y = "Predicted Popularity", title = "Predicted Popularity for Rap Artists") +
theme_minimal()library(ggplot2)
artist_data <- data.frame(
artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
vulgarityRate = c(0.03904857, 0.01772178, 0.0186974, 0.02353002, 0.01912823, 0.01772455, 0.02352152, 0.03519564, 0.03403338, 0.03436607),
uniqueRate = c(0.3820612, 0.3945815, 0.4096715, 0.4238028, 0.4081528, 0.4767687, 0.3647998, 0.4358999, 0.4358999, 0.3916636)
)
library(ggplot2)
library(tidyr)
artist_data <- data.frame(
artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
vulgarityRate = c(0.03904857, 0.01772178, 0.0186974, 0.02353002, 0.01912823, 0.01772455, 0.02352152, 0.03519564, 0.03403338, 0.03436607),
uniqueRate = c(0.3820612, 0.3945815, 0.4096715, 0.4238028, 0.4081528, 0.4767687, 0.3647998, 0.4358999, 0.4358999, 0.3916636)
)
# Reshape the data for plotting
melted_data <- pivot_longer(artist_data, cols = c("vulgarityRate", "uniqueRate"), names_to = "Variable")
# Bar graph
ggplot(melted_data, aes(x = artist, y = value, fill = Variable)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Artist", y = "Value", fill = "Variable") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))library(ggplot2)
artist_data <- data.frame(
artist = c("2Pac", "Drake", "Eminem", "JAY-Z", "Kendrick Lamar", "Nas", "Nicki Minaj", "The Notorious B.I.G.", "Lil Wayne","Snoop Dogg"),
mean_flow_wpm = c(178.0674, 134.7107, 191.6072, 157.2335, 155.6770, 165.4653, 148.7801, 159.7788, 174.4709, 146.5127)
)
# Bar graph
ggplot(artist_data, aes(x = artist, y = mean_flow_wpm)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(x = "Artist", y = "Mean Flow WPM", title = "Mean Flow WPM for Each Artist") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))