library(psych)
library(lm.beta)
library(gvlma)
library(coefplot)
library(dplyr)
library(jtools)
library(corrplot)
library(ggplot2)  # needed for the ggplot() calls further below
youtube <- read.csv("tut3b.youtube.csv", header = T)
# In case you did not save the anger dummy, you can create it by running the line below
youtube$anger_dummy <- case_when((youtube$anger > 0) ~ 1, TRUE ~ 0)
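# Equivalently, in base R (a sketch that produces the same 0/1 dummy):
# youtube$anger_dummy <- as.integer(youtube$anger > 0)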
# The variables included in the model and their correlations:
psych::describe(youtube[c(35, 54:56)])
## vars n mean sd median trimmed mad min max range skew
## anger 1 9633 0.61 1.58 0 0.31 0 0 43 43 10.82
## partisanship 2 9633 0.77 2.19 0 0.45 0 0 73 73 17.40
## america 3 9633 0.24 0.77 0 0.07 0 0 32 32 11.57
## economy 4 9633 0.19 0.94 0 0.00 0 0 39 39 14.07
## kurtosis se
## anger 201.19 0.02
## partisanship 450.13 0.02
## america 336.32 0.01
## economy 382.12 0.01
corrplot(cor(youtube[c(35, 54:56)]), type = "lower",
tl.col = "black", tl.srt = 45)
# The OLS model
lm1 <- lm(anger ~ partisanship + america * economy, data = youtube)
summary(lm.beta(lm1))
##
## Call:
## lm(formula = anger ~ partisanship + america * economy, data = youtube)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.275 -0.486 -0.243 0.369 34.792
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 0.242700 NA 0.013449 18.05 <2e-16 ***
## partisanship 0.243797 0.339364 0.006269 38.89 <2e-16 ***
## america 0.363646 0.178379 0.020182 18.02 <2e-16 ***
## economy 0.388778 0.231198 0.014682 26.48 <2e-16 ***
## america:economy 0.065034 0.138181 0.005086 12.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.185 on 9628 degrees of freedom
## Multiple R-squared: 0.4345, Adjusted R-squared: 0.4343
## F-statistic: 1849 on 4 and 9628 DF, p-value: < 2.2e-16
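# A quick sanity check (a sketch): lm.beta's standardized coefficient is the
# raw slope rescaled by sd(x)/sd(y), e.g. for partisanship:
coef(lm1)["partisanship"] * sd(youtube$partisanship) / sd(youtube$anger)
# This should reproduce the ~0.339 in the Standardized column above.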
# Scatterplot of Predicted vs Actual values
plot(youtube$anger, predict(lm1),
xlab = "Anger (y)", ylab = "Predicted anger (Å·)",
main = "Predicted vs actual anger")
abline(lm(predict(lm1) ~ youtube$anger), col = "blue", lwd = 3)
set.seed(123)
train_size <- round(0.75 * nrow(youtube))
# Create an index for splitting the data (sampled from all rows, without replacement)
train_index <- sample(1:nrow(youtube), train_size, replace = FALSE)
# Split the data into training and testing sets
train_data <- youtube[train_index, ]
test_data <- youtube[-train_index, ]
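# A quick check (a sketch, not part of the original script) that the split is
# the expected ~75/25 and that the two sets share no rows:
nrow(train_data); nrow(test_data)
length(intersect(rownames(train_data), rownames(test_data)))  # should be 0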
psych::describe(train_data[c(35, 54:56)])
## vars n mean sd median trimmed mad min max range skew
## anger 1 7225 0.62 1.70 0 0.32 0 0 43 43 11.02
## partisanship 2 7225 0.79 2.35 0 0.48 0 0 73 73 18.08
## america 3 7225 0.26 0.84 0 0.09 0 0 32 32 11.64
## economy 4 7225 0.17 0.80 0 0.00 0 0 16 16 9.44
## kurtosis se
## anger 195.65 0.02
## partisanship 446.82 0.03
## america 320.79 0.01
## economy 127.20 0.01
psych::describe(test_data[c(35, 54:56)])
## vars n mean sd median trimmed mad min max range skew
## anger 1 2408 0.55 1.14 0 0.30 0 0 20 20 5.32
## partisanship 2 2408 0.71 1.62 0 0.38 0 0 28 28 6.31
## america 3 2408 0.16 0.52 0 0.03 0 0 7 7 4.99
## economy 4 2408 0.25 1.25 0 0.01 0 0 39 39 16.01
## kurtosis se
## anger 55.10 0.02
## partisanship 64.92 0.03
## america 36.60 0.01
## economy 411.61 0.03
corrplot(cor(train_data[c(35, 54:56)]), type = "lower",
tl.col = "black", tl.srt = 45)
corrplot(cor(test_data[c(35, 54:56)]), type = "lower",
tl.col = "black", tl.srt = 45)
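# To quantify what the two corrplots show (a sketch): the element-wise gap
# between the train and test correlation matrices should be near zero if the
# split is well balanced.
round(cor(train_data[c(35, 54:56)]) - cor(test_data[c(35, 54:56)]), 2)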
# Fit a linear model on the training data
lm2 <- lm(anger ~ partisanship + america * economy, data = train_data)
summary(lm.beta(lm2))
##
## Call:
## lm(formula = anger ~ partisanship + america * economy, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.976 -0.491 -0.241 0.302 34.306
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 0.240668 NA 0.016254 14.81 <2e-16 ***
## partisanship 0.249844 0.346861 0.007158 34.90 <2e-16 ***
## america 0.343697 0.169916 0.023238 14.79 <2e-16 ***
## economy 0.457599 0.217108 0.020929 21.86 <2e-16 ***
## america:economy 0.077127 0.163746 0.005847 13.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.242 on 7220 degrees of freedom
## Multiple R-squared: 0.464, Adjusted R-squared: 0.4637
## F-statistic: 1563 on 4 and 7220 DF, p-value: < 2.2e-16
# Scatterplot of Predicted vs Actual values
plot(train_data$anger, predict(lm2),
xlab = "Anger (y)", ylab = "Predicted anger (Å·)",
main = "Predicted vs actual anger")
abline(lm(predict(lm2) ~ train_data$anger), col = "blue", lwd = 3)
# Out-of-sample summary measures (computed on the test set)
r_squared <- 1 - (sum((test_data$anger - predict(lm2, newdata = test_data))^2) /
                    sum((test_data$anger - mean(test_data$anger))^2))
rmse <- sqrt(mean((predict(lm2, newdata = test_data) - test_data$anger)^2))
mae <- mean(abs(predict(lm2, newdata = test_data) - test_data$anger))
# Print the metrics
cat("R-squared:", r_squared, "\n")
## R-squared: 0.2113745
cat("RMSE:", rmse, "\n")
## RMSE: 1.013268
cat("MAE:", mae, "\n")
## MAE: 0.6367743
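# The three metrics can be bundled into a reusable helper; eval_lm below is a
# sketch, not a function from any of the loaded packages:
eval_lm <- function(model, newdata, y) {
  pred <- predict(model, newdata = newdata)
  c(R2   = 1 - sum((y - pred)^2) / sum((y - mean(y))^2),
    RMSE = sqrt(mean((pred - y)^2)),
    MAE  = mean(abs(pred - y)))
}
eval_lm(lm2, test_data, test_data$anger)  # reproduces the values above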
# Let's compare the two models:
export_summs(lm1, lm2, scale = FALSE,
error_format = "[{conf.low}, {conf.high}]")
|                 | Model 1      | Model 2      |
|-----------------|--------------|--------------|
| (Intercept)     | 0.24 ***     | 0.24 ***     |
|                 | [0.22, 0.27] | [0.21, 0.27] |
| partisanship    | 0.24 ***     | 0.25 ***     |
|                 | [0.23, 0.26] | [0.24, 0.26] |
| america         | 0.36 ***     | 0.34 ***     |
|                 | [0.32, 0.40] | [0.30, 0.39] |
| economy         | 0.39 ***     | 0.46 ***     |
|                 | [0.36, 0.42] | [0.42, 0.50] |
| america:economy | 0.07 ***     | 0.08 ***     |
|                 | [0.06, 0.08] | [0.07, 0.09] |
| N               | 9633         | 7225         |
| R2              | 0.43         | 0.46         |

*** p < 0.001; ** p < 0.01; * p < 0.05.
# The same comparison with standardized (scaled) coefficients:
export_summs(lm1, lm2, scale = TRUE,
error_format = "[{conf.low}, {conf.high}]")
|                 | Model 1      | Model 2      |
|-----------------|--------------|--------------|
| (Intercept)     | 0.59 ***     | 0.61 ***     |
|                 | [0.57, 0.62] | [0.58, 0.64] |
| partisanship    | 0.53 ***     | 0.59 ***     |
|                 | [0.51, 0.56] | [0.56, 0.62] |
| america         | 0.29 ***     | 0.30 ***     |
|                 | [0.26, 0.32] | [0.26, 0.34] |
| economy         | 0.38 ***     | 0.38 ***     |
|                 | [0.35, 0.41] | [0.35, 0.42] |
| america:economy | 0.05 ***     | 0.05 ***     |
|                 | [0.04, 0.05] | [0.04, 0.06] |
| N               | 9633         | 7225         |
| R2              | 0.43         | 0.46         |

All continuous predictors are mean-centered and scaled by 1 standard deviation.
*** p < 0.001; ** p < 0.01; * p < 0.05.
# Eyeballing the relationships between the predictors and the dummy outcome
ggplot(youtube, aes(x = anger_dummy, y = partisanship, group = anger_dummy, fill = anger_dummy)) +
geom_boxplot() +
labs(x = "Anger (dummy)", y = "Partisanship") +
theme_minimal() + theme(legend.position = "none")
ggplot(youtube, aes(x = anger_dummy, y = america, group = anger_dummy, fill = anger_dummy)) +
geom_boxplot() +
labs(x = "Anger (dummy)", y = "America") +
theme_minimal() + theme(legend.position = "none")
ggplot(youtube, aes(x = anger_dummy, y = economy, group = anger_dummy, fill = anger_dummy)) +
geom_boxplot() +
labs(x = "Anger (dummy)", y = "Economy") +
theme_minimal() + theme(legend.position = "none")
ggplot(youtube, aes(x = anger_dummy, y = america*economy, group = anger_dummy, fill = anger_dummy)) +
geom_boxplot() +
labs(x = "Anger (dummy)", y = "America * Economy") +
theme_minimal() + theme(legend.position = "none")
# Fit logistic regression model - all data
logit1 <- glm(anger_dummy ~ partisanship + america * economy, data = youtube, family = binomial)
summary(logit1)
##
## Call:
## glm(formula = anger_dummy ~ partisanship + america * economy,
## family = binomial, data = youtube)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3425 -0.8601 -0.7311 1.2019 1.7031
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.18307 0.02915 -40.59 <2e-16 ***
## partisanship 0.37926 0.02252 16.84 <2e-16 ***
## america 0.49228 0.04272 11.52 <2e-16 ***
## economy 0.56277 0.05006 11.24 <2e-16 ***
## america:economy -0.08211 0.04065 -2.02 0.0434 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12230 on 9632 degrees of freedom
## Residual deviance: 11260 on 9628 degrees of freedom
## AIC: 11270
##
## Number of Fisher Scoring iterations: 7
exp(cbind(OR = coef(logit1), confint(logit1))) # Exponentiated coefficients
## OR 2.5 % 97.5 %
## (Intercept) 0.3063369 0.2892466 0.324261
## partisanship 1.4612040 1.3985882 1.527665
## america 1.6360353 1.5051211 1.779567
## economy 1.7555284 1.5924097 1.935267
## america:economy 0.9211719 0.8836096 1.018216
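# Interpretation sketch: exp(b) is the multiplicative change in the odds of an
# angry comment per one-unit increase in a predictor, holding the rest fixed.
exp(coef(logit1)["partisanship"])  # ~1.46: each extra unit of partisanship
                                   # raises the odds of anger by about 46%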
# Fit logistic regression model - training data
logit2 <- glm(anger_dummy ~ partisanship + america * economy, data = train_data, family = binomial)
summary(logit2)
##
## Call:
## glm(formula = anger_dummy ~ partisanship + america * economy,
## family = binomial, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2107 -0.8630 -0.7343 1.2736 1.6986
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.17304 0.03385 -34.657 < 2e-16 ***
## partisanship 0.37726 0.02608 14.465 < 2e-16 ***
## america 0.47485 0.04601 10.320 < 2e-16 ***
## economy 0.51896 0.05731 9.056 < 2e-16 ***
## america:economy -0.08584 0.03264 -2.630 0.00854 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9186.0 on 7224 degrees of freedom
## Residual deviance: 8513.6 on 7220 degrees of freedom
## AIC: 8523.6
##
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = coef(logit2), confint(logit2))) # Exponentiated coefficients
## OR 2.5 % 97.5 %
## (Intercept) 0.3094243 0.2894563 0.3305293
## partisanship 1.4582803 1.3862143 1.5354405
## america 1.6077661 1.4698035 1.7606406
## economy 1.6802841 1.5020805 1.8792230
## america:economy 0.9177451 0.8860423 1.0069305
# Make predictions on the test data
predictions <- predict(logit2, newdata = test_data, type = "response")
# Convert predicted probabilities to binary predictions (0 or 1)
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Confusion matrix
conf_mat <- table(test_data$anger_dummy, predicted_classes)
# Summary measures
accuracy <- mean(predicted_classes == test_data$anger_dummy)
precision <- conf_mat[2, 2] / sum(conf_mat[, 2])
recall <- conf_mat[2, 2] / sum(conf_mat[2, ])
f1 <- 2 * (precision * recall) / (precision + recall)
# Print the metrics
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.7163621
cat("Precision:", precision, "\n")
## Precision: 0.7166667
cat("Recall:", recall, "\n")
## Recall: 0.2185515
cat("F1:", f1, "\n")
## F1: 0.3349562
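# The low recall partly reflects the 0.5 cutoff rather than the model itself:
# lowering the threshold trades precision for recall. A sketch with a 0.3 cutoff:
predicted_30 <- ifelse(predictions > 0.3, 1, 0)
cat("Accuracy at 0.3:", mean(predicted_30 == test_data$anger_dummy), "\n")
cat("Recall at 0.3:", sum(predicted_30 == 1 & test_data$anger_dummy == 1) /
      sum(test_data$anger_dummy == 1), "\n")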