
Advanced Predictive Modeling



Housekeeping

Loading needed libraries
library(psych)     # descriptive statistics
library(lm.beta)   # standardized OLS coefficients
library(gvlma)     # OLS assumption checks
library(coefplot)  # coefficient plots
library(dplyr)     # data wrangling (case_when, group_by)
library(ggplot2)   # plotting (used for the boxplots below)
library(jtools)    # export_summs() model comparison tables
library(corrplot)  # correlation plots


Read data

youtube <- read.csv("tut3b.youtube.csv", header = TRUE)

# In case you did not save the anger dummy, you can create it by running the line below
youtube$anger_dummy <- case_when((youtube$anger > 0) ~ 1, TRUE ~ 0)
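
As a quick sanity check, the dummy should equal 1 exactly when anger is positive (this check is an addition here, not part of the original tutorial):

# Cross-tabulate the dummy against the condition it encodes
table(youtube$anger_dummy, youtube$anger > 0)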


Single-set OLS model

# Descriptive statistics for the variables included in the model
# (columns 35 and 54:56 hold anger, partisanship, america, economy)
psych::describe(youtube[c(35, 54:56)])
##              vars    n mean   sd median trimmed mad min max range  skew
## anger           1 9633 0.61 1.58      0    0.31   0   0  43    43 10.82
## partisanship    2 9633 0.77 2.19      0    0.45   0   0  73    73 17.40
## america         3 9633 0.24 0.77      0    0.07   0   0  32    32 11.57
## economy         4 9633 0.19 0.94      0    0.00   0   0  39    39 14.07
##              kurtosis   se
## anger          201.19 0.02
## partisanship   450.13 0.02
## america        336.32 0.01
## economy        382.12 0.01
# Correlations among the same variables
corrplot(cor(youtube[c(35, 54:56)]), type = "lower",
         tl.col = "black", tl.srt = 45)

# The OLS model
lm1 <- lm(anger ~ partisanship + america * economy, data = youtube)
summary(lm.beta(lm1))
## 
## Call:
## lm(formula = anger ~ partisanship + america * economy, data = youtube)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.275  -0.486  -0.243   0.369  34.792 
## 
## Coefficients:
##                 Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)     0.242700           NA   0.013449   18.05   <2e-16 ***
## partisanship    0.243797     0.339364   0.006269   38.89   <2e-16 ***
## america         0.363646     0.178379   0.020182   18.02   <2e-16 ***
## economy         0.388778     0.231198   0.014682   26.48   <2e-16 ***
## america:economy 0.065034     0.138181   0.005086   12.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.185 on 9628 degrees of freedom
## Multiple R-squared:  0.4345, Adjusted R-squared:  0.4343 
## F-statistic:  1849 on 4 and 9628 DF,  p-value: < 2.2e-16
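For a main effect such as partisanship, the standardized coefficient that lm.beta reports is the raw slope rescaled by the ratio of the predictor's and the outcome's standard deviations. A minimal check of that relationship (main effects only; the interaction term is standardized on the standard deviation of the product column):

# Standardized beta = b * sd(x) / sd(y); should match the
# Standardized column above (about 0.339 for partisanship)
coef(lm1)["partisanship"] * sd(youtube$partisanship) / sd(youtube$anger)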
# Scatterplot of Predicted vs Actual values
plot(youtube$anger, predict(lm1), 
     xlab = "Anger (y)", ylab = "Predicted anger (ŷ)",
     main = "Predicted vs actual anger")
abline(lm(predict(lm1) ~ youtube$anger), col = "blue", lwd = 3)
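
For an OLS model with an intercept, the multiple R-squared from the summary is the squared correlation between actual and fitted values, so the fit can also be recovered from the two quantities in the plot:

# Squared correlation of actual vs predicted = multiple R-squared (~0.43)
cor(youtube$anger, predict(lm1))^2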

Splitting the data

set.seed(123)
train_size <- round(0.75 * nrow(youtube))
# Create a random index for splitting the data
# (sample train_size row numbers from the full dataset)
train_index <- sample(1:nrow(youtube), train_size, replace = FALSE)
# Split the data into training and testing sets
train_data <- youtube[train_index, ]
test_data <- youtube[-train_index, ]
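
Before profiling the two sets, it is worth confirming that they partition the full data (a small check added here, not in the original code):

# Training and testing rows should add up to the full sample
nrow(train_data); nrow(test_data)
nrow(train_data) + nrow(test_data) == nrow(youtube)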
psych::describe(train_data[c(35, 54:56)])
##              vars    n mean   sd median trimmed mad min max range  skew
## anger           1 7225 0.62 1.70      0    0.32   0   0  43    43 11.02
## partisanship    2 7225 0.79 2.35      0    0.48   0   0  73    73 18.08
## america         3 7225 0.26 0.84      0    0.09   0   0  32    32 11.64
## economy         4 7225 0.17 0.80      0    0.00   0   0  16    16  9.44
##              kurtosis   se
## anger          195.65 0.02
## partisanship   446.82 0.03
## america        320.79 0.01
## economy        127.20 0.01
psych::describe(test_data[c(35, 54:56)])
##              vars    n mean   sd median trimmed mad min max range  skew
## anger           1 2408 0.55 1.14      0    0.30   0   0  20    20  5.32
## partisanship    2 2408 0.71 1.62      0    0.38   0   0  28    28  6.31
## america         3 2408 0.16 0.52      0    0.03   0   0   7     7  4.99
## economy         4 2408 0.25 1.25      0    0.01   0   0  39    39 16.01
##              kurtosis   se
## anger           55.10 0.02
## partisanship    64.92 0.03
## america         36.60 0.01
## economy        411.61 0.03
corrplot(cor(train_data[c(35, 54:56)]), type = "lower",  
         tl.col = "black", tl.srt = 45)

corrplot(cor(test_data[c(35, 54:56)]), type = "lower",  
         tl.col = "black", tl.srt = 45)

Split-sample OLS model

# Fit a linear model on the training data
lm2 <- lm(anger ~ partisanship + america * economy, data = train_data)
summary(lm.beta(lm2))
## 
## Call:
## lm(formula = anger ~ partisanship + america * economy, data = train_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.976 -0.491 -0.241  0.302 34.306 
## 
## Coefficients:
##                 Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)     0.240668           NA   0.016254   14.81   <2e-16 ***
## partisanship    0.249844     0.346861   0.007158   34.90   <2e-16 ***
## america         0.343697     0.169916   0.023238   14.79   <2e-16 ***
## economy         0.457599     0.217108   0.020929   21.86   <2e-16 ***
## america:economy 0.077127     0.163746   0.005847   13.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.242 on 7220 degrees of freedom
## Multiple R-squared:  0.464,  Adjusted R-squared:  0.4637 
## F-statistic:  1563 on 4 and 7220 DF,  p-value: < 2.2e-16
# Scatterplot of Predicted vs Actual values
plot(train_data$anger, predict(lm2), 
     xlab = "Anger (y)", ylab = "Predicted anger (ŷ)",
     main = "Predicted vs actual anger")
abline(lm(predict(lm2) ~ train_data$anger), col = "blue", lwd = 3)

# Out-of-sample performance of lm2 on the test data
test_pred <- predict(lm2, newdata = test_data)
r_squared <- 1 - sum((test_data$anger - test_pred)^2) /
  sum((test_data$anger - mean(test_data$anger))^2)
rmse <- sqrt(mean((test_pred - test_data$anger)^2))
mae <- mean(abs(test_pred - test_data$anger))

# Print the metrics
cat("R-squared:", r_squared, "\n")
## R-squared: 0.2113745
cat("RMSE:", rmse, "\n")
## RMSE: 1.013268
cat("MAE:", mae, "\n")
## MAE: 0.6367743
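
To judge whether these numbers are any good, compare them with a naive baseline that always predicts the training-set mean; a useful model should clearly beat it. This baseline comparison is a sketch added here, not part of the original tutorial:

# Naive baseline: predict mean(train anger) for every test case
baseline <- mean(train_data$anger)
cat("Baseline RMSE:", sqrt(mean((baseline - test_data$anger)^2)), "\n")
cat("Baseline MAE:", mean(abs(baseline - test_data$anger)), "\n")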

Comparing model performance

# Compare the two models side by side, first unstandardized, then standardized

export_summs(lm1, lm2, scale = FALSE,
             error_format = "[{conf.low}, {conf.high}]")
                  Model 1          Model 2
(Intercept)       0.24 ***         0.24 ***
                  [0.22, 0.27]     [0.21, 0.27]
partisanship      0.24 ***         0.25 ***
                  [0.23, 0.26]     [0.24, 0.26]
america           0.36 ***         0.34 ***
                  [0.32, 0.40]     [0.30, 0.39]
economy           0.39 ***         0.46 ***
                  [0.36, 0.42]     [0.42, 0.50]
america:economy   0.07 ***         0.08 ***
                  [0.06, 0.08]     [0.07, 0.09]
N                 9633             7225
R2                0.43             0.46
*** p < 0.001; ** p < 0.01; * p < 0.05.
export_summs(lm1, lm2, scale = TRUE,
             error_format = "[{conf.low}, {conf.high}]")
                  Model 1          Model 2
(Intercept)       0.59 ***         0.61 ***
                  [0.57, 0.62]     [0.58, 0.64]
partisanship      0.53 ***         0.59 ***
                  [0.51, 0.56]     [0.56, 0.62]
america           0.29 ***         0.30 ***
                  [0.26, 0.32]     [0.26, 0.34]
economy           0.38 ***         0.38 ***
                  [0.35, 0.41]     [0.35, 0.42]
america:economy   0.05 ***         0.05 ***
                  [0.04, 0.05]     [0.04, 0.06]
N                 9633             7225
R2                0.43             0.46
All continuous predictors are mean-centered and scaled by 1 standard deviation. *** p < 0.001; ** p < 0.01; * p < 0.05.

Dummy dependent variable

# Eyeballing relations between the predictors and the dummy outcome

ggplot(youtube, aes(x = anger_dummy, y = partisanship, group = anger_dummy, fill = anger_dummy)) +
  geom_boxplot() +
  labs(x = "Anger (dummy)", y = "Partisanship") +
  theme_minimal() + theme(legend.position = "none")

ggplot(youtube, aes(x = anger_dummy, y = america, group = anger_dummy, fill = anger_dummy)) +
  geom_boxplot() +
  labs(x = "Anger (dummy)", y = "America") +
  theme_minimal() + theme(legend.position = "none")

ggplot(youtube, aes(x = anger_dummy, y = economy, group = anger_dummy, fill = anger_dummy)) +
  geom_boxplot() +
  labs(x = "Anger (dummy)", y = "Economy") +
  theme_minimal() + theme(legend.position = "none")

ggplot(youtube, aes(x = anger_dummy, y = america*economy, group = anger_dummy, fill = anger_dummy)) +
  geom_boxplot() +
  labs(x = "Anger (dummy)", y = "America * Economy") +
  theme_minimal() + theme(legend.position = "none")
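
Because the predictors are heavily skewed, the boxplots can be hard to read; a numeric complement is to compare group means directly (a dplyr sketch added here, not part of the original tutorial):

# Mean of each predictor within the two anger groups
youtube %>%
  group_by(anger_dummy) %>%
  summarise(across(c(partisanship, america, economy), mean))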

Single-set Logit model

# Fit logistic regression model - all data
logit1 <- glm(anger_dummy ~ partisanship + america * economy, data = youtube, family = binomial)
summary(logit1)
## 
## Call:
## glm(formula = anger_dummy ~ partisanship + america * economy, 
##     family = binomial, data = youtube)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3425  -0.8601  -0.7311   1.2019   1.7031  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.18307    0.02915  -40.59   <2e-16 ***
## partisanship     0.37926    0.02252   16.84   <2e-16 ***
## america          0.49228    0.04272   11.52   <2e-16 ***
## economy          0.56277    0.05006   11.24   <2e-16 ***
## america:economy -0.08211    0.04065   -2.02   0.0434 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12230  on 9632  degrees of freedom
## Residual deviance: 11260  on 9628  degrees of freedom
## AIC: 11270
## 
## Number of Fisher Scoring iterations: 7
exp(cbind(OR = coef(logit1), confint(logit1)))  # Exponentiated coefficients
##                        OR     2.5 %   97.5 %
## (Intercept)     0.3063369 0.2892466 0.324261
## partisanship    1.4612040 1.3985882 1.527665
## america         1.6360353 1.5051211 1.779567
## economy         1.7555284 1.5924097 1.935267
## america:economy 0.9211719 0.8836096 1.018216
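
Odds ratios multiply odds, not probabilities, so it can help to translate an effect onto the probability scale by comparing predicted probabilities at chosen covariate profiles. The profiles below (partisanship 0 vs 1, other predictors at 0) are illustrative choices, not from the tutorial:

# Predicted probability of any anger at two illustrative profiles
profiles <- data.frame(partisanship = c(0, 1), america = 0, economy = 0)
predict(logit1, newdata = profiles, type = "response")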

Split-sample Logit model

# Fit logistic regression model - training data
logit2 <- glm(anger_dummy ~ partisanship + america * economy, data = train_data, family = binomial)
summary(logit2)
## 
## Call:
## glm(formula = anger_dummy ~ partisanship + america * economy, 
##     family = binomial, data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2107  -0.8630  -0.7343   1.2736   1.6986  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.17304    0.03385 -34.657  < 2e-16 ***
## partisanship     0.37726    0.02608  14.465  < 2e-16 ***
## america          0.47485    0.04601  10.320  < 2e-16 ***
## economy          0.51896    0.05731   9.056  < 2e-16 ***
## america:economy -0.08584    0.03264  -2.630  0.00854 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9186.0  on 7224  degrees of freedom
## Residual deviance: 8513.6  on 7220  degrees of freedom
## AIC: 8523.6
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = coef(logit2), confint(logit2)))  # Exponentiated coefficients
##                        OR     2.5 %    97.5 %
## (Intercept)     0.3094243 0.2894563 0.3305293
## partisanship    1.4582803 1.3862143 1.5354405
## america         1.6077661 1.4698035 1.7606406
## economy         1.6802841 1.5020805 1.8792230
## america:economy 0.9177451 0.8860423 1.0069305

Predicting values for the testing dataset

# Make predictions on the test data
predictions <- predict(logit2, newdata = test_data, type = "response")

# Convert predicted probabilities to binary predictions (0 or 1) using a 0.5 cutoff
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

# Confusion matrix (rows = actual, columns = predicted)
conf_mat <- table(test_data$anger_dummy, predicted_classes)

# Summary measures

accuracy <- mean(predicted_classes == test_data$anger_dummy)
precision <- conf_mat[2, 2] / sum(conf_mat[, 2])
recall <- conf_mat[2, 2] / sum(conf_mat[2, ])
f1 <- 2 * (precision * recall) / (precision + recall)
# Print the metrics
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.7163621
cat("Precision:", precision, "\n")
## Precision: 0.7166667
cat("Recall:", recall, "\n")
## Recall: 0.2185515
cat("F1:", f1, "\n")
## F1: 0.3349562
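
The low recall partly reflects the 0.5 cutoff combined with class imbalance: most comments express no anger, so the model rarely predicts the positive class. Sweeping the threshold shows the precision/recall trade-off directly; this sweep is a sketch added here, not part of the original tutorial:

# Precision and recall at several classification thresholds
for (thr in c(0.3, 0.4, 0.5, 0.6)) {
  pred <- ifelse(predictions > thr, 1, 0)
  cm <- table(factor(test_data$anger_dummy, levels = 0:1),
              factor(pred, levels = 0:1))  # fix both levels so the table stays 2x2
  cat("threshold:", thr,
      "precision:", round(cm[2, 2] / sum(cm[, 2]), 3),
      "recall:", round(cm[2, 2] / sum(cm[2, ]), 3), "\n")
}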