library(readxl)
library(lmtest)
library(stats)
# Work from the project directory so the workbook can be referenced by name.
# NOTE(review): this absolute path is machine-specific; setwd() will error on
# any machine where the directory does not exist.
setwd("C:/Users/Yung Cho/Documents/GitHub/Yung_QC")
# The ANES 2016 dataset analyzes voting behavior and public opinion during the
# 2016 US presidential election. Read the second worksheet and drop every row
# that has a missing value in any column.
data16 <- na.omit(read_xlsx("ANES 2016.xlsx", sheet = 2))
# Preview the first rows of the cleaned data.
head(data16)
## # A tibble: 6 × 33
## id weight religious age.group education race vote16 feel.dempres
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 300001 0.842 1 3 1 1 2 15
## 2 300002 1.01 0 3 2 1 2 50
## 3 300006 0.646 1 5 1 1 3 30
## 4 300007 0.688 1 10 3 1 2 0
## 5 300008 4.62 1 9 0 2 1 85
## 6 300012 0.943 1 9 1 1 2 15
## # ℹ 25 more variables: feel.reppres <dbl>, feel.fundamentalists <dbl>,
## # feel.feminists <dbl>, feel.liberals <dbl>, feel.unions <dbl>,
## # feel.poor <dbl>, feel.bigbiz <dbl>, feel.cons <dbl>, feel.scotus <dbl>,
## # feel.lgb <dbl>, feel.congress <dbl>, feel.rich <dbl>, feel.muslims <dbl>,
## # feel.christians <dbl>, feel.jews <dbl>, feel.teaparty <dbl>,
## # feel.police <dbl>, feel.trans <dbl>, feel.scientists <dbl>, feel.blm <dbl>,
## # feel.asians <dbl>, feel.hisp <dbl>, feel.blacks <dbl>, feel.undoc <dbl>, …
# The likelihood ratio test compares the goodness of fit of two nested models:
# a more complex (full) model and a simpler (reduced) model. It is based on the
# ratio of the likelihoods of the two models. Under the null hypothesis that
# the simpler model is adequate, the test statistic asymptotically follows a
# chi-squared distribution, and the p-value is used to judge whether the
# difference in fit between the two models is statistically significant.
# Full model: vote16 regressed on religious, education and feel.poor.
# NOTE(review): vote16 appears to be a coded category (values 1-3 in the
# preview above) but is modelled here as a numeric response; confirm that a
# linear model is intended.
full_model <- lm(
  formula = vote16 ~ religious + education + feel.poor,
  data = data16
)
# Reduced model: the same specification without the religious predictor.
reduced_model <- lm(
  formula = vote16 ~ education + feel.poor,
  data = data16
)
# Likelihood ratio test of the full model against the reduced model.
lmtest::lrtest(full_model, reduced_model)
## Likelihood ratio test
##
## Model 1: vote16 ~ religious + education + feel.poor
## Model 2: vote16 ~ education + feel.poor
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 -2085.4
## 2 4 -2120.3 -1 69.799 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
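# Including religious (Model 1) significantly improves the model fit relative
# to Model 2. The chi-squared statistic is very large and the p-value is very
# small; this provides strong evidence against the null hypothesis that the
# reduced model fits the data as well as the full model.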
# AIC and BIC are information criteria used to compare the goodness of fit of
# models with different numbers of predictors. Both penalize model complexity
# (BIC more heavily than AIC). Lower values indicate a better fit.
# Fit a sequence of nested linear models for vote16, adding one predictor at a
# time, so that their information criteria can be compared.
model_0 <- lm(vote16 ~ 1, data = data16)
model_1 <- lm(vote16 ~ religious, data = data16)
model_2 <- lm(vote16 ~ religious + education, data = data16)
model_3 <- lm(vote16 ~ religious + education + feel.poor, data = data16)
# When given several models, AIC() and BIC() each return a data frame with one
# row per model and the columns `df` and `AIC` / `BIC`.
AIC_calc <- AIC(model_0, model_1, model_2, model_3)
BIC_calc <- BIC(model_0, model_1, model_2, model_3)
# Build the comparison table from the individual numeric columns. Passing the
# whole AIC/BIC data frames to data.frame() nested them and produced the
# redundant columns AIC.df, AIC.AIC, BIC.df and BIC.BIC. The model_* row names
# are kept so each row can still be traced back to its model object.
values <- data.frame(
  Model = c("Vote", "+Religious", "++Education", "+++Feeling"),
  df = AIC_calc[["df"]],
  AIC = AIC_calc[["AIC"]],
  BIC = BIC_calc[["BIC"]],
  row.names = rownames(AIC_calc)
)
print(values)
## Model df AIC BIC
## model_0 Vote 2 4340.947 4352.416
## model_1 +Religious 3 4274.716 4291.920
## model_2 ++Education 4 4240.903 4263.841
## model_3 +++Feeling 5 4180.904 4209.576
# The addition of each variable (religious, education, and feel.poor) improves
# the model's performance, as evidenced by decreasing AIC and BIC values. Based
# on this analysis, Model 3 (vote16 ~ religious + education + feel.poor) is the
# best of the models compared because it has the lowest AIC and BIC values,
# indicating that it is the most appropriate of these models for explaining
# voting behavior (vote16).