library(readxl)
library(lmtest)
library(stats)

# NOTE(review): setwd() to a user-specific absolute path makes this script
# non-portable; consider running it from the project root instead.
setwd("C:/Users/Yung Cho/Documents/GitHub/Yung_QC")

# The ANES 2016 dataset analyzes voting behavior and public opinion during the
# 2016 US presidential election. Read the second sheet of the workbook and keep
# only the rows that have no missing value in any column.
data16 <- na.omit(read_xlsx("ANES 2016.xlsx", sheet = 2))

# Preview the first rows of the cleaned data.
head(data16)
## # A tibble: 6 × 33
##       id weight religious age.group education  race vote16 feel.dempres
##    <dbl>  <dbl>     <dbl>     <dbl>     <dbl> <dbl>  <dbl>        <dbl>
## 1 300001  0.842         1         3         1     1      2           15
## 2 300002  1.01          0         3         2     1      2           50
## 3 300006  0.646         1         5         1     1      3           30
## 4 300007  0.688         1        10         3     1      2            0
## 5 300008  4.62          1         9         0     2      1           85
## 6 300012  0.943         1         9         1     1      2           15
## # ℹ 25 more variables: feel.reppres <dbl>, feel.fundamentalists <dbl>,
## #   feel.feminists <dbl>, feel.liberals <dbl>, feel.unions <dbl>,
## #   feel.poor <dbl>, feel.bigbiz <dbl>, feel.cons <dbl>, feel.scotus <dbl>,
## #   feel.lgb <dbl>, feel.congress <dbl>, feel.rich <dbl>, feel.muslims <dbl>,
## #   feel.christians <dbl>, feel.jews <dbl>, feel.teaparty <dbl>,
## #   feel.police <dbl>, feel.trans <dbl>, feel.scientists <dbl>, feel.blm <dbl>,
## #   feel.asians <dbl>, feel.hisp <dbl>, feel.blacks <dbl>, feel.undoc <dbl>, …
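# The likelihood ratio test compares the goodness of fit of two nested models:
# a more complex (full) model and a simpler (reduced) model. It is based on the
# ratio of the likelihoods of the two models. Under the null hypothesis that the
# simpler model is adequate, the test statistic asymptotically follows a
# chi-squared distribution with degrees of freedom equal to the difference in
# the number of parameters, and the p-value is used to determine whether the
# difference in fit between the two models is statistically significant.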

# Full specification: vote16 regressed on religious, education and feel.poor.
full_model <- lm(
  vote16 ~ religious + education + feel.poor,
  data = data16
)

# Nested specification: the same fit with `religious` removed from the formula.
reduced_model <- update(full_model, . ~ . - religious)

# Likelihood ratio test comparing the two nested fits.
lrtest(full_model, reduced_model)
## Likelihood ratio test
## 
## Model 1: vote16 ~ religious + education + feel.poor
## Model 2: vote16 ~ education + feel.poor
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   5 -2085.4                         
## 2   4 -2120.3 -1 69.799  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
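# Adding `religious` (Model 1) significantly improves the model fit. The
# chi-squared statistic is very large (69.799 on 1 degree of freedom) and the
# p-value is very small (< 2.2e-16). This provides strong evidence against the
# null hypothesis that the reduced model fits as well as the full model.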
# AIC and BIC are information criteria used to compare models with different
# numbers of predictors. Both reward goodness of fit and penalize model
# complexity; lower values indicate a better model.

# Nested sequence of linear models for vote16: start from the intercept-only
# fit and add one predictor per step.
model_0 <- lm(vote16 ~ 1, data = data16)
model_1 <- update(model_0, . ~ . + religious)
model_2 <- update(model_1, . ~ . + education)
model_3 <- update(model_2, . ~ . + feel.poor)

# Called with several fitted models, AIC() and BIC() each return a data frame
# with one row per model (row names are the object names) and the columns
# `df` and `AIC` / `BIC`.
AIC_calc <- AIC(model_0, model_1, model_2, model_3)
BIC_calc <- BIC(model_0, model_1, model_2, model_3)

# Build the comparison table from the numeric columns rather than from the
# whole data frames: embedding the data frames produced the awkward column
# names AIC.df / AIC.AIC / BIC.df / BIC.BIC and repeated `df` twice.
values <- data.frame(
    Model = c("Vote", "+Religious", "++Education", "+++Feeling"),
    df = AIC_calc$df,
    AIC = AIC_calc$AIC,
    BIC = BIC_calc$BIC,
    row.names = rownames(AIC_calc)
)

print(values)
##               Model df      AIC      BIC
## model_0        Vote  2 4340.947 4352.416
## model_1  +Religious  3 4274.716 4291.920
## model_2 ++Education  4 4240.903 4263.841
## model_3  +++Feeling  5 4180.904 4209.576
# Adding each variable (religious, education, and feel.poor) improves the
# model's performance, as evidenced by the decreasing AIC and BIC values. Based
# on this analysis, Model 3 (vote16 ~ religious + education + feel.poor) is the
# best of the four models because it has the lowest AIC and BIC values,
# indicating that it is the most appropriate of these models for explaining
# voting behavior (vote16).