Problem Set 4

packages <- c("tidyverse", "Lock5Data", "modelsummary", "effects", "survey", "car", "interactions", "kableExtra", "flextable", "scales", "sjPlot", "sjmisc", "rockchalk", "nnet", "ggeffects", "carData")

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)

## Warning: package 'ggplot2' was built under R version 4.3.2

## Warning: package 'tidyr' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'modelsummary' was built under R version 4.3.3

## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## Loading required package: carData

## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

## Warning: package 'survey' was built under R version 4.3.2

## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival

## Warning: package 'survival' was built under R version 4.3.3

## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

## Warning: package 'kableExtra' was built under R version 4.3.2

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## Warning: package 'flextable' was built under R version 4.3.2

## 
## Attaching package: 'flextable'
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## 
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

## Warning: package 'sjPlot' was built under R version 4.3.3

## Learn more about sjPlot with 'browseVignettes("sjPlot")'.

## Warning: package 'sjmisc' was built under R version 4.3.3

## 
## Attaching package: 'sjmisc'
## 
## The following object is masked from 'package:purrr':
## 
##     is_empty
## 
## The following object is masked from 'package:tidyr':
## 
##     replace_na
## 
## The following object is masked from 'package:tibble':
## 
##     add_case
## 
## 
## Attaching package: 'rockchalk'
## 
## The following object is masked from 'package:dplyr':
## 
##     summarize

## Warning: package 'ggeffects' was built under R version 4.3.3

## 
## Attaching package: 'ggeffects'
## 
## The following object is masked from 'package:interactions':
## 
##     johnson_neyman

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "Lock5Data" "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "Lock5Data"    "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "effects"      "carData"      "modelsummary" "Lock5Data"    "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[5]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "Lock5Data"    "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "car"          "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "Lock5Data"    "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "interactions" "car"          "survey"       "survival"     "Matrix"      
##  [6] "grid"         "effects"      "carData"      "modelsummary" "Lock5Data"   
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "interactions" "car"          "survey"       "survival"    
##  [6] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [11] "Lock5Data"    "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "flextable"    "kableExtra"   "interactions" "car"          "survey"      
##  [6] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [11] "modelsummary" "Lock5Data"    "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "scales"       "flextable"    "kableExtra"   "interactions" "car"         
##  [6] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [11] "carData"      "modelsummary" "Lock5Data"    "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "sjPlot"       "scales"       "flextable"    "kableExtra"   "interactions"
##  [6] "car"          "survey"       "survival"     "Matrix"       "grid"        
## [11] "effects"      "carData"      "modelsummary" "Lock5Data"    "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[12]]
##  [1] "sjmisc"       "sjPlot"       "scales"       "flextable"    "kableExtra"  
##  [6] "interactions" "car"          "survey"       "survival"     "Matrix"      
## [11] "grid"         "effects"      "carData"      "modelsummary" "Lock5Data"   
## [16] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [21] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [26] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [31] "methods"      "base"        
## 
## [[13]]
##  [1] "rockchalk"    "sjmisc"       "sjPlot"       "scales"       "flextable"   
##  [6] "kableExtra"   "interactions" "car"          "survey"       "survival"    
## [11] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [16] "Lock5Data"    "lubridate"    "forcats"      "stringr"      "dplyr"       
## [21] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [26] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [31] "datasets"     "methods"      "base"        
## 
## [[14]]
##  [1] "nnet"         "rockchalk"    "sjmisc"       "sjPlot"       "scales"      
##  [6] "flextable"    "kableExtra"   "interactions" "car"          "survey"      
## [11] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [16] "modelsummary" "Lock5Data"    "lubridate"    "forcats"      "stringr"     
## [21] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [26] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [31] "utils"        "datasets"     "methods"      "base"        
## 
## [[15]]
##  [1] "ggeffects"    "nnet"         "rockchalk"    "sjmisc"       "sjPlot"      
##  [6] "scales"       "flextable"    "kableExtra"   "interactions" "car"         
## [11] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [16] "carData"      "modelsummary" "Lock5Data"    "lubridate"    "forcats"     
## [21] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [26] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [31] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[16]]
##  [1] "ggeffects"    "nnet"         "rockchalk"    "sjmisc"       "sjPlot"      
##  [6] "scales"       "flextable"    "kableExtra"   "interactions" "car"         
## [11] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [16] "carData"      "modelsummary" "Lock5Data"    "lubridate"    "forcats"     
## [21] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [26] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [31] "grDevices"    "utils"        "datasets"     "methods"      "base"

data(USStates)
States <- USStates

model1 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data = States)
summary(model1)

## 
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite, 
##     data = States)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.697  -3.605  -1.036   4.495  13.686 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.99082    6.01413   0.165    0.870    
## College          1.12498    0.22514   4.997 8.88e-06 ***
## HouseholdIncome -0.07554    0.15345  -0.492    0.625    
## NonWhite         0.43123    0.08246   5.230 4.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.622 on 46 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.5904 
## F-statistic: 24.55 on 3 and 46 DF,  p-value: 1.237e-09

Task 1

Using the States dataset, conduct a multiple linear regression with ClintonVote as the dependent variable and College, HouseholdIncome, and NonWhite as independent variables. Create regression tables using the sjPlot package. Interpret the coefficients and assess the model fit.

tab_model(model1)

	Clinton Vote
Predictors	Estimates	CI	p
(Intercept)	0.99	-11.11 – 13.10	0.870
College	1.12	0.67 – 1.58	<0.001
HouseholdIncome	-0.08	-0.38 – 0.23	0.625
NonWhite	0.43	0.27 – 0.60	<0.001
Observations	50
R² / R² adjusted	0.616 / 0.590

Interpretation

The intercept represents the expected value of ClintonVote when all the independent variables (College, HouseholdIncome, and NonWhite) are zero. However, since the intercept is not statistically significant (p = 0.870), it is not meaningful in this context. College (1.12, p < 0.001):

The coefficient for College is 1.12, which means that for every one-unit increase in the percentage of college graduates, the ClintonVote is expected to increase by 1.12 units, holding other variables constant. This coefficient is statistically significant (p < 0.001), indicating a strong positive relationship between the percentage of college graduates and the vote for Clinton.

The coefficient for HouseholdIncome is -0.08. This coefficient is not statistically significant (p = 0.625), meaning that household income does not have a meaningful impact on the vote for Clinton in this model.

The coefficient for NonWhite is 0.43. This coefficient is statistically significant (p < 0.001), indicating a strong positive relationship between the percentage of the non-white population and the vote for Clinton.

The R-squared value is 0.616, meaning that approximately 61.6% of the variance in ClintonVote can be explained by the independent variables (College, HouseholdIncome, and NonWhite). This indicates that the model is suitable to the data.

The adjusted R-squared value is 0.590, which adjusts for the number of predictors in the model. This value is slightly lower than the R-squared, indicating that the model explains a substantial portion of the variance even after accounting for the number of predictors.

Task 2

Extend the model from Task 1 by adding the Region variable. Compare this new model to the previous one using AIC and BIC. Create a regression table with all models with modelsummary. Interpret the results and discuss which model you prefer and why.

model1 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data = States)
summary(model1)

## 
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite, 
##     data = States)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.697  -3.605  -1.036   4.495  13.686 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.99082    6.01413   0.165    0.870    
## College          1.12498    0.22514   4.997 8.88e-06 ***
## HouseholdIncome -0.07554    0.15345  -0.492    0.625    
## NonWhite         0.43123    0.08246   5.230 4.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.622 on 46 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.5904 
## F-statistic: 24.55 on 3 and 46 DF,  p-value: 1.237e-09

model2 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite + Region, data = States)
summary(model2)

## 
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite + 
##     Region, data = States)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3217  -4.1695   0.0435   4.2880  11.2477 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     14.80589    7.15877   2.068   0.0447 *  
## College          1.18862    0.25668   4.631 3.36e-05 ***
## HouseholdIncome -0.42863    0.17107  -2.506   0.0161 *  
## NonWhite         0.52461    0.08175   6.417 9.03e-08 ***
## RegionNE         8.03052    2.72062   2.952   0.0051 ** 
## RegionS         -3.40381    2.74204  -1.241   0.2212    
## RegionW          5.65798    2.77249   2.041   0.0474 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.82 on 43 degrees of freedom
## Multiple R-squared:  0.7224, Adjusted R-squared:  0.6836 
## F-statistic: 18.65 on 6 and 43 DF,  p-value: 1.545e-10

AIC(model1, model2)

##        df      AIC
## model1  5 336.7667
## model2  8 326.4857

BIC(model1, model2)

##        df      BIC
## model1  5 346.3268
## model2  8 341.7819

model_list <- list(
  "Model 1" = model1,
  "Model 2" = model2
)

modelsummary(model_list)

	Model 1	Model 2
(Intercept)	0.991	14.806
	(6.014)	(7.159)
College	1.125	1.189
	(0.225)	(0.257)
HouseholdIncome	-0.076	-0.429
	(0.153)	(0.171)
NonWhite	0.431	0.525
	(0.082)	(0.082)
RegionNE		8.031
		(2.721)
RegionS		-3.404
		(2.742)
RegionW		5.658
		(2.772)
Num.Obs.	50	50
R2	0.616	0.722
R2 Adj.	0.590	0.684
AIC	336.8	326.5
BIC	346.3	341.8
Log.Lik.	-163.383	-155.243
RMSE	6.35	5.40

Interpretation

Model 2 is preferred over Model 1 due to its higher R-squared (0.722) and Adjusted R-squared (0.684) values, indicating it explains more variance in ClintonVote. It also has lower AIC (326.5) and BIC (341.8) values, suggesting a better fit to the data while accounting for the number of predictors. The lower RMSE (5.40) further indicates better predictive accuracy. The coefficients for the Region variables reveal significant differences in ClintonVote across regions, with the Northeast and West showing higher votes for Clinton compared to the baseline region, and the South showing lower votes. These enhancements make Model 2 more comprehensive and accurate in capturing the geographic variation in voting patterns, thus providing a superior fit and deeper insights than Model 1.

Task 3

Using the States dataset, fit a model predicting ClintonVote with College, NonWhite, and their interaction. Visualize the interaction effect using the effects package and interpret the results. How does the interaction change our understanding of the relationship between education and voting patterns?

model3 <- lm(ClintonVote ~ College + NonWhite + NonWhite:College, data=States)
summary(model3)

## 
## Call:
## lm(formula = ClintonVote ~ College + NonWhite + NonWhite:College, 
##     data = States)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.446  -3.543  -0.295   3.936  13.831 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -15.22574   13.26349  -1.148 0.256927    
## College            1.50058    0.39913   3.760 0.000479 ***
## NonWhite           0.99725    0.47769   2.088 0.042397 *  
## College:NonWhite  -0.01802    0.01457  -1.236 0.222664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.532 on 46 degrees of freedom
## Multiple R-squared:  0.6259, Adjusted R-squared:  0.6015 
## F-statistic: 25.66 on 3 and 46 DF,  p-value: 6.636e-10

plot(allEffects(model3), multiline = TRUE)

##### Interpretation The interaction plot indicates that areas with a number of college graduates tend to show higher levels of support for Clinton among non white voters. The lines representing different levels of the non-white population indicate that at each level of college graduates, a higher percentage of non-white population corresponds to higher Clinton votes. This positive interaction effect means that the influence of college education on voting for Clinton is stronger in areas with a higher percentage of the non-white population, suggesting that educational attainment and racial demographics together amplify support for Clinton. The interaction allows us to understand the relationship between education and voting is not uniform across different racial demographic.

Task 4

Using the States dataset, create a binary outcome variable indicating whether a state voted for Clinton (1) or not (0). Perform a logistic regression with this new variable as the dependent variable and College, HouseholdIncome, and Region as predictors. Create regression tables using both modelsummary and sjPlot. Interpret the odds ratios and assess model fit.

States$Clinton <- abs(as.numeric(States$Elect2016) - 2)
table(States$Clinton)

## 
##  0  1 
## 30 20

m.logit <- glm(Clinton ~ College+HouseholdIncome+Region, data=States, family=binomial(link="logit"))
summary(m.logit)

## 
## Call:
## glm(formula = Clinton ~ College + HouseholdIncome + Region, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     -13.88304    4.57843  -3.032  0.00243 **
## College           0.25390    0.12980   1.956  0.05046 . 
## HouseholdIncome   0.05599    0.06642   0.843  0.39922   
## RegionNE          3.02242    1.41280   2.139  0.03241 * 
## RegionS           0.31259    1.58896   0.197  0.84404   
## RegionW           3.02137    1.43108   2.111  0.03475 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 32.014  on 44  degrees of freedom
## AIC: 44.014
## 
## Number of Fisher Scoring iterations: 6

tab_model(m.logit, show.ci=FALSE, show.se=TRUE, show.stat=TRUE)

	Clinton
Predictors	Odds Ratios	std. Error	Statistic	p
(Intercept)	0.00	0.00	-3.03	0.002
College	1.29	0.17	1.96	0.050
HouseholdIncome	1.06	0.07	0.84	0.399
Region [NE]	20.54	29.02	2.14	0.032
Region [S]	1.37	2.17	0.20	0.844
Region [W]	20.52	29.37	2.11	0.035
Observations	50
R² Tjur	0.588

Interpretation

At the intercept, the odds of voting for Clinton are extremely low (approaching zero) when all predictors are at their reference levels (the p-value (0.002) indicates that this is statistically significant). In college, each percentage point increase in college graduates increases the odds of voting for Clinton by 29% (marginally significant with a p-value of 0.050). The model suggests that higher college education levels and residing in the Northeast or West regions are indicators of support, for Clinton, however household income and living in the South do not show significant effects.

Task 5

Leveraging the same logistic regression model from Task 4, use the effects package to plot the predicted probabilities of voting for Clinton across the range of College values, separated by Region. Then, create a plot using sjPlot’s plot_model function to visualize the odds ratios of all predictors in the model.

plot(allEffects(m.logit), type="response")

effect_plot <- Effect("Region", m.logit)
plot(effect_plot, type = "response", ylim=c(0,1))

library(sjPlot)
plot_model(m.logit, type = "pred", terms = c("College", "Region")) +
  labs(title = "Predicted Probabilities of Voting for Clinton by College and Region")

## Data were 'prettified'. Consider using `terms="College [all]"` to get
##   smooth plots.

plot_model(m.logit, type = "est", show.values = TRUE, value.offset = 0.3) +
  labs(title = "Odds Ratios of Predictors in Logistic Regression Model")