# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "broom", "sjPlot", "ggplot2", "car", "Lock5Data", "pandoc", "mosaic", "effects") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## Loading required package: carData
## 
## 
## Attaching package: 'car'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## 
## Attaching package: 'mosaic'
## 
## 
## The following object is masked from 'package:Matrix':
## 
##     mean
## 
## 
## The following objects are masked from 'package:car':
## 
##     deltaMethod, logit
## 
## 
## The following object is masked from 'package:modelsummary':
## 
##     msummary
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## 
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     stat
## 
## 
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## 
## 
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## 
## 
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "broom"        "modelsummary" "fst"          "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[6]]
##  [1] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[7]]
##  [1] "car"          "carData"      "sjPlot"       "broom"        "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "Lock5Data"    "car"          "carData"      "sjPlot"       "broom"       
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "pandoc"       "Lock5Data"    "car"          "carData"      "sjPlot"      
##  [6] "broom"        "modelsummary" "fst"          "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "mosaic"       "mosaicData"   "ggformula"    "Matrix"       "lattice"     
##  [6] "pandoc"       "Lock5Data"    "car"          "carData"      "sjPlot"      
## [11] "broom"        "modelsummary" "fst"          "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "effects"      "mosaic"       "mosaicData"   "ggformula"    "Matrix"      
##  [6] "lattice"      "pandoc"       "Lock5Data"    "car"          "carData"     
## [11] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"

Task 1

data(USStates)
stats <- USStates
head(stats)

##        State HouseholdIncome Region Population EighthGradeMath HighSchool
## 1    Alabama          46.472      S      4.875           268.7       87.1
## 2     Alaska          76.114      W      0.740           274.3       92.8
## 3    Arizona          53.510      W      7.016           279.9       87.1
## 4   Arkansas          43.813      S      3.004           274.4       89.1
## 5 California          67.169      W     39.537           275.6       87.4
## 6   Colorado          65.458      W      5.607           284.7       91.5
##   College    IQ    GSP Vegetables Fruit Smokers PhysicalActivity Obese NonWhite
## 1    26.0  95.7 40.279       80.7  55.1    20.9             42.8  36.2     31.6
## 2    26.5  99.0 70.936       81.0  63.1    21.0             58.3  29.5     34.7
## 3    27.4  97.4 43.096       79.2  62.8    15.6             52.7  29.5     22.5
## 4    24.7  97.5 38.467       80.7  55.3    22.3             45.4  37.1     22.7
## 5    34.5  95.5 67.698       78.6  67.5    11.3             57.5  25.8     39.4
## 6    39.6 101.6 59.057       82.6  67.0    14.6             58.7  23.0     15.8
##   HeavyDrinkers Electoral ClintonVote Elect2016 TwoParents StudentSpending
## 1          5.45         9       34.36         R       60.9           9.236
## 2          7.33         3       36.55         R       71.5          17.510
## 3          5.57        11       45.13         R       62.7           7.613
## 4          5.32         6       33.65         R       63.3           9.846
## 5          5.95        55       61.73         D       66.8          11.495
## 6          7.30         9       48.16         D       71.9           9.575
##   Insured
## 1    83.7
## 2    80.2
## 3    83.4
## 4    84.1
## 5    85.2
## 6    87.2

m1 <- lm(ClintonVote ~ College, data = stats)
m2 <- lm(ClintonVote ~ HouseholdIncome, data = stats)
m3 <- lm(ClintonVote ~ NonWhite, data = stats)
m4 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data = stats)

tab_model(m1, m2, m3, m4)

	ClintonVote			ClintonVote			ClintonVote			ClintonVote
Predictors	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p
(Intercept)	11.49	-1.36 – 24.34	0.079	9.68	-5.79 – 25.15	0.215	35.19	29.69 – 40.69	<0.001	0.99	-11.11 – 13.10	0.870
College	0.98	0.59 – 1.36	<0.001							1.12	0.67 – 1.58	<0.001
HouseholdIncome				0.59	0.33 – 0.85	<0.001				-0.08	-0.38 – 0.23	0.625
NonWhite							0.37	0.16 – 0.58	0.001	0.43	0.27 – 0.60	<0.001
Observations	50			50			50			50
R² / R² adjusted	0.355 / 0.342			0.296 / 0.281			0.211 / 0.194			0.616 / 0.590

Model 1: College and ClintonVote

In the first model, the coefficient is 0.98, meaning that for each unit increase in College, ClintonVote increases by 0.98 units. This relationship is highly significant with a p value less than 0.001. The R-squared value for this model is 0.355, and the adjusted R-squared value is 0.342, indicating that approximately 34.2% of the variance in prestige is explained by College alone.

Model 2: HouseholdIncome and ClintonVote

In the second model, the coefficient is 0.59, meaning that for each unit increase in HouseholdIncome, ClintonVote increases by 0.59 units. This relationship is highly significant with a p value less than 0.001. The R-squared value for this model is 0.296, and the adjusted R-squared value is 0.281, indicating that approximately 28.1% of the variance in prestige is explained by HouseholdIncome alone.

Model 3: NonWhite and ClintonVote

In the third model, the coefficient is 0.37, meaning that for each unit increase in NonWhite, ClintonVote increases by 0.37 units. This relationship is highly significant with a p value of 0.001. The R-squared value for this model is 0.211, and the adjusted R-squared value is 0.194, indicating that approximately 19.4% of the variance in prestige is explained by NonWhite alone.

Model 4: College, HouseholdIncome, Nonwhite, and ClintonVote

The fourth model incorporates College, HouseholdIncome, and Nonwhite to understand their combined effects on ClintonVote. The coefficient for College is 1.12, meaning that for each unit increase in College, ClintonVote incleases by 1.12 units. This relationship is highly significant with a p value of <0.001. The coefficient for HousholdIncome is -0.08, meaning that for each unit increase in College, CLintonVote incleases by -0.08 units. This relationship is not significant with a p value of 0.625. The coefficent for NonWhite is 0.43, meaning for each unit increase in NonWhite, ClintonVote increase by 0.43 units. This relationship is highly significant, with a p value of less than 0.001. The adjusted R square value this model is 0.59, showing the combination of all three predictors provides the best explantory power.

Task 2

Extend the model from Task 1 by adding the Region variable. Compare this new model to the previous one using AIC and BIC. Create a regression table with all models with modelsummary. Interpret the results and discuss which model you prefer and why.

m5 <- lm(ClintonVote ~ Region, data = stats)
m6 <- glm(ClintonVote ~ College + HouseholdIncome + NonWhite + Region, data = stats)

models_1 <- list(m1, m2, m3, m5, m6)
modelsummary(models_1)

tinytable_gcs20h94q9o4ds97bky4

	(1)	(2)	(3)	(4)	(5)
(Intercept)	11.487	9.678	35.189	39.903	14.806
	(6.391)	(7.695)	(2.734)	(2.455)	(7.159)
College	0.976				1.189
	(0.190)				(0.257)
HouseholdIncome		0.589			-0.429
		(0.131)			(0.171)
NonWhite			0.373		0.525
			(0.104)		(0.082)
RegionNE				14.221	8.031
				(3.626)	(2.721)
RegionS				-0.678	-3.404
				(3.471)	(2.742)
RegionW				3.552	5.658
				(3.471)	(2.772)
Num.Obs.	50	50	50	50	50
R2	0.355	0.296	0.211	0.313	0.722
R2 Adj.	0.342	0.281	0.194	0.268
AIC	358.6	363.0	368.7	365.8	326.5
BIC	364.4	368.8	374.5	375.3	341.8
Log.Lik.	-176.311	-178.512	-181.358	-177.886	-155.243
RMSE	8.23	8.60	9.10	8.49	5.40

AIC(m1, m2, m3, m5, m6)

##    df      AIC
## m1  3 358.6214
## m2  3 363.0234
## m3  3 368.7152
## m5  5 365.7721
## m6  8 326.4857

BIC(m1, m2, m3, m5, m6)

##    df      BIC
## m1  3 364.3575
## m2  3 368.7595
## m3  3 374.4513
## m5  5 375.3322
## m6  8 341.7819

he AIC values decrease from m1 to m3, indicating that each subsequent model provides a better fit to the data while accounting for model complexity. The AIC significantly lowers in value for m5 and m6, suggesting it’s they are the best fit among these three models according to this criterion. The BIC values also decrease from m1 to m3, and then significantly drops for m5 and m6. Like AIC, BIC penalizes model complexity, but more severely. Even with this stricter penalty, m6 still has the lowest BIC, further supporting it as the best model among the three.

Model Improvement: Both AIC and BIC consistently decrease when the region variable is added, indicating the significance of the region varibale and that the more complex models provide better fits to the data, even when penalizing for additional parameters.
Best Model: Model m6, which includes all main effects and interactions, appears to be the best fit according to both AIC and BIC. This suggests that the interactions between the College, HouseholdIncome, NonWhite, and Region variables provide meaningful improvements in predicting ClintonVote.

Task 3

Using the States dataset, fit a model predicting ClintonVote with College, NonWhite, and their interaction. Visualize the interaction effect using the effects package and interpret the results. How does the interaction change our understanding of the relationship between education and voting patterns?

data(USStates)
stats <- USStates

# Fit a baseline model without interaction
m.baseline <- lm(ClintonVote ~ College, data=stats)
summary(m.baseline)

## 
## Call:
## lm(formula = ClintonVote ~ College, data = stats)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3211  -5.2097   0.3566   5.3001  20.3779 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.4872     6.3913   1.797   0.0786 .  
## College       0.9760     0.1898   5.142 4.96e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.395 on 48 degrees of freedom
## Multiple R-squared:  0.3552, Adjusted R-squared:  0.3417 
## F-statistic: 26.44 on 1 and 48 DF,  p-value: 4.964e-06

# Fit a model with interaction
m.interact <- lm(ClintonVote ~ NonWhite + College + NonWhite:College, data=stats)
summary(m.interact)

## 
## Call:
## lm(formula = ClintonVote ~ NonWhite + College + NonWhite:College, 
##     data = stats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.446  -3.543  -0.295   3.936  13.831 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -15.22574   13.26349  -1.148 0.256927    
## NonWhite           0.99725    0.47769   2.088 0.042397 *  
## College            1.50058    0.39913   3.760 0.000479 ***
## NonWhite:College  -0.01802    0.01457  -1.236 0.222664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.532 on 46 degrees of freedom
## Multiple R-squared:  0.6259, Adjusted R-squared:  0.6015 
## F-statistic: 25.66 on 3 and 46 DF,  p-value: 6.636e-10

# Plot the effects of the baseline model
plot(allEffects(m.baseline))

# Plot the effects of the interaction model
plot(allEffects(m.interact))

# Plot the effects of the interaction model with multiline option
plot(allEffects(m.interact), multiline = TRUE)

All slopes are greater than 0, indicating that throughout each education score the more Non-white respondents are the more likely they are to vote for Clinton. The slopes values lower as education score increases, while intercepts rise, indicating that the more educated respondents are the likelyhood of voting for Clinton increases, while variation in voting patterns based on % Non white decrease. As each of the lines are no parrallel, this indicates there is an interaction effect between the Non-White and College predictors and the clinton vote outcome differ depending on the level of interaction.

Alternative Graph with GLM (Not sure what type of graph is wanted)

stats$Clinton <- car::recode(stats$Elect2016, "'D'='1'; 'R'='0'", as.factor=TRUE)

m.baseline1 <- glm(Clinton ~ College + NonWhite + College:NonWhite, 
                data=stats, family=binomial(link="logit"))
summary(m.baseline1)

## 
## Call:
## glm(formula = Clinton ~ College + NonWhite + College:NonWhite, 
##     family = binomial(link = "logit"), data = stats)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      -18.846621   8.251618  -2.284   0.0224 *
## College            0.501762   0.235410   2.131   0.0331 *
## NonWhite           0.316978   0.267814   1.184   0.2366  
## College:NonWhite  -0.007502   0.007838  -0.957   0.3385  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 42.570  on 46  degrees of freedom
## AIC: 50.57
## 
## Number of Fisher Scoring iterations: 5

# Plot just the interaction
eff <- Effect(c("College", "NonWhite"), m.baseline1)
plot(eff, type = "response", multiline = TRUE)

Task 4

Using the States dataset, create a binary outcome variable indicating whether a state voted for Clinton (1) or not (0). Perform a logistic regression with this new variable as the dependent variable and College, HouseholdIncome, and Region as predictors. Create regression tables using both modelsummary and sjPlot. Interpret the odds ratios and assess model fit.

data(USStates)
stats <- USStates

view(stats$Elect2016)

# Recode the Region variable to create a South dummy variable
stats$Clinton <- car::recode(stats$Elect2016, "'D'='1'; 'R'='0'", as.factor=TRUE)

# Check the table of the new variable
table(stats$Clinton)

## 
##  0  1 
## 30 20

my_logit <- glm(Clinton ~ College + HouseholdIncome + Region, data=stats, family=binomial(link="logit"))

modelsummary(my_logit)

tinytable_axc4dkckmtyjdo0ew2bf

	(1)
(Intercept)	-13.883
	(4.578)
College	0.254
	(0.130)
HouseholdIncome	0.056
	(0.066)
RegionNE	3.022
	(1.413)
RegionS	0.313
	(1.589)
RegionW	3.021
	(1.431)
Num.Obs.	50
AIC	44.0
BIC	55.5
Log.Lik.	-16.007
RMSE	0.32

tab_model(my_logit)

	Clinton
Predictors	Odds Ratios	CI	p
(Intercept)	0.00	0.00 – 0.00	0.002
College	1.29	1.01 – 1.71	0.050
HouseholdIncome	1.06	0.93 – 1.22	0.399
Region [NE]	20.54	1.70 – 599.19	0.032
Region [S]	1.37	0.04 – 32.70	0.844
Region [W]	20.52	1.55 – 500.93	0.035
Observations	50
R² Tjur	0.588

Intercept: The odds ratio of 0 represents the odds of voting for Clinton when both the percentage of college graduates and household income are 0. This extremely small value is not practically interpretable but serves as a baseline for the model. College: For each 1 percentage point increase in college graduates, the odds of a state voting for Clinton increase by a factor of 1.29, or about 12.9%. HouseholdIncome: For each 1 unit increase in household income (in thousands of dollars), the odds of a state voting for Clinton increase by a factor of 1.06, or about 10.6%. These odds ratios indicate that both higher percentages of college graduates and higher household incomes are associated with increased odds of states voting for Clinton, with the effect of college education being slightly stronger than that of household income.

Assesing model fit: The R square value of 0.588 demonstrates a moderate level of explantory power and a moderate level of the model describing the observed data.

Task 5

Leveraging the same logistic regression model from Task 4, use the effects package to plot the predicted probabilities of voting for Clinton across the range of College values, separated by Region. Then, create a plot using sjPlot’s plot_model function to visualize the odds ratios of all predictors in the model.

data(USStates)
stats <- USStates

view(stats$Elect2016)

stats$Clinton <- car::recode(stats$Elect2016, "'D'='1'; 'R'='0'", as.factor=TRUE)

model_mama <- glm(Clinton ~ College:Region, data = stats, family = binomial(link="logit"))

effect_plot <- allEffects(model_mama, partial.residuals = TRUE)

plot(effect_plot, 'College:Region', multiline = TRUE, ci.style = "bands", ylab = "Predicted Probability of Voting for Clinton")

## Warning in plot.eff(x[[selection]], lattice = lattice, ...): partial residuals
## are not displayed in a multiline plot

plot_model(model_mama, type = "est", show.values = TRUE, value.offset = 0.3)

Problem Set 4

Benny Konishi

2024-07-23