# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "broom", "sjPlot", "ggplot2", "car", "Lock5Data", "pandoc", "mosaic", "effects") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
##
## Attaching package: 'mosaic'
##
##
## The following object is masked from 'package:Matrix':
##
## mean
##
##
## The following objects are masked from 'package:car':
##
## deltaMethod, logit
##
##
## The following object is masked from 'package:modelsummary':
##
## msummary
##
##
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
##
##
## The following object is masked from 'package:purrr':
##
## cross
##
##
## The following object is masked from 'package:ggplot2':
##
## stat
##
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
##
##
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
##
##
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[6]]
## [1] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[7]]
## [1] "car" "carData" "sjPlot" "broom" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "Lock5Data" "car" "carData" "sjPlot" "broom"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "pandoc" "Lock5Data" "car" "carData" "sjPlot"
## [6] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "mosaic" "mosaicData" "ggformula" "Matrix" "lattice"
## [6] "pandoc" "Lock5Data" "car" "carData" "sjPlot"
## [11] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "effects" "mosaic" "mosaicData" "ggformula" "Matrix"
## [6] "lattice" "pandoc" "Lock5Data" "car" "carData"
## [11] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [16] "forcats" "stringr" "dplyr" "purrr" "readr"
## [21] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [26] "graphics" "grDevices" "utils" "datasets" "methods"
## [31] "base"
data(USStates)
stats <- USStates
head(stats)
## State HouseholdIncome Region Population EighthGradeMath HighSchool
## 1 Alabama 46.472 S 4.875 268.7 87.1
## 2 Alaska 76.114 W 0.740 274.3 92.8
## 3 Arizona 53.510 W 7.016 279.9 87.1
## 4 Arkansas 43.813 S 3.004 274.4 89.1
## 5 California 67.169 W 39.537 275.6 87.4
## 6 Colorado 65.458 W 5.607 284.7 91.5
## College IQ GSP Vegetables Fruit Smokers PhysicalActivity Obese NonWhite
## 1 26.0 95.7 40.279 80.7 55.1 20.9 42.8 36.2 31.6
## 2 26.5 99.0 70.936 81.0 63.1 21.0 58.3 29.5 34.7
## 3 27.4 97.4 43.096 79.2 62.8 15.6 52.7 29.5 22.5
## 4 24.7 97.5 38.467 80.7 55.3 22.3 45.4 37.1 22.7
## 5 34.5 95.5 67.698 78.6 67.5 11.3 57.5 25.8 39.4
## 6 39.6 101.6 59.057 82.6 67.0 14.6 58.7 23.0 15.8
## HeavyDrinkers Electoral ClintonVote Elect2016 TwoParents StudentSpending
## 1 5.45 9 34.36 R 60.9 9.236
## 2 7.33 3 36.55 R 71.5 17.510
## 3 5.57 11 45.13 R 62.7 7.613
## 4 5.32 6 33.65 R 63.3 9.846
## 5 5.95 55 61.73 D 66.8 11.495
## 6 7.30 9 48.16 D 71.9 9.575
## Insured
## 1 83.7
## 2 80.2
## 3 83.4
## 4 84.1
## 5 85.2
## 6 87.2
m1 <- lm(ClintonVote ~ College, data = stats)
m2 <- lm(ClintonVote ~ HouseholdIncome, data = stats)
m3 <- lm(ClintonVote ~ NonWhite, data = stats)
m4 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data = stats)
tab_model(m1, m2, m3, m4)
| Â | ClintonVote | ClintonVote | ClintonVote | ClintonVote | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 11.49 | -1.36 – 24.34 | 0.079 | 9.68 | -5.79 – 25.15 | 0.215 | 35.19 | 29.69 – 40.69 | <0.001 | 0.99 | -11.11 – 13.10 | 0.870 |
| College | 0.98 | 0.59 – 1.36 | <0.001 | 1.12 | 0.67 – 1.58 | <0.001 | ||||||
| HouseholdIncome | 0.59 | 0.33 – 0.85 | <0.001 | -0.08 | -0.38 – 0.23 | 0.625 | ||||||
| NonWhite | 0.37 | 0.16 – 0.58 | 0.001 | 0.43 | 0.27 – 0.60 | <0.001 | ||||||
| Observations | 50 | 50 | 50 | 50 | ||||||||
| R2 / R2 adjusted | 0.355 / 0.342 | 0.296 / 0.281 | 0.211 / 0.194 | 0.616 / 0.590 | ||||||||
Model 1: College and ClintonVote
In the first model, the coefficient is 0.98, meaning that for each unit increase in College, ClintonVote increases by 0.98 units. This relationship is highly significant with a p value less than 0.001. The R-squared value for this model is 0.355, and the adjusted R-squared value is 0.342, indicating that approximately 34.2% of the variance in prestige is explained by College alone.
Model 2: HouseholdIncome and ClintonVote
In the second model, the coefficient is 0.59, meaning that for each unit increase in HouseholdIncome, ClintonVote increases by 0.59 units. This relationship is highly significant with a p value less than 0.001. The R-squared value for this model is 0.296, and the adjusted R-squared value is 0.281, indicating that approximately 28.1% of the variance in prestige is explained by HouseholdIncome alone.
Model 3: NonWhite and ClintonVote
In the third model, the coefficient is 0.37, meaning that for each unit increase in NonWhite, ClintonVote increases by 0.37 units. This relationship is highly significant with a p value of 0.001. The R-squared value for this model is 0.211, and the adjusted R-squared value is 0.194, indicating that approximately 19.4% of the variance in prestige is explained by NonWhite alone.
Model 4: College, HouseholdIncome, Nonwhite, and ClintonVote
The fourth model incorporates College, HouseholdIncome, and Nonwhite to understand their combined effects on ClintonVote. The coefficient for College is 1.12, meaning that for each unit increase in College, ClintonVote incleases by 1.12 units. This relationship is highly significant with a p value of <0.001. The coefficient for HousholdIncome is -0.08, meaning that for each unit increase in College, CLintonVote incleases by -0.08 units. This relationship is not significant with a p value of 0.625. The coefficent for NonWhite is 0.43, meaning for each unit increase in NonWhite, ClintonVote increase by 0.43 units. This relationship is highly significant, with a p value of less than 0.001. The adjusted R square value this model is 0.59, showing the combination of all three predictors provides the best explantory power.
Extend the model from Task 1 by adding the Region variable. Compare this new model to the previous one using AIC and BIC. Create a regression table with all models with modelsummary. Interpret the results and discuss which model you prefer and why.
m5 <- lm(ClintonVote ~ Region, data = stats)
m6 <- glm(ClintonVote ~ College + HouseholdIncome + NonWhite + Region, data = stats)
models_1 <- list(m1, m2, m3, m5, m6)
modelsummary(models_1)
| (1) | (2) | (3) | (4) | (5) | |
|---|---|---|---|---|---|
| (Intercept) | 11.487 | 9.678 | 35.189 | 39.903 | 14.806 |
| (6.391) | (7.695) | (2.734) | (2.455) | (7.159) | |
| College | 0.976 | 1.189 | |||
| (0.190) | (0.257) | ||||
| HouseholdIncome | 0.589 | -0.429 | |||
| (0.131) | (0.171) | ||||
| NonWhite | 0.373 | 0.525 | |||
| (0.104) | (0.082) | ||||
| RegionNE | 14.221 | 8.031 | |||
| (3.626) | (2.721) | ||||
| RegionS | -0.678 | -3.404 | |||
| (3.471) | (2.742) | ||||
| RegionW | 3.552 | 5.658 | |||
| (3.471) | (2.772) | ||||
| Num.Obs. | 50 | 50 | 50 | 50 | 50 |
| R2 | 0.355 | 0.296 | 0.211 | 0.313 | 0.722 |
| R2 Adj. | 0.342 | 0.281 | 0.194 | 0.268 | |
| AIC | 358.6 | 363.0 | 368.7 | 365.8 | 326.5 |
| BIC | 364.4 | 368.8 | 374.5 | 375.3 | 341.8 |
| Log.Lik. | -176.311 | -178.512 | -181.358 | -177.886 | -155.243 |
| RMSE | 8.23 | 8.60 | 9.10 | 8.49 | 5.40 |
AIC(m1, m2, m3, m5, m6)
## df AIC
## m1 3 358.6214
## m2 3 363.0234
## m3 3 368.7152
## m5 5 365.7721
## m6 8 326.4857
BIC(m1, m2, m3, m5, m6)
## df BIC
## m1 3 364.3575
## m2 3 368.7595
## m3 3 374.4513
## m5 5 375.3322
## m6 8 341.7819
he AIC values decrease from m1 to m3, indicating that each subsequent model provides a better fit to the data while accounting for model complexity. The AIC significantly lowers in value for m5 and m6, suggesting it’s they are the best fit among these three models according to this criterion.
The BIC values also decrease from m1 to m3, and then significantly drops for m5 and m6. Like AIC, BIC penalizes model complexity, but more severely. Even with this stricter penalty, m6 still has the lowest BIC, further supporting it as the best model among the three.
Model Improvement: Both AIC and BIC consistently decrease when the region variable is added, indicating the significance of the region varibale and that the more complex models provide better fits to the data, even when penalizing for additional parameters.
Best Model: Model m6, which includes all main effects and interactions, appears to be the best fit according to both AIC and BIC. This suggests that the interactions between the College, HouseholdIncome, NonWhite, and Region variables provide meaningful improvements in predicting ClintonVote.
Using the States dataset, fit a model predicting ClintonVote with College, NonWhite, and their interaction. Visualize the interaction effect using the effects package and interpret the results. How does the interaction change our understanding of the relationship between education and voting patterns?
data(USStates)
stats <- USStates
# Fit a baseline model without interaction
m.baseline <- lm(ClintonVote ~ College, data=stats)
summary(m.baseline)
##
## Call:
## lm(formula = ClintonVote ~ College, data = stats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.3211 -5.2097 0.3566 5.3001 20.3779
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.4872 6.3913 1.797 0.0786 .
## College 0.9760 0.1898 5.142 4.96e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.395 on 48 degrees of freedom
## Multiple R-squared: 0.3552, Adjusted R-squared: 0.3417
## F-statistic: 26.44 on 1 and 48 DF, p-value: 4.964e-06
# Fit a model with interaction
m.interact <- lm(ClintonVote ~ NonWhite + College + NonWhite:College, data=stats)
summary(m.interact)
##
## Call:
## lm(formula = ClintonVote ~ NonWhite + College + NonWhite:College,
## data = stats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.446 -3.543 -0.295 3.936 13.831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.22574 13.26349 -1.148 0.256927
## NonWhite 0.99725 0.47769 2.088 0.042397 *
## College 1.50058 0.39913 3.760 0.000479 ***
## NonWhite:College -0.01802 0.01457 -1.236 0.222664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.532 on 46 degrees of freedom
## Multiple R-squared: 0.6259, Adjusted R-squared: 0.6015
## F-statistic: 25.66 on 3 and 46 DF, p-value: 6.636e-10
# Plot the effects of the baseline model
plot(allEffects(m.baseline))
# Plot the effects of the interaction model
plot(allEffects(m.interact))
# Plot the effects of the interaction model with multiline option
plot(allEffects(m.interact), multiline = TRUE)
All slopes are greater than 0, indicating that throughout each education score the more Non-white respondents are the more likely they are to vote for Clinton. The slopes values lower as education score increases, while intercepts rise, indicating that the more educated respondents are the likelyhood of voting for Clinton increases, while variation in voting patterns based on % Non white decrease. As each of the lines are no parrallel, this indicates there is an interaction effect between the Non-White and College predictors and the clinton vote outcome differ depending on the level of interaction.
stats$Clinton <- car::recode(stats$Elect2016, "'D'='1'; 'R'='0'", as.factor=TRUE)
m.baseline1 <- glm(Clinton ~ College + NonWhite + College:NonWhite,
data=stats, family=binomial(link="logit"))
summary(m.baseline1)
##
## Call:
## glm(formula = Clinton ~ College + NonWhite + College:NonWhite,
## family = binomial(link = "logit"), data = stats)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -18.846621 8.251618 -2.284 0.0224 *
## College 0.501762 0.235410 2.131 0.0331 *
## NonWhite 0.316978 0.267814 1.184 0.2366
## College:NonWhite -0.007502 0.007838 -0.957 0.3385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 67.301 on 49 degrees of freedom
## Residual deviance: 42.570 on 46 degrees of freedom
## AIC: 50.57
##
## Number of Fisher Scoring iterations: 5
# Plot just the interaction
eff <- Effect(c("College", "NonWhite"), m.baseline1)
plot(eff, type = "response", multiline = TRUE)
Using the States dataset, create a binary outcome variable indicating whether a state voted for Clinton (1) or not (0). Perform a logistic regression with this new variable as the dependent variable and College, HouseholdIncome, and Region as predictors. Create regression tables using both modelsummary and sjPlot. Interpret the odds ratios and assess model fit.
data(USStates)
stats <- USStates
view(stats$Elect2016)
# Recode the Region variable to create a South dummy variable
stats$Clinton <- car::recode(stats$Elect2016, "'D'='1'; 'R'='0'", as.factor=TRUE)
# Check the table of the new variable
table(stats$Clinton)
##
## 0 1
## 30 20
my_logit <- glm(Clinton ~ College + HouseholdIncome + Region, data=stats, family=binomial(link="logit"))
modelsummary(my_logit)
| (1) | |
|---|---|
| (Intercept) | -13.883 |
| (4.578) | |
| College | 0.254 |
| (0.130) | |
| HouseholdIncome | 0.056 |
| (0.066) | |
| RegionNE | 3.022 |
| (1.413) | |
| RegionS | 0.313 |
| (1.589) | |
| RegionW | 3.021 |
| (1.431) | |
| Num.Obs. | 50 |
| AIC | 44.0 |
| BIC | 55.5 |
| Log.Lik. | -16.007 |
| RMSE | 0.32 |
tab_model(my_logit)
| Â | Clinton | ||
|---|---|---|---|
| Predictors | Odds Ratios | CI | p |
| (Intercept) | 0.00 | 0.00 – 0.00 | 0.002 |
| College | 1.29 | 1.01 – 1.71 | 0.050 |
| HouseholdIncome | 1.06 | 0.93 – 1.22 | 0.399 |
| Region [NE] | 20.54 | 1.70 – 599.19 | 0.032 |
| Region [S] | 1.37 | 0.04 – 32.70 | 0.844 |
| Region [W] | 20.52 | 1.55 – 500.93 | 0.035 |
| Observations | 50 | ||
| R2 Tjur | 0.588 | ||
Intercept: The odds ratio of 0 represents the odds of voting for Clinton when both the percentage of college graduates and household income are 0. This extremely small value is not practically interpretable but serves as a baseline for the model. College: For each 1 percentage point increase in college graduates, the odds of a state voting for Clinton increase by a factor of 1.29, or about 12.9%. HouseholdIncome: For each 1 unit increase in household income (in thousands of dollars), the odds of a state voting for Clinton increase by a factor of 1.06, or about 10.6%. These odds ratios indicate that both higher percentages of college graduates and higher household incomes are associated with increased odds of states voting for Clinton, with the effect of college education being slightly stronger than that of household income.
Assesing model fit: The R square value of 0.588 demonstrates a moderate level of explantory power and a moderate level of the model describing the observed data.
Leveraging the same logistic regression model from Task 4, use the effects package to plot the predicted probabilities of voting for Clinton across the range of College values, separated by Region. Then, create a plot using sjPlot’s plot_model function to visualize the odds ratios of all predictors in the model.
data(USStates)
stats <- USStates
view(stats$Elect2016)
stats$Clinton <- car::recode(stats$Elect2016, "'D'='1'; 'R'='0'", as.factor=TRUE)
model_mama <- glm(Clinton ~ College:Region, data = stats, family = binomial(link="logit"))
effect_plot <- allEffects(model_mama, partial.residuals = TRUE)
plot(effect_plot, 'College:Region', multiline = TRUE, ci.style = "bands", ylab = "Predicted Probability of Voting for Clinton")
## Warning in plot.eff(x[[selection]], lattice = lattice, ...): partial residuals
## are not displayed in a multiline plot
plot_model(model_mama, type = "est", show.values = TRUE, value.offset = 0.3)