# Load data
data(USStates)
States <- USStates
# Multiple linear regression analysis
model1 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data=States)
# Generate regression table using sjPlot package
library(sjPlot)
tab_model(model1)
| Â | ClintonVote | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 0.99 | -11.11 – 13.10 | 0.870 |
| College | 1.12 | 0.67 – 1.58 | <0.001 |
| HouseholdIncome | -0.08 | -0.38 – 0.23 | 0.625 |
| NonWhite | 0.43 | 0.27 – 0.60 | <0.001 |
| Observations | 50 | ||
| R2 / R2 adjusted | 0.616 / 0.590 | ||
# Interpretation of results
summary(model1)
##
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite,
## data = States)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.697 -3.605 -1.036 4.495 13.686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.99082 6.01413 0.165 0.870
## College 1.12498 0.22514 4.997 8.88e-06 ***
## HouseholdIncome -0.07554 0.15345 -0.492 0.625
## NonWhite 0.43123 0.08246 5.230 4.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.622 on 46 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.5904
## F-statistic: 24.55 on 3 and 46 DF, p-value: 1.237e-09
College:A 1% increase in college graduation rate is linked to a 1.12%
increase in ClintonVote.
HouseholdIncome:Household income doesn’t significantly affect
ClintonVote.
NonWhite:A 1% increase in the non-white population is linked to a
0.43% increase in ClintonVote.
Model Fit:This model explains 61.6% of the variance in ClintonVote.
# Load required libraries
library(modelsummary)
library(sjPlot)
# Existing model (from Task 1)
model1 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data=States)
# New model (adding the Region variable)
model2 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite + Region, data=States)
# Compare models using AIC and BIC
aic1 <- AIC(model1)
aic2 <- AIC(model2)
bic1 <- BIC(model1)
bic2 <- BIC(model2)
# Print AIC and BIC results
aic1; aic2
## [1] 336.7667
## [1] 326.4857
bic1; bic2
## [1] 346.3268
## [1] 341.7819
# Generate regression tables using modelsummary package
msummary(list("Model 1" = model1, "Model 2" = model2))
| Model 1 | Model 2 | |
|---|---|---|
| (Intercept) | 0.991 | 14.806 |
| (6.014) | (7.159) | |
| College | 1.125 | 1.189 |
| (0.225) | (0.257) | |
| HouseholdIncome | -0.076 | -0.429 |
| (0.153) | (0.171) | |
| NonWhite | 0.431 | 0.525 |
| (0.082) | (0.082) | |
| RegionNE | 8.031 | |
| (2.721) | ||
| RegionS | -3.404 | |
| (2.742) | ||
| RegionW | 5.658 | |
| (2.772) | ||
| Num.Obs. | 50 | 50 |
| R2 | 0.616 | 0.722 |
| R2 Adj. | 0.590 | 0.684 |
| AIC | 336.8 | 326.5 |
| BIC | 346.3 | 341.8 |
| Log.Lik. | -163.383 | -155.243 |
| RMSE | 6.35 | 5.40 |
# Generate regression tables using sjPlot package
tab_model(model1, model2)
| Â | ClintonVote | ClintonVote | ||||
|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 0.99 | -11.11 – 13.10 | 0.870 | 14.81 | 0.37 – 29.24 | 0.045 |
| College | 1.12 | 0.67 – 1.58 | <0.001 | 1.19 | 0.67 – 1.71 | <0.001 |
| HouseholdIncome | -0.08 | -0.38 – 0.23 | 0.625 | -0.43 | -0.77 – -0.08 | 0.016 |
| NonWhite | 0.43 | 0.27 – 0.60 | <0.001 | 0.52 | 0.36 – 0.69 | <0.001 |
| Region [NE] | 8.03 | 2.54 – 13.52 | 0.005 | |||
| Region [S] | -3.40 | -8.93 – 2.13 | 0.221 | |||
| Region [W] | 5.66 | 0.07 – 11.25 | 0.047 | |||
| Observations | 50 | 50 | ||||
| R2 / R2 adjusted | 0.616 / 0.590 | 0.722 / 0.684 | ||||
# Interpretation of results
summary(model1)
##
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite,
## data = States)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.697 -3.605 -1.036 4.495 13.686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.99082 6.01413 0.165 0.870
## College 1.12498 0.22514 4.997 8.88e-06 ***
## HouseholdIncome -0.07554 0.15345 -0.492 0.625
## NonWhite 0.43123 0.08246 5.230 4.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.622 on 46 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.5904
## F-statistic: 24.55 on 3 and 46 DF, p-value: 1.237e-09
summary(model2)
##
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite +
## Region, data = States)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3217 -4.1695 0.0435 4.2880 11.2477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.80589 7.15877 2.068 0.0447 *
## College 1.18862 0.25668 4.631 3.36e-05 ***
## HouseholdIncome -0.42863 0.17107 -2.506 0.0161 *
## NonWhite 0.52461 0.08175 6.417 9.03e-08 ***
## RegionNE 8.03052 2.72062 2.952 0.0051 **
## RegionS -3.40381 2.74204 -1.241 0.2212
## RegionW 5.65798 2.77249 2.041 0.0474 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.82 on 43 degrees of freedom
## Multiple R-squared: 0.7224, Adjusted R-squared: 0.6836
## F-statistic: 18.65 on 6 and 43 DF, p-value: 1.545e-10
AIC:
Model 1: 336.77
Model 2: 326.49
BIC:
Model 1: 346.33
Model 2: 341.78
Model 2 has
lower AIC and BIC, so it’s better.
College:
Model 1: 1.12 (significant)
Model 2: 1.19
(significant)
Significant positive effect in both
models.
HouseholdIncome:
Model 1: -0.08 (not significant)
Model
2: -0.43 (significant)
Only significant in Model 2.
NonWhite:
Model 1: 0.43 (significant)
Model 2: 0.52
(significant)
Significant in both models.
Region (Model 2 only):
NE: 8.03 (significant)
S: -3.40
(not significant)
W: 5.66 (significant)
# Fit the model including interaction
model3 <- lm(ClintonVote ~ College * NonWhite, data=States)
# Visualize the interaction effect using the effects package
library(effects)
plot(allEffects(model3))
# Interpretation of results
summary(model3)
##
## Call:
## lm(formula = ClintonVote ~ College * NonWhite, data = States)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.446 -3.543 -0.295 3.936 13.831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.22574 13.26349 -1.148 0.256927
## College 1.50058 0.39913 3.760 0.000479 ***
## NonWhite 0.99725 0.47769 2.088 0.042397 *
## College:NonWhite -0.01802 0.01457 -1.236 0.222664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.532 on 46 degrees of freedom
## Multiple R-squared: 0.6259, Adjusted R-squared: 0.6015
## F-statistic: 25.66 on 3 and 46 DF, p-value: 6.636e-10
College:
Estimate: 1.501
p-value: 0.0005
(significant)
A higher college graduation rate is associated with an
increase in ClintonVote.
NonWhite:
Estimate: 0.997
p-value: 0.0424
(significant)
A higher proportion of non-white population is
associated with an increase in ClintonVote.
# Create a binary outcome variable indicating whether a state voted for Clinton (1) or not (0)
# Here, we assume that ClintonVote represents the percentage of votes Clinton received in each state
# Set the threshold at 50% for a state voting for Clinton
States$ClintonVoteBinary <- ifelse(States$ClintonVote >= 50, 1, 0)
# Verify the distribution of the binary outcome variable
table(States$ClintonVoteBinary)
##
## 0 1
## 37 13
# Perform a logistic regression with College, HouseholdIncome, and Region as predictors
model4 <- glm(ClintonVoteBinary ~ College + HouseholdIncome + Region, data=States, family=binomial)
# Generate regression tables using modelsummary and sjPlot packages
library(modelsummary)
modelsummary(model4)
| (1) | |
|---|---|
| (Intercept) | -17.162 |
| (6.316) | |
| College | 0.285 |
| (0.146) | |
| HouseholdIncome | 0.076 |
| (0.074) | |
| RegionNE | 2.176 |
| (1.406) | |
| RegionS | -16.310 |
| (2420.647) | |
| RegionW | 2.589 |
| (1.624) | |
| Num.Obs. | 50 |
| AIC | 37.8 |
| BIC | 49.3 |
| Log.Lik. | -12.890 |
| RMSE | 0.29 |
library(sjPlot)
tab_model(model4)
| Â | ClintonVoteBinary | ||
|---|---|---|---|
| Predictors | Odds Ratios | CI | p |
| (Intercept) | 0.00 | 0.00 – 0.00 | 0.007 |
| College | 1.33 | 1.03 – 1.87 | 0.052 |
| HouseholdIncome | 1.08 | 0.94 – 1.27 | 0.301 |
| Region [NE] | 8.81 | 0.67 – 245.88 | 0.122 |
| Region [S] | 0.00 | NA – 257647072057664952864682004628268668026464282602266646868262204084428088660882608404066484.00 | 0.995 |
| Region [W] | 13.32 | 0.70 – 565.53 | 0.111 |
| Observations | 50 | ||
| R2 Tjur | 0.564 | ||
# Interpret the odds ratios and assess model fit
summary(model4)
##
## Call:
## glm(formula = ClintonVoteBinary ~ College + HouseholdIncome +
## Region, family = binomial, data = States)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.16240 6.31603 -2.717 0.00658 **
## College 0.28494 0.14633 1.947 0.05150 .
## HouseholdIncome 0.07628 0.07376 1.034 0.30110
## RegionNE 2.17592 1.40578 1.548 0.12166
## RegionS -16.31016 2420.64651 -0.007 0.99462
## RegionW 2.58895 1.62430 1.594 0.11096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57.306 on 49 degrees of freedom
## Residual deviance: 25.780 on 44 degrees of freedom
## AIC: 37.78
##
## Number of Fisher Scoring iterations: 18
# Load required libraries
library(effects)
library(sjPlot)
# Load and inspect the data
data(USStates)
States <- USStates
summary(States) # Summary of the data
## State HouseholdIncome Region Population EighthGradeMath
## Alabama : 1 Min. :42.01 MW:13 Min. : 0.579 Min. :268.7
## Alaska : 1 1st Qu.:50.91 NE:11 1st Qu.: 1.842 1st Qu.:276.6
## Arizona : 1 Median :56.34 S :13 Median : 4.569 Median :281.2
## Arkansas : 1 Mean :57.85 W :13 Mean : 6.501 Mean :281.2
## California: 1 3rd Qu.:64.75 3rd Qu.: 7.309 3rd Qu.:285.7
## Colorado : 1 Max. :78.92 Max. :39.537 Max. :294.5
## (Other) :44
## HighSchool College IQ GSP
## Min. :86.40 Min. :22.50 Min. : 94.20 Min. :34.03
## 1st Qu.:89.03 1st Qu.:27.98 1st Qu.: 98.47 1st Qu.:46.26
## Median :91.35 Median :32.65 Median :100.85 Median :51.23
## Mean :90.76 Mean :33.08 Mean :100.34 Mean :52.89
## 3rd Qu.:92.47 3rd Qu.:37.65 3rd Qu.:102.70 3rd Qu.:59.06
## Max. :95.00 Max. :50.90 Max. :104.30 Max. :73.53
##
## Vegetables Fruit Smokers PhysicalActivity
## Min. :76.10 Min. :53.70 Min. : 8.90 Min. :41.90
## 1st Qu.:80.62 1st Qu.:61.62 1st Qu.:15.03 1st Qu.:47.10
## Median :81.95 Median :63.30 Median :17.15 Median :50.40
## Mean :81.87 Mean :63.32 Mean :17.33 Mean :50.58
## 3rd Qu.:83.17 3rd Qu.:66.65 3rd Qu.:19.30 3rd Qu.:53.98
## Max. :87.60 Max. :70.30 Max. :26.00 Max. :59.70
##
## Obese NonWhite HeavyDrinkers Electoral
## Min. :23.00 Min. : 5.40 Min. :3.750 Min. : 3.00
## 1st Qu.:28.77 1st Qu.:14.35 1st Qu.:5.827 1st Qu.: 5.00
## Median :30.90 Median :21.75 Median :6.440 Median : 8.00
## Mean :31.43 Mean :23.05 Mean :6.483 Mean :10.70
## 3rd Qu.:34.38 3rd Qu.:31.45 3rd Qu.:7.322 3rd Qu.:11.75
## Max. :39.50 Max. :74.90 Max. :8.770 Max. :55.00
##
## ClintonVote Elect2016 TwoParents StudentSpending Insured
## Min. :21.63 D:20 Min. :54.10 Min. : 6.953 Min. :75.20
## 1st Qu.:35.83 R:30 1st Qu.:63.30 1st Qu.: 9.604 1st Qu.:83.42
## Median :45.91 Median :66.10 Median :11.319 Median :86.20
## Mean :43.78 Mean :66.53 Mean :11.950 Mean :86.24
## 3rd Qu.:49.98 3rd Qu.:70.42 3rd Qu.:14.072 3rd Qu.:89.67
## Max. :62.22 Max. :80.60 Max. :22.366 Max. :95.80
##
str(States) # Structure of the data
## 'data.frame': 50 obs. of 22 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ HouseholdIncome : num 46.5 76.1 53.5 43.8 67.2 ...
## $ Region : Factor w/ 4 levels "MW","NE","S",..: 3 4 4 3 4 4 2 2 3 3 ...
## $ Population : num 4.88 0.74 7.02 3 39.54 ...
## $ EighthGradeMath : num 269 274 280 274 276 ...
## $ HighSchool : num 87.1 92.8 87.1 89.1 87.4 91.5 92.4 89.2 89.4 87.5 ...
## $ College : num 26 26.5 27.4 24.7 34.5 39.6 42.7 33.4 28.8 30.9 ...
## $ IQ : num 95.7 99 97.4 97.5 95.5 ...
## $ GSP : num 40.3 70.9 43.1 38.5 67.7 ...
## $ Vegetables : num 80.7 81 79.2 80.7 78.6 82.6 83.1 82.8 80.6 81.9 ...
## $ Fruit : num 55.1 63.1 62.8 55.3 67.5 67 68.5 64.6 65.6 61.4 ...
## $ Smokers : num 20.9 21 15.6 22.3 11.3 14.6 12.7 17 16.1 17.5 ...
## $ PhysicalActivity: num 42.8 58.3 52.7 45.4 57.5 58.7 51.9 46.3 49.5 46.1 ...
## $ Obese : num 36.2 29.5 29.5 37.1 25.8 23 27.4 33.5 30.7 32.5 ...
## $ NonWhite : num 31.6 34.7 22.5 22.7 39.4 15.8 23.3 30.9 24.3 40.6 ...
## $ HeavyDrinkers : num 5.45 7.33 5.57 5.32 5.95 7.3 6.45 6.4 7.61 5.82 ...
## $ Electoral : int 9 3 11 6 55 9 7 3 29 16 ...
## $ ClintonVote : num 34.4 36.5 45.1 33.6 61.7 ...
## $ Elect2016 : Factor w/ 2 levels "D","R": 2 2 2 2 1 1 1 1 2 2 ...
## $ TwoParents : num 60.9 71.5 62.7 63.3 66.8 71.9 66.5 62.6 61 62.1 ...
## $ StudentSpending : num 9.24 17.51 7.61 9.85 11.49 ...
## $ Insured : num 83.7 80.2 83.4 84.1 85.2 87.2 91.1 90.7 78.2 79.3 ...
# Create a binary variable indicating whether a state voted for Clinton (1) or not (0)
# Here, we assume that ClintonVote represents the percentage of votes Clinton received in each state
# Set the threshold at 50% for a state voting for Clinton
States$ClintonVoteBinary <- ifelse(States$ClintonVote >= 50, 1, 0)
# Check the distribution of the binary variable
table(States$ClintonVoteBinary)
##
## 0 1
## 37 13
# Check the distribution of independent variables
hist(States$College, main="College Distribution", xlab="College") # Histogram of College variable
hist(States$HouseholdIncome, main="HouseholdIncome Distribution", xlab="HouseholdIncome") # Histogram of HouseholdIncome variable
table(States$Region) # Distribution of Region variable
##
## MW NE S W
## 13 11 13 13
# Perform logistic regression analysis
model4 <- glm(ClintonVoteBinary ~ College + HouseholdIncome + Region, data=States, family=binomial)
# Summary of the model
summary(model4)
##
## Call:
## glm(formula = ClintonVoteBinary ~ College + HouseholdIncome +
## Region, family = binomial, data = States)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.16240 6.31603 -2.717 0.00658 **
## College 0.28494 0.14633 1.947 0.05150 .
## HouseholdIncome 0.07628 0.07376 1.034 0.30110
## RegionNE 2.17592 1.40578 1.548 0.12166
## RegionS -16.31016 2420.64651 -0.007 0.99462
## RegionW 2.58895 1.62430 1.594 0.11096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57.306 on 49 degrees of freedom
## Residual deviance: 25.780 on 44 degrees of freedom
## AIC: 37.78
##
## Number of Fisher Scoring iterations: 18
# Task 5: Visualize predicted probabilities
# Visualize predicted probabilities using the effects package
all_effects <- allEffects(model4)
plot(all_effects)
# Visualize predicted probabilities using the sjPlot package
plot_model(model4, type = "pred", terms = "College [all]")
plot_model(model4, type = "pred", terms = "HouseholdIncome [all]")
plot_model(model4, type = "pred", terms = "Region")
# Visualize odds ratios using plot_model (without log transformation)
plot_model(model4, type = "est", transform = NULL)
# Additional plots for predicted probabilities
plot_model(model4, type = "pred", terms = c("College", "Region [all]"))
## Data were 'prettified'. Consider using `terms="College [all]"` to get
## smooth plots.
plot_model(model4, type = "pred", terms = c("HouseholdIncome", "Region [all]"))
## Data were 'prettified'. Consider using `terms="HouseholdIncome [all]"`
## to get smooth plots.