Task 1

# Load data
data(USStates)
States <- USStates

# Multiple linear regression analysis
model1 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data=States)

# Generate regression table using sjPlot package
library(sjPlot)
tab_model(model1)
  ClintonVote
Predictors Estimates CI p
(Intercept) 0.99 -11.11 – 13.10 0.870
College 1.12 0.67 – 1.58 <0.001
HouseholdIncome -0.08 -0.38 – 0.23 0.625
NonWhite 0.43 0.27 – 0.60 <0.001
Observations 50
R2 / R2 adjusted 0.616 / 0.590
# Interpretation of results
summary(model1)
## 
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite, 
##     data = States)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.697  -3.605  -1.036   4.495  13.686 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.99082    6.01413   0.165    0.870    
## College          1.12498    0.22514   4.997 8.88e-06 ***
## HouseholdIncome -0.07554    0.15345  -0.492    0.625    
## NonWhite         0.43123    0.08246   5.230 4.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.622 on 46 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.5904 
## F-statistic: 24.55 on 3 and 46 DF,  p-value: 1.237e-09

College:A 1% increase in college graduation rate is linked to a 1.12% increase in ClintonVote.

HouseholdIncome:Household income doesn’t significantly affect ClintonVote.

NonWhite:A 1% increase in the non-white population is linked to a 0.43% increase in ClintonVote.

Model Fit:This model explains 61.6% of the variance in ClintonVote.

Task 2

# Load required libraries
library(modelsummary)
library(sjPlot)

# Existing model (from Task 1)
model1 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite, data=States)

# New model (adding the Region variable)
model2 <- lm(ClintonVote ~ College + HouseholdIncome + NonWhite + Region, data=States)

# Compare models using AIC and BIC
aic1 <- AIC(model1)
aic2 <- AIC(model2)
bic1 <- BIC(model1)
bic2 <- BIC(model2)

# Print AIC and BIC results
aic1; aic2
## [1] 336.7667
## [1] 326.4857
bic1; bic2
## [1] 346.3268
## [1] 341.7819
# Generate regression tables using modelsummary package
msummary(list("Model 1" = model1, "Model 2" = model2))
tinytable_z0wpye9627n290f9qjc8
Model 1 Model 2
(Intercept) 0.991 14.806
(6.014) (7.159)
College 1.125 1.189
(0.225) (0.257)
HouseholdIncome -0.076 -0.429
(0.153) (0.171)
NonWhite 0.431 0.525
(0.082) (0.082)
RegionNE 8.031
(2.721)
RegionS -3.404
(2.742)
RegionW 5.658
(2.772)
Num.Obs. 50 50
R2 0.616 0.722
R2 Adj. 0.590 0.684
AIC 336.8 326.5
BIC 346.3 341.8
Log.Lik. -163.383 -155.243
RMSE 6.35 5.40
# Generate regression tables using sjPlot package
tab_model(model1, model2)
  ClintonVote ClintonVote
Predictors Estimates CI p Estimates CI p
(Intercept) 0.99 -11.11 – 13.10 0.870 14.81 0.37 – 29.24 0.045
College 1.12 0.67 – 1.58 <0.001 1.19 0.67 – 1.71 <0.001
HouseholdIncome -0.08 -0.38 – 0.23 0.625 -0.43 -0.77 – -0.08 0.016
NonWhite 0.43 0.27 – 0.60 <0.001 0.52 0.36 – 0.69 <0.001
Region [NE] 8.03 2.54 – 13.52 0.005
Region [S] -3.40 -8.93 – 2.13 0.221
Region [W] 5.66 0.07 – 11.25 0.047
Observations 50 50
R2 / R2 adjusted 0.616 / 0.590 0.722 / 0.684
# Interpretation of results
summary(model1)
## 
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite, 
##     data = States)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.697  -3.605  -1.036   4.495  13.686 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.99082    6.01413   0.165    0.870    
## College          1.12498    0.22514   4.997 8.88e-06 ***
## HouseholdIncome -0.07554    0.15345  -0.492    0.625    
## NonWhite         0.43123    0.08246   5.230 4.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.622 on 46 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.5904 
## F-statistic: 24.55 on 3 and 46 DF,  p-value: 1.237e-09
summary(model2)
## 
## Call:
## lm(formula = ClintonVote ~ College + HouseholdIncome + NonWhite + 
##     Region, data = States)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3217  -4.1695   0.0435   4.2880  11.2477 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     14.80589    7.15877   2.068   0.0447 *  
## College          1.18862    0.25668   4.631 3.36e-05 ***
## HouseholdIncome -0.42863    0.17107  -2.506   0.0161 *  
## NonWhite         0.52461    0.08175   6.417 9.03e-08 ***
## RegionNE         8.03052    2.72062   2.952   0.0051 ** 
## RegionS         -3.40381    2.74204  -1.241   0.2212    
## RegionW          5.65798    2.77249   2.041   0.0474 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.82 on 43 degrees of freedom
## Multiple R-squared:  0.7224, Adjusted R-squared:  0.6836 
## F-statistic: 18.65 on 6 and 43 DF,  p-value: 1.545e-10
  1. Model Comparison (AIC and BIC):
  1. Interpretation of Coefficients:
  1. Model Fit:

Task 3

# Fit the model including interaction
model3 <- lm(ClintonVote ~ College * NonWhite, data=States)

# Visualize the interaction effect using the effects package
library(effects)
plot(allEffects(model3))

# Interpretation of results
summary(model3)
## 
## Call:
## lm(formula = ClintonVote ~ College * NonWhite, data = States)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.446  -3.543  -0.295   3.936  13.831 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -15.22574   13.26349  -1.148 0.256927    
## College            1.50058    0.39913   3.760 0.000479 ***
## NonWhite           0.99725    0.47769   2.088 0.042397 *  
## College:NonWhite  -0.01802    0.01457  -1.236 0.222664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.532 on 46 degrees of freedom
## Multiple R-squared:  0.6259, Adjusted R-squared:  0.6015 
## F-statistic: 25.66 on 3 and 46 DF,  p-value: 6.636e-10
  1. Interaction Term (College):
  1. Main Effects:
  1. Model Fit:

Task 4

# Create a binary outcome variable indicating whether a state voted for Clinton (1) or not (0)
# Here, we assume that ClintonVote represents the percentage of votes Clinton received in each state
# Set the threshold at 50% for a state voting for Clinton
States$ClintonVoteBinary <- ifelse(States$ClintonVote >= 50, 1, 0)

# Verify the distribution of the binary outcome variable
table(States$ClintonVoteBinary)
## 
##  0  1 
## 37 13
# Perform a logistic regression with College, HouseholdIncome, and Region as predictors
model4 <- glm(ClintonVoteBinary ~ College + HouseholdIncome + Region, data=States, family=binomial)

# Generate regression tables using modelsummary and sjPlot packages
library(modelsummary)
modelsummary(model4)
tinytable_z94pp73q7dkfu0i1uiyo
(1)
(Intercept) -17.162
(6.316)
College 0.285
(0.146)
HouseholdIncome 0.076
(0.074)
RegionNE 2.176
(1.406)
RegionS -16.310
(2420.647)
RegionW 2.589
(1.624)
Num.Obs. 50
AIC 37.8
BIC 49.3
Log.Lik. -12.890
RMSE 0.29
library(sjPlot)
tab_model(model4)
  ClintonVoteBinary
Predictors Odds Ratios CI p
(Intercept) 0.00 0.00 – 0.00 0.007
College 1.33 1.03 – 1.87 0.052
HouseholdIncome 1.08 0.94 – 1.27 0.301
Region [NE] 8.81 0.67 – 245.88 0.122
Region [S] 0.00 NA – 257647072057664952864682004628268668026464282602266646868262204084428088660882608404066484.00 0.995
Region [W] 13.32 0.70 – 565.53 0.111
Observations 50
R2 Tjur 0.564
# Interpret the odds ratios and assess model fit
summary(model4)
## 
## Call:
## glm(formula = ClintonVoteBinary ~ College + HouseholdIncome + 
##     Region, family = binomial, data = States)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -17.16240    6.31603  -2.717  0.00658 **
## College            0.28494    0.14633   1.947  0.05150 . 
## HouseholdIncome    0.07628    0.07376   1.034  0.30110   
## RegionNE           2.17592    1.40578   1.548  0.12166   
## RegionS          -16.31016 2420.64651  -0.007  0.99462   
## RegionW            2.58895    1.62430   1.594  0.11096   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.306  on 49  degrees of freedom
## Residual deviance: 25.780  on 44  degrees of freedom
## AIC: 37.78
## 
## Number of Fisher Scoring iterations: 18

Task 5

# Load required libraries
library(effects)
library(sjPlot)

# Load and inspect the data
data(USStates)
States <- USStates
summary(States) # Summary of the data
##         State    HouseholdIncome Region    Population     EighthGradeMath
##  Alabama   : 1   Min.   :42.01   MW:13   Min.   : 0.579   Min.   :268.7  
##  Alaska    : 1   1st Qu.:50.91   NE:11   1st Qu.: 1.842   1st Qu.:276.6  
##  Arizona   : 1   Median :56.34   S :13   Median : 4.569   Median :281.2  
##  Arkansas  : 1   Mean   :57.85   W :13   Mean   : 6.501   Mean   :281.2  
##  California: 1   3rd Qu.:64.75           3rd Qu.: 7.309   3rd Qu.:285.7  
##  Colorado  : 1   Max.   :78.92           Max.   :39.537   Max.   :294.5  
##  (Other)   :44                                                           
##    HighSchool       College            IQ              GSP       
##  Min.   :86.40   Min.   :22.50   Min.   : 94.20   Min.   :34.03  
##  1st Qu.:89.03   1st Qu.:27.98   1st Qu.: 98.47   1st Qu.:46.26  
##  Median :91.35   Median :32.65   Median :100.85   Median :51.23  
##  Mean   :90.76   Mean   :33.08   Mean   :100.34   Mean   :52.89  
##  3rd Qu.:92.47   3rd Qu.:37.65   3rd Qu.:102.70   3rd Qu.:59.06  
##  Max.   :95.00   Max.   :50.90   Max.   :104.30   Max.   :73.53  
##                                                                  
##    Vegetables        Fruit          Smokers      PhysicalActivity
##  Min.   :76.10   Min.   :53.70   Min.   : 8.90   Min.   :41.90   
##  1st Qu.:80.62   1st Qu.:61.62   1st Qu.:15.03   1st Qu.:47.10   
##  Median :81.95   Median :63.30   Median :17.15   Median :50.40   
##  Mean   :81.87   Mean   :63.32   Mean   :17.33   Mean   :50.58   
##  3rd Qu.:83.17   3rd Qu.:66.65   3rd Qu.:19.30   3rd Qu.:53.98   
##  Max.   :87.60   Max.   :70.30   Max.   :26.00   Max.   :59.70   
##                                                                  
##      Obese          NonWhite     HeavyDrinkers     Electoral    
##  Min.   :23.00   Min.   : 5.40   Min.   :3.750   Min.   : 3.00  
##  1st Qu.:28.77   1st Qu.:14.35   1st Qu.:5.827   1st Qu.: 5.00  
##  Median :30.90   Median :21.75   Median :6.440   Median : 8.00  
##  Mean   :31.43   Mean   :23.05   Mean   :6.483   Mean   :10.70  
##  3rd Qu.:34.38   3rd Qu.:31.45   3rd Qu.:7.322   3rd Qu.:11.75  
##  Max.   :39.50   Max.   :74.90   Max.   :8.770   Max.   :55.00  
##                                                                 
##   ClintonVote    Elect2016   TwoParents    StudentSpending     Insured     
##  Min.   :21.63   D:20      Min.   :54.10   Min.   : 6.953   Min.   :75.20  
##  1st Qu.:35.83   R:30      1st Qu.:63.30   1st Qu.: 9.604   1st Qu.:83.42  
##  Median :45.91             Median :66.10   Median :11.319   Median :86.20  
##  Mean   :43.78             Mean   :66.53   Mean   :11.950   Mean   :86.24  
##  3rd Qu.:49.98             3rd Qu.:70.42   3rd Qu.:14.072   3rd Qu.:89.67  
##  Max.   :62.22             Max.   :80.60   Max.   :22.366   Max.   :95.80  
## 
str(States) # Structure of the data
## 'data.frame':    50 obs. of  22 variables:
##  $ State           : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ HouseholdIncome : num  46.5 76.1 53.5 43.8 67.2 ...
##  $ Region          : Factor w/ 4 levels "MW","NE","S",..: 3 4 4 3 4 4 2 2 3 3 ...
##  $ Population      : num  4.88 0.74 7.02 3 39.54 ...
##  $ EighthGradeMath : num  269 274 280 274 276 ...
##  $ HighSchool      : num  87.1 92.8 87.1 89.1 87.4 91.5 92.4 89.2 89.4 87.5 ...
##  $ College         : num  26 26.5 27.4 24.7 34.5 39.6 42.7 33.4 28.8 30.9 ...
##  $ IQ              : num  95.7 99 97.4 97.5 95.5 ...
##  $ GSP             : num  40.3 70.9 43.1 38.5 67.7 ...
##  $ Vegetables      : num  80.7 81 79.2 80.7 78.6 82.6 83.1 82.8 80.6 81.9 ...
##  $ Fruit           : num  55.1 63.1 62.8 55.3 67.5 67 68.5 64.6 65.6 61.4 ...
##  $ Smokers         : num  20.9 21 15.6 22.3 11.3 14.6 12.7 17 16.1 17.5 ...
##  $ PhysicalActivity: num  42.8 58.3 52.7 45.4 57.5 58.7 51.9 46.3 49.5 46.1 ...
##  $ Obese           : num  36.2 29.5 29.5 37.1 25.8 23 27.4 33.5 30.7 32.5 ...
##  $ NonWhite        : num  31.6 34.7 22.5 22.7 39.4 15.8 23.3 30.9 24.3 40.6 ...
##  $ HeavyDrinkers   : num  5.45 7.33 5.57 5.32 5.95 7.3 6.45 6.4 7.61 5.82 ...
##  $ Electoral       : int  9 3 11 6 55 9 7 3 29 16 ...
##  $ ClintonVote     : num  34.4 36.5 45.1 33.6 61.7 ...
##  $ Elect2016       : Factor w/ 2 levels "D","R": 2 2 2 2 1 1 1 1 2 2 ...
##  $ TwoParents      : num  60.9 71.5 62.7 63.3 66.8 71.9 66.5 62.6 61 62.1 ...
##  $ StudentSpending : num  9.24 17.51 7.61 9.85 11.49 ...
##  $ Insured         : num  83.7 80.2 83.4 84.1 85.2 87.2 91.1 90.7 78.2 79.3 ...
# Create a binary variable indicating whether a state voted for Clinton (1) or not (0)
# Here, we assume that ClintonVote represents the percentage of votes Clinton received in each state
# Set the threshold at 50% for a state voting for Clinton
States$ClintonVoteBinary <- ifelse(States$ClintonVote >= 50, 1, 0)

# Check the distribution of the binary variable
table(States$ClintonVoteBinary)
## 
##  0  1 
## 37 13
# Check the distribution of independent variables
hist(States$College, main="College Distribution", xlab="College") # Histogram of College variable

hist(States$HouseholdIncome, main="HouseholdIncome Distribution", xlab="HouseholdIncome") # Histogram of HouseholdIncome variable

table(States$Region) # Distribution of Region variable
## 
## MW NE  S  W 
## 13 11 13 13
# Perform logistic regression analysis
model4 <- glm(ClintonVoteBinary ~ College + HouseholdIncome + Region, data=States, family=binomial)

# Summary of the model
summary(model4)
## 
## Call:
## glm(formula = ClintonVoteBinary ~ College + HouseholdIncome + 
##     Region, family = binomial, data = States)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -17.16240    6.31603  -2.717  0.00658 **
## College            0.28494    0.14633   1.947  0.05150 . 
## HouseholdIncome    0.07628    0.07376   1.034  0.30110   
## RegionNE           2.17592    1.40578   1.548  0.12166   
## RegionS          -16.31016 2420.64651  -0.007  0.99462   
## RegionW            2.58895    1.62430   1.594  0.11096   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57.306  on 49  degrees of freedom
## Residual deviance: 25.780  on 44  degrees of freedom
## AIC: 37.78
## 
## Number of Fisher Scoring iterations: 18
# Task 5: Visualize predicted probabilities
# Visualize predicted probabilities using the effects package
all_effects <- allEffects(model4)
plot(all_effects)

# Visualize predicted probabilities using the sjPlot package
plot_model(model4, type = "pred", terms = "College [all]")

plot_model(model4, type = "pred", terms = "HouseholdIncome [all]")

plot_model(model4, type = "pred", terms = "Region")

# Visualize odds ratios using plot_model (without log transformation)
plot_model(model4, type = "est", transform = NULL)

# Additional plots for predicted probabilities
plot_model(model4, type = "pred", terms = c("College", "Region [all]"))
## Data were 'prettified'. Consider using `terms="College [all]"` to get
##   smooth plots.

plot_model(model4, type = "pred", terms = c("HouseholdIncome", "Region [all]"))
## Data were 'prettified'. Consider using `terms="HouseholdIncome [all]"`
##   to get smooth plots.