HW 4

# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "broom", "sjPlot", "ggplot2", "car", "Lock5Data", "mosaic", "rockchalk", "effects", "sjPlot") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
## 
## 载入需要的程序包：carData
## 
## 
## 载入程序包：'car'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## 
## 载入程序包：'mosaic'
## 
## 
## The following object is masked from 'package:Matrix':
## 
##     mean
## 
## 
## The following objects are masked from 'package:car':
## 
##     deltaMethod, logit
## 
## 
## The following object is masked from 'package:modelsummary':
## 
##     msummary
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## 
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     stat
## 
## 
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## 
## 
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## 
## 
## 
## 载入程序包：'rockchalk'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     summarize
## 
## 
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "fst"       "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "broom"        "modelsummary" "fst"          "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[6]]
##  [1] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
##  [6] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [11] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"        
## 
## [[7]]
##  [1] "car"          "carData"      "sjPlot"       "broom"        "modelsummary"
##  [6] "fst"          "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "Lock5Data"    "car"          "carData"      "sjPlot"       "broom"       
##  [6] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "mosaic"       "mosaicData"   "ggformula"    "Matrix"       "lattice"     
##  [6] "Lock5Data"    "car"          "carData"      "sjPlot"       "broom"       
## [11] "modelsummary" "fst"          "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "rockchalk"    "mosaic"       "mosaicData"   "ggformula"    "Matrix"      
##  [6] "lattice"      "Lock5Data"    "car"          "carData"      "sjPlot"      
## [11] "broom"        "modelsummary" "fst"          "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "effects"      "rockchalk"    "mosaic"       "mosaicData"   "ggformula"   
##  [6] "Matrix"       "lattice"      "Lock5Data"    "car"          "carData"     
## [11] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[12]]
##  [1] "effects"      "rockchalk"    "mosaic"       "mosaicData"   "ggformula"   
##  [6] "Matrix"       "lattice"      "Lock5Data"    "car"          "carData"     
## [11] "sjPlot"       "broom"        "modelsummary" "fst"          "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"

Task 1 Using the States dataset, conduct a multiple linear regression with ClintonVote as the dependent variable and College, HouseholdIncome, and NonWhite as independent variables. Create regression tables using the sjPlot package. Interpret the coefficients and assess the model fit.

# Load the States data
data(USStates)
States <- USStates

States$Clinton <- abs(as.numeric(States$Elect2016) - 2)
table(States$Clinton) # Clinton win is coded as 1, Trump win is coded as 0

## 
##  0  1 
## 30 20

# Examine the data
head(States[, c("State", "Clinton", "College", "ClintonVote", "HouseholdIncome", "NonWhite")])

##        State Clinton College ClintonVote HouseholdIncome NonWhite
## 1    Alabama       0    26.0       34.36          46.472     31.6
## 2     Alaska       0    26.5       36.55          76.114     34.7
## 3    Arizona       0    27.4       45.13          53.510     22.5
## 4   Arkansas       0    24.7       33.65          43.813     22.7
## 5 California       1    34.5       61.73          67.169     39.4
## 6   Colorado       1    39.6       48.16          65.458     15.8

log odds

m.logit <- glm(Clinton ~ College, data=States, family=binomial(link="logit"))
summary(m.logit)

## 
## Call:
## glm(formula = Clinton ~ College, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.25042    2.65182  -3.488 0.000486 ***
## College      0.26312    0.07759   3.391 0.000696 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 48.299  on 48  degrees of freedom
## AIC: 52.299
## 
## Number of Fisher Scoring iterations: 5

# Visualize the relationship
plot(x = States$College, y = States$Clinton, 
     xlab = "% Graduated College", ylab = "Voted for Clinton", 
     pch = 16, cex=0.7, xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$College, y = States$Clinton, labels = States$State, 
     pos = 3, cex = 0.4, srt=45)

# Add predicted probabilities
newdata <- data.frame(College=seq(min(States$College), max(States$College), length.out=100))
newdata$prob <- predict(m.logit, newdata=newdata, type="response")
lines(x=newdata$College, y=newdata$prob, col="red")

# scatter plot with annotated points
plot(x = States$College, y = States$Clinton, xlab = "% Graduated from College", ylab = "Voted for Clinton", pch = 16, cex=0.7,
     xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$College, y = States$Clinton, labels = States$State, pos = 3, cex = 0.4, srt=45) # srt for text angle (here, 45 degrees)

# now including a regression line
m <- lm(Clinton ~ College, data=States)
abline(a=coef(m)[1], b=coef(m)[2], col="red")

m.logit1 <- glm(Clinton ~ HouseholdIncome, data=States, family=binomial(link="logit"))
summary(m.logit1)

## 
## Call:
## glm(formula = Clinton ~ HouseholdIncome, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -11.43714    3.13017  -3.654 0.000258 ***
## HouseholdIncome   0.18889    0.05331   3.543 0.000396 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 45.255  on 48  degrees of freedom
## AIC: 49.255
## 
## Number of Fisher Scoring iterations: 5

# scatter plot with annotated points
plot(x = States$HouseholdIncome, y = States$Clinton, xlab = "% Household Income", ylab = "Voted for Clinton", pch = 16, cex=0.7,
     xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$HouseholdIncome, y = States$Clinton, labels = States$State, pos = 3, cex = 0.4, srt=45) # srt for text angle (here, 45 degrees)

# now including a regression line
m <- lm(Clinton ~ HouseholdIncome, data=States)
abline(a=coef(m)[1], b=coef(m)[2], col="blue")

# Visualize the relationship
plot(x = States$HouseholdIncome, y = States$Clinton, 
     xlab = "% Household Income", ylab = "Voted for Clinton", 
     pch = 16, cex=0.7, xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$HouseholdIncome, y = States$Clinton, labels = States$State, 
     pos = 3, cex = 0.4, srt=45)

# Add predicted probabilities
newdata <- data.frame(HouseholdIncome=seq(min(States$HouseholdIncome), max(States$HouseholdIncome), length.out=100))
newdata$prob <- predict(m.logit1, newdata=newdata, type="response")
lines(x=newdata$HouseholdIncome, y=newdata$prob, col="blue")

m.logit2 <- glm(Clinton ~ NonWhite, data=States, family=binomial(link="logit"))
summary(m.logit2)

## 
## Call:
## glm(formula = Clinton ~ NonWhite, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -1.24432    0.65585  -1.897   0.0578 .
## NonWhite     0.03593    0.02500   1.437   0.1506  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 65.008  on 48  degrees of freedom
## AIC: 69.008
## 
## Number of Fisher Scoring iterations: 4

# scatter plot with annotated points
plot(x = States$NonWhite, y = States$Clinton, xlab = "% NonWhite", ylab = "Voted for Clinton", pch = 16, cex=0.7,
     xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$NonWhite, y = States$Clinton, labels = States$State, pos = 3, cex = 0.4, srt=45) # srt for text angle (here, 45 degrees)

# now including a regression line
m <- lm(Clinton ~ NonWhite, data=States)
abline(a=coef(m)[1], b=coef(m)[2], col="green")

# Visualize the relationship
plot(x = States$NonWhite, y = States$Clinton, 
     xlab = "% NonWhite", ylab = "Voted for Clinton", 
     pch = 16, cex=0.7, xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$NonWhite, y = States$Clinton, labels = States$State, 
     pos = 3, cex = 0.4, srt=45)

# Add predicted probabilities
newdata <- data.frame(NonWhite=seq(min(States$NonWhite), max(States$NonWhite), length.out=100))
newdata$prob <- predict(m.logit2, newdata=newdata, type="response")
lines(x=newdata$NonWhite, y=newdata$prob, col="green")

Log-odds ratios (not recommended due to difficulty in interpretation):

coef(m.logit)

## (Intercept)     College 
##   -9.250416    0.263120

coef(m.logit1)

##     (Intercept) HouseholdIncome 
##     -11.4371426       0.1888949

coef(m.logit2)

## (Intercept)    NonWhite 
## -1.24431999  0.03593297

Odds ratios:

exp(coef(m.logit))

##  (Intercept)      College 
## 9.607172e-05 1.300983e+00

exp(coef(m.logit1))

##     (Intercept) HouseholdIncome 
##    1.078728e-05    1.207914e+00

exp(coef(m.logit2))

## (Intercept)    NonWhite 
##   0.2881368   1.0365864

Intercept (College): The odds ratio of 0.00009607 represents the odds of voting for Clinton when the percentage of college graduates is 0%. This very small value indicates that the odds of voting for Clinton are extremely low in a hypothetical state with no college graduates.

College: For each 1 percentage point increase in college graduates, the odds of voting for Clinton increase by a factor of 1.301, or about 30.1%. This suggests a strong positive relationship between the percentage of college graduates and the likelihood of voting for Clinton.

Intercept (Household Income): The odds ratio of 0.00001078 represents the odds of voting for Clinton when the percentage of people’s household income is 0%. This very small value indicates that the odds of voting for Clinton are extremely low in a hypothetical state with people have 0 household income.

Household Income: For each 1 percentage point increase in Household Income, the odds of voting for Clinton increase by a factor of 1.207, or about 20.7%. This suggests a positive relationship between the percentage of Household Income and the likelihood of voting for Clinton.

Intercept (NonWhite): The odds ratio of 0.2881368 represents the odds of voting for Clinton when the percentage of Non White people is 0%. This very small value indicates that the odds of voting for Clinton are extremely low in a hypothetical state with no NonWhite.

NonWhite: For each 1 percentage point increase in NonWhite people, the odds of voting for Clinton increase by a factor of 1.036, or about 3.6%. This suggests a weak positive relationship between the percentage of NonWhites and the likelihood of voting for Clinton.

Partial derivatives:

# At probability level of 0.2:
est.slope <- coef(m.logit)[2]
prob.level <- 0.2
est.slope * prob.level * (1 - prob.level)

##   College 
## 0.0420992

# At probability level of 0.5:
prob.level <- 0.5
est.slope * prob.level * (1 - prob.level)

## College 
## 0.06578

# At probability level of 0.9:
prob.level <- 0.9
est.slope * prob.level * (1 - prob.level)

##   College 
## 0.0236808

These values show how the probability of voting for Clinton changes with a small increase in the percentage of college graduates, at different baseline probabilities:

At a baseline probability of 0.2, a 1 percentage point increase in college graduates is associated with a 0.0420992 (or 4.21 percentage point) increase in the probability of voting for Clinton. When the baseline probability is 0.5, a 1 percentage point increase in college graduates is associated with a 0.06578 (or 6.58 percentage point) increase in the probability of voting for Clinton. At a baseline probability of 0.9, a 1 percentage point increase in college graduates is associated with a 0.0236808 (or 2.37 percentage point) increase in the probability of voting for Clinton.

The effect is smaller at very low (0.2) or very high (0.9) probabilities, demonstrating the non-linear nature of logistic regression. The largest effect is observed at the middle probability (0.5), which is characteristic of the logistic curve’s S-shape.

# At probability level of 0.2:
est.slope <- coef(m.logit1)[2]
prob.level <- 0.2
est.slope * prob.level * (1 - prob.level)

## HouseholdIncome 
##      0.03022318

# At probability level of 0.5:
prob.level <- 0.5
est.slope * prob.level * (1 - prob.level)

## HouseholdIncome 
##      0.04722372

# At probability level of 0.9:
prob.level <- 0.9
est.slope * prob.level * (1 - prob.level)

## HouseholdIncome 
##      0.01700054

These values show how the probability of voting for Clinton changes with a small increase in the percentage of Household Incomes, at different baseline probabilities:

At a baseline probability of 0.2, a 1 percentage point increase in household income is associated with a 0.03022318 (or 3.02 percentage point) increase in the probability of voting for Clinton. When the baseline probability is 0.5, a 1 percentage point increase in household income is associated with a 0.04722372 (or 4.72 percentage point) increase in the probability of voting for Clinton. At a baseline probability of 0.9, a 1 percentage point increase in household income is associated with a 0.01700054 (or 1.70 percentage point) increase in the probability of voting for Clinton.

# At probability level of 0.2:
est.slope <- coef(m.logit2)[2]
prob.level <- 0.2
est.slope * prob.level * (1 - prob.level)

##    NonWhite 
## 0.005749276

# At probability level of 0.5:
prob.level <- 0.5
est.slope * prob.level * (1 - prob.level)

##    NonWhite 
## 0.008983244

# At probability level of 0.9:
prob.level <- 0.9
est.slope * prob.level * (1 - prob.level)

##    NonWhite 
## 0.003233968

These values show how the probability of voting for Clinton changes with a small increase in the percentage of NonWhite, at different baseline probabilities:

At a baseline probability of 0.2, a 1 percentage point increase in NonWhite is associated with a 0.005749276 (or 0.57 percentage point) increase in the probability of voting for Clinton. When the baseline probability is 0.5, a 1 percentage point increase in NonWhite is associated with a 0.008983244 (or 0.89 percentage point) increase in the probability of voting for Clinton. At a baseline probability of 0.9, a 1 percentage point increase in NonWhite is associated with a 0.003233968 (or 0.32 percentage point) increase in the probability of voting for Clinton.

Predicted probabilities: Looking directly at the intercept line

# For the intercept (College = 0):
x <- 0
intercept_value <- plogis(coef(m.logit)[1] + coef(m.logit)[2] * x)
format(intercept_value, scientific = FALSE)

##     (Intercept) 
## "0.00009606249"

# For different levels of College (e.g.20%, 40% or 60%):
x <- 20
plogis(coef(m.logit)[1] + coef(m.logit)[2] * x)

## (Intercept) 
##  0.01819912

x <- 40
plogis(coef(m.logit)[1] + coef(m.logit)[2] * x)

## (Intercept) 
##   0.7814925

x <- 60
plogis(coef(m.logit)[1] + coef(m.logit)[2] * x)

## (Intercept) 
##    0.998553

These values show the predicted probability of voting for Clinton at different percentages of college graduates:

In a hypothetical state with 0% college graduates, the probability of voting for Clinton is extremely low (0.0096%). At 20% college graduates, the probability increases to about 1.82%. At 40% college graduates, the probability jumps to 78.15%. At 60% college graduates, the probability is very high at 99.86%.

This demonstrates a strong, non-linear relationship between the percentage of college graduates and the likelihood of voting for Clinton. The effect is most pronounced in the middle range (around 40% college graduates) and less impactful at very low or very high percentages.

Predicted probabilities are often the easiest to interpret, showing how the probability of voting for Clinton changes as the percentage of college graduates increases. We can see that the relationship is not linear – there’s a dramatic increase in the probability of voting for Clinton as the percentage of college graduates rises from 20% to 40%, with the effect leveling off at higher percentages.

# For the intercept (Household Income = 0):
x <- 0
intercept_value <- plogis(coef(m.logit1)[1] + coef(m.logit1)[2] * x)
format(intercept_value, scientific = FALSE)

##     (Intercept) 
## "0.00001078717"

# For different levels of Household Income (e.g.20%, 40% or 60%):
x <- 20
plogis(coef(m.logit)[1] + coef(m.logit1)[2] * x)

## (Intercept) 
##  0.00418307

x <- 40
plogis(coef(m.logit)[1] + coef(m.logit1)[2] * x)

## (Intercept) 
##   0.1551691

x <- 60
plogis(coef(m.logit)[1] + coef(m.logit1)[2] * x)

## (Intercept) 
##   0.8892671

These values show the predicted probability of voting for Clinton at different percentages of household income:

In a hypothetical state with 0% household income, the probability of voting for Clinton is extremely low (0.0011%). At 20% household income, the probability increases to about 0.41%. At 40% household income, the probability jumps to 15.51%. At 60% household income, the probability is very high at 88.93%.

This demonstrates a strong, non-linear relationship between the percentage of household income and the likelihood of voting for Clinton. The effect is most pronounced in the middle range (around 40% household income) and less impactful at very low or very high percentages.

# For the intercept (NonWhite = 0):
x <- 0
intercept_value <- plogis(coef(m.logit2)[1] + coef(m.logit2)[2] * x)
format(intercept_value, scientific = FALSE)

## (Intercept) 
## "0.2236849"

# For different levels of NonWhite (e.g.20%, 40% or 60%):
x <- 20
plogis(coef(m.logit2)[1] + coef(m.logit1)[2] * x)

## (Intercept) 
##   0.9264625

x <- 40
plogis(coef(m.logit2)[1] + coef(m.logit1)[2] * x)

## (Intercept) 
##   0.9981879

x <- 60
plogis(coef(m.logit2)[1] + coef(m.logit1)[2] * x)

## (Intercept) 
##   0.9999585

These values show the predicted probability of voting for Clinton at different percentages of NonWhites:

In a hypothetical state with 0% NonWhites, the probability of voting for Clinton is moderate (22.37%). At 20% NonWhites, the probability increases to about 92.65%. At 40% NonWhites, the probability jumps to 99.82%. At 60% NonWhites, the probability is very high at 99.99%.

Task 2 Extend the model from Task 1 by adding the Region variable. Compare this new model to the previous one using AIC and BIC. Create a regression table with all models with modelsummary. Interpret the results and discuss which model you prefer and why.

m.logitmultiple <- glm(Clinton ~ College + HouseholdIncome + Region, data=States, family=binomial(link="logit"))
summary(m.logitmultiple)

## 
## Call:
## glm(formula = Clinton ~ College + HouseholdIncome + Region, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     -13.88304    4.57843  -3.032  0.00243 **
## College           0.25390    0.12980   1.956  0.05046 . 
## HouseholdIncome   0.05599    0.06642   0.843  0.39922   
## RegionNE          3.02242    1.41280   2.139  0.03241 * 
## RegionS           0.31259    1.58896   0.197  0.84404   
## RegionW           3.02137    1.43108   2.111  0.03475 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 32.014  on 44  degrees of freedom
## AIC: 44.014
## 
## Number of Fisher Scoring iterations: 6

# Specify the levels of 'Sex' you want to evaluate
xlevels.list <- list(Sex = c(0, 1))

# Compute all effects including the interaction
all_eff <- allEffects(m.logitmultiple, xlevels = xlevels.list)

# Plot the predicted probabilities
plot(all_eff, type = "response", multiline = TRUE)

# Plot just the interaction
eff <- Effect(c("College", "HouseholdIncome"), m.logitmultiple, xlevels = xlevels.list)
plot(eff, type = "response", multiline = TRUE)

# Visualize the results
library(rockchalk)

plotPlane(m.logitmultiple, plotx1 = "College", plotx2 = "HouseholdIncome", 
          phi=15, theta=40)

# Rotating the cube
plotPlane(m.logitmultiple, plotx1 = "College", plotx2 = "HouseholdIncome", 
          phi=15, theta=0)

plotPlane(m.logitmultiple, plotx1 = "HouseholdIncome", plotx2 = "College", 
          phi=15, theta=0)

# Log-odds ratios
coef(m.logitmultiple)

##     (Intercept)         College HouseholdIncome        RegionNE         RegionS 
##    -13.88304271      0.25389586      0.05599371      3.02242479      0.31259020 
##         RegionW 
##      3.02137322

# Odds ratios
odds_ratios <- exp(coef(m.logitmultiple))
knitr::kable(odds_ratios, col.names = "Odds Ratio", digits = 4)

	Odds Ratio
(Intercept)	0.0000
College	1.2890
HouseholdIncome	1.0576
RegionNE	20.5410
RegionS	1.3670
RegionW	20.5195

Interpretation of odds ratios:

Intercept: The odds ratio of 1.060e-06 represents the odds of voting for Clinton when both the percentage of college graduates and household income are 0. This extremely small value is not practically interpretable but serves as a baseline for the model.

College: For each 1 percentage point increase in college graduates, the odds of voting for Clinton increase by a factor of 1.1787, or about 17.87%, holding household income constant.

HouseholdIncome: For each 1 unit increase in household income (in thousands of dollars), the odds of voting for Clinton increase by a factor of 1.1425, or about 14.25%, holding the percentage of college graduates constant. These odds ratios indicate that both higher percentages of college graduates and higher household incomes are associated with increased odds of voting for Clinton, with the effect of college education being slightly stronger than that of household income.

# Partial derivatives (at probability level 0.7)
prob.level <- 0.7

# For College:
est.slope <- coef(m.logitmultiple)[2]
est.slope * prob.level * (1 - prob.level)

##    College 
## 0.05331813

# For Household Income:
est.slope <- coef(m.logitmultiple)[3]
est.slope * prob.level * (1 - prob.level)

## HouseholdIncome 
##      0.01175868

Partial derivatives (at probability level 0.7):

For College: 0.03453409

For Household Income: 0.0279843

These partial derivatives are calculated at a probability level of 0.7 and represent the change in probability of voting for Clinton for a one-unit change in the predictor, holding other variables constant:

For College: A 1 percentage point increase in college graduates is associated with a 3.45 percentage point increase in the probability of voting for Clinton.

For Household Income: A $1,000 increase in household income is associated with a 2.80 percentage point increase in the probability of voting for Clinton.

These results indicate that at this probability level (0.7), changes in the percentage of college graduates have a slightly larger effect on the probability of voting for Clinton compared to changes in household income. However, both variables show substantial positive effects on the likelihood of a Clinton vote.

data(m.logitmultiple)

## Warning in data(m.logitmultiple): 没有'm.logitmultiple'这个数据集

head(m.logitmultiple)

## $coefficients
##     (Intercept)         College HouseholdIncome        RegionNE         RegionS 
##    -13.88304271      0.25389586      0.05599371      3.02242479      0.31259020 
##         RegionW 
##      3.02137322 
## 
## $residuals
##         1         2         3         4         5         6         7         8 
## -1.012689 -2.137094 -1.403048 -1.007860  1.190371  1.057392  1.016375  1.316914 
##         9        10        11        12        13        14        15        16 
## -1.033070 -1.063375  1.292378 -1.233093  2.157696 -1.035285 -1.134411 -1.147237 
##        17        18        19        20        21        22        23        24 
## -1.013292 -1.014234  1.555151  1.016240  1.001998 -1.061796  2.023356 -1.004276 
##        25        26        27        28        29        30        31        32 
## -1.078687 -2.058397 -1.242438  8.729600  1.042388  1.010123 11.813126  1.020733 
##        33        34        35        36        37        38        39        40 
## -1.084092 -1.203823 -1.070921 -1.016468  1.411597 -8.401720  1.110201 -1.027242 
##        41        42        43        44        45        46        47        48 
## -1.059059 -1.048649 -1.060214 -3.710020  1.085783  1.569616  1.141067 -1.009281 
##        49        50 
## -1.150362 -1.676232 
## 
## $fitted.values
##           1           2           3           4           5           6 
## 0.012530001 0.532074973 0.287266079 0.007798675 0.840074093 0.945722691 
##           7           8           9          10          11          12 
## 0.983888948 0.759350797 0.032011544 0.059597663 0.773767600 0.189031310 
##          13          14          15          16          17          18 
## 0.463457364 0.034082219 0.118485560 0.128340286 0.013117178 0.014034048 
##          19          20          21          22          23          24 
## 0.643024236 0.984019080 0.998005902 0.058199942 0.494228338 0.004257627 
##          25          26          27          28          29          30 
## 0.072946666 0.514185081 0.195130677 0.114552787 0.959336084 0.989978190 
##          31          32          33          34          35          36 
## 0.084651597 0.979688046 0.077569068 0.169313223 0.066224504 0.016200943 
##          37          38          39          40          41          42 
## 0.708417666 0.880976751 0.900737773 0.026519664 0.055765574 0.046392331 
##          43          44          45          46          47          48 
## 0.056793760 0.730459704 0.920994193 0.637098675 0.876372686 0.009195999 
##          49          50 
## 0.130708470 0.403423733 
## 
## $effects
##     (Intercept)         College HouseholdIncome        RegionNE         RegionS 
##     0.563869224     1.745249989     1.804813606     1.625253083    -0.772512740 
##         RegionW                                                                 
##    -2.111248556    -0.006284866     0.553330679    -0.018589566    -0.086406588 
##                                                                                 
##     0.444838035    -0.200706390     0.716773031    -0.114240517    -0.387758833 
##                                                                                 
##    -0.416214502    -0.029944506     0.024759324     0.829836738    -0.006793780 
##                                                                                 
##    -0.035850320    -0.199750579     0.629276548     0.041944751    -0.248275696 
##                                                                                 
##    -0.934424348    -0.600455191     3.075225965     0.061410994    -0.022492701 
##                                                                                 
##     3.579597374     0.005642786    -0.129591128    -0.531290095    -0.226456197 
##                                                                                 
##     0.019730847     0.594068381    -2.824447411     0.214379849    -0.005004122 
##                                                                                 
##    -0.191578990    -0.053911471    -0.079807083    -1.710588599     0.163274688 
##                                                                                 
##     0.527473879     0.216541957     0.034968640    -0.423110964    -0.653669337 
## 
## $R
##                 (Intercept)   College HouseholdIncome    RegionNE     RegionS
## (Intercept)       -2.217287 -73.72000     -132.631606 -0.35255229 -0.26007120
## College            0.000000  11.01486        4.809687  0.21341270  0.04239286
## HouseholdIncome    0.000000   0.00000       16.580439 -0.06708556 -0.07531324
## RegionNE           0.000000   0.00000        0.000000  0.77934016 -0.13574085
## RegionS            0.000000   0.00000        0.000000  0.00000000  0.69506883
## RegionW            0.000000   0.00000        0.000000  0.00000000  0.00000000
##                    RegionW
## (Intercept)     -0.9691705
## College         -0.6549886
## HouseholdIncome  0.3649720
## RegionNE        -0.2276490
## RegionS         -0.3275942
## RegionW         -0.6987712
## 
## $rank
## [1] 6

str(m.logitmultiple)

## List of 30
##  $ coefficients     : Named num [1:6] -13.883 0.254 0.056 3.022 0.313 ...
##   ..- attr(*, "names")= chr [1:6] "(Intercept)" "College" "HouseholdIncome" "RegionNE" ...
##  $ residuals        : Named num [1:50] -1.01 -2.14 -1.4 -1.01 1.19 ...
##   ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
##  $ fitted.values    : Named num [1:50] 0.0125 0.5321 0.2873 0.0078 0.8401 ...
##   ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
##  $ effects          : Named num [1:50] 0.564 1.745 1.805 1.625 -0.773 ...
##   ..- attr(*, "names")= chr [1:50] "(Intercept)" "College" "HouseholdIncome" "RegionNE" ...
##  $ R                : num [1:6, 1:6] -2.22 0 0 0 0 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "(Intercept)" "College" "HouseholdIncome" "RegionNE" ...
##   .. ..$ : chr [1:6] "(Intercept)" "College" "HouseholdIncome" "RegionNE" ...
##  $ rank             : int 6
##  $ qr               :List of 5
##   ..$ qr   : num [1:50, 1:6] -2.2173 0.225 0.2041 0.0397 0.1653 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:50] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:6] "(Intercept)" "College" "HouseholdIncome" "RegionNE" ...
##   ..$ rank : int 6
##   ..$ qraux: num [1:6] 1.05 1.29 1.19 1 1.11 ...
##   ..$ pivot: int [1:6] 1 2 3 4 5 6
##   ..$ tol  : num 1e-11
##   ..- attr(*, "class")= chr "qr"
##  $ family           :List of 13
##   ..$ family    : chr "binomial"
##   ..$ link      : chr "logit"
##   ..$ linkfun   :function (mu)  
##   ..$ linkinv   :function (eta)  
##   ..$ variance  :function (mu)  
##   ..$ dev.resids:function (y, mu, wt)  
##   ..$ aic       :function (y, n, mu, wt, dev)  
##   ..$ mu.eta    :function (eta)  
##   ..$ initialize: language {     if (NCOL(y) == 1) { ...
##   ..$ validmu   :function (mu)  
##   ..$ valideta  :function (eta)  
##   ..$ simulate  :function (object, nsim)  
##   ..$ dispersion: num 1
##   ..- attr(*, "class")= chr "family"
##  $ linear.predictors: Named num [1:50] -4.367 0.128 -0.909 -4.846 1.659 ...
##   ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
##  $ deviance         : num 32
##  $ aic              : num 44
##  $ null.deviance    : num 67.3
##  $ iter             : int 6
##  $ weights          : Named num [1:50] 0.01237 0.24897 0.20474 0.00774 0.13435 ...
##   ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
##  $ prior.weights    : Named num [1:50] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
##  $ df.residual      : int 44
##  $ df.null          : int 49
##  $ y                : Named num [1:50] 0 0 0 0 1 1 1 1 0 0 ...
##   ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
##  $ converged        : logi TRUE
##  $ boundary         : logi FALSE
##  $ model            :'data.frame':   50 obs. of  4 variables:
##   ..$ Clinton        : num [1:50] 0 0 0 0 1 1 1 1 0 0 ...
##   ..$ College        : num [1:50] 26 26.5 27.4 24.7 34.5 39.6 42.7 33.4 28.8 30.9 ...
##   ..$ HouseholdIncome: num [1:50] 46.5 76.1 53.5 43.8 67.2 ...
##   ..$ Region         : Factor w/ 4 levels "MW","NE","S",..: 3 4 4 3 4 4 2 2 3 3 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language Clinton ~ College + HouseholdIncome + Region
##   .. .. ..- attr(*, "variables")= language list(Clinton, College, HouseholdIncome, Region)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "Clinton" "College" "HouseholdIncome" "Region"
##   .. .. .. .. ..$ : chr [1:3] "College" "HouseholdIncome" "Region"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "College" "HouseholdIncome" "Region"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(Clinton, College, HouseholdIncome, Region)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "factor"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "Clinton" "College" "HouseholdIncome" "Region"
##  $ call             : language glm(formula = Clinton ~ College + HouseholdIncome + Region, family = binomial(link = "logit"),      data = States)
##  $ formula          :Class 'formula'  language Clinton ~ College + HouseholdIncome + Region
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ terms            :Classes 'terms', 'formula'  language Clinton ~ College + HouseholdIncome + Region
##   .. ..- attr(*, "variables")= language list(Clinton, College, HouseholdIncome, Region)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "Clinton" "College" "HouseholdIncome" "Region"
##   .. .. .. ..$ : chr [1:3] "College" "HouseholdIncome" "Region"
##   .. ..- attr(*, "term.labels")= chr [1:3] "College" "HouseholdIncome" "Region"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Clinton, College, HouseholdIncome, Region)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:4] "Clinton" "College" "HouseholdIncome" "Region"
##  $ data             :'data.frame':   50 obs. of  23 variables:
##   ..$ State           : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ HouseholdIncome : num [1:50] 46.5 76.1 53.5 43.8 67.2 ...
##   ..$ Region          : Factor w/ 4 levels "MW","NE","S",..: 3 4 4 3 4 4 2 2 3 3 ...
##   ..$ Population      : num [1:50] 4.88 0.74 7.02 3 39.54 ...
##   ..$ EighthGradeMath : num [1:50] 269 274 280 274 276 ...
##   ..$ HighSchool      : num [1:50] 87.1 92.8 87.1 89.1 87.4 91.5 92.4 89.2 89.4 87.5 ...
##   ..$ College         : num [1:50] 26 26.5 27.4 24.7 34.5 39.6 42.7 33.4 28.8 30.9 ...
##   ..$ IQ              : num [1:50] 95.7 99 97.4 97.5 95.5 ...
##   ..$ GSP             : num [1:50] 40.3 70.9 43.1 38.5 67.7 ...
##   ..$ Vegetables      : num [1:50] 80.7 81 79.2 80.7 78.6 82.6 83.1 82.8 80.6 81.9 ...
##   ..$ Fruit           : num [1:50] 55.1 63.1 62.8 55.3 67.5 67 68.5 64.6 65.6 61.4 ...
##   ..$ Smokers         : num [1:50] 20.9 21 15.6 22.3 11.3 14.6 12.7 17 16.1 17.5 ...
##   ..$ PhysicalActivity: num [1:50] 42.8 58.3 52.7 45.4 57.5 58.7 51.9 46.3 49.5 46.1 ...
##   ..$ Obese           : num [1:50] 36.2 29.5 29.5 37.1 25.8 23 27.4 33.5 30.7 32.5 ...
##   ..$ NonWhite        : num [1:50] 31.6 34.7 22.5 22.7 39.4 15.8 23.3 30.9 24.3 40.6 ...
##   ..$ HeavyDrinkers   : num [1:50] 5.45 7.33 5.57 5.32 5.95 7.3 6.45 6.4 7.61 5.82 ...
##   ..$ Electoral       : int [1:50] 9 3 11 6 55 9 7 3 29 16 ...
##   ..$ ClintonVote     : num [1:50] 34.4 36.5 45.1 33.6 61.7 ...
##   ..$ Elect2016       : Factor w/ 2 levels "D","R": 2 2 2 2 1 1 1 1 2 2 ...
##   ..$ TwoParents      : num [1:50] 60.9 71.5 62.7 63.3 66.8 71.9 66.5 62.6 61 62.1 ...
##   ..$ StudentSpending : num [1:50] 9.24 17.51 7.61 9.85 11.49 ...
##   ..$ Insured         : num [1:50] 83.7 80.2 83.4 84.1 85.2 87.2 91.1 90.7 78.2 79.3 ...
##   ..$ Clinton         : num [1:50] 0 0 0 0 1 1 1 1 0 0 ...
##  $ offset           : NULL
##  $ control          :List of 3
##   ..$ epsilon: num 1e-08
##   ..$ maxit  : num 25
##   ..$ trace  : logi FALSE
##  $ method           : chr "glm.fit"
##  $ contrasts        :List of 1
##   ..$ Region: chr "contr.treatment"
##  $ xlevels          :List of 1
##   ..$ Region: chr [1:4] "MW" "NE" "S" "W"
##  - attr(*, "class")= chr [1:2] "glm" "lm"

AIC(m.logitmultiple, m.logit, m.logit1, m.logit2)

##                 df      AIC
## m.logitmultiple  6 44.01426
## m.logit          2 52.29914
## m.logit1         2 49.25457
## m.logit2         2 69.00812

BIC(m.logitmultiple, m.logit, m.logit1, m.logit2)

##                 df      BIC
## m.logitmultiple  6 55.48640
## m.logit          2 56.12318
## m.logit1         2 53.07862
## m.logit2         2 72.83217

The AIC values varied among models, indicating that each subsequent model provides a better fit to the data while accounting for model complexity. Model m.logit1 has the lowest AIC, suggesting it’s the best fit among these four models according to this criterion.

The BIC values also varied among models. Like AIC, BIC penalizes model complexity, but more severely. Even with this stricter penalty, m.logit1 still has the lowest BIC, further supporting it as the best model among the three.

Model Improvement: Both AIC and BIC consistently decrease as we add more predictors and interactions, indicating that the more complex models provide better fits to the data, even when penalizing for additional parameters.

Best Model: Model m.logit1, which includes all main effects and interactions, appears to be the best fit according to both AIC and BIC. This suggests that the interactions between education, year, and vocabulary provide meaningful improvements in predicting sex.

Trade-off: While m.logit1 is the best fit, it’s also the most complex. Depending on the research goals, one might choose m.logit as a compromise between fit and simplicity.

Caution: While these metrics suggest m.logit1 is the best model, it’s important to consider other factors such as theoretical relevance, parsimony, and practical significance of the additional terms.

Model Validation: It would be advisable to validate these models on a separate dataset or through cross-validation to ensure that the complex model (m.logit1) is not overfitting the data.

Task 3 Using the States dataset, fit a model predicting ClintonVote with College, NonWhite, and their interaction. Visualize the interaction effect using the effects package and interpret the results. How does the interaction change our understanding of the relationship between education and voting patterns?

NONm.logit <- glm(Clinton ~ College + NonWhite, data=States, family=binomial(link="logit"))
summary(NONm.logit)

## 
## Call:
## glm(formula = Clinton ~ College + NonWhite, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.22316    3.45322  -3.540 0.000401 ***
## College       0.30825    0.09138   3.373 0.000743 ***
## NonWhite      0.06606    0.03550   1.861 0.062804 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 43.480  on 47  degrees of freedom
## AIC: 49.48
## 
## Number of Fisher Scoring iterations: 5

# Visualize the results
library(rockchalk)

plotPlane(NONm.logit, plotx1 = "College", plotx2 = "NonWhite", 
          phi=15, theta=40)

# Rotating the cube
plotPlane(NONm.logit, plotx1 = "College", plotx2 = "NonWhite", 
          phi=15, theta=0)

# Log-odds ratios
coef(NONm.logit)

##  (Intercept)      College     NonWhite 
## -12.22315569   0.30824524   0.06605751

# Odds ratios
odds_ratios <- exp(coef(NONm.logit))
knitr::kable(odds_ratios, col.names = "Odds Ratio", digits = 4)

	Odds Ratio
(Intercept)	0.0000
College	1.3610
NonWhite	1.0683

Interpretation of odds ratios:

Intercept: The odds ratio of 1.060e-06 represents the odds of voting for Clinton when both the percentage of college graduates and NonWhites are 0. This extremely small value is not practically interpretable but serves as a baseline for the model.

College: For each 1 percentage point increase in college graduates, the odds of voting for Clinton increase by a factor of 1.3610, or about 13.61%, holding NonWhites constant.

HouseholdIncome: For each 1 unit increase in NonWhites (in thousands of dollars), the odds of voting for Clinton increase by a factor of 1.0683, or about 10.68%, holding the percentage of college graduates constant. These odds ratios indicate that both higher percentages of college graduates and higher household incomes are associated with increased odds of voting for Clinton, with the effect of college education being slightly stronger than that of NonWhites.

# Partial derivatives (at probability level 0.7)
prob.level <- 0.7

# For College:
est.slope <- coef(NONm.logit)[2]
est.slope * prob.level * (1 - prob.level)

##   College 
## 0.0647315

# For NonWhite:
est.slope <- coef(NONm.logit)[3]
est.slope * prob.level * (1 - prob.level)

##   NonWhite 
## 0.01387208

Partial derivatives (at probability level 0.7):

For College: 0.0647315

For NonWhite: 0.01387208

For College: A 1 percentage point increase in college graduates is associated with a 6.47 percentage point increase in the probability of voting for Clinton.

For NonWhite: A 1 increase in NonWhite is associated with a 1,38 percentage point increase in the probability of voting for Clinton.

These results indicate that at this probability level (0.7), changes in the percentage of college graduates have a slightly larger effect on the probability of voting for Clinton compared to changes in NonWhite. However, both variables show substantial positive effects on the likelihood of a Clinton vote.

# Predicted probabilities
mean_college <- mean(States$College, na.rm = TRUE)
mean_income <- mean(States$NonWhite, na.rm = TRUE)

college_seq <- seq(min(States$College), max(States$College), length.out=100)
income_seq <- seq(min(States$NonWhite), max(States$NonWhite), length.out=100)

data_college <- data.frame(College = college_seq, NonWhite = rep(mean_income, length(college_seq)))
data_income <- data.frame(College = rep(mean_college, length(income_seq)), NonWhite = income_seq)

prob_college <- predict(NONm.logit, newdata = data_college, type = "response")
prob_income <- predict(NONm.logit, newdata = data_income, type = "response")


# Plot predicted probabilities
par(mfrow=c(1,2))

plot(college_seq, prob_college, type = "l", col = "blue", ylim=c(0,1),
     xlab = "College", ylab = "Predicted Probability",
     main = "Predicted Probability vs College (Income at Mean)")

plot(income_seq, prob_income, type = "l", col = "red", ylim=c(0,1),
     xlab = "NonWhitee", ylab = "Predicted Probability",
     main = "Predicted Probability vs NonWhite (College at Mean)")

# Using effects package
plot(allEffects(NONm.logit), type="response")

eff <- Effect("College", NONm.logit)
plot(eff, type = "response")

Task 4 Using the States dataset, create a binary outcome variable indicating whether a state voted for Clinton (1) or not (0). Perform a logistic regression with this new variable as the dependent variable and College, HouseholdIncome, and Region as predictors. Create regression tables using both modelsummary and sjPlot. Interpret the odds ratios and assess model fit.

# Load the States data
data(USStates)
States <- USStates

States$Clinton <- abs(as.numeric(States$Elect2016) - 2)
table(States$Clinton) # Clinton win is coded as 1, Trump win is coded as 0

## 
##  0  1 
## 30 20

task4.logit <- glm(Clinton ~ College + HouseholdIncome + Region, data=States, family=binomial(link="logit"))
summary(task4.logit)

## 
## Call:
## glm(formula = Clinton ~ College + HouseholdIncome + Region, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     -13.88304    4.57843  -3.032  0.00243 **
## College           0.25390    0.12980   1.956  0.05046 . 
## HouseholdIncome   0.05599    0.06642   0.843  0.39922   
## RegionNE          3.02242    1.41280   2.139  0.03241 * 
## RegionS           0.31259    1.58896   0.197  0.84404   
## RegionW           3.02137    1.43108   2.111  0.03475 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 32.014  on 44  degrees of freedom
## AIC: 44.014
## 
## Number of Fisher Scoring iterations: 6

# Visualize the results
library(rockchalk)

plotPlane(task4.logit, plotx1 = "College", plotx2 = "HouseholdIncome", 
          phi=15, theta=40)

# Log-odds ratios
coef(task4.logit)

##     (Intercept)         College HouseholdIncome        RegionNE         RegionS 
##    -13.88304271      0.25389586      0.05599371      3.02242479      0.31259020 
##         RegionW 
##      3.02137322

# Odds ratios
odds_ratios <- exp(coef(task4.logit))
knitr::kable(odds_ratios, col.names = "Odds Ratio", digits = 4)

	Odds Ratio
(Intercept)	0.0000
College	1.2890
HouseholdIncome	1.0576
RegionNE	20.5410
RegionS	1.3670
RegionW	20.5195

# Partial derivatives (at probability level 0.7)
prob.level <- 0.7

# For College:
est.slope <- coef(task4.logit)[2]
est.slope * prob.level * (1 - prob.level)

##    College 
## 0.05331813

# For Household Income:
est.slope <- coef(task4.logit)[3]
est.slope * prob.level * (1 - prob.level)

## HouseholdIncome 
##      0.01175868

# For Region:
est.slope <- coef(task4.logit)[4]
est.slope * prob.level * (1 - prob.level)

##  RegionNE 
## 0.6347092

modelsummary(task4.logit)

tinytable_30ttt7lp82t05k33ocx6

	(1)
(Intercept)	-13.883
	(4.578)
College	0.254
	(0.130)
HouseholdIncome	0.056
	(0.066)
RegionNE	3.022
	(1.413)
RegionS	0.313
	(1.589)
RegionW	3.021
	(1.431)
Num.Obs.	50
AIC	44.0
BIC	55.5
Log.Lik.	-16.007
RMSE	0.32

library(sjPlot)

Task 5 Leveraging the same logistic regression model from Task 4, use the effects package to plot the predicted probabilities of voting for Clinton across the range of College values, separated by Region. Then, create a plot using sjPlot’s plot_model function to visualize the odds ratios of all predictors in the model.

# Load the States data
data(USStates)
States <- USStates

States$Clinton <- abs(as.numeric(States$Elect2016) - 2)
table(States$Clinton) # Clinton win is coded as 1, Trump win is coded as 0

## 
##  0  1 
## 30 20

Task5m.logit <- glm(Clinton ~ College + Region, data=States, family=binomial(link="logit"))
summary(Task5m.logit)

## 
## Call:
## glm(formula = Clinton ~ College + Region, family = binomial(link = "logit"), 
##     data = States)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -12.6386     4.1933  -3.014  0.00258 **
## College       0.3097     0.1132   2.737  0.00620 **
## RegionNE      3.1303     1.4014   2.234  0.02550 * 
## RegionS       0.4023     1.5266   0.264  0.79212   
## RegionW       3.5404     1.3146   2.693  0.00708 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67.301  on 49  degrees of freedom
## Residual deviance: 32.734  on 45  degrees of freedom
## AIC: 42.734
## 
## Number of Fisher Scoring iterations: 6

# scatter plot with annotated points
plot(x = States$College, y = States$Clinton, xlab = "% Graduated from College", ylab = "Voted for Clinton", pch = 16, cex=0.7,
     xlim=c(20,60), ylim=c(-0.3, 1.3))
text(x = States$College, y = States$Clinton, labels = States$State, pos = 3, cex = 0.4, srt=45) # srt for text angle (here, 45 degrees)

# now including a regression line
m <- lm(Clinton ~ College, data=States)
abline(a=coef(m)[1], b=coef(m)[2], col="red")

HW 4

Lyra Dong

2024-07-23