library(tidyverse)
library(tidycensus)
library(MASS) # glm.nb()
library(pscl) # zeroinfl()
library(lmtest) # lrtest()
library(modelsummary)
set.seed(08901)
We pull six variables from the 2022 ACS 5-year estimates for all U.S. counties. The outcome of interest is the foreign-born population count (B05002_013).
# One-time setup: census_api_key("YOUR_KEY", install = TRUE)
census_api_key("fc561cd65e1b44273f52a1e76620bc59253f8ba3", overwrite = TRUE)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
county_raw <- get_acs(
geography = "county",
variables = c(
foreign_born = "B05002_013", # foreign-born population
total_pop = "B01003_001", # total population (used as offset)
med_hhinc = "B19013_001", # median household income
in_poverty = "B17001_002", # population in poverty
bachelors = "B15003_022", # bachelor's degree holders
hispanic = "B03003_003"), # Hispanic/Latino population
year = 2022, output = "wide")
## Getting data from the 2018-2022 5-year ACS
data <- county_raw %>%
rename(
foreign_born = foreign_bornE,
total_pop = total_popE,
med_hhinc = med_hhincE,
in_poverty = in_povertyE,
bachelors = bachelorsE,
hispanic = hispanicE) %>%
# Remove very small counties and rows with missing/invalid values
filter(total_pop > 100,
!is.na(foreign_born), !is.na(med_hhinc), med_hhinc > 0,
!is.na(in_poverty), !is.na(bachelors), !is.na(hispanic)) %>%
mutate(
income_log = log(med_hhinc), # log-transform income
pct_poverty = 100 * in_poverty / total_pop, # % in poverty
pct_college = 100 * bachelors / total_pop, # % with bachelor's
pct_hispanic = 100 * hispanic / total_pop, # % Hispanic
state_fips = substr(GEOID, 1, 2),
region = factor(
case_when(
state_fips %in% c("09","23","25","33","44","50",
"34","36","42") ~ "Northeast",
state_fips %in% c("17","18","26","39","55",
"19","20","27","29","31",
"38","46") ~ "Midwest",
state_fips %in% c("10","11","12","13","24",
"37","45","51","54","01",
"21","28","47","05","22",
"40","48") ~ "South",
TRUE ~ "West"),
levels = c("South","Northeast","Midwest","West"))) # South = reference
Right-skewed distributions with excess zeros and large variance are poor fits for OLS.
# Log-scale histogram to reveal right-skewed shape
ggplot(aes(x = foreign_born + 1), data = data) +
geom_histogram(bins = 50, fill = "#3182bd", color = "white") +
scale_x_log10(labels = scales::comma) +
theme_classic() +
labs(y = "Number of counties",
x = "Foreign-born population (log scale)",
caption = "ACS 5-year estimates, U.S. counties, 2022")
# Key diagnostic: for a Poisson distribution, mean should ≈ variance
# A large Var/Mean ratio signals overdispersion
summarize(data,
mean = mean(foreign_born, na.rm = TRUE),
variance = var(foreign_born, na.rm = TRUE),
n = sum(!is.na(foreign_born)))
## # A tibble: 1 × 3
## mean variance n
## <dbl> <dbl> <int>
## 1 14090. 8132729470. 3220
# 14k vs 8.13b -- overdispersed
OLS can produce negative count predictions, a fundamental flaw for count outcomes.
data$log_fb1 <- log(data$foreign_born + 1) # log(y+1) avoids log(0)
ols.s <- lm(foreign_born ~ pct_poverty + pct_hispanic + pct_college,
data = data)
ols.fe <- lm(foreign_born ~ pct_poverty + pct_college +
pct_hispanic + income_log + region,
data = data)
ols.fe.log <- lm(log_fb1 ~ pct_poverty + pct_college +
pct_hispanic + income_log + region,
data = data)
# The raw OLS model predicts negative counts for some counties
min(predict(ols.fe, data))
## [1] -133408.9
# Log+1 avoids negatives but imposes a symmetric error structure on skewed data
min(exp(predict(ols.fe.log, data)) - 1)
## [1] -0.5608007
Poisson regression guarantees non-negative predictions via a log link. Exponentiate coefficients to get Incidence Rate Ratios (IRRs).
pois <- glm(foreign_born ~ pct_poverty + pct_college +
pct_hispanic + income_log + region,
data = data,
family = poisson(link = "log"))
summary(pois)
##
## Call:
## glm(formula = foreign_born ~ pct_poverty + pct_college + pct_hispanic +
## income_log + region, family = poisson(link = "log"), data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.284e+01 1.073e-02 -3991.3 <2e-16 ***
## pct_poverty 6.542e-02 4.605e-05 1420.7 <2e-16 ***
## pct_college 7.135e-02 3.477e-05 2052.1 <2e-16 ***
## pct_hispanic 4.614e-02 8.795e-06 5246.1 <2e-16 ***
## income_log 4.448e+00 9.332e-04 4767.0 <2e-16 ***
## regionNortheast 6.316e-01 4.298e-04 1469.5 <2e-16 ***
## regionMidwest -1.335e-01 5.474e-04 -243.9 <2e-16 ***
## regionWest 5.955e-02 3.842e-04 155.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 235722206 on 3219 degrees of freedom
## Residual deviance: 114569919 on 3212 degrees of freedom
## AIC: 114597277
##
## Number of Fisher Scoring iterations: 11
# Exponentiated coefficients = IRRs
# IRR > 1: predictor associated with more foreign-born residents
# IRR < 1: predictor associated with fewer foreign-born residents
exp(coef(pois))
## (Intercept) pct_poverty pct_college pct_hispanic income_log
## 2.475701e-19 1.067608e+00 1.073955e+00 1.047223e+00 8.549809e+01
## regionNortheast regionMidwest regionWest
## 1.880551e+00 8.750326e-01 1.061363e+00
Poisson assumes Var(y) = E(y). When variance far exceeds the mean, Poisson standard errors are too small and inference is unreliable.
cat("Mean: ", round(mean(data$foreign_born)), "\n")
## Mean: 14090
cat("Variance:", format(round(var(data$foreign_born)), big.mark = ","), "\n")
## Variance: 8,132,729,470
cat("Var/Mean:", round(var(data$foreign_born) / mean(data$foreign_born)), "\n")
## Var/Mean: 577189
# Rule of thumb: ratio >> 1 indicates overdispersion → use negative binomial
The negative binomial adds a dispersion parameter θ that absorbs extra variance. Small θ = heavy overdispersion; θ → ∞ collapses to Poisson.
nb <- glm.nb(
foreign_born ~ pct_poverty + pct_college +
pct_hispanic + income_log + region,
data = data)
## Warning: glm.fit: algorithm did not converge
# θ estimate and its SE
round(nb$theta, 3)
## [1] 0.531
round(nb$SE.theta, 3)
## [1] 0.011
# Likelihood ratio test: does NB fit significantly better than Poisson?
# p < 0.05 → overdispersion is real → prefer NB
library(lmtest)
lrt <- lrtest(pois, nb)
pval <- lrt$`Pr(>Chisq)`[2]
cat("LR test p-value:", round(pval, 4), "\n")
## LR test p-value: 0
An offset adjusts for differing exposure (here: county population size) so that coefficients reflect per-capita rates, not absolute counts.
# Without offset: predicts absolute foreign-born count (conflates pop size)
nb.count <- glm.nb(
foreign_born ~ pct_poverty + pct_college,
data = data)
# With offset: predicts per-capita foreign-born rate (controls for pop size)
# log(total_pop) is added with a fixed coefficient of 1
nb.rate <- glm.nb(
foreign_born ~ offset(log(total_pop)) +
pct_poverty + pct_college,
data = data)
# Compare IRRs — direction and magnitude often change with the offset
exp(coef(nb.count))
## (Intercept) pct_poverty pct_college
## 64.609491 1.055992 1.429777
exp(coef(nb.rate))
## (Intercept) pct_poverty pct_college
## 0.03235705 0.98184530 1.05735745
Use when zeros arise from two distinct processes: - Structural zeros: unit could never have a non-zero count - Sampling zeros: unit is at risk but happened to have zero this period
n_zero <- sum(data$foreign_born == 0)
pct_zero <- round(100 * mean(data$foreign_born == 0), 1)
cat("Counties with 0 foreign-born:", n_zero, "(", pct_zero, "% of counties)\n")
## Counties with 0 foreign-born: 17 ( 0.5 % of counties)
# How many zeros does the fitted Poisson model expect?
p_zero_pois <- mean(dpois(0, fitted(pois)))
cat("Poisson-expected % zeros:", round(100 * p_zero_pois, 1), "%\n")
## Poisson-expected % zeros: 0 %
# Observed >> expected → possible zero inflation
# Count component: predictors of how many foreign-born (conditional on being at risk)
# Zero component: predictors of being a structural zero (using | separator)
zip_f <- zeroinfl(
foreign_born ~ pct_poverty + pct_college + income_log | pct_hispanic,
data = data, dist = "poisson")
summary(zip_f)
##
## Call:
## zeroinfl(formula = foreign_born ~ pct_poverty + pct_college + income_log |
## pct_hispanic, data = data, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -14.589 -13.431 -11.946 -8.359 1096.685
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.284e+01 9.356e-03 -5648 <2e-16 ***
## pct_poverty 1.751e-01 2.987e-05 5862 <2e-16 ***
## pct_college 7.742e-02 3.702e-05 2091 <2e-16 ***
## income_log 5.301e+00 8.329e-04 6365 <2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.364226 0.294816 -18.195 <2e-16 ***
## pct_hispanic 0.007772 0.010060 0.773 0.44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 12
## Log-likelihood: -7.312e+07 on 6 Df
Lower AIC = better fit. AIC penalizes for additional parameters.
round(AIC(pois), 1) # Poisson
## [1] 114597277
round(AIC(nb), 1) # Negative Binomial
## [1] 55824.3
round(AIC(zip_f), 1) # Zero-Inflated Poisson
## [1] 146243136
Work through the following questions using the data
object created above.
foreign_born.summarize(data,
mean = mean(foreign_born, na.rm = TRUE),
variance = var(foreign_born, na.rm = TRUE),
n = sum(!is.na(foreign_born)))
var(data$foreign_born)/mean(data$foreign_born)
#this is indicative of overdispersion, as the ratio is greater than 10, and Poisson models cannot correct for that Therefore, we should use a negative binomial distribution instead as Poisson would not be appropriate; it would likely lead to inflated t-statistics and p-values
foreign_born using
pct_hispanic, pct_college, and
income_log.pct_hispanic in one sentence.pois1 <- glm(foreign_born ~ pct_hispanic + pct_college + income_log,
data = data,
family = poisson(link = "log"))
summary(pois1)
# Exponentiated coefficients = IRRs
# IRR > 1: predictor associated with more foreign-born residents
# IRR < 1: predictor associated with fewer foreign-born residents
exp(coef(pois1))
#interpretation: For every one percentage point increase in hispanic population across american counties, the model predicts a 5.4% increase in the foreign born population.
lrtest() comparing Poisson vs. NB.nb <- glm.nb(
foreign_born ~ pct_hispanic + pct_college + income_log,
data = data)
lts <- lrtest(pois, nb)
lts$'Pr(>Chisq)'[2]
lts <- lrtest(nb, pois)
lts$'Pr(>Chisq)'[2]
round(nb$theta, 3)
round(nb$SE.theta, 3)
# because the theta is so small (0.474) we should stick with a negative binomial regression and not switch to Poisson
offset(log(total_pop)).# With offset: predicts per-capita foreign-born rate (controls for pop size)
# log(total_pop) is added with a fixed coefficient of 1
nb2 <- glm.nb(
foreign_born ~ offset(log(total_pop)) +
pct_hispanic + pct_college + income_log,
data = data)
# Compare IRRs — direction and magnitude often change with the offset
exp(coef(nb))
exp(coef(nb2))
# directionality didn't change but magnitude did; huge drop for college and income IRRs
#Explanation: With the offset, all IRRs move closer to 0, as it changes from representing count to rate of change.
foreign_born == 0
and compare to Poisson-expected zeros.pct_poverty + pct_college + income_log in the count
component and pct_hispanic in the zero component.# Number of counties with foreign_born == 0
noforeign <- sum(data$foreign_born == 0, na.rm = TRUE)
# Number of counties with foreign_born > 0
foreign <- sum(data$foreign_born > 0, na.rm = TRUE)
noforeign/foreign
# only .5% of counties have no foreign born population
zip1 <- zeroinfl(
foreign_born ~ pct_poverty + pct_college + income_log | pct_hispanic,
data = data, dist = "poisson")
summary(zip1)
round(AIC(pois1), 1) # Poisson
round(AIC(nb2), 1) # Negative Binomial
round(AIC(zip1), 1) # Zero-Inflated Poisson
# the model that's closest to zero fits the best; in this case, it's the negative binomial regression.