library(tidyverse)
library(tidycensus)
library(MASS)       # glm.nb()
library(pscl)       # zeroinfl()
library(lmtest)     # lrtest()
library(modelsummary)

set.seed(08901)

1. Load ACS county-level data

We pull six variables from the 2022 ACS 5-year estimates for all U.S. counties. The outcome of interest is the foreign-born population count (B05002_013).

# One-time setup: census_api_key("YOUR_KEY", install = TRUE)

census_api_key("fc561cd65e1b44273f52a1e76620bc59253f8ba3", overwrite = TRUE) 
## To install your API key for use in future sessions, run this function with `install = TRUE`.
county_raw <- get_acs(
  geography = "county",
  variables = c(
    foreign_born = "B05002_013",   # foreign-born population
    total_pop    = "B01003_001",   # total population (used as offset)
    med_hhinc    = "B19013_001",   # median household income
    in_poverty   = "B17001_002",   # population in poverty
    bachelors    = "B15003_022",   # bachelor's degree holders
    hispanic     = "B03003_003"),  # Hispanic/Latino population
  year = 2022, output = "wide")
## Getting data from the 2018-2022 5-year ACS

2. Clean and create analysis variables

data <- county_raw %>%
  rename(
    foreign_born = foreign_bornE,
    total_pop    = total_popE,
    med_hhinc    = med_hhincE,
    in_poverty   = in_povertyE,
    bachelors    = bachelorsE,
    hispanic     = hispanicE) %>%
  # Remove very small counties and rows with missing/invalid values
  filter(total_pop > 100,
         !is.na(foreign_born), !is.na(med_hhinc), med_hhinc > 0,
         !is.na(in_poverty), !is.na(bachelors), !is.na(hispanic)) %>%
  mutate(
    income_log   = log(med_hhinc),                  # log-transform income
    pct_poverty  = 100 * in_poverty  / total_pop,   # % in poverty
    pct_college  = 100 * bachelors   / total_pop,   # % with bachelor's
    pct_hispanic = 100 * hispanic    / total_pop,   # % Hispanic
    state_fips   = substr(GEOID, 1, 2),
    region = factor(
      case_when(
        state_fips %in% c("09","23","25","33","44","50",
                          "34","36","42")             ~ "Northeast",
        state_fips %in% c("17","18","26","39","55",
                          "19","20","27","29","31",
                          "38","46")                  ~ "Midwest",
        state_fips %in% c("10","11","12","13","24",
                          "37","45","51","54","01",
                          "21","28","47","05","22",
                          "40","48")                  ~ "South",
        TRUE                                          ~ "West"),
      levels = c("South","Northeast","Midwest","West")))  # South = reference

3. Explore the outcome distribution

Right-skewed distributions with excess zeros and large variance are poor fits for OLS.

# Log-scale histogram to reveal right-skewed shape
ggplot(aes(x = foreign_born + 1), data = data) +
  geom_histogram(bins = 50, fill = "#3182bd", color = "white") +
  scale_x_log10(labels = scales::comma) +
  theme_classic() +
  labs(y = "Number of counties",
       x = "Foreign-born population (log scale)",
       caption = "ACS 5-year estimates, U.S. counties, 2022")

# Key diagnostic: for a Poisson distribution, mean should ≈ variance
# A large Var/Mean ratio signals overdispersion
summarize(data,
  mean     = mean(foreign_born, na.rm = TRUE),
  variance = var(foreign_born,  na.rm = TRUE),
  n        = sum(!is.na(foreign_born)))
## # A tibble: 1 × 3
##     mean    variance     n
##    <dbl>       <dbl> <int>
## 1 14090. 8132729470.  3220
# 14k vs 8.13b -- overdispersed 

4. OLS baseline models

OLS can produce negative count predictions, a fundamental flaw for count outcomes.

data$log_fb1 <- log(data$foreign_born + 1)   # log(y+1) avoids log(0)

ols.s      <- lm(foreign_born ~ pct_poverty + pct_hispanic + pct_college,
                 data = data)

ols.fe     <- lm(foreign_born ~ pct_poverty + pct_college +
                   pct_hispanic + income_log + region,
                 data = data)

ols.fe.log <- lm(log_fb1 ~ pct_poverty + pct_college +
                   pct_hispanic + income_log + region,
                 data = data)
# The raw OLS model predicts negative counts for some counties
min(predict(ols.fe, data))
## [1] -133408.9
# Log+1 avoids negatives but imposes a symmetric error structure on skewed data
min(exp(predict(ols.fe.log, data)) - 1)
## [1] -0.5608007

5. Poisson regression

Poisson regression guarantees non-negative predictions via a log link. Exponentiate coefficients to get Incidence Rate Ratios (IRRs).

pois <- glm(foreign_born ~ pct_poverty + pct_college +
              pct_hispanic + income_log + region,
            data   = data,
            family = poisson(link = "log"))

summary(pois)
## 
## Call:
## glm(formula = foreign_born ~ pct_poverty + pct_college + pct_hispanic + 
##     income_log + region, family = poisson(link = "log"), data = data)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.284e+01  1.073e-02 -3991.3   <2e-16 ***
## pct_poverty      6.542e-02  4.605e-05  1420.7   <2e-16 ***
## pct_college      7.135e-02  3.477e-05  2052.1   <2e-16 ***
## pct_hispanic     4.614e-02  8.795e-06  5246.1   <2e-16 ***
## income_log       4.448e+00  9.332e-04  4767.0   <2e-16 ***
## regionNortheast  6.316e-01  4.298e-04  1469.5   <2e-16 ***
## regionMidwest   -1.335e-01  5.474e-04  -243.9   <2e-16 ***
## regionWest       5.955e-02  3.842e-04   155.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 235722206  on 3219  degrees of freedom
## Residual deviance: 114569919  on 3212  degrees of freedom
## AIC: 114597277
## 
## Number of Fisher Scoring iterations: 11
# Exponentiated coefficients = IRRs
# IRR > 1: predictor associated with more foreign-born residents
# IRR < 1: predictor associated with fewer foreign-born residents
exp(coef(pois))
##     (Intercept)     pct_poverty     pct_college    pct_hispanic      income_log 
##    2.475701e-19    1.067608e+00    1.073955e+00    1.047223e+00    8.549809e+01 
## regionNortheast   regionMidwest      regionWest 
##    1.880551e+00    8.750326e-01    1.061363e+00

6. Overdispersion check

Poisson assumes Var(y) = E(y). When variance far exceeds the mean, Poisson standard errors are too small and inference is unreliable.

cat("Mean:    ", round(mean(data$foreign_born)), "\n")
## Mean:     14090
cat("Variance:", format(round(var(data$foreign_born)), big.mark = ","), "\n")
## Variance: 8,132,729,470
cat("Var/Mean:", round(var(data$foreign_born) / mean(data$foreign_born)), "\n")
## Var/Mean: 577189
# Rule of thumb: ratio >> 1 indicates overdispersion → use negative binomial

7. Negative binomial regression

The negative binomial adds a dispersion parameter θ that absorbs extra variance. Small θ = heavy overdispersion; θ → ∞ collapses to Poisson.

nb <- glm.nb(
  foreign_born ~ pct_poverty + pct_college +
    pct_hispanic + income_log + region,
  data = data)
## Warning: glm.fit: algorithm did not converge
# θ estimate and its SE
round(nb$theta,    3)
## [1] 0.531
round(nb$SE.theta, 3)
## [1] 0.011
# Likelihood ratio test: does NB fit significantly better than Poisson?
# p < 0.05 → overdispersion is real → prefer NB
library(lmtest)
lrt  <- lrtest(pois, nb)
pval <- lrt$`Pr(>Chisq)`[2]
cat("LR test p-value:", round(pval, 4), "\n")
## LR test p-value: 0

8. Offset: modeling rates instead of counts

An offset adjusts for differing exposure (here: county population size) so that coefficients reflect per-capita rates, not absolute counts.

# Without offset: predicts absolute foreign-born count (conflates pop size)
nb.count <- glm.nb(
  foreign_born ~ pct_poverty + pct_college,
  data = data)

# With offset: predicts per-capita foreign-born rate (controls for pop size)
# log(total_pop) is added with a fixed coefficient of 1
nb.rate <- glm.nb(
  foreign_born ~ offset(log(total_pop)) +
    pct_poverty + pct_college,
  data = data)

# Compare IRRs — direction and magnitude often change with the offset
exp(coef(nb.count))
## (Intercept) pct_poverty pct_college 
##   64.609491    1.055992    1.429777
exp(coef(nb.rate))
## (Intercept) pct_poverty pct_college 
##  0.03235705  0.98184530  1.05735745

9. Zero-inflated Poisson model

Use when zeros arise from two distinct processes: - Structural zeros: unit could never have a non-zero count - Sampling zeros: unit is at risk but happened to have zero this period

n_zero   <- sum(data$foreign_born == 0)
pct_zero <- round(100 * mean(data$foreign_born == 0), 1)
cat("Counties with 0 foreign-born:", n_zero, "(", pct_zero, "% of counties)\n")
## Counties with 0 foreign-born: 17 ( 0.5 % of counties)
# How many zeros does the fitted Poisson model expect?
p_zero_pois <- mean(dpois(0, fitted(pois)))
cat("Poisson-expected % zeros:", round(100 * p_zero_pois, 1), "%\n")
## Poisson-expected % zeros: 0 %
# Observed >> expected → possible zero inflation
# Count component: predictors of how many foreign-born (conditional on being at risk)
# Zero component:  predictors of being a structural zero (using | separator)
zip_f <- zeroinfl(
  foreign_born ~ pct_poverty + pct_college + income_log | pct_hispanic,
  data = data, dist = "poisson")

summary(zip_f)
## 
## Call:
## zeroinfl(formula = foreign_born ~ pct_poverty + pct_college + income_log | 
##     pct_hispanic, data = data, dist = "poisson")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
##  -14.589  -13.431  -11.946   -8.359 1096.685 
## 
## Count model coefficients (poisson with log link):
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.284e+01  9.356e-03   -5648   <2e-16 ***
## pct_poverty  1.751e-01  2.987e-05    5862   <2e-16 ***
## pct_college  7.742e-02  3.702e-05    2091   <2e-16 ***
## income_log   5.301e+00  8.329e-04    6365   <2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.364226   0.294816 -18.195   <2e-16 ***
## pct_hispanic  0.007772   0.010060   0.773     0.44    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 12 
## Log-likelihood: -7.312e+07 on 6 Df

10. Model comparison via AIC

Lower AIC = better fit. AIC penalizes for additional parameters.

round(AIC(pois),  1)   # Poisson
## [1] 114597277
round(AIC(nb),    1)   # Negative Binomial
## [1] 55824.3
round(AIC(zip_f), 1)   # Zero-Inflated Poisson
## [1] 146243136

Lab Exercises

Work through the following questions using the data object created above.

Exercise 1: Examine the outcome

  1. Calculate the mean, variance, and Var/Mean ratio for foreign_born.
  2. Does the ratio suggest Poisson is appropriate? Why or why not?
summarize(data,
  mean     = mean(foreign_born, na.rm = TRUE),
  variance = var(foreign_born,  na.rm = TRUE),
  n        = sum(!is.na(foreign_born)))

var(data$foreign_born)/mean(data$foreign_born)
#this is indicative of overdispersion, as the ratio is greater than 10, and Poisson models cannot correct for that Therefore, we should use a negative binomial distribution instead as Poisson would not be appropriate; it would likely lead to inflated t-statistics and p-values

Exercise 2: Fit and interpret a Poisson model

  1. Fit a Poisson model predicting foreign_born using pct_hispanic, pct_college, and income_log.
  2. Exponentiate the coefficients. Interpret the IRR for pct_hispanic in one sentence.
pois1 <- glm(foreign_born ~ pct_hispanic + pct_college + income_log,
            data   = data,
            family = poisson(link = "log"))

summary(pois1)

# Exponentiated coefficients = IRRs
# IRR > 1: predictor associated with more foreign-born residents
# IRR < 1: predictor associated with fewer foreign-born residents
exp(coef(pois1))

#interpretation: For every one percentage point increase in hispanic population across american counties, the model predicts a 5.4% increase in the foreign born population. 

Exercise 3: Test for overdispersion

  1. Fit the equivalent negative binomial model.
  2. Run lrtest() comparing Poisson vs. NB.
  3. Report θ and the LR test p-value. Should you use Poisson or NB here?
nb <- glm.nb(
  foreign_born ~ pct_hispanic + pct_college + income_log,
  data = data)

lts <- lrtest(pois, nb)
lts$'Pr(>Chisq)'[2]

lts <- lrtest(nb, pois)
lts$'Pr(>Chisq)'[2]

round(nb$theta,    3)
round(nb$SE.theta, 3)

# because the theta is so small (0.474) we should stick with a negative binomial regression and not switch to Poisson

Exercise 4: Add a population offset

  1. Re-fit the NB model from Exercise 3, adding offset(log(total_pop)).
  2. Compare IRRs with and without the offset.
  3. In one sentence, explain what the offset changes about the interpretation.
# With offset: predicts per-capita foreign-born rate (controls for pop size)
# log(total_pop) is added with a fixed coefficient of 1
nb2 <- glm.nb(
  foreign_born ~ offset(log(total_pop)) +
    pct_hispanic + pct_college + income_log,
  data = data)

# Compare IRRs — direction and magnitude often change with the offset
exp(coef(nb))
exp(coef(nb2))

# directionality didn't change but magnitude did; huge drop for college and income IRRs

#Explanation: With the offset, all IRRs move closer to 0, as it changes from representing count to rate of change. 

Exercise 5 (Challenge): Zero-inflated model

  1. Check what percent of counties have foreign_born == 0 and compare to Poisson-expected zeros.
  2. Fit a ZIP model using pct_poverty + pct_college + income_log in the count component and pct_hispanic in the zero component.
  3. Compare AIC across Poisson, NB, and ZIP. Which model fits best?
# Number of counties with foreign_born == 0
noforeign <- sum(data$foreign_born == 0, na.rm = TRUE)
# Number of counties with foreign_born > 0
foreign <- sum(data$foreign_born > 0, na.rm = TRUE)

noforeign/foreign
# only .5% of counties have no foreign born population


zip1 <- zeroinfl(
  foreign_born ~ pct_poverty + pct_college + income_log | pct_hispanic,
  data = data, dist = "poisson")

summary(zip1)


round(AIC(pois1),  1)   # Poisson
round(AIC(nb2),    1)   # Negative Binomial
round(AIC(zip1), 1)   # Zero-Inflated Poisson

# the model that's closest to zero fits the best; in this case, it's the negative binomial regression.