Init

library(kirkegaard)

## Loading required package: tidyverse

## ── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
## [32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
## [32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtibble   [39m 3.3.0
## [32m✔[39m [34mpurrr    [39m 1.2.0     [32m✔[39m [34mtidyr    [39m 1.3.1
## ── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
## [31m✖[39m [34me1071[39m::[32melement()[39m masks [34mggplot2[39m::element()
## [31m✖[39m [34mtidyr[39m::[32mexpand()[39m  masks [34mMatrix[39m::expand()
## [31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
## [31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()
## [31m✖[39m [34mtidyr[39m::[32mpack()[39m    masks [34mMatrix[39m::pack()
## [31m✖[39m [34mcar[39m::[32mrecode()[39m    masks [34mdplyr[39m::recode()
## [31m✖[39m [34mxgboost[39m::[32mslice()[39m masks [34mdplyr[39m::slice()
## [31m✖[39m [34mpurrr[39m::[32msome()[39m    masks [34mcar[39m::some()
## [31m✖[39m [34mtidyr[39m::[32munpack()[39m  masks [34mMatrix[39m::unpack()
## [36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:effectsize':
## 
##     phi
## 
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## 
## The following object is masked from 'package:modelsummary':
## 
##     SD
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:effectsize':
## 
##     standardize
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  dplyr,
  readr,
  ggplot2,
  fixest,
  modelsummary,
  effectsize,
  spatialreg,
  spdep,
  sf,
  tigris,
  car,
  gt
)

theme_set(theme_bw())

options(
    digits = 3,
    tigris_use_cache = TRUE
)

fig_dir <- "figs"
dir.create(fig_dir, showWarnings = FALSE)

# modelsummary formatting
gof_map_short <- list(
  list("raw" = "nobs", "clean" = "N", "fmt" = 0),
  list("raw" = "adj.r.squared", "clean" = "Adj. R2", "fmt" = 3),
  list("raw" = "r2.within", "clean" = "Within R2", "fmt" = 3)
)

coef_labels <- c(
  black_share = "Black %", hispanic_share = "Hispanic %",
  native_share = "Native %", asian_share = "Asian %",
  white_share = "White %",
  nh_black_share = "NH Black %", nh_native_share = "NH Native %",
  nh_asian_share = "NH Asian %",
  z_nh_black_share = "NH Black %", z_hispanic_share = "Hispanic %",
  z_nh_native_share = "NH Native %", z_nh_asian_share = "NH Asian %",
  z_poverty_rate = "Poverty rate", z_median_income = "Median income",
  z_gini = "Gini coefficient", z_employment_rate = "Employment rate",
  poverty_rate = "Poverty rate", median_income = "Median income ($10k)",
  gini = "Gini coefficient", employment_rate = "Employment rate",
  `(Intercept)` = "(Intercept)"
)

# pop-weighted z-score function
weighted_scale <- function(x, w) {
  mu <- weighted.mean(x, w, na.rm = TRUE)
  sigma <- sqrt(sum(w * (x - mu)^2, na.rm = TRUE) / sum(w[!is.na(x)]))
  (x - mu) / sigma
}

Data

# NH race shares (2020 census)
nh_2020 <- readRDS("data/county_nhrace_panel_1990_2020.rds") |>
  filter(year == 2020) |>
  select(GEOID, nh_black_share, nh_native_share, nh_asian_share, hispanic_share, total)

# homicide (2010-2020, state-imputed)
hom <- readRDS("data/county_homicide_cross_2010_2020_imputed.rds")

# ACS econ data
econ_acs <- readRDS("data/county_econ_acs.rds")

# ACS 2022 for cross-sectional standardized models
econ_2022 <- econ_acs |> filter(acs_year == 2022) |>
  transmute(GEOID,
            median_income = median_incomeE / 10000,
            poverty_rate = poverty_belowE / poverty_totalE,
            gini = giniE,
            employment_rate = employ_employedE / employ_totalE)

# ACS 2018 for model comparison / bivariate models
econ_2018 <- econ_acs |> filter(acs_year == 2018) |>
  transmute(GEOID,
            median_income = median_incomeE / 10000,
            poverty_rate = poverty_belowE / poverty_totalE,
            gini = giniE,
            employment_rate = employ_employedE / employ_totalE)

# cross-sectional for standardized models (ACS 2022)
cs <- nh_2020 |>
  inner_join(hom, by = "GEOID") |>
  inner_join(econ_2022, by = "GEOID") |>
  mutate(state_fips = substr(GEOID, 1, 2)) |>
  filter(!is.na(hom_rate), !is.na(poverty_rate))

# cross-sectional for bivariate/model comparison (ACS 2018)
cs_18 <- nh_2020 |>
  inner_join(hom, by = "GEOID") |>
  inner_join(econ_2018, by = "GEOID") |>
  mutate(state_fips = substr(GEOID, 1, 2)) |>
  filter(!is.na(hom_rate), !is.na(poverty_rate))

# longitudinal panel (1970-2020, 6 periods, homicide only)
panel <- readRDS("data/analysis_longitudinal_panel.rds")

# panel with NH race shares and multiple outcomes (1990-2020)
panel_nh <- readRDS("data/panel_1990_2020_all_outcomes.rds")

cat("Cross-sectional (ACS 2022):", nrow(cs), "counties\n")

## Cross-sectional (ACS 2022): 3135 counties

cat("Cross-sectional (ACS 2018):", nrow(cs_18), "counties\n")

## Cross-sectional (ACS 2018): 3140 counties

cat("Longitudinal panel:", nrow(panel), "rows,", n_distinct(panel$GEOID), "counties\n")

## Longitudinal panel: 19079 rows, 3248 counties

cat("NH panel 1990-2020:", nrow(panel_nh), "rows,", n_distinct(panel_nh$GEOID), "counties\n")

## NH panel 1990-2020: 12802 rows, 3233 counties

Analysis

Cross-sectional: standardized coefficients

# pop-weighted z-score all variables
cs_z <- cs |> mutate(across(
  c(nh_black_share, hispanic_share, nh_native_share, nh_asian_share,
    poverty_rate, median_income, gini, employment_rate, hom_rate),
  ~ weighted_scale(., total),
  .names = "z_{.col}"
))

# 3 models: demographic, socioeconomic, full (all pop-weighted)
m_demo <- feols(z_hom_rate ~ z_nh_black_share + z_hispanic_share + z_nh_native_share + z_nh_asian_share,
                data = cs_z, weights = ~total)
m_ses <- feols(z_hom_rate ~ z_poverty_rate + z_median_income + z_gini + z_employment_rate,
               data = cs_z, weights = ~total)

## NOTE: 1 observation removed because of NA values (RHS: 1).

m_full <- feols(z_hom_rate ~ z_nh_black_share + z_hispanic_share + z_nh_native_share + z_nh_asian_share +
                  z_poverty_rate + z_median_income + z_gini + z_employment_rate,
                data = cs_z, weights = ~total)

## NOTE: 1 observation removed because of NA values (RHS: 1).

ms_save(
  list("Demographic" = m_demo, "Socioeconomic" = m_ses, "Full" = m_full),
  "Cross-sectional OLS: Homicide rate (standardized coefficients)",
  "table_cs_standardized.png"
)

## file:////tmp/RtmpDLJUrO/file4c21f5ac16ed8.html screenshot completed

## Saved table_cs_standardized.png

Epsilon-squared variance decomposition

# full model for epsilon² (pop-weighted)
m_full_lm <- lm(z_hom_rate ~ z_nh_black_share + z_hispanic_share + z_nh_native_share + z_nh_asian_share +
                  z_poverty_rate + z_median_income + z_gini + z_employment_rate,
                data = cs_z, weights = total)

eps2 <- epsilon_squared(m_full_lm, alternative = "two.sided", ci = 0.95)

eps2_df <- data.frame(
  variable = c("NH Black %", "Hispanic %", "NH Native %", "NH Asian %",
               "Poverty rate", "Median income", "Gini coefficient", "Employment rate"),
  epsilon2 = eps2$Epsilon2_partial,
  ci_low = eps2$CI_low,
  ci_high = eps2$CI_high,
  type = c(rep("Demographic", 4), rep("Socioeconomic", 4))
) |> mutate(
  sqrt_eps = sqrt(epsilon2),
  sqrt_ci_low = sqrt(pmax(ci_low, 0)),
  sqrt_ci_high = sqrt(ci_high)
)

eps2_df$variable <- factor(eps2_df$variable, levels = rev(eps2_df$variable))

p_eps <- ggplot(eps2_df, aes(x = sqrt_eps, y = variable, fill = type)) +
  geom_col() +
  geom_errorbar(aes(xmin = sqrt_ci_low, xmax = sqrt_ci_high), width = 0.3, orientation = "y") +
  scale_fill_manual(values = c("Demographic" = "red", "Socioeconomic" = "steelblue")) +
  labs(x = expression(sqrt(epsilon^2) ~ "(partial, Type III)"),
       y = NULL, fill = NULL,
       title = "Effect sizes: Full model predicting county homicide rate") +
  theme(legend.position = "bottom")

ggsave(file.path(fig_dir, "barplot_epsilon.png"), p_eps, width = 9, height = 5, dpi = 150)
p_eps

OLS vs Spatial Error Model

# spatial weights for cross-sectional sample
counties_sf <- readRDS("data/spatial_weights.rds")$counties_sf
counties_proj <- st_transform(counties_sf, 5070)

# filter to complete cases and build weights
cs_z_complete <- cs_z |> filter(!is.na(z_employment_rate))
idx <- match(cs_z_complete$GEOID, counties_sf$GEOID)
cs_z_sp <- cs_z_complete[!is.na(idx), ]
counties_matched <- counties_proj[idx[!is.na(idx)], ]
centroids <- st_centroid(counties_matched)

## Warning: st_centroid assumes attributes are constant over geometries

coords <- st_coordinates(centroids)
W_cs <- nb2listw(knn2nb(knearneigh(coords, k = 5)), style = "W")

# OLS (weighted)
m_ols <- lm(z_hom_rate ~ z_nh_black_share + z_hispanic_share + z_nh_native_share + z_nh_asian_share +
              z_poverty_rate + z_median_income + z_gini + z_employment_rate,
            data = cs_z_sp, weights = total)

# spatial error model
m_sem <- errorsarlm(z_hom_rate ~ z_nh_black_share + z_hispanic_share + z_nh_native_share + z_nh_asian_share +
                      z_poverty_rate + z_median_income + z_gini + z_employment_rate,
                    data = cs_z_sp, listw = W_cs)

# build comparison table
ols_s <- summary(m_ols)$coefficients
sem_s <- summary(m_sem)$Coef
var_keys <- rownames(ols_s)[-1]
var_names <- c("NH Black %", "Hispanic %", "NH Native %", "NH Asian %",
               "Poverty rate", "Median income", "Gini coefficient", "Employment rate")

fmt_coef <- function(b, se, p) {
  stars <- ifelse(p < 0.001, "***", ifelse(p < 0.01, "**", ifelse(p < 0.05, "*", "")))
  sprintf("%.3f%s (%.3f)", b, stars, se)
}

comp_df <- data.frame(
  Variable = c(var_names, "Lambda (spatial)", "N", "AIC"),
  OLS = c(mapply(fmt_coef, ols_s[var_keys, 1], ols_s[var_keys, 2], ols_s[var_keys, 4]),
          "\u2014", as.character(nrow(cs_z_sp)), as.character(round(AIC(m_ols)))),
  `Spatial Error` = c(mapply(fmt_coef, sem_s[var_keys, 1], sem_s[var_keys, 2], sem_s[var_keys, 4]),
                      sprintf("%.3f*** (%.3f)", m_sem$lambda, m_sem$lambda.se),
                      as.character(nrow(cs_z_sp)), as.character(round(AIC(m_sem)))),
  check.names = FALSE
)

tab_spatial <- gt(comp_df) |>
  tab_header(title = "Cross-sectional models: Homicide rate (standardized coefficients)") |>
  tab_footnote("* p < 0.05, ** p < 0.01, *** p < 0.001. All variables pop-weighted z-scored. SE in parentheses.")

gtsave(tab_spatial, file.path(fig_dir, "table_ols_vs_spatial.png"))

## file:////tmp/RtmpDLJUrO/file4c21f654cb75f.html screenshot completed

Black share model comparison

# bivariate regressions: Black % as sole predictor of homicide rate
# all pop-weighted, hetero-robust SEs

# 1. OLS cross-sectional (ACS 2018, NH Black)
m_comp_ols <- feols(hom_rate ~ nh_black_share, data = cs_18, weights = ~total)

# 2. Spatial Error (ACS 2018, NH Black)
idx_18 <- match(cs_18$GEOID, counties_sf$GEOID)
cs_18_sp <- cs_18[!is.na(idx_18), ]
counties_18 <- counties_proj[idx_18[!is.na(idx_18)], ]
W_18 <- nb2listw(knn2nb(knearneigh(st_coordinates(st_centroid(counties_18)), k = 5)), style = "W")

## Warning: st_centroid assumes attributes are constant over geometries

m_comp_sem <- errorsarlm(hom_rate ~ nh_black_share, data = cs_18_sp, listw = W_18)

# 3. OLS + State FE (ACS 2018, NH Black)
m_comp_sfe <- feols(hom_rate ~ nh_black_share | state_fips, data = cs_18, weights = ~total)

## NOTE: 1 fixed-effect singleton was removed (1 observation).

# 4. Panel county+year FE (1990-2020, NH Black)
panel_hom_nh <- panel_nh |> filter(!is.na(hom_rate))
m_comp_panel_nh <- feols(hom_rate ~ nh_black_share | GEOID + year,
                         data = panel_hom_nh, weights = ~total)

## NOTE: 9/0 fixed-effect singletons were removed (9 observations).

# 5. Panel county+year FE (1970-2020, total Black)
m_comp_panel_70 <- feols(hom_rate ~ black_share | GEOID + census_year,
                         data = panel, weights = ~total)

## NOTES: 234 observations removed because of NA values (LHS: 234).
##        18/0 fixed-effect singletons were removed (18 observations).

# summary table
comp_models <- data.frame(
  Model = c("OLS", "Spatial Error", "OLS + State FE",
            "County + Year FE", "County + Year FE"),
  Design = c("CS", "CS", "CS", "Panel", "Panel"),
  Years = c("2010-2020", "2010-2020", "2010-2020", "1990-2020", "1970-2020"),
  `Black % coefficient` = c(
    sprintf("%.2f*** (%.2f)", coef(m_comp_ols)["nh_black_share"],
            se(m_comp_ols, se = "hetero")["nh_black_share"]),
    sprintf("%.2f*** (%.2f)", coef(m_comp_sem)["nh_black_share"],
            summary(m_comp_sem)$Coef["nh_black_share", 2]),
    sprintf("%.2f*** (%.2f)", coef(m_comp_sfe)["nh_black_share"],
            se(m_comp_sfe, se = "hetero")["nh_black_share"]),
    sprintf("%.2f*** (%.2f)", coef(m_comp_panel_nh)["nh_black_share"],
            se(m_comp_panel_nh, se = "hetero")["nh_black_share"]),
    sprintf("%.2f*** (%.2f)", coef(m_comp_panel_70)["black_share"],
            se(m_comp_panel_70, se = "hetero")["black_share"])
  ),
  N = c(m_comp_ols$nobs, nrow(cs_18_sp), m_comp_sfe$nobs,
        m_comp_panel_nh$nobs, m_comp_panel_70$nobs),
  check.names = FALSE
)

tab_comp <- gt(comp_models) |>
  tab_header(title = "Black share \u2192 Homicide rate: Model comparison") |>
  tab_footnote("* p < 0.05, ** p < 0.01, *** p < 0.001. Robust SE in parentheses. All models pop-weighted. CS models use NH Black %; 1970-2020 panel uses total Black % (NH unavailable pre-1990).")

gtsave(tab_comp, file.path(fig_dir, "table_black_model_comparison.png"))

## file:////tmp/RtmpDLJUrO/file4c21f7bb4ee44.html screenshot completed

Cross-sectional: multiple outcomes with state FE

# use panel_nh 2020 slice (has z-scored outcomes from within-year standardization)
p2_2020 <- panel_nh |> filter(year == 2020)

outcomes_5 <- c("poverty_rate_z", "median_income_z", "unemployment_rate_z", "pct_bachelors_z", "hom_rate_z")
outcome_labels_5 <- c("Poverty", "Income", "Unemployment", "Education", "Homicide")

state_fe_models <- lapply(outcomes_5, function(y) {
  f <- as.formula(paste(y, "~ nh_black_share + hispanic_share + nh_native_share + nh_asian_share | state_fips"))
  p2_2020$state_fips <- substr(p2_2020$GEOID, 1, 2)
  feols(f, data = p2_2020, weights = ~total)
})

## NOTES: 8 observations removed because of NA values (LHS: 8).
##        1 fixed-effect singleton was removed (1 observation).

## NOTES: 9 observations removed because of NA values (LHS: 9).
##        1 fixed-effect singleton was removed (1 observation).

## NOTES: 8 observations removed because of NA values (LHS: 8).
##        1 fixed-effect singleton was removed (1 observation).
## NOTES: 8 observations removed because of NA values (LHS: 8).
##        1 fixed-effect singleton was removed (1 observation).

## NOTES: 78 observations removed because of NA values (LHS: 78).
##        1 fixed-effect singleton was removed (1 observation).

names(state_fe_models) <- outcome_labels_5

ms_save(
  state_fe_models,
  "Cross-sectional + State FE (2020, z-scored outcomes)",
  "table_state_fe.png"
)

## file:////tmp/RtmpDLJUrO/file4c21f129c335b.html screenshot completed

## Saved table_state_fe.png

Panel models: multiple outcomes (1990-2020)

panel_fe_models <- lapply(outcomes_5, function(y) {
  f <- as.formula(paste(y, "~ nh_black_share + hispanic_share + nh_native_share + nh_asian_share | GEOID + year"))
  feols(f, data = panel_nh, weights = ~total, cluster = ~GEOID)
})

## NOTES: 87 observations removed because of NA values (LHS: 87).
##        9/0 fixed-effect singletons were removed (9 observations).

## NOTES: 88 observations removed because of NA values (LHS: 88).
##        9/0 fixed-effect singletons were removed (9 observations).

## NOTES: 87 observations removed because of NA values (LHS: 87).
##        9/0 fixed-effect singletons were removed (9 observations).
## NOTES: 87 observations removed because of NA values (LHS: 87).
##        9/0 fixed-effect singletons were removed (9 observations).

## NOTES: 234 observations removed because of NA values (LHS: 234).
##        9/0 fixed-effect singletons were removed (9 observations).

names(panel_fe_models) <- outcome_labels_5

ms_save(
  panel_fe_models,
  "County + Year FE (1990-2020, z-scored outcomes)",
  "table_fe_1990_2020.png"
)

## file:////tmp/RtmpDLJUrO/file4c21f73a30f24.html screenshot completed

## Saved table_fe_1990_2020.png

Scatter: Black % vs homicide rate

p_scatter <- ggplot(cs_18, aes(x = nh_black_share * 100, y = hom_rate)) +
  geom_point(alpha = 0.2, size = 0.8, color = "steelblue") +
  geom_smooth(method = "lm", color = "red", fill = "red", alpha = 0.2, linewidth = 0.8) +
  labs(x = "Black share (2020 Census)", y = "Homicide rate per 100k (2010-2020)",
       title = "Cross-sectional: Black share vs homicide rate") +
  scale_x_continuous(labels = function(x) paste0(x, "%")) +
  coord_cartesian(ylim = c(0, 50))

ggsave(file.path(fig_dir, "scatter_cs_black_hom.png"), p_scatter, width = 7, height = 6, dpi = 150)

## [37m`geom_smooth()` using formula = 'y ~ x'[39m

p_scatter

## [37m`geom_smooth()` using formula = 'y ~ x'[39m

Maps

# county homicide rate map (CONUS)
counties_map <- counties(year = 2020, cb = TRUE) |>
  filter(!STATEFP %in% c("02", "15", "72", "78", "66", "60", "69")) |>
  st_transform(5070)

map_d <- counties_map |> left_join(cs_18 |> select(GEOID, hom_rate), by = "GEOID")

p_hom_map <- ggplot(map_d) +
  geom_sf(aes(fill = hom_rate), color = NA) +
  scale_fill_viridis_c(option = "inferno", trans = "sqrt",
                       name = "Homicide rate\nper 100k",
                       breaks = c(0, 5, 10, 20, 40)) +
  labs(title = "County homicide rate (2010-2020)") +
  theme_void() +
  theme(legend.position = c(0.92, 0.3))

ggsave(file.path(fig_dir, "map_homicide_rate.png"), p_hom_map, width = 12, height = 7, dpi = 200)
p_hom_map

# Native American population share map
race_2020 <- readRDS("data/county_race_panel_1970_2020.rds") |> filter(year == 2020)
map_native <- counties_map |> left_join(race_2020 |> select(GEOID, native_share), by = "GEOID")

p_native_map <- ggplot(map_native) +
  geom_sf(aes(fill = native_share), color = NA) +
  scale_fill_viridis_c(option = "viridis", trans = "sqrt",
                       name = "Native American\nshare",
                       labels = scales::percent) +
  labs(title = "Native American population share by county (2020)") +
  theme_void() +
  theme(legend.position = c(0.92, 0.3))

ggsave(file.path(fig_dir, "map_native_share.png"), p_native_map, width = 12, height = 7, dpi = 200)
p_native_map

US counties: racial composition and social outcomes

Emil O. W. Kirkegaard & Claude (Anthropic)

31 March, 2026

Init

Data

Analysis

Cross-sectional: standardized coefficients

Epsilon-squared variance decomposition

OLS vs Spatial Error Model

Cross-sectional: multiple outcomes with state FE

Panel models: multiple outcomes (1990-2020)

Scatter: Black % vs homicide rate

Maps

Meta