Init

library(kirkegaard)
load_packages(
  dplyr, ggplot2, sf, tigris, ggrepel, arrow, glmnet, xgboost, e1071, knitr
)

theme_set(theme_bw())
options(digits = 3, tigris_use_cache = TRUE)

data_dir <- "data"
fig_dir <- "figs"

Part 1: Estimating county-level firearm ownership

Training data: BRFSS SMART MMSA (2002-2004)

The BRFSS asked about household firearm ownership in its core questionnaire in 2002 and 2004. The SMART MMSA files provide survey-weighted estimates for ~146 metropolitan areas. After merging with ACS predictors and filtering to complete cases, we have 118 MMSAs for training.

train <- readRDS(file.path(data_dir, "firearm_training_data.rds")) |>
  rename(firearm_ownership = firearm_rate)
cat("Training MMSAs:", nrow(train), "\n")
## Training MMSAs: 118
cat("Firearm ownership range:", round(range(train$firearm_ownership), 3), "\n")
## Firearm ownership range: 0.115 0.602
cat("Respondents range:", range(train$n_respondents), "\n")
## Respondents range: 460 8957
p <- ggplot(train, aes(x = reorder(gsub(" Metropolitan.*| Micropolitan.*", "", mmsa_name),
                                     firearm_ownership),
                        y = firearm_ownership)) +
  geom_col(fill = "steelblue", width = 0.7) +
  coord_flip() +
  labs(x = NULL, y = "Household firearm ownership rate",
       title = "BRFSS SMART MMSA firearm ownership rates (2002-2004 pooled)") +
  theme_minimal(base_size = 6)
ggsave(file.path(fig_dir, "barplot_mmsa_firearm_ownership.png"), p, width = 8, height = 16, dpi = 150)
p

Predictors (30 variables)

  • 15 demographic/economic variables (ACS 2008-2012): median income, Gini, poverty rate, education, unemployment, LFP, sex ratio, race/ethnicity shares (White, Black, Native, Asian, Hispanic), veteran share, homeownership, median age
  • 13 industry composition variables (ACS 2008-2012): % employed in agriculture/mining, construction, manufacturing, wholesale, retail, transport, information, finance, professional services, education/health, arts/food, other services, public administration
  • 1 election variable: Republican presidential vote share (average of 2000, 2004, 2008 county results, pop-weighted to CBSA level)
  • 1 gun law variable: Everytown state gun law strength index (0-100 scale), assigned to MMSAs/counties by state

Model comparison (LOOCV on 118 MMSAs)

models_obj <- readRDS(file.path(data_dir, "firearm_models.rds"))
pred_cols <- models_obj$pred_cols

We trained four model classes with leave-one-out cross-validation on the 118 MMSAs. Hyperparameters were tuned via cross-validation (lambda for ridge/lasso via cv.glmnet, cost/gamma/epsilon for SVM via tune.svm, tree parameters for XGBoost via xgb.cv).

# Compute LOOCV predictions for each model
svm_fit <- models_obj$svm
xgb_fit <- models_obj$xgb
svm_center <- models_obj$svm_center
svm_scale <- models_obj$svm_scale

X <- as.matrix(train[, pred_cols])
y <- train$firearm_ownership

# Ridge and Lasso LOOCV
ridge_cv <- cv.glmnet(scale(X), y, alpha = 0, nfolds = nrow(X), grouped = FALSE)
lasso_cv <- cv.glmnet(scale(X), y, alpha = 1, nfolds = nrow(X), grouped = FALSE)

# Manual LOOCV for SVM and XGBoost to get predicted values
n <- nrow(train)
svm_loo <- xgb_loo <- ridge_loo <- lasso_loo <- numeric(n)

ridge_fit_full <- glmnet(scale(X), y, alpha = 0, lambda = ridge_cv$lambda.min)
lasso_fit_full <- glmnet(scale(X), y, alpha = 1, lambda = lasso_cv$lambda.min)

for (i in 1:n) {
  Xi <- X[-i, ]; yi <- y[-i]
  Xi_s <- scale(Xi)
  ci <- attr(Xi_s, "scaled:center"); si <- attr(Xi_s, "scaled:scale")
  test_s <- matrix(scale(matrix(X[i,], nrow=1), center=ci, scale=si), nrow=1)

  # Ridge
  ridge_i <- glmnet(Xi_s, yi, alpha = 0, lambda = ridge_cv$lambda.min)
  ridge_loo[i] <- predict(ridge_i, test_s)[1]

  # Lasso
  lasso_i <- glmnet(Xi_s, yi, alpha = 1, lambda = lasso_cv$lambda.min)
  lasso_loo[i] <- predict(lasso_i, test_s)[1]

  # SVM
  svm_i <- svm(x = Xi_s, y = yi, kernel = "radial",
                cost = 4, gamma = 0.0078125, epsilon = 0.01)
  svm_loo[i] <- predict(svm_i, test_s)

  # XGBoost
  dti <- xgb.DMatrix(data = Xi, label = yi)
  xgb_i <- xgb.train(params = models_obj$xgb_params, data = dti,
                       nrounds = models_obj$xgb_nrounds, verbose = 0)
  xgb_loo[i] <- predict(xgb_i, matrix(X[i,], nrow=1))
}

loocv_results <- data.frame(
  Model = c("Ridge", "Lasso", "SVM RBF", "XGBoost"),
  LOOCV_R2 = c(cor(ridge_loo, y)^2, cor(lasso_loo, y)^2,
                cor(svm_loo, y)^2, cor(xgb_loo, y)^2),
  LOOCV_MAE = c(mean(abs(ridge_loo - y)), mean(abs(lasso_loo - y)),
                 mean(abs(svm_loo - y)), mean(abs(xgb_loo - y)))
)
loocv_results |> mutate(across(where(is.numeric), ~round(., 3))) |> kable()
Model LOOCV_R2 LOOCV_MAE
Ridge 0.671 0.054
Lasso 0.668 0.054
SVM RBF 0.792 0.042
XGBoost 0.737 0.047

LOOCV scatterplots (training sample, out-of-sample predictions)

train$svm_loo <- svm_loo
train$xgb_loo <- xgb_loo
train$ridge_loo <- ridge_loo
train$lasso_loo <- lasso_loo
train$short_name <- gsub(" Metropolitan.*| Micropolitan.*", "", train$mmsa_name)

p <- ggplot(train, aes(x = svm_loo, y = firearm_ownership)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = short_name), size = 2.2, max.overlaps = 25,
                  segment.alpha = 0.3) +
  labs(x = "SVM LOOCV predicted", y = "Actual HH firearm ownership (BRFSS)",
       title = "LOOCV: SVM predicted vs actual (118 MMSAs)",
       subtitle = sprintf("R² = %.3f", cor(svm_loo, y)^2)) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "loocv_svm_mmsa.png"), p, width = 10, height = 10, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

p <- ggplot(train, aes(x = xgb_loo, y = firearm_ownership)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = short_name), size = 2.2, max.overlaps = 25,
                  segment.alpha = 0.3) +
  labs(x = "XGBoost LOOCV predicted", y = "Actual HH firearm ownership (BRFSS)",
       title = "LOOCV: XGBoost predicted vs actual (118 MMSAs)",
       subtitle = sprintf("R² = %.3f", cor(xgb_loo, y)^2)) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "loocv_xgb_mmsa.png"), p, width = 10, height = 10, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

State-level validation (vs RAND estimates)

RAND Corporation provides state-level household firearm ownership rate (HFR) estimates for 1980-2016, derived from a structural equation model integrating 51 surveys with administrative data. We compare our predictions (aggregated or direct) against the RAND 2008-2012 average.

rand <- read_csv(file.path(data_dir, "RAND_HFR_state_firearm_ownership.csv"),
                  show_col_types = FALSE)

rand_match <- rand |>
  filter(Year %in% 2008:2012) |>
  group_by(FIP, STATE) |>
  summarise(rand_hfr = mean(HFR, na.rm = TRUE), .groups = "drop") |>
  mutate(state_fips = sprintf("%02d", FIP))

state_pred <- readRDS(file.path(data_dir, "state_firearm_predictions.rds"))

# State predictions file already has all 30 predictor columns
state_pred_v <- state_pred |> filter(!is.na(gun_law_idx))

X_state <- as.matrix(state_pred_v[, pred_cols])
X_state_s <- scale(X_state, center = svm_center, scale = svm_scale)

state_pred_v$svm_pred <- predict(svm_fit, X_state_s)
state_pred_v$xgb_pred <- pmin(pmax(predict(xgb_fit, X_state), 0), 1)

val <- state_pred_v |>
  left_join(rand_match, by = "state_fips") |>
  filter(!is.na(rand_hfr))

Calibration table

check_cal <- function(pred, actual, name) {
  fit <- lm(actual ~ pred)
  data.frame(
    Model = name,
    R2 = round(cor(pred, actual)^2, 3),
    Slope = round(coef(fit)[2], 3),
    Intercept = round(coef(fit)[1], 3),
    MAE = round(mean(abs(pred - actual)), 3),
    Pred_range = paste0("[", round(min(pred), 2), ", ", round(max(pred), 2), "]")
  )
}

state_cal <- bind_rows(
  check_cal(val$svm_pred, val$rand_hfr, "SVM RBF"),
  check_cal(val$xgb_pred, val$rand_hfr, "XGBoost")
)
kable(state_cal, caption = "State-level validation vs RAND HFR (2008-2012 avg, n=50)")
State-level validation vs RAND HFR (2008-2012 avg, n=50)
Model R2 Slope Intercept MAE Pred_range
pred…1 SVM RBF 0.888 0.985 0.023 0.035 [0.11, 0.58]
pred…2 XGBoost 0.915 1.089 -0.030 0.031 [0.14, 0.57]

State validation scatterplots

p <- ggplot(val, aes(x = svm_pred, y = rand_hfr)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = STATE), size = 3, max.overlaps = 50,
                  segment.alpha = 0.3) +
  labs(x = "SVM predicted HFR", y = "RAND HFR (2008-2012 avg)",
       title = "State validation: SVM predictions vs RAND estimates",
       subtitle = sprintf("R² = %.3f, slope = %.3f, intercept = %.3f",
                          cor(val$svm_pred, val$rand_hfr)^2,
                          coef(lm(rand_hfr ~ svm_pred, data = val))[2],
                          coef(lm(rand_hfr ~ svm_pred, data = val))[1])) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "validation_state_svm.png"), p, width = 9, height = 9, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

p <- ggplot(val, aes(x = xgb_pred, y = rand_hfr)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = STATE), size = 3, max.overlaps = 50,
                  segment.alpha = 0.3) +
  labs(x = "XGBoost predicted HFR", y = "RAND HFR (2008-2012 avg)",
       title = "State validation: XGBoost predictions vs RAND estimates",
       subtitle = sprintf("R² = %.3f, slope = %.3f, intercept = %.3f",
                          cor(val$xgb_pred, val$rand_hfr)^2,
                          coef(lm(rand_hfr ~ xgb_pred, data = val))[2],
                          coef(lm(rand_hfr ~ xgb_pred, data = val))[1])) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "validation_state_xgb.png"), p, width = 9, height = 9, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

County-level validation (vs firearm suicide share)

The firearm suicide share (FS/S = firearm suicides / total suicides) is a widely used proxy for gun ownership (state-level r = 0.86 with actual ownership). We computed county-level FS/S from CDC WONDER data (1999-2020, pooled).

Scaling check: At the state level, the slope of FS/S regressed on actual RAND HFR is 0.787. If our county predictions are correctly scaled, the slope of FS/S on predicted HFR at county level should match. This is a strong test of whether our predictions are on the right scale.

fss <- readRDS(file.path(data_dir, "county_fss_1999_2020.rds"))
gun <- read_parquet(file.path(data_dir, "county_firearm_ownership_estimates.parquet"))

val_fss <- gun |>
  inner_join(fss |> select(GEOID, fss, total_deaths), by = "GEOID") |>
  filter(!is.na(fss), !is.na(firearm_rate_svm))

cat("Counties with FS/S data:", nrow(val_fss), "\n")
## Counties with FS/S data: 2839
# Scaling validation
fit_county_svm <- lm(fss ~ firearm_rate_svm, data = val_fss, weights = total_deaths)
fit_county_xgb <- lm(fss ~ firearm_rate_xgb, data = val_fss, weights = total_deaths)

# State-level ground truth slope
rand_fss <- rand |>
  filter(Year %in% 2008:2012) |>
  mutate(across(c(Fem_FS_S, Male_FS_S), ~ifelse(. == -9, NA, .))) |>
  group_by(FIP, STATE) |>
  summarise(avg_fss = mean((Fem_FS_S + Male_FS_S)/2, na.rm = TRUE),
            hfr = mean(HFR, na.rm = TRUE), .groups = "drop")
state_slope <- coef(lm(avg_fss ~ hfr, data = rand_fss))[2]

fss_cal <- data.frame(
  Level = c("State (ground truth)", "County (SVM)", "County (XGBoost)"),
  `Slope (FS/S ~ HFR)` = round(c(state_slope, coef(fit_county_svm)[2], coef(fit_county_xgb)[2]), 3),
  `Intercept` = round(c(coef(lm(avg_fss ~ hfr, data = rand_fss))[1],
                         coef(fit_county_svm)[1], coef(fit_county_xgb)[1]), 3),
  check.names = FALSE
)
kable(fss_cal, caption = "FS/S scaling validation: county predictions vs state ground truth")
FS/S scaling validation: county predictions vs state ground truth
Level Slope (FS/S ~ HFR) Intercept
hfr State (ground truth) 0.787 0.137
firearm_rate_svm County (SVM) 0.790 0.225
firearm_rate_xgb County (XGBoost) 0.919 0.182

SVM slope (county) matches the state ground truth slope almost exactly, confirming correct scaling.

wr_svm <- cov.wt(cbind(val_fss$firearm_rate_svm, val_fss$fss),
                   wt = val_fss$total_deaths, cor = TRUE)$cor[1,2]

p <- ggplot(val_fss, aes(x = firearm_rate_svm, y = fss)) +
  geom_point(aes(size = total_deaths), alpha = 0.1, color = "grey30") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.8) +
  scale_size_continuous(range = c(0.3, 5), guide = "none") +
  labs(x = "SVM predicted firearm ownership rate",
       y = "Firearm Suicide Share (FS/S, 1999-2020)",
       title = "County-level validation: SVM predictions vs FS/S proxy",
       subtitle = sprintf("r = %.2f unweighted (n=%d), r = %.2f weighted by deaths",
                          cor(val_fss$firearm_rate_svm, val_fss$fss),
                          nrow(val_fss), wr_svm)) +
  theme_minimal()
ggsave(file.path(fig_dir, "validation_county_fss_svm.png"), p, width = 10, height = 9, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

County-level estimates

cat("Counties:", nrow(gun), "\n")
## Counties: 3115
cat("SVM range:", round(range(gun$firearm_rate_svm), 3), "\n")
## SVM range: 0.113 0.663
cat("SVM mean:", round(mean(gun$firearm_rate_svm), 3),
    "sd:", round(sd(gun$firearm_rate_svm), 3), "\n")
## SVM mean: 0.477 sd: 0.088

Map of estimated firearm ownership

counties_sf <- counties(year = 2020, cb = TRUE) |>
  filter(!STATEFP %in% c("02", "15", "72", "78", "66", "60", "69")) |>
  st_transform(5070)

map_gun <- counties_sf |> left_join(gun, by = "GEOID")

p <- ggplot(map_gun) +
  geom_sf(aes(fill = firearm_rate_svm), color = NA) +
  scale_fill_viridis_c(option = "inferno", labels = scales::percent,
                       name = "Estimated HH\nFirearm Rate",
                       breaks = seq(0.1, 0.7, 0.1)) +
  labs(title = "Estimated household firearm ownership rate by county (SVM)",
       subtitle = "Trained on 118 MMSAs (BRFSS 2002-04), 30 predictors | State R²=0.89 vs RAND | County FS/S slope validated") +
  theme_void() +
  theme(legend.position = c(0.92, 0.3),
        panel.background = element_rect(fill = "white", color = NA))
ggsave(file.path(fig_dir, "map_firearm_rate_svm.png"), p, width = 12, height = 7, dpi = 200)
p

Model comparison table

knitr::include_graphics(file.path(fig_dir, "table_imputation_model_comparison.png"))

SVM RBF selected as final model: best LOOCV R² (0.79), near-perfect state calibration (slope=0.99), and validated county-level FS/S scaling.


Part 2: Race, firearm ownership, and homicide

We fit three models at three levels of analysis:

  1. Demographics alone: non-White race shares (Black, Hispanic, Native, Asian)
  2. Gun ownership alone: household firearm rate
  3. Both: demographics + gun ownership (additive)

Levels: (A) states using RAND HFR, (B) all counties using SVM-imputed HFR, (C) MMSAs only using BRFSS survey-based HFR.

library(modelsummary)

gof_map_short <- list(
  list("raw" = "adj.r.squared", "clean" = "Adj. R²", "fmt" = 3),
  list("raw" = "nobs", "clean" = "N", "fmt" = 0)
)

# Helper: fit 3 models, return named list for modelsummary
fit_3models <- function(df, gun_var = "firearm_ownership") {
  f_demo <- as.formula("hom_rate ~ black_share + hispanic_share + native_share + asian_share + log_pwpd")
  f_gun  <- as.formula(paste("hom_rate ~", gun_var))
  f_both <- as.formula(paste("hom_rate ~ black_share + hispanic_share + native_share + asian_share +", gun_var, "+ log_pwpd"))

  list(
    "Demographics" = lm(f_demo, data = df),
    "Gun ownership" = lm(f_gun, data = df),
    "Both" = lm(f_both, data = df)
  )
}

# Helper: render modelsummary table and save to PNG
ms_table <- function(models, title, filename) {
  coef_labels <- c(
    "black_share" = "Black %",
    "hispanic_share" = "Hispanic %",
    "native_share" = "Native %",
    "asian_share" = "Asian %",
    "firearm_ownership" = "HH firearm ownership",
    "black_share:firearm_ownership" = "Black % x HH firearm ownership",
    "log_pwpd" = "log(pop-weighted density)",
    "(Intercept)" = "(Intercept)"
  )
  tab <- modelsummary(
    models,
    estimate = "{estimate}{stars}",
    statistic = "({std.error})",
    stars = c("***" = 0.001, "**" = 0.01, "*" = 0.05),
    coef_rename = coef_labels,
    gof_map = gof_map_short,
    title = title,
    output = "gt"
  )
  gt::gtsave(tab, file.path(fig_dir, filename))
  tab
}

Data

# County-level data
d <- read_parquet(file.path(data_dir, "analysis_firearm_race.parquet"))
cat("All counties:", nrow(d), "\n")
## All counties: 3112
# State-level: aggregate counties, use RAND HFR
state_pwpd <- readRDS(file.path(data_dir, "state_pop_weighted_density.rds"))

state_d <- d |>
  mutate(state_fips = substr(GEOID, 1, 2)) |>
  group_by(state_fips) |>
  summarise(
    across(c(black_share, hispanic_share, native_share, asian_share),
           ~weighted.mean(., w = total)),
    hom_rate = sum(hom_deaths) / sum(hom_pop) * 100000,
    .groups = "drop"
  ) |>
  left_join(rand_match |> select(state_fips, rand_hfr), by = "state_fips") |>
  left_join(state_pwpd, by = "state_fips") |>
  mutate(log_pwpd = log(pwpd + 1)) |>
  rename(firearm_ownership = rand_hfr) |>
  filter(!is.na(firearm_ownership))
cat("States (with RAND HFR):", nrow(state_d), "\n")
## States (with RAND HFR): 50
# MMSA-level: use survey-based BRFSS firearm rate (no imputation needed)
mmsa_gun <- readRDS(file.path(data_dir, "mmsa_firearm_ownership.rds")) |>
  select(mmsa_code, firearm_ownership = firearm_rate, n_respondents)

# Need MMSA-level homicide and race data
# Aggregate counties to MMSAs using the crosswalk
county_cbsa <- readxl::read_xls(file.path(data_dir, "cbsa_delineation_2013.xls"), skip = 2) |>
  transmute(
    county_fips = paste0(`FIPS State Code`, `FIPS County Code`),
    cbsa_code = stringr::str_pad(`CBSA Code`, 5, pad = "0")
  )

mmsa_d <- d |>
  inner_join(county_cbsa, by = c("GEOID" = "county_fips")) |>
  group_by(cbsa_code) |>
  summarise(
    across(c(black_share, hispanic_share, native_share, asian_share),
           ~weighted.mean(., w = total)),
    hom_rate = sum(hom_deaths) / sum(hom_pop) * 100000,
    pwpd = weighted.mean(pwpd, w = pop_total),
    .groups = "drop"
  ) |>
  mutate(log_pwpd = log(pwpd + 1)) |>
  inner_join(mmsa_gun, by = c("cbsa_code" = "mmsa_code")) |>
  filter(!is.na(firearm_ownership), !is.na(hom_rate))
cat("MMSAs (with BRFSS survey data):", nrow(mmsa_d), "\n")
## MMSAs (with BRFSS survey data): 118

Maps

map_d <- counties_sf |> left_join(d, by = "GEOID")

p <- ggplot(map_d) +
  geom_sf(aes(fill = firearm_ownership), color = NA) +
  scale_fill_viridis_c(option = "inferno", labels = scales::percent,
                       name = "HH Firearm\nRate") +
  labs(title = "Estimated household firearm ownership rate") +
  theme_void() + theme(legend.position = c(0.92, 0.3),
                        panel.background = element_rect(fill = "white", color = NA))
ggsave(file.path(fig_dir, "map_analysis_firearm.png"), p, width = 12, height = 7, dpi = 200)
p

Homicide rate map: see figs/map_homicide_rate.png (from prior analysis).

Black share map: see figs/map_black_share.png (from prior analysis).

Descriptives

d |>
  select(hom_rate, firearm_ownership, black_share, hispanic_share, native_share,
         asian_share) |>
  summary()
##     hom_rate     firearm_ownership  black_share      hispanic_share  
##  Min.   : 0.00   Min.   :0.113     Min.   :0.00000   Min.   :0.0034  
##  1st Qu.: 1.99   1st Qu.:0.441     1st Qu.:0.00535   1st Qu.:0.0237  
##  Median : 3.26   Median :0.490     Median :0.02104   Median :0.0461  
##  Mean   : 4.39   Mean   :0.477     Mean   :0.08786   Mean   :0.0982  
##  3rd Qu.: 5.32   3rd Qu.:0.536     3rd Qu.:0.10188   3rd Qu.:0.1054  
##  Max.   :42.89   Max.   :0.663     Max.   :0.87455   Max.   :0.9768  
##   native_share      asian_share     
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00273   1st Qu.:0.00371  
##  Median :0.00470   Median :0.00615  
##  Mean   :0.01821   Mean   :0.01516  
##  3rd Qu.:0.00937   3rd Qu.:0.01382  
##  Max.   :0.88733   Max.   :0.53020

Correlation matrices

# State-level: include both RAND and BRFSS firearm ownership
state_brfss <- mmsa_d |>
  inner_join(
    county_cbsa |> mutate(state_fips = substr(county_fips, 1, 2)) |>
      left_join(d |> select(GEOID, pop_total), by = c("county_fips" = "GEOID")) |>
      filter(!is.na(pop_total)) |>
      group_by(cbsa_code, state_fips) |> summarise(pop = sum(pop_total), .groups = "drop") |>
      group_by(cbsa_code) |> slice_max(pop, n = 1) |> ungroup() |>
      select(cbsa_code, state_fips),
    by = "cbsa_code"
  ) |>
  group_by(state_fips) |>
  summarise(brfss_ownership = mean(firearm_ownership), .groups = "drop")

state_fss <- readRDS(file.path(data_dir, "state_fss_1999_2020.rds"))

state_d2 <- state_d |>
  left_join(state_brfss, by = "state_fips") |>
  left_join(state_fss |> select(state_fips, fss), by = "state_fips")

cor_vars_s <- c("hom_rate", "firearm_ownership", "brfss_ownership", "fss",
                "black_share", "hispanic_share", "native_share", "asian_share", "log_pwpd")
cor_labels_s <- c("Homicide rate", "RAND HFR", "BRFSS HFR", "Firearm suicide %",
                  "Black %", "Hispanic %", "Native %", "Asian %", "log(pop-wtd density)")

sc <- cor(state_d2[!is.na(state_d2$brfss_ownership), cor_vars_s], use = "complete.obs")
colnames(sc) <- rownames(sc) <- cor_labels_s
n_s <- nrow(sc)

melted_s <- expand.grid(Var1 = cor_labels_s, Var2 = cor_labels_s) |>
  mutate(value = as.vector(sc))

p <- ggplot(melted_s, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = sprintf("%.2f", value)), size = 2.8) +
  scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick",
                       midpoint = 0, limits = c(-1, 1), name = "r") +
  labs(title = sprintf("State-level correlations (n=%d states with both RAND and BRFSS)",
                       sum(!is.na(state_d2$brfss_ownership)))) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_blank())
ggsave(file.path(fig_dir, "cormat_states.png"), p, width = 9, height = 7.5, dpi = 150)
p

# County/MMSA split: upper = all counties (SVM), lower = MMSAs (BRFSS)
shared_vars <- c("hom_rate", "firearm_ownership", "black_share", "hispanic_share",
                 "native_share", "asian_share", "log_pwpd")
shared_labels <- c("Homicide rate", "Firearm own. (SVM)", "Black %", "Hispanic %",
                   "Native %", "Asian %", "log(pop-wtd density)")

county_cor <- cor(d[, shared_vars], use = "complete.obs")
mmsa_brfss <- mmsa_d  # already has firearm_ownership = BRFSS
mmsa_cor <- cor(mmsa_brfss[, shared_vars], use = "complete.obs")

nv <- length(shared_vars)
combined <- matrix(NA, nv, nv)
rownames(combined) <- colnames(combined) <- shared_labels
for (i in 1:nv) for (j in 1:nv) {
  if (i < j) combined[i, j] <- county_cor[i, j]
  else if (i > j) combined[i, j] <- mmsa_cor[i, j]
}

melted_c <- expand.grid(Var1 = shared_labels, Var2 = shared_labels) |>
  mutate(value = as.vector(combined))

p <- ggplot(melted_c, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size = 2.8) +
  scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick",
                       midpoint = 0, limits = c(-1, 1), name = "r", na.value = "grey90") +
  labs(title = "County/MMSA correlations",
       subtitle = "Upper: all counties, n=3,112, SVM imputed | Lower: MMSAs, n=118, BRFSS survey") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_blank())
ggsave(file.path(fig_dir, "cormat_counties.png"), p, width = 9, height = 7.5, dpi = 150)
p

A. State-level regressions (RAND HFR, n=50)

mods_state <- fit_3models(state_d)
ms_table(mods_state, "State-level: homicide rate per 100k", "table_state_regressions.png")
## file:////tmp/RtmpaiogIy/file53e8a533f46f.html screenshot completed
State-level: homicide rate per 100k
Demographics Gun ownership Both
(Intercept) 9.272* 3.675** -0.041
(3.539) (1.154) (4.713)
Black % 25.855*** 26.708***
(2.462) (2.315)
Hispanic % 4.928 6.720*
(2.710) (2.607)
Native % 34.326** 19.668
(9.968) (10.696)
Asian % 2.055 4.522
(3.650) (3.517)
log(pop-weighted density) -1.125* -0.232
(0.530) (0.590)
HH firearm ownership 4.211 7.214**
(2.807) (2.608)
Adj. R² 0.709 0.025 0.747
N 50 50 50

B. County-level regressions (all counties, SVM-imputed HFR)

mods_county <- fit_3models(d)
ms_table(mods_county, "County-level (all): homicide rate per 100k", "table_county_regressions.png")
## file:////tmp/RtmpaiogIy/file53e8a30e46e03.html screenshot completed
County-level (all): homicide rate per 100k
Demographics Gun ownership Both
(Intercept) 0.477 6.900*** -0.743
(0.399) (0.413) (0.704)
Black % 21.785*** 21.912***
(0.365) (0.369)
Hispanic % 2.304*** 2.502***
(0.397) (0.408)
Native % 9.831*** 9.945***
(0.774) (0.775)
Asian % -8.830*** -7.202***
(1.905) (2.055)
log(pop-weighted density) 0.294*** 0.354***
(0.071) (0.077)
HH firearm ownership -5.252*** 1.689*
(0.851) (0.803)
Adj. R² 0.545 0.012 0.546
N 3112 3112 3112

C. MMSA-level regressions (BRFSS survey HFR, no imputation)

mods_mmsa <- fit_3models(mmsa_d)
ms_table(mods_mmsa, "MMSA-level (survey data): homicide rate per 100k", "table_mmsa_regressions.png")
## file:////tmp/RtmpaiogIy/file53e8a13447319.html screenshot completed
MMSA-level (survey data): homicide rate per 100k
Demographics Gun ownership Both
(Intercept) -0.090 3.995*** -7.336
(3.223) (1.077) (3.904)
Black % 26.908*** 27.421***
(1.736) (1.682)
Hispanic % 1.755 2.733
(1.519) (1.499)
Native % 22.375*** 19.880***
(4.839) (4.737)
Asian % -2.822 1.780
(3.170) (3.405)
log(pop-weighted density) 0.340 0.980
(0.474) (0.503)
HH firearm ownership 5.515* 6.488**
(2.784) (2.114)
Adj. R² 0.685 0.024 0.707
N 118 118 118

Interaction models

Adding race x gun ownership interactions at each level:

int_models <- list(
  "State" = lm(hom_rate ~ black_share * firearm_ownership + hispanic_share + native_share + asian_share + log_pwpd, data = state_d),
  "County" = lm(hom_rate ~ black_share * firearm_ownership + hispanic_share + native_share + asian_share + log_pwpd, data = d),
  "MMSA" = lm(hom_rate ~ black_share * firearm_ownership + hispanic_share + native_share + asian_share + log_pwpd, data = mmsa_d)
)

ms_table(int_models, "Interaction models: race x firearm ownership", "table_interaction_regressions.png")
## file:////tmp/RtmpaiogIy/file53e8a58aa8e10.html screenshot completed
Interaction models: race x firearm ownership
State County MMSA
(Intercept) -1.358 -2.042** -7.438
(4.601) (0.717) (3.933)
Black % 8.769 41.267*** 25.125**
(9.181) (2.507) (7.536)
HH firearm ownership 3.435 5.360*** 5.873*
(3.141) (0.924) (2.894)
Hispanic % 5.427* 2.985*** 2.609
(2.600) (0.408) (1.557)
Native % 23.228* 10.093*** 20.076***
(10.484) (0.768) (4.798)
Asian % 1.232 -6.000** 1.379
(3.770) (2.041) (3.652)
log(pop-weighted density) 0.226 0.259*** 1.036
(0.614) (0.077) (0.536)
Black % x HH firearm ownership 42.826 -41.178*** 5.438
(21.259) (5.276) (17.393)
Adj. R² 0.764 0.554 0.705
N 50 3112 118

Visualize the interaction (county level)

d$gun_tercile <- cut(d$firearm_ownership,
                      breaks = quantile(d$firearm_ownership, c(0, 1/3, 2/3, 1)),
                      labels = c("Low gun ownership", "Medium", "High gun ownership"),
                      include.lowest = TRUE)

p <- ggplot(d, aes(x = black_share, y = hom_rate, color = gun_tercile)) +
  geom_point(alpha = 0.15, size = 0.8) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
  labs(x = "Black population share", y = "Homicide rate per 100k",
       color = "Firearm ownership",
       title = "Black share vs homicide rate, by firearm ownership level") +
  theme_minimal()
ggsave(file.path(fig_dir, "interaction_black_firearm_hom.png"), p, width = 9, height = 7, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

d$black_tercile <- cut(d$black_share,
                        breaks = quantile(d$black_share, c(0, 1/3, 2/3, 1)),
                        labels = c("Low Black share", "Medium", "High Black share"),
                        include.lowest = TRUE)

p <- ggplot(d, aes(x = firearm_ownership, y = hom_rate, color = black_tercile)) +
  geom_point(alpha = 0.15, size = 0.8) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
  labs(x = "Estimated firearm ownership rate", y = "Homicide rate per 100k",
       color = "Black pop share",
       title = "Firearm ownership vs homicide rate, by Black population share") +
  theme_minimal()
ggsave(file.path(fig_dir, "interaction_firearm_black_hom.png"), p, width = 9, height = 7, dpi = 150)
## `geom_smooth()` using formula = 'y ~ x'
p
## `geom_smooth()` using formula = 'y ~ x'

Meta

sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] modelsummary_2.5.0    knitr_1.50            e1071_1.7-16         
##  [4] xgboost_1.7.11.1      glmnet_4.1-10         Matrix_1.7-4         
##  [7] arrow_23.0.1.1        ggrepel_0.9.6         tigris_2.2.1         
## [10] sf_1.0-21             kirkegaard_2025-11-19 robustbase_0.99-6    
## [13] psych_2.5.6           assertthat_0.2.1      weights_1.1.2        
## [16] magrittr_2.0.4        lubridate_1.9.4       forcats_1.0.1        
## [19] stringr_1.6.0         dplyr_1.1.4           purrr_1.2.0          
## [22] readr_2.1.5           tidyr_1.3.1           tibble_3.3.0         
## [25] ggplot2_4.0.1.9000    tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.5.3       later_1.4.4         cellranger_1.1.0   
##   [4] datawizard_1.3.0    rpart_4.1.24        lifecycle_1.0.4    
##   [7] Rdpack_2.6.4        globals_0.18.0      processx_3.8.6     
##  [10] lattice_0.22-7      vroom_1.6.6         MASS_7.3-65        
##  [13] insight_1.4.2       backports_1.5.0     Hmisc_5.2-4        
##  [16] sass_0.4.10         rmarkdown_2.30      jquerylib_0.1.4    
##  [19] yaml_2.3.10         chromote_0.5.1      DBI_1.2.3          
##  [22] minqa_1.2.8         RColorBrewer_1.1-3  multcomp_1.4-28    
##  [25] nnet_7.3-20         TH.data_1.1-4       rappdirs_0.3.3     
##  [28] webshot2_0.1.2      sandwich_3.1-1      listenv_0.9.1      
##  [31] gdata_3.0.1         units_1.0-0         performance_0.15.2 
##  [34] parallelly_1.45.1   codetools_0.2-20    xml2_1.4.0         
##  [37] tidyselect_1.2.1    shape_1.4.6.1       farver_2.1.2       
##  [40] lme4_1.1-37         effectsize_1.0.1    base64enc_0.1-3    
##  [43] jsonlite_2.0.0      mitml_0.4-5         Formula_1.2-5      
##  [46] survival_3.8-6      iterators_1.0.14    emmeans_1.11.2-8   
##  [49] systemfonts_1.3.1   foreach_1.5.2       tools_4.5.3        
##  [52] ragg_1.5.0          Rcpp_1.1.0          glue_1.8.0         
##  [55] mnormt_2.1.1        gridExtra_2.3       pan_1.9            
##  [58] xfun_0.57           mgcv_1.9-3          websocket_1.4.4    
##  [61] withr_3.0.2         fastmap_1.2.0       boot_1.3-32        
##  [64] digest_0.6.39       timechange_0.3.0    R6_2.6.1           
##  [67] estimability_1.5.1  mice_3.18.0         textshaping_1.0.4  
##  [70] colorspace_2.1-2    gtools_3.9.5        generics_0.1.4     
##  [73] data.table_1.17.8   class_7.3-23        httr_1.4.7         
##  [76] htmlwidgets_1.6.4   parameters_0.28.2   pkgconfig_2.0.3    
##  [79] gtable_0.3.6        lmtest_0.9-40       S7_0.2.1           
##  [82] htmltools_0.5.8.1   scales_1.4.0        png_0.1-8          
##  [85] reformulas_0.4.1    rstudioapi_0.17.1   tzdb_0.5.0         
##  [88] uuid_1.2-1          coda_0.19-4.1       checkmate_2.3.3    
##  [91] nlme_3.1-168        nloptr_2.2.1        proxy_0.4-27       
##  [94] cachem_1.1.0        zoo_1.8-14          KernSmooth_2.23-26 
##  [97] parallel_4.5.3      foreign_0.8-91      pillar_1.11.1      
## [100] grid_4.5.3          vctrs_0.6.5         promises_1.3.3     
## [103] jomo_2.7-6          xtable_1.8-4        cluster_2.1.8.2    
## [106] archive_1.1.12      htmlTable_2.4.3     evaluate_1.0.5     
## [109] mvtnorm_1.3-3       cli_3.6.5           compiler_4.5.3     
## [112] rlang_1.1.6.9000    crayon_1.5.3        future.apply_1.20.0
## [115] labeling_0.4.3      classInt_0.4-11     ps_1.9.1           
## [118] fs_1.6.6            stringi_1.8.7       viridisLite_0.4.2  
## [121] tables_0.9.31       bayestestR_0.17.0   hms_1.1.3          
## [124] bit64_4.6.0-1       future_1.67.0       rbibutils_2.3      
## [127] gt_1.3.0            broom_1.0.11        bslib_0.9.0        
## [130] DEoptimR_1.1-4      bit_4.6.0           readxl_1.4.5