Init

library(kirkegaard)
load_packages(
  dplyr, ggplot2, sf, tigris, ggrepel, arrow, glmnet, xgboost, e1071, knitr
)

theme_set(theme_bw())
options(digits = 3, tigris_use_cache = TRUE)

data_dir <- "data"
fig_dir <- "figs"

Part 1: Estimating county-level firearm ownership

Training data: BRFSS SMART MMSA (2002-2004)

The BRFSS asked about household firearm ownership in its core questionnaire in 2002 and 2004. The SMART MMSA files provide survey-weighted estimates for ~146 metropolitan areas. After merging with ACS predictors and filtering to complete cases, we have 118 MMSAs for training.

train <- readRDS(file.path(data_dir, "firearm_training_data.rds")) |>
  rename(firearm_ownership = firearm_rate)
cat("Training MMSAs:", nrow(train), "\n")

## Training MMSAs: 118

cat("Firearm ownership range:", round(range(train$firearm_ownership), 3), "\n")

## Firearm ownership range: 0.115 0.602

cat("Respondents range:", range(train$n_respondents), "\n")

## Respondents range: 460 8957

p <- ggplot(train, aes(x = reorder(gsub(" Metropolitan.*| Micropolitan.*", "", mmsa_name),
                                     firearm_ownership),
                        y = firearm_ownership)) +
  geom_col(fill = "steelblue", width = 0.7) +
  coord_flip() +
  labs(x = NULL, y = "Household firearm ownership rate",
       title = "BRFSS SMART MMSA firearm ownership rates (2002-2004 pooled)") +
  theme_minimal(base_size = 6)
ggsave(file.path(fig_dir, "barplot_mmsa_firearm_ownership.png"), p, width = 8, height = 16, dpi = 150)
p

Predictors (30 variables)

15 demographic/economic variables (ACS 2008-2012): median income, Gini, poverty rate, education, unemployment, LFP, sex ratio, race/ethnicity shares (White, Black, Native, Asian, Hispanic), veteran share, homeownership, median age
13 industry composition variables (ACS 2008-2012): % employed in agriculture/mining, construction, manufacturing, wholesale, retail, transport, information, finance, professional services, education/health, arts/food, other services, public administration
1 election variable: Republican presidential vote share (average of 2000, 2004, 2008 county results, pop-weighted to CBSA level)
1 gun law variable: Everytown state gun law strength index (0-100 scale), assigned to MMSAs/counties by state

Model comparison (LOOCV on 118 MMSAs)

models_obj <- readRDS(file.path(data_dir, "firearm_models.rds"))
pred_cols <- models_obj$pred_cols

We trained four model classes with leave-one-out cross-validation on the 118 MMSAs. Hyperparameters were tuned via cross-validation (lambda for ridge/lasso via cv.glmnet, cost/gamma/epsilon for SVM via tune.svm, tree parameters for XGBoost via xgb.cv).

# Compute LOOCV predictions for each model
svm_fit <- models_obj$svm
xgb_fit <- models_obj$xgb
svm_center <- models_obj$svm_center
svm_scale <- models_obj$svm_scale

X <- as.matrix(train[, pred_cols])
y <- train$firearm_ownership

# Ridge and Lasso LOOCV
ridge_cv <- cv.glmnet(scale(X), y, alpha = 0, nfolds = nrow(X), grouped = FALSE)
lasso_cv <- cv.glmnet(scale(X), y, alpha = 1, nfolds = nrow(X), grouped = FALSE)

# Manual LOOCV for SVM and XGBoost to get predicted values
n <- nrow(train)
svm_loo <- xgb_loo <- ridge_loo <- lasso_loo <- numeric(n)

ridge_fit_full <- glmnet(scale(X), y, alpha = 0, lambda = ridge_cv$lambda.min)
lasso_fit_full <- glmnet(scale(X), y, alpha = 1, lambda = lasso_cv$lambda.min)

for (i in 1:n) {
  Xi <- X[-i, ]; yi <- y[-i]
  Xi_s <- scale(Xi)
  ci <- attr(Xi_s, "scaled:center"); si <- attr(Xi_s, "scaled:scale")
  test_s <- matrix(scale(matrix(X[i,], nrow=1), center=ci, scale=si), nrow=1)

  # Ridge
  ridge_i <- glmnet(Xi_s, yi, alpha = 0, lambda = ridge_cv$lambda.min)
  ridge_loo[i] <- predict(ridge_i, test_s)[1]

  # Lasso
  lasso_i <- glmnet(Xi_s, yi, alpha = 1, lambda = lasso_cv$lambda.min)
  lasso_loo[i] <- predict(lasso_i, test_s)[1]

  # SVM
  svm_i <- svm(x = Xi_s, y = yi, kernel = "radial",
                cost = 4, gamma = 0.0078125, epsilon = 0.01)
  svm_loo[i] <- predict(svm_i, test_s)

  # XGBoost
  dti <- xgb.DMatrix(data = Xi, label = yi)
  xgb_i <- xgb.train(params = models_obj$xgb_params, data = dti,
                       nrounds = models_obj$xgb_nrounds, verbose = 0)
  xgb_loo[i] <- predict(xgb_i, matrix(X[i,], nrow=1))
}

loocv_results <- data.frame(
  Model = c("Ridge", "Lasso", "SVM RBF", "XGBoost"),
  LOOCV_R2 = c(cor(ridge_loo, y)^2, cor(lasso_loo, y)^2,
                cor(svm_loo, y)^2, cor(xgb_loo, y)^2),
  LOOCV_MAE = c(mean(abs(ridge_loo - y)), mean(abs(lasso_loo - y)),
                 mean(abs(svm_loo - y)), mean(abs(xgb_loo - y)))
)
loocv_results |> mutate(across(where(is.numeric), ~round(., 3))) |> kable()

Model	LOOCV_R2	LOOCV_MAE
Ridge	0.671	0.054
Lasso	0.668	0.054
SVM RBF	0.792	0.042
XGBoost	0.737	0.047

LOOCV scatterplots (training sample, out-of-sample predictions)

train$svm_loo <- svm_loo
train$xgb_loo <- xgb_loo
train$ridge_loo <- ridge_loo
train$lasso_loo <- lasso_loo
train$short_name <- gsub(" Metropolitan.*| Micropolitan.*", "", train$mmsa_name)

p <- ggplot(train, aes(x = svm_loo, y = firearm_ownership)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = short_name), size = 2.2, max.overlaps = 25,
                  segment.alpha = 0.3) +
  labs(x = "SVM LOOCV predicted", y = "Actual HH firearm ownership (BRFSS)",
       title = "LOOCV: SVM predicted vs actual (118 MMSAs)",
       subtitle = sprintf("R² = %.3f", cor(svm_loo, y)^2)) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "loocv_svm_mmsa.png"), p, width = 10, height = 10, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

p <- ggplot(train, aes(x = xgb_loo, y = firearm_ownership)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = short_name), size = 2.2, max.overlaps = 25,
                  segment.alpha = 0.3) +
  labs(x = "XGBoost LOOCV predicted", y = "Actual HH firearm ownership (BRFSS)",
       title = "LOOCV: XGBoost predicted vs actual (118 MMSAs)",
       subtitle = sprintf("R² = %.3f", cor(xgb_loo, y)^2)) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "loocv_xgb_mmsa.png"), p, width = 10, height = 10, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

State-level validation (vs RAND estimates)

RAND Corporation provides state-level household firearm ownership rate (HFR) estimates for 1980-2016, derived from a structural equation model integrating 51 surveys with administrative data. We compare our predictions (aggregated or direct) against the RAND 2008-2012 average.

rand <- read_csv(file.path(data_dir, "RAND_HFR_state_firearm_ownership.csv"),
                  show_col_types = FALSE)

rand_match <- rand |>
  filter(Year %in% 2008:2012) |>
  group_by(FIP, STATE) |>
  summarise(rand_hfr = mean(HFR, na.rm = TRUE), .groups = "drop") |>
  mutate(state_fips = sprintf("%02d", FIP))

state_pred <- readRDS(file.path(data_dir, "state_firearm_predictions.rds"))

# State predictions file already has all 30 predictor columns
state_pred_v <- state_pred |> filter(!is.na(gun_law_idx))

X_state <- as.matrix(state_pred_v[, pred_cols])
X_state_s <- scale(X_state, center = svm_center, scale = svm_scale)

state_pred_v$svm_pred <- predict(svm_fit, X_state_s)
state_pred_v$xgb_pred <- pmin(pmax(predict(xgb_fit, X_state), 0), 1)

val <- state_pred_v |>
  left_join(rand_match, by = "state_fips") |>
  filter(!is.na(rand_hfr))

Calibration table

check_cal <- function(pred, actual, name) {
  fit <- lm(actual ~ pred)
  data.frame(
    Model = name,
    R2 = round(cor(pred, actual)^2, 3),
    Slope = round(coef(fit)[2], 3),
    Intercept = round(coef(fit)[1], 3),
    MAE = round(mean(abs(pred - actual)), 3),
    Pred_range = paste0("[", round(min(pred), 2), ", ", round(max(pred), 2), "]")
  )
}

state_cal <- bind_rows(
  check_cal(val$svm_pred, val$rand_hfr, "SVM RBF"),
  check_cal(val$xgb_pred, val$rand_hfr, "XGBoost")
)
kable(state_cal, caption = "State-level validation vs RAND HFR (2008-2012 avg, n=50)")

State-level validation vs RAND HFR (2008-2012 avg, n=50)
	Model	R2	Slope	Intercept	MAE	Pred_range
pred…1	SVM RBF	0.888	0.985	0.023	0.035	[0.11, 0.58]
pred…2	XGBoost	0.915	1.089	-0.030	0.031	[0.14, 0.57]

State validation scatterplots

p <- ggplot(val, aes(x = svm_pred, y = rand_hfr)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = STATE), size = 3, max.overlaps = 50,
                  segment.alpha = 0.3) +
  labs(x = "SVM predicted HFR", y = "RAND HFR (2008-2012 avg)",
       title = "State validation: SVM predictions vs RAND estimates",
       subtitle = sprintf("R² = %.3f, slope = %.3f, intercept = %.3f",
                          cor(val$svm_pred, val$rand_hfr)^2,
                          coef(lm(rand_hfr ~ svm_pred, data = val))[2],
                          coef(lm(rand_hfr ~ svm_pred, data = val))[1])) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "validation_state_svm.png"), p, width = 9, height = 9, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

p <- ggplot(val, aes(x = xgb_pred, y = rand_hfr)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.7) +
  geom_text_repel(aes(label = STATE), size = 3, max.overlaps = 50,
                  segment.alpha = 0.3) +
  labs(x = "XGBoost predicted HFR", y = "RAND HFR (2008-2012 avg)",
       title = "State validation: XGBoost predictions vs RAND estimates",
       subtitle = sprintf("R² = %.3f, slope = %.3f, intercept = %.3f",
                          cor(val$xgb_pred, val$rand_hfr)^2,
                          coef(lm(rand_hfr ~ xgb_pred, data = val))[2],
                          coef(lm(rand_hfr ~ xgb_pred, data = val))[1])) +
  coord_equal(xlim = c(0, 0.7), ylim = c(0, 0.7)) +
  theme_minimal()
ggsave(file.path(fig_dir, "validation_state_xgb.png"), p, width = 9, height = 9, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

County-level validation (vs firearm suicide share)

The firearm suicide share (FS/S = firearm suicides / total suicides) is a widely used proxy for gun ownership (state-level r = 0.86 with actual ownership). We computed county-level FS/S from CDC WONDER data (1999-2020, pooled).

Scaling check: At the state level, the slope of FS/S regressed on actual RAND HFR is 0.787. If our county predictions are correctly scaled, the slope of FS/S on predicted HFR at county level should match. This is a strong test of whether our predictions are on the right scale.

fss <- readRDS(file.path(data_dir, "county_fss_1999_2020.rds"))
gun <- read_parquet(file.path(data_dir, "county_firearm_ownership_estimates.parquet"))

val_fss <- gun |>
  inner_join(fss |> select(GEOID, fss, total_deaths), by = "GEOID") |>
  filter(!is.na(fss), !is.na(firearm_rate_svm))

cat("Counties with FS/S data:", nrow(val_fss), "\n")

## Counties with FS/S data: 2839

# Scaling validation
fit_county_svm <- lm(fss ~ firearm_rate_svm, data = val_fss, weights = total_deaths)
fit_county_xgb <- lm(fss ~ firearm_rate_xgb, data = val_fss, weights = total_deaths)

# State-level ground truth slope
rand_fss <- rand |>
  filter(Year %in% 2008:2012) |>
  mutate(across(c(Fem_FS_S, Male_FS_S), ~ifelse(. == -9, NA, .))) |>
  group_by(FIP, STATE) |>
  summarise(avg_fss = mean((Fem_FS_S + Male_FS_S)/2, na.rm = TRUE),
            hfr = mean(HFR, na.rm = TRUE), .groups = "drop")
state_slope <- coef(lm(avg_fss ~ hfr, data = rand_fss))[2]

fss_cal <- data.frame(
  Level = c("State (ground truth)", "County (SVM)", "County (XGBoost)"),
  `Slope (FS/S ~ HFR)` = round(c(state_slope, coef(fit_county_svm)[2], coef(fit_county_xgb)[2]), 3),
  `Intercept` = round(c(coef(lm(avg_fss ~ hfr, data = rand_fss))[1],
                         coef(fit_county_svm)[1], coef(fit_county_xgb)[1]), 3),
  check.names = FALSE
)
kable(fss_cal, caption = "FS/S scaling validation: county predictions vs state ground truth")

FS/S scaling validation: county predictions vs state ground truth
	Level	Slope (FS/S ~ HFR)	Intercept
hfr	State (ground truth)	0.787	0.137
firearm_rate_svm	County (SVM)	0.790	0.225
firearm_rate_xgb	County (XGBoost)	0.919	0.182

SVM slope (county) matches the state ground truth slope almost exactly, confirming correct scaling.

wr_svm <- cov.wt(cbind(val_fss$firearm_rate_svm, val_fss$fss),
                   wt = val_fss$total_deaths, cor = TRUE)$cor[1,2]

p <- ggplot(val_fss, aes(x = firearm_rate_svm, y = fss)) +
  geom_point(aes(size = total_deaths), alpha = 0.1, color = "grey30") +
  geom_smooth(method = "lm", se = FALSE, color = "steelblue", linewidth = 0.8) +
  scale_size_continuous(range = c(0.3, 5), guide = "none") +
  labs(x = "SVM predicted firearm ownership rate",
       y = "Firearm Suicide Share (FS/S, 1999-2020)",
       title = "County-level validation: SVM predictions vs FS/S proxy",
       subtitle = sprintf("r = %.2f unweighted (n=%d), r = %.2f weighted by deaths",
                          cor(val_fss$firearm_rate_svm, val_fss$fss),
                          nrow(val_fss), wr_svm)) +
  theme_minimal()
ggsave(file.path(fig_dir, "validation_county_fss_svm.png"), p, width = 10, height = 9, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

County-level estimates

cat("Counties:", nrow(gun), "\n")

## Counties: 3115

cat("SVM range:", round(range(gun$firearm_rate_svm), 3), "\n")

## SVM range: 0.113 0.663

cat("SVM mean:", round(mean(gun$firearm_rate_svm), 3),
    "sd:", round(sd(gun$firearm_rate_svm), 3), "\n")

## SVM mean: 0.477 sd: 0.088

Map of estimated firearm ownership

counties_sf <- counties(year = 2020, cb = TRUE) |>
  filter(!STATEFP %in% c("02", "15", "72", "78", "66", "60", "69")) |>
  st_transform(5070)

map_gun <- counties_sf |> left_join(gun, by = "GEOID")

p <- ggplot(map_gun) +
  geom_sf(aes(fill = firearm_rate_svm), color = NA) +
  scale_fill_viridis_c(option = "inferno", labels = scales::percent,
                       name = "Estimated HH\nFirearm Rate",
                       breaks = seq(0.1, 0.7, 0.1)) +
  labs(title = "Estimated household firearm ownership rate by county (SVM)",
       subtitle = "Trained on 118 MMSAs (BRFSS 2002-04), 30 predictors | State R²=0.89 vs RAND | County FS/S slope validated") +
  theme_void() +
  theme(legend.position = c(0.92, 0.3),
        panel.background = element_rect(fill = "white", color = NA))
ggsave(file.path(fig_dir, "map_firearm_rate_svm.png"), p, width = 12, height = 7, dpi = 200)
p

Model comparison table

knitr::include_graphics(file.path(fig_dir, "table_imputation_model_comparison.png"))

SVM RBF selected as final model: best LOOCV R² (0.79), near-perfect state calibration (slope=0.99), and validated county-level FS/S scaling.

Part 2: Race, firearm ownership, and homicide

We fit three models at three levels of analysis:

Demographics alone: non-White race shares (Black, Hispanic, Native, Asian)
Gun ownership alone: household firearm rate
Both: demographics + gun ownership (additive)

Levels: (A) states using RAND HFR, (B) all counties using SVM-imputed HFR, (C) MMSAs only using BRFSS survey-based HFR.

library(modelsummary)

gof_map_short <- list(
  list("raw" = "adj.r.squared", "clean" = "Adj. R²", "fmt" = 3),
  list("raw" = "nobs", "clean" = "N", "fmt" = 0)
)

# Helper: fit 3 models, return named list for modelsummary
fit_3models <- function(df, gun_var = "firearm_ownership") {
  f_demo <- as.formula("hom_rate ~ black_share + hispanic_share + native_share + asian_share + log_pwpd")
  f_gun  <- as.formula(paste("hom_rate ~", gun_var))
  f_both <- as.formula(paste("hom_rate ~ black_share + hispanic_share + native_share + asian_share +", gun_var, "+ log_pwpd"))

  list(
    "Demographics" = lm(f_demo, data = df),
    "Gun ownership" = lm(f_gun, data = df),
    "Both" = lm(f_both, data = df)
  )
}

# Helper: render modelsummary table and save to PNG
ms_table <- function(models, title, filename) {
  coef_labels <- c(
    "black_share" = "Black %",
    "hispanic_share" = "Hispanic %",
    "native_share" = "Native %",
    "asian_share" = "Asian %",
    "firearm_ownership" = "HH firearm ownership",
    "black_share:firearm_ownership" = "Black % x HH firearm ownership",
    "log_pwpd" = "log(pop-weighted density)",
    "(Intercept)" = "(Intercept)"
  )
  tab <- modelsummary(
    models,
    estimate = "{estimate}{stars}",
    statistic = "({std.error})",
    stars = c("***" = 0.001, "**" = 0.01, "*" = 0.05),
    coef_rename = coef_labels,
    gof_map = gof_map_short,
    title = title,
    output = "gt"
  )
  gt::gtsave(tab, file.path(fig_dir, filename))
  tab
}

Data

# County-level data
d <- read_parquet(file.path(data_dir, "analysis_firearm_race.parquet"))
cat("All counties:", nrow(d), "\n")

## All counties: 3112

# State-level: aggregate counties, use RAND HFR
state_pwpd <- readRDS(file.path(data_dir, "state_pop_weighted_density.rds"))

state_d <- d |>
  mutate(state_fips = substr(GEOID, 1, 2)) |>
  group_by(state_fips) |>
  summarise(
    across(c(black_share, hispanic_share, native_share, asian_share),
           ~weighted.mean(., w = total)),
    hom_rate = sum(hom_deaths) / sum(hom_pop) * 100000,
    .groups = "drop"
  ) |>
  left_join(rand_match |> select(state_fips, rand_hfr), by = "state_fips") |>
  left_join(state_pwpd, by = "state_fips") |>
  mutate(log_pwpd = log(pwpd + 1)) |>
  rename(firearm_ownership = rand_hfr) |>
  filter(!is.na(firearm_ownership))
cat("States (with RAND HFR):", nrow(state_d), "\n")

## States (with RAND HFR): 50

# MMSA-level: use survey-based BRFSS firearm rate (no imputation needed)
mmsa_gun <- readRDS(file.path(data_dir, "mmsa_firearm_ownership.rds")) |>
  select(mmsa_code, firearm_ownership = firearm_rate, n_respondents)

# Need MMSA-level homicide and race data
# Aggregate counties to MMSAs using the crosswalk
county_cbsa <- readxl::read_xls(file.path(data_dir, "cbsa_delineation_2013.xls"), skip = 2) |>
  transmute(
    county_fips = paste0(`FIPS State Code`, `FIPS County Code`),
    cbsa_code = stringr::str_pad(`CBSA Code`, 5, pad = "0")
  )

mmsa_d <- d |>
  inner_join(county_cbsa, by = c("GEOID" = "county_fips")) |>
  group_by(cbsa_code) |>
  summarise(
    across(c(black_share, hispanic_share, native_share, asian_share),
           ~weighted.mean(., w = total)),
    hom_rate = sum(hom_deaths) / sum(hom_pop) * 100000,
    pwpd = weighted.mean(pwpd, w = pop_total),
    .groups = "drop"
  ) |>
  mutate(log_pwpd = log(pwpd + 1)) |>
  inner_join(mmsa_gun, by = c("cbsa_code" = "mmsa_code")) |>
  filter(!is.na(firearm_ownership), !is.na(hom_rate))
cat("MMSAs (with BRFSS survey data):", nrow(mmsa_d), "\n")

## MMSAs (with BRFSS survey data): 118

Maps

map_d <- counties_sf |> left_join(d, by = "GEOID")

p <- ggplot(map_d) +
  geom_sf(aes(fill = firearm_ownership), color = NA) +
  scale_fill_viridis_c(option = "inferno", labels = scales::percent,
                       name = "HH Firearm\nRate") +
  labs(title = "Estimated household firearm ownership rate") +
  theme_void() + theme(legend.position = c(0.92, 0.3),
                        panel.background = element_rect(fill = "white", color = NA))
ggsave(file.path(fig_dir, "map_analysis_firearm.png"), p, width = 12, height = 7, dpi = 200)
p

Homicide rate map: see figs/map_homicide_rate.png (from prior analysis).

Black share map: see figs/map_black_share.png (from prior analysis).

Descriptives

d |>
  select(hom_rate, firearm_ownership, black_share, hispanic_share, native_share,
         asian_share) |>
  summary()

##     hom_rate     firearm_ownership  black_share      hispanic_share  
##  Min.   : 0.00   Min.   :0.113     Min.   :0.00000   Min.   :0.0034  
##  1st Qu.: 1.99   1st Qu.:0.441     1st Qu.:0.00535   1st Qu.:0.0237  
##  Median : 3.26   Median :0.490     Median :0.02104   Median :0.0461  
##  Mean   : 4.39   Mean   :0.477     Mean   :0.08786   Mean   :0.0982  
##  3rd Qu.: 5.32   3rd Qu.:0.536     3rd Qu.:0.10188   3rd Qu.:0.1054  
##  Max.   :42.89   Max.   :0.663     Max.   :0.87455   Max.   :0.9768  
##   native_share      asian_share     
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00273   1st Qu.:0.00371  
##  Median :0.00470   Median :0.00615  
##  Mean   :0.01821   Mean   :0.01516  
##  3rd Qu.:0.00937   3rd Qu.:0.01382  
##  Max.   :0.88733   Max.   :0.53020

Correlation matrices

# State-level: include both RAND and BRFSS firearm ownership
state_brfss <- mmsa_d |>
  inner_join(
    county_cbsa |> mutate(state_fips = substr(county_fips, 1, 2)) |>
      left_join(d |> select(GEOID, pop_total), by = c("county_fips" = "GEOID")) |>
      filter(!is.na(pop_total)) |>
      group_by(cbsa_code, state_fips) |> summarise(pop = sum(pop_total), .groups = "drop") |>
      group_by(cbsa_code) |> slice_max(pop, n = 1) |> ungroup() |>
      select(cbsa_code, state_fips),
    by = "cbsa_code"
  ) |>
  group_by(state_fips) |>
  summarise(brfss_ownership = mean(firearm_ownership), .groups = "drop")

state_fss <- readRDS(file.path(data_dir, "state_fss_1999_2020.rds"))

state_d2 <- state_d |>
  left_join(state_brfss, by = "state_fips") |>
  left_join(state_fss |> select(state_fips, fss), by = "state_fips")

cor_vars_s <- c("hom_rate", "firearm_ownership", "brfss_ownership", "fss",
                "black_share", "hispanic_share", "native_share", "asian_share", "log_pwpd")
cor_labels_s <- c("Homicide rate", "RAND HFR", "BRFSS HFR", "Firearm suicide %",
                  "Black %", "Hispanic %", "Native %", "Asian %", "log(pop-wtd density)")

sc <- cor(state_d2[!is.na(state_d2$brfss_ownership), cor_vars_s], use = "complete.obs")
colnames(sc) <- rownames(sc) <- cor_labels_s
n_s <- nrow(sc)

melted_s <- expand.grid(Var1 = cor_labels_s, Var2 = cor_labels_s) |>
  mutate(value = as.vector(sc))

p <- ggplot(melted_s, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = sprintf("%.2f", value)), size = 2.8) +
  scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick",
                       midpoint = 0, limits = c(-1, 1), name = "r") +
  labs(title = sprintf("State-level correlations (n=%d states with both RAND and BRFSS)",
                       sum(!is.na(state_d2$brfss_ownership)))) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_blank())
ggsave(file.path(fig_dir, "cormat_states.png"), p, width = 9, height = 7.5, dpi = 150)
p

# County/MMSA split: upper = all counties (SVM), lower = MMSAs (BRFSS)
shared_vars <- c("hom_rate", "firearm_ownership", "black_share", "hispanic_share",
                 "native_share", "asian_share", "log_pwpd")
shared_labels <- c("Homicide rate", "Firearm own. (SVM)", "Black %", "Hispanic %",
                   "Native %", "Asian %", "log(pop-wtd density)")

county_cor <- cor(d[, shared_vars], use = "complete.obs")
mmsa_brfss <- mmsa_d  # already has firearm_ownership = BRFSS
mmsa_cor <- cor(mmsa_brfss[, shared_vars], use = "complete.obs")

nv <- length(shared_vars)
combined <- matrix(NA, nv, nv)
rownames(combined) <- colnames(combined) <- shared_labels
for (i in 1:nv) for (j in 1:nv) {
  if (i < j) combined[i, j] <- county_cor[i, j]
  else if (i > j) combined[i, j] <- mmsa_cor[i, j]
}

melted_c <- expand.grid(Var1 = shared_labels, Var2 = shared_labels) |>
  mutate(value = as.vector(combined))

p <- ggplot(melted_c, aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size = 2.8) +
  scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick",
                       midpoint = 0, limits = c(-1, 1), name = "r", na.value = "grey90") +
  labs(title = "County/MMSA correlations",
       subtitle = "Upper: all counties, n=3,112, SVM imputed | Lower: MMSAs, n=118, BRFSS survey") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_blank())
ggsave(file.path(fig_dir, "cormat_counties.png"), p, width = 9, height = 7.5, dpi = 150)
p

A. State-level regressions (RAND HFR, n=50)

mods_state <- fit_3models(state_d)
ms_table(mods_state, "State-level: homicide rate per 100k", "table_state_regressions.png")

## file:////tmp/RtmpaiogIy/file53e8a533f46f.html screenshot completed

State-level: homicide rate per 100k
	Demographics	Gun ownership	Both
(Intercept)	9.272*	3.675**	-0.041
	(3.539)	(1.154)	(4.713)
Black %	25.855***		26.708***
	(2.462)		(2.315)
Hispanic %	4.928		6.720*
	(2.710)		(2.607)
Native %	34.326**		19.668
	(9.968)		(10.696)
Asian %	2.055		4.522
	(3.650)		(3.517)
log(pop-weighted density)	-1.125*		-0.232
	(0.530)		(0.590)
HH firearm ownership		4.211	7.214**
		(2.807)	(2.608)
Adj. R²	0.709	0.025	0.747
N	50	50	50

B. County-level regressions (all counties, SVM-imputed HFR)

mods_county <- fit_3models(d)
ms_table(mods_county, "County-level (all): homicide rate per 100k", "table_county_regressions.png")

## file:////tmp/RtmpaiogIy/file53e8a30e46e03.html screenshot completed

County-level (all): homicide rate per 100k
	Demographics	Gun ownership	Both
(Intercept)	0.477	6.900***	-0.743
	(0.399)	(0.413)	(0.704)
Black %	21.785***		21.912***
	(0.365)		(0.369)
Hispanic %	2.304***		2.502***
	(0.397)		(0.408)
Native %	9.831***		9.945***
	(0.774)		(0.775)
Asian %	-8.830***		-7.202***
	(1.905)		(2.055)
log(pop-weighted density)	0.294***		0.354***
	(0.071)		(0.077)
HH firearm ownership		-5.252***	1.689*
		(0.851)	(0.803)
Adj. R²	0.545	0.012	0.546
N	3112	3112	3112

C. MMSA-level regressions (BRFSS survey HFR, no imputation)

mods_mmsa <- fit_3models(mmsa_d)
ms_table(mods_mmsa, "MMSA-level (survey data): homicide rate per 100k", "table_mmsa_regressions.png")

## file:////tmp/RtmpaiogIy/file53e8a13447319.html screenshot completed

MMSA-level (survey data): homicide rate per 100k
	Demographics	Gun ownership	Both
(Intercept)	-0.090	3.995***	-7.336
	(3.223)	(1.077)	(3.904)
Black %	26.908***		27.421***
	(1.736)		(1.682)
Hispanic %	1.755		2.733
	(1.519)		(1.499)
Native %	22.375***		19.880***
	(4.839)		(4.737)
Asian %	-2.822		1.780
	(3.170)		(3.405)
log(pop-weighted density)	0.340		0.980
	(0.474)		(0.503)
HH firearm ownership		5.515*	6.488**
		(2.784)	(2.114)
Adj. R²	0.685	0.024	0.707
N	118	118	118

Interaction models

Adding race x gun ownership interactions at each level:

int_models <- list(
  "State" = lm(hom_rate ~ black_share * firearm_ownership + hispanic_share + native_share + asian_share + log_pwpd, data = state_d),
  "County" = lm(hom_rate ~ black_share * firearm_ownership + hispanic_share + native_share + asian_share + log_pwpd, data = d),
  "MMSA" = lm(hom_rate ~ black_share * firearm_ownership + hispanic_share + native_share + asian_share + log_pwpd, data = mmsa_d)
)

ms_table(int_models, "Interaction models: race x firearm ownership", "table_interaction_regressions.png")

## file:////tmp/RtmpaiogIy/file53e8a58aa8e10.html screenshot completed

Interaction models: race x firearm ownership
	State	County	MMSA
(Intercept)	-1.358	-2.042**	-7.438
	(4.601)	(0.717)	(3.933)
Black %	8.769	41.267***	25.125**
	(9.181)	(2.507)	(7.536)
HH firearm ownership	3.435	5.360***	5.873*
	(3.141)	(0.924)	(2.894)
Hispanic %	5.427*	2.985***	2.609
	(2.600)	(0.408)	(1.557)
Native %	23.228*	10.093***	20.076***
	(10.484)	(0.768)	(4.798)
Asian %	1.232	-6.000**	1.379
	(3.770)	(2.041)	(3.652)
log(pop-weighted density)	0.226	0.259***	1.036
	(0.614)	(0.077)	(0.536)
Black % x HH firearm ownership	42.826	-41.178***	5.438
	(21.259)	(5.276)	(17.393)
Adj. R²	0.764	0.554	0.705
N	50	3112	118

Visualize the interaction (county level)

d$gun_tercile <- cut(d$firearm_ownership,
                      breaks = quantile(d$firearm_ownership, c(0, 1/3, 2/3, 1)),
                      labels = c("Low gun ownership", "Medium", "High gun ownership"),
                      include.lowest = TRUE)

p <- ggplot(d, aes(x = black_share, y = hom_rate, color = gun_tercile)) +
  geom_point(alpha = 0.15, size = 0.8) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
  labs(x = "Black population share", y = "Homicide rate per 100k",
       color = "Firearm ownership",
       title = "Black share vs homicide rate, by firearm ownership level") +
  theme_minimal()
ggsave(file.path(fig_dir, "interaction_black_firearm_hom.png"), p, width = 9, height = 7, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

d$black_tercile <- cut(d$black_share,
                        breaks = quantile(d$black_share, c(0, 1/3, 2/3, 1)),
                        labels = c("Low Black share", "Medium", "High Black share"),
                        include.lowest = TRUE)

p <- ggplot(d, aes(x = firearm_ownership, y = hom_rate, color = black_tercile)) +
  geom_point(alpha = 0.15, size = 0.8) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
  labs(x = "Estimated firearm ownership rate", y = "Homicide rate per 100k",
       color = "Black pop share",
       title = "Firearm ownership vs homicide rate, by Black population share") +
  theme_minimal()
ggsave(file.path(fig_dir, "interaction_firearm_black_hom.png"), p, width = 9, height = 7, dpi = 150)

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

Firearm ownership, race, and homicide: county-level analysis

30 March, 2026