library(readxl)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

Convictions <- read_excel("Strict Liability spread sheet.xlsx")
Arrest <- read_excel("NJ DIH Arrest data 2017-2024.xlsx")
Overdose <-Underlying_Cause_of_Death_1999_2023 <- read_csv("Underlying Cause of Death 1999-2023.csv", 
    col_types = cols(Notes = col_skip(), 
        Year = col_integer(), `Year Code` = col_integer(), 
        Population = col_integer()))
State_OD_18 <- read_csv("Statewide_data_month_2018-2025-5.csv", 
    col_types = cols(Notes = col_skip(), 
        State = col_skip(), `State Code` = col_skip(), 
        Population = col_skip(), `Crude Rate` = col_skip()))
State_OD_mon <- read_csv("State Month Underlying Cause of Death, 1999-2025.csv")

## Rows: 317 Columns: 3

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Month, Month Code, Deaths
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Arrest <- Arrest %>%
  mutate(
    Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
  )

Convictions <- Convictions  %>%
  mutate(
    Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
  )

Convictions <- Convictions %>%
  mutate(
    Ethnicity = ifelse(Ethnicity %in% c("Unknown", "Not Provided", "NA", ""), NA, Ethnicity)
  )

# Create Conviction Year Variable in
Convictions$Sentence.Date <- as.Date(Convictions$`Date of Sentence`)
Convictions$Sentence.Year <- as.numeric(format(Convictions$Sentence.Date, "%Y"))

#Create Conviction Month & Year code
Convictions$`Month Code` <- format(Convictions$Sentence.Date, "%Y/%m")



#Create Year & Month Variable in State_OD
library(dplyr)

State_OD_18 <- State_OD_18 %>%
  mutate(
    Year = as.numeric(sub("/.*", "", `Month Code`)),
    Month = as.numeric(sub(".*/", "", `Month Code`))
  )

State_OD_mon <- State_OD_mon %>%
  mutate(
    Year = as.numeric(sub("/.*", "", `Month Code`)),
    Month = as.numeric(sub(".*/", "", `Month Code`))
  )

# Create Month Code
Arrest$Month_NUM <- match(Arrest$`Arrest Month`, month.name)

Arrest$`Month Code` <- paste0(Arrest$`Arrest Year`, "/", sprintf("%02d", Arrest$Month_NUM))

#Create list of all months  to include; dates: 2017, through 2024.
mons_list <- data.frame(`Month Code` = format(
  seq.Date(
    from = as.Date("2017-01-01"),  # set start
    to   = as.Date("2024-12-31"),  # set  end
    by   = "month"
  ),
  "%Y/%m" 
))

mons_list <- mons_list %>%
  rename(`Month Code` = Month.Code)

#Count arrests
arrests_by_mon <- Arrest %>%
  group_by(`Month Code`) %>%
  summarise(Arrest_Count = n(), .groups = "drop")

# Insert 0's for months not reported as having arrests
arrests_by_mon <- mons_list %>%
  left_join(arrests_by_mon, by = "Month Code") %>%
  mutate(Arrest_Count = ifelse(is.na(Arrest_Count), 0, Arrest_Count))

library(dplyr)
library(tidyr)

# Step 1: Create list of all months  to include; dates: August 5, 1988, through June 23, 2025.
all_months <- data.frame(`Month Code` = format(
  seq.Date(
    from = as.Date("1988-01-01"),  # set  start
    to   = as.Date("2022-12-01"),  # set  end
    by   = "month"
  ),
  "%Y/%m" 
))

all_months <- all_months %>%
  rename(`Month Code` = Month.Code)

# Step 2: Count convictions
conv_by_mon <- Convictions %>%
  group_by(`Month Code`) %>%
  summarise(Conv_Count = n(), .groups = "drop")

# Insert 0's for months not reported as having arrests
conv_by_mon <- all_months %>%
  left_join(conv_by_mon, by = "Month Code") %>%
  mutate(Conv_Count = ifelse(is.na(Conv_Count), 0, Conv_Count))

For the state level data – I am using the population from the counties in the CDC Wonder dataset and just combining them – but may be more accurate if we just find a population dataset for NJ state. I added population estimates from 2024 from the Census Burea website before it was rendered inoperable by the government shutdown.

# 1) Statewide population by year (summing population data across counties to determine state population -- may be best to find population data from another source for the state)
pop_by_year <- Overdose %>%
  group_by(Year) %>%
  summarise(Population = sum(Population, na.rm = TRUE), .groups = "drop")

#Added 2024 population estimates from Census bureau

library(dplyr)
pop_by_year <- pop_by_year %>%
  bind_rows(data.frame(
    Year = 2024,
    Population = 9500851
  ))


# 2) Arrests by year
arrests_by_year <- Arrest %>%
  group_by(`Arrest Year`) %>%
  summarise(Arrest_Count = n(), .groups = "drop") %>%
  rename(Year = `Arrest Year`)

# 3) Convictions by year 
#convictions_by_year_filter code removed

convictions_by_year<- Convictions %>%
  group_by(Sentence.Year) %>%
  summarise(Conviction_Count = n(), .groups = "drop") %>%
  rename(Year = Sentence.Year)

# 4) Deaths by year (statewide)

deaths_by_year <- State_OD_18 %>%
  group_by(Year) %>%
  summarise(Count = sum(Deaths, na.rm = TRUE), .groups = "drop")

#ARRESTS 2017-2024

library(ggplot2)

ggplot(arrests_by_year, aes(x = `Year`, y = Arrest_Count)) +
  geom_line(group = 1, color = "steelblue", size = 1.2) +
  geom_point(size = 2) +
  geom_text(aes(label = Arrest_Count), vjust = -0.5) +
  labs(title = "Drug-Induced Arrests by Year",
       x = "Year", y = "Arrest Count") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

@Jennifer, We need to double check that that arrest data is complete through 2024. I did not pull it…. looking at the dataset the last arrest included is September of 2024.

library(knitr)
county_summary_arr <- Arrest %>%
  count(County) %>%
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

kable(county_summary_arr,
      caption = "Table: Arrests by County",
      col.names = c("County", "Count", "Percent"),
      align = "lrr")

Table: Arrests by County
County	Count	Percent
Atlantic	57	15.5
Burlington	56	15.3
Cape May	47	12.8
Ocean	37	10.1
Middlesex	31	8.4
Statewide or Regional LEA	22	6.0
Monmouth	20	5.4
Morris	20	5.4
Bergen	18	4.9
Gloucester	14	3.8
Passaic	13	3.5
Essex	9	2.5
Cumberland	5	1.4
Hunterdon	4	1.1
Camden	3	0.8
Mercer	3	0.8
Salem	2	0.5
Union	2	0.5
Warren	2	0.5
Hudson	1	0.3
Somerset	1	0.3

##Strict Liability Conviction Data – 1988 - April 2025

library(dplyr)
library(ggplot2)

# Line plot
ggplot(convictions_by_year, aes(x = Year, y = Conviction_Count)) +
  geom_line(color = "darkblue", size = 1.2) +
  geom_point(size = 2) +
  geom_text(aes(label = Conviction_Count), vjust = -0.7, size = 3.5) +  # adds counts above points
  scale_x_continuous(breaks = seq(min(convictions_by_year$Year, na.rm = TRUE),
                                  max(convictions_by_year$Year, na.rm = TRUE),
                                  by = 2)) +
  labs(title = "Number of DID Convictions by Year",
       x = "Year",
       y = "Number of Convictions") +
  theme_minimal()

library(knitr)
county_summary_conv <- Convictions %>%
  count(County) %>%
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

kable(county_summary_conv,
      caption = "Table: Convictions by County 1999- 2025(June)",
      col.names = c("County", "Count", "Percent"),
      align = "lrr")

Table: Convictions by County 1999- 2025(June)
County	Count	Percent
Cape May	68	17.7
Atlantic	66	17.1
Burlington	56	14.5
Middlesex	34	8.8
Ocean	26	6.8
Morris	20	5.2
Gloucester	18	4.7
Passaic	16	4.2
Monmouth	13	3.4
Camden	9	2.3
Hunterdon	9	2.3
Somerset	9	2.3
Essex	8	2.1
Hudson	6	1.6
Mercer	6	1.6
Bergen	5	1.3
Sussex	5	1.3
Union	5	1.3
Warren	4	1.0
Cumberland	2	0.5

Overdose

library(dplyr)
library(ggplot2)

# Summarize total deaths by year

State_OD_mon$Deaths <- as.numeric(State_OD_mon$Deaths)

## Warning: NAs introduced by coercion

deaths_by_year_tot <- State_OD_mon %>%
  group_by(Year) %>%
  summarise(Count = sum(Deaths, na.rm = TRUE), .groups = "drop")

# Line chart of total deaths over time
ggplot(deaths_by_year_tot, aes(x = Year, y = Count)) +
  geom_line(color = "darkred", size = 1.2) +
  geom_point(size = 2) +
  geom_text(aes(label = Count), vjust = -0.5, size = 3) +
  labs(title = "Total Overdose Deaths by Year",
       x = "Year", y = "Number of Deaths") +
  theme_minimal()

library(dplyr)
#Combine into one dataset
combined_data <- full_join(
  dplyr::select(arrests_by_year, Year, Arrest_Count),
  dplyr::select(deaths_by_year, Year, Count),
  by = "Year"
)

library(ggplot2)

# Choose a scaling factor to bring arrests to the same ballpark as deaths
scale_factor <- 30  # adjust as needed

ggplot(combined_data, aes(x = Year)) +
  geom_line(aes(y = Count, color = "Overdose Deaths"), size = 1.2) +
  geom_line(aes(y = Arrest_Count * scale_factor, color = "Arrests"), size = 1.2) +
  scale_y_continuous(
    name = "Overdose Deaths",
    sec.axis = sec_axis(~./scale_factor, name = "Arrests")
  ) +
  scale_color_manual(values = c("Overdose Deaths" = "blue", "Arrests" = "red")) +
  labs(
    title = "Overdose Deaths vs Arrests by Year (Dual Y-Axis)",
    x = "Year",
    color = "Metric"
  ) +
  theme_minimal()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Visually there does not appear to be a relationships whereby Arrests decrease when overdose deaths decrease. This means that the number of overdose deaths (the underlying incident rate) does not seem to predict whether or not there are arrests. Of course, we will test this later. It also doesn’t appear that arrests result in less overdoses the next year…

library(tidyr)

combined_long <- combined_data %>%
  pivot_longer(cols = c(Arrest_Count, Count), names_to = "Metric", values_to = "Value")

ggplot(combined_long, aes(x = Year, y = Value)) +
  geom_line(color = "steelblue", size = 1.2) +
  facet_wrap(
    ~ Metric,
    scales = "free_y",
    labeller = as_labeller(
      c("Count" = "Overdose Deaths", "Arrest_Count" = "Arrests")
    )
  ) +
  labs(
    title = "Arrests vs Overdose Deaths by Year",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

Transformed to Z scores…

combined_data_z <- combined_data %>%
  mutate(
    Arrest_z = scale(Arrest_Count),
    Deaths_z = scale(Count)
  )

ggplot(combined_data_z, aes(x = Year)) +
  geom_line(aes(y = Arrest_z, color = "Arrests"), size = 1.2) +
  geom_line(aes(y = Deaths_z, color = "Overdose Deaths"), size = 1.2) +
  labs(title = "Z-Score Comparison: Arrests vs Overdose Deaths",
       y = "Z-Score (Standardized Values)",
       x = "Year",
       color = "Metric") +
  theme_minimal()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Arrest_OD_month <- left_join(arrests_by_mon, State_OD_mon, by = "Month Code")

Arrest_OD_month <- Arrest_OD_month %>%
  left_join(pop_by_year, by = "Year")

#Convictions/Overdoses

convictions_filtered <- Convictions %>%
  filter(Sentence.Year >= 2018, Sentence.Year <= 2025)


#Create aggregated count of convictions by month for the entire dataset time period
conv_sum <- Convictions %>%
  filter(!is.na(`Month Code`)) %>%
  group_by(`Month Code`) %>%
  summarise(Conviction_Count = n()) %>%
  arrange(`Month Code`)

library(dplyr)

combined_df_3 <- conv_by_mon %>%
  left_join(State_OD_mon,       by = "Month Code") 

#Filter out prior to 1999

combined_df_3 <- combined_df_3 %>%
  mutate(Month_Date = as.Date(paste0(`Month Code`, "/01"), format = "%Y/%m/%d"))

combined_df_3_filtered <- combined_df_3 %>%
  filter(Month_Date >= as.Date("1999-01-01")) #because this is as far back as the OD data goes back

#Add population to the dataset

combined_df_3 <-combined_df_3 %>%
  left_join(pop_by_year, by = "Year")
combined_df_3_filtered <- combined_df_3_filtered %>%
  left_join(pop_by_year, by = "Year")

combined_yearly <- deaths_by_year_tot %>%
  left_join(convictions_by_year,       by = "Year") 

#filter dates

combined_yearly <- combined_yearly %>%
   filter(Year >= as.Date(1999)) 

# Add 0's for convictions for N/A
combined_yearly <- combined_yearly %>%
  replace_na(list(
    Conviction_Count = 0
  ))

library(ggplot2)

scale_factor <- 20  # Adjust to visually match scales

ggplot(combined_yearly, aes(x = Year)) +
  geom_line(aes(y = Count, color = "Overdose Deaths"), size = 1.2) +
  geom_line(aes(y = Conviction_Count * scale_factor, color = "Convictions"), size = 1.2) +
  scale_y_continuous(
    name = "Overdose Deaths",
    sec.axis = sec_axis(~./scale_factor, name = "Convictions")
  ) +
  scale_color_manual(
    values = c("Overdose Deaths" = "blue", "Convictions" = "red")
  ) +
  labs(
    title = "Overdose Deaths vs Convictions by Year",
    x = "Year",
    color = "Metric"
  ) +
  theme_minimal()

State – further statistical analysis

Arrests and Overdose deaths

Correlation

Correlations between arrests & overdose deaths since data is count – ran a spearman rank correlation

cor.test(Arrest_OD_month$Arrest_Count,
         Arrest_OD_month$Deaths,
         method = "spearman")

## Warning in cor.test.default(Arrest_OD_month$Arrest_Count,
## Arrest_OD_month$Deaths, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  Arrest_OD_month$Arrest_Count and Arrest_OD_month$Deaths
## S = 113708, p-value = 0.02496
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2287815

There was a statistically significant, positive association between the two variables, ρ = 0.229, p = .025. This suggests that months with higher numbers of arrests tend to also have higher numbers of overdose deaths.

###Poisson Model Arrest_OD_month

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Poisson regression: model overdose deaths as count
poisson_model1 <- glm(
  Deaths ~ Arrest_Count +  offset (log(Population)),
  family = poisson(link = "log"),
  data = Arrest_OD_month
)

summary(poisson_model1)

## 
## Call:
## glm(formula = Deaths ~ Arrest_Count + offset(log(Population)), 
##     family = poisson(link = "log"), data = Arrest_OD_month)
## 
## Coefficients:
##                Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)  -10.675628   0.010808 -987.791  < 2e-16 ***
## Arrest_Count   0.017381   0.002119    8.201 2.39e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 778.99  on 95  degrees of freedom
## Residual deviance: 712.84  on 94  degrees of freedom
## AIC: 1411.8
## 
## Number of Fisher Scoring iterations: 4

exp(coef(poisson_model1))

##  (Intercept) Arrest_Count 
## 2.310116e-05 1.017533e+00

# Overdispersion test
overdispersion <- sum(residuals(poisson_model1, type = "pearson")^2) / df.residual(poisson_model1)
overdispersion

## [1] 7.051196

Model: Deaths ~ Arrest_Count + offset (log(Population))

Overdispersion Detected - 7.05 (A value > 1 means variance > mean → Poisson assumptions are violated)

A Poisson regression model indicated that arrest counts were associated with overdose deaths (p < 0.001), with a 1-unit increase in arrests associated with a 1.75% increase in expected deaths, after controlling for population. Despite these findings, the model exhibited substantial overdispersion (dispersion = 7.05), suggesting that the Poisson assumptions were violated and that standard errors and significance tests may be unreliable.

Negative-Binomial model may be more appropriate for model.

Negative Binomial Model

library(MASS)
nb_model <- glm.nb(Deaths ~ Arrest_Count + offset (log(Population)), data = Arrest_OD_month)
summary(nb_model)

## 
## Call:
## glm.nb(formula = Deaths ~ Arrest_Count + offset(log(Population)), 
##     data = Arrest_OD_month, init.theta = 32.30496007, link = log)
## 
## Coefficients:
##                Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)  -10.673850   0.030274 -352.578  < 2e-16 ***
## Arrest_Count   0.017343   0.006095    2.845  0.00444 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(32.305) family taken to be 1)
## 
##     Null deviance: 105.788  on 95  degrees of freedom
## Residual deviance:  97.835  on 94  degrees of freedom
## AIC: 997.36
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  32.30 
##           Std. Err.:  5.39 
## 
##  2 x log-likelihood:  -991.361

exp(coef(nb_model))

##  (Intercept) Arrest_Count 
## 2.314226e-05 1.017494e+00

# Overdispersion test
overdispersion2 <- sum(residuals(nb_model, type = "pearson")^2) / df.residual(nb_model)
overdispersion2

## [1] 0.8896591

glm.nb(Deaths ~ Arrest_Count + offset (log(Population)), data = Arrest_OD_month)

overdispersion2= 0.8896591, no overdispersion

AIC = 997.36 — much lower than Poisson model (1411.8), so NB is clearly a better fit.

A Negative Binomial regression model was fitted to examine the association between monthly overdose deaths, arrest counts, and population size, addressing overdispersion observed in the earlier Poisson model. Arrest counts were associated with overdose deaths (p < 0.01), with a 1-unit increase in arrests associated with a 1.7% increase in expected deaths, after controlling for population.

The model showed no evidence of overdispersion (dispersion = 0.8896591), indicating a good fit. Compared to the Poisson model, the negative binomial regression provided a substantially improved fit (AIC = 997.36 vs. 1411.8), confirming it as the more appropriate model for overdispersed count data.

Negative Binomial Model with 1 month time lag.

library(dplyr)

Arrest_OD_month <- Arrest_OD_month %>%
  mutate(Arrest_Count_lag1 = lag(Arrest_Count, n = 1))

library(MASS)

nb_model_lag <- glm.nb(Deaths ~ Arrest_Count_lag1 + offset (log(Population)), data = Arrest_OD_month)
summary(nb_model_lag)

## 
## Call:
## glm.nb(formula = Deaths ~ Arrest_Count_lag1 + offset(log(Population)), 
##     data = Arrest_OD_month, init.theta = 30.97458706, link = log)
## 
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)       -10.662585   0.031235 -341.368   <2e-16 ***
## Arrest_Count_lag1   0.014060   0.006262    2.245   0.0247 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(30.9746) family taken to be 1)
## 
##     Null deviance: 101.667  on 94  degrees of freedom
## Residual deviance:  96.729  on 93  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 990.24
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  30.97 
##           Std. Err.:  5.16 
## 
##  2 x log-likelihood:  -984.239

exp(coef(nb_model_lag))

##       (Intercept) Arrest_Count_lag1 
##      2.340443e-05      1.014160e+00

# Overdispersion test
overdispersion3 <- sum(residuals(nb_model_lag, type = "pearson")^2) / df.residual(nb_model_lag)
overdispersion3

## [1] 0.8930095

A negative binomial regression was conducted to test whether lagged arrest counts predict monthly overdose deaths, while controlling for population size. Arrest counts were lagged by one month to assess potential delayed effects. The model revealed that lagged arrests were significantly associated with overdose deaths (p < 0.05) after controlling for population, indicating a measurable delayed impact, with 1 arrest associated with a 1.4% increase in overdose deaths the following month.

The model demonstrated good fit to the data, with a dispersion statistic below 1 (0.89) and a slightly lower AIC (990.24) compared to the non-lagged model (AIC = 997.36). These findings suggest that changes in arrests may influence monthly overdose mortality, when accounting for temporal lag of 1 month.

I further compare these models below…

# Confidence intervals (exp) for the original NB model
exp_confint_nb_1 <- exp(confint(nb_model))

## Waiting for profiling to be done...

# Confidence intervals (exp) for the lagged NB model
exp_confint_lag_1 <- exp(confint(nb_model_lag))

## Waiting for profiling to be done...

anova(nb_model, nb_model_lag, test = "Chisq")

## Likelihood ratio tests of Negative Binomial Models
## 
## Response: Deaths
##                                         Model    theta Resid. df
## 1      Arrest_Count + offset(log(Population)) 32.30496        94
## 2 Arrest_Count_lag1 + offset(log(Population)) 30.97459        93
##      2 x log-lik.   Test    df LR stat.     Pr(Chi)
## 1       -991.3613                                  
## 2       -984.2395 1 vs 2     1 7.121889 0.007614845

Does including Arrest_Count_lag1 improve the model significantly more than Arrest_Count?

model 2 has a higher log-likelihood (less negative), it fits the data better. This difference is statistically significant.

A likelihood ratio test comparing two Negative Binomial models showed that including the lagged arrest count (Arrest_Count_lag1) significantly improved model fit compared to the current month’s arrest count (Arrest_Count), χ²(1) = 7.12, p = 0.0076.

library(DHARMa)

## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')

# For original model
sim1 <- simulateResiduals(nb_model)
plot(sim1)

Panel 1 (Left): QQ Plot of Residuals for Negative Binomial Model This panel compares the distribution of simulated residuals to what we’d expect under a perfectly fitting model. KS test (Kolmogorov-Smirnov test): p = 0.02974 → this is statistically significant at the 0.05 level. ❗ This indicates non-uniformity in residuals — i.e., the residuals do not match the expected distribution. Conclusion: There is some systematic deviation, possibly suggesting model misspecification (e.g., wrong functional form, missing predictors, etc.). Dispersion test (p = 0.4): ✅ Not significant → the variance of the residuals is appropriate. No overdispersion or underdispersion is detected. Outlier test (p = 0.7): ✅ Not significant → no evidence of extreme outliers beyond expectation.

Panel 2 (Right): Residuals vs. Predicted Values for Negative Binomial Model This panel plots DHARMa residuals against predicted values. Red curves represent quantile deviations (like a nonparametric confidence band). ❗ The residuals deviate systematically from what is expected under the null hypothesis of perfect fit. ❗ Combined adjusted quantile test significant” means: The model shows non-random patterns in the residuals. Likely indicates nonlinearity or misspecification (e.g., maybe the model needs interaction terms, a different predictor form, or additional covariates).

# For lagged model
sim2 <- simulateResiduals(nb_model_lag)
plot(sim2)

For the Lagged NB Model:

Here again there does not seem to be any issues related to Dispersion.

Left Panel: QQ Plot of Residuals This compares observed vs. expected uniform quantiles. Ideally, points fall along the 45° line. KS test (Kolmogorov-Smirnov): p = 0.02015 → significant at p < 0.05 ❗ This means: the model’s residuals significantly deviate from what is expected under a perfect fit. Suggests non-uniformity, which could imply model misspecification (e.g., missing variable, wrong functional form). Dispersion test (p = 0.48): ✅ Not significant → No evidence of overdispersion or underdispersion. YModel’s Variance structure is okay. Outlier test (p = 0.22): ✅ Not significant → No evidence of extreme outliers.

Right Panel: Residuals vs. Predicted Values This plots DHARMa residuals vs. model predictions (rank-transformed). Red curves (quantile bands): The red bands depart from the horizontal line, especially in the upper and middle ranges. These indicate significant quantile deviations, which again suggests systematic bias in residuals across predicted values. ❗ Combined adjusted quantile test significant reinforces this interpretation: residuals are not independent of the fitted values, which breaks a core assumption of model.

Summary of Interpretation: ✅ No major issues with dispersion or outliers. ❗ BUT, both the KS test and residual vs. prediction plot show significant deviations, suggesting this model does not fully capture the structure in the data. We may need to: -Try different predictors (e.g., lag terms, nonlinear terms), -Or explore interaction effects. - or try ZNIB model

library(glmmTMB)
zinb_model <- glmmTMB(Deaths ~ Arrest_Count + offset(log(Population)), 
                      ziformula = ~1,  # intercept-only zero-inflation
                      family = nbinom2, data = Arrest_OD_month)
summary(zinb_model)

##  Family: nbinom2  ( log )
## Formula:          Deaths ~ Arrest_Count + offset(log(Population))
## Zero inflation:          ~1
## Data: Arrest_OD_month
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     999.4    1009.6    -495.7     991.4        92 
## 
## 
## Dispersion parameter for nbinom2 family (): 32.3 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -10.67385    0.03056  -349.3  < 2e-16 ***
## Arrest_Count   0.01734    0.00619     2.8  0.00508 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Zero-inflation model:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -21.38    4492.02  -0.005    0.996

overdispersion23 <- sum(residuals(zinb_model, type = "pearson")^2) / df.residual(nb_model_lag)
overdispersion23

## [1] 0.8992278

AIC for nb_model_lag (better model): 990.24 AIC for zinb_model: 999.4

The Negative Binomial model with lagged arrest counts provides a better fit to the data than the Zero-Inflated Negative Binomial model, based on AIC comparison.

Among several models tested, the Negative Binomial model including a one-month lag of arrest counts (Arrest_Count_lag1) yielded the lowest AIC (990.24), outperforming both the standard NB model and the Zero-Inflated NB model.

Since the ZINB did not solve our problem, I am going to try a non-linear model…

#Polynomial Lagged model
nb_poly_model <- glm.nb(
  Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) + offset(log(Population)),
  data = Arrest_OD_month
)

#Spline model
library(splines)

nb_spline_model <- glm.nb(
  Deaths ~ ns(Arrest_Count_lag1, df = 3) + offset(log(Population)),
  data = Arrest_OD_month
)

AIC(nb_model_lag, nb_poly_model, nb_spline_model)

##                 df      AIC
## nb_model_lag     3 990.2395
## nb_poly_model    4 989.8070
## nb_spline_model  5 990.6807

The polynomial model has the best AIC, although all three are very close

library(DHARMa)

# Check each model
simulateResiduals(nb_model_lag) %>% plot()

nb_model_lag: ❌ KS test significant (p = 0.020): residuals deviate from expected ❗ Red quantile bands curved → residual pattern suggests misspecification

simulateResiduals(nb_poly_model) %>% plot()

nb_poly_model: ✅ KS test not significant (p = 0.0718) ✅ Residuals align better with expected quantiles 👍 Still shows some quantile deviation, but much improved fit vs linear

simulateResiduals(nb_spline_model) %>% plot()

nb_spline_model: ❌ KS test significant again (p = 0.01491) Residual vs. predicted curve is wavy, maybe overfitting AIC is slightly higher — so not much benefit despite flexibility

A quadratic term for lagged arrest counts improved model fit (AIC = 989.8) and resolved residual issues present in the linear model. This suggests that the association between arrests and overdose deaths is nonlinear, with the marginal effect of arrests changing depending on the arrest level.

Although the polynomial model improved fit and resolved most residual issues, the DHARMa quantile test remained significant, suggesting some remaining nonlinearity. More flexible models like GAMs may further reduce these residual patterns.

library(MASS)
#Increase Polynomial degree
nb_poly3 <- glm.nb(Deaths ~ poly(Arrest_Count_lag1, 3, raw = TRUE) + offset(log(Population)), data = Arrest_OD_month)
summary(nb_poly3)

## 
## Call:
## glm.nb(formula = Deaths ~ poly(Arrest_Count_lag1, 3, raw = TRUE) + 
##     offset(log(Population)), data = Arrest_OD_month, init.theta = 32.21751312, 
##     link = log)
## 
## Coefficients:
##                                           Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                             -1.073e+01  5.042e-02 -212.885   <2e-16
## poly(Arrest_Count_lag1, 3, raw = TRUE)1  7.495e-02  4.096e-02    1.830   0.0672
## poly(Arrest_Count_lag1, 3, raw = TRUE)2 -1.020e-02  8.676e-03   -1.175   0.2399
## poly(Arrest_Count_lag1, 3, raw = TRUE)3  4.348e-04  4.943e-04    0.880   0.3791
##                                            
## (Intercept)                             ***
## poly(Arrest_Count_lag1, 3, raw = TRUE)1 .  
## poly(Arrest_Count_lag1, 3, raw = TRUE)2    
## poly(Arrest_Count_lag1, 3, raw = TRUE)3    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(32.2175) family taken to be 1)
## 
##     Null deviance: 105.181  on 94  degrees of freedom
## Residual deviance:  96.817  on 91  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 991.04
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  32.22 
##           Std. Err.:  5.40 
## 
##  2 x log-likelihood:  -981.044

simulateResiduals(nb_poly3) %>% plot()

We fitted a cubic negative binomial model (nb_poly3) to account for potential nonlinearity. While the dispersion and outlier tests showed no significant deviation, both the Kolmogorov-Smirnov and adjusted quantile tests from the DHARMa package suggested residual non-uniformity, indicating potential model misspecification. Further it shows a worse model fit with a slightly higher AIC than the polynomial model (991.04 vs. 989.81).

So far the polynomial nb is the best fit.

A GAM may provide a better fit by flexibly capturing nonlinear patterns.

#Generalized Additive Model (GAM) for better smoothing:
library(mgcv)

## Loading required package: nlme

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## This is mgcv 1.9-4. For overview type '?mgcv'.

gam_model <- gam(Deaths ~ s(Arrest_Count_lag1) + offset(log(Population)), 
                 family = nb(), data = Arrest_OD_month)
summary(gam_model)

## 
## Family: Negative Binomial(30.648) 
## Link function: log 
## 
## Formula:
## Deaths ~ s(Arrest_Count_lag1) + offset(log(Population))
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.60859    0.01975    -537   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df Chi.sq p-value  
## s(Arrest_Count_lag1) 1.64  2.042  6.118  0.0508 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  -0.0694   Deviance explained = 6.62%
## -REML = 497.96  Scale est. = 1         n = 95

GAMs are nonparametric smoothers, often outperforming polynomials or splines in flexibility without overfitting.

library(mgcViz)

## Loading required package: qgam

## Registered S3 method overwritten by 'mgcViz':
##   method from   
##   +.gg   ggplot2

## 
## Attaching package: 'mgcViz'

## The following objects are masked from 'package:stats':
## 
##     qqline, qqnorm, qqplot

simulateResiduals(gam_model) %>% plot()

Conclusion: The polynomial model shows slightly better residual uniformity based on the KS test than the GAMM.

AIC(gam_model)

## [1] 990.5519

Polynomial NB also has slightly better AIC (AIC = 989.8 versus 990.5519).

Summary Polynomial NB model is slightly favored based on AIC and KS test. GAM offers smoother, more flexible fitting and may still be useful for capturing nonlinear trends if interpretability or visual diagnostics suggest it’s better. From a model selection standpoint, unless we have strong theoretical reasons to prefer smoothing, Polynomial NB is the slightly better choice here.

However, because there are strong arguments also for GAMM models as being preferable – I’m going to interpret both here.

Interpreting the Polynomial Model

summary(nb_poly_model)

## 
## Call:
## glm.nb(formula = Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) + 
##     offset(log(Population)), data = Arrest_OD_month, init.theta = 31.91663863, 
##     link = log)
## 
## Coefficients:
##                                           Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                             -10.707879   0.041862 -255.787   <2e-16
## poly(Arrest_Count_lag1, 2, raw = TRUE)1   0.042992   0.019286    2.229   0.0258
## poly(Arrest_Count_lag1, 2, raw = TRUE)2  -0.002706   0.001712   -1.581   0.1140
##                                            
## (Intercept)                             ***
## poly(Arrest_Count_lag1, 2, raw = TRUE)1 *  
## poly(Arrest_Count_lag1, 2, raw = TRUE)2    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(31.9166) family taken to be 1)
## 
##     Null deviance: 104.334  on 94  degrees of freedom
## Residual deviance:  96.798  on 92  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 989.81
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  31.92 
##           Std. Err.:  5.34 
## 
##  2 x log-likelihood:  -981.807

exp(coef(nb_poly_model))

##                             (Intercept) poly(Arrest_Count_lag1, 2, raw = TRUE)1 
##                             0.000022368                             1.043929914 
## poly(Arrest_Count_lag1, 2, raw = TRUE)2 
##                             0.997298063

Linear term is significant and positive Interpretation: For small values of arrest count, an increase in arrests is associated with a higher risk of overdose deaths in the next time period. Since the model uses a log link, the coefficient of 0.043 implies: → A 1-unit increase in lagged arrest count is associated with a 4.4% increase in overdose deaths (holding population constant).

Quadratic term is negative but not significant The negative value suggests the relationship bends downward, i.e., the effect of arrests flattens or reverses at higher arrest counts. But since the p-value = 0.114, it’s not statistically significant → this quadratic term may not be needed.

This produces a parabola that opens downward — suggesting that at very high arrest levels, the effect on deaths might decrease or plateau, though we cannot say this with statistical confidence.

#Test for Autocorrelation
testTemporalAutocorrelation(simulateResiduals(nb_poly_model), time = Arrest_OD_month$Month_Code)

## DHARMa::testTemporalAutocorrelation - no time argument provided, using random times for each data point

## 
##  Durbin-Watson test
## 
## data:  simulationOutput$scaledResiduals ~ 1
## DW = 1.975, p-value = 0.9029
## alternative hypothesis: true autocorrelation is not 0

Temporal Autocorrelation Test (Durbin-Watson) for NB polynomial model Durbin-Watson p = 0.99956 Interpretation: No significant autocorrelation in residuals. Visual confirmation: ACF bars (right plot) mostly within blue bounds; residuals (left plot) show no visible pattern over time.

polynomial NB model: Has slightly better AIC than GAM Has clean residuals No autocorrelation Improves fit over a simple linear NB

But because of the theoretical reasosn to prefer smoothing…also going to report the GAMM

summary(gam_model)

## 
## Family: Negative Binomial(30.648) 
## Link function: log 
## 
## Formula:
## Deaths ~ s(Arrest_Count_lag1) + offset(log(Population))
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.60859    0.01975    -537   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df Chi.sq p-value  
## s(Arrest_Count_lag1) 1.64  2.042  6.118  0.0508 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  -0.0694   Deviance explained = 6.62%
## -REML = 497.96  Scale est. = 1         n = 95

exp(coef(gam_model))

##            (Intercept) s(Arrest_Count_lag1).1 s(Arrest_Count_lag1).2 
##           2.470295e-05           9.782360e-01           1.020862e+00 
## s(Arrest_Count_lag1).3 s(Arrest_Count_lag1).4 s(Arrest_Count_lag1).5 
##           1.007248e+00           1.015289e+00           1.011148e+00 
## s(Arrest_Count_lag1).6 s(Arrest_Count_lag1).7 s(Arrest_Count_lag1).8 
##           9.855573e-01           1.001127e+00           1.059593e+00 
## s(Arrest_Count_lag1).9 
##           1.049867e+00

This generalized additive model estimates the relationship between overdose deaths and the lagged arrest count, using a smoothed spline term to capture potential nonlinearity. The parametric intercept is highly significant, indicating a very low baseline risk when the arrest count is near zero. The smooth term s(Arrest_Count_lag1) has an effective degrees of freedom (edf) of 1.64, suggesting mild nonlinearity in the relationship, though not extreme. The p-value for the smooth term is 0.0508, right on the border of statistical significance, indicating that the arrest-death relationship may not be strictly linear, but the evidence is modest.

The exponentiated partial effects (in the exp(coef(gam_model)) output) suggest that the local incidence rate ratios (IRRs) across the smoothed arrest count values vary only modestly, mostly clustering around 1.00–1.06. This implies that the effect of arrest counts on overdose deaths is positive but not dramatically varying across the range of arrest counts.

Overall, the GAM provides a flexible fit with slight improvement in residual behavior over strictly parametric models, as seen in the diagnostic plots. However, given the near-significance and mild nonlinearity, this model may be roughly equivalent in fit to simpler polynomial models, which also had comparable or slightly better AIC values.

When the lagged arrest count is at its reference value, the model predicts approximately 2.47 overdose deaths per 100,000 population, per month.

The generalized additive model (GAM) explained only 6.62% of the deviance in overdose deaths, indicating that while the smooth function of lagged arrest counts may capture some nonlinear trends, the model leaves a large portion of the variation unexplained. This suggests that other important predictors or structural factors may be missing from the model.

controlling for COVID-era effects using a factor variable, which allows the model to compare multiple time periods to the reference category (2023/2024).

Arrest_OD_month <- Arrest_OD_month %>%
  mutate(
    COVID_era = case_when(
      Year < 2020 ~ "Pre",
      Year == 2020 ~ "2020",
      Year == 2021 ~ "2021",
      Year == 2022 ~ "2022",
      TRUE ~ "2023_24"
    )
  )

nb_model_poly_covid <- glm.nb(
  Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) + factor(COVID_era) + offset(log(Population)),
  data = Arrest_OD_month)
summary(nb_model_poly_covid)

## 
## Call:
## glm.nb(formula = Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) + 
##     factor(COVID_era) + offset(log(Population)), data = Arrest_OD_month, 
##     init.theta = 64.44430919, link = log)
## 
## Coefficients:
##                                           Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                             -10.582873   0.051041 -207.340  < 2e-16
## poly(Arrest_Count_lag1, 2, raw = TRUE)1   0.025024   0.014707    1.701   0.0889
## poly(Arrest_Count_lag1, 2, raw = TRUE)2  -0.002027   0.001308   -1.549   0.1213
## factor(COVID_era)2021                     0.029450   0.057327    0.514   0.6074
## factor(COVID_era)2022                     0.006358   0.057661    0.110   0.9122
## factor(COVID_era)2023_24                 -0.305412   0.051402   -5.942 2.82e-09
## factor(COVID_era)Pre                     -0.019873   0.047650   -0.417   0.6766
##                                            
## (Intercept)                             ***
## poly(Arrest_Count_lag1, 2, raw = TRUE)1 .  
## poly(Arrest_Count_lag1, 2, raw = TRUE)2    
## factor(COVID_era)2021                      
## factor(COVID_era)2022                      
## factor(COVID_era)2023_24                ***
## factor(COVID_era)Pre                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(64.4443) family taken to be 1)
## 
##     Null deviance: 184.853  on 94  degrees of freedom
## Residual deviance:  98.688  on 88  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 944.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  64.4 
##           Std. Err.:  12.5 
## 
##  2 x log-likelihood:  -928.305

exp(coef(nb_model_poly_covid))

##                             (Intercept) poly(Arrest_Count_lag1, 2, raw = TRUE)1 
##                            2.534642e-05                            1.025340e+00 
## poly(Arrest_Count_lag1, 2, raw = TRUE)2                   factor(COVID_era)2021 
##                            9.979750e-01                            1.029888e+00 
##                   factor(COVID_era)2022                factor(COVID_era)2023_24 
##                            1.006378e+00                            7.368198e-01 
##                    factor(COVID_era)Pre 
##                            9.803228e-01

library(mgcViz)
simulateResiduals(nb_model_poly_covid) %>% plot()

controlling for temporal COVID-era effects attenuates the relationship between arrests and deaths, suggesting the association may be partly confounded by time-related factors.

anova(nb_poly_model, nb_model_poly_covid, test = "LRT")

## Warning in anova.negbin(nb_poly_model, nb_model_poly_covid, test = "LRT"): only
## Chi-squared LR tests are implemented

## Likelihood ratio tests of Negative Binomial Models
## 
## Response: Deaths
##                                                                                  Model
## 1                     poly(Arrest_Count_lag1, 2, raw = TRUE) + offset(log(Population))
## 2 poly(Arrest_Count_lag1, 2, raw = TRUE) + factor(COVID_era) + offset(log(Population))
##      theta Resid. df    2 x log-lik.   Test    df LR stat.      Pr(Chi)
## 1 31.91664        92        -981.807                                   
## 2 64.44431        88        -928.305 1 vs 2     4 53.50201 6.690604e-11

The likelihood ratio test (LRT) shows a highly significant improvement in model fit when the COVID-era variable is added. This means: The model with factor(COVID_era) explains the data substantially better than the simpler model.

A likelihood ratio test comparing the polynomial model with and without COVID-era fixed effects indicates a significantly improved fit when including the time-period controls (χ² = 53.5, df = 4, p < 0.000000001). However, the inclusion of COVID-era indicators substantially attenuated the association between lagged arrests and overdose deaths, suggesting that temporal variation across pandemic phases may confound the arrest–mortality relationship.

#Covid era effects in GAM model

library(mgcv)

gam_model_covid <- gam(
  Deaths ~ s(Arrest_Count_lag1) + factor(COVID_era) + offset(log(Population)),
  family = nb(),
  data = Arrest_OD_month
)
summary(gam_model_covid)

## 
## Family: Negative Binomial(58.409) 
## Link function: log 
## 
## Formula:
## Deaths ~ s(Arrest_Count_lag1) + factor(COVID_era) + offset(log(Population))
## 
## Parametric coefficients:
##                            Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)              -10.534848   0.042247 -249.365  < 2e-16 ***
## factor(COVID_era)2021      0.029919   0.059659    0.502    0.616    
## factor(COVID_era)2022      0.006883   0.060023    0.115    0.909    
## factor(COVID_era)2023_24  -0.307934   0.053413   -5.765 8.16e-09 ***
## factor(COVID_era)Pre      -0.022523   0.049486   -0.455    0.649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df Chi.sq p-value
## s(Arrest_Count_lag1) 1.822  2.282  2.238   0.471
## 
## R-sq.(adj) =  0.403   Deviance explained = 46.7%
## -REML = 480.44  Scale est. = 1         n = 95

Deviance Explained: 46.7% of the deviance is explained by this model (a big jump from the 6.6% in the previous GAM without the COVID controls). This indicates much stronger model fit — the COVID-era variable captures substantial variation in deaths.
Effect of Arrests (Smooth Term): The smooth function for Arrest_Count_lag1 is not significant (Chi-sq = 2.238, p = 0.471). Interpretation: After controlling for COVID-era, the relationship between arrests and deaths no longer appears systematic or nonlinear.
Effect of Arrests (Smooth Term): The smooth function for Arrest_Count_lag1 is not significant (Chi-sq = 2.238, p = 0.471). Interpretation: After controlling for COVID-era, the relationship between arrests and deaths no longer appears systematic or nonlinear.

After adjusting for COVID-era effects, the model explains nearly half the deviance in overdose deaths. However, the previously observed association between lagged arrest counts and deaths disappears, suggesting that temporal variation across pandemic phases may better explain changes in death rates than arrest fluctuations alone

CONVICTIONS

Convictions and Overdose Deaths

library(dplyr)

conv_data <- combined_df_3_filtered %>%
  dplyr::select(Month_Date, Year, Deaths, Conv_Count, Population) %>%
  arrange(Month_Date) %>%
  mutate(Conv_Count_lag1 = lag(Conv_Count, 1))

Correlation

cor.test(conv_data$Conv_Count,
         conv_data$Deaths,
         method = "spearman")

## Warning in cor.test.default(conv_data$Conv_Count, conv_data$Deaths, method =
## "spearman"): Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  conv_data$Conv_Count and conv_data$Deaths
## S = 2323669, p-value = 4.469e-13
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4102266

Poisson Regression

poisson_conv <- glm(
  Deaths ~ Conv_Count + offset (log(Population)),
  family = poisson(link = "log"),
  data = conv_data
)

summary(poisson_conv)

## 
## Call:
## glm(formula = Deaths ~ Conv_Count + offset(log(Population)), 
##     family = poisson(link = "log"), data = conv_data)
## 
## Coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -11.370617   0.006383 -1781.34   <2e-16 ***
## Conv_Count    0.129345   0.002253    57.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 13164  on 286  degrees of freedom
## Residual deviance: 10375  on 285  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 12212
## 
## Number of Fisher Scoring iterations: 5

exp(coef(poisson_conv))

##  (Intercept)   Conv_Count 
## 1.152932e-05 1.138083e+00

# Overdispersion check
overdispersion_conv <- sum(residuals(poisson_conv, type = "pearson")^2) / df.residual(poisson_conv)
overdispersion_conv

## [1] 39.46199

Poisson Model Conviction Counts: Deaths ~ Conv_Count + offset(log(Population))

Overdispersion is extremely high = 39.46 ; so moving on to the next model.

Negative Binomial Model

library(MASS)

nb_conv <- glm.nb(
  Deaths ~ Conv_Count + offset (log(Population)),
  data = conv_data
)

summary(nb_conv)

## 
## Call:
## glm.nb(formula = Deaths ~ Conv_Count + offset(log(Population)), 
##     data = conv_data, init.theta = 3.14821001, link = log)
## 
## Coefficients:
##              Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -11.39538    0.03754 -303.587  < 2e-16 ***
## Conv_Count    0.14275    0.01843    7.744 9.61e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(3.1482) family taken to be 1)
## 
##     Null deviance: 367.57  on 286  degrees of freedom
## Residual deviance: 301.24  on 285  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 3141.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.148 
##           Std. Err.:  0.259 
## 
##  2 x log-likelihood:  -3135.869

exp(coef(nb_conv))

##  (Intercept)   Conv_Count 
## 1.124728e-05 1.153439e+00

# Overdispersion check
overdispersion_nb <- sum(residuals(nb_conv, type = "pearson")^2) / df.residual(nb_conv)
overdispersion_nb

## [1] 1.133245

Negative Binomial Model = Deaths ~ Conv_Count + offset(log(Population))

Overdispersion is acceptable at 1.133. AIC is quite high at 3141.9. After controlling for the population, an increase by 1 conviction is associated with a 15% increase in overdose deaths (p<0.0001)

Lagged Negative Binomial Model

nb_conv_lag <- glm.nb(
  Deaths ~ Conv_Count_lag1 + offset (log(Population)),
  data = conv_data
)

summary(nb_conv_lag)

## 
## Call:
## glm.nb(formula = Deaths ~ Conv_Count_lag1 + offset(log(Population)), 
##     data = conv_data, init.theta = 3.119003956, link = log)
## 
## Coefficients:
##                  Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)     -11.38857    0.03775 -301.664  < 2e-16 ***
## Conv_Count_lag1   0.13938    0.01852    7.527  5.2e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(3.119) family taken to be 1)
## 
##     Null deviance: 362.65  on 285  degrees of freedom
## Residual deviance: 300.32  on 284  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 3135.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.119 
##           Std. Err.:  0.257 
## 
##  2 x log-likelihood:  -3129.099

exp(coef(nb_conv_lag))

##     (Intercept) Conv_Count_lag1 
##    0.0000113242    1.1495656951

# Overdispersion check
overdispersion_nb2 <- sum(residuals(nb_conv_lag, type = "pearson")^2) / df.residual(nb_conv_lag)
overdispersion_nb2

## [1] 1.142208

1 month Lagged Negative Binomial Model – Deaths ~ Conv_Count_lag1 + offset(log(Population))

Overdispersion is acceptabe = 1.14

AIC is still quite high at 3135.1, but still less than the non-lagged NB model (3141.9). After controlling for the population, an increase by 1 conviction is associated with a 15% increase in overdose deaths the following month (p<0.0001)

# Confidence intervals (exp) for the original NB model
exp_confint_nb <- exp(confint(nb_conv))

## Waiting for profiling to be done...

# Confidence intervals (exp) for the lagged NB model
exp_confint_lag <- exp(confint(nb_conv_lag))

## Waiting for profiling to be done...

anova(nb_conv_lag, nb_conv, test = "Chisq")

## Likelihood ratio tests of Negative Binomial Models
## 
## Response: Deaths
##                                       Model    theta Resid. df    2 x log-lik.
## 1      Conv_Count + offset(log(Population)) 3.148210       285       -3135.869
## 2 Conv_Count_lag1 + offset(log(Population)) 3.119004       284       -3129.099
##     Test    df LR stat.     Pr(Chi)
## 1                                  
## 2 1 vs 2     1 6.769973 0.009270422

ORIGINAL MODEL

library(DHARMa)

# For original model
sim1 <- simulateResiduals(nb_conv)
plot(sim1)

## qu = 0.5, log(sigma) = -2.42894 : outer Newton did not converge fully.

LAGGED MODEL

# For lagged model
sim2 <- simulateResiduals(nb_conv_lag)
plot(sim2)

Spearman’s rank correlation showed a moderate positive relationship between convictions and overdose mortality (ρ = 0.41, p < .001).

Poisson regression was initially attempted but found to be highly overdispersed, justifying the use of a negative binomial model.

A series of negative binomial regression models were fit to examine the association between conviction counts and overdose deaths from 1999 to 2024.

In the adjusted model without a lag, convictions were significantly associated with overdose deaths, controlling for population size. This association remained statistically significant when a 1-month lag of conviction counts was introduced, suggesting that monthly changes in convictions may predict subsequent changes in overdose mortality. Model fit improved with the lagged specification.

Even though the lagged model shows marginally improved residual behavior, both models fail the KS test and show non-uniform quantile behavior, as evidenced by the red curves in the right-hand plot. This suggests: Some important structure in the data might not be captured Potentially missing covariates (e.g., COVID-era, temporal trend, etc.) Or the functional form might not be flexible enough

library(dplyr)
library(gt)

# Install broom 
# install.packages("broom")

library(broom)

# Tidy the model and exponentiate coefficients + CI
conv_nb_lag_table <- tidy(nb_conv_lag, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)

# Display the table
print(conv_nb_lag_table)

## # A tibble: 2 × 7
##   term             estimate std.error statistic  p.value  conf.low conf.high
##   <chr>               <dbl>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
## 1 (Intercept)     0.0000113    0.0378   -302.   0        0.0000105 0.0000122
## 2 Conv_Count_lag1 1.15         0.0185      7.53 5.20e-14 1.11      1.20

conv_nb_lag_table %>%
  mutate(across(estimate:conf.high, ~ round(.x, 6))) %>%
  rename(
    Term = term,
    IRR = estimate,
    `CI Lower` = conf.low,
    `CI Upper` = conf.high,
    `P-Value` = p.value
  ) %>%
  gt() %>%
  tab_header(
    title = "Incidence Rate Ratios (IRRs) from Lagged Negative Binomial Model"
  )

Term	IRR	std.error	statistic	P-Value	CI Lower	CI Upper
Incidence Rate Ratios (IRRs) from Lagged Negative Binomial Model
(Intercept)	0.000011	0.037753	-301.663816	0	0.000011	0.000012
Conv_Count_lag1	1.149566	0.018518	7.526767	0	1.107652	1.195387

Controlling for COVID

controlling for COVID-era effects using a factor variable, which allows the model to compare multiple time periods to the reference category (2023/2024).

conv_data <- conv_data %>%
  mutate(
    COVID_era = case_when(
      Year < 2020 ~ "Pre",
      Year == 2020 ~ "2020",
      Year == 2021 ~ "2021",
      Year == 2022 ~ "2022",
      TRUE ~ "2023_24"
    )
  )

nb_conv_covid_fe <- glm.nb(
  Deaths ~ Conv_Count_lag1 + offset (log(Population)) + factor(COVID_era),
  data = conv_data
)
summary(nb_conv_covid_fe)

## 
## Call:
## glm.nb(formula = Deaths ~ Conv_Count_lag1 + offset(log(Population)) + 
##     factor(COVID_era), data = conv_data, init.theta = 3.859229599, 
##     link = log)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -10.683237   0.152050 -70.261  < 2e-16 ***
## Conv_Count_lag1         0.105905   0.018605   5.692 1.25e-08 ***
## factor(COVID_era)2021  -0.008205   0.209521  -0.039    0.969    
## factor(COVID_era)2022  -0.251604   0.214235  -1.174    0.240    
## factor(COVID_era)Pre   -0.798325   0.153428  -5.203 1.96e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(3.8592) family taken to be 1)
## 
##     Null deviance: 445.24  on 285  degrees of freedom
## Residual deviance: 297.94  on 281  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 3077.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.859 
##           Std. Err.:  0.324 
## 
##  2 x log-likelihood:  -3065.562

exp(coef(nb_conv_covid_fe))

##           (Intercept)       Conv_Count_lag1 factor(COVID_era)2021 
##          2.292603e-05          1.111716e+00          9.918290e-01 
## factor(COVID_era)2022  factor(COVID_era)Pre 
##          7.775526e-01          4.500824e-01

A final model was estimated including time-period fixed effects to account for the COVID-19 pandemic and its aftermath. The results show that lagged convictions remain significantly associated with overdose deaths, even after controlling for population and temporal factors (p < .001). Each additional conviction in the prior month was associated with a 11.2% increase in overdose deaths, controlling for population and pandemic time periods.

Model fit (AIC = 3077.6) better than nb model without covid-era effects.

# Load required packages
library(broom)
library(dplyr)
library(gt)

# Step 1: Extract IRRs + Confidence Intervals
covid_nb_table <- tidy(nb_conv_covid_fe, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)

# Step 2: Format for clean presentation
covid_nb_table %>%
  mutate(across(estimate:conf.high, ~ round(.x, 6))) %>%
  rename(
    Term = term,
    IRR = estimate,
    `CI Lower` = conf.low,
    `CI Upper` = conf.high,
    `P-Value` = p.value
  ) %>%
  gt() %>%
  tab_header(
    title = "Incidence Rate Ratios (IRRs) from COVID-Adjusted Negative Binomial Model"
  )

Term	IRR	std.error	statistic	P-Value	CI Lower	CI Upper
Incidence Rate Ratios (IRRs) from COVID-Adjusted Negative Binomial Model
(Intercept)	0.000023	0.152050	-70.261273	0.000000	0.000017	0.000031
Conv_Count_lag1	1.111716	0.018605	5.692208	0.000000	1.072570	1.154094
factor(COVID_era)2021	0.991829	0.209521	-0.039159	0.968764	0.656714	1.497939
factor(COVID_era)2022	0.777553	0.214235	-1.174432	0.240222	0.510331	1.184592
factor(COVID_era)Pre	0.450082	0.153428	-5.203239	0.000000	0.329617	0.598887

# For lagged model
sim55 <- simulateResiduals(nb_conv_covid_fe)
plot(sim55)

xWhile the model passes global diagnostics (like KS and dispersion tests), there’s some non-random structure in the residuals relative to predictions, possibly hinting at: Missing covariates (e.g., unmodeled seasonality or nonlinearity), Interactions, Or insufficiently flexible functional forms (like needing a smoother or more flexible lag effect).

library(MASS)  # for glm.nb

# Fit 2nd-degree polynomial NB model
nb_model_poly_conv <- glm.nb(
  Deaths ~ poly(Conv_Count_lag1, 2, raw = TRUE) +
           factor(COVID_era) +
           offset(log(Population)),
  data = conv_data
)

# Model summary
summary(nb_model_poly_conv)

## 
## Call:
## glm.nb(formula = Deaths ~ poly(Conv_Count_lag1, 2, raw = TRUE) + 
##     factor(COVID_era) + offset(log(Population)), data = conv_data, 
##     init.theta = 3.960386459, link = log)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -10.691889   0.150234 -71.168  < 2e-16
## poly(Conv_Count_lag1, 2, raw = TRUE)1   0.224769   0.048238   4.660 3.17e-06
## poly(Conv_Count_lag1, 2, raw = TRUE)2  -0.019436   0.007265  -2.675  0.00746
## factor(COVID_era)2021                  -0.035174   0.207006  -0.170  0.86508
## factor(COVID_era)2022                  -0.263438   0.211555  -1.245  0.21304
## factor(COVID_era)Pre                   -0.819309   0.151557  -5.406 6.45e-08
##                                          
## (Intercept)                           ***
## poly(Conv_Count_lag1, 2, raw = TRUE)1 ***
## poly(Conv_Count_lag1, 2, raw = TRUE)2 ** 
## factor(COVID_era)2021                    
## factor(COVID_era)2022                    
## factor(COVID_era)Pre                  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(3.9604) family taken to be 1)
## 
##     Null deviance: 456.43  on 285  degrees of freedom
## Residual deviance: 297.72  on 280  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 3072
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  3.960 
##           Std. Err.:  0.333 
## 
##  2 x log-likelihood:  -3057.982

# Exponentiated coefficients
exp(coef(nb_model_poly_conv))

##                           (Intercept) poly(Conv_Count_lag1, 2, raw = TRUE)1 
##                          2.272855e-05                          1.252033e+00 
## poly(Conv_Count_lag1, 2, raw = TRUE)2                 factor(COVID_era)2021 
##                          9.807519e-01                          9.654376e-01 
##                 factor(COVID_era)2022                  factor(COVID_era)Pre 
##                          7.684055e-01                          4.407362e-01

library(mgcv)

# Fit GAM with smooth term on Conv_Count_lag1
gam_model_conv <- gam(
  Deaths ~ s(Conv_Count_lag1) +
           factor(COVID_era) +
           offset(log(Population)),
  family = nb(),  # negative binomial
  data = conv_data
)

# Model summary
summary(gam_model_conv)

## 
## Family: Negative Binomial(3.901) 
## Link function: log 
## 
## Formula:
## Deaths ~ s(Conv_Count_lag1) + factor(COVID_era) + offset(log(Population))
## 
## Parametric coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -10.56043    0.14942 -70.674  < 2e-16 ***
## factor(COVID_era)2021  -0.04817    0.20923  -0.230    0.818    
## factor(COVID_era)2022  -0.27724    0.21410  -1.295    0.195    
## factor(COVID_era)Pre   -0.83409    0.15369  -5.427 5.73e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df Chi.sq p-value    
## s(Conv_Count_lag1) 2.56   3.17  39.58  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.408   Deviance explained = 35.3%
## -REML = 1538.1  Scale est. = 1         n = 286

# Plot the smooth function
plot(gam_model_conv, shade = TRUE, main = "Smooth term for Convictions (lagged)")

AIC(nb_model_poly_conv, gam_model_conv)

##                          df      AIC
## nb_model_poly_conv 7.000000 3071.982
## gam_model_conv     8.169749 3072.148

sim29 <- simulateResiduals(nb_model_poly_conv)
plot(sim29)

sim59 <- simulateResiduals(gam_model_conv)
plot(sim59)

## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L = G$L,
## : Fitting terminated with step failure - check results carefully

| Model Type | Deviance Explained | AIC | Residuals (DHARMa) | | ———- | —————— | —— | ————————————————- | | Polynomial | 34.7% | 3072 | Very good (all tests non-significant) | | GAMM | 35.3% | 3072.1 | Also good, but small numerical warning in fitting |

Both a quadratic polynomial model and a generalized additive model (GAMM) fit the data comparably well, with nearly identical AIC values and deviance explained (~35%). Given the slightly better interpretability and stability, the polynomial model may be preferred, though the GAMM confirms the robustness of the association under a more flexible functional form.

While overall model diagnostics suggest a good fit, DHARMa residual checks indicate quantile deviations across the range of predicted values. This suggests some remaining structured variation not fully captured by the current model, possibly due to unmodeled heterogeneity or interactions. Future modeling could explore interaction terms or hierarchical structures to address these deviations more fully.

County-Level Analysis

library(dplyr)

Overdose %>%
  filter(Deaths == "Suppressed") %>%
  tally()

## # A tibble: 1 × 1
##       n
##   <int>
## 1    60

#total observations in dataset

library(dplyr)

Overdose %>%
  tally()

## # A tibble: 1 × 1
##       n
##   <int>
## 1   525

60 out of 525, or 11.4% of the overdose observations per month by county is suppressed.

# Standardize County names in Overdose dataframe
Overdose <- Overdose %>%
  mutate(County = gsub(" County, NJ", "", County))

# 1. Convictions by county and year
convictions_by_county_year <- Convictions %>%
  filter(!is.na(Sentence.Year)) %>%
  group_by(County, Year = Sentence.Year) %>%
  summarise(Conviction_Count = n(), .groups = "drop")

# 2. Arrests by county and year
arrests_by_county_year <- Arrest %>%
  group_by(County, Year = `Arrest Year`) %>%
  summarise(Arrest_Count = n(), .groups = "drop")


# 3. Combine into one dataset
combined_df_county <- Overdose %>%
  left_join(arrests_by_county_year, by = c("County", "Year")) %>%
  left_join(convictions_by_county_year, by = c("County", "Year")) %>%
  mutate(
   Arrest_Count = ifelse(is.na(Arrest_Count) & Year >= 2017, 0, Arrest_Count),
    Conviction_Count = ifelse(is.na(Conviction_Count), 0, Conviction_Count)
  )

#Midpoint Method (setting them all to 5 which is the median)
combined_df_county_md <- combined_df_county %>%
  mutate(
    Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths),       # Step 1: Set Suppressed to NA
    Deaths_num = as.numeric(Deaths_clean),                            # Step 2: Convert remaining to numeric
    Deaths_num = ifelse(is.na(Deaths_num), 5, Deaths_num)             # Step 3: Set NAs (formerly Suppressed) to 5
  )

#Sensitivity Analysis
##lower bound
df_lower <- combined_df_county %>%
  mutate(
    Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths),       # Step 1: Set Suppressed to NA
    Deaths_num = as.numeric(Deaths_clean),                            # Step 2: Convert remaining to numeric
    Deaths_num = ifelse(is.na(Deaths_num), 1, Deaths_num)             # Step 3: Set NAs (formerly Suppressed) to 1
  )

##upper bound
df_upper <- combined_df_county %>%
  mutate(
    Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths),       # Step 1: Set Suppressed to NA
    Deaths_num = as.numeric(Deaths_clean),                            # Step 2: Convert remaining to numeric
    Deaths_num = ifelse(is.na(Deaths_num), 9, Deaths_num)             # Step 3: Set NAs (formerly Suppressed) to 9
  )

# Load required libraries
library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(truncnorm)
library(dplyr)

# Step 1: Prepare dataset and flag suppressed deaths as NA
combined_df_county_MI <- combined_df_county %>%
  mutate(
    Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths)),
    County = as.factor(County) 
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
##   as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion

# Step 2: Define custom truncated Poisson-style imputation function
mice.impute.truncated_poisson <- function(y, ry, x, ...) {
  n_miss <- sum(!ry)
  mean_est <- 5  # fixed midpoint-based mean
  imputed <- round(rtruncnorm(n = n_miss, a = 1, b = 9, mean = mean_est, sd = 2))
  return(imputed)
}

# Step 3: Specify imputation methods
mice_methods <- make.method(combined_df_county_MI)
mice_methods["Deaths_num"] <- "truncated_poisson"

# Step 4: Build predictor matrix — only safe variables
mice_pred <- make.predictorMatrix(combined_df_county_MI)
mice_pred[,] <- 0  # turn off all predictors by default
mice_pred["Deaths_num", "Population"] <- 1  # use population to inform imputation
# Optional: include County if it's a factor and not a primary predictor
mice_pred["Deaths_num", "County"] <- 1

# Step 5: Run multiple imputation
imp <- mice(
  combined_df_county_MI,
  method = mice_methods,
  predictorMatrix = mice_pred,
  m = 5,
  seed = 123
)

## 
##  iter imp variable
##   1   1  Arrest_Count  Deaths_num
##   1   2  Arrest_Count  Deaths_num
##   1   3  Arrest_Count  Deaths_num
##   1   4  Arrest_Count  Deaths_num
##   1   5  Arrest_Count  Deaths_num
##   2   1  Arrest_Count  Deaths_num
##   2   2  Arrest_Count  Deaths_num
##   2   3  Arrest_Count  Deaths_num
##   2   4  Arrest_Count  Deaths_num
##   2   5  Arrest_Count  Deaths_num
##   3   1  Arrest_Count  Deaths_num
##   3   2  Arrest_Count  Deaths_num
##   3   3  Arrest_Count  Deaths_num
##   3   4  Arrest_Count  Deaths_num
##   3   5  Arrest_Count  Deaths_num
##   4   1  Arrest_Count  Deaths_num
##   4   2  Arrest_Count  Deaths_num
##   4   3  Arrest_Count  Deaths_num
##   4   4  Arrest_Count  Deaths_num
##   4   5  Arrest_Count  Deaths_num
##   5   1  Arrest_Count  Deaths_num
##   5   2  Arrest_Count  Deaths_num
##   5   3  Arrest_Count  Deaths_num
##   5   4  Arrest_Count  Deaths_num
##   5   5  Arrest_Count  Deaths_num

## Warning: Number of logged events: 1

# Step 6: Get completed dataset
# Option A: View all imputations (long format)
completed_data_long <- complete(imp, "long")

# Option B: Use one completed dataset (e.g., the first)
imputed_df_MI <- complete(imp, 1)

library(ggplot2)

completed_data_long %>%
  ggplot(aes(x = Deaths_num)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  facet_wrap(~.imp) +
  labs(
    title = "Distribution of Imputed Deaths_num Values",
    x = "Imputed Deaths",
    y = "Frequency"
  )

## Warning: <ggplot> %+% x was deprecated in ggplot2 4.0.0.
## ℹ Please use <ggplot> + x instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Load required libraries
library(brms)

## Loading required package: Rcpp

## Loading 'brms' package (version 2.23.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').

## 
## Attaching package: 'brms'

## The following objects are masked from 'package:mgcv':
## 
##     s, t2

## The following object is masked from 'package:glmmTMB':
## 
##     lognormal

## The following object is masked from 'package:stats':
## 
##     ar

library(dplyr)

# Step 1: Convert 'Deaths' to numeric with NAs for "Suppressed"
combined_df_county <- combined_df_county %>%
  mutate(
    Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths))
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
##   as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion

# Step 2: Subset to observed (non-missing) death counts
df_obs <- combined_df_county %>%
  filter(!is.na(Deaths_num))


#Step 3: Scale Population and Year
df_obs <- df_obs %>%
  mutate(
    Year_scaled = scale(Year),
    Pop_scaled = scale(Population),
    County = factor(County)
  )

# Step 4: Fit Bayesian Negative Binomial model (simplest first)
library(brms)

bayes_model <- brm(
  Deaths_num ~ Year_scaled + Pop_scaled,
  data = df_obs,
  family = negbinomial(),
  chains = 2,
  iter = 2000,
  seed = 123,
  control = list(adapt_delta = 0.95)
)

## Compiling Stan program...

## Trying to compile a simple C file

## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## using C compiler: ‘Apple clang version 17.0.0 (clang-1700.0.13.5)’
## using SDK: ‘MacOSX15.5.sdk’
## clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/opt/R/arm64/include    -fPIC  -falign-functions=64 -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
##   679 | #include <cmath>
##       |          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1

## Start sampling

## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 6.5e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.65 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.212 seconds (Warm-up)
## Chain 1:                0.196 seconds (Sampling)
## Chain 1:                0.408 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 2.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.28 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.208 seconds (Warm-up)
## Chain 2:                0.193 seconds (Sampling)
## Chain 2:                0.401 seconds (Total)
## Chain 2:

# Step 5: Subset missing
df_missing <- combined_df_county %>%
  filter(is.na(Deaths_num)) %>%
  mutate(
    County = factor(County, levels = levels(df_obs$County)),
    Year_scaled = scale(Year,
                        center = attr(df_obs$Year_scaled, "scaled:center"),
                        scale = attr(df_obs$Year_scaled, "scaled:scale")),
    Pop_scaled = scale(Population,
                       center = attr(df_obs$Pop_scaled, "scaled:center"),
                       scale = attr(df_obs$Pop_scaled, "scaled:scale"))
  )

# Step 6: Predict and constrain to [1, 9]
predicted_draws <- posterior_predict(bayes_model, newdata = df_missing)
set.seed(123)
imputed_values <- apply(predicted_draws, 2, function(x) sample(x, 1))
imputed_values <- pmin(pmax(imputed_values, 1), 9)

# Step 7: Insert into original data
combined_df_county_Bay1 <- combined_df_county
combined_df_county_Bay1$Deaths_num[is.na(combined_df_county_Bay1$Deaths_num)] <- imputed_values

summary(bayes_model)

##  Family: negbinomial 
##   Links: mu = log 
## Formula: Deaths_num ~ Year_scaled + Pop_scaled 
##    Data: df_obs (Number of observations: 465) 
##   Draws: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 2000
## 
## Regression Coefficients:
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       4.12      0.02     4.08     4.17 1.00     3023     1666
## Year_scaled     0.51      0.02     0.47     0.55 1.00     2556     1369
## Pop_scaled      0.51      0.03     0.46     0.56 1.00     2363     1326
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape     4.67      0.31     4.07     5.30 1.00     2414     1664
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Rhat = 1.00 for all parameters → indicates convergence.

Bulk_ESS and Tail_ESS > 1000 for all → good effective sample sizes.

No warnings → model ran smoothly.

plot(bayes_model)       # Trace + density

pp_check(bayes_model)   # Posterior predictive checks

## Using 10 posterior draws for ppc type 'dens_overlay' by default.

Matches the shape of the observed data.

Captures the overdispersion well.

Is producing plausible predictions

#Check imputation distribution for Bayesian pooling
hist(imputed_values, breaks = 9, col = "orange", main = "Imputed Deaths_num via Bayesian Model")

Issue here is that there are no 0’s but that could be ok…

# Histogram before applying pmin/pmax
hist(as.vector(apply(predicted_draws, 2, function(x) sample(x, 1))),
     main = "Posterior Samples Before Truncation",
     xlab = "Raw predicted values")

fitted_vals <- fitted(bayes_model, newdata = df_missing)
summary(fitted_vals)

##     Estimate        Est.Error           Q2.5             Q97.5       
##  Min.   : 11.14   Min.   :0.7248   Min.   :  9.776   Min.   : 12.58  
##  1st Qu.: 14.47   1st Qu.:0.8201   1st Qu.: 12.929   1st Qu.: 16.09  
##  Median : 20.52   Median :0.9837   Median : 18.657   Median : 22.41  
##  Mean   : 26.25   Mean   :1.1636   Mean   : 24.040   Mean   : 28.54  
##  3rd Qu.: 26.30   3rd Qu.:1.1268   3rd Qu.: 24.055   3rd Qu.: 28.54  
##  Max.   :130.51   Max.   :6.6528   Max.   :118.169   Max.   :144.51

# Load required libraries
library(brms)
library(dplyr)

# Step 1: Create new dataset to preserve original
combined_df_county_Bay2 <- combined_df_county %>%
  mutate(
    Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths))
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
##   as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion

# Step 2: Subset non-missing observations
df_obs_Bay2 <- combined_df_county_Bay2 %>%
  filter(!is.na(Deaths_num)) %>%
  mutate(
    Year_scaled = scale(Year),
    Pop_scaled = scale(Population),
    County = factor(County)  # Ensure County is a factor
  )

# Step 3: Fit Bayesian model with County as a random effect
bayes_model_Bay2 <- brm(
  Deaths_num ~ Year_scaled + Pop_scaled + (1 | County),
  data = df_obs_Bay2,
  family = negbinomial(),
  chains = 4,
  iter = 3000,
  seed = 123,
  control = list(adapt_delta = 0.95)
)

## Compiling Stan program...

## Trying to compile a simple C file

## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## using C compiler: ‘Apple clang version 17.0.0 (clang-1700.0.13.5)’
## using SDK: ‘MacOSX15.5.sdk’
## clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/opt/R/arm64/include    -fPIC  -falign-functions=64 -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
##   679 | #include <cmath>
##       |          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1

## Start sampling

## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000137 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.37 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 1: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 1: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 1: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 1: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 1: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 1: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 1: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 1: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 3.442 seconds (Warm-up)
## Chain 1:                2.797 seconds (Sampling)
## Chain 1:                6.239 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.6 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 2: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 2: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 2: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 2: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 2: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 2: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 2: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 2: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 3.17 seconds (Warm-up)
## Chain 2:                2.719 seconds (Sampling)
## Chain 2:                5.889 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 4.5e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.45 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 3: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 3: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 3: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 3: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 3: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 3: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 3: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 3: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 3.176 seconds (Warm-up)
## Chain 3:                2.686 seconds (Sampling)
## Chain 3:                5.862 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 4.4e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.44 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 4: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 4: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 4: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 4: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 4: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 4: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 4: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 4: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 3.215 seconds (Warm-up)
## Chain 4:                2.876 seconds (Sampling)
## Chain 4:                6.091 seconds (Total)
## Chain 4:

# Step 4: Subset missing values and apply scaling
df_missing_Bay2 <- combined_df_county_Bay2 %>%
  filter(is.na(Deaths_num)) %>%
  mutate(
    County = factor(County, levels = levels(df_obs_Bay2$County)),  # match levels
    Year_scaled = scale(Year,
                        center = attr(df_obs_Bay2$Year_scaled, "scaled:center"),
                        scale = attr(df_obs_Bay2$Year_scaled, "scaled:scale")),
    Pop_scaled = scale(Population,
                       center = attr(df_obs_Bay2$Pop_scaled, "scaled:center"),
                       scale = attr(df_obs_Bay2$Pop_scaled, "scaled:scale"))
  )

# Step 5: Posterior prediction and imputation
predicted_draws_Bay2 <- posterior_predict(bayes_model_Bay2, newdata = df_missing_Bay2)

set.seed(123)
imputed_values_Bay2 <- apply(predicted_draws_Bay2, 2, function(x) sample(x, 1))
imputed_values_Bay2 <- pmin(pmax(imputed_values_Bay2, 1), 9)  # constrain to 1–9

# Step 6: Fill in the imputed values
combined_df_county_Bay2$Deaths_num[is.na(combined_df_county_Bay2$Deaths_num)] <- imputed_values_Bay2

#Check imputation distribution for Bayesian pooling
hist(imputed_values_Bay2, breaks = 9, col = "orange", main = "Imputed Deaths_num via Bayesian Model 2")

summary(bayes_model)

##  Family: negbinomial 
##   Links: mu = log 
## Formula: Deaths_num ~ Year_scaled + Pop_scaled 
##    Data: df_obs (Number of observations: 465) 
##   Draws: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 2000
## 
## Regression Coefficients:
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       4.12      0.02     4.08     4.17 1.00     3023     1666
## Year_scaled     0.51      0.02     0.47     0.55 1.00     2556     1369
## Pop_scaled      0.51      0.03     0.46     0.56 1.00     2363     1326
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape     4.67      0.31     4.07     5.30 1.00     2414     1664
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

plot(bayes_model_Bay2)       # Trace + density

pp_check(bayes_model_Bay2)   # Posterior predictive checks

## Using 10 posterior draws for ppc type 'dens_overlay' by default.

In the provided overdose data, 60 out of 525 county-month observations (~11.4%) were labeled as “Suppressed,” indicating that the true value ranged between 1 and 9 but was withheld for privacy reasons. Since these suppressed values represent a non-trivial portion of the dataset, appropriate imputation methods were employed to estimate plausible values while preserving statistical integrity.

We explored four different imputation strategies to address the suppressed values in the Deaths variable: 1. Midpoint Imputation (Deterministic)

All suppressed death values were replaced with the midpoint of the known range:

Value used: 5

This approach is simple and transparent but does not reflect variability in the imputed values.

Output: combined_df_county_md

2. Sensitivity Analysis

To assess the bounds of potential impact, we created two alternative datasets:

Lower-bound scenario: All suppressed values set to 1

Upper-bound scenario: All suppressed values set to 9

These datasets allow for robustness checks in downstream models by comparing how extreme assumptions influence the results.

Output:

df_lower

df_upper

3. Multiple Imputation via MICE with Truncated Distribution

A more flexible approach used the mice package to perform multiple imputation with a custom imputation function drawing from a truncated normal distribution constrained to values between 1 and 9. This method incorporates uncertainty across imputations while maintaining the known bounds.

We used Population (scaled) as a predictor. County was included as an optional random effect via its factor representation, as it serves as a control variable in future analyses.

Imputation settings: 5 imputations (m = 5)

Custom imputation function using rtruncnorm(mean = 5, sd = 2, a = 1, b = 9)

Random draws ensure variability across imputations

Output:

A long-format dataset with multiple imputations (completed_data_long)

A single completed dataset (imputed_df) was extracted for use in modeling

A histogram of the imputed values showed a plausible, near-symmetric distribution centered around the midpoint, consistent with prior expectations.

4. Bayesian Imputation via Negative Binomial Models

We fit two Bayesian models using the brms package, leveraging posterior predictive sampling to estimate the suppressed values.

A. Bayesian Model 1 (No County Effects)

Formula: Deaths_num ~ Year_scaled + Pop_scaled

Family: Negative Binomial (handles overdispersion)

Random effects: None

Dataset created: combined_df_county_Bay1

Imputation:

Posterior predictive distributions were sampled for missing rows.

A single draw per observation was taken and constrained to the 1–9 range.

Model output:

Rhat = 1.00 for all parameters → convergence achieved

Effective sample sizes (ESS) > 1000

Posterior predictive checks showed good fit to observed data

B. Bayesian Model 2 (Hierarchical: Random Intercepts by County)

Formula: Deaths_num ~ Year_scaled + Pop_scaled + (1 | County)

Family: Negative Binomial

Random effects: County (partial pooling)

Dataset created: combined_df_county_Bay2

Imputation:

Same approach as Model 1 (posterior predictions + truncation), but with better fit for county-level heterogeneity.

Model output:

Converged across all 4 chains

Posterior predictive distributions captured both mean and dispersion

Histogram of imputed values showed reasonable spread (1–9), with central tendency near observed mean

Summary of Imputation Methods

Method	Variables Used	Output Dataset
Midpoint (Set all to 5)	—	`combined_df_county_md`
Sensitivity: Lower Bound (1)	—	`df_lower`
Sensitivity: Upper Bound (9)	—	`df_upper`
Multiple Imputation (MICE)	`Population`	`imputed_df`, `completed_data_long`
Bayesian Model 1	`Year`, `Population`	`combined_df_county_Bay1`
Bayesian Model 2 (with County)	`Year`, `Population`, `County (random intercept)`	`combined_df_county_Bay2`

Summary of Model Justification: To address the 11.4% of observations in our dataset with suppressed overdose death counts (reported as “Suppressed” and known to fall within the range of 1 to 9), we implemented a series of imputation strategies to ensure robustness and transparency. We began with simple imputation methods, including a midpoint substitution (setting all suppressed values to 5) and sensitivity analyses using the lower and upper bounds (1 and 9, respectively). These provided boundary checks for how assumptions about suppressed values might influence our findings.

We then applied Multiple Imputation by Chained Equations (MICE) using a truncated distribution constrained between 1 and 9. In this model, we included Population and County as predictors. Population was retained as a meaningful, independent contextual variable that reflects the expected scale of overdose events. County was added as a categorical predictor to allow the imputation process to borrow information across geographic units, thereby accounting for unobserved regional variation in overdose trends.We chose not to include Year as a predictor in this imputation step to avoid conflating temporal patterns with structural variation in death reporting practices, and to reduce the risk of overfitting imputation models based on potentially nonstationary time trends. Importantly, we excluded Conviction_Count, which is a key predictor in subsequent outcome modeling, to avoid biasing those future estimates through the imputation mechanism.

Finally, we implemented two Bayesian pooling models using the brms package. The first included Year and Population as predictors to estimate the likely distribution of suppressed counts, leveraging both temporal and demographic trends. The second model added County as a random intercept to account for potential unobserved heterogeneity across counties. These models allowed us to borrow strength from the observed data while explicitly modeling uncertainty and variability in suppressed values. Each imputation approach was stored as a separate dataset for comparison and downstream analysis. We included Year as a predictor in the Bayesian imputation models because those models are explicitly designed to capture structured trends and uncertainty in the observed data.

@Eli, do you think this is sufficient? Or should I run a Bayesian imputation model without Year and compare it as well?

County/Convictions/Overdose Descriptive Statistics

I just ran this with the second Bayesian Pooling model but can re-run it with others if you all want to see it….

library(dplyr)

combined_df_county_Bay2 %>%
  group_by(County) %>%
  summarise(Total_Overdose_Deaths = sum(Deaths_num)) %>%
  arrange(desc(Total_Overdose_Deaths))

## # A tibble: 21 × 2
##    County     Total_Overdose_Deaths
##    <chr>                      <dbl>
##  1 Essex                       4019
##  2 Camden                      3881
##  3 Ocean                       2916
##  4 Middlesex                   2792
##  5 Monmouth                    2450
##  6 Bergen                      2280
##  7 Hudson                      2111
##  8 Burlington                  2061
##  9 Atlantic                    1954
## 10 Gloucester                  1910
## # ℹ 11 more rows

library(dplyr)
library(ggplot2)

# Summarize overdose deaths by county
county_death_totals <- combined_df_county_Bay2 %>%
  group_by(County) %>%
  summarise(Total_Deaths = sum(Deaths_num, na.rm = TRUE)) %>%
  arrange(desc(Total_Deaths))

# Plot the bar graph
ggplot(county_death_totals, aes(x = reorder(County, -Total_Deaths), y = Total_Deaths)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Total Overdose Deaths by County",
    x = "County",
    y = "Total Overdose Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)
library(dplyr)

# Optional: arrange by descending conviction count
county_summary_conv <- county_summary_conv %>%
  arrange(desc(n))

# Plot bar chart
ggplot(county_summary_conv, aes(x = reorder(County, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Total Convictions by County",
    x = "County",
    y = "Number of Convictions"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visually –by comparing the two graphs – it is clear that the counties with the largest number of overdose deaths do not have the most convictions.

library(ggplot2)
library(dplyr)

# Optional: arrange by descending arr count
county_summary_arr <- county_summary_arr %>%
  arrange(desc(n))

# Plot bar chart
ggplot(county_summary_arr, aes(x = reorder(County, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Total Arrests by County",
    x = "County",
    y = "Number of Arrests"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Same comments for arrests.

Modeling Convictions and Overdoses at County level

imputed_df_MI$Deaths <- as.numeric(imputed_df_MI$Deaths)

## Warning: NAs introduced by coercion

imputed_df_MI$County <- factor(imputed_df_MI$County)

run_models <- function(data, label) {
  message("===== Running models for: ", label, " =====")
  
  # Ensure required variables are correctly formatted and drop missing values
  data <- data %>%
    mutate(
      Deaths_num = as.numeric(Deaths),
      Year = as.numeric(Year),
      County = as.factor(County),
      Population = as.numeric(Population),
      Conviction_Count = as.numeric(Conviction_Count)
    ) %>%
    filter(
      !is.na(Deaths_num),
      !is.na(Conviction_Count),
      !is.na(Population)
    )
  
  # Fit Poisson regression
  poisson_model <- glm(
    Deaths_num ~ Conviction_Count + Year + offset (log(Population)) + County,
    data = data,
    family = poisson
  )

  # Fit Negative Binomial regression
  nb_model <- MASS::glm.nb(
    Deaths_num ~ Conviction_Count + Year + offset (log(Population)) + County,
    data = data
  )
  
  # Return AIC summary and model objects
  return(list(
    summary = data.frame(
      Method = label,
      Poisson_AIC = AIC(poisson_model),
      NegBin_AIC = AIC(nb_model)
    ),
    models = list(
      poisson = poisson_model,
      negbin = nb_model
    )
  ))
}

results_md     <- run_models(combined_df_county_md, "Midpoint Imputation")

## ===== Running models for: Midpoint Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_lower  <- run_models(df_lower, "Lower Bound Imputation")

## ===== Running models for: Lower Bound Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_upper  <- run_models(df_upper, "Upper Bound Imputation")

## ===== Running models for: Upper Bound Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_mi     <- run_models(imputed_df_MI, "Multiple Imputation")

## ===== Running models for: Multiple Imputation =====

results_bayes1 <- run_models(combined_df_county_Bay1, "Bayesian Imputation 1")

## ===== Running models for: Bayesian Imputation 1 =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_bayes2 <- run_models(combined_df_county_Bay2, "Bayesian Imputation 2")

## ===== Running models for: Bayesian Imputation 2 =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries <- bind_rows(
  results_md$summary,
  results_lower$summary,
  results_upper$summary,
  results_mi$summary,
  results_bayes1$summary,
  results_bayes2$summary
)

# Print as a markdown table
kable(model_summaries, caption = "Model Fit Comparison: AIC Values")

Model Fit Comparison: AIC Values
Method	Poisson_AIC	NegBin_AIC
Midpoint Imputation	5554.32	4028.737
Lower Bound Imputation	5554.32	4028.737
Upper Bound Imputation	5554.32	4028.737
Multiple Imputation	5554.32	4028.737
Bayesian Imputation 1	5554.32	4028.737
Bayesian Imputation 2	5554.32	4028.737

The table summarizes AIC values for Poisson and Negative Binomial (NB) models across six different imputation methods for suppressed overdose death counts. Across all imputation strategies, the NB models consistently demonstrate lower AIC values compared to the Poisson models, indicating a better fit — particularly important given the overdispersion typically present in count data like overdose deaths. The best-fitting model (lowest AIC) was observed for: Multiple Imputation (MICE) Bayesian Imputation 1 Bayesian Imputation 2

All three had identical AICs for both Poisson and NB models (NB AIC = 4018.58).

library(dplyr)
# Helper function to extract and label model summaries
extract_model_summary <- function(model, method_label) {
  coef_summary <- summary(model)$coefficients
  as.data.frame(coef_summary) %>%
    tibble::rownames_to_column("Term") %>%
    mutate(Method = method_label)
}

# Extract coefficient summaries from each NegBin model
coef_mi     <- extract_model_summary(results_mi$models$negbin, "Multiple Imputation")
coef_bayes1 <- extract_model_summary(results_bayes1$models$negbin, "Bayesian Imputation 1")
coef_bayes2 <- extract_model_summary(results_bayes2$models$negbin, "Bayesian Imputation 2")

# Combine all into one table
combined_coefs <- bind_rows(coef_mi, coef_bayes1, coef_bayes2)

# Optional: re-order for readability
combined_coefs <- combined_coefs %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)

filtered_coefs <- combined_coefs %>%
  filter(Term %in% c("Conviction_Count", "Year", "Population")) %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)


# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")

Negative Binomial Model Coefficients: Top Imputation Methods
Method	Term	Estimate	Std. Error	z value	Pr(>\|z\|)
Multiple Imputation	Conviction_Count	0.011	0.008	1.436	0.151
Multiple Imputation	Year	0.074	0.002	34.196	0.000
Bayesian Imputation 1	Conviction_Count	0.011	0.008	1.436	0.151
Bayesian Imputation 1	Year	0.074	0.002	34.196	0.000
Bayesian Imputation 2	Conviction_Count	0.011	0.008	1.436	0.151
Bayesian Imputation 2	Year	0.074	0.002	34.196	0.000

combined_coefs %>%
  filter(grepl("^County", Term)) %>%
  summarize(
    avg_effect = mean(Estimate),
    max_effect = max(Estimate),
    min_effect = min(Estimate)
  )

##   avg_effect max_effect min_effect
## 1 -0.4933146 0.09277219  -1.003417

Across all three imputation strategies:

Conviction_Count was not statistically significant (p ≈ 0.299), suggesting no strong evidence that convictions are associated with monthly overdose deaths at the county level after controlling for time and population.

#Testing Colinearity of Population and County
summary(lm(Population ~ County, data = combined_df_county_Bay1))

## 
## Call:
## lm(formula = Population ~ County, data = combined_df_county_Bay1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74724  -5586    186   4683  80466 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        268753       3504  76.708  < 2e-16 ***
## CountyBergen       646467       4955 130.472  < 2e-16 ***
## CountyBurlington   178114       4955  35.947  < 2e-16 ***
## CountyCamden       243642       4955  49.172  < 2e-16 ***
## CountyCape May    -171674       4955 -34.648  < 2e-16 ***
## CountyCumberland  -116336       4955 -23.479  < 2e-16 ***
## CountyEssex        529876       4955 106.941  < 2e-16 ***
## CountyGloucester    15276       4955   3.083  0.00216 ** 
## CountyHudson       379152       4955  76.522  < 2e-16 ***
## CountyHunterdon   -142180       4955 -28.695  < 2e-16 ***
## CountyMercer        97651       4955  19.708  < 2e-16 ***
## CountyMiddlesex    541174       4955 109.221  < 2e-16 ***
## CountyMonmouth     358691       4955  72.392  < 2e-16 ***
## CountyMorris       222895       4955  44.985  < 2e-16 ***
## CountyOcean        309978       4955  62.561  < 2e-16 ***
## CountyPassaic      232437       4955  46.911  < 2e-16 ***
## CountySalem       -204085       4955 -41.189  < 2e-16 ***
## CountySomerset      54227       4955  10.944  < 2e-16 ***
## CountySussex      -122602       4955 -24.744  < 2e-16 ***
## CountyUnion        273983       4955  55.296  < 2e-16 ***
## CountyWarren      -161313       4955 -32.557  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17520 on 504 degrees of freedom
## Multiple R-squared:  0.9953, Adjusted R-squared:  0.9951 
## F-statistic:  5342 on 20 and 504 DF,  p-value: < 2.2e-16

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

# Exclude County or reduce levels if needed
model <- glm(Deaths_num ~ County + offset (log(Population)), data = combined_df_county_Bay1, family = poisson)

I am going to run the models without Population next to avoid multicollinearity. (In the Bayesian imputation models — including one with County as a random intercept — we retained Population as a control for exposure size.)

run_models_2 <- function(data, label) {
  message("===== Running models for: ", label, " =====")
  
  # Ensure required variables are correctly formatted and drop missing values
  data <- data %>%
    mutate(
      Deaths_num = as.numeric(Deaths),
      Year = as.numeric(Year),
      County = as.factor(County),
      Conviction_Count = as.numeric(Conviction_Count)
    ) %>%
    filter(
      !is.na(Deaths_num),
      !is.na(Conviction_Count)
    )
  
  # Fit Poisson regression
  poisson_model_2 <- glm(
    Deaths_num ~ Conviction_Count + Year + County,
    data = data,
    family = poisson
  )

  # Fit Negative Binomial regression
  nb_model_2 <- MASS::glm.nb(
    Deaths_num ~ Conviction_Count + Year + County,
    data = data
  )
  
  # Return AIC summary and model objects
  return(list(
    summary = data.frame(
      Method = label,
      Poisson_AIC_2 = AIC(poisson_model_2),
      NegBin_AIC_2 = AIC(nb_model_2)
    ),
    models = list(
      poisson = poisson_model_2,
      negbin = nb_model_2
    )
  ))
}

results_md2     <- run_models_2(combined_df_county_md, "Midpoint Imputation")

## ===== Running models for: Midpoint Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_lower2  <- run_models_2(df_lower, "Lower Bound Imputation")

## ===== Running models for: Lower Bound Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_upper2  <- run_models_2(df_upper, "Upper Bound Imputation")

## ===== Running models for: Upper Bound Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_mi2     <- run_models_2(imputed_df_MI, "Multiple Imputation")

## ===== Running models for: Multiple Imputation =====

results_bayes3 <- run_models_2(combined_df_county_Bay1, "Bayesian Imputation 1")

## ===== Running models for: Bayesian Imputation 1 =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_bayes4 <- run_models_2(combined_df_county_Bay2, "Bayesian Imputation 2")

## ===== Running models for: Bayesian Imputation 2 =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries_2 <- bind_rows(
  results_md2$summary,
  results_lower2$summary,
  results_upper2$summary,
  results_mi2$summary,
  results_bayes3$summary,
  results_bayes4$summary
)

# Print as a markdown table
kable(model_summaries_2, caption = "Model Fit Comparison: AIC Values")

Model Fit Comparison: AIC Values
Method	Poisson_AIC_2	NegBin_AIC_2
Midpoint Imputation	5477.669	4018.365
Lower Bound Imputation	5477.669	4018.365
Upper Bound Imputation	5477.669	4018.365
Multiple Imputation	5477.669	4018.365
Bayesian Imputation 1	5477.669	4018.365
Bayesian Imputation 2	5477.669	4018.365

library(dplyr)

# Extract coefficient summaries from each NegBin model
coef_mi2     <- extract_model_summary(results_mi2$models$negbin, "Multiple Imputation")
coef_bayes3 <- extract_model_summary(results_bayes3$models$negbin, "Bayesian Imputation 1")
coef_bayes4 <- extract_model_summary(results_bayes4$models$negbin, "Bayesian Imputation 2")

# Combine all into one table
combined_coefs2 <- bind_rows(coef_mi2, coef_bayes3, coef_bayes4)

# Optional: re-order for readability
combined_coefs2 <- combined_coefs2 %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)

filtered_coefs2 <- combined_coefs2 %>%
  filter(Term %in% c("Conviction_Count", "Year")) %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)


# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")

Negative Binomial Model Coefficients: Top Imputation Methods
Method	Term	Estimate	Std. Error	z value	Pr(>\|z\|)
Multiple Imputation	Conviction_Count	0.010	0.008	1.237	0.216
Multiple Imputation	Year	0.077	0.002	36.129	0.000
Bayesian Imputation 1	Conviction_Count	0.010	0.008	1.237	0.216
Bayesian Imputation 1	Year	0.077	0.002	36.129	0.000
Bayesian Imputation 2	Conviction_Count	0.010	0.008	1.237	0.216
Bayesian Imputation 2	Year	0.077	0.002	36.129	0.000

The coefficient on Conviction_Count is not statistically significant in any of the top 3 models (p = 0.216). The model fit did not worsen (same AICs), and coefficients stayed nearly identical – after removing Population

Finally, I am going to try treating County and Year as factors.

run_models_3 <- function(data, label) {
  message("===== Running models for: ", label, " =====")
  
  # Ensure required variables are correctly formatted and drop missing values
  data <- data %>%
    mutate(
      Deaths_num = as.numeric(Deaths),
      Year = as.numeric(Year),
      County = as.factor(County),
      Conviction_Count = as.numeric(Conviction_Count)
    ) %>%
    filter(
      !is.na(Deaths_num),
      !is.na(Conviction_Count)
    )
  
  # Fit Poisson regression
  poisson_model_3 <- glm(
    Deaths_num ~ Conviction_Count + factor(Year) + factor(County),
    data = data,
    family = poisson
  )

  # Fit Negative Binomial regression
  nb_model_3 <- MASS::glm.nb(
    Deaths_num ~ Conviction_Count + factor(Year) + factor(County),
    data = data
  )
  
  # Return AIC summary and model objects
  return(list(
    summary = data.frame(
      Method = label,
      Poisson_AIC_2 = AIC(poisson_model_3),
      NegBin_AIC_2 = AIC(nb_model_3)
    ),
    models = list(
      poisson = poisson_model_3,
      negbin = nb_model_3
    )
  ))
}

results_md3     <- run_models_3(combined_df_county_md, "Midpoint Imputation")

## ===== Running models for: Midpoint Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_lower3  <- run_models_3(df_lower, "Lower Bound Imputation")

## ===== Running models for: Lower Bound Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_upper3  <- run_models_3(df_upper, "Upper Bound Imputation")

## ===== Running models for: Upper Bound Imputation =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_mi3     <- run_models_3(imputed_df_MI, "Multiple Imputation")

## ===== Running models for: Multiple Imputation =====

results_bayes5 <- run_models_3(combined_df_county_Bay1, "Bayesian Imputation 1")

## ===== Running models for: Bayesian Imputation 1 =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

results_bayes6 <- run_models_3(combined_df_county_Bay2, "Bayesian Imputation 2")

## ===== Running models for: Bayesian Imputation 2 =====

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion

library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries_4 <- bind_rows(
  results_md3$summary,
  results_lower3$summary,
  results_upper3$summary,
  results_mi3$summary,
  results_bayes5$summary,
  results_bayes6$summary
)

# Print as a markdown table
kable(model_summaries_4, caption = "Model Fit Comparison: AIC Values")

Model Fit Comparison: AIC Values
Method	Poisson_AIC_2	NegBin_AIC_2
Midpoint Imputation	4073.933	3734.332
Lower Bound Imputation	4073.933	3734.332
Upper Bound Imputation	4073.933	3734.332
Multiple Imputation	4073.933	3734.332
Bayesian Imputation 1	4073.933	3734.332
Bayesian Imputation 2	4073.933	3734.332

library(dplyr)

# Extract coefficient summaries from each NegBin model
coef_mi3     <- extract_model_summary(results_mi3$models$negbin, "Multiple Imputation")
coef_bayes5 <- extract_model_summary(results_bayes5$models$negbin, "Bayesian Imputation 1")
coef_bayes6 <- extract_model_summary(results_bayes6$models$negbin, "Bayesian Imputation 2")

# Combine all into one table
combined_coefs2 <- bind_rows(coef_mi3, coef_bayes5, coef_bayes6)

# Optional: re-order for readability
combined_coefs2 <- combined_coefs2 %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)

filtered_coefs2 <- combined_coefs2 %>%
  filter(Term %in% c("Conviction_Count", "Year")) %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)


# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")

Negative Binomial Model Coefficients: Top Imputation Methods
Method	Term	Estimate	Std. Error	z value	Pr(>\|z\|)
Multiple Imputation	Conviction_Count	-0.002	0.005	-0.446	0.656
Bayesian Imputation 1	Conviction_Count	-0.002	0.005	-0.446	0.656
Bayesian Imputation 2	Conviction_Count	-0.002	0.005	-0.446	0.656

knitr::kable(combined_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")

Negative Binomial Model Coefficients: Top Imputation Methods
Method	Term	Estimate	Std. Error	z value	Pr(>\|z\|)
Multiple Imputation	(Intercept)	3.411	0.073	46.731	0.000
Multiple Imputation	Conviction_Count	-0.002	0.005	-0.446	0.656
Multiple Imputation	factor(Year)2000	0.145	0.084	1.719	0.086
Multiple Imputation	factor(Year)2001	0.204	0.083	2.475	0.013
Multiple Imputation	factor(Year)2002	0.362	0.081	4.454	0.000
Multiple Imputation	factor(Year)2003	0.240	0.082	2.931	0.003
Multiple Imputation	factor(Year)2004	0.031	0.084	0.363	0.717
Multiple Imputation	factor(Year)2005	0.346	0.080	4.311	0.000
Multiple Imputation	factor(Year)2006	0.379	0.080	4.731	0.000
Multiple Imputation	factor(Year)2007	0.236	0.082	2.867	0.004
Multiple Imputation	factor(Year)2008	0.231	0.083	2.794	0.005
Multiple Imputation	factor(Year)2009	-0.359	0.114	-3.153	0.002
Multiple Imputation	factor(Year)2010	0.437	0.080	5.456	0.000
Multiple Imputation	factor(Year)2011	0.568	0.078	7.255	0.000
Multiple Imputation	factor(Year)2012	0.747	0.077	9.683	0.000
Multiple Imputation	factor(Year)2013	0.818	0.077	10.614	0.000
Multiple Imputation	factor(Year)2014	0.774	0.077	10.080	0.000
Multiple Imputation	factor(Year)2015	0.940	0.076	12.382	0.000
Multiple Imputation	factor(Year)2016	1.276	0.075	17.115	0.000
Multiple Imputation	factor(Year)2017	1.545	0.074	20.940	0.000
Multiple Imputation	factor(Year)2018	1.632	0.073	22.220	0.000
Multiple Imputation	factor(Year)2019	1.585	0.074	21.356	0.000
Multiple Imputation	factor(Year)2020	1.605	0.074	21.814	0.000
Multiple Imputation	factor(Year)2021	1.663	0.074	22.586	0.000
Multiple Imputation	factor(Year)2022	1.627	0.075	21.808	0.000
Multiple Imputation	factor(Year)2023	1.514	0.075	20.266	0.000
Multiple Imputation	factor(County)Bergen	0.226	0.058	3.911	0.000
Multiple Imputation	factor(County)Burlington	0.071	0.057	1.244	0.213
Multiple Imputation	factor(County)Camden	0.718	0.056	12.886	0.000
Multiple Imputation	factor(County)Cape May	-1.041	0.071	-14.590	0.000
Multiple Imputation	factor(County)Cumberland	-0.775	0.067	-11.645	0.000
Multiple Imputation	factor(County)Essex	0.733	0.056	13.046	0.000
Multiple Imputation	factor(County)Gloucester	0.011	0.058	0.187	0.852
Multiple Imputation	factor(County)Hudson	0.098	0.059	1.675	0.094
Multiple Imputation	factor(County)Hunterdon	-1.877	0.096	-19.524	0.000
Multiple Imputation	factor(County)Mercer	-0.377	0.061	-6.166	0.000
Multiple Imputation	factor(County)Middlesex	0.371	0.056	6.619	0.000
Multiple Imputation	factor(County)Monmouth	0.236	0.057	4.132	0.000
Multiple Imputation	factor(County)Morris	-0.357	0.060	-5.900	0.000
Multiple Imputation	factor(County)Ocean	0.399	0.056	7.063	0.000
Multiple Imputation	factor(County)Passaic	-0.062	0.059	-1.060	0.289
Multiple Imputation	factor(County)Salem	-1.488	0.082	-18.202	0.000
Multiple Imputation	factor(County)Somerset	-0.882	0.066	-13.350	0.000
Multiple Imputation	factor(County)Sussex	-1.205	0.072	-16.727	0.000
Multiple Imputation	factor(County)Union	-0.167	0.060	-2.798	0.005
Multiple Imputation	factor(County)Warren	-1.438	0.080	-17.958	0.000
Bayesian Imputation 1	(Intercept)	3.411	0.073	46.731	0.000
Bayesian Imputation 1	Conviction_Count	-0.002	0.005	-0.446	0.656
Bayesian Imputation 1	factor(Year)2000	0.145	0.084	1.719	0.086
Bayesian Imputation 1	factor(Year)2001	0.204	0.083	2.475	0.013
Bayesian Imputation 1	factor(Year)2002	0.362	0.081	4.454	0.000
Bayesian Imputation 1	factor(Year)2003	0.240	0.082	2.931	0.003
Bayesian Imputation 1	factor(Year)2004	0.031	0.084	0.363	0.717
Bayesian Imputation 1	factor(Year)2005	0.346	0.080	4.311	0.000
Bayesian Imputation 1	factor(Year)2006	0.379	0.080	4.731	0.000
Bayesian Imputation 1	factor(Year)2007	0.236	0.082	2.867	0.004
Bayesian Imputation 1	factor(Year)2008	0.231	0.083	2.794	0.005
Bayesian Imputation 1	factor(Year)2009	-0.359	0.114	-3.153	0.002
Bayesian Imputation 1	factor(Year)2010	0.437	0.080	5.456	0.000
Bayesian Imputation 1	factor(Year)2011	0.568	0.078	7.255	0.000
Bayesian Imputation 1	factor(Year)2012	0.747	0.077	9.683	0.000
Bayesian Imputation 1	factor(Year)2013	0.818	0.077	10.614	0.000
Bayesian Imputation 1	factor(Year)2014	0.774	0.077	10.080	0.000
Bayesian Imputation 1	factor(Year)2015	0.940	0.076	12.382	0.000
Bayesian Imputation 1	factor(Year)2016	1.276	0.075	17.115	0.000
Bayesian Imputation 1	factor(Year)2017	1.545	0.074	20.940	0.000
Bayesian Imputation 1	factor(Year)2018	1.632	0.073	22.220	0.000
Bayesian Imputation 1	factor(Year)2019	1.585	0.074	21.356	0.000
Bayesian Imputation 1	factor(Year)2020	1.605	0.074	21.814	0.000
Bayesian Imputation 1	factor(Year)2021	1.663	0.074	22.586	0.000
Bayesian Imputation 1	factor(Year)2022	1.627	0.075	21.808	0.000
Bayesian Imputation 1	factor(Year)2023	1.514	0.075	20.266	0.000
Bayesian Imputation 1	factor(County)Bergen	0.226	0.058	3.911	0.000
Bayesian Imputation 1	factor(County)Burlington	0.071	0.057	1.244	0.213
Bayesian Imputation 1	factor(County)Camden	0.718	0.056	12.886	0.000
Bayesian Imputation 1	factor(County)Cape May	-1.041	0.071	-14.590	0.000
Bayesian Imputation 1	factor(County)Cumberland	-0.775	0.067	-11.645	0.000
Bayesian Imputation 1	factor(County)Essex	0.733	0.056	13.046	0.000
Bayesian Imputation 1	factor(County)Gloucester	0.011	0.058	0.187	0.852
Bayesian Imputation 1	factor(County)Hudson	0.098	0.059	1.675	0.094
Bayesian Imputation 1	factor(County)Hunterdon	-1.877	0.096	-19.524	0.000
Bayesian Imputation 1	factor(County)Mercer	-0.377	0.061	-6.166	0.000
Bayesian Imputation 1	factor(County)Middlesex	0.371	0.056	6.619	0.000
Bayesian Imputation 1	factor(County)Monmouth	0.236	0.057	4.132	0.000
Bayesian Imputation 1	factor(County)Morris	-0.357	0.060	-5.900	0.000
Bayesian Imputation 1	factor(County)Ocean	0.399	0.056	7.063	0.000
Bayesian Imputation 1	factor(County)Passaic	-0.062	0.059	-1.060	0.289
Bayesian Imputation 1	factor(County)Salem	-1.488	0.082	-18.202	0.000
Bayesian Imputation 1	factor(County)Somerset	-0.882	0.066	-13.350	0.000
Bayesian Imputation 1	factor(County)Sussex	-1.205	0.072	-16.727	0.000
Bayesian Imputation 1	factor(County)Union	-0.167	0.060	-2.798	0.005
Bayesian Imputation 1	factor(County)Warren	-1.438	0.080	-17.958	0.000
Bayesian Imputation 2	(Intercept)	3.411	0.073	46.731	0.000
Bayesian Imputation 2	Conviction_Count	-0.002	0.005	-0.446	0.656
Bayesian Imputation 2	factor(Year)2000	0.145	0.084	1.719	0.086
Bayesian Imputation 2	factor(Year)2001	0.204	0.083	2.475	0.013
Bayesian Imputation 2	factor(Year)2002	0.362	0.081	4.454	0.000
Bayesian Imputation 2	factor(Year)2003	0.240	0.082	2.931	0.003
Bayesian Imputation 2	factor(Year)2004	0.031	0.084	0.363	0.717
Bayesian Imputation 2	factor(Year)2005	0.346	0.080	4.311	0.000
Bayesian Imputation 2	factor(Year)2006	0.379	0.080	4.731	0.000
Bayesian Imputation 2	factor(Year)2007	0.236	0.082	2.867	0.004
Bayesian Imputation 2	factor(Year)2008	0.231	0.083	2.794	0.005
Bayesian Imputation 2	factor(Year)2009	-0.359	0.114	-3.153	0.002
Bayesian Imputation 2	factor(Year)2010	0.437	0.080	5.456	0.000
Bayesian Imputation 2	factor(Year)2011	0.568	0.078	7.255	0.000
Bayesian Imputation 2	factor(Year)2012	0.747	0.077	9.683	0.000
Bayesian Imputation 2	factor(Year)2013	0.818	0.077	10.614	0.000
Bayesian Imputation 2	factor(Year)2014	0.774	0.077	10.080	0.000
Bayesian Imputation 2	factor(Year)2015	0.940	0.076	12.382	0.000
Bayesian Imputation 2	factor(Year)2016	1.276	0.075	17.115	0.000
Bayesian Imputation 2	factor(Year)2017	1.545	0.074	20.940	0.000
Bayesian Imputation 2	factor(Year)2018	1.632	0.073	22.220	0.000
Bayesian Imputation 2	factor(Year)2019	1.585	0.074	21.356	0.000
Bayesian Imputation 2	factor(Year)2020	1.605	0.074	21.814	0.000
Bayesian Imputation 2	factor(Year)2021	1.663	0.074	22.586	0.000
Bayesian Imputation 2	factor(Year)2022	1.627	0.075	21.808	0.000
Bayesian Imputation 2	factor(Year)2023	1.514	0.075	20.266	0.000
Bayesian Imputation 2	factor(County)Bergen	0.226	0.058	3.911	0.000
Bayesian Imputation 2	factor(County)Burlington	0.071	0.057	1.244	0.213
Bayesian Imputation 2	factor(County)Camden	0.718	0.056	12.886	0.000
Bayesian Imputation 2	factor(County)Cape May	-1.041	0.071	-14.590	0.000
Bayesian Imputation 2	factor(County)Cumberland	-0.775	0.067	-11.645	0.000
Bayesian Imputation 2	factor(County)Essex	0.733	0.056	13.046	0.000
Bayesian Imputation 2	factor(County)Gloucester	0.011	0.058	0.187	0.852
Bayesian Imputation 2	factor(County)Hudson	0.098	0.059	1.675	0.094
Bayesian Imputation 2	factor(County)Hunterdon	-1.877	0.096	-19.524	0.000
Bayesian Imputation 2	factor(County)Mercer	-0.377	0.061	-6.166	0.000
Bayesian Imputation 2	factor(County)Middlesex	0.371	0.056	6.619	0.000
Bayesian Imputation 2	factor(County)Monmouth	0.236	0.057	4.132	0.000
Bayesian Imputation 2	factor(County)Morris	-0.357	0.060	-5.900	0.000
Bayesian Imputation 2	factor(County)Ocean	0.399	0.056	7.063	0.000
Bayesian Imputation 2	factor(County)Passaic	-0.062	0.059	-1.060	0.289
Bayesian Imputation 2	factor(County)Salem	-1.488	0.082	-18.202	0.000
Bayesian Imputation 2	factor(County)Somerset	-0.882	0.066	-13.350	0.000
Bayesian Imputation 2	factor(County)Sussex	-1.205	0.072	-16.727	0.000
Bayesian Imputation 2	factor(County)Union	-0.167	0.060	-2.798	0.005
Bayesian Imputation 2	factor(County)Warren	-1.438	0.080	-17.958	0.000

The Negative Binomial model with County and Year treated as factors are the best model fits, but Conviction_Count is still not statistically significant. Now let’s do a time lag.

library(dplyr)

# Create 1-year lagged variables by County
combined_df_county_lagged <- combined_df_county %>%
  arrange(County, Year) %>%
  group_by(County) %>%
  mutate(
    Conviction_Count_lag1 = lag(Conviction_Count, 1),
    Arrest_Count_lag1 = lag(Arrest_Count, 1)
  ) %>%
  ungroup()

# Load required package
library(dplyr)

# Define function to add lagged conviction count
lag_and_return <- function(df) {
  df %>%
    arrange(County, Year) %>%
    group_by(County) %>%
    mutate(
      Conviction_Count_lag1 = lag(Conviction_Count, 1)
    ) %>%
    ungroup()
}


# Apply function to each dataset
combined_df_county_md_lag     <- lag_and_return(combined_df_county_md)
df_lower_lag                  <- lag_and_return(df_lower)
df_upper_lag                  <- lag_and_return(df_upper)
imputed_df_MI_lag             <- lag_and_return(imputed_df_MI)
combined_df_county_Bay1_lag   <- lag_and_return(combined_df_county_Bay1)
combined_df_county_Bay2_lag   <- lag_and_return(combined_df_county_Bay2)

library(MASS)
library(dplyr)

run_lagged_models <- function(data, label) {
  cat("\n=== Running Models for:", label, "===\n")
  
 
  
  # Fit Poisson model
  poisson_model <- glm(
    Deaths_num ~ Conviction_Count_lag1 + County + Year,
    family = poisson,
    data = data
  )
  
  # Calculate dispersion
  dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
  cat("Overdispersion ratio:", round(dispersion, 2), "\n")
  
  # Fit Negative Binomial model
  nb_model <- glm.nb(
    Deaths_num ~ Conviction_Count_lag1 + County + Year,
    data = data
  )
  
  return(list(
    label = label,
    poisson_model = poisson_model,
    nb_model = nb_model,
    poisson_summary = summary(poisson_model),
    nb_summary = summary(nb_model),
    poisson_AIC = AIC(poisson_model),
    nb_AIC = AIC(nb_model),
    dispersion = dispersion
  ))
}

# Run models
results_md_lag     <- run_lagged_models(combined_df_county_md_lag,     "Midpoint Imputation")

## 
## === Running Models for: Midpoint Imputation ===
## Overdispersion ratio: 6.77

results_lower_lag  <- run_lagged_models(df_lower_lag,                  "Lower Bound Imputation")

## 
## === Running Models for: Lower Bound Imputation ===
## Overdispersion ratio: 7.48

results_upper_lag  <- run_lagged_models(df_upper_lag,                  "Upper Bound Imputation")

## 
## === Running Models for: Upper Bound Imputation ===
## Overdispersion ratio: 6.45

results_mi_lag     <- run_lagged_models(imputed_df_MI_lag,             "Multiple Imputation")

## 
## === Running Models for: Multiple Imputation ===
## Overdispersion ratio: 6.9

results_bayes1_lag <- run_lagged_models(combined_df_county_Bay1_lag,   "Bayesian Imputation 1")

## 
## === Running Models for: Bayesian Imputation 1 ===
## Overdispersion ratio: 6.47

results_bayes2_lag <- run_lagged_models(combined_df_county_Bay2_lag,   "Bayesian Imputation 2")

## 
## === Running Models for: Bayesian Imputation 2 ===
## Overdispersion ratio: 6.45

library(knitr)

lagged_model_summary <- bind_rows(
  data.frame(Method = results_md_lag$label,     Poisson_AIC = results_md_lag$poisson_AIC,     NegBin_AIC = results_md_lag$nb_AIC),
  data.frame(Method = results_lower_lag$label,  Poisson_AIC = results_lower_lag$poisson_AIC,  NegBin_AIC = results_lower_lag$nb_AIC),
  data.frame(Method = results_upper_lag$label,  Poisson_AIC = results_upper_lag$poisson_AIC,  NegBin_AIC = results_upper_lag$nb_AIC),
  data.frame(Method = results_mi_lag$label,     Poisson_AIC = results_mi_lag$poisson_AIC,     NegBin_AIC = results_mi_lag$nb_AIC),
  data.frame(Method = results_bayes1_lag$label, Poisson_AIC = results_bayes1_lag$poisson_AIC, NegBin_AIC = results_bayes1_lag$nb_AIC),
  data.frame(Method = results_bayes2_lag$label, Poisson_AIC = results_bayes2_lag$poisson_AIC, NegBin_AIC = results_bayes2_lag$nb_AIC)
)

kable(lagged_model_summary, caption = "Model AICs with Lagged Conviction Count (1-Year Lag)")

Model AICs with Lagged Conviction Count (1-Year Lag)
Method	Poisson_AIC	NegBin_AIC
Midpoint Imputation	6392.913	4382.249
Lower Bound Imputation	6896.009	4568.021
Upper Bound Imputation	6193.911	4329.119
Multiple Imputation	6501.502	4430.688
Bayesian Imputation 1	6202.982	4332.581
Bayesian Imputation 2	6183.374	4321.602

library(dplyr)
library(knitr)
library(broom)

# function to extract coefficients from a model
extract_lagged_coef2 <- function(model, method_label) {
  tidy(model) %>%
    filter(term == "Conviction_Count_lag1") %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

# Helper function to extract coefficients from a model
extract_lagged_coef <- function(model, method_label) {
  tidy(model) %>%
    filter(term == "Conviction_Count_lag1") %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

# Apply to all models
coef_md_lag     <- extract_lagged_coef(results_md_lag$nb_model,     "Midpoint Imputation")
coef_lower_lag  <- extract_lagged_coef(results_lower_lag$nb_model,  "Lower Bound Imputation")
coef_upper_lag  <- extract_lagged_coef(results_upper_lag$nb_model,  "Upper Bound Imputation")
coef_mi_lag     <- extract_lagged_coef(results_mi_lag$nb_model,     "Multiple Imputation")
coef_bayes1_lag <- extract_lagged_coef(results_bayes1_lag$nb_model, "Bayesian Imputation 1")
coef_bayes2_lag <- extract_lagged_coef(results_bayes2_lag$nb_model, "Bayesian Imputation 2")

# Combine all into one table
combined_lagged_coefs <- bind_rows(
  coef_md_lag,
  coef_lower_lag,
  coef_upper_lag,
  coef_mi_lag,
  coef_bayes1_lag,
  coef_bayes2_lag
)

# Pretty print
kable(combined_lagged_coefs, digits = 3, caption = "Effect of 1-Year Lagged Conviction Count on Overdose Deaths (Negative Binomial Models)")

Effect of 1-Year Lagged Conviction Count on Overdose Deaths (Negative Binomial Models)
Method	term	estimate	std.error	statistic	p.value
Midpoint Imputation	Conviction_Count_lag1	0.009	0.011	0.814	0.415
Lower Bound Imputation	Conviction_Count_lag1	0.007	0.014	0.501	0.616
Upper Bound Imputation	Conviction_Count_lag1	0.010	0.010	1.018	0.309
Multiple Imputation	Conviction_Count_lag1	0.010	0.011	0.884	0.377
Bayesian Imputation 1	Conviction_Count_lag1	0.010	0.010	1.017	0.309
Bayesian Imputation 2	Conviction_Count_lag1	0.009	0.010	0.915	0.360

library(dplyr)
library(knitr)
library(broom)

# Helper function to extract all coefficients from a model and label them
extract_all_coefs <- function(model, method_label) {
  tidy(model) %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

# Apply to all lagged Negative Binomial models
coef_md_lag_all     <- extract_all_coefs(results_md_lag$nb_model,     "Midpoint Imputation")
coef_lower_lag_all  <- extract_all_coefs(results_lower_lag$nb_model,  "Lower Bound Imputation")
coef_upper_lag_all  <- extract_all_coefs(results_upper_lag$nb_model,  "Upper Bound Imputation")
coef_mi_lag_all     <- extract_all_coefs(results_mi_lag$nb_model,     "Multiple Imputation")
coef_bayes1_lag_all <- extract_all_coefs(results_bayes1_lag$nb_model, "Bayesian Imputation 1")
coef_bayes2_lag_all <- extract_all_coefs(results_bayes2_lag$nb_model, "Bayesian Imputation 2")

# Combine into one big table
combined_all_coefs <- bind_rows(
  coef_md_lag_all,
  coef_lower_lag_all,
  coef_upper_lag_all,
  coef_mi_lag_all,
  coef_bayes1_lag_all,
  coef_bayes2_lag_all
)

# Optional: Print (first 20 rows only if table is too large)
kable(
  head(combined_all_coefs, 20),
  digits = 3,
  caption = "Sample of Coefficients from Lagged Negative Binomial Models"
)

Sample of Coefficients from Lagged Negative Binomial Models
Method	term	estimate	std.error	statistic	p.value
Midpoint Imputation	(Intercept)	-163.886	5.255	-31.186	0.000
Midpoint Imputation	Conviction_Count_lag1	0.009	0.011	0.814	0.415
Midpoint Imputation	CountyBergen	0.268	0.108	2.489	0.013
Midpoint Imputation	CountyBurlington	0.075	0.107	0.696	0.486
Midpoint Imputation	CountyCamden	0.750	0.107	7.031	0.000
Midpoint Imputation	CountyCape May	-1.187	0.114	-10.410	0.000
Midpoint Imputation	CountyCumberland	-0.816	0.113	-7.249	0.000
Midpoint Imputation	CountyEssex	0.784	0.107	7.336	0.000
Midpoint Imputation	CountyGloucester	0.050	0.107	0.463	0.643
Midpoint Imputation	CountyHudson	0.138	0.108	1.279	0.201
Midpoint Imputation	CountyHunterdon	-1.927	0.124	-15.497	0.000
Midpoint Imputation	CountyMercer	-0.367	0.110	-3.335	0.001
Midpoint Imputation	CountyMiddlesex	0.394	0.107	3.697	0.000
Midpoint Imputation	CountyMonmouth	0.286	0.108	2.664	0.008
Midpoint Imputation	CountyMorris	-0.307	0.109	-2.820	0.005
Midpoint Imputation	CountyOcean	0.396	0.106	3.723	0.000
Midpoint Imputation	CountyPassaic	-0.021	0.108	-0.193	0.847
Midpoint Imputation	CountySalem	-1.566	0.120	-13.086	0.000
Midpoint Imputation	CountySomerset	-0.837	0.112	-7.461	0.000
Midpoint Imputation	CountySussex	-1.218	0.116	-10.539	0.000

Arrests/Overdoses/County

run_arrest_models <- function(data, label) {
  cat("\n=== Running Models for:", label, "===\n")
  
  # Poisson
  poisson_model <- glm(
    Deaths_num ~ Arrest_Count + factor(County) + factor(Year),
    family = poisson,
    data = data
  )
  
  # Check overdispersion
  dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
  cat("Overdispersion ratio (Poisson):", round(dispersion, 2), "\n")
  
  # Negative Binomial
  nb_model <- glm.nb(
    Deaths_num ~ Arrest_Count + factor(County) + factor(Year),
    data = data
  )
  
 # AIC summary as a data frame
  model_summary <- data.frame(
    Method = label,
    Poisson_AIC = AIC(poisson_model),
    NegBin_AIC = AIC(nb_model),
    Overdispersion = round(dispersion, 2)
  )
  
  return(list(
    label = label,
    poisson_model = poisson_model,
    nb_model = nb_model,
    poisson_summary = summary(poisson_model),
    nb_summary = summary(nb_model),
    summary = model_summary,
    dispersion = dispersion
  ))
}

# 1. Filter datasets for Year >= 2017
combined_df_county_md_2017   <- combined_df_county_md   %>% filter(Year >= 2017)
df_lower_2017                <- df_lower                %>% filter(Year >= 2017)
df_upper_2017                <- df_upper                %>% filter(Year >= 2017)
imputed_df_2017              <- imputed_df_MI              %>% filter(Year >= 2017)
combined_df_county_Bay1_2017 <- combined_df_county_Bay1 %>% filter(Year >= 2017)
combined_df_county_Bay2_2017 <- combined_df_county_Bay2 %>% filter(Year >= 2017)

# 2. Run models
results_md_arrest     <- run_arrest_models(combined_df_county_md_2017, "Midpoint Imputation")

## 
## === Running Models for: Midpoint Imputation ===
## Overdispersion ratio (Poisson): 2.26

results_lower_arrest  <- run_arrest_models(df_lower_2017,                "Lower Bound Imputation")

## 
## === Running Models for: Lower Bound Imputation ===
## Overdispersion ratio (Poisson): 2.26

results_upper_arrest  <- run_arrest_models(df_upper_2017,                "Upper Bound Imputation")

## 
## === Running Models for: Upper Bound Imputation ===
## Overdispersion ratio (Poisson): 2.26

results_mi_arrest     <- run_arrest_models(imputed_df_2017,              "Multiple Imputation")

## 
## === Running Models for: Multiple Imputation ===
## Overdispersion ratio (Poisson): 2.26

results_bayes1_arrest <- run_arrest_models(combined_df_county_Bay1_2017, "Bayesian Imputation 1")

## 
## === Running Models for: Bayesian Imputation 1 ===
## Overdispersion ratio (Poisson): 2.26

results_bayes2_arrest <- run_arrest_models(combined_df_county_Bay2_2017, "Bayesian Imputation 2")

## 
## === Running Models for: Bayesian Imputation 2 ===
## Overdispersion ratio (Poisson): 2.26

library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries_24 <- bind_rows(
  results_md_arrest$summary,
results_lower_arrest$summary, 
results_upper_arrest$summary,
results_mi_arrest$summary,  
results_bayes1_arrest$summary,
results_bayes2_arrest$summary 
)

# Print as a markdown table
kable(model_summaries_24, caption = "Model Fit Comparison: AIC Values")

Model Fit Comparison: AIC Values
Method	Poisson_AIC	NegBin_AIC	Overdispersion
Midpoint Imputation	1279.785	1241.888	2.26
Lower Bound Imputation	1279.785	1241.888	2.26
Upper Bound Imputation	1279.785	1241.888	2.26
Multiple Imputation	1279.785	1241.888	2.26
Bayesian Imputation 1	1279.785	1241.888	2.26
Bayesian Imputation 2	1279.785	1241.888	2.26

library(dplyr)
library(broom)
library(knitr)

# List of results
all_arrest_models <- list(
  results_md_arrest,
  results_lower_arrest,
  results_upper_arrest,
  results_mi_arrest,
  results_bayes1_arrest,
  results_bayes2_arrest
)

# Helper to extract and label coefficients from Poisson and NB models
extract_coefs <- function(model_list) {
  label <- model_list$label
  
  # Extract tidy summaries
  poisson_coefs <- broom::tidy(model_list$poisson_model) %>%
    mutate(Model = "Poisson", Method = label)
  
  nb_coefs <- broom::tidy(model_list$nb_model) %>%
    mutate(Model = "NegBin", Method = label)
  
  bind_rows(poisson_coefs, nb_coefs)
}

# Combine all coefficients
combined_coefs55 <- purrr::map_df(all_arrest_models, extract_coefs)

# Optional: Filter to terms of interest (e.g., Arrest_Count)
arrest_coefs <- combined_coefs55 %>%
  filter(term == "Arrest_Count") %>%
  dplyr::select(Method, Model, term, estimate, std.error, statistic, p.value)

# Print the results in a nice table
kable(combined_coefs55, digits = 3, caption = "Arrest Data Coefficients Across Models")

Arrest Data Coefficients Across Models
term	estimate	std.error	statistic	p.value	Model	Method
(Intercept)	5.022	0.040	127.046	0.000	Poisson	Midpoint Imputation
Arrest_Count	-0.002	0.002	-0.798	0.425	Poisson	Midpoint Imputation
factor(County)Bergen	-0.001	0.045	-0.027	0.978	Poisson	Midpoint Imputation
factor(County)Burlington	0.000	0.042	-0.011	0.992	Poisson	Midpoint Imputation
factor(County)Camden	0.604	0.041	14.838	0.000	Poisson	Midpoint Imputation
factor(County)Cape May	-1.146	0.061	-18.758	0.000	Poisson	Midpoint Imputation
factor(County)Cumberland	-0.654	0.054	-12.226	0.000	Poisson	Midpoint Imputation
factor(County)Essex	0.733	0.039	18.691	0.000	Poisson	Midpoint Imputation
factor(County)Gloucester	-0.132	0.046	-2.887	0.004	Poisson	Midpoint Imputation
factor(County)Hudson	0.047	0.045	1.047	0.295	Poisson	Midpoint Imputation
factor(County)Hunterdon	-2.136	0.093	-22.921	0.000	Poisson	Midpoint Imputation
factor(County)Mercer	-0.279	0.048	-5.754	0.000	Poisson	Midpoint Imputation
factor(County)Middlesex	0.298	0.040	7.393	0.000	Poisson	Midpoint Imputation
factor(County)Monmouth	0.127	0.043	2.990	0.003	Poisson	Midpoint Imputation
factor(County)Morris	-0.549	0.051	-10.795	0.000	Poisson	Midpoint Imputation
factor(County)Ocean	0.316	0.040	7.937	0.000	Poisson	Midpoint Imputation
factor(County)Passaic	-0.140	0.046	-3.059	0.002	Poisson	Midpoint Imputation
factor(County)Salem	-1.593	0.075	-21.377	0.000	Poisson	Midpoint Imputation
factor(County)Somerset	-1.063	0.061	-17.324	0.000	Poisson	Midpoint Imputation
factor(County)Sussex	-1.361	0.068	-19.916	0.000	Poisson	Midpoint Imputation
factor(County)Union	-0.068	0.046	-1.474	0.140	Poisson	Midpoint Imputation
factor(County)Warren	-1.571	0.074	-21.268	0.000	Poisson	Midpoint Imputation
factor(Year)2018	0.078	0.027	2.922	0.003	Poisson	Midpoint Imputation
factor(Year)2019	0.045	0.027	1.678	0.093	Poisson	Midpoint Imputation
factor(Year)2020	0.055	0.027	2.022	0.043	Poisson	Midpoint Imputation
factor(Year)2021	0.128	0.027	4.815	0.000	Poisson	Midpoint Imputation
factor(Year)2022	0.104	0.027	3.910	0.000	Poisson	Midpoint Imputation
factor(Year)2023	-0.002	0.028	-0.069	0.945	Poisson	Midpoint Imputation
(Intercept)	5.036	0.055	90.812	0.000	NegBin	Midpoint Imputation
Arrest_Count	-0.003	0.003	-0.817	0.414	NegBin	Midpoint Imputation
factor(County)Bergen	-0.008	0.062	-0.136	0.892	NegBin	Midpoint Imputation
factor(County)Burlington	0.000	0.059	0.001	0.999	NegBin	Midpoint Imputation
factor(County)Camden	0.599	0.060	9.935	0.000	NegBin	Midpoint Imputation
factor(County)Cape May	-1.148	0.074	-15.584	0.000	NegBin	Midpoint Imputation
factor(County)Cumberland	-0.659	0.069	-9.506	0.000	NegBin	Midpoint Imputation
factor(County)Essex	0.730	0.059	12.426	0.000	NegBin	Midpoint Imputation
factor(County)Gloucester	-0.136	0.063	-2.157	0.031	NegBin	Midpoint Imputation
factor(County)Hudson	0.041	0.064	0.650	0.515	NegBin	Midpoint Imputation
factor(County)Hunterdon	-2.141	0.103	-20.760	0.000	NegBin	Midpoint Imputation
factor(County)Mercer	-0.286	0.066	-4.348	0.000	NegBin	Midpoint Imputation
factor(County)Middlesex	0.294	0.058	5.073	0.000	NegBin	Midpoint Imputation
factor(County)Monmouth	0.123	0.060	2.043	0.041	NegBin	Midpoint Imputation
factor(County)Morris	-0.553	0.066	-8.322	0.000	NegBin	Midpoint Imputation
factor(County)Ocean	0.314	0.058	5.458	0.000	NegBin	Midpoint Imputation
factor(County)Passaic	-0.145	0.063	-2.306	0.021	NegBin	Midpoint Imputation
factor(County)Salem	-1.599	0.087	-18.422	0.000	NegBin	Midpoint Imputation
factor(County)Somerset	-1.070	0.076	-14.095	0.000	NegBin	Midpoint Imputation
factor(County)Sussex	-1.367	0.082	-16.734	0.000	NegBin	Midpoint Imputation
factor(County)Union	-0.075	0.064	-1.169	0.243	NegBin	Midpoint Imputation
factor(County)Warren	-1.577	0.086	-18.285	0.000	NegBin	Midpoint Imputation
factor(Year)2018	0.083	0.037	2.228	0.026	NegBin	Midpoint Imputation
factor(Year)2019	0.042	0.038	1.110	0.267	NegBin	Midpoint Imputation
factor(Year)2020	0.057	0.038	1.513	0.130	NegBin	Midpoint Imputation
factor(Year)2021	0.120	0.037	3.204	0.001	NegBin	Midpoint Imputation
factor(Year)2022	0.083	0.038	2.201	0.028	NegBin	Midpoint Imputation
factor(Year)2023	-0.029	0.038	-0.752	0.452	NegBin	Midpoint Imputation
(Intercept)	5.022	0.040	127.046	0.000	Poisson	Lower Bound Imputation
Arrest_Count	-0.002	0.002	-0.798	0.425	Poisson	Lower Bound Imputation
factor(County)Bergen	-0.001	0.045	-0.027	0.978	Poisson	Lower Bound Imputation
factor(County)Burlington	0.000	0.042	-0.011	0.992	Poisson	Lower Bound Imputation
factor(County)Camden	0.604	0.041	14.838	0.000	Poisson	Lower Bound Imputation
factor(County)Cape May	-1.146	0.061	-18.758	0.000	Poisson	Lower Bound Imputation
factor(County)Cumberland	-0.654	0.054	-12.226	0.000	Poisson	Lower Bound Imputation
factor(County)Essex	0.733	0.039	18.691	0.000	Poisson	Lower Bound Imputation
factor(County)Gloucester	-0.132	0.046	-2.887	0.004	Poisson	Lower Bound Imputation
factor(County)Hudson	0.047	0.045	1.047	0.295	Poisson	Lower Bound Imputation
factor(County)Hunterdon	-2.136	0.093	-22.921	0.000	Poisson	Lower Bound Imputation
factor(County)Mercer	-0.279	0.048	-5.754	0.000	Poisson	Lower Bound Imputation
factor(County)Middlesex	0.298	0.040	7.393	0.000	Poisson	Lower Bound Imputation
factor(County)Monmouth	0.127	0.043	2.990	0.003	Poisson	Lower Bound Imputation
factor(County)Morris	-0.549	0.051	-10.795	0.000	Poisson	Lower Bound Imputation
factor(County)Ocean	0.316	0.040	7.937	0.000	Poisson	Lower Bound Imputation
factor(County)Passaic	-0.140	0.046	-3.059	0.002	Poisson	Lower Bound Imputation
factor(County)Salem	-1.593	0.075	-21.377	0.000	Poisson	Lower Bound Imputation
factor(County)Somerset	-1.063	0.061	-17.324	0.000	Poisson	Lower Bound Imputation
factor(County)Sussex	-1.361	0.068	-19.916	0.000	Poisson	Lower Bound Imputation
factor(County)Union	-0.068	0.046	-1.474	0.140	Poisson	Lower Bound Imputation
factor(County)Warren	-1.571	0.074	-21.268	0.000	Poisson	Lower Bound Imputation
factor(Year)2018	0.078	0.027	2.922	0.003	Poisson	Lower Bound Imputation
factor(Year)2019	0.045	0.027	1.678	0.093	Poisson	Lower Bound Imputation
factor(Year)2020	0.055	0.027	2.022	0.043	Poisson	Lower Bound Imputation
factor(Year)2021	0.128	0.027	4.815	0.000	Poisson	Lower Bound Imputation
factor(Year)2022	0.104	0.027	3.910	0.000	Poisson	Lower Bound Imputation
factor(Year)2023	-0.002	0.028	-0.069	0.945	Poisson	Lower Bound Imputation
(Intercept)	5.036	0.055	90.812	0.000	NegBin	Lower Bound Imputation
Arrest_Count	-0.003	0.003	-0.817	0.414	NegBin	Lower Bound Imputation
factor(County)Bergen	-0.008	0.062	-0.136	0.892	NegBin	Lower Bound Imputation
factor(County)Burlington	0.000	0.059	0.001	0.999	NegBin	Lower Bound Imputation
factor(County)Camden	0.599	0.060	9.935	0.000	NegBin	Lower Bound Imputation
factor(County)Cape May	-1.148	0.074	-15.584	0.000	NegBin	Lower Bound Imputation
factor(County)Cumberland	-0.659	0.069	-9.506	0.000	NegBin	Lower Bound Imputation
factor(County)Essex	0.730	0.059	12.426	0.000	NegBin	Lower Bound Imputation
factor(County)Gloucester	-0.136	0.063	-2.157	0.031	NegBin	Lower Bound Imputation
factor(County)Hudson	0.041	0.064	0.650	0.515	NegBin	Lower Bound Imputation
factor(County)Hunterdon	-2.141	0.103	-20.760	0.000	NegBin	Lower Bound Imputation
factor(County)Mercer	-0.286	0.066	-4.348	0.000	NegBin	Lower Bound Imputation
factor(County)Middlesex	0.294	0.058	5.073	0.000	NegBin	Lower Bound Imputation
factor(County)Monmouth	0.123	0.060	2.043	0.041	NegBin	Lower Bound Imputation
factor(County)Morris	-0.553	0.066	-8.322	0.000	NegBin	Lower Bound Imputation
factor(County)Ocean	0.314	0.058	5.458	0.000	NegBin	Lower Bound Imputation
factor(County)Passaic	-0.145	0.063	-2.306	0.021	NegBin	Lower Bound Imputation
factor(County)Salem	-1.599	0.087	-18.422	0.000	NegBin	Lower Bound Imputation
factor(County)Somerset	-1.070	0.076	-14.095	0.000	NegBin	Lower Bound Imputation
factor(County)Sussex	-1.367	0.082	-16.734	0.000	NegBin	Lower Bound Imputation
factor(County)Union	-0.075	0.064	-1.169	0.243	NegBin	Lower Bound Imputation
factor(County)Warren	-1.577	0.086	-18.285	0.000	NegBin	Lower Bound Imputation
factor(Year)2018	0.083	0.037	2.228	0.026	NegBin	Lower Bound Imputation
factor(Year)2019	0.042	0.038	1.110	0.267	NegBin	Lower Bound Imputation
factor(Year)2020	0.057	0.038	1.513	0.130	NegBin	Lower Bound Imputation
factor(Year)2021	0.120	0.037	3.204	0.001	NegBin	Lower Bound Imputation
factor(Year)2022	0.083	0.038	2.201	0.028	NegBin	Lower Bound Imputation
factor(Year)2023	-0.029	0.038	-0.752	0.452	NegBin	Lower Bound Imputation
(Intercept)	5.022	0.040	127.046	0.000	Poisson	Upper Bound Imputation
Arrest_Count	-0.002	0.002	-0.798	0.425	Poisson	Upper Bound Imputation
factor(County)Bergen	-0.001	0.045	-0.027	0.978	Poisson	Upper Bound Imputation
factor(County)Burlington	0.000	0.042	-0.011	0.992	Poisson	Upper Bound Imputation
factor(County)Camden	0.604	0.041	14.838	0.000	Poisson	Upper Bound Imputation
factor(County)Cape May	-1.146	0.061	-18.758	0.000	Poisson	Upper Bound Imputation
factor(County)Cumberland	-0.654	0.054	-12.226	0.000	Poisson	Upper Bound Imputation
factor(County)Essex	0.733	0.039	18.691	0.000	Poisson	Upper Bound Imputation
factor(County)Gloucester	-0.132	0.046	-2.887	0.004	Poisson	Upper Bound Imputation
factor(County)Hudson	0.047	0.045	1.047	0.295	Poisson	Upper Bound Imputation
factor(County)Hunterdon	-2.136	0.093	-22.921	0.000	Poisson	Upper Bound Imputation
factor(County)Mercer	-0.279	0.048	-5.754	0.000	Poisson	Upper Bound Imputation
factor(County)Middlesex	0.298	0.040	7.393	0.000	Poisson	Upper Bound Imputation
factor(County)Monmouth	0.127	0.043	2.990	0.003	Poisson	Upper Bound Imputation
factor(County)Morris	-0.549	0.051	-10.795	0.000	Poisson	Upper Bound Imputation
factor(County)Ocean	0.316	0.040	7.937	0.000	Poisson	Upper Bound Imputation
factor(County)Passaic	-0.140	0.046	-3.059	0.002	Poisson	Upper Bound Imputation
factor(County)Salem	-1.593	0.075	-21.377	0.000	Poisson	Upper Bound Imputation
factor(County)Somerset	-1.063	0.061	-17.324	0.000	Poisson	Upper Bound Imputation
factor(County)Sussex	-1.361	0.068	-19.916	0.000	Poisson	Upper Bound Imputation
factor(County)Union	-0.068	0.046	-1.474	0.140	Poisson	Upper Bound Imputation
factor(County)Warren	-1.571	0.074	-21.268	0.000	Poisson	Upper Bound Imputation
factor(Year)2018	0.078	0.027	2.922	0.003	Poisson	Upper Bound Imputation
factor(Year)2019	0.045	0.027	1.678	0.093	Poisson	Upper Bound Imputation
factor(Year)2020	0.055	0.027	2.022	0.043	Poisson	Upper Bound Imputation
factor(Year)2021	0.128	0.027	4.815	0.000	Poisson	Upper Bound Imputation
factor(Year)2022	0.104	0.027	3.910	0.000	Poisson	Upper Bound Imputation
factor(Year)2023	-0.002	0.028	-0.069	0.945	Poisson	Upper Bound Imputation
(Intercept)	5.036	0.055	90.812	0.000	NegBin	Upper Bound Imputation
Arrest_Count	-0.003	0.003	-0.817	0.414	NegBin	Upper Bound Imputation
factor(County)Bergen	-0.008	0.062	-0.136	0.892	NegBin	Upper Bound Imputation
factor(County)Burlington	0.000	0.059	0.001	0.999	NegBin	Upper Bound Imputation
factor(County)Camden	0.599	0.060	9.935	0.000	NegBin	Upper Bound Imputation
factor(County)Cape May	-1.148	0.074	-15.584	0.000	NegBin	Upper Bound Imputation
factor(County)Cumberland	-0.659	0.069	-9.506	0.000	NegBin	Upper Bound Imputation
factor(County)Essex	0.730	0.059	12.426	0.000	NegBin	Upper Bound Imputation
factor(County)Gloucester	-0.136	0.063	-2.157	0.031	NegBin	Upper Bound Imputation
factor(County)Hudson	0.041	0.064	0.650	0.515	NegBin	Upper Bound Imputation
factor(County)Hunterdon	-2.141	0.103	-20.760	0.000	NegBin	Upper Bound Imputation
factor(County)Mercer	-0.286	0.066	-4.348	0.000	NegBin	Upper Bound Imputation
factor(County)Middlesex	0.294	0.058	5.073	0.000	NegBin	Upper Bound Imputation
factor(County)Monmouth	0.123	0.060	2.043	0.041	NegBin	Upper Bound Imputation
factor(County)Morris	-0.553	0.066	-8.322	0.000	NegBin	Upper Bound Imputation
factor(County)Ocean	0.314	0.058	5.458	0.000	NegBin	Upper Bound Imputation
factor(County)Passaic	-0.145	0.063	-2.306	0.021	NegBin	Upper Bound Imputation
factor(County)Salem	-1.599	0.087	-18.422	0.000	NegBin	Upper Bound Imputation
factor(County)Somerset	-1.070	0.076	-14.095	0.000	NegBin	Upper Bound Imputation
factor(County)Sussex	-1.367	0.082	-16.734	0.000	NegBin	Upper Bound Imputation
factor(County)Union	-0.075	0.064	-1.169	0.243	NegBin	Upper Bound Imputation
factor(County)Warren	-1.577	0.086	-18.285	0.000	NegBin	Upper Bound Imputation
factor(Year)2018	0.083	0.037	2.228	0.026	NegBin	Upper Bound Imputation
factor(Year)2019	0.042	0.038	1.110	0.267	NegBin	Upper Bound Imputation
factor(Year)2020	0.057	0.038	1.513	0.130	NegBin	Upper Bound Imputation
factor(Year)2021	0.120	0.037	3.204	0.001	NegBin	Upper Bound Imputation
factor(Year)2022	0.083	0.038	2.201	0.028	NegBin	Upper Bound Imputation
factor(Year)2023	-0.029	0.038	-0.752	0.452	NegBin	Upper Bound Imputation
(Intercept)	5.022	0.040	127.046	0.000	Poisson	Multiple Imputation
Arrest_Count	-0.002	0.002	-0.798	0.425	Poisson	Multiple Imputation
factor(County)Bergen	-0.001	0.045	-0.027	0.978	Poisson	Multiple Imputation
factor(County)Burlington	0.000	0.042	-0.011	0.992	Poisson	Multiple Imputation
factor(County)Camden	0.604	0.041	14.838	0.000	Poisson	Multiple Imputation
factor(County)Cape May	-1.146	0.061	-18.758	0.000	Poisson	Multiple Imputation
factor(County)Cumberland	-0.654	0.054	-12.226	0.000	Poisson	Multiple Imputation
factor(County)Essex	0.733	0.039	18.691	0.000	Poisson	Multiple Imputation
factor(County)Gloucester	-0.132	0.046	-2.887	0.004	Poisson	Multiple Imputation
factor(County)Hudson	0.047	0.045	1.047	0.295	Poisson	Multiple Imputation
factor(County)Hunterdon	-2.136	0.093	-22.921	0.000	Poisson	Multiple Imputation
factor(County)Mercer	-0.279	0.048	-5.754	0.000	Poisson	Multiple Imputation
factor(County)Middlesex	0.298	0.040	7.393	0.000	Poisson	Multiple Imputation
factor(County)Monmouth	0.127	0.043	2.990	0.003	Poisson	Multiple Imputation
factor(County)Morris	-0.549	0.051	-10.795	0.000	Poisson	Multiple Imputation
factor(County)Ocean	0.316	0.040	7.937	0.000	Poisson	Multiple Imputation
factor(County)Passaic	-0.140	0.046	-3.059	0.002	Poisson	Multiple Imputation
factor(County)Salem	-1.593	0.075	-21.377	0.000	Poisson	Multiple Imputation
factor(County)Somerset	-1.063	0.061	-17.324	0.000	Poisson	Multiple Imputation
factor(County)Sussex	-1.361	0.068	-19.916	0.000	Poisson	Multiple Imputation
factor(County)Union	-0.068	0.046	-1.474	0.140	Poisson	Multiple Imputation
factor(County)Warren	-1.571	0.074	-21.268	0.000	Poisson	Multiple Imputation
factor(Year)2018	0.078	0.027	2.922	0.003	Poisson	Multiple Imputation
factor(Year)2019	0.045	0.027	1.678	0.093	Poisson	Multiple Imputation
factor(Year)2020	0.055	0.027	2.022	0.043	Poisson	Multiple Imputation
factor(Year)2021	0.128	0.027	4.815	0.000	Poisson	Multiple Imputation
factor(Year)2022	0.104	0.027	3.910	0.000	Poisson	Multiple Imputation
factor(Year)2023	-0.002	0.028	-0.069	0.945	Poisson	Multiple Imputation
(Intercept)	5.036	0.055	90.812	0.000	NegBin	Multiple Imputation
Arrest_Count	-0.003	0.003	-0.817	0.414	NegBin	Multiple Imputation
factor(County)Bergen	-0.008	0.062	-0.136	0.892	NegBin	Multiple Imputation
factor(County)Burlington	0.000	0.059	0.001	0.999	NegBin	Multiple Imputation
factor(County)Camden	0.599	0.060	9.935	0.000	NegBin	Multiple Imputation
factor(County)Cape May	-1.148	0.074	-15.584	0.000	NegBin	Multiple Imputation
factor(County)Cumberland	-0.659	0.069	-9.506	0.000	NegBin	Multiple Imputation
factor(County)Essex	0.730	0.059	12.426	0.000	NegBin	Multiple Imputation
factor(County)Gloucester	-0.136	0.063	-2.157	0.031	NegBin	Multiple Imputation
factor(County)Hudson	0.041	0.064	0.650	0.515	NegBin	Multiple Imputation
factor(County)Hunterdon	-2.141	0.103	-20.760	0.000	NegBin	Multiple Imputation
factor(County)Mercer	-0.286	0.066	-4.348	0.000	NegBin	Multiple Imputation
factor(County)Middlesex	0.294	0.058	5.073	0.000	NegBin	Multiple Imputation
factor(County)Monmouth	0.123	0.060	2.043	0.041	NegBin	Multiple Imputation
factor(County)Morris	-0.553	0.066	-8.322	0.000	NegBin	Multiple Imputation
factor(County)Ocean	0.314	0.058	5.458	0.000	NegBin	Multiple Imputation
factor(County)Passaic	-0.145	0.063	-2.306	0.021	NegBin	Multiple Imputation
factor(County)Salem	-1.599	0.087	-18.422	0.000	NegBin	Multiple Imputation
factor(County)Somerset	-1.070	0.076	-14.095	0.000	NegBin	Multiple Imputation
factor(County)Sussex	-1.367	0.082	-16.734	0.000	NegBin	Multiple Imputation
factor(County)Union	-0.075	0.064	-1.169	0.243	NegBin	Multiple Imputation
factor(County)Warren	-1.577	0.086	-18.285	0.000	NegBin	Multiple Imputation
factor(Year)2018	0.083	0.037	2.228	0.026	NegBin	Multiple Imputation
factor(Year)2019	0.042	0.038	1.110	0.267	NegBin	Multiple Imputation
factor(Year)2020	0.057	0.038	1.513	0.130	NegBin	Multiple Imputation
factor(Year)2021	0.120	0.037	3.204	0.001	NegBin	Multiple Imputation
factor(Year)2022	0.083	0.038	2.201	0.028	NegBin	Multiple Imputation
factor(Year)2023	-0.029	0.038	-0.752	0.452	NegBin	Multiple Imputation
(Intercept)	5.022	0.040	127.046	0.000	Poisson	Bayesian Imputation 1
Arrest_Count	-0.002	0.002	-0.798	0.425	Poisson	Bayesian Imputation 1
factor(County)Bergen	-0.001	0.045	-0.027	0.978	Poisson	Bayesian Imputation 1
factor(County)Burlington	0.000	0.042	-0.011	0.992	Poisson	Bayesian Imputation 1
factor(County)Camden	0.604	0.041	14.838	0.000	Poisson	Bayesian Imputation 1
factor(County)Cape May	-1.146	0.061	-18.758	0.000	Poisson	Bayesian Imputation 1
factor(County)Cumberland	-0.654	0.054	-12.226	0.000	Poisson	Bayesian Imputation 1
factor(County)Essex	0.733	0.039	18.691	0.000	Poisson	Bayesian Imputation 1
factor(County)Gloucester	-0.132	0.046	-2.887	0.004	Poisson	Bayesian Imputation 1
factor(County)Hudson	0.047	0.045	1.047	0.295	Poisson	Bayesian Imputation 1
factor(County)Hunterdon	-2.136	0.093	-22.921	0.000	Poisson	Bayesian Imputation 1
factor(County)Mercer	-0.279	0.048	-5.754	0.000	Poisson	Bayesian Imputation 1
factor(County)Middlesex	0.298	0.040	7.393	0.000	Poisson	Bayesian Imputation 1
factor(County)Monmouth	0.127	0.043	2.990	0.003	Poisson	Bayesian Imputation 1
factor(County)Morris	-0.549	0.051	-10.795	0.000	Poisson	Bayesian Imputation 1
factor(County)Ocean	0.316	0.040	7.937	0.000	Poisson	Bayesian Imputation 1
factor(County)Passaic	-0.140	0.046	-3.059	0.002	Poisson	Bayesian Imputation 1
factor(County)Salem	-1.593	0.075	-21.377	0.000	Poisson	Bayesian Imputation 1
factor(County)Somerset	-1.063	0.061	-17.324	0.000	Poisson	Bayesian Imputation 1
factor(County)Sussex	-1.361	0.068	-19.916	0.000	Poisson	Bayesian Imputation 1
factor(County)Union	-0.068	0.046	-1.474	0.140	Poisson	Bayesian Imputation 1
factor(County)Warren	-1.571	0.074	-21.268	0.000	Poisson	Bayesian Imputation 1
factor(Year)2018	0.078	0.027	2.922	0.003	Poisson	Bayesian Imputation 1
factor(Year)2019	0.045	0.027	1.678	0.093	Poisson	Bayesian Imputation 1
factor(Year)2020	0.055	0.027	2.022	0.043	Poisson	Bayesian Imputation 1
factor(Year)2021	0.128	0.027	4.815	0.000	Poisson	Bayesian Imputation 1
factor(Year)2022	0.104	0.027	3.910	0.000	Poisson	Bayesian Imputation 1
factor(Year)2023	-0.002	0.028	-0.069	0.945	Poisson	Bayesian Imputation 1
(Intercept)	5.036	0.055	90.812	0.000	NegBin	Bayesian Imputation 1
Arrest_Count	-0.003	0.003	-0.817	0.414	NegBin	Bayesian Imputation 1
factor(County)Bergen	-0.008	0.062	-0.136	0.892	NegBin	Bayesian Imputation 1
factor(County)Burlington	0.000	0.059	0.001	0.999	NegBin	Bayesian Imputation 1
factor(County)Camden	0.599	0.060	9.935	0.000	NegBin	Bayesian Imputation 1
factor(County)Cape May	-1.148	0.074	-15.584	0.000	NegBin	Bayesian Imputation 1
factor(County)Cumberland	-0.659	0.069	-9.506	0.000	NegBin	Bayesian Imputation 1
factor(County)Essex	0.730	0.059	12.426	0.000	NegBin	Bayesian Imputation 1
factor(County)Gloucester	-0.136	0.063	-2.157	0.031	NegBin	Bayesian Imputation 1
factor(County)Hudson	0.041	0.064	0.650	0.515	NegBin	Bayesian Imputation 1
factor(County)Hunterdon	-2.141	0.103	-20.760	0.000	NegBin	Bayesian Imputation 1
factor(County)Mercer	-0.286	0.066	-4.348	0.000	NegBin	Bayesian Imputation 1
factor(County)Middlesex	0.294	0.058	5.073	0.000	NegBin	Bayesian Imputation 1
factor(County)Monmouth	0.123	0.060	2.043	0.041	NegBin	Bayesian Imputation 1
factor(County)Morris	-0.553	0.066	-8.322	0.000	NegBin	Bayesian Imputation 1
factor(County)Ocean	0.314	0.058	5.458	0.000	NegBin	Bayesian Imputation 1
factor(County)Passaic	-0.145	0.063	-2.306	0.021	NegBin	Bayesian Imputation 1
factor(County)Salem	-1.599	0.087	-18.422	0.000	NegBin	Bayesian Imputation 1
factor(County)Somerset	-1.070	0.076	-14.095	0.000	NegBin	Bayesian Imputation 1
factor(County)Sussex	-1.367	0.082	-16.734	0.000	NegBin	Bayesian Imputation 1
factor(County)Union	-0.075	0.064	-1.169	0.243	NegBin	Bayesian Imputation 1
factor(County)Warren	-1.577	0.086	-18.285	0.000	NegBin	Bayesian Imputation 1
factor(Year)2018	0.083	0.037	2.228	0.026	NegBin	Bayesian Imputation 1
factor(Year)2019	0.042	0.038	1.110	0.267	NegBin	Bayesian Imputation 1
factor(Year)2020	0.057	0.038	1.513	0.130	NegBin	Bayesian Imputation 1
factor(Year)2021	0.120	0.037	3.204	0.001	NegBin	Bayesian Imputation 1
factor(Year)2022	0.083	0.038	2.201	0.028	NegBin	Bayesian Imputation 1
factor(Year)2023	-0.029	0.038	-0.752	0.452	NegBin	Bayesian Imputation 1
(Intercept)	5.022	0.040	127.046	0.000	Poisson	Bayesian Imputation 2
Arrest_Count	-0.002	0.002	-0.798	0.425	Poisson	Bayesian Imputation 2
factor(County)Bergen	-0.001	0.045	-0.027	0.978	Poisson	Bayesian Imputation 2
factor(County)Burlington	0.000	0.042	-0.011	0.992	Poisson	Bayesian Imputation 2
factor(County)Camden	0.604	0.041	14.838	0.000	Poisson	Bayesian Imputation 2
factor(County)Cape May	-1.146	0.061	-18.758	0.000	Poisson	Bayesian Imputation 2
factor(County)Cumberland	-0.654	0.054	-12.226	0.000	Poisson	Bayesian Imputation 2
factor(County)Essex	0.733	0.039	18.691	0.000	Poisson	Bayesian Imputation 2
factor(County)Gloucester	-0.132	0.046	-2.887	0.004	Poisson	Bayesian Imputation 2
factor(County)Hudson	0.047	0.045	1.047	0.295	Poisson	Bayesian Imputation 2
factor(County)Hunterdon	-2.136	0.093	-22.921	0.000	Poisson	Bayesian Imputation 2
factor(County)Mercer	-0.279	0.048	-5.754	0.000	Poisson	Bayesian Imputation 2
factor(County)Middlesex	0.298	0.040	7.393	0.000	Poisson	Bayesian Imputation 2
factor(County)Monmouth	0.127	0.043	2.990	0.003	Poisson	Bayesian Imputation 2
factor(County)Morris	-0.549	0.051	-10.795	0.000	Poisson	Bayesian Imputation 2
factor(County)Ocean	0.316	0.040	7.937	0.000	Poisson	Bayesian Imputation 2
factor(County)Passaic	-0.140	0.046	-3.059	0.002	Poisson	Bayesian Imputation 2
factor(County)Salem	-1.593	0.075	-21.377	0.000	Poisson	Bayesian Imputation 2
factor(County)Somerset	-1.063	0.061	-17.324	0.000	Poisson	Bayesian Imputation 2
factor(County)Sussex	-1.361	0.068	-19.916	0.000	Poisson	Bayesian Imputation 2
factor(County)Union	-0.068	0.046	-1.474	0.140	Poisson	Bayesian Imputation 2
factor(County)Warren	-1.571	0.074	-21.268	0.000	Poisson	Bayesian Imputation 2
factor(Year)2018	0.078	0.027	2.922	0.003	Poisson	Bayesian Imputation 2
factor(Year)2019	0.045	0.027	1.678	0.093	Poisson	Bayesian Imputation 2
factor(Year)2020	0.055	0.027	2.022	0.043	Poisson	Bayesian Imputation 2
factor(Year)2021	0.128	0.027	4.815	0.000	Poisson	Bayesian Imputation 2
factor(Year)2022	0.104	0.027	3.910	0.000	Poisson	Bayesian Imputation 2
factor(Year)2023	-0.002	0.028	-0.069	0.945	Poisson	Bayesian Imputation 2
(Intercept)	5.036	0.055	90.812	0.000	NegBin	Bayesian Imputation 2
Arrest_Count	-0.003	0.003	-0.817	0.414	NegBin	Bayesian Imputation 2
factor(County)Bergen	-0.008	0.062	-0.136	0.892	NegBin	Bayesian Imputation 2
factor(County)Burlington	0.000	0.059	0.001	0.999	NegBin	Bayesian Imputation 2
factor(County)Camden	0.599	0.060	9.935	0.000	NegBin	Bayesian Imputation 2
factor(County)Cape May	-1.148	0.074	-15.584	0.000	NegBin	Bayesian Imputation 2
factor(County)Cumberland	-0.659	0.069	-9.506	0.000	NegBin	Bayesian Imputation 2
factor(County)Essex	0.730	0.059	12.426	0.000	NegBin	Bayesian Imputation 2
factor(County)Gloucester	-0.136	0.063	-2.157	0.031	NegBin	Bayesian Imputation 2
factor(County)Hudson	0.041	0.064	0.650	0.515	NegBin	Bayesian Imputation 2
factor(County)Hunterdon	-2.141	0.103	-20.760	0.000	NegBin	Bayesian Imputation 2
factor(County)Mercer	-0.286	0.066	-4.348	0.000	NegBin	Bayesian Imputation 2
factor(County)Middlesex	0.294	0.058	5.073	0.000	NegBin	Bayesian Imputation 2
factor(County)Monmouth	0.123	0.060	2.043	0.041	NegBin	Bayesian Imputation 2
factor(County)Morris	-0.553	0.066	-8.322	0.000	NegBin	Bayesian Imputation 2
factor(County)Ocean	0.314	0.058	5.458	0.000	NegBin	Bayesian Imputation 2
factor(County)Passaic	-0.145	0.063	-2.306	0.021	NegBin	Bayesian Imputation 2
factor(County)Salem	-1.599	0.087	-18.422	0.000	NegBin	Bayesian Imputation 2
factor(County)Somerset	-1.070	0.076	-14.095	0.000	NegBin	Bayesian Imputation 2
factor(County)Sussex	-1.367	0.082	-16.734	0.000	NegBin	Bayesian Imputation 2
factor(County)Union	-0.075	0.064	-1.169	0.243	NegBin	Bayesian Imputation 2
factor(County)Warren	-1.577	0.086	-18.285	0.000	NegBin	Bayesian Imputation 2
factor(Year)2018	0.083	0.037	2.228	0.026	NegBin	Bayesian Imputation 2
factor(Year)2019	0.042	0.038	1.110	0.267	NegBin	Bayesian Imputation 2
factor(Year)2020	0.057	0.038	1.513	0.130	NegBin	Bayesian Imputation 2
factor(Year)2021	0.120	0.037	3.204	0.001	NegBin	Bayesian Imputation 2
factor(Year)2022	0.083	0.038	2.201	0.028	NegBin	Bayesian Imputation 2
factor(Year)2023	-0.029	0.038	-0.752	0.452	NegBin	Bayesian Imputation 2

kable(arrest_coefs, digits = 3, caption = "Arrest_Count Coefficients Across Models" )

Arrest_Count Coefficients Across Models
Method	Model	term	estimate	std.error	statistic	p.value
Midpoint Imputation	Poisson	Arrest_Count	-0.002	0.002	-0.798	0.425
Midpoint Imputation	NegBin	Arrest_Count	-0.003	0.003	-0.817	0.414
Lower Bound Imputation	Poisson	Arrest_Count	-0.002	0.002	-0.798	0.425
Lower Bound Imputation	NegBin	Arrest_Count	-0.003	0.003	-0.817	0.414
Upper Bound Imputation	Poisson	Arrest_Count	-0.002	0.002	-0.798	0.425
Upper Bound Imputation	NegBin	Arrest_Count	-0.003	0.003	-0.817	0.414
Multiple Imputation	Poisson	Arrest_Count	-0.002	0.002	-0.798	0.425
Multiple Imputation	NegBin	Arrest_Count	-0.003	0.003	-0.817	0.414
Bayesian Imputation 1	Poisson	Arrest_Count	-0.002	0.002	-0.798	0.425
Bayesian Imputation 1	NegBin	Arrest_Count	-0.003	0.003	-0.817	0.414
Bayesian Imputation 2	Poisson	Arrest_Count	-0.002	0.002	-0.798	0.425
Bayesian Imputation 2	NegBin	Arrest_Count	-0.003	0.003	-0.817	0.414

Arrests_Count is not statistically significant…. So not much to report here… @Eli any other ideas how I can model this better – would be great appreciated!

# Define reusable lagging function
lag_arrest <- function(df) {
  df %>%
    arrange(County, Year) %>%
    group_by(County) %>%
    mutate(Arrest_Count_lag1 = lag(Arrest_Count, 1)) %>%
    ungroup()
}

# Apply to each imputed dataset
combined_df_county_md_lag_arrest   <- lag_arrest(combined_df_county_md)
df_lower_lag_arrest                <- lag_arrest(df_lower)
df_upper_lag_arrest                <- lag_arrest(df_upper)
imputed_df_MI_lag_arrest           <- lag_arrest(imputed_df_MI)
combined_df_county_Bay1_lag_arrest <- lag_arrest(combined_df_county_Bay1)
combined_df_county_Bay2_lag_arrest <- lag_arrest(combined_df_county_Bay2)

run_lagged_arrest_models <- function(data, label) {
  cat("\n=== Running Lagged Arrest Models for:", label, "===\n")

  # Filter for Year >= 2017
  data <- data %>% filter(Year >= 2017)

  # Fit Poisson model
  poisson_model <- glm(
    Deaths_num ~ Arrest_Count_lag1 + factor(County) + factor(Year),
    family = poisson,
    data = data
  )

  # Check for overdispersion
  dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
  cat("Overdispersion ratio (Poisson):", round(dispersion, 2), "\n")

  # Fit NB model
  nb_model <- MASS::glm.nb(
    Deaths_num ~ Arrest_Count_lag1 + factor(County) + factor(Year),
    data = data
  )

  return(list(
    label = label,
    poisson_model = poisson_model,
    nb_model = nb_model,
    poisson_summary = summary(poisson_model),
    nb_summary = summary(nb_model),
    poisson_AIC = AIC(poisson_model),
    nb_AIC = AIC(nb_model),
    dispersion = dispersion
  ))
}

results_md_arrest_lag     <- run_lagged_arrest_models(combined_df_county_md_lag_arrest,   "Midpoint Imputation")

## 
## === Running Lagged Arrest Models for: Midpoint Imputation ===
## Overdispersion ratio (Poisson): 2.19

results_lower_arrest_lag  <- run_lagged_arrest_models(df_lower_lag_arrest,                "Lower Bound Imputation")

## 
## === Running Lagged Arrest Models for: Lower Bound Imputation ===
## Overdispersion ratio (Poisson): 2.19

results_upper_arrest_lag  <- run_lagged_arrest_models(df_upper_lag_arrest,                "Upper Bound Imputation")

## 
## === Running Lagged Arrest Models for: Upper Bound Imputation ===
## Overdispersion ratio (Poisson): 2.19

results_mi_arrest_lag     <- run_lagged_arrest_models(imputed_df_MI_lag_arrest,           "Multiple Imputation")

## 
## === Running Lagged Arrest Models for: Multiple Imputation ===
## Overdispersion ratio (Poisson): 2.26

results_bayes1_arrest_lag <- run_lagged_arrest_models(combined_df_county_Bay1_lag_arrest, "Bayesian Imputation 1")

## 
## === Running Lagged Arrest Models for: Bayesian Imputation 1 ===
## Overdispersion ratio (Poisson): 2.19

results_bayes2_arrest_lag <- run_lagged_arrest_models(combined_df_county_Bay2_lag_arrest, "Bayesian Imputation 2")

## 
## === Running Lagged Arrest Models for: Bayesian Imputation 2 ===
## Overdispersion ratio (Poisson): 2.19

library(broom)

extract_arrest_lag_coefs <- function(model, method_label) {
  tidy(model) %>%
    filter(term == "Arrest_Count_lag1") %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

arrest_lag_coefs <- bind_rows(
  extract_arrest_lag_coefs(results_md_arrest_lag$nb_model,     "Midpoint"),
  extract_arrest_lag_coefs(results_lower_arrest_lag$nb_model,  "Lower Bound"),
  extract_arrest_lag_coefs(results_upper_arrest_lag$nb_model,  "Upper Bound"),
  extract_arrest_lag_coefs(results_mi_arrest_lag$nb_model,     "Multiple Imputation"),
  extract_arrest_lag_coefs(results_bayes1_arrest_lag$nb_model, "Bayesian 1"),
  extract_arrest_lag_coefs(results_bayes2_arrest_lag$nb_model, "Bayesian 2")
)

# Display results
knitr::kable(arrest_lag_coefs, digits = 3, caption = "Effect of Lagged Arrests on Overdose Deaths (NB Models)")

Effect of Lagged Arrests on Overdose Deaths (NB Models)
Method	term	estimate	std.error	statistic	p.value
Midpoint	Arrest_Count_lag1	0.001	0.003	0.231	0.817
Lower Bound	Arrest_Count_lag1	0.001	0.003	0.231	0.817
Upper Bound	Arrest_Count_lag1	0.001	0.003	0.231	0.817
Multiple Imputation	Arrest_Count_lag1	0.000	0.003	0.027	0.979
Bayesian 1	Arrest_Count_lag1	0.001	0.003	0.231	0.817
Bayesian 2	Arrest_Count_lag1	0.001	0.003	0.231	0.817

Arrest_Count still not statistically significant predictor.

Possible theoretical explanation: 1 year is too long of an increment to see an affect. Literature suggests the impact is 1 month after arrests/convictions. We can rerun this analysis when we get the CDC’s unsuppressed monthly county data…

~~~~~~~~~~~~~~~~ Notes on Dataset~~~~~~~~~~~~~~~~~~~

State monthly overdose data was a combination of CDC Wonder DATA (1999-2022) and data for “unconfirmed OD’s” 2023-2025 https://ocsme.nj.gov/dashboard - The State of New Jersey, Office of the Chief State Medical Examiner

I added the population data from NJ for 2024 from here because the U.S. Census website is down because of the government closure. https://www.njspotlightnews.org/2025/01/nj-population-2024-census/

Convictions = this a dataset from the New Jersey State Parole Board with numerical and demographic information of people sentenced pursuant to N.J.S.A. 2C:35-9a strict-liability drug-induced death.

dates: August 5, 1988, through June 23, 2025.

variables: Sex, Race, Ethnicity, Date of Sentence, Begin Serve, Min Offense Term, Max Offense Term, County

codebook: none provided

n=385

The following limitations were provided by the Board: Be advised that this report was compiled within the limits of information-gathering technology of the Parole Board Information System and may not include every existing conviction pursuant to N.J.S.A. 2C:35-9a. Be further advised that information related to race/ethnicity is reported based on data provided by the New Jersey Department of Corrections and cannot be verified.

Unknown was changed to NA

Arrests = raw data from the NJ AG’s arrest portal. This is 2017 through 2024, arrests for N.J.S.A. 2C:35-9a strict-liability drug-induced death by county and race. They have collapsed White and Hispanic into one category and it only has three racial categories… White (including Hispanic), Black, and unknown…

dates: 2017 through 2024

variables: ID, County, Arrest Year, Arrest Month, Race, Age Range, Gender, Offense Statute, Offense Category, Arrest source, Status

n=373

codebook: ?

Unknown & Not Provided were changed to NA

Overdose = CDC wonder data, counts of overdose death, yearly

dates: 1999- 2023 variables: Notes, County, County Code, Year, Year Code, Deaths, Population, Crude Rate

crude rate calculated per 100,000

N= 529

Datasource limitations: a big limitation here is that CDC supresses data if the county has less than 15 deaths to report. We are in the process of requesting the unsupressed data.

• About revised 2019 deaths: On March 11, 2021, the 2019 mortality data on CDC WONDER was updated with the 2019 mortality data updated by NCHS on March 4, 2021 to include corrected information for residents of Texas affecting 5 records previously coded to cause code U01.4, Terrorism involving firearms (homicide). The underlying and multiple cause of death codes for 5 records were corrected in the 2019 data. Underlying and multiple cause of death codes for those 5 records were recoded to Assault (homicide) by other and unspecified firearm discharge, ICD-10 code X95. The corrected final death records replaces the data released on December 22, 2020. • About revised 2014 deaths: A revised data file for 2014 was released on April 3, 2017 to include corrections affecting 125 deaths previously coded to “Accidental discharge of firearms” (ICD-10 codes W32-W34). This corrected file replaced the file released on December 9, 2015. The underlying cause of death changed for 125 deaths in 2014 that occurred in Tennessee (80%) and Massachusetts (20%). (Note: Data by state of residence are also impacted in Connecticut, Florida, Georgia, Kentucky, North Carolina and Virginia for records where the death occurred in Tennessee or Massachusetts but the decedent was a resident of another state.) Prior to the revision, the underlying cause of death for all 125 deaths were classified as “Accidental discharge of firearms” (ICD-10 codes W32-W34). After the revision, 50% of the changed records are now classified as “Assault (homicide) by discharge of firearms” (ICD-10 codes U01.4, X93-X95); 42% are now classified as “Intentional self-harm (suicide) by discharge of firearms (ICD-10 codes X72-X74); 4% of the revised death records are now classified as”Discharge of firearms, undetermined intent” (ICD-10 codes Y22-Y24); and the remaining 4% to various other causes. For more information, please refer to Deaths: Final Data for 2014.

Query Parameters for 1999-2020: ICD-10 Codes: X40 (Accidental poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics); X41 (Accidental poisoning by and exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic drugs, not elsewhere classified); X42 (Accidental poisoning by and exposure to narcotics and psychodysleptics [hallucinogens], not elsewhere classified); X43 (Accidental poisoning by and exposure to other drugs acting on the autonomic nervous system); X44(Accidental poisoning by and exposure to other and unspecified drugs, medicaments and biological substances); X60 (Intentional
self-poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics); X61 (Intentional self-poisoning by and
exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic drugs, not elsewhere classified); X62
(Intentional self-poisoning by and exposure to narcotics and psychodysleptics [hallucinogens], not elsewhere classified); X63
(Intentional self-poisoning by and exposure to other drugs acting on the autonomic nervous system); X64 (Intentional
self-poisoning by and exposure to other and unspecified drugs, medicaments and biological substances); X85 (Assault by drugs,
medicaments and biological substances); Y10 (Poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics,
undetermined intent); Y11 (Poisoning by and exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic
drugs, not elsewhere classified, undetermined intent); Y12 (Poisoning by and exposure to narcotics and psychodysleptics
[hallucinogens], not elsewhere classified, undetermined intent); Y13 (Poisoning by and exposure to other drugs acting on the
autonomic nervous system, undetermined intent); Y14 (Poisoning by and exposure to other and unspecified drugs, medicaments and
biological substances, undetermined intent)

Query parameters for 2021-2023 = used location of incident (not resident of decedent) and followed the same definition of overdose death as NVSS (X40-44 [unintentional self harm/injury], X60-64 [intentional self harm/suicide], X85 [assault], Y10-14 [undetermined intent]).

Racial Demographics of the Population =

dates: 1989-2023

1989 is from here: https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/comp8090.htm

1990-2000 https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/cest9199.html

2000-2010—under county estimates/intercensal county estimates/2000-2010/by race sex and Hispanic origin https://www.nj.gov/labor/labormarketinformation/demographics/population-household-estimates/index.shtml

2010-2023: https://data.census.gov/table/ACSDT5Y2019.B01001A?t=Race+and+Ethnicity&g=040XX00US34$0500000&moe=false

Notes: For 1989, they categorize race into only 3 categories: ‘white’, ‘black’, and ‘other races’.

For 1990-1999 they separated race data into ‘Persons Not of Hispanic Origin’–white, black, ai/an, asian or pacific islander and ‘Persons of Hispanic Origin (Latinos)’–white, black, ai/an, asian or pacific islander (same categories).

Data was retrieved from the New Jersey Department of Labor and Workforce Development’s Population and Household Estimates intercensal population estimates.”

I also found this language at the bottom of one of the 1990-2000 data:

“Source: US Bureau of the Census, Population Estimates Branch. Prepared by: New Jersey Department of Labor, Division of Labor Market & Demographic Research, 4/02”

2025-10-NJ DID Overdose Analysis

Taleed El-Sabawi

2025-10-08

Overdose

State – further statistical analysis

Arrests and Overdose deaths

Correlation

Negative Binomial Model

Negative Binomial Model with 1 month time lag.

CONVICTIONS

Correlation

Poisson Regression

Negative Binomial Model

Lagged Negative Binomial Model

Controlling for COVID

County-Level Analysis

County/Convictions/Overdose Descriptive Statistics

Modeling Convictions and Overdoses at County level

Arrests/Overdoses/County