library(readxl)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
Convictions <- read_excel("Strict Liability spread sheet.xlsx")
Arrest <- read_excel("NJ DIH Arrest data 2017-2024.xlsx")
Overdose <- read_csv("Underlying Cause of Death 1999-2023.csv", 
    col_types = cols(Notes = col_skip(), 
        Year = col_integer(), `Year Code` = col_integer(), 
        Population = col_integer()))
State_OD_18 <- read_csv("Statewide_data_month_2018-2025-5.csv", 
    col_types = cols(Notes = col_skip(), 
        State = col_skip(), `State Code` = col_skip(), 
        Population = col_skip(), `Crude Rate` = col_skip()))
State_OD_mon <- read_csv("State Month Underlying Cause of Death, 1999-2025.csv")
## Rows: 317 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Month, Month Code, Deaths
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Arrest <- Arrest %>%
  mutate(
    Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
  )

Convictions <- Convictions  %>%
  mutate(
    Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
  )

Convictions <- Convictions %>%
  mutate(
    Ethnicity = ifelse(Ethnicity %in% c("Unknown", "Not Provided", "NA", ""), NA, Ethnicity)
  )
# Create Conviction Year variable
Convictions$Sentence.Date <- as.Date(Convictions$`Date of Sentence`)
Convictions$Sentence.Year <- as.numeric(format(Convictions$Sentence.Date, "%Y"))

#Create Conviction Month & Year code
Convictions$`Month Code` <- format(Convictions$Sentence.Date, "%Y/%m")



#Create Year & Month Variable in State_OD
library(dplyr)

State_OD_18 <- State_OD_18 %>%
  mutate(
    Year = as.numeric(sub("/.*", "", `Month Code`)),
    Month = as.numeric(sub(".*/", "", `Month Code`))
  )

State_OD_mon <- State_OD_mon %>%
  mutate(
    Year = as.numeric(sub("/.*", "", `Month Code`)),
    Month = as.numeric(sub(".*/", "", `Month Code`))
  )
# Create Month Code
Arrest$Month_NUM <- match(Arrest$`Arrest Month`, month.name)

Arrest$`Month Code` <- paste0(Arrest$`Arrest Year`, "/", sprintf("%02d", Arrest$Month_NUM))

# Create list of all months to include: January 2017 through December 2024
mons_list <- data.frame(`Month Code` = format(
  seq.Date(
    from = as.Date("2017-01-01"),  # set start
    to   = as.Date("2024-12-31"),  # set  end
    by   = "month"
  ),
  "%Y/%m" 
))

mons_list <- mons_list %>%
  rename(`Month Code` = Month.Code)

#Count arrests
arrests_by_mon <- Arrest %>%
  group_by(`Month Code`) %>%
  summarise(Arrest_Count = n(), .groups = "drop")

# Insert 0's for months not reported as having arrests
arrests_by_mon <- mons_list %>%
  left_join(arrests_by_mon, by = "Month Code") %>%
  mutate(Arrest_Count = ifelse(is.na(Arrest_Count), 0, Arrest_Count))
library(dplyr)
library(tidyr)

# Step 1: Create list of all months to include; conviction dates run August 5, 1988 through June 23, 2025 (the sequence below currently ends at December 2022)
all_months <- data.frame(`Month Code` = format(
  seq.Date(
    from = as.Date("1988-01-01"),  # set  start
    to   = as.Date("2022-12-01"),  # set  end
    by   = "month"
  ),
  "%Y/%m" 
))

all_months <- all_months %>%
  rename(`Month Code` = Month.Code)

# Step 2: Count convictions
conv_by_mon <- Convictions %>%
  group_by(`Month Code`) %>%
  summarise(Conv_Count = n(), .groups = "drop")

# Insert 0's for months not reported as having convictions
conv_by_mon <- all_months %>%
  left_join(conv_by_mon, by = "Month Code") %>%
  mutate(Conv_Count = ifelse(is.na(Conv_Count), 0, Conv_Count))

For the state-level data, I am using the population from the counties in the CDC WONDER dataset and combining them, but it may be more accurate to find a single population dataset for NJ as a whole. I added 2024 population estimates from the Census Bureau website before it was rendered inoperable by the government shutdown.

# 1) Statewide population by year (summing population data across counties to determine state population -- may be best to find population data from another source for the state)
pop_by_year <- Overdose %>%
  group_by(Year) %>%
  summarise(Population = sum(Population, na.rm = TRUE), .groups = "drop")

#Added 2024 population estimates from Census bureau

library(dplyr)
pop_by_year <- pop_by_year %>%
  bind_rows(data.frame(
    Year = 2024,
    Population = 9500851
  ))


# 2) Arrests by year
arrests_by_year <- Arrest %>%
  group_by(`Arrest Year`) %>%
  summarise(Arrest_Count = n(), .groups = "drop") %>%
  rename(Year = `Arrest Year`)

# 3) Convictions by year 
#convictions_by_year_filter code removed

convictions_by_year<- Convictions %>%
  group_by(Sentence.Year) %>%
  summarise(Conviction_Count = n(), .groups = "drop") %>%
  rename(Year = Sentence.Year)

# 4) Deaths by year (statewide)

deaths_by_year <- State_OD_18 %>%
  group_by(Year) %>%
  summarise(Count = sum(Deaths, na.rm = TRUE), .groups = "drop")

#ARRESTS 2017-2024

library(ggplot2)

ggplot(arrests_by_year, aes(x = `Year`, y = Arrest_Count)) +
  geom_line(group = 1, color = "steelblue", linewidth = 1.2) +
  geom_point(size = 2) +
  geom_text(aes(label = Arrest_Count), vjust = -0.5) +
  labs(title = "Drug-Induced Arrests by Year",
       x = "Year", y = "Arrest Count") +
  theme_minimal()

@Jennifer, we need to double check that the arrest data are complete through 2024. I did not pull them; looking at the dataset, the last arrest included is from September 2024.
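A quick way to verify this is to look at how far the arrest records extend; a minimal sketch using the Arrest Year and Month Code columns created above:

# Check how far the arrest data extend (sketch)
Arrest %>%
  filter(`Arrest Year` == 2024) %>%
  count(`Month Code`) %>%
  arrange(`Month Code`)

# Most recent month with any recorded arrest ("YYYY/MM" sorts chronologically)
max(Arrest$`Month Code`, na.rm = TRUE)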

library(knitr)
county_summary_arr <- Arrest %>%
  count(County) %>%
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

kable(county_summary_arr,
      caption = "Table: Arrests by County",
      col.names = c("County", "Count", "Percent"),
      align = "lrr")
Table: Arrests by County

| County | Count | Percent |
|:---|---:|---:|
| Atlantic | 57 | 15.5 |
| Burlington | 56 | 15.3 |
| Cape May | 47 | 12.8 |
| Ocean | 37 | 10.1 |
| Middlesex | 31 | 8.4 |
| Statewide or Regional LEA | 22 | 6.0 |
| Monmouth | 20 | 5.4 |
| Morris | 20 | 5.4 |
| Bergen | 18 | 4.9 |
| Gloucester | 14 | 3.8 |
| Passaic | 13 | 3.5 |
| Essex | 9 | 2.5 |
| Cumberland | 5 | 1.4 |
| Hunterdon | 4 | 1.1 |
| Camden | 3 | 0.8 |
| Mercer | 3 | 0.8 |
| Salem | 2 | 0.5 |
| Union | 2 | 0.5 |
| Warren | 2 | 0.5 |
| Hudson | 1 | 0.3 |
| Somerset | 1 | 0.3 |

Strict Liability Conviction Data: 1988 – April 2025

library(dplyr)
library(ggplot2)

# Line plot
ggplot(convictions_by_year, aes(x = Year, y = Conviction_Count)) +
  geom_line(color = "darkblue", size = 1.2) +
  geom_point(size = 2) +
  geom_text(aes(label = Conviction_Count), vjust = -0.7, size = 3.5) +  # adds counts above points
  scale_x_continuous(breaks = seq(min(convictions_by_year$Year, na.rm = TRUE),
                                  max(convictions_by_year$Year, na.rm = TRUE),
                                  by = 2)) +
  labs(title = "Number of DID Convictions by Year",
       x = "Year",
       y = "Number of Convictions") +
  theme_minimal()

library(knitr)
county_summary_conv <- Convictions %>%
  count(County) %>%
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

kable(county_summary_conv,
      caption = "Table: Convictions by County, 1999–2024 (partial)",
      col.names = c("County", "Count", "Percent"),
      align = "lrr")

Overdose

library(dplyr)
library(ggplot2)

# Summarize total deaths by year

State_OD_mon$Deaths <- as.numeric(State_OD_mon$Deaths)
## Warning: NAs introduced by coercion
deaths_by_year_tot <- State_OD_mon %>%
  group_by(Year) %>%
  summarise(Count = sum(Deaths, na.rm = TRUE), .groups = "drop")

# Line chart of total deaths over time
ggplot(deaths_by_year_tot, aes(x = Year, y = Count)) +
  geom_line(color = "darkred", size = 1.2) +
  geom_point(size = 2) +
  geom_text(aes(label = Count), vjust = -0.5, size = 3) +
  labs(title = "Total Overdose Deaths by Year",
       x = "Year", y = "Number of Deaths") +
  theme_minimal()

library(dplyr)
#Combine into one dataset
combined_data <- full_join(
  dplyr::select(arrests_by_year, Year, Arrest_Count),
  dplyr::select(deaths_by_year, Year, Count),
  by = "Year"
)
library(ggplot2)

# Choose a scaling factor to bring arrests to the same ballpark as deaths
scale_factor <- 30  # adjust as needed

ggplot(combined_data, aes(x = Year)) +
  geom_line(aes(y = Count, color = "Overdose Deaths"), linewidth = 1.2) +
  geom_line(aes(y = Arrest_Count * scale_factor, color = "Arrests"), linewidth = 1.2) +
  scale_y_continuous(
    name = "Overdose Deaths",
    sec.axis = sec_axis(~./scale_factor, name = "Arrests")
  ) +
  scale_color_manual(values = c("Overdose Deaths" = "blue", "Arrests" = "red")) +
  labs(
    title = "Overdose Deaths vs Arrests by Year (Dual Y-Axis)",
    x = "Year",
    color = "Metric"
  ) +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Visually, there does not appear to be a relationship whereby arrests decrease when overdose deaths decrease. This means that the number of overdose deaths (the underlying incident rate) does not seem to predict whether or not there are arrests. Of course, we will test this later. It also does not appear that arrests result in fewer overdoses the next year.
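As a rough descriptive check of that last point (only a handful of yearly observations, so not a formal test), the arrest series can be compared with the following year's deaths; a minimal sketch using the combined_data object above:

# Spearman correlation between arrests and the next year's overdose deaths (sketch)
combined_data %>%
  arrange(Year) %>%
  mutate(Deaths_next_year = lead(Count)) %>%
  summarise(rho = cor(Arrest_Count, Deaths_next_year,
                      method = "spearman", use = "complete.obs"))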

library(tidyr)

combined_long <- combined_data %>%
  pivot_longer(cols = c(Arrest_Count, Count), names_to = "Metric", values_to = "Value")

ggplot(combined_long, aes(x = Year, y = Value)) +
  geom_line(color = "steelblue", size = 1.2) +
  facet_wrap(
    ~ Metric,
    scales = "free_y",
    labeller = as_labeller(
      c("Count" = "Overdose Deaths", "Arrest_Count" = "Arrests")
    )
  ) +
  labs(
    title = "Arrests vs Overdose Deaths by Year",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()

Transformed to Z scores…

combined_data_z <- combined_data %>%
  mutate(
    Arrest_z = scale(Arrest_Count),
    Deaths_z = scale(Count)
  )

ggplot(combined_data_z, aes(x = Year)) +
  geom_line(aes(y = Arrest_z, color = "Arrests"), linewidth = 1.2) +
  geom_line(aes(y = Deaths_z, color = "Overdose Deaths"), linewidth = 1.2) +
  labs(title = "Z-Score Comparison: Arrests vs Overdose Deaths",
       y = "Z-Score (Standardized Values)",
       x = "Year",
       color = "Metric") +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Arrest_OD_month <- left_join(arrests_by_mon, State_OD_mon, by = "Month Code")

Arrest_OD_month <- Arrest_OD_month %>%
  left_join(pop_by_year, by = "Year")

#Convictions/Overdoses

convictions_filtered <- Convictions %>%
  filter(Sentence.Year >= 2018, Sentence.Year <= 2025)


#Create aggregated count of convictions by month for the entire dataset time period
conv_sum <- Convictions %>%
  filter(!is.na(`Month Code`)) %>%
  group_by(`Month Code`) %>%
  summarise(Conviction_Count = n()) %>%
  arrange(`Month Code`)
library(dplyr)

combined_df_3 <- conv_by_mon %>%
  left_join(State_OD_mon,       by = "Month Code") 

#Filter out prior to 1999

combined_df_3 <- combined_df_3 %>%
  mutate(Month_Date = as.Date(paste0(`Month Code`, "/01"), format = "%Y/%m/%d"))

combined_df_3_filtered <- combined_df_3 %>%
  filter(Month_Date >= as.Date("1999-01-01")) # because this is as far back as the OD data goes

#Add population to the dataset

combined_df_3 <-combined_df_3 %>%
  left_join(pop_by_year, by = "Year")
combined_df_3_filtered <- combined_df_3_filtered %>%
  left_join(pop_by_year, by = "Year")
combined_yearly <- deaths_by_year_tot %>%
  left_join(convictions_by_year,       by = "Year") 

#filter dates

combined_yearly <- combined_yearly %>%
   filter(Year >= 1999)

# Replace NA conviction counts with 0
combined_yearly <- combined_yearly %>%
  replace_na(list(
    Conviction_Count = 0
  ))
library(ggplot2)

scale_factor <- 20  # Adjust to visually match scales

ggplot(combined_yearly, aes(x = Year)) +
  geom_line(aes(y = Count, color = "Overdose Deaths"), linewidth = 1.2) +
  geom_line(aes(y = Conviction_Count * scale_factor, color = "Convictions"), linewidth = 1.2) +
  scale_y_continuous(
    name = "Overdose Deaths",
    sec.axis = sec_axis(~./scale_factor, name = "Convictions")
  ) +
  scale_color_manual(
    values = c("Overdose Deaths" = "blue", "Convictions" = "red")
  ) +
  labs(
    title = "Overdose Deaths vs Convictions by Year",
    x = "Year",
    color = "Metric"
  ) +
  theme_minimal()

State – further statistical analysis

Arrests and Overdose deaths

Correlation

Because both variables are counts, a Spearman rank correlation was run between monthly arrests and overdose deaths.

cor.test(Arrest_OD_month$Arrest_Count,
         Arrest_OD_month$Deaths,
         method = "spearman")
## Warning in cor.test.default(Arrest_OD_month$Arrest_Count,
## Arrest_OD_month$Deaths, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  Arrest_OD_month$Arrest_Count and Arrest_OD_month$Deaths
## S = 113708, p-value = 0.02496
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2287815

There was a statistically significant, positive association between the two variables, ρ = 0.229, p = .025. This suggests that months with higher numbers of arrests tend to also have higher numbers of overdose deaths.

Poisson Model: Arrest_OD_month

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Poisson regression: model overdose deaths as count
poisson_model1 <- glm(
  Deaths ~ Arrest_Count + Population,
  family = poisson(link = "log"),
  data = Arrest_OD_month
)

summary(poisson_model1)
## 
## Call:
## glm(formula = Deaths ~ Arrest_Count + Population, family = poisson(link = "log"), 
##     data = Arrest_OD_month)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   8.405e+00  3.181e-01  26.423   <2e-16 ***
## Arrest_Count  4.468e-03  2.332e-03   1.916   0.0554 .  
## Population   -3.296e-07  3.448e-08  -9.557   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 686.71  on 95  degrees of freedom
## Residual deviance: 551.89  on 93  degrees of freedom
## AIC: 1252.9
## 
## Number of Fisher Scoring iterations: 4
exp(coef(poisson_model1))
##  (Intercept) Arrest_Count   Population 
## 4470.6019937    1.0044780    0.9999997
# Overdispersion test
overdispersion <- sum(residuals(poisson_model1, type = "pearson")^2) / df.residual(poisson_model1)
overdispersion
## [1] 5.718123

Model: Deaths ~ Arrest_Count + Population

Overdispersion detected: dispersion = 5.72 (a value greater than 1 means the variance exceeds the mean, so the Poisson equidispersion assumption is violated).

A Poisson regression model indicated that arrest counts were marginally associated with overdose deaths (p = .055), with a 1-unit increase in arrests associated with a 0.45% increase in expected deaths. Population size, however, was negatively associated with deaths and reached statistical significance (p < .001). Despite these findings, the model exhibited substantial overdispersion (dispersion = 5.72), suggesting that the Poisson assumptions were violated and that standard errors and significance tests may be unreliable.
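For reference, the 0.45% figure comes from exponentiating the arrest coefficient in the fitted model above; a one-line sketch:

# Percent change in expected deaths per one-unit increase in arrests (sketch)
(exp(coef(poisson_model1)["Arrest_Count"]) - 1) * 100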

A negative binomial model may therefore be more appropriate.

Negative Binomial Model

library(MASS)
nb_model <- glm.nb(Deaths ~ Arrest_Count + Population, data = Arrest_OD_month)
summary(nb_model)
## 
## Call:
## glm.nb(formula = Deaths ~ Arrest_Count + Population, data = Arrest_OD_month, 
##     init.theta = 43.83360034, link = log)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   8.601e+00  7.852e-01  10.954  < 2e-16 ***
## Arrest_Count  5.112e-03  5.909e-03   0.865    0.387    
## Population   -3.513e-07  8.496e-08  -4.135 3.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(43.8336) family taken to be 1)
## 
##     Null deviance: 121.922  on 95  degrees of freedom
## Residual deviance:  98.233  on 93  degrees of freedom
## AIC: 974.65
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  43.83 
##           Std. Err.:  7.70 
## 
##  2 x log-likelihood:  -966.651
# Overdispersion test
overdispersion2 <- sum(residuals(nb_model, type = "pearson")^2) / df.residual(nb_model)
overdispersion2
## [1] 0.9765794

Model: glm.nb(Deaths ~ Arrest_Count + Population, data = Arrest_OD_month)

| Term | Estimate | exp(β) | p-value | Interpretation |
|:---|---:|---:|---:|:---|
| Intercept | 8.601 | 5433.75 | < .001 | Baseline predicted deaths |
| Arrest_Count | 0.00511 | 1.0051 | 0.387 | Not significant; arrests do not predict deaths |
| Population | -3.51e-07 | ~0.99999965 | < .001 | Significant negative association |

The dispersion statistic (overdispersion2) is 0.977, indicating no overdispersion.

AIC = 974.65, much lower than the Poisson model's 1252.9, so the negative binomial is clearly the better fit.

A Negative Binomial regression model was fitted to examine the association between monthly overdose deaths, arrest counts, and population size, addressing overdispersion observed in the earlier Poisson model. Arrest counts were not statistically significant (β = 0.0051, p = .387), suggesting no meaningful association between arrests and overdose deaths in the observed data.

The model showed no evidence of overdispersion (dispersion = 0.98), indicating a good fit. Compared to the Poisson model, the negative binomial regression provided a substantially improved fit (AIC = 974.65 vs. 1252.9), confirming it as the more appropriate model for overdispersed count data.
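The AIC comparison can also be pulled directly from the two fitted objects; a minimal sketch:

# Side-by-side comparison of the Poisson and negative binomial fits (sketch)
AIC(poisson_model1, nb_model)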

Negative Binomial Model with 1 month time lag.

library(dplyr)

Arrest_OD_month <- Arrest_OD_month %>%
  mutate(Arrest_Count_lag1 = lag(Arrest_Count, n = 1))

library(MASS)

nb_model_lag <- glm.nb(Deaths ~ Arrest_Count_lag1 + Population, data = Arrest_OD_month)
summary(nb_model_lag)
## 
## Call:
## glm.nb(formula = Deaths ~ Arrest_Count_lag1 + Population, data = Arrest_OD_month, 
##     init.theta = 42.94998848, link = log)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        8.818e+00  7.921e-01  11.133  < 2e-16 ***
## Arrest_Count_lag1  1.081e-03  6.001e-03   0.180    0.857    
## Population        -3.735e-07  8.569e-08  -4.359 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(42.95) family taken to be 1)
## 
##     Null deviance: 119.530  on 94  degrees of freedom
## Residual deviance:  97.187  on 92  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 965.97
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  42.95 
##           Std. Err.:  7.56 
## 
##  2 x log-likelihood:  -957.967
| Term | Estimate | exp(β) | p-value | Interpretation |
|:---|---:|---:|---:|:---|
| Intercept | 8.818 | 6,760.39 | < .001 | Baseline deaths |
| Arrest_Count_lag1 | 0.00108 | 1.0011 | 0.857 | Not significant |
| Population | -3.735e-07 | ~0.9999996 | < .001 | Significant |

A negative binomial regression was conducted to test whether lagged arrest counts predict monthly overdose deaths, while accounting for population size. Arrest counts were lagged by one month to assess potential delayed effects. The model revealed that lagged arrests were not significantly associated with overdose deaths (β = 0.00108, p = .857), indicating no measurable delayed impact. Population size remained a significant negative predictor (β = -3.735e-07, p < .001).

The model demonstrated good fit to the data, with a dispersion statistic near 1 and a lower AIC (965.97) than the non-lagged model. These findings suggest that short-term changes in arrests do not meaningfully influence monthly overdose mortality, even when a 1-month lag is taken into account.
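For completeness, the same IRR summaries produced for the conviction models below can be generated for the lagged arrest model; a minimal sketch:

# IRRs and 95% profile confidence intervals for the lagged arrest model (sketch)
exp(coef(nb_model_lag))
exp(confint(nb_model_lag))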

Convictions and Overdose Deaths

library(dplyr)

conv_data <- combined_df_3_filtered %>%
  dplyr::select(Month_Date, Year, Deaths, Conv_Count, Population) %>%
  arrange(Month_Date) %>%
  mutate(Conv_Count_lag1 = lag(Conv_Count, 1))

Correlation

cor.test(conv_data$Conv_Count,
         conv_data$Deaths,
         method = "spearman")
## Warning in cor.test.default(conv_data$Conv_Count, conv_data$Deaths, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  conv_data$Conv_Count and conv_data$Deaths
## S = 2323669, p-value = 4.469e-13
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4102266

Poisson Regression

poisson_conv <- glm(
  Deaths ~ Conv_Count + Population,
  family = poisson(link = "log"),
  data = conv_data
)

summary(poisson_conv)
## 
## Call:
## glm(formula = Deaths ~ Conv_Count + Population, family = poisson(link = "log"), 
##     data = conv_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.283e+01  2.302e-01  -55.75   <2e-16 ***
## Conv_Count   4.470e-02  2.567e-03   17.42   <2e-16 ***
## Population   1.984e-06  2.604e-08   76.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 14017.4  on 286  degrees of freedom
## Residual deviance:  5190.6  on 284  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 7029.4
## 
## Number of Fisher Scoring iterations: 4
# Overdispersion check
overdispersion_conv <- sum(residuals(poisson_conv, type = "pearson")^2) / df.residual(poisson_conv)
overdispersion_conv
## [1] 19.11486

Negative Binomial Model

library(MASS)

nb_conv <- glm.nb(
  Deaths ~ Conv_Count + Population,
  data = conv_data
)

summary(nb_conv)
## 
## Call:
## glm.nb(formula = Deaths ~ Conv_Count + Population, data = conv_data, 
##     init.theta = 6.541899928, link = log)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.394e+01  1.008e+00 -13.831  < 2e-16 ***
## Conv_Count   6.506e-02  1.395e-02   4.665 3.09e-06 ***
## Population   2.106e-06  1.150e-07  18.305  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.5419) family taken to be 1)
## 
##     Null deviance: 782.51  on 286  degrees of freedom
## Residual deviance: 294.78  on 284  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 2931.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.542 
##           Std. Err.:  0.576 
## 
##  2 x log-likelihood:  -2923.581
# Overdispersion check
overdispersion_nb <- sum(residuals(nb_conv, type = "pearson")^2) / df.residual(nb_conv)
overdispersion_nb
## [1] 0.989071

Model: Deaths ~ Conv_Count + Population

| Term | Estimate | z | p-value |
|:---|---:|---:|:---|
| Intercept | -13.94 | -13.83 | < .001 *** |
| Conv_Count | 0.0651 | 4.67 | < .001 *** |
| Population | 2.11e-06 | 18.31 | < .001 *** |

AIC: 2931.6
Dispersion ≈ 0.99, acceptable

Lagged Negative Binomial Model

nb_conv_lag <- glm.nb(
  Deaths ~ Conv_Count_lag1 + Population,
  data = conv_data
)

summary(nb_conv_lag)
## 
## Call:
## glm.nb(formula = Deaths ~ Conv_Count_lag1 + Population, data = conv_data, 
##     init.theta = 6.513924753, link = log)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.407e+01  1.014e+00 -13.877  < 2e-16 ***
## Conv_Count_lag1  6.440e-02  1.394e-02   4.619 3.85e-06 ***
## Population       2.121e-06  1.157e-07  18.328  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.5139) family taken to be 1)
## 
##     Null deviance: 775.69  on 285  degrees of freedom
## Residual deviance: 293.71  on 283  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 2923.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.514 
##           Std. Err.:  0.574 
## 
##  2 x log-likelihood:  -2915.822
# Overdispersion check
overdispersion_nb2 <- sum(residuals(nb_conv_lag, type = "pearson")^2) / df.residual(nb_conv_lag)
overdispersion_nb2
## [1] 0.9911637

Model: Deaths ~ Conv_Count_lag1 + Population

| Term | Estimate | z | p-value |
|:---|---:|---:|:---|
| Intercept | -14.07 | -13.88 | < .001 *** |
| Conv_Count_lag1 | 0.0644 | 4.62 | < .001 *** |
| Population | 2.12e-06 | 18.33 | < .001 *** |

AIC: 2923.8
Dispersion ≈ 0.99, acceptable

# For the NB model without lag
exp_coef_nb <- exp(coef(nb_conv))
exp_coef_nb
##  (Intercept)   Conv_Count   Population 
## 8.873064e-07 1.067222e+00 1.000002e+00
# For the lagged NB model
exp_coef_lag <- exp(coef(nb_conv_lag))
exp_coef_lag
##     (Intercept) Conv_Count_lag1      Population 
##    7.774756e-07    1.066519e+00    1.000002e+00
# Confidence intervals (exp) for the original NB model
exp_confint_nb <- exp(confint(nb_conv))
## Waiting for profiling to be done...
# Confidence intervals (exp) for the lagged NB model
exp_confint_lag <- exp(confint(nb_conv_lag))
## Waiting for profiling to be done...
anova(nb_conv_lag, nb_conv, test = "Chisq")
## Likelihood ratio tests of Negative Binomial Models
## 
## Response: Deaths
##                          Model    theta Resid. df    2 x log-lik.   Test    df
## 1      Conv_Count + Population 6.541900       284       -2923.581             
## 2 Conv_Count_lag1 + Population 6.513925       283       -2915.822 1 vs 2     1
##   LR stat.     Pr(Chi)
## 1                     
## 2 7.759112 0.005344223
library(DHARMa)
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
# For original model
sim1 <- simulateResiduals(nb_conv)
plot(sim1)
## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L = G$L,
## : Fitting terminated with step failure - check results carefully

# For lagged model
sim2 <- simulateResiduals(nb_conv_lag)
plot(sim2)

Spearman’s rank correlation showed a moderate positive relationship between convictions and overdose mortality (ρ = 0.41, p < .001).

Poisson regression was initially attempted but found to be highly overdispersed (ϕ ≈ 19.1), justifying the use of a negative binomial model (ϕ ≈ 0.99).

A series of negative binomial regression models was fitted to examine the association between monthly conviction counts and overdose deaths from 1999 through 2022, the span of the merged monthly data.

In the adjusted model without a lag, convictions were significantly associated with overdose deaths (β = 0.065, p < .001), controlling for population size. This association remained statistically significant when a 1-month lag of conviction counts was introduced (β = 0.064, p < .001), suggesting that monthly changes in convictions may predict subsequent changes in overdose mortality. In both models, population size was a robust predictor of overdose deaths (p < .001). Model fit improved with the lagged specification (AIC = 2923.8 vs. 2931.6).

In the Negative Binomial model, the incidence rate ratio (IRR) for conviction counts was 1.067 (IRR = exp(0.065)), indicating that each additional conviction in a given month was associated with roughly a 6.7% increase in the expected count of overdose deaths, holding population constant.

In the lagged model, the IRR for 1-month lagged convictions was 1.067, showing a similar 6.7% increase in overdose deaths for every one-unit increase in convictions from the prior month. This suggests a potential short-term temporal association.

The IRR for population was approximately 1.000002, indicating that increases in population were also associated with slightly higher overdose deaths, as expected in population-scaled outcomes.
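Because the population IRR is expressed per single resident, it is hard to read at a glance; one option is to rescale population before fitting. A minimal sketch (nb_conv_rescaled is an illustrative name, not an object used elsewhere in this analysis):

# Refit with population per 100,000 residents for a more readable IRR (sketch)
nb_conv_rescaled <- glm.nb(Deaths ~ Conv_Count + I(Population / 1e5), data = conv_data)
exp(coef(nb_conv_rescaled))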

library(dplyr)
library(gt)

# Install broom 
# install.packages("broom")

library(broom)

# Tidy the model and exponentiate coefficients + CI
conv_nb_lag_table <- tidy(nb_conv_lag, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)

# Display the table
print(conv_nb_lag_table)
## # A tibble: 3 × 7
##   term               estimate   std.error statistic  p.value  conf.low conf.high
##   <chr>                 <dbl>       <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
## 1 (Intercept)     0.000000777 1.01           -13.9  8.76e-44   1.09e-7   5.52e-6
## 2 Conv_Count_lag1 1.07        0.0139           4.62 3.85e- 6   1.04e+0   1.10e+0
## 3 Population      1.00        0.000000116     18.3  4.91e-75   1.00e+0   1.00e+0
conv_nb_lag_table %>%
  mutate(across(estimate:conf.high, ~ round(.x, 6))) %>%
  rename(
    Term = term,
    IRR = estimate,
    `CI Lower` = conf.low,
    `CI Upper` = conf.high,
    `P-Value` = p.value
  ) %>%
  gt() %>%
  tab_header(
    title = "Incidence Rate Ratios (IRRs) from Lagged Negative Binomial Model"
  )
Incidence Rate Ratios (IRRs) from Lagged Negative Binomial Model

| Term | IRR | std.error | statistic | P-Value | CI Lower | CI Upper |
|:---|---:|---:|---:|---:|---:|---:|
| (Intercept) | 0.000001 | 1.013723 | -13.876779 | 0e+00 | 0.000000 | 0.000006 |
| Conv_Count_lag1 | 1.066519 | 0.013942 | 4.619333 | 4e-06 | 1.037923 | 1.096825 |
| Population | 1.000002 | 0.000000 | 18.328406 | 0e+00 | 1.000002 | 1.000002 |

Controlling for COVID

We control for COVID-era effects using a factor variable, which allows the model to compare the time periods to a reference category. Because the factor levels sort alphabetically and the merged monthly data end in December 2022, the reference category is 2020 (the 2023/2024 level does not occur in the data).

conv_data <- conv_data %>%
  mutate(
    COVID_era = case_when(
      Year < 2020 ~ "Pre",
      Year == 2020 ~ "2020",
      Year == 2021 ~ "2021",
      Year == 2022 ~ "2022",
      TRUE ~ "2023_24"
    )
  )
nb_conv_covid_fe <- glm.nb(
  Deaths ~ Conv_Count_lag1 + Population + factor(COVID_era),
  data = conv_data
)
nb_conv_covid_fe
## 
## Call:  glm.nb(formula = Deaths ~ Conv_Count_lag1 + Population + factor(COVID_era), 
##     data = conv_data, init.theta = 7.834860144, link = log)
## 
## Coefficients:
##           (Intercept)        Conv_Count_lag1             Population  
##            -1.490e+01              6.917e-02              2.281e-06  
## factor(COVID_era)2021  factor(COVID_era)2022   factor(COVID_era)Pre  
##            -8.295e-01             -9.871e-01             -5.897e-01  
## 
## Degrees of Freedom: 285 Total (i.e. Null);  280 Residual
##   (2 observations deleted due to missingness)
## Null Deviance:       920.9 
## Residual Deviance: 295.3     AIC: 2882

A final model was estimated including time-period fixed effects to account for the COVID-19 pandemic and its aftermath. The results show that lagged convictions remain significantly associated with overdose deaths, even after controlling for population and temporal factors (IRR = 1.072, p < .001). Each additional conviction in the prior month was associated with a 7.2% increase in overdose deaths (IRR = 1.072, 95% CI: [1.044, 1.101], p < .001), controlling for population and pandemic time periods.

Overdose deaths were significantly lower in the pre-COVID period (IRR = 0.55, 95% CI: …), as well as in 2021 (IRR = 0.44) and 2022 (IRR = 0.37), relative to the 2020 reference year.
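The printed model object above omits standard errors and p-values; summary(nb_conv_covid_fe) returns the full coefficient table. If a different baseline period is preferred, the factor can be releveled before refitting; a minimal sketch (conv_data_releveled is an illustrative name):

# Full coefficient table for the COVID-adjusted model
summary(nb_conv_covid_fe)

# Sketch: make the pre-COVID period the explicit reference level before refitting
conv_data_releveled <- conv_data %>%
  mutate(COVID_era = relevel(factor(COVID_era), ref = "Pre"))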

# Load required packages
library(broom)
library(dplyr)
library(gt)

# Step 1: Extract IRRs + Confidence Intervals
covid_nb_table <- tidy(nb_conv_covid_fe, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)

# Step 2: Format for clean presentation
covid_nb_table %>%
  mutate(across(estimate:conf.high, ~ round(.x, 6))) %>%
  rename(
    Term = term,
    IRR = estimate,
    `CI Lower` = conf.low,
    `CI Upper` = conf.high,
    `P-Value` = p.value
  ) %>%
  gt() %>%
  tab_header(
    title = "Incidence Rate Ratios (IRRs) from COVID-Adjusted Negative Binomial Model"
  )
Incidence Rate Ratios (IRRs) from COVID-Adjusted Negative Binomial Model

| Term | IRR | std.error | statistic | P-Value | CI Lower | CI Upper |
|:---|---:|---:|---:|---:|---:|---:|
| (Intercept) | 0.000000 | 1.189977 | -12.523406 | 0 | 0.000000 | 0.000003 |
| Conv_Count_lag1 | 1.071613 | 0.013373 | 5.172089 | 0 | 1.044087 | 1.100659 |
| Population | 1.000002 | 0.000000 | 17.044558 | 0 | 1.000002 | 1.000003 |
| factor(COVID_era)2021 | 0.436256 | 0.156688 | -5.294128 | 0 | 0.321383 | 0.592213 |
| factor(COVID_era)2022 | 0.372670 | 0.158385 | -6.232041 | 0 | 0.273592 | 0.507602 |
| factor(COVID_era)Pre | 0.554513 | 0.109757 | -5.372464 | 0 | 0.445103 | 0.682471 |

County-Level Analysis

library(dplyr)

Overdose %>%
  filter(Deaths == "Suppressed") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    60
#total observations in dataset

library(dplyr)

Overdose %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   525

Sixty of the 525 county-year overdose observations (11.4%) are suppressed.
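The same share can be computed directly from the data; a minimal sketch:

# Count and percentage of suppressed county-year observations (sketch)
Overdose %>%
  summarise(
    n_suppressed = sum(Deaths == "Suppressed", na.rm = TRUE),
    pct_suppressed = round(mean(Deaths == "Suppressed", na.rm = TRUE) * 100, 1)
  )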

# Standardize County names in Overdose dataframe
Overdose <- Overdose %>%
  mutate(County = gsub(" County, NJ", "", County))

# 1. Convictions by county and year
convictions_by_county_year <- Convictions %>%
  filter(!is.na(Sentence.Year)) %>%
  group_by(County, Year = Sentence.Year) %>%
  summarise(Conviction_Count = n(), .groups = "drop")

# 2. Arrests by county and year
arrests_by_county_year <- Arrest %>%
  group_by(County, Year = `Arrest Year`) %>%
  summarise(Arrest_Count = n(), .groups = "drop")


# 3. Combine into one dataset
combined_df_county <- Overdose %>%
  left_join(arrests_by_county_year, by = c("County", "Year")) %>%
  left_join(convictions_by_county_year, by = c("County", "Year")) %>%
  mutate(
   Arrest_Count = ifelse(is.na(Arrest_Count) & Year >= 2017, 0, Arrest_Count),
    Conviction_Count = ifelse(is.na(Conviction_Count), 0, Conviction_Count)
  )
# Midpoint Method (setting all suppressed values to 5, the midpoint of the suppressed range 1-9)
combined_df_county_md <- combined_df_county %>%
  mutate(
    Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths),       # Step 1: Set Suppressed to NA
    Deaths_num = as.numeric(Deaths_clean),                            # Step 2: Convert remaining to numeric
    Deaths_num = ifelse(is.na(Deaths_num), 5, Deaths_num)             # Step 3: Set NAs (formerly Suppressed) to 5
  )
#Sensitivity Analysis
##lower bound
df_lower <- combined_df_county %>%
  mutate(
    Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths),       # Step 1: Set Suppressed to NA
    Deaths_num = as.numeric(Deaths_clean),                            # Step 2: Convert remaining to numeric
    Deaths_num = ifelse(is.na(Deaths_num), 1, Deaths_num)             # Step 3: Set NAs (formerly Suppressed) to 1
  )

##upper bound
df_upper <- combined_df_county %>%
  mutate(
    Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths),       # Step 1: Set Suppressed to NA
    Deaths_num = as.numeric(Deaths_clean),                            # Step 2: Convert remaining to numeric
    Deaths_num = ifelse(is.na(Deaths_num), 9, Deaths_num)             # Step 3: Set NAs (formerly Suppressed) to 9
  )
# Load required libraries
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(truncnorm)
library(dplyr)

# Step 1: Prepare dataset and flag suppressed deaths as NA
combined_df_county_MI <- combined_df_county %>%
  mutate(
    Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths)),
    County = as.factor(County) 
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
##   as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
# Step 2: Define custom imputation function drawing from a truncated normal on [1, 9]
mice.impute.truncated_poisson <- function(y, ry, x, ...) {
  n_miss <- sum(!ry)
  mean_est <- 5  # fixed midpoint-based mean
  imputed <- round(rtruncnorm(n = n_miss, a = 1, b = 9, mean = mean_est, sd = 2))
  return(imputed)
}

# Step 3: Specify imputation methods
mice_methods <- make.method(combined_df_county_MI)
mice_methods["Deaths_num"] <- "truncated_poisson"

# Step 4: Build predictor matrix — only safe variables
mice_pred <- make.predictorMatrix(combined_df_county_MI)
mice_pred[,] <- 0  # turn off all predictors by default
mice_pred["Deaths_num", "Population"] <- 1  # use population to inform imputation
# Optional: include County if it's a factor and not a primary predictor
mice_pred["Deaths_num", "County"] <- 1

# Step 5: Run multiple imputation
imp <- mice(
  combined_df_county_MI,
  method = mice_methods,
  predictorMatrix = mice_pred,
  m = 5,
  seed = 123
)
## 
##  iter imp variable
##   1   1  Arrest_Count  Deaths_num
##   1   2  Arrest_Count  Deaths_num
##   1   3  Arrest_Count  Deaths_num
##   1   4  Arrest_Count  Deaths_num
##   1   5  Arrest_Count  Deaths_num
##   2   1  Arrest_Count  Deaths_num
##   2   2  Arrest_Count  Deaths_num
##   2   3  Arrest_Count  Deaths_num
##   2   4  Arrest_Count  Deaths_num
##   2   5  Arrest_Count  Deaths_num
##   3   1  Arrest_Count  Deaths_num
##   3   2  Arrest_Count  Deaths_num
##   3   3  Arrest_Count  Deaths_num
##   3   4  Arrest_Count  Deaths_num
##   3   5  Arrest_Count  Deaths_num
##   4   1  Arrest_Count  Deaths_num
##   4   2  Arrest_Count  Deaths_num
##   4   3  Arrest_Count  Deaths_num
##   4   4  Arrest_Count  Deaths_num
##   4   5  Arrest_Count  Deaths_num
##   5   1  Arrest_Count  Deaths_num
##   5   2  Arrest_Count  Deaths_num
##   5   3  Arrest_Count  Deaths_num
##   5   4  Arrest_Count  Deaths_num
##   5   5  Arrest_Count  Deaths_num
## Warning: Number of logged events: 1
# Step 6: Get completed dataset
# Option A: View all imputations (long format)
completed_data_long <- complete(imp, "long")

# Option B: Use one completed dataset (e.g., the first)
imputed_df_MI <- complete(imp, 1)
library(ggplot2)

completed_data_long %>%
  ggplot(aes(x = Deaths_num)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  facet_wrap(~.imp) +
  labs(
    title = "Distribution of Imputed Deaths_num Values",
    x = "Imputed Deaths",
    y = "Frequency"
  )

# Load required libraries
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.23.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## 
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
## 
##     ar
library(dplyr)

# Step 1: Convert 'Deaths' to numeric with NAs for "Suppressed"
combined_df_county <- combined_df_county %>%
  mutate(
    Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths))
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
##   as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
# Step 2: Subset to observed (non-missing) death counts
df_obs <- combined_df_county %>%
  filter(!is.na(Deaths_num))


#Step 3: Scale Population and Year
df_obs <- df_obs %>%
  mutate(
    Year_scaled = scale(Year),
    Pop_scaled = scale(Population),
    County = factor(County)
  )

# Step 4: Fit Bayesian Negative Binomial model (simplest first)
library(brms)

bayes_model <- brm(
  Deaths_num ~ Year_scaled + Pop_scaled,
  data = df_obs,
  family = negbinomial(),
  chains = 2,
  iter = 2000,
  seed = 123,
  control = list(adapt_delta = 0.95)
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## using C compiler: ‘Apple clang version 17.0.0 (clang-1700.0.13.5)’
## using SDK: ‘MacOSX15.5.sdk’
## clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/opt/R/arm64/include    -fPIC  -falign-functions=64 -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
##   679 | #include <cmath>
##       |          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 6.3e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.63 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.212 seconds (Warm-up)
## Chain 1:                0.196 seconds (Sampling)
## Chain 1:                0.408 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 2.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.28 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.207 seconds (Warm-up)
## Chain 2:                0.193 seconds (Sampling)
## Chain 2:                0.4 seconds (Total)
## Chain 2:
# Step 5: Subset missing
df_missing <- combined_df_county %>%
  filter(is.na(Deaths_num)) %>%
  mutate(
    County = factor(County, levels = levels(df_obs$County)),
    Year_scaled = scale(Year,
                        center = attr(df_obs$Year_scaled, "scaled:center"),
                        scale = attr(df_obs$Year_scaled, "scaled:scale")),
    Pop_scaled = scale(Population,
                       center = attr(df_obs$Pop_scaled, "scaled:center"),
                       scale = attr(df_obs$Pop_scaled, "scaled:scale"))
  )

# Step 6: Predict and constrain to [1, 9]
predicted_draws <- posterior_predict(bayes_model, newdata = df_missing)
set.seed(123)
imputed_values <- apply(predicted_draws, 2, function(x) sample(x, 1))
imputed_values <- pmin(pmax(imputed_values, 1), 9)

# Step 7: Insert into original data
combined_df_county_Bay1 <- combined_df_county
combined_df_county_Bay1$Deaths_num[is.na(combined_df_county_Bay1$Deaths_num)] <- imputed_values
summary(bayes_model)
##  Family: negbinomial 
##   Links: mu = log 
## Formula: Deaths_num ~ Year_scaled + Pop_scaled 
##    Data: df_obs (Number of observations: 465) 
##   Draws: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 2000
## 
## Regression Coefficients:
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       4.12      0.02     4.08     4.17 1.00     3023     1666
## Year_scaled     0.51      0.02     0.47     0.55 1.00     2556     1369
## Pop_scaled      0.51      0.03     0.46     0.56 1.00     2363     1326
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape     4.67      0.31     4.07     5.30 1.00     2414     1664
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Rhat = 1.00 for all parameters → indicates convergence.

Bulk_ESS and Tail_ESS > 1000 for all → good effective sample sizes.

No warnings → model ran smoothly.

plot(bayes_model)       # Trace + density

pp_check(bayes_model)   # Posterior predictive checks
## Using 10 posterior draws for ppc type 'dens_overlay' by default.

The posterior predictive check matches the shape of the observed data, captures the overdispersion well, and produces plausible predictions.

#Check imputation distribution for Bayesian pooling
hist(imputed_values, breaks = 9, col = "orange", main = "Imputed Deaths_num via Bayesian Model")

One issue is that there are no 0's among the imputed values, but that may be acceptable given that suppressed counts are known to lie between 1 and 9.

# Histogram before applying pmin/pmax
hist(as.vector(apply(predicted_draws, 2, function(x) sample(x, 1))),
     main = "Posterior Samples Before Truncation",
     xlab = "Raw predicted values")

fitted_vals <- fitted(bayes_model, newdata = df_missing)
summary(fitted_vals)
##     Estimate        Est.Error           Q2.5             Q97.5       
##  Min.   : 11.14   Min.   :0.7248   Min.   :  9.776   Min.   : 12.58  
##  1st Qu.: 14.47   1st Qu.:0.8201   1st Qu.: 12.929   1st Qu.: 16.09  
##  Median : 20.52   Median :0.9837   Median : 18.657   Median : 22.41  
##  Mean   : 26.25   Mean   :1.1636   Mean   : 24.040   Mean   : 28.54  
##  3rd Qu.: 26.30   3rd Qu.:1.1268   3rd Qu.: 24.055   3rd Qu.: 28.54  
##  Max.   :130.51   Max.   :6.6528   Max.   :118.169   Max.   :144.51
# Load required libraries
library(brms)
library(dplyr)

# Step 1: Create new dataset to preserve original
combined_df_county_Bay2 <- combined_df_county %>%
  mutate(
    Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths))
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
##   as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
# Step 2: Subset non-missing observations
df_obs_Bay2 <- combined_df_county_Bay2 %>%
  filter(!is.na(Deaths_num)) %>%
  mutate(
    Year_scaled = scale(Year),
    Pop_scaled = scale(Population),
    County = factor(County)  # Ensure County is a factor
  )

# Step 3: Fit Bayesian model with County as a random effect
bayes_model_Bay2 <- brm(
  Deaths_num ~ Year_scaled + Pop_scaled + (1 | County),
  data = df_obs_Bay2,
  family = negbinomial(),
  chains = 4,
  iter = 3000,
  seed = 123,
  control = list(adapt_delta = 0.95)
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## using C compiler: ‘Apple clang version 17.0.0 (clang-1700.0.13.5)’
## using SDK: ‘MacOSX15.5.sdk’
## clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/opt/R/arm64/include    -fPIC  -falign-functions=64 -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
##   679 | #include <cmath>
##       |          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1
## Start sampling
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000132 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.32 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 1: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 1: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 1: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 1: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 1: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 1: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 1: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 1: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 3.563 seconds (Warm-up)
## Chain 1:                2.853 seconds (Sampling)
## Chain 1:                6.416 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 4.2e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.42 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 2: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 2: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 2: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 2: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 2: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 2: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 2: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 2: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 3.193 seconds (Warm-up)
## Chain 2:                2.738 seconds (Sampling)
## Chain 2:                5.931 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 5.8e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.58 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 3: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 3: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 3: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 3: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 3: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 3: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 3: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 3: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 3.194 seconds (Warm-up)
## Chain 3:                2.765 seconds (Sampling)
## Chain 3:                5.959 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 4.9e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.49 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 3000 [  0%]  (Warmup)
## Chain 4: Iteration:  300 / 3000 [ 10%]  (Warmup)
## Chain 4: Iteration:  600 / 3000 [ 20%]  (Warmup)
## Chain 4: Iteration:  900 / 3000 [ 30%]  (Warmup)
## Chain 4: Iteration: 1200 / 3000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1500 / 3000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1501 / 3000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1800 / 3000 [ 60%]  (Sampling)
## Chain 4: Iteration: 2100 / 3000 [ 70%]  (Sampling)
## Chain 4: Iteration: 2400 / 3000 [ 80%]  (Sampling)
## Chain 4: Iteration: 2700 / 3000 [ 90%]  (Sampling)
## Chain 4: Iteration: 3000 / 3000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 3.234 seconds (Warm-up)
## Chain 4:                2.887 seconds (Sampling)
## Chain 4:                6.121 seconds (Total)
## Chain 4:
# Step 4: Subset missing values and apply scaling
df_missing_Bay2 <- combined_df_county_Bay2 %>%
  filter(is.na(Deaths_num)) %>%
  mutate(
    County = factor(County, levels = levels(df_obs_Bay2$County)),  # match levels
    Year_scaled = scale(Year,
                        center = attr(df_obs_Bay2$Year_scaled, "scaled:center"),
                        scale = attr(df_obs_Bay2$Year_scaled, "scaled:scale")),
    Pop_scaled = scale(Population,
                       center = attr(df_obs_Bay2$Pop_scaled, "scaled:center"),
                       scale = attr(df_obs_Bay2$Pop_scaled, "scaled:scale"))
  )

# Step 5: Posterior prediction and imputation
predicted_draws_Bay2 <- posterior_predict(bayes_model_Bay2, newdata = df_missing_Bay2)

set.seed(123)
imputed_values_Bay2 <- apply(predicted_draws_Bay2, 2, function(x) sample(x, 1))
imputed_values_Bay2 <- pmin(pmax(imputed_values_Bay2, 1), 9)  # constrain to 1–9

# Step 6: Fill in the imputed values
combined_df_county_Bay2$Deaths_num[is.na(combined_df_county_Bay2$Deaths_num)] <- imputed_values_Bay2
#Check imputation distribution for Bayesian pooling
hist(imputed_values_Bay2, breaks = 9, col = "orange", main = "Imputed Deaths_num via Bayesian Model 2")

summary(bayes_model_Bay2)
plot(bayes_model_Bay2)       # Trace + density

pp_check(bayes_model_Bay2)   # Posterior predictive checks
## Using 10 posterior draws for ppc type 'dens_overlay' by default.

In the provided overdose data, 60 of the 525 county-year observations (~11.4%) were labeled as “Suppressed,” indicating that the true value ranged between 1 and 9 but was withheld for privacy reasons. Since these suppressed values represent a non-trivial portion of the dataset, appropriate imputation methods were employed to estimate plausible values while preserving statistical integrity.
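As a quick check on that figure, here is a minimal sketch of the tally, assuming the raw county-year frame is combined_df_county and suppressed cells are stored as the literal string "Suppressed" in a character Deaths column:

library(dplyr)

# Count suppressed county-year cells and their share of the data
combined_df_county %>%
  summarise(
    n_total        = n(),
    n_suppressed   = sum(Deaths == "Suppressed", na.rm = TRUE),
    pct_suppressed = 100 * n_suppressed / n_total
  )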

We explored four different imputation strategies to address the suppressed values in the Deaths variable:

1. Midpoint Imputation (Deterministic)

All suppressed death values were replaced with the midpoint of the known range:

Value used: 5

This approach is simple and transparent but does not reflect variability in the imputed values.

Output: combined_df_county_md
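A minimal sketch of how this dataset could be built (illustrative only; it assumes Deaths is character and suppressed cells equal "Suppressed" — the report's own construction of combined_df_county_md appears earlier):

library(dplyr)

# Midpoint imputation: non-numeric entries become NA, then receive the 1-9 midpoint
combined_df_county_md <- combined_df_county %>%
  mutate(
    Deaths_num = suppressWarnings(as.numeric(Deaths)),      # "Suppressed" -> NA
    Deaths_num = ifelse(is.na(Deaths_num), 5, Deaths_num)   # midpoint of the 1-9 range
  )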

2. Sensitivity Analysis

To assess the bounds of potential impact, we created two alternative datasets:

Lower-bound scenario: All suppressed values set to 1

Upper-bound scenario: All suppressed values set to 9

These datasets allow for robustness checks in downstream models by comparing how extreme assumptions influence the results.

Output:

df_lower

df_upper
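A corresponding sketch for the two bound scenarios (impute_constant is a hypothetical helper, not part of the report's code; same assumptions as the midpoint sketch):

library(dplyr)

# Replace every suppressed cell with a fixed bound value
impute_constant <- function(df, value) {
  df %>%
    mutate(
      Deaths_num = suppressWarnings(as.numeric(Deaths)),
      Deaths_num = ifelse(is.na(Deaths_num), value, Deaths_num)
    )
}

df_lower <- impute_constant(combined_df_county, 1)  # lower-bound scenario
df_upper <- impute_constant(combined_df_county, 9)  # upper-bound scenario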

3. Multiple Imputation via MICE with Truncated Distribution

A more flexible approach used the mice package to perform multiple imputation with a custom imputation function drawing from a truncated normal distribution constrained to values between 1 and 9. This method incorporates uncertainty across imputations while maintaining the known bounds.

We used Population (scaled) as a predictor. County was included, via its factor representation, as a categorical predictor, since it serves as a control variable in subsequent analyses.

Imputation settings: 5 imputations (m = 5)

Custom imputation function using rtruncnorm(mean = 5, sd = 2, a = 1, b = 9)

Random draws ensure variability across imputations

Output:

A long-format dataset with multiple imputations (completed_data_long)

A single completed dataset (imputed_df) was extracted for use in modeling

A histogram of the imputed values showed a plausible, near-symmetric distribution centered around the midpoint, consistent with prior expectations.
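For reference, a minimal sketch of how a bounded draw can be registered as a custom mice method. The method name truncnorm19 and the frame dat are illustrative; the report's actual imputation code is not reproduced here.

library(mice)
library(truncnorm)

# Custom method: draw imputations from Normal(5, 2) truncated to [1, 9];
# mice dispatches on the suffix after "mice.impute."
mice.impute.truncnorm19 <- function(y, ry, x, wy = NULL, ...) {
  if (is.null(wy)) wy <- !ry
  truncnorm::rtruncnorm(sum(wy), a = 1, b = 9, mean = 5, sd = 2)
}

# Example call (sketch): dat has Deaths_num set to NA where suppressed, plus
# scaled Population and County (factor) as complete predictors
# imp <- mice(dat[, c("Deaths_num", "Population", "County")], m = 5,
#             method = c("truncnorm19", "", ""), seed = 123)
# completed_data_long <- mice::complete(imp, action = "long")
# imputed_df <- mice::complete(imp, 1)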

4. Bayesian Imputation via Negative Binomial Models

We fit two Bayesian models using the brms package, leveraging posterior predictive sampling to estimate the suppressed values.

A. Bayesian Model 1 (No County Effects)

Formula: Deaths_num ~ Year_scaled + Pop_scaled

Family: Negative Binomial (handles overdispersion)

Random effects: None

Dataset created: combined_df_county_Bay1

Imputation:

Posterior predictive distributions were sampled for missing rows.

A single draw per observation was taken and constrained to the 1–9 range.

Model output:

Rhat = 1.00 for all parameters → convergence achieved

Effective sample sizes (ESS) > 1000

Posterior predictive checks showed good fit to observed data
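For clarity, a sketch of this specification in brms; the object and data names follow the summary(bayes_model) output shown earlier, the chain and iteration settings mirror that summary, and the seed is illustrative.

library(brms)

# Bayesian Model 1: negative binomial fit to the observed (non-suppressed) rows
bayes_model <- brm(
  Deaths_num ~ Year_scaled + Pop_scaled,
  data   = df_obs,
  family = negbinomial(),
  chains = 2, iter = 2000, warmup = 1000,
  seed   = 123
)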

B. Bayesian Model 2 (Hierarchical: Random Intercepts by County)

Formula: Deaths_num ~ Year_scaled + Pop_scaled + (1 | County)

Family: Negative Binomial

Random effects: County (partial pooling)

Dataset created: combined_df_county_Bay2

Imputation:

Same approach as Model 1 (posterior prediction + truncation), but the county random intercepts allow the model to capture county-level heterogeneity.

Model output:

Converged across all 4 chains

Posterior predictive distributions captured both mean and dispersion

Histogram of imputed values showed reasonable spread (1–9), with central tendency near observed mean
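And a corresponding sketch of the hierarchical specification; the posterior-prediction and truncation steps for this model are the Step 4–6 code shown earlier, the chain and iteration settings mirror the sampler log above, and the seed is illustrative.

library(brms)

# Bayesian Model 2: adds a county random intercept (partial pooling)
bayes_model_Bay2 <- brm(
  Deaths_num ~ Year_scaled + Pop_scaled + (1 | County),
  data   = df_obs_Bay2,
  family = negbinomial(),
  chains = 4, iter = 3000, warmup = 1500,
  seed   = 123
)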

Summary of Imputation Methods

Method | Variables Used | Output Dataset
Midpoint (Set all to 5) | (none) | combined_df_county_md
Sensitivity: Lower Bound (1) | (none) | df_lower
Sensitivity: Upper Bound (9) | (none) | df_upper
Multiple Imputation (MICE) | Population | imputed_df, completed_data_long
Bayesian Model 1 | Year, Population | combined_df_county_Bay1
Bayesian Model 2 (with County) | Year, Population, County (random intercept) | combined_df_county_Bay2

Summary of Model Justification: To address the 11.4% of observations in our dataset with suppressed overdose death counts (reported as “Suppressed” and known to fall within the range of 1 to 9), we implemented a series of imputation strategies to ensure robustness and transparency. We began with simple imputation methods, including a midpoint substitution (setting all suppressed values to 5) and sensitivity analyses using the lower and upper bounds (1 and 9, respectively). These provided boundary checks for how assumptions about suppressed values might influence our findings.

We then applied Multiple Imputation by Chained Equations (MICE) using a truncated distribution constrained between 1 and 9. In this model, we included Population and County as predictors. Population was retained as a meaningful, independent contextual variable that reflects the expected scale of overdose events. County was added as a categorical predictor to allow the imputation process to borrow information across geographic units, thereby accounting for unobserved regional variation in overdose trends. We chose not to include Year as a predictor in this imputation step to avoid conflating temporal patterns with structural variation in death reporting practices, and to reduce the risk of overfitting imputation models based on potentially nonstationary time trends. Importantly, we excluded Conviction_Count, which is a key predictor in subsequent outcome modeling, to avoid biasing those future estimates through the imputation mechanism.

Finally, we implemented two Bayesian pooling models using the brms package. The first included Year and Population as predictors to estimate the likely distribution of suppressed counts, leveraging both temporal and demographic trends. The second model added County as a random intercept to account for potential unobserved heterogeneity across counties. These models allowed us to borrow strength from the observed data while explicitly modeling uncertainty and variability in suppressed values. Each imputation approach was stored as a separate dataset for comparison and downstream analysis. We included Year as a predictor in the Bayesian imputation models because those models are explicitly designed to capture structured trends and uncertainty in the observed data.

@Eli, do you think this is sufficient? Or should I run a Bayesian imputation model without Year and compare it as well?

County/Convictions/Overdose Descriptive Statistics

I just ran this with the second Bayesian Pooling model but can re-run it with others if you all want to see it….

library(dplyr)

combined_df_county_Bay2 %>%
  group_by(County) %>%
  summarise(Total_Overdose_Deaths = sum(Deaths_num)) %>%
  arrange(desc(Total_Overdose_Deaths))
## # A tibble: 21 × 2
##    County     Total_Overdose_Deaths
##    <chr>                      <dbl>
##  1 Essex                       4019
##  2 Camden                      3881
##  3 Ocean                       2916
##  4 Middlesex                   2792
##  5 Monmouth                    2450
##  6 Bergen                      2280
##  7 Hudson                      2111
##  8 Burlington                  2061
##  9 Atlantic                    1954
## 10 Gloucester                  1910
## # ℹ 11 more rows
library(dplyr)
library(ggplot2)

# Summarize overdose deaths by county
county_death_totals <- combined_df_county_Bay2 %>%
  group_by(County) %>%
  summarise(Total_Deaths = sum(Deaths_num, na.rm = TRUE)) %>%
  arrange(desc(Total_Deaths))

# Plot the bar graph
ggplot(county_death_totals, aes(x = reorder(County, -Total_Deaths), y = Total_Deaths)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Total Overdose Deaths by County",
    x = "County",
    y = "Total Overdose Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)
library(dplyr)

# Optional: arrange by descending conviction count
county_summary_conv <- county_summary_conv %>%
  arrange(desc(n))

# Plot bar chart
ggplot(county_summary_conv, aes(x = reorder(County, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Total Convictions by County",
    x = "County",
    y = "Number of Convictions"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visually, comparing the two graphs, it is clear that the counties with the largest numbers of overdose deaths do not have the most convictions.
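To put a number on that visual impression, the two county totals could be rank-correlated (a quick sketch using the summaries built above; not part of the original analysis):

library(dplyr)

# Spearman correlation between county totals of overdose deaths and convictions
county_death_totals %>%
  inner_join(county_summary_conv, by = "County") %>%
  summarise(spearman_rho = cor(Total_Deaths, n, method = "spearman"))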

library(ggplot2)
library(dplyr)

# Optional: arrange by descending arr count
county_summary_arr <- county_summary_arr %>%
  arrange(desc(n))

# Plot bar chart
ggplot(county_summary_arr, aes(x = reorder(County, -n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Total Arrests by County",
    x = "County",
    y = "Number of Arrests"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The same pattern holds for arrests.

Modeling Convictions and Overdoses at the County Level

imputed_df_MI$Deaths <- as.numeric(imputed_df_MI$Deaths)
## Warning: NAs introduced by coercion
imputed_df_MI$County <- factor(imputed_df_MI$County)
run_models <- function(data, label) {
  message("===== Running models for: ", label, " =====")
  
  # Ensure required variables are correctly formatted and drop missing values
  data <- data %>%
    mutate(
      Deaths_num = as.numeric(Deaths),  # non-numeric entries (e.g., "Suppressed") become NA and are dropped by the filter below
      Year = as.numeric(Year),
      County = as.factor(County),
      Population = as.numeric(Population),
      Conviction_Count = as.numeric(Conviction_Count)
    ) %>%
    filter(
      !is.na(Deaths_num),
      !is.na(Conviction_Count),
      !is.na(Population)
    )
  
  # Fit Poisson regression
  poisson_model <- glm(
    Deaths_num ~ Conviction_Count + Year + Population + County,
    data = data,
    family = poisson
  )

  # Fit Negative Binomial regression
  nb_model <- MASS::glm.nb(
    Deaths_num ~ Conviction_Count + Year + Population + County,
    data = data
  )
  
  # Return AIC summary and model objects
  return(list(
    summary = data.frame(
      Method = label,
      Poisson_AIC = AIC(poisson_model),
      NegBin_AIC = AIC(nb_model)
    ),
    models = list(
      poisson = poisson_model,
      negbin = nb_model
    )
  ))
}
results_md     <- run_models(combined_df_county_md, "Midpoint Imputation")
## ===== Running models for: Midpoint Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_lower  <- run_models(df_lower, "Lower Bound Imputation")
## ===== Running models for: Lower Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_upper  <- run_models(df_upper, "Upper Bound Imputation")
## ===== Running models for: Upper Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_mi     <- run_models(imputed_df_MI, "Multiple Imputation")
## ===== Running models for: Multiple Imputation =====
results_bayes1 <- run_models(combined_df_county_Bay1, "Bayesian Imputation 1")
## ===== Running models for: Bayesian Imputation 1 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_bayes2 <- run_models(combined_df_county_Bay2, "Bayesian Imputation 2")
## ===== Running models for: Bayesian Imputation 2 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries <- bind_rows(
  results_md$summary,
  results_lower$summary,
  results_upper$summary,
  results_mi$summary,
  results_bayes1$summary,
  results_bayes2$summary
)

# Print as a markdown table
kable(model_summaries, caption = "Model Fit Comparison: AIC Values")
Model Fit Comparison: AIC Values
Method Poisson_AIC NegBin_AIC
Midpoint Imputation 5466.882 4018.577
Lower Bound Imputation 5466.882 4018.577
Upper Bound Imputation 5466.882 4018.577
Multiple Imputation 5466.882 4018.577
Bayesian Imputation 1 5466.882 4018.577
Bayesian Imputation 2 5466.882 4018.577

The table summarizes AIC values for Poisson and Negative Binomial (NB) models across six different imputation methods for suppressed overdose death counts. Across all imputation strategies, the NB models consistently show lower AIC values than the Poisson models, indicating a better fit, which is particularly important given the overdispersion typically present in count data such as overdose deaths. The lowest AIC was observed for Multiple Imputation (MICE), Bayesian Imputation 1, and Bayesian Imputation 2.

All three had identical AICs for both Poisson and NB models (NB AIC = 4018.58).

library(dplyr)
# Helper function to extract and label model summaries
extract_model_summary <- function(model, method_label) {
  coef_summary <- summary(model)$coefficients
  as.data.frame(coef_summary) %>%
    tibble::rownames_to_column("Term") %>%
    mutate(Method = method_label)
}

# Extract coefficient summaries from each NegBin model
coef_mi     <- extract_model_summary(results_mi$models$negbin, "Multiple Imputation")
coef_bayes1 <- extract_model_summary(results_bayes1$models$negbin, "Bayesian Imputation 1")
coef_bayes2 <- extract_model_summary(results_bayes2$models$negbin, "Bayesian Imputation 2")

# Combine all into one table
combined_coefs <- bind_rows(coef_mi, coef_bayes1, coef_bayes2)

# Optional: re-order for readability
combined_coefs <- combined_coefs %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)

filtered_coefs <- combined_coefs %>%
  filter(Term %in% c("Conviction_Count", "Year", "Population")) %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)


# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
Negative Binomial Model Coefficients: Top Imputation Methods
Method Term Estimate Std. Error z value Pr(>|z|)
Multiple Imputation Conviction_Count 0.008 0.008 1.039 0.299
Multiple Imputation Year 0.080 0.003 26.819 0.000
Multiple Imputation Population 0.000 0.000 -1.311 0.190
Bayesian Imputation 1 Conviction_Count 0.008 0.008 1.039 0.299
Bayesian Imputation 1 Year 0.080 0.003 26.819 0.000
Bayesian Imputation 1 Population 0.000 0.000 -1.311 0.190
Bayesian Imputation 2 Conviction_Count 0.008 0.008 1.039 0.299
Bayesian Imputation 2 Year 0.080 0.003 26.819 0.000
Bayesian Imputation 2 Population 0.000 0.000 -1.311 0.190
combined_coefs %>%
  filter(grepl("^County", Term)) %>%
  summarize(
    avg_effect = mean(Estimate),
    max_effect = max(Estimate),
    min_effect = min(Estimate)
  )
##    avg_effect max_effect min_effect
## 1 -0.06762637    1.56226  -1.984721

Across all three imputation strategies:

Conviction_Count was not statistically significant (p ≈ 0.299), suggesting no strong evidence that convictions are associated with annual county-level overdose deaths after controlling for time and population.
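For interpretation, the estimate can be put on the incidence-rate-ratio scale: exp(0.008) ≈ 1.008, i.e. less than a 1% change in expected deaths per additional conviction. A sketch using Wald intervals for the MICE-based model (nb_fit is just a convenience name):

# Convert NegBin coefficients to incidence rate ratios with 95% Wald intervals
nb_fit  <- results_mi$models$negbin
irr_tab <- exp(cbind(IRR = coef(nb_fit), confint.default(nb_fit)))
round(irr_tab["Conviction_Count", ], 3)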

# Testing Collinearity of Population and County
summary(lm(Population ~ County, data = combined_df_county_Bay1))
## 
## Call:
## lm(formula = Population ~ County, data = combined_df_county_Bay1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74724  -5586    186   4683  80466 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        268753       3504  76.708  < 2e-16 ***
## CountyBergen       646467       4955 130.472  < 2e-16 ***
## CountyBurlington   178114       4955  35.947  < 2e-16 ***
## CountyCamden       243642       4955  49.172  < 2e-16 ***
## CountyCape May    -171674       4955 -34.648  < 2e-16 ***
## CountyCumberland  -116336       4955 -23.479  < 2e-16 ***
## CountyEssex        529876       4955 106.941  < 2e-16 ***
## CountyGloucester    15276       4955   3.083  0.00216 ** 
## CountyHudson       379152       4955  76.522  < 2e-16 ***
## CountyHunterdon   -142180       4955 -28.695  < 2e-16 ***
## CountyMercer        97651       4955  19.708  < 2e-16 ***
## CountyMiddlesex    541174       4955 109.221  < 2e-16 ***
## CountyMonmouth     358691       4955  72.392  < 2e-16 ***
## CountyMorris       222895       4955  44.985  < 2e-16 ***
## CountyOcean        309978       4955  62.561  < 2e-16 ***
## CountyPassaic      232437       4955  46.911  < 2e-16 ***
## CountySalem       -204085       4955 -41.189  < 2e-16 ***
## CountySomerset      54227       4955  10.944  < 2e-16 ***
## CountySussex      -122602       4955 -24.744  < 2e-16 ***
## CountyUnion        273983       4955  55.296  < 2e-16 ***
## CountyWarren      -161313       4955 -32.557  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17520 on 504 degrees of freedom
## Multiple R-squared:  0.9953, Adjusted R-squared:  0.9951 
## F-statistic:  5342 on 20 and 504 DF,  p-value: < 2.2e-16
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Exclude County or reduce levels if needed
model <- glm(Deaths_num ~ County + Population, data = combined_df_county_Bay1, family = poisson)
vif(model)
##               GVIF Df GVIF^(1/(2*Df))
## County     120.727 20        1.127315
## Population 120.727  1       10.987584

I am going to run the models without Population next to avoid multicollinearity. (In the Bayesian imputation models — including one with County as a random intercept — we retained Population as a control for exposure size.)

run_models_2 <- function(data, label) {
  message("===== Running models for: ", label, " =====")
  
  # Ensure required variables are correctly formatted and drop missing values
  data <- data %>%
    mutate(
      Deaths_num = as.numeric(Deaths),
      Year = as.numeric(Year),
      County = as.factor(County),
      Conviction_Count = as.numeric(Conviction_Count)
    ) %>%
    filter(
      !is.na(Deaths_num),
      !is.na(Conviction_Count)
    )
  
  # Fit Poisson regression
  poisson_model_2 <- glm(
    Deaths_num ~ Conviction_Count + Year + County,
    data = data,
    family = poisson
  )

  # Fit Negative Binomial regression
  nb_model_2 <- MASS::glm.nb(
    Deaths_num ~ Conviction_Count + Year + County,
    data = data
  )
  
  # Return AIC summary and model objects
  return(list(
    summary = data.frame(
      Method = label,
      Poisson_AIC_2 = AIC(poisson_model_2),
      NegBin_AIC_2 = AIC(nb_model_2)
    ),
    models = list(
      poisson = poisson_model_2,
      negbin = nb_model_2
    )
  ))
}
results_md2     <- run_models_2(combined_df_county_md, "Midpoint Imputation")
## ===== Running models for: Midpoint Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_lower2  <- run_models_2(df_lower, "Lower Bound Imputation")
## ===== Running models for: Lower Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_upper2  <- run_models_2(df_upper, "Upper Bound Imputation")
## ===== Running models for: Upper Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_mi2     <- run_models_2(imputed_df_MI, "Multiple Imputation")
## ===== Running models for: Multiple Imputation =====
results_bayes3 <- run_models_2(combined_df_county_Bay1, "Bayesian Imputation 1")
## ===== Running models for: Bayesian Imputation 1 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_bayes4 <- run_models_2(combined_df_county_Bay2, "Bayesian Imputation 2")
## ===== Running models for: Bayesian Imputation 2 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries_2 <- bind_rows(
  results_md2$summary,
  results_lower2$summary,
  results_upper2$summary,
  results_mi2$summary,
  results_bayes3$summary,
  results_bayes4$summary
)

# Print as a markdown table
kable(model_summaries_2, caption = "Model Fit Comparison: AIC Values")
Model Fit Comparison: AIC Values
Method Poisson_AIC_2 NegBin_AIC_2
Midpoint Imputation 5477.669 4018.365
Lower Bound Imputation 5477.669 4018.365
Upper Bound Imputation 5477.669 4018.365
Multiple Imputation 5477.669 4018.365
Bayesian Imputation 1 5477.669 4018.365
Bayesian Imputation 2 5477.669 4018.365
library(dplyr)

# Extract coefficient summaries from each NegBin model
coef_mi2     <- extract_model_summary(results_mi2$models$negbin, "Multiple Imputation")
coef_bayes3 <- extract_model_summary(results_bayes3$models$negbin, "Bayesian Imputation 1")
coef_bayes4 <- extract_model_summary(results_bayes4$models$negbin, "Bayesian Imputation 2")

# Combine all into one table
combined_coefs2 <- bind_rows(coef_mi2, coef_bayes3, coef_bayes4)

# Optional: re-order for readability
combined_coefs2 <- combined_coefs2 %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)

filtered_coefs2 <- combined_coefs2 %>%
  filter(Term %in% c("Conviction_Count", "Year")) %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)


# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
Negative Binomial Model Coefficients: Top Imputation Methods
Method Term Estimate Std. Error z value Pr(>|z|)
Multiple Imputation Conviction_Count 0.010 0.008 1.237 0.216
Multiple Imputation Year 0.077 0.002 36.129 0.000
Bayesian Imputation 1 Conviction_Count 0.010 0.008 1.237 0.216
Bayesian Imputation 1 Year 0.077 0.002 36.129 0.000
Bayesian Imputation 2 Conviction_Count 0.010 0.008 1.237 0.216
Bayesian Imputation 2 Year 0.077 0.002 36.129 0.000

The coefficient on Conviction_Count is not statistically significant in any of the top three models (p = 0.216). After removing Population, the model fit did not worsen (the AICs are essentially unchanged) and the coefficients stayed nearly identical.

Finally, I am going to try treating County and Year as factors.

run_models_3 <- function(data, label) {
  message("===== Running models for: ", label, " =====")
  
  # Ensure required variables are correctly formatted and drop missing values
  data <- data %>%
    mutate(
      Deaths_num = as.numeric(Deaths),
      Year = as.numeric(Year),
      County = as.factor(County),
      Conviction_Count = as.numeric(Conviction_Count)
    ) %>%
    filter(
      !is.na(Deaths_num),
      !is.na(Conviction_Count)
    )
  
  # Fit Poisson regression
  poisson_model_3 <- glm(
    Deaths_num ~ Conviction_Count + factor(Year) + factor(County),
    data = data,
    family = poisson
  )

  # Fit Negative Binomial regression
  nb_model_3 <- MASS::glm.nb(
    Deaths_num ~ Conviction_Count + factor(Year) + factor(County),
    data = data
  )
  
  # Return AIC summary and model objects
  return(list(
    summary = data.frame(
      Method = label,
      Poisson_AIC_2 = AIC(poisson_model_3),
      NegBin_AIC_2 = AIC(nb_model_3)
    ),
    models = list(
      poisson = poisson_model_3,
      negbin = nb_model_3
    )
  ))
}
results_md3     <- run_models_3(combined_df_county_md, "Midpoint Imputation")
## ===== Running models for: Midpoint Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_lower3  <- run_models_3(df_lower, "Lower Bound Imputation")
## ===== Running models for: Lower Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_upper3  <- run_models_3(df_upper, "Upper Bound Imputation")
## ===== Running models for: Upper Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_mi3     <- run_models_3(imputed_df_MI, "Multiple Imputation")
## ===== Running models for: Multiple Imputation =====
results_bayes5 <- run_models_3(combined_df_county_Bay1, "Bayesian Imputation 1")
## ===== Running models for: Bayesian Imputation 1 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_bayes6 <- run_models_3(combined_df_county_Bay2, "Bayesian Imputation 2")
## ===== Running models for: Bayesian Imputation 2 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries_4 <- bind_rows(
  results_md3$summary,
  results_lower3$summary,
  results_upper3$summary,
  results_mi3$summary,
  results_bayes5$summary,
  results_bayes6$summary
)

# Print as a markdown table
kable(model_summaries_4, caption = "Model Fit Comparison: AIC Values")
Model Fit Comparison: AIC Values
Method Poisson_AIC_2 NegBin_AIC_2
Midpoint Imputation 4073.933 3734.332
Lower Bound Imputation 4073.933 3734.332
Upper Bound Imputation 4073.933 3734.332
Multiple Imputation 4073.933 3734.332
Bayesian Imputation 1 4073.933 3734.332
Bayesian Imputation 2 4073.933 3734.332
library(dplyr)

# Extract coefficient summaries from each NegBin model
coef_mi3     <- extract_model_summary(results_mi3$models$negbin, "Multiple Imputation")
coef_bayes5 <- extract_model_summary(results_bayes5$models$negbin, "Bayesian Imputation 1")
coef_bayes6 <- extract_model_summary(results_bayes6$models$negbin, "Bayesian Imputation 2")

# Combine all into one table
combined_coefs2 <- bind_rows(coef_mi3, coef_bayes5, coef_bayes6)

# Optional: re-order for readability
combined_coefs2 <- combined_coefs2 %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)

filtered_coefs2 <- combined_coefs2 %>%
  filter(Term %in% c("Conviction_Count", "Year")) %>%
  dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)


# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
Negative Binomial Model Coefficients: Top Imputation Methods
Method Term Estimate Std. Error z value Pr(>|z|)
Multiple Imputation Conviction_Count -0.002 0.005 -0.446 0.656
Bayesian Imputation 1 Conviction_Count -0.002 0.005 -0.446 0.656
Bayesian Imputation 2 Conviction_Count -0.002 0.005 -0.446 0.656
knitr::kable(combined_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
Negative Binomial Model Coefficients: Top Imputation Methods
Method Term Estimate Std. Error z value Pr(>|z|)
Multiple Imputation (Intercept) 3.411 0.073 46.731 0.000
Multiple Imputation Conviction_Count -0.002 0.005 -0.446 0.656
Multiple Imputation factor(Year)2000 0.145 0.084 1.719 0.086
Multiple Imputation factor(Year)2001 0.204 0.083 2.475 0.013
Multiple Imputation factor(Year)2002 0.362 0.081 4.454 0.000
Multiple Imputation factor(Year)2003 0.240 0.082 2.931 0.003
Multiple Imputation factor(Year)2004 0.031 0.084 0.363 0.717
Multiple Imputation factor(Year)2005 0.346 0.080 4.311 0.000
Multiple Imputation factor(Year)2006 0.379 0.080 4.731 0.000
Multiple Imputation factor(Year)2007 0.236 0.082 2.867 0.004
Multiple Imputation factor(Year)2008 0.231 0.083 2.794 0.005
Multiple Imputation factor(Year)2009 -0.359 0.114 -3.153 0.002
Multiple Imputation factor(Year)2010 0.437 0.080 5.456 0.000
Multiple Imputation factor(Year)2011 0.568 0.078 7.255 0.000
Multiple Imputation factor(Year)2012 0.747 0.077 9.683 0.000
Multiple Imputation factor(Year)2013 0.818 0.077 10.614 0.000
Multiple Imputation factor(Year)2014 0.774 0.077 10.080 0.000
Multiple Imputation factor(Year)2015 0.940 0.076 12.382 0.000
Multiple Imputation factor(Year)2016 1.276 0.075 17.115 0.000
Multiple Imputation factor(Year)2017 1.545 0.074 20.940 0.000
Multiple Imputation factor(Year)2018 1.632 0.073 22.220 0.000
Multiple Imputation factor(Year)2019 1.585 0.074 21.356 0.000
Multiple Imputation factor(Year)2020 1.605 0.074 21.814 0.000
Multiple Imputation factor(Year)2021 1.663 0.074 22.586 0.000
Multiple Imputation factor(Year)2022 1.627 0.075 21.808 0.000
Multiple Imputation factor(Year)2023 1.514 0.075 20.266 0.000
Multiple Imputation factor(County)Bergen 0.226 0.058 3.911 0.000
Multiple Imputation factor(County)Burlington 0.071 0.057 1.244 0.213
Multiple Imputation factor(County)Camden 0.718 0.056 12.886 0.000
Multiple Imputation factor(County)Cape May -1.041 0.071 -14.590 0.000
Multiple Imputation factor(County)Cumberland -0.775 0.067 -11.645 0.000
Multiple Imputation factor(County)Essex 0.733 0.056 13.046 0.000
Multiple Imputation factor(County)Gloucester 0.011 0.058 0.187 0.852
Multiple Imputation factor(County)Hudson 0.098 0.059 1.675 0.094
Multiple Imputation factor(County)Hunterdon -1.877 0.096 -19.524 0.000
Multiple Imputation factor(County)Mercer -0.377 0.061 -6.166 0.000
Multiple Imputation factor(County)Middlesex 0.371 0.056 6.619 0.000
Multiple Imputation factor(County)Monmouth 0.236 0.057 4.132 0.000
Multiple Imputation factor(County)Morris -0.357 0.060 -5.900 0.000
Multiple Imputation factor(County)Ocean 0.399 0.056 7.063 0.000
Multiple Imputation factor(County)Passaic -0.062 0.059 -1.060 0.289
Multiple Imputation factor(County)Salem -1.488 0.082 -18.202 0.000
Multiple Imputation factor(County)Somerset -0.882 0.066 -13.350 0.000
Multiple Imputation factor(County)Sussex -1.205 0.072 -16.727 0.000
Multiple Imputation factor(County)Union -0.167 0.060 -2.798 0.005
Multiple Imputation factor(County)Warren -1.438 0.080 -17.958 0.000
Bayesian Imputation 1 (Intercept) 3.411 0.073 46.731 0.000
Bayesian Imputation 1 Conviction_Count -0.002 0.005 -0.446 0.656
Bayesian Imputation 1 factor(Year)2000 0.145 0.084 1.719 0.086
Bayesian Imputation 1 factor(Year)2001 0.204 0.083 2.475 0.013
Bayesian Imputation 1 factor(Year)2002 0.362 0.081 4.454 0.000
Bayesian Imputation 1 factor(Year)2003 0.240 0.082 2.931 0.003
Bayesian Imputation 1 factor(Year)2004 0.031 0.084 0.363 0.717
Bayesian Imputation 1 factor(Year)2005 0.346 0.080 4.311 0.000
Bayesian Imputation 1 factor(Year)2006 0.379 0.080 4.731 0.000
Bayesian Imputation 1 factor(Year)2007 0.236 0.082 2.867 0.004
Bayesian Imputation 1 factor(Year)2008 0.231 0.083 2.794 0.005
Bayesian Imputation 1 factor(Year)2009 -0.359 0.114 -3.153 0.002
Bayesian Imputation 1 factor(Year)2010 0.437 0.080 5.456 0.000
Bayesian Imputation 1 factor(Year)2011 0.568 0.078 7.255 0.000
Bayesian Imputation 1 factor(Year)2012 0.747 0.077 9.683 0.000
Bayesian Imputation 1 factor(Year)2013 0.818 0.077 10.614 0.000
Bayesian Imputation 1 factor(Year)2014 0.774 0.077 10.080 0.000
Bayesian Imputation 1 factor(Year)2015 0.940 0.076 12.382 0.000
Bayesian Imputation 1 factor(Year)2016 1.276 0.075 17.115 0.000
Bayesian Imputation 1 factor(Year)2017 1.545 0.074 20.940 0.000
Bayesian Imputation 1 factor(Year)2018 1.632 0.073 22.220 0.000
Bayesian Imputation 1 factor(Year)2019 1.585 0.074 21.356 0.000
Bayesian Imputation 1 factor(Year)2020 1.605 0.074 21.814 0.000
Bayesian Imputation 1 factor(Year)2021 1.663 0.074 22.586 0.000
Bayesian Imputation 1 factor(Year)2022 1.627 0.075 21.808 0.000
Bayesian Imputation 1 factor(Year)2023 1.514 0.075 20.266 0.000
Bayesian Imputation 1 factor(County)Bergen 0.226 0.058 3.911 0.000
Bayesian Imputation 1 factor(County)Burlington 0.071 0.057 1.244 0.213
Bayesian Imputation 1 factor(County)Camden 0.718 0.056 12.886 0.000
Bayesian Imputation 1 factor(County)Cape May -1.041 0.071 -14.590 0.000
Bayesian Imputation 1 factor(County)Cumberland -0.775 0.067 -11.645 0.000
Bayesian Imputation 1 factor(County)Essex 0.733 0.056 13.046 0.000
Bayesian Imputation 1 factor(County)Gloucester 0.011 0.058 0.187 0.852
Bayesian Imputation 1 factor(County)Hudson 0.098 0.059 1.675 0.094
Bayesian Imputation 1 factor(County)Hunterdon -1.877 0.096 -19.524 0.000
Bayesian Imputation 1 factor(County)Mercer -0.377 0.061 -6.166 0.000
Bayesian Imputation 1 factor(County)Middlesex 0.371 0.056 6.619 0.000
Bayesian Imputation 1 factor(County)Monmouth 0.236 0.057 4.132 0.000
Bayesian Imputation 1 factor(County)Morris -0.357 0.060 -5.900 0.000
Bayesian Imputation 1 factor(County)Ocean 0.399 0.056 7.063 0.000
Bayesian Imputation 1 factor(County)Passaic -0.062 0.059 -1.060 0.289
Bayesian Imputation 1 factor(County)Salem -1.488 0.082 -18.202 0.000
Bayesian Imputation 1 factor(County)Somerset -0.882 0.066 -13.350 0.000
Bayesian Imputation 1 factor(County)Sussex -1.205 0.072 -16.727 0.000
Bayesian Imputation 1 factor(County)Union -0.167 0.060 -2.798 0.005
Bayesian Imputation 1 factor(County)Warren -1.438 0.080 -17.958 0.000
Bayesian Imputation 2 (Intercept) 3.411 0.073 46.731 0.000
Bayesian Imputation 2 Conviction_Count -0.002 0.005 -0.446 0.656
Bayesian Imputation 2 factor(Year)2000 0.145 0.084 1.719 0.086
Bayesian Imputation 2 factor(Year)2001 0.204 0.083 2.475 0.013
Bayesian Imputation 2 factor(Year)2002 0.362 0.081 4.454 0.000
Bayesian Imputation 2 factor(Year)2003 0.240 0.082 2.931 0.003
Bayesian Imputation 2 factor(Year)2004 0.031 0.084 0.363 0.717
Bayesian Imputation 2 factor(Year)2005 0.346 0.080 4.311 0.000
Bayesian Imputation 2 factor(Year)2006 0.379 0.080 4.731 0.000
Bayesian Imputation 2 factor(Year)2007 0.236 0.082 2.867 0.004
Bayesian Imputation 2 factor(Year)2008 0.231 0.083 2.794 0.005
Bayesian Imputation 2 factor(Year)2009 -0.359 0.114 -3.153 0.002
Bayesian Imputation 2 factor(Year)2010 0.437 0.080 5.456 0.000
Bayesian Imputation 2 factor(Year)2011 0.568 0.078 7.255 0.000
Bayesian Imputation 2 factor(Year)2012 0.747 0.077 9.683 0.000
Bayesian Imputation 2 factor(Year)2013 0.818 0.077 10.614 0.000
Bayesian Imputation 2 factor(Year)2014 0.774 0.077 10.080 0.000
Bayesian Imputation 2 factor(Year)2015 0.940 0.076 12.382 0.000
Bayesian Imputation 2 factor(Year)2016 1.276 0.075 17.115 0.000
Bayesian Imputation 2 factor(Year)2017 1.545 0.074 20.940 0.000
Bayesian Imputation 2 factor(Year)2018 1.632 0.073 22.220 0.000
Bayesian Imputation 2 factor(Year)2019 1.585 0.074 21.356 0.000
Bayesian Imputation 2 factor(Year)2020 1.605 0.074 21.814 0.000
Bayesian Imputation 2 factor(Year)2021 1.663 0.074 22.586 0.000
Bayesian Imputation 2 factor(Year)2022 1.627 0.075 21.808 0.000
Bayesian Imputation 2 factor(Year)2023 1.514 0.075 20.266 0.000
Bayesian Imputation 2 factor(County)Bergen 0.226 0.058 3.911 0.000
Bayesian Imputation 2 factor(County)Burlington 0.071 0.057 1.244 0.213
Bayesian Imputation 2 factor(County)Camden 0.718 0.056 12.886 0.000
Bayesian Imputation 2 factor(County)Cape May -1.041 0.071 -14.590 0.000
Bayesian Imputation 2 factor(County)Cumberland -0.775 0.067 -11.645 0.000
Bayesian Imputation 2 factor(County)Essex 0.733 0.056 13.046 0.000
Bayesian Imputation 2 factor(County)Gloucester 0.011 0.058 0.187 0.852
Bayesian Imputation 2 factor(County)Hudson 0.098 0.059 1.675 0.094
Bayesian Imputation 2 factor(County)Hunterdon -1.877 0.096 -19.524 0.000
Bayesian Imputation 2 factor(County)Mercer -0.377 0.061 -6.166 0.000
Bayesian Imputation 2 factor(County)Middlesex 0.371 0.056 6.619 0.000
Bayesian Imputation 2 factor(County)Monmouth 0.236 0.057 4.132 0.000
Bayesian Imputation 2 factor(County)Morris -0.357 0.060 -5.900 0.000
Bayesian Imputation 2 factor(County)Ocean 0.399 0.056 7.063 0.000
Bayesian Imputation 2 factor(County)Passaic -0.062 0.059 -1.060 0.289
Bayesian Imputation 2 factor(County)Salem -1.488 0.082 -18.202 0.000
Bayesian Imputation 2 factor(County)Somerset -0.882 0.066 -13.350 0.000
Bayesian Imputation 2 factor(County)Sussex -1.205 0.072 -16.727 0.000
Bayesian Imputation 2 factor(County)Union -0.167 0.060 -2.798 0.005
Bayesian Imputation 2 factor(County)Warren -1.438 0.080 -17.958 0.000

The Negative Binomial models with County and Year treated as factors are the best fits so far, but Conviction_Count is still not statistically significant. Now let's add a time lag.

library(dplyr)

# Create 1-year lagged variables by County
combined_df_county_lagged <- combined_df_county %>%
  arrange(County, Year) %>%
  group_by(County) %>%
  mutate(
    Conviction_Count_lag1 = lag(Conviction_Count, 1),
    Arrest_Count_lag1 = lag(Arrest_Count, 1)
  ) %>%
  ungroup()
# Load required package
library(dplyr)

# Define function to add lagged conviction count
lag_and_return <- function(df) {
  df %>%
    arrange(County, Year) %>%
    group_by(County) %>%
    mutate(
      Conviction_Count_lag1 = lag(Conviction_Count, 1)
    ) %>%
    ungroup()
}


# Apply function to each dataset
combined_df_county_md_lag     <- lag_and_return(combined_df_county_md)
df_lower_lag                  <- lag_and_return(df_lower)
df_upper_lag                  <- lag_and_return(df_upper)
imputed_df_MI_lag             <- lag_and_return(imputed_df_MI)
combined_df_county_Bay1_lag   <- lag_and_return(combined_df_county_Bay1)
combined_df_county_Bay2_lag   <- lag_and_return(combined_df_county_Bay2)
library(MASS)
library(dplyr)

run_lagged_models <- function(data, label) {
  cat("\n=== Running Models for:", label, "===\n")
  
 
  
  # Fit Poisson model
  poisson_model <- glm(
    Deaths_num ~ Conviction_Count_lag1 + County + Year,
    family = poisson,
    data = data
  )
  
  # Calculate dispersion
  dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
  cat("Overdispersion ratio:", round(dispersion, 2), "\n")
  
  # Fit Negative Binomial model
  nb_model <- glm.nb(
    Deaths_num ~ Conviction_Count_lag1 + County + Year,
    data = data
  )
  
  return(list(
    label = label,
    poisson_model = poisson_model,
    nb_model = nb_model,
    poisson_summary = summary(poisson_model),
    nb_summary = summary(nb_model),
    poisson_AIC = AIC(poisson_model),
    nb_AIC = AIC(nb_model),
    dispersion = dispersion
  ))
}
# Run models
results_md_lag     <- run_lagged_models(combined_df_county_md_lag,     "Midpoint Imputation")
## 
## === Running Models for: Midpoint Imputation ===
## Overdispersion ratio: 6.77
results_lower_lag  <- run_lagged_models(df_lower_lag,                  "Lower Bound Imputation")
## 
## === Running Models for: Lower Bound Imputation ===
## Overdispersion ratio: 7.48
results_upper_lag  <- run_lagged_models(df_upper_lag,                  "Upper Bound Imputation")
## 
## === Running Models for: Upper Bound Imputation ===
## Overdispersion ratio: 6.45
results_mi_lag     <- run_lagged_models(imputed_df_MI_lag,             "Multiple Imputation")
## 
## === Running Models for: Multiple Imputation ===
## Overdispersion ratio: 6.9
results_bayes1_lag <- run_lagged_models(combined_df_county_Bay1_lag,   "Bayesian Imputation 1")
## 
## === Running Models for: Bayesian Imputation 1 ===
## Overdispersion ratio: 6.47
results_bayes2_lag <- run_lagged_models(combined_df_county_Bay2_lag,   "Bayesian Imputation 2")
## 
## === Running Models for: Bayesian Imputation 2 ===
## Overdispersion ratio: 6.45
library(knitr)

lagged_model_summary <- bind_rows(
  data.frame(Method = results_md_lag$label,     Poisson_AIC = results_md_lag$poisson_AIC,     NegBin_AIC = results_md_lag$nb_AIC),
  data.frame(Method = results_lower_lag$label,  Poisson_AIC = results_lower_lag$poisson_AIC,  NegBin_AIC = results_lower_lag$nb_AIC),
  data.frame(Method = results_upper_lag$label,  Poisson_AIC = results_upper_lag$poisson_AIC,  NegBin_AIC = results_upper_lag$nb_AIC),
  data.frame(Method = results_mi_lag$label,     Poisson_AIC = results_mi_lag$poisson_AIC,     NegBin_AIC = results_mi_lag$nb_AIC),
  data.frame(Method = results_bayes1_lag$label, Poisson_AIC = results_bayes1_lag$poisson_AIC, NegBin_AIC = results_bayes1_lag$nb_AIC),
  data.frame(Method = results_bayes2_lag$label, Poisson_AIC = results_bayes2_lag$poisson_AIC, NegBin_AIC = results_bayes2_lag$nb_AIC)
)

kable(lagged_model_summary, caption = "Model AICs with Lagged Conviction Count (1-Year Lag)")
Model AICs with Lagged Conviction Count (1-Year Lag)
Method Poisson_AIC NegBin_AIC
Midpoint Imputation 6392.913 4382.249
Lower Bound Imputation 6896.009 4568.021
Upper Bound Imputation 6193.911 4329.119
Multiple Imputation 6501.502 4430.688
Bayesian Imputation 1 6202.982 4332.581
Bayesian Imputation 2 6183.374 4321.602
library(dplyr)
library(knitr)
library(broom)

# function to extract coefficients from a model
extract_lagged_coef2 <- function(model, method_label) {
  tidy(model) %>%
    filter(term == "Conviction_Count_lag1") %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

# Helper function to extract coefficients from a model
extract_lagged_coef <- function(model, method_label) {
  tidy(model) %>%
    filter(term == "Conviction_Count_lag1") %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

# Apply to all models
coef_md_lag     <- extract_lagged_coef(results_md_lag$nb_model,     "Midpoint Imputation")
coef_lower_lag  <- extract_lagged_coef(results_lower_lag$nb_model,  "Lower Bound Imputation")
coef_upper_lag  <- extract_lagged_coef(results_upper_lag$nb_model,  "Upper Bound Imputation")
coef_mi_lag     <- extract_lagged_coef(results_mi_lag$nb_model,     "Multiple Imputation")
coef_bayes1_lag <- extract_lagged_coef(results_bayes1_lag$nb_model, "Bayesian Imputation 1")
coef_bayes2_lag <- extract_lagged_coef(results_bayes2_lag$nb_model, "Bayesian Imputation 2")

# Combine all into one table
combined_lagged_coefs <- bind_rows(
  coef_md_lag,
  coef_lower_lag,
  coef_upper_lag,
  coef_mi_lag,
  coef_bayes1_lag,
  coef_bayes2_lag
)

# Pretty print
kable(combined_lagged_coefs, digits = 3, caption = "Effect of 1-Year Lagged Conviction Count on Overdose Deaths (Negative Binomial Models)")
Effect of 1-Year Lagged Conviction Count on Overdose Deaths (Negative Binomial Models)
Method term estimate std.error statistic p.value
Midpoint Imputation Conviction_Count_lag1 0.009 0.011 0.814 0.415
Lower Bound Imputation Conviction_Count_lag1 0.007 0.014 0.501 0.616
Upper Bound Imputation Conviction_Count_lag1 0.010 0.010 1.018 0.309
Multiple Imputation Conviction_Count_lag1 0.010 0.011 0.884 0.377
Bayesian Imputation 1 Conviction_Count_lag1 0.010 0.010 1.017 0.309
Bayesian Imputation 2 Conviction_Count_lag1 0.009 0.010 0.915 0.360
library(dplyr)
library(knitr)
library(broom)

# Helper function to extract all coefficients from a model and label them
extract_all_coefs <- function(model, method_label) {
  tidy(model) %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

# Apply to all lagged Negative Binomial models
coef_md_lag_all     <- extract_all_coefs(results_md_lag$nb_model,     "Midpoint Imputation")
coef_lower_lag_all  <- extract_all_coefs(results_lower_lag$nb_model,  "Lower Bound Imputation")
coef_upper_lag_all  <- extract_all_coefs(results_upper_lag$nb_model,  "Upper Bound Imputation")
coef_mi_lag_all     <- extract_all_coefs(results_mi_lag$nb_model,     "Multiple Imputation")
coef_bayes1_lag_all <- extract_all_coefs(results_bayes1_lag$nb_model, "Bayesian Imputation 1")
coef_bayes2_lag_all <- extract_all_coefs(results_bayes2_lag$nb_model, "Bayesian Imputation 2")

# Combine into one big table
combined_all_coefs <- bind_rows(
  coef_md_lag_all,
  coef_lower_lag_all,
  coef_upper_lag_all,
  coef_mi_lag_all,
  coef_bayes1_lag_all,
  coef_bayes2_lag_all
)

# Optional: Print (first 20 rows only if table is too large)
kable(
  head(combined_all_coefs, 20),
  digits = 3,
  caption = "Sample of Coefficients from Lagged Negative Binomial Models"
)
Sample of Coefficients from Lagged Negative Binomial Models
Method term estimate std.error statistic p.value
Midpoint Imputation (Intercept) -163.886 5.255 -31.186 0.000
Midpoint Imputation Conviction_Count_lag1 0.009 0.011 0.814 0.415
Midpoint Imputation CountyBergen 0.268 0.108 2.489 0.013
Midpoint Imputation CountyBurlington 0.075 0.107 0.696 0.486
Midpoint Imputation CountyCamden 0.750 0.107 7.031 0.000
Midpoint Imputation CountyCape May -1.187 0.114 -10.410 0.000
Midpoint Imputation CountyCumberland -0.816 0.113 -7.249 0.000
Midpoint Imputation CountyEssex 0.784 0.107 7.336 0.000
Midpoint Imputation CountyGloucester 0.050 0.107 0.463 0.643
Midpoint Imputation CountyHudson 0.138 0.108 1.279 0.201
Midpoint Imputation CountyHunterdon -1.927 0.124 -15.497 0.000
Midpoint Imputation CountyMercer -0.367 0.110 -3.335 0.001
Midpoint Imputation CountyMiddlesex 0.394 0.107 3.697 0.000
Midpoint Imputation CountyMonmouth 0.286 0.108 2.664 0.008
Midpoint Imputation CountyMorris -0.307 0.109 -2.820 0.005
Midpoint Imputation CountyOcean 0.396 0.106 3.723 0.000
Midpoint Imputation CountyPassaic -0.021 0.108 -0.193 0.847
Midpoint Imputation CountySalem -1.566 0.120 -13.086 0.000
Midpoint Imputation CountySomerset -0.837 0.112 -7.461 0.000
Midpoint Imputation CountySussex -1.218 0.116 -10.539 0.000

Arrests/Overdoses/County

run_arrest_models <- function(data, label) {
  cat("\n=== Running Models for:", label, "===\n")
  
  # Poisson
  poisson_model <- glm(
    Deaths_num ~ Arrest_Count + factor(County) + factor(Year),
    family = poisson,
    data = data
  )
  
  # Check overdispersion
  dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
  cat("Overdispersion ratio (Poisson):", round(dispersion, 2), "\n")
  
  # Negative Binomial
  nb_model <- glm.nb(
    Deaths_num ~ Arrest_Count + factor(County) + factor(Year),
    data = data
  )
  
 # AIC summary as a data frame
  model_summary <- data.frame(
    Method = label,
    Poisson_AIC = AIC(poisson_model),
    NegBin_AIC = AIC(nb_model),
    Overdispersion = round(dispersion, 2)
  )
  
  return(list(
    label = label,
    poisson_model = poisson_model,
    nb_model = nb_model,
    poisson_summary = summary(poisson_model),
    nb_summary = summary(nb_model),
    summary = model_summary,
    dispersion = dispersion
  ))
}
# 1. Filter datasets for Year >= 2017
combined_df_county_md_2017   <- combined_df_county_md   %>% filter(Year >= 2017)
df_lower_2017                <- df_lower                %>% filter(Year >= 2017)
df_upper_2017                <- df_upper                %>% filter(Year >= 2017)
imputed_df_2017              <- imputed_df_MI              %>% filter(Year >= 2017)
combined_df_county_Bay1_2017 <- combined_df_county_Bay1 %>% filter(Year >= 2017)
combined_df_county_Bay2_2017 <- combined_df_county_Bay2 %>% filter(Year >= 2017)

# 2. Run models
results_md_arrest     <- run_arrest_models(combined_df_county_md_2017, "Midpoint Imputation")
## 
## === Running Models for: Midpoint Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_lower_arrest  <- run_arrest_models(df_lower_2017,                "Lower Bound Imputation")
## 
## === Running Models for: Lower Bound Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_upper_arrest  <- run_arrest_models(df_upper_2017,                "Upper Bound Imputation")
## 
## === Running Models for: Upper Bound Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_mi_arrest     <- run_arrest_models(imputed_df_2017,              "Multiple Imputation")
## 
## === Running Models for: Multiple Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_bayes1_arrest <- run_arrest_models(combined_df_county_Bay1_2017, "Bayesian Imputation 1")
## 
## === Running Models for: Bayesian Imputation 1 ===
## Overdispersion ratio (Poisson): 2.26
results_bayes2_arrest <- run_arrest_models(combined_df_county_Bay2_2017, "Bayesian Imputation 2")
## 
## === Running Models for: Bayesian Imputation 2 ===
## Overdispersion ratio (Poisson): 2.26
library(dplyr)
library(knitr)

# Combine AIC results from all model runs
model_summaries_24 <- bind_rows(
  results_md_arrest$summary,
results_lower_arrest$summary, 
results_upper_arrest$summary,
results_mi_arrest$summary,  
results_bayes1_arrest$summary,
results_bayes2_arrest$summary 
)

# Print as a markdown table
kable(model_summaries_24, caption = "Model Fit Comparison: AIC Values")
Model Fit Comparison: AIC Values
Method Poisson_AIC NegBin_AIC Overdispersion
Midpoint Imputation 1279.785 1241.888 2.26
Lower Bound Imputation 1279.785 1241.888 2.26
Upper Bound Imputation 1279.785 1241.888 2.26
Multiple Imputation 1279.785 1241.888 2.26
Bayesian Imputation 1 1279.785 1241.888 2.26
Bayesian Imputation 2 1279.785 1241.888 2.26
library(dplyr)
library(broom)
library(knitr)

# List of results
all_arrest_models <- list(
  results_md_arrest,
  results_lower_arrest,
  results_upper_arrest,
  results_mi_arrest,
  results_bayes1_arrest,
  results_bayes2_arrest
)

# Helper to extract and label coefficients from Poisson and NB models
extract_coefs <- function(model_list) {
  label <- model_list$label
  
  # Extract tidy summaries
  poisson_coefs <- broom::tidy(model_list$poisson_model) %>%
    mutate(Model = "Poisson", Method = label)
  
  nb_coefs <- broom::tidy(model_list$nb_model) %>%
    mutate(Model = "NegBin", Method = label)
  
  bind_rows(poisson_coefs, nb_coefs)
}

# Combine all coefficients
combined_coefs55 <- purrr::map_df(all_arrest_models, extract_coefs)

# Optional: Filter to terms of interest (e.g., Arrest_Count)
arrest_coefs <- combined_coefs55 %>%
  filter(term == "Arrest_Count") %>%
  dplyr::select(Method, Model, term, estimate, std.error, statistic, p.value)

# Print the results in a nice table
kable(combined_coefs55, digits = 3, caption = "Arrest Data Coefficients Across Models")
Arrest Data Coefficients Across Models
term estimate std.error statistic p.value Model Method
(Intercept) 5.022 0.040 127.046 0.000 Poisson Midpoint Imputation
Arrest_Count -0.002 0.002 -0.798 0.425 Poisson Midpoint Imputation
factor(County)Bergen -0.001 0.045 -0.027 0.978 Poisson Midpoint Imputation
factor(County)Burlington 0.000 0.042 -0.011 0.992 Poisson Midpoint Imputation
factor(County)Camden 0.604 0.041 14.838 0.000 Poisson Midpoint Imputation
factor(County)Cape May -1.146 0.061 -18.758 0.000 Poisson Midpoint Imputation
factor(County)Cumberland -0.654 0.054 -12.226 0.000 Poisson Midpoint Imputation
factor(County)Essex 0.733 0.039 18.691 0.000 Poisson Midpoint Imputation
factor(County)Gloucester -0.132 0.046 -2.887 0.004 Poisson Midpoint Imputation
factor(County)Hudson 0.047 0.045 1.047 0.295 Poisson Midpoint Imputation
factor(County)Hunterdon -2.136 0.093 -22.921 0.000 Poisson Midpoint Imputation
factor(County)Mercer -0.279 0.048 -5.754 0.000 Poisson Midpoint Imputation
factor(County)Middlesex 0.298 0.040 7.393 0.000 Poisson Midpoint Imputation
factor(County)Monmouth 0.127 0.043 2.990 0.003 Poisson Midpoint Imputation
factor(County)Morris -0.549 0.051 -10.795 0.000 Poisson Midpoint Imputation
factor(County)Ocean 0.316 0.040 7.937 0.000 Poisson Midpoint Imputation
factor(County)Passaic -0.140 0.046 -3.059 0.002 Poisson Midpoint Imputation
factor(County)Salem -1.593 0.075 -21.377 0.000 Poisson Midpoint Imputation
factor(County)Somerset -1.063 0.061 -17.324 0.000 Poisson Midpoint Imputation
factor(County)Sussex -1.361 0.068 -19.916 0.000 Poisson Midpoint Imputation
factor(County)Union -0.068 0.046 -1.474 0.140 Poisson Midpoint Imputation
factor(County)Warren -1.571 0.074 -21.268 0.000 Poisson Midpoint Imputation
factor(Year)2018 0.078 0.027 2.922 0.003 Poisson Midpoint Imputation
factor(Year)2019 0.045 0.027 1.678 0.093 Poisson Midpoint Imputation
factor(Year)2020 0.055 0.027 2.022 0.043 Poisson Midpoint Imputation
factor(Year)2021 0.128 0.027 4.815 0.000 Poisson Midpoint Imputation
factor(Year)2022 0.104 0.027 3.910 0.000 Poisson Midpoint Imputation
factor(Year)2023 -0.002 0.028 -0.069 0.945 Poisson Midpoint Imputation
(Intercept) 5.036 0.055 90.812 0.000 NegBin Midpoint Imputation
Arrest_Count -0.003 0.003 -0.817 0.414 NegBin Midpoint Imputation
factor(County)Bergen -0.008 0.062 -0.136 0.892 NegBin Midpoint Imputation
factor(County)Burlington 0.000 0.059 0.001 0.999 NegBin Midpoint Imputation
factor(County)Camden 0.599 0.060 9.935 0.000 NegBin Midpoint Imputation
factor(County)Cape May -1.148 0.074 -15.584 0.000 NegBin Midpoint Imputation
factor(County)Cumberland -0.659 0.069 -9.506 0.000 NegBin Midpoint Imputation
factor(County)Essex 0.730 0.059 12.426 0.000 NegBin Midpoint Imputation
factor(County)Gloucester -0.136 0.063 -2.157 0.031 NegBin Midpoint Imputation
factor(County)Hudson 0.041 0.064 0.650 0.515 NegBin Midpoint Imputation
factor(County)Hunterdon -2.141 0.103 -20.760 0.000 NegBin Midpoint Imputation
factor(County)Mercer -0.286 0.066 -4.348 0.000 NegBin Midpoint Imputation
factor(County)Middlesex 0.294 0.058 5.073 0.000 NegBin Midpoint Imputation
factor(County)Monmouth 0.123 0.060 2.043 0.041 NegBin Midpoint Imputation
factor(County)Morris -0.553 0.066 -8.322 0.000 NegBin Midpoint Imputation
factor(County)Ocean 0.314 0.058 5.458 0.000 NegBin Midpoint Imputation
factor(County)Passaic -0.145 0.063 -2.306 0.021 NegBin Midpoint Imputation
factor(County)Salem -1.599 0.087 -18.422 0.000 NegBin Midpoint Imputation
factor(County)Somerset -1.070 0.076 -14.095 0.000 NegBin Midpoint Imputation
factor(County)Sussex -1.367 0.082 -16.734 0.000 NegBin Midpoint Imputation
factor(County)Union -0.075 0.064 -1.169 0.243 NegBin Midpoint Imputation
factor(County)Warren -1.577 0.086 -18.285 0.000 NegBin Midpoint Imputation
factor(Year)2018 0.083 0.037 2.228 0.026 NegBin Midpoint Imputation
factor(Year)2019 0.042 0.038 1.110 0.267 NegBin Midpoint Imputation
factor(Year)2020 0.057 0.038 1.513 0.130 NegBin Midpoint Imputation
factor(Year)2021 0.120 0.037 3.204 0.001 NegBin Midpoint Imputation
factor(Year)2022 0.083 0.038 2.201 0.028 NegBin Midpoint Imputation
factor(Year)2023 -0.029 0.038 -0.752 0.452 NegBin Midpoint Imputation
(Intercept) 5.022 0.040 127.046 0.000 Poisson Lower Bound Imputation
Arrest_Count -0.002 0.002 -0.798 0.425 Poisson Lower Bound Imputation
factor(County)Bergen -0.001 0.045 -0.027 0.978 Poisson Lower Bound Imputation
factor(County)Burlington 0.000 0.042 -0.011 0.992 Poisson Lower Bound Imputation
factor(County)Camden 0.604 0.041 14.838 0.000 Poisson Lower Bound Imputation
factor(County)Cape May -1.146 0.061 -18.758 0.000 Poisson Lower Bound Imputation
factor(County)Cumberland -0.654 0.054 -12.226 0.000 Poisson Lower Bound Imputation
factor(County)Essex 0.733 0.039 18.691 0.000 Poisson Lower Bound Imputation
factor(County)Gloucester -0.132 0.046 -2.887 0.004 Poisson Lower Bound Imputation
factor(County)Hudson 0.047 0.045 1.047 0.295 Poisson Lower Bound Imputation
factor(County)Hunterdon -2.136 0.093 -22.921 0.000 Poisson Lower Bound Imputation
factor(County)Mercer -0.279 0.048 -5.754 0.000 Poisson Lower Bound Imputation
factor(County)Middlesex 0.298 0.040 7.393 0.000 Poisson Lower Bound Imputation
factor(County)Monmouth 0.127 0.043 2.990 0.003 Poisson Lower Bound Imputation
factor(County)Morris -0.549 0.051 -10.795 0.000 Poisson Lower Bound Imputation
factor(County)Ocean 0.316 0.040 7.937 0.000 Poisson Lower Bound Imputation
factor(County)Passaic -0.140 0.046 -3.059 0.002 Poisson Lower Bound Imputation
factor(County)Salem -1.593 0.075 -21.377 0.000 Poisson Lower Bound Imputation
factor(County)Somerset -1.063 0.061 -17.324 0.000 Poisson Lower Bound Imputation
factor(County)Sussex -1.361 0.068 -19.916 0.000 Poisson Lower Bound Imputation
factor(County)Union -0.068 0.046 -1.474 0.140 Poisson Lower Bound Imputation
factor(County)Warren -1.571 0.074 -21.268 0.000 Poisson Lower Bound Imputation
factor(Year)2018 0.078 0.027 2.922 0.003 Poisson Lower Bound Imputation
factor(Year)2019 0.045 0.027 1.678 0.093 Poisson Lower Bound Imputation
factor(Year)2020 0.055 0.027 2.022 0.043 Poisson Lower Bound Imputation
factor(Year)2021 0.128 0.027 4.815 0.000 Poisson Lower Bound Imputation
factor(Year)2022 0.104 0.027 3.910 0.000 Poisson Lower Bound Imputation
factor(Year)2023 -0.002 0.028 -0.069 0.945 Poisson Lower Bound Imputation
(Intercept) 5.036 0.055 90.812 0.000 NegBin Lower Bound Imputation
Arrest_Count -0.003 0.003 -0.817 0.414 NegBin Lower Bound Imputation
factor(County)Bergen -0.008 0.062 -0.136 0.892 NegBin Lower Bound Imputation
factor(County)Burlington 0.000 0.059 0.001 0.999 NegBin Lower Bound Imputation
factor(County)Camden 0.599 0.060 9.935 0.000 NegBin Lower Bound Imputation
factor(County)Cape May -1.148 0.074 -15.584 0.000 NegBin Lower Bound Imputation
factor(County)Cumberland -0.659 0.069 -9.506 0.000 NegBin Lower Bound Imputation
factor(County)Essex 0.730 0.059 12.426 0.000 NegBin Lower Bound Imputation
factor(County)Gloucester -0.136 0.063 -2.157 0.031 NegBin Lower Bound Imputation
factor(County)Hudson 0.041 0.064 0.650 0.515 NegBin Lower Bound Imputation
factor(County)Hunterdon -2.141 0.103 -20.760 0.000 NegBin Lower Bound Imputation
factor(County)Mercer -0.286 0.066 -4.348 0.000 NegBin Lower Bound Imputation
factor(County)Middlesex 0.294 0.058 5.073 0.000 NegBin Lower Bound Imputation
factor(County)Monmouth 0.123 0.060 2.043 0.041 NegBin Lower Bound Imputation
factor(County)Morris -0.553 0.066 -8.322 0.000 NegBin Lower Bound Imputation
factor(County)Ocean 0.314 0.058 5.458 0.000 NegBin Lower Bound Imputation
factor(County)Passaic -0.145 0.063 -2.306 0.021 NegBin Lower Bound Imputation
factor(County)Salem -1.599 0.087 -18.422 0.000 NegBin Lower Bound Imputation
factor(County)Somerset -1.070 0.076 -14.095 0.000 NegBin Lower Bound Imputation
factor(County)Sussex -1.367 0.082 -16.734 0.000 NegBin Lower Bound Imputation
factor(County)Union -0.075 0.064 -1.169 0.243 NegBin Lower Bound Imputation
factor(County)Warren -1.577 0.086 -18.285 0.000 NegBin Lower Bound Imputation
factor(Year)2018 0.083 0.037 2.228 0.026 NegBin Lower Bound Imputation
factor(Year)2019 0.042 0.038 1.110 0.267 NegBin Lower Bound Imputation
factor(Year)2020 0.057 0.038 1.513 0.130 NegBin Lower Bound Imputation
factor(Year)2021 0.120 0.037 3.204 0.001 NegBin Lower Bound Imputation
factor(Year)2022 0.083 0.038 2.201 0.028 NegBin Lower Bound Imputation
factor(Year)2023 -0.029 0.038 -0.752 0.452 NegBin Lower Bound Imputation
(Intercept) 5.022 0.040 127.046 0.000 Poisson Upper Bound Imputation
Arrest_Count -0.002 0.002 -0.798 0.425 Poisson Upper Bound Imputation
factor(County)Bergen -0.001 0.045 -0.027 0.978 Poisson Upper Bound Imputation
factor(County)Burlington 0.000 0.042 -0.011 0.992 Poisson Upper Bound Imputation
factor(County)Camden 0.604 0.041 14.838 0.000 Poisson Upper Bound Imputation
factor(County)Cape May -1.146 0.061 -18.758 0.000 Poisson Upper Bound Imputation
factor(County)Cumberland -0.654 0.054 -12.226 0.000 Poisson Upper Bound Imputation
factor(County)Essex 0.733 0.039 18.691 0.000 Poisson Upper Bound Imputation
factor(County)Gloucester -0.132 0.046 -2.887 0.004 Poisson Upper Bound Imputation
factor(County)Hudson 0.047 0.045 1.047 0.295 Poisson Upper Bound Imputation
factor(County)Hunterdon -2.136 0.093 -22.921 0.000 Poisson Upper Bound Imputation
factor(County)Mercer -0.279 0.048 -5.754 0.000 Poisson Upper Bound Imputation
factor(County)Middlesex 0.298 0.040 7.393 0.000 Poisson Upper Bound Imputation
factor(County)Monmouth 0.127 0.043 2.990 0.003 Poisson Upper Bound Imputation
factor(County)Morris -0.549 0.051 -10.795 0.000 Poisson Upper Bound Imputation
factor(County)Ocean 0.316 0.040 7.937 0.000 Poisson Upper Bound Imputation
factor(County)Passaic -0.140 0.046 -3.059 0.002 Poisson Upper Bound Imputation
factor(County)Salem -1.593 0.075 -21.377 0.000 Poisson Upper Bound Imputation
factor(County)Somerset -1.063 0.061 -17.324 0.000 Poisson Upper Bound Imputation
factor(County)Sussex -1.361 0.068 -19.916 0.000 Poisson Upper Bound Imputation
factor(County)Union -0.068 0.046 -1.474 0.140 Poisson Upper Bound Imputation
factor(County)Warren -1.571 0.074 -21.268 0.000 Poisson Upper Bound Imputation
factor(Year)2018 0.078 0.027 2.922 0.003 Poisson Upper Bound Imputation
factor(Year)2019 0.045 0.027 1.678 0.093 Poisson Upper Bound Imputation
factor(Year)2020 0.055 0.027 2.022 0.043 Poisson Upper Bound Imputation
factor(Year)2021 0.128 0.027 4.815 0.000 Poisson Upper Bound Imputation
factor(Year)2022 0.104 0.027 3.910 0.000 Poisson Upper Bound Imputation
factor(Year)2023 -0.002 0.028 -0.069 0.945 Poisson Upper Bound Imputation
(Intercept) 5.036 0.055 90.812 0.000 NegBin Upper Bound Imputation
Arrest_Count -0.003 0.003 -0.817 0.414 NegBin Upper Bound Imputation
factor(County)Bergen -0.008 0.062 -0.136 0.892 NegBin Upper Bound Imputation
factor(County)Burlington 0.000 0.059 0.001 0.999 NegBin Upper Bound Imputation
factor(County)Camden 0.599 0.060 9.935 0.000 NegBin Upper Bound Imputation
factor(County)Cape May -1.148 0.074 -15.584 0.000 NegBin Upper Bound Imputation
factor(County)Cumberland -0.659 0.069 -9.506 0.000 NegBin Upper Bound Imputation
factor(County)Essex 0.730 0.059 12.426 0.000 NegBin Upper Bound Imputation
factor(County)Gloucester -0.136 0.063 -2.157 0.031 NegBin Upper Bound Imputation
factor(County)Hudson 0.041 0.064 0.650 0.515 NegBin Upper Bound Imputation
factor(County)Hunterdon -2.141 0.103 -20.760 0.000 NegBin Upper Bound Imputation
factor(County)Mercer -0.286 0.066 -4.348 0.000 NegBin Upper Bound Imputation
factor(County)Middlesex 0.294 0.058 5.073 0.000 NegBin Upper Bound Imputation
factor(County)Monmouth 0.123 0.060 2.043 0.041 NegBin Upper Bound Imputation
factor(County)Morris -0.553 0.066 -8.322 0.000 NegBin Upper Bound Imputation
factor(County)Ocean 0.314 0.058 5.458 0.000 NegBin Upper Bound Imputation
factor(County)Passaic -0.145 0.063 -2.306 0.021 NegBin Upper Bound Imputation
factor(County)Salem -1.599 0.087 -18.422 0.000 NegBin Upper Bound Imputation
factor(County)Somerset -1.070 0.076 -14.095 0.000 NegBin Upper Bound Imputation
factor(County)Sussex -1.367 0.082 -16.734 0.000 NegBin Upper Bound Imputation
factor(County)Union -0.075 0.064 -1.169 0.243 NegBin Upper Bound Imputation
factor(County)Warren -1.577 0.086 -18.285 0.000 NegBin Upper Bound Imputation
factor(Year)2018 0.083 0.037 2.228 0.026 NegBin Upper Bound Imputation
factor(Year)2019 0.042 0.038 1.110 0.267 NegBin Upper Bound Imputation
factor(Year)2020 0.057 0.038 1.513 0.130 NegBin Upper Bound Imputation
factor(Year)2021 0.120 0.037 3.204 0.001 NegBin Upper Bound Imputation
factor(Year)2022 0.083 0.038 2.201 0.028 NegBin Upper Bound Imputation
factor(Year)2023 -0.029 0.038 -0.752 0.452 NegBin Upper Bound Imputation
(Intercept) 5.022 0.040 127.046 0.000 Poisson Multiple Imputation
Arrest_Count -0.002 0.002 -0.798 0.425 Poisson Multiple Imputation
factor(County)Bergen -0.001 0.045 -0.027 0.978 Poisson Multiple Imputation
factor(County)Burlington 0.000 0.042 -0.011 0.992 Poisson Multiple Imputation
factor(County)Camden 0.604 0.041 14.838 0.000 Poisson Multiple Imputation
factor(County)Cape May -1.146 0.061 -18.758 0.000 Poisson Multiple Imputation
factor(County)Cumberland -0.654 0.054 -12.226 0.000 Poisson Multiple Imputation
factor(County)Essex 0.733 0.039 18.691 0.000 Poisson Multiple Imputation
factor(County)Gloucester -0.132 0.046 -2.887 0.004 Poisson Multiple Imputation
factor(County)Hudson 0.047 0.045 1.047 0.295 Poisson Multiple Imputation
factor(County)Hunterdon -2.136 0.093 -22.921 0.000 Poisson Multiple Imputation
factor(County)Mercer -0.279 0.048 -5.754 0.000 Poisson Multiple Imputation
factor(County)Middlesex 0.298 0.040 7.393 0.000 Poisson Multiple Imputation
factor(County)Monmouth 0.127 0.043 2.990 0.003 Poisson Multiple Imputation
factor(County)Morris -0.549 0.051 -10.795 0.000 Poisson Multiple Imputation
factor(County)Ocean 0.316 0.040 7.937 0.000 Poisson Multiple Imputation
factor(County)Passaic -0.140 0.046 -3.059 0.002 Poisson Multiple Imputation
factor(County)Salem -1.593 0.075 -21.377 0.000 Poisson Multiple Imputation
factor(County)Somerset -1.063 0.061 -17.324 0.000 Poisson Multiple Imputation
factor(County)Sussex -1.361 0.068 -19.916 0.000 Poisson Multiple Imputation
factor(County)Union -0.068 0.046 -1.474 0.140 Poisson Multiple Imputation
factor(County)Warren -1.571 0.074 -21.268 0.000 Poisson Multiple Imputation
factor(Year)2018 0.078 0.027 2.922 0.003 Poisson Multiple Imputation
factor(Year)2019 0.045 0.027 1.678 0.093 Poisson Multiple Imputation
factor(Year)2020 0.055 0.027 2.022 0.043 Poisson Multiple Imputation
factor(Year)2021 0.128 0.027 4.815 0.000 Poisson Multiple Imputation
factor(Year)2022 0.104 0.027 3.910 0.000 Poisson Multiple Imputation
factor(Year)2023 -0.002 0.028 -0.069 0.945 Poisson Multiple Imputation
(Intercept) 5.036 0.055 90.812 0.000 NegBin Multiple Imputation
Arrest_Count -0.003 0.003 -0.817 0.414 NegBin Multiple Imputation
factor(County)Bergen -0.008 0.062 -0.136 0.892 NegBin Multiple Imputation
factor(County)Burlington 0.000 0.059 0.001 0.999 NegBin Multiple Imputation
factor(County)Camden 0.599 0.060 9.935 0.000 NegBin Multiple Imputation
factor(County)Cape May -1.148 0.074 -15.584 0.000 NegBin Multiple Imputation
factor(County)Cumberland -0.659 0.069 -9.506 0.000 NegBin Multiple Imputation
factor(County)Essex 0.730 0.059 12.426 0.000 NegBin Multiple Imputation
factor(County)Gloucester -0.136 0.063 -2.157 0.031 NegBin Multiple Imputation
factor(County)Hudson 0.041 0.064 0.650 0.515 NegBin Multiple Imputation
factor(County)Hunterdon -2.141 0.103 -20.760 0.000 NegBin Multiple Imputation
factor(County)Mercer -0.286 0.066 -4.348 0.000 NegBin Multiple Imputation
factor(County)Middlesex 0.294 0.058 5.073 0.000 NegBin Multiple Imputation
factor(County)Monmouth 0.123 0.060 2.043 0.041 NegBin Multiple Imputation
factor(County)Morris -0.553 0.066 -8.322 0.000 NegBin Multiple Imputation
factor(County)Ocean 0.314 0.058 5.458 0.000 NegBin Multiple Imputation
factor(County)Passaic -0.145 0.063 -2.306 0.021 NegBin Multiple Imputation
factor(County)Salem -1.599 0.087 -18.422 0.000 NegBin Multiple Imputation
factor(County)Somerset -1.070 0.076 -14.095 0.000 NegBin Multiple Imputation
factor(County)Sussex -1.367 0.082 -16.734 0.000 NegBin Multiple Imputation
factor(County)Union -0.075 0.064 -1.169 0.243 NegBin Multiple Imputation
factor(County)Warren -1.577 0.086 -18.285 0.000 NegBin Multiple Imputation
factor(Year)2018 0.083 0.037 2.228 0.026 NegBin Multiple Imputation
factor(Year)2019 0.042 0.038 1.110 0.267 NegBin Multiple Imputation
factor(Year)2020 0.057 0.038 1.513 0.130 NegBin Multiple Imputation
factor(Year)2021 0.120 0.037 3.204 0.001 NegBin Multiple Imputation
factor(Year)2022 0.083 0.038 2.201 0.028 NegBin Multiple Imputation
factor(Year)2023 -0.029 0.038 -0.752 0.452 NegBin Multiple Imputation
(Intercept) 5.022 0.040 127.046 0.000 Poisson Bayesian Imputation 1
Arrest_Count -0.002 0.002 -0.798 0.425 Poisson Bayesian Imputation 1
factor(County)Bergen -0.001 0.045 -0.027 0.978 Poisson Bayesian Imputation 1
factor(County)Burlington 0.000 0.042 -0.011 0.992 Poisson Bayesian Imputation 1
factor(County)Camden 0.604 0.041 14.838 0.000 Poisson Bayesian Imputation 1
factor(County)Cape May -1.146 0.061 -18.758 0.000 Poisson Bayesian Imputation 1
factor(County)Cumberland -0.654 0.054 -12.226 0.000 Poisson Bayesian Imputation 1
factor(County)Essex 0.733 0.039 18.691 0.000 Poisson Bayesian Imputation 1
factor(County)Gloucester -0.132 0.046 -2.887 0.004 Poisson Bayesian Imputation 1
factor(County)Hudson 0.047 0.045 1.047 0.295 Poisson Bayesian Imputation 1
factor(County)Hunterdon -2.136 0.093 -22.921 0.000 Poisson Bayesian Imputation 1
factor(County)Mercer -0.279 0.048 -5.754 0.000 Poisson Bayesian Imputation 1
factor(County)Middlesex 0.298 0.040 7.393 0.000 Poisson Bayesian Imputation 1
factor(County)Monmouth 0.127 0.043 2.990 0.003 Poisson Bayesian Imputation 1
factor(County)Morris -0.549 0.051 -10.795 0.000 Poisson Bayesian Imputation 1
factor(County)Ocean 0.316 0.040 7.937 0.000 Poisson Bayesian Imputation 1
factor(County)Passaic -0.140 0.046 -3.059 0.002 Poisson Bayesian Imputation 1
factor(County)Salem -1.593 0.075 -21.377 0.000 Poisson Bayesian Imputation 1
factor(County)Somerset -1.063 0.061 -17.324 0.000 Poisson Bayesian Imputation 1
factor(County)Sussex -1.361 0.068 -19.916 0.000 Poisson Bayesian Imputation 1
factor(County)Union -0.068 0.046 -1.474 0.140 Poisson Bayesian Imputation 1
factor(County)Warren -1.571 0.074 -21.268 0.000 Poisson Bayesian Imputation 1
factor(Year)2018 0.078 0.027 2.922 0.003 Poisson Bayesian Imputation 1
factor(Year)2019 0.045 0.027 1.678 0.093 Poisson Bayesian Imputation 1
factor(Year)2020 0.055 0.027 2.022 0.043 Poisson Bayesian Imputation 1
factor(Year)2021 0.128 0.027 4.815 0.000 Poisson Bayesian Imputation 1
factor(Year)2022 0.104 0.027 3.910 0.000 Poisson Bayesian Imputation 1
factor(Year)2023 -0.002 0.028 -0.069 0.945 Poisson Bayesian Imputation 1
(Intercept) 5.036 0.055 90.812 0.000 NegBin Bayesian Imputation 1
Arrest_Count -0.003 0.003 -0.817 0.414 NegBin Bayesian Imputation 1
factor(County)Bergen -0.008 0.062 -0.136 0.892 NegBin Bayesian Imputation 1
factor(County)Burlington 0.000 0.059 0.001 0.999 NegBin Bayesian Imputation 1
factor(County)Camden 0.599 0.060 9.935 0.000 NegBin Bayesian Imputation 1
factor(County)Cape May -1.148 0.074 -15.584 0.000 NegBin Bayesian Imputation 1
factor(County)Cumberland -0.659 0.069 -9.506 0.000 NegBin Bayesian Imputation 1
factor(County)Essex 0.730 0.059 12.426 0.000 NegBin Bayesian Imputation 1
factor(County)Gloucester -0.136 0.063 -2.157 0.031 NegBin Bayesian Imputation 1
factor(County)Hudson 0.041 0.064 0.650 0.515 NegBin Bayesian Imputation 1
factor(County)Hunterdon -2.141 0.103 -20.760 0.000 NegBin Bayesian Imputation 1
factor(County)Mercer -0.286 0.066 -4.348 0.000 NegBin Bayesian Imputation 1
factor(County)Middlesex 0.294 0.058 5.073 0.000 NegBin Bayesian Imputation 1
factor(County)Monmouth 0.123 0.060 2.043 0.041 NegBin Bayesian Imputation 1
factor(County)Morris -0.553 0.066 -8.322 0.000 NegBin Bayesian Imputation 1
factor(County)Ocean 0.314 0.058 5.458 0.000 NegBin Bayesian Imputation 1
factor(County)Passaic -0.145 0.063 -2.306 0.021 NegBin Bayesian Imputation 1
factor(County)Salem -1.599 0.087 -18.422 0.000 NegBin Bayesian Imputation 1
factor(County)Somerset -1.070 0.076 -14.095 0.000 NegBin Bayesian Imputation 1
factor(County)Sussex -1.367 0.082 -16.734 0.000 NegBin Bayesian Imputation 1
factor(County)Union -0.075 0.064 -1.169 0.243 NegBin Bayesian Imputation 1
factor(County)Warren -1.577 0.086 -18.285 0.000 NegBin Bayesian Imputation 1
factor(Year)2018 0.083 0.037 2.228 0.026 NegBin Bayesian Imputation 1
factor(Year)2019 0.042 0.038 1.110 0.267 NegBin Bayesian Imputation 1
factor(Year)2020 0.057 0.038 1.513 0.130 NegBin Bayesian Imputation 1
factor(Year)2021 0.120 0.037 3.204 0.001 NegBin Bayesian Imputation 1
factor(Year)2022 0.083 0.038 2.201 0.028 NegBin Bayesian Imputation 1
factor(Year)2023 -0.029 0.038 -0.752 0.452 NegBin Bayesian Imputation 1
(Intercept) 5.022 0.040 127.046 0.000 Poisson Bayesian Imputation 2
Arrest_Count -0.002 0.002 -0.798 0.425 Poisson Bayesian Imputation 2
factor(County)Bergen -0.001 0.045 -0.027 0.978 Poisson Bayesian Imputation 2
factor(County)Burlington 0.000 0.042 -0.011 0.992 Poisson Bayesian Imputation 2
factor(County)Camden 0.604 0.041 14.838 0.000 Poisson Bayesian Imputation 2
factor(County)Cape May -1.146 0.061 -18.758 0.000 Poisson Bayesian Imputation 2
factor(County)Cumberland -0.654 0.054 -12.226 0.000 Poisson Bayesian Imputation 2
factor(County)Essex 0.733 0.039 18.691 0.000 Poisson Bayesian Imputation 2
factor(County)Gloucester -0.132 0.046 -2.887 0.004 Poisson Bayesian Imputation 2
factor(County)Hudson 0.047 0.045 1.047 0.295 Poisson Bayesian Imputation 2
factor(County)Hunterdon -2.136 0.093 -22.921 0.000 Poisson Bayesian Imputation 2
factor(County)Mercer -0.279 0.048 -5.754 0.000 Poisson Bayesian Imputation 2
factor(County)Middlesex 0.298 0.040 7.393 0.000 Poisson Bayesian Imputation 2
factor(County)Monmouth 0.127 0.043 2.990 0.003 Poisson Bayesian Imputation 2
factor(County)Morris -0.549 0.051 -10.795 0.000 Poisson Bayesian Imputation 2
factor(County)Ocean 0.316 0.040 7.937 0.000 Poisson Bayesian Imputation 2
factor(County)Passaic -0.140 0.046 -3.059 0.002 Poisson Bayesian Imputation 2
factor(County)Salem -1.593 0.075 -21.377 0.000 Poisson Bayesian Imputation 2
factor(County)Somerset -1.063 0.061 -17.324 0.000 Poisson Bayesian Imputation 2
factor(County)Sussex -1.361 0.068 -19.916 0.000 Poisson Bayesian Imputation 2
factor(County)Union -0.068 0.046 -1.474 0.140 Poisson Bayesian Imputation 2
factor(County)Warren -1.571 0.074 -21.268 0.000 Poisson Bayesian Imputation 2
factor(Year)2018 0.078 0.027 2.922 0.003 Poisson Bayesian Imputation 2
factor(Year)2019 0.045 0.027 1.678 0.093 Poisson Bayesian Imputation 2
factor(Year)2020 0.055 0.027 2.022 0.043 Poisson Bayesian Imputation 2
factor(Year)2021 0.128 0.027 4.815 0.000 Poisson Bayesian Imputation 2
factor(Year)2022 0.104 0.027 3.910 0.000 Poisson Bayesian Imputation 2
factor(Year)2023 -0.002 0.028 -0.069 0.945 Poisson Bayesian Imputation 2
(Intercept) 5.036 0.055 90.812 0.000 NegBin Bayesian Imputation 2
Arrest_Count -0.003 0.003 -0.817 0.414 NegBin Bayesian Imputation 2
factor(County)Bergen -0.008 0.062 -0.136 0.892 NegBin Bayesian Imputation 2
factor(County)Burlington 0.000 0.059 0.001 0.999 NegBin Bayesian Imputation 2
factor(County)Camden 0.599 0.060 9.935 0.000 NegBin Bayesian Imputation 2
factor(County)Cape May -1.148 0.074 -15.584 0.000 NegBin Bayesian Imputation 2
factor(County)Cumberland -0.659 0.069 -9.506 0.000 NegBin Bayesian Imputation 2
factor(County)Essex 0.730 0.059 12.426 0.000 NegBin Bayesian Imputation 2
factor(County)Gloucester -0.136 0.063 -2.157 0.031 NegBin Bayesian Imputation 2
factor(County)Hudson 0.041 0.064 0.650 0.515 NegBin Bayesian Imputation 2
factor(County)Hunterdon -2.141 0.103 -20.760 0.000 NegBin Bayesian Imputation 2
factor(County)Mercer -0.286 0.066 -4.348 0.000 NegBin Bayesian Imputation 2
factor(County)Middlesex 0.294 0.058 5.073 0.000 NegBin Bayesian Imputation 2
factor(County)Monmouth 0.123 0.060 2.043 0.041 NegBin Bayesian Imputation 2
factor(County)Morris -0.553 0.066 -8.322 0.000 NegBin Bayesian Imputation 2
factor(County)Ocean 0.314 0.058 5.458 0.000 NegBin Bayesian Imputation 2
factor(County)Passaic -0.145 0.063 -2.306 0.021 NegBin Bayesian Imputation 2
factor(County)Salem -1.599 0.087 -18.422 0.000 NegBin Bayesian Imputation 2
factor(County)Somerset -1.070 0.076 -14.095 0.000 NegBin Bayesian Imputation 2
factor(County)Sussex -1.367 0.082 -16.734 0.000 NegBin Bayesian Imputation 2
factor(County)Union -0.075 0.064 -1.169 0.243 NegBin Bayesian Imputation 2
factor(County)Warren -1.577 0.086 -18.285 0.000 NegBin Bayesian Imputation 2
factor(Year)2018 0.083 0.037 2.228 0.026 NegBin Bayesian Imputation 2
factor(Year)2019 0.042 0.038 1.110 0.267 NegBin Bayesian Imputation 2
factor(Year)2020 0.057 0.038 1.513 0.130 NegBin Bayesian Imputation 2
factor(Year)2021 0.120 0.037 3.204 0.001 NegBin Bayesian Imputation 2
factor(Year)2022 0.083 0.038 2.201 0.028 NegBin Bayesian Imputation 2
factor(Year)2023 -0.029 0.038 -0.752 0.452 NegBin Bayesian Imputation 2
kable(arrest_coefs, digits = 3, caption = "Arrest_Count Coefficients Across Models")
Arrest_Count Coefficients Across Models
Method Model term estimate std.error statistic p.value
Midpoint Imputation Poisson Arrest_Count -0.002 0.002 -0.798 0.425
Midpoint Imputation NegBin Arrest_Count -0.003 0.003 -0.817 0.414
Lower Bound Imputation Poisson Arrest_Count -0.002 0.002 -0.798 0.425
Lower Bound Imputation NegBin Arrest_Count -0.003 0.003 -0.817 0.414
Upper Bound Imputation Poisson Arrest_Count -0.002 0.002 -0.798 0.425
Upper Bound Imputation NegBin Arrest_Count -0.003 0.003 -0.817 0.414
Multiple Imputation Poisson Arrest_Count -0.002 0.002 -0.798 0.425
Multiple Imputation NegBin Arrest_Count -0.003 0.003 -0.817 0.414
Bayesian Imputation 1 Poisson Arrest_Count -0.002 0.002 -0.798 0.425
Bayesian Imputation 1 NegBin Arrest_Count -0.003 0.003 -0.817 0.414
Bayesian Imputation 2 Poisson Arrest_Count -0.002 0.002 -0.798 0.425
Bayesian Imputation 2 NegBin Arrest_Count -0.003 0.003 -0.817 0.414

Arrest_Count is not a statistically significant predictor, so there is not much to report here. @Eli, any other ideas on how I can model this better would be greatly appreciated!
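
One option worth considering (a sketch only, not part of the analysis above): count models of deaths are often specified as rates by adding a population offset. Assuming a county-year Population column could be merged into the imputed data frames (the column name here is illustrative), the negative binomial specification would look like:

# Sketch: model death *rates* rather than counts via a population offset.
# Assumes a county-year `Population` column is available; illustrative only.
nb_offset_model <- MASS::glm.nb(
  Deaths_num ~ Arrest_Count + factor(County) + factor(Year) +
    offset(log(Population)),
  data = combined_df_county_md %>% filter(Year >= 2017)
)
summary(nb_offset_model)$coefficients["Arrest_Count", ]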

# Define reusable lagging function
lag_arrest <- function(df) {
  df %>%
    arrange(County, Year) %>%
    group_by(County) %>%
    mutate(Arrest_Count_lag1 = lag(Arrest_Count, 1)) %>%
    ungroup()
}
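
A quick toy check (illustrative only, not part of the original pipeline) shows what lag_arrest() does: within each county, Arrest_Count_lag1 takes the previous year's arrest count, and the first year per county is NA, so those rows drop out of the lagged models below.

# Toy data: two counties, three years each
toy <- data.frame(
  County = rep(c("A", "B"), each = 3),
  Year = rep(2017:2019, times = 2),
  Arrest_Count = c(5, 7, 9, 2, 4, 6)
)
lag_arrest(toy)
# Arrest_Count_lag1: NA, 5, 7 for county A and NA, 2, 4 for county B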

# Apply to each imputed dataset
combined_df_county_md_lag_arrest   <- lag_arrest(combined_df_county_md)
df_lower_lag_arrest                <- lag_arrest(df_lower)
df_upper_lag_arrest                <- lag_arrest(df_upper)
imputed_df_MI_lag_arrest           <- lag_arrest(imputed_df_MI)
combined_df_county_Bay1_lag_arrest <- lag_arrest(combined_df_county_Bay1)
combined_df_county_Bay2_lag_arrest <- lag_arrest(combined_df_county_Bay2)
run_lagged_arrest_models <- function(data, label) {
  cat("\n=== Running Lagged Arrest Models for:", label, "===\n")

  # Filter for Year >= 2017
  data <- data %>% filter(Year >= 2017)

  # Fit Poisson model
  poisson_model <- glm(
    Deaths_num ~ Arrest_Count_lag1 + factor(County) + factor(Year),
    family = poisson,
    data = data
  )

  # Check for overdispersion
  dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
  cat("Overdispersion ratio (Poisson):", round(dispersion, 2), "\n")

  # Fit NB model
  nb_model <- MASS::glm.nb(
    Deaths_num ~ Arrest_Count_lag1 + factor(County) + factor(Year),
    data = data
  )

  return(list(
    label = label,
    poisson_model = poisson_model,
    nb_model = nb_model,
    poisson_summary = summary(poisson_model),
    nb_summary = summary(nb_model),
    poisson_AIC = AIC(poisson_model),
    nb_AIC = AIC(nb_model),
    dispersion = dispersion
  ))
}
results_md_arrest_lag     <- run_lagged_arrest_models(combined_df_county_md_lag_arrest,   "Midpoint Imputation")
## 
## === Running Lagged Arrest Models for: Midpoint Imputation ===
## Overdispersion ratio (Poisson): 2.19
results_lower_arrest_lag  <- run_lagged_arrest_models(df_lower_lag_arrest,                "Lower Bound Imputation")
## 
## === Running Lagged Arrest Models for: Lower Bound Imputation ===
## Overdispersion ratio (Poisson): 2.19
results_upper_arrest_lag  <- run_lagged_arrest_models(df_upper_lag_arrest,                "Upper Bound Imputation")
## 
## === Running Lagged Arrest Models for: Upper Bound Imputation ===
## Overdispersion ratio (Poisson): 2.19
results_mi_arrest_lag     <- run_lagged_arrest_models(imputed_df_MI_lag_arrest,           "Multiple Imputation")
## 
## === Running Lagged Arrest Models for: Multiple Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_bayes1_arrest_lag <- run_lagged_arrest_models(combined_df_county_Bay1_lag_arrest, "Bayesian Imputation 1")
## 
## === Running Lagged Arrest Models for: Bayesian Imputation 1 ===
## Overdispersion ratio (Poisson): 2.19
results_bayes2_arrest_lag <- run_lagged_arrest_models(combined_df_county_Bay2_lag_arrest, "Bayesian Imputation 2")
## 
## === Running Lagged Arrest Models for: Bayesian Imputation 2 ===
## Overdispersion ratio (Poisson): 2.19
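
The Pearson ratios of about 2.2 printed above are an informal check; a formal score test of equidispersion could also be run on any of the fitted Poisson models (a sketch, assuming the AER package is installed):

# Formal overdispersion test for a fitted Poisson GLM (H0: equidispersion)
library(AER)
dispersiontest(results_md_arrest_lag$poisson_model, alternative = "greater")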
library(broom)

extract_arrest_lag_coefs <- function(model, method_label) {
  tidy(model) %>%
    filter(term == "Arrest_Count_lag1") %>%
    mutate(Method = method_label) %>%
    dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}

arrest_lag_coefs <- bind_rows(
  extract_arrest_lag_coefs(results_md_arrest_lag$nb_model,     "Midpoint"),
  extract_arrest_lag_coefs(results_lower_arrest_lag$nb_model,  "Lower Bound"),
  extract_arrest_lag_coefs(results_upper_arrest_lag$nb_model,  "Upper Bound"),
  extract_arrest_lag_coefs(results_mi_arrest_lag$nb_model,     "Multiple Imputation"),
  extract_arrest_lag_coefs(results_bayes1_arrest_lag$nb_model, "Bayesian 1"),
  extract_arrest_lag_coefs(results_bayes2_arrest_lag$nb_model, "Bayesian 2")
)

# Display results
knitr::kable(arrest_lag_coefs, digits = 3, caption = "Effect of Lagged Arrests on Overdose Deaths (NB Models)")
Effect of Lagged Arrests on Overdose Deaths (NB Models)
Method term estimate std.error statistic p.value
Midpoint Arrest_Count_lag1 0.001 0.003 0.231 0.817
Lower Bound Arrest_Count_lag1 0.001 0.003 0.231 0.817
Upper Bound Arrest_Count_lag1 0.001 0.003 0.231 0.817
Multiple Imputation Arrest_Count_lag1 0.000 0.003 0.027 0.979
Bayesian 1 Arrest_Count_lag1 0.001 0.003 0.231 0.817
Bayesian 2 Arrest_Count_lag1 0.001 0.003 0.231 0.817

The lagged Arrest_Count is still not a statistically significant predictor.
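
To put the near-null estimate on an interpretable scale, the lagged-arrest coefficient can be exponentiated into an incidence-rate ratio with a rough 95% CI (shown here for the midpoint-imputation NB model; illustrative only):

# IRR and 95% CI for the lagged arrest effect (midpoint-imputation NB model)
est <- unname(coef(results_md_arrest_lag$nb_model)["Arrest_Count_lag1"])
se  <- summary(results_md_arrest_lag$nb_model)$coefficients["Arrest_Count_lag1", "Std. Error"]
exp(c(IRR = est, lower = est - 1.96 * se, upper = est + 1.96 * se))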

Possible theoretical explanation: one year is too long an increment to detect an effect. The literature suggests the impact appears roughly one month after arrests/convictions. We can rerun this analysis when we get the CDC's unsuppressed monthly county data.
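
If and when the monthly county data arrive, the same lagging approach could be applied at the month level (a sketch; monthly_county_df and its columns are assumed, since that data set is not yet in hand):

# Sketch: one-month lag of arrests within each county, assuming a future
# monthly county-level data frame `monthly_county_df` with columns
# County, Year, Month, Arrest_Count, and Deaths_num.
lag_arrest_month <- function(df) {
  df %>%
    arrange(County, Year, Month) %>%
    group_by(County) %>%
    mutate(Arrest_Count_lag1m = lag(Arrest_Count, 1)) %>%
    ungroup()
}

# monthly_county_lagged <- lag_arrest_month(monthly_county_df)
# MASS::glm.nb(
#   Deaths_num ~ Arrest_Count_lag1m + factor(County) + factor(Month) + factor(Year),
#   data = monthly_county_lagged
# )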

~~~~~~~~~~~~~~~~ Notes on Dataset ~~~~~~~~~~~~~~~~

The state monthly overdose data are a combination of CDC WONDER data (1999-2022) and counts of "unconfirmed ODs" for 2023-2025 from the State of New Jersey, Office of the Chief State Medical Examiner dashboard (https://ocsme.nj.gov/dashboard).

I added the 2024 NJ population data from NJ Spotlight News (https://www.njspotlightnews.org/2025/01/nj-population-2024-census/) because the U.S. Census Bureau website was down during the government shutdown.