library(readxl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
Convictions <- read_excel("Strict Liability spread sheet.xlsx")
Arrest <- read_excel("NJ DIH Arrest data 2017-2024.xlsx")
Overdose <-Underlying_Cause_of_Death_1999_2023 <- read_csv("Underlying Cause of Death 1999-2023.csv",
col_types = cols(Notes = col_skip(),
Year = col_integer(), `Year Code` = col_integer(),
Population = col_integer()))
State_OD_18 <- read_csv("Statewide_data_month_2018-2025-5.csv",
col_types = cols(Notes = col_skip(),
State = col_skip(), `State Code` = col_skip(),
Population = col_skip(), `Crude Rate` = col_skip()))
State_OD_mon <- read_csv("State Month Underlying Cause of Death, 1999-2025.csv")
## Rows: 317 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Month, Month Code, Deaths
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Arrest <- Arrest %>%
mutate(
Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
)
Convictions <- Convictions %>%
mutate(
Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
)
Convictions <- Convictions %>%
mutate(
Ethnicity = ifelse(Ethnicity %in% c("Unknown", "Not Provided", "NA", ""), NA, Ethnicity)
)
# Create Conviction Year Variable in
Convictions$Sentence.Date <- as.Date(Convictions$`Date of Sentence`)
Convictions$Sentence.Year <- as.numeric(format(Convictions$Sentence.Date, "%Y"))
#Create Conviction Month & Year code
Convictions$`Month Code` <- format(Convictions$Sentence.Date, "%Y/%m")
#Create Year & Month Variable in State_OD
library(dplyr)
State_OD_18 <- State_OD_18 %>%
mutate(
Year = as.numeric(sub("/.*", "", `Month Code`)),
Month = as.numeric(sub(".*/", "", `Month Code`))
)
State_OD_mon <- State_OD_mon %>%
mutate(
Year = as.numeric(sub("/.*", "", `Month Code`)),
Month = as.numeric(sub(".*/", "", `Month Code`))
)
# Create Month Code
Arrest$Month_NUM <- match(Arrest$`Arrest Month`, month.name)
Arrest$`Month Code` <- paste0(Arrest$`Arrest Year`, "/", sprintf("%02d", Arrest$Month_NUM))
#Create list of all months to include; dates: 2017, through 2024.
mons_list <- data.frame(`Month Code` = format(
seq.Date(
from = as.Date("2017-01-01"), # set start
to = as.Date("2024-12-31"), # set end
by = "month"
),
"%Y/%m"
))
mons_list <- mons_list %>%
rename(`Month Code` = Month.Code)
#Count arrests
arrests_by_mon <- Arrest %>%
group_by(`Month Code`) %>%
summarise(Arrest_Count = n(), .groups = "drop")
# Insert 0's for months not reported as having arrests
arrests_by_mon <- mons_list %>%
left_join(arrests_by_mon, by = "Month Code") %>%
mutate(Arrest_Count = ifelse(is.na(Arrest_Count), 0, Arrest_Count))
library(dplyr)
library(tidyr)
# Step 1: Create list of all months to include; dates: August 5, 1988, through June 23, 2025.
all_months <- data.frame(`Month Code` = format(
seq.Date(
from = as.Date("1988-01-01"), # set start
to = as.Date("2022-12-01"), # set end
by = "month"
),
"%Y/%m"
))
all_months <- all_months %>%
rename(`Month Code` = Month.Code)
# Step 2: Count convictions
conv_by_mon <- Convictions %>%
group_by(`Month Code`) %>%
summarise(Conv_Count = n(), .groups = "drop")
# Insert 0's for months not reported as having arrests
conv_by_mon <- all_months %>%
left_join(conv_by_mon, by = "Month Code") %>%
mutate(Conv_Count = ifelse(is.na(Conv_Count), 0, Conv_Count))
For the state level data – I am using the population from the counties in the CDC Wonder dataset and just combining them – but may be more accurate if we just find a population dataset for NJ state. I added population estimates from 2024 from the Census Burea website before it was rendered inoperable by the government shutdown.
# 1) Statewide population by year (summing population data across counties to determine state population -- may be best to find population data from another source for the state)
pop_by_year <- Overdose %>%
group_by(Year) %>%
summarise(Population = sum(Population, na.rm = TRUE), .groups = "drop")
#Added 2024 population estimates from Census bureau
library(dplyr)
pop_by_year <- pop_by_year %>%
bind_rows(data.frame(
Year = 2024,
Population = 9500851
))
# 2) Arrests by year
arrests_by_year <- Arrest %>%
group_by(`Arrest Year`) %>%
summarise(Arrest_Count = n(), .groups = "drop") %>%
rename(Year = `Arrest Year`)
# 3) Convictions by year
#convictions_by_year_filter code removed
convictions_by_year<- Convictions %>%
group_by(Sentence.Year) %>%
summarise(Conviction_Count = n(), .groups = "drop") %>%
rename(Year = Sentence.Year)
# 4) Deaths by year (statewide)
deaths_by_year <- State_OD_18 %>%
group_by(Year) %>%
summarise(Count = sum(Deaths, na.rm = TRUE), .groups = "drop")
#ARRESTS 2017-2024
library(ggplot2)
ggplot(arrests_by_year, aes(x = `Year`, y = Arrest_Count)) +
geom_line(group = 1, color = "steelblue", size = 1.2) +
geom_point(size = 2) +
geom_text(aes(label = Arrest_Count), vjust = -0.5) +
labs(title = "Drug-Induced Arrests by Year",
x = "Year", y = "Arrest Count") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
@Jennifer, We need to double check that that arrest data is complete through 2024. I did not pull it…. looking at the dataset the last arrest included is September of 2024.
library(knitr)
county_summary_arr <- Arrest %>%
count(County) %>%
mutate(Percent = round(n / sum(n) * 100, 1)) %>%
arrange(desc(n))
kable(county_summary_arr,
caption = "Table: Arrests by County",
col.names = c("County", "Count", "Percent"),
align = "lrr")
| County | Count | Percent |
|---|---|---|
| Atlantic | 57 | 15.5 |
| Burlington | 56 | 15.3 |
| Cape May | 47 | 12.8 |
| Ocean | 37 | 10.1 |
| Middlesex | 31 | 8.4 |
| Statewide or Regional LEA | 22 | 6.0 |
| Monmouth | 20 | 5.4 |
| Morris | 20 | 5.4 |
| Bergen | 18 | 4.9 |
| Gloucester | 14 | 3.8 |
| Passaic | 13 | 3.5 |
| Essex | 9 | 2.5 |
| Cumberland | 5 | 1.4 |
| Hunterdon | 4 | 1.1 |
| Camden | 3 | 0.8 |
| Mercer | 3 | 0.8 |
| Salem | 2 | 0.5 |
| Union | 2 | 0.5 |
| Warren | 2 | 0.5 |
| Hudson | 1 | 0.3 |
| Somerset | 1 | 0.3 |
##Strict Liability Conviction Data – 1988 - April 2025
library(dplyr)
library(ggplot2)
# Line plot
ggplot(convictions_by_year, aes(x = Year, y = Conviction_Count)) +
geom_line(color = "darkblue", size = 1.2) +
geom_point(size = 2) +
geom_text(aes(label = Conviction_Count), vjust = -0.7, size = 3.5) + # adds counts above points
scale_x_continuous(breaks = seq(min(convictions_by_year$Year, na.rm = TRUE),
max(convictions_by_year$Year, na.rm = TRUE),
by = 2)) +
labs(title = "Number of DID Convictions by Year",
x = "Year",
y = "Number of Convictions") +
theme_minimal()
library(knitr)
county_summary_conv <- Convictions %>%
count(County) %>%
mutate(Percent = round(n / sum(n) * 100, 1)) %>%
arrange(desc(n))
kable(county_summary_conv,
caption = "Table: Convictions by County 1999- 2025(June)",
col.names = c("County", "Count", "Percent"),
align = "lrr")
| County | Count | Percent |
|---|---|---|
| Cape May | 68 | 17.7 |
| Atlantic | 66 | 17.1 |
| Burlington | 56 | 14.5 |
| Middlesex | 34 | 8.8 |
| Ocean | 26 | 6.8 |
| Morris | 20 | 5.2 |
| Gloucester | 18 | 4.7 |
| Passaic | 16 | 4.2 |
| Monmouth | 13 | 3.4 |
| Camden | 9 | 2.3 |
| Hunterdon | 9 | 2.3 |
| Somerset | 9 | 2.3 |
| Essex | 8 | 2.1 |
| Hudson | 6 | 1.6 |
| Mercer | 6 | 1.6 |
| Bergen | 5 | 1.3 |
| Sussex | 5 | 1.3 |
| Union | 5 | 1.3 |
| Warren | 4 | 1.0 |
| Cumberland | 2 | 0.5 |
library(dplyr)
library(ggplot2)
# Summarize total deaths by year
State_OD_mon$Deaths <- as.numeric(State_OD_mon$Deaths)
## Warning: NAs introduced by coercion
deaths_by_year_tot <- State_OD_mon %>%
group_by(Year) %>%
summarise(Count = sum(Deaths, na.rm = TRUE), .groups = "drop")
# Line chart of total deaths over time
ggplot(deaths_by_year_tot, aes(x = Year, y = Count)) +
geom_line(color = "darkred", size = 1.2) +
geom_point(size = 2) +
geom_text(aes(label = Count), vjust = -0.5, size = 3) +
labs(title = "Total Overdose Deaths by Year",
x = "Year", y = "Number of Deaths") +
theme_minimal()
library(dplyr)
#Combine into one dataset
combined_data <- full_join(
dplyr::select(arrests_by_year, Year, Arrest_Count),
dplyr::select(deaths_by_year, Year, Count),
by = "Year"
)
library(ggplot2)
# Choose a scaling factor to bring arrests to the same ballpark as deaths
scale_factor <- 30 # adjust as needed
ggplot(combined_data, aes(x = Year)) +
geom_line(aes(y = Count, color = "Overdose Deaths"), size = 1.2) +
geom_line(aes(y = Arrest_Count * scale_factor, color = "Arrests"), size = 1.2) +
scale_y_continuous(
name = "Overdose Deaths",
sec.axis = sec_axis(~./scale_factor, name = "Arrests")
) +
scale_color_manual(values = c("Overdose Deaths" = "blue", "Arrests" = "red")) +
labs(
title = "Overdose Deaths vs Arrests by Year (Dual Y-Axis)",
x = "Year",
color = "Metric"
) +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Visually there does not appear to be a relationships whereby Arrests
decrease when overdose deaths decrease. This means that the number of
overdose deaths (the underlying incident rate) does not seem to predict
whether or not there are arrests. Of course, we will test this later. It
also doesn’t appear that arrests result in less overdoses the next
year…
library(tidyr)
combined_long <- combined_data %>%
pivot_longer(cols = c(Arrest_Count, Count), names_to = "Metric", values_to = "Value")
ggplot(combined_long, aes(x = Year, y = Value)) +
geom_line(color = "steelblue", size = 1.2) +
facet_wrap(
~ Metric,
scales = "free_y",
labeller = as_labeller(
c("Count" = "Overdose Deaths", "Arrest_Count" = "Arrests")
)
) +
labs(
title = "Arrests vs Overdose Deaths by Year",
x = "Year",
y = "Count"
) +
theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
Transformed to Z scores…
combined_data_z <- combined_data %>%
mutate(
Arrest_z = scale(Arrest_Count),
Deaths_z = scale(Count)
)
ggplot(combined_data_z, aes(x = Year)) +
geom_line(aes(y = Arrest_z, color = "Arrests"), size = 1.2) +
geom_line(aes(y = Deaths_z, color = "Overdose Deaths"), size = 1.2) +
labs(title = "Z-Score Comparison: Arrests vs Overdose Deaths",
y = "Z-Score (Standardized Values)",
x = "Year",
color = "Metric") +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Arrest_OD_month <- left_join(arrests_by_mon, State_OD_mon, by = "Month Code")
Arrest_OD_month <- Arrest_OD_month %>%
left_join(pop_by_year, by = "Year")
#Convictions/Overdoses
convictions_filtered <- Convictions %>%
filter(Sentence.Year >= 2018, Sentence.Year <= 2025)
#Create aggregated count of convictions by month for the entire dataset time period
conv_sum <- Convictions %>%
filter(!is.na(`Month Code`)) %>%
group_by(`Month Code`) %>%
summarise(Conviction_Count = n()) %>%
arrange(`Month Code`)
library(dplyr)
combined_df_3 <- conv_by_mon %>%
left_join(State_OD_mon, by = "Month Code")
#Filter out prior to 1999
combined_df_3 <- combined_df_3 %>%
mutate(Month_Date = as.Date(paste0(`Month Code`, "/01"), format = "%Y/%m/%d"))
combined_df_3_filtered <- combined_df_3 %>%
filter(Month_Date >= as.Date("1999-01-01")) #because this is as far back as the OD data goes back
#Add population to the dataset
combined_df_3 <-combined_df_3 %>%
left_join(pop_by_year, by = "Year")
combined_df_3_filtered <- combined_df_3_filtered %>%
left_join(pop_by_year, by = "Year")
combined_yearly <- deaths_by_year_tot %>%
left_join(convictions_by_year, by = "Year")
#filter dates
combined_yearly <- combined_yearly %>%
filter(Year >= as.Date(1999))
# Add 0's for convictions for N/A
combined_yearly <- combined_yearly %>%
replace_na(list(
Conviction_Count = 0
))
library(ggplot2)
scale_factor <- 20 # Adjust to visually match scales
ggplot(combined_yearly, aes(x = Year)) +
geom_line(aes(y = Count, color = "Overdose Deaths"), size = 1.2) +
geom_line(aes(y = Conviction_Count * scale_factor, color = "Convictions"), size = 1.2) +
scale_y_continuous(
name = "Overdose Deaths",
sec.axis = sec_axis(~./scale_factor, name = "Convictions")
) +
scale_color_manual(
values = c("Overdose Deaths" = "blue", "Convictions" = "red")
) +
labs(
title = "Overdose Deaths vs Convictions by Year",
x = "Year",
color = "Metric"
) +
theme_minimal()
Correlations between arrests & overdose deaths since data is count – ran a spearman rank correlation
cor.test(Arrest_OD_month$Arrest_Count,
Arrest_OD_month$Deaths,
method = "spearman")
## Warning in cor.test.default(Arrest_OD_month$Arrest_Count,
## Arrest_OD_month$Deaths, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: Arrest_OD_month$Arrest_Count and Arrest_OD_month$Deaths
## S = 113708, p-value = 0.02496
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2287815
There was a statistically significant, positive association between the two variables, ρ = 0.229, p = .025. This suggests that months with higher numbers of arrests tend to also have higher numbers of overdose deaths.
###Poisson Model Arrest_OD_month
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Poisson regression: model overdose deaths as count
poisson_model1 <- glm(
Deaths ~ Arrest_Count + offset (log(Population)),
family = poisson(link = "log"),
data = Arrest_OD_month
)
summary(poisson_model1)
##
## Call:
## glm(formula = Deaths ~ Arrest_Count + offset(log(Population)),
## family = poisson(link = "log"), data = Arrest_OD_month)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.675628 0.010808 -987.791 < 2e-16 ***
## Arrest_Count 0.017381 0.002119 8.201 2.39e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 778.99 on 95 degrees of freedom
## Residual deviance: 712.84 on 94 degrees of freedom
## AIC: 1411.8
##
## Number of Fisher Scoring iterations: 4
exp(coef(poisson_model1))
## (Intercept) Arrest_Count
## 2.310116e-05 1.017533e+00
# Overdispersion test
overdispersion <- sum(residuals(poisson_model1, type = "pearson")^2) / df.residual(poisson_model1)
overdispersion
## [1] 7.051196
Model: Deaths ~ Arrest_Count + offset (log(Population))
Overdispersion Detected - 7.05 (A value > 1 means variance > mean → Poisson assumptions are violated)
A Poisson regression model indicated that arrest counts were associated with overdose deaths (p < 0.001), with a 1-unit increase in arrests associated with a 1.75% increase in expected deaths, after controlling for population. Despite these findings, the model exhibited substantial overdispersion (dispersion = 7.05), suggesting that the Poisson assumptions were violated and that standard errors and significance tests may be unreliable.
Negative-Binomial model may be more appropriate for model.
library(MASS)
nb_model <- glm.nb(Deaths ~ Arrest_Count + offset (log(Population)), data = Arrest_OD_month)
summary(nb_model)
##
## Call:
## glm.nb(formula = Deaths ~ Arrest_Count + offset(log(Population)),
## data = Arrest_OD_month, init.theta = 32.30496007, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.673850 0.030274 -352.578 < 2e-16 ***
## Arrest_Count 0.017343 0.006095 2.845 0.00444 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(32.305) family taken to be 1)
##
## Null deviance: 105.788 on 95 degrees of freedom
## Residual deviance: 97.835 on 94 degrees of freedom
## AIC: 997.36
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 32.30
## Std. Err.: 5.39
##
## 2 x log-likelihood: -991.361
exp(coef(nb_model))
## (Intercept) Arrest_Count
## 2.314226e-05 1.017494e+00
# Overdispersion test
overdispersion2 <- sum(residuals(nb_model, type = "pearson")^2) / df.residual(nb_model)
overdispersion2
## [1] 0.8896591
glm.nb(Deaths ~ Arrest_Count + offset (log(Population)), data = Arrest_OD_month)
overdispersion2= 0.8896591, no overdispersion
AIC = 997.36 — much lower than Poisson model (1411.8), so NB is clearly a better fit.
A Negative Binomial regression model was fitted to examine the association between monthly overdose deaths, arrest counts, and population size, addressing overdispersion observed in the earlier Poisson model. Arrest counts were associated with overdose deaths (p < 0.01), with a 1-unit increase in arrests associated with a 1.7% increase in expected deaths, after controlling for population.
The model showed no evidence of overdispersion (dispersion = 0.8896591), indicating a good fit. Compared to the Poisson model, the negative binomial regression provided a substantially improved fit (AIC = 997.36 vs. 1411.8), confirming it as the more appropriate model for overdispersed count data.
library(dplyr)
Arrest_OD_month <- Arrest_OD_month %>%
mutate(Arrest_Count_lag1 = lag(Arrest_Count, n = 1))
library(MASS)
nb_model_lag <- glm.nb(Deaths ~ Arrest_Count_lag1 + offset (log(Population)), data = Arrest_OD_month)
summary(nb_model_lag)
##
## Call:
## glm.nb(formula = Deaths ~ Arrest_Count_lag1 + offset(log(Population)),
## data = Arrest_OD_month, init.theta = 30.97458706, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.662585 0.031235 -341.368 <2e-16 ***
## Arrest_Count_lag1 0.014060 0.006262 2.245 0.0247 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(30.9746) family taken to be 1)
##
## Null deviance: 101.667 on 94 degrees of freedom
## Residual deviance: 96.729 on 93 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 990.24
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 30.97
## Std. Err.: 5.16
##
## 2 x log-likelihood: -984.239
exp(coef(nb_model_lag))
## (Intercept) Arrest_Count_lag1
## 2.340443e-05 1.014160e+00
# Overdispersion test
overdispersion3 <- sum(residuals(nb_model_lag, type = "pearson")^2) / df.residual(nb_model_lag)
overdispersion3
## [1] 0.8930095
A negative binomial regression was conducted to test whether lagged arrest counts predict monthly overdose deaths, while controlling for population size. Arrest counts were lagged by one month to assess potential delayed effects. The model revealed that lagged arrests were significantly associated with overdose deaths (p < 0.05) after controlling for population, indicating a measurable delayed impact, with 1 arrest associated with a 1.4% increase in overdose deaths the following month.
The model demonstrated good fit to the data, with a dispersion statistic below 1 (0.89) and a slightly lower AIC (990.24) compared to the non-lagged model (AIC = 997.36). These findings suggest that changes in arrests may influence monthly overdose mortality, when accounting for temporal lag of 1 month.
I further compare these models below…
# Confidence intervals (exp) for the original NB model
exp_confint_nb_1 <- exp(confint(nb_model))
## Waiting for profiling to be done...
# Confidence intervals (exp) for the lagged NB model
exp_confint_lag_1 <- exp(confint(nb_model_lag))
## Waiting for profiling to be done...
anova(nb_model, nb_model_lag, test = "Chisq")
## Likelihood ratio tests of Negative Binomial Models
##
## Response: Deaths
## Model theta Resid. df
## 1 Arrest_Count + offset(log(Population)) 32.30496 94
## 2 Arrest_Count_lag1 + offset(log(Population)) 30.97459 93
## 2 x log-lik. Test df LR stat. Pr(Chi)
## 1 -991.3613
## 2 -984.2395 1 vs 2 1 7.121889 0.007614845
Does including Arrest_Count_lag1 improve the model significantly more than Arrest_Count?
model 2 has a higher log-likelihood (less negative), it fits the data better. This difference is statistically significant.
A likelihood ratio test comparing two Negative Binomial models showed that including the lagged arrest count (Arrest_Count_lag1) significantly improved model fit compared to the current month’s arrest count (Arrest_Count), χ²(1) = 7.12, p = 0.0076.
library(DHARMa)
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
# For original model
sim1 <- simulateResiduals(nb_model)
plot(sim1)
Panel 1 (Left): QQ Plot of Residuals for Negative Binomial
Model This panel compares the distribution of simulated residuals
to what we’d expect under a perfectly fitting model. KS test
(Kolmogorov-Smirnov test): p = 0.02974 → this is statistically
significant at the 0.05 level. ❗ This indicates non-uniformity in
residuals — i.e., the residuals do not match the expected distribution.
Conclusion: There is some systematic deviation, possibly suggesting
model misspecification (e.g., wrong functional form, missing predictors,
etc.). Dispersion test (p = 0.4): ✅ Not significant → the variance of
the residuals is appropriate. No overdispersion or underdispersion is
detected. Outlier test (p = 0.7): ✅ Not significant → no evidence of
extreme outliers beyond expectation.
Panel 2 (Right): Residuals vs. Predicted Values for Negative Binomial Model This panel plots DHARMa residuals against predicted values. Red curves represent quantile deviations (like a nonparametric confidence band). ❗ The residuals deviate systematically from what is expected under the null hypothesis of perfect fit. ❗ Combined adjusted quantile test significant” means: The model shows non-random patterns in the residuals. Likely indicates nonlinearity or misspecification (e.g., maybe the model needs interaction terms, a different predictor form, or additional covariates).
# For lagged model
sim2 <- simulateResiduals(nb_model_lag)
plot(sim2)
For the Lagged NB Model:
Here again there does not seem to be any issues related to Dispersion.
Left Panel: QQ Plot of Residuals This compares observed vs. expected uniform quantiles. Ideally, points fall along the 45° line. KS test (Kolmogorov-Smirnov): p = 0.02015 → significant at p < 0.05 ❗ This means: the model’s residuals significantly deviate from what is expected under a perfect fit. Suggests non-uniformity, which could imply model misspecification (e.g., missing variable, wrong functional form). Dispersion test (p = 0.48): ✅ Not significant → No evidence of overdispersion or underdispersion. YModel’s Variance structure is okay. Outlier test (p = 0.22): ✅ Not significant → No evidence of extreme outliers.
Right Panel: Residuals vs. Predicted Values This plots DHARMa residuals vs. model predictions (rank-transformed). Red curves (quantile bands): The red bands depart from the horizontal line, especially in the upper and middle ranges. These indicate significant quantile deviations, which again suggests systematic bias in residuals across predicted values. ❗ Combined adjusted quantile test significant reinforces this interpretation: residuals are not independent of the fitted values, which breaks a core assumption of model.
Summary of Interpretation: ✅ No major issues with dispersion or outliers. ❗ BUT, both the KS test and residual vs. prediction plot show significant deviations, suggesting this model does not fully capture the structure in the data. We may need to: -Try different predictors (e.g., lag terms, nonlinear terms), -Or explore interaction effects. - or try ZNIB model
library(glmmTMB)
zinb_model <- glmmTMB(Deaths ~ Arrest_Count + offset(log(Population)),
ziformula = ~1, # intercept-only zero-inflation
family = nbinom2, data = Arrest_OD_month)
summary(zinb_model)
## Family: nbinom2 ( log )
## Formula: Deaths ~ Arrest_Count + offset(log(Population))
## Zero inflation: ~1
## Data: Arrest_OD_month
##
## AIC BIC logLik -2*log(L) df.resid
## 999.4 1009.6 -495.7 991.4 92
##
##
## Dispersion parameter for nbinom2 family (): 32.3
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.67385 0.03056 -349.3 < 2e-16 ***
## Arrest_Count 0.01734 0.00619 2.8 0.00508 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.38 4492.02 -0.005 0.996
overdispersion23 <- sum(residuals(zinb_model, type = "pearson")^2) / df.residual(nb_model_lag)
overdispersion23
## [1] 0.8992278
AIC for nb_model_lag (better model): 990.24 AIC for zinb_model: 999.4
The Negative Binomial model with lagged arrest counts provides a better fit to the data than the Zero-Inflated Negative Binomial model, based on AIC comparison.
Among several models tested, the Negative Binomial model including a one-month lag of arrest counts (Arrest_Count_lag1) yielded the lowest AIC (990.24), outperforming both the standard NB model and the Zero-Inflated NB model.
Since the ZINB did not solve our problem, I am going to try a non-linear model…
#Polynomial Lagged model
nb_poly_model <- glm.nb(
Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) + offset(log(Population)),
data = Arrest_OD_month
)
#Spline model
library(splines)
nb_spline_model <- glm.nb(
Deaths ~ ns(Arrest_Count_lag1, df = 3) + offset(log(Population)),
data = Arrest_OD_month
)
AIC(nb_model_lag, nb_poly_model, nb_spline_model)
## df AIC
## nb_model_lag 3 990.2395
## nb_poly_model 4 989.8070
## nb_spline_model 5 990.6807
The polynomial model has the best AIC, although all three are very close
library(DHARMa)
# Check each model
simulateResiduals(nb_model_lag) %>% plot()
nb_model_lag: ❌ KS test significant (p = 0.020): residuals deviate from
expected ❗ Red quantile bands curved → residual pattern suggests
misspecification
simulateResiduals(nb_poly_model) %>% plot()
nb_poly_model: ✅ KS test not significant (p = 0.0718) ✅ Residuals
align better with expected quantiles 👍 Still shows some quantile
deviation, but much improved fit vs linear
simulateResiduals(nb_spline_model) %>% plot()
nb_spline_model: ❌ KS test significant again (p = 0.01491) Residual vs. predicted curve is wavy, maybe overfitting AIC is slightly higher — so not much benefit despite flexibility
A quadratic term for lagged arrest counts improved model fit (AIC = 989.8) and resolved residual issues present in the linear model. This suggests that the association between arrests and overdose deaths is nonlinear, with the marginal effect of arrests changing depending on the arrest level.
Although the polynomial model improved fit and resolved most residual issues, the DHARMa quantile test remained significant, suggesting some remaining nonlinearity. More flexible models like GAMs may further reduce these residual patterns.
library(MASS)
#Increase Polynomial degree
nb_poly3 <- glm.nb(Deaths ~ poly(Arrest_Count_lag1, 3, raw = TRUE) + offset(log(Population)), data = Arrest_OD_month)
summary(nb_poly3)
##
## Call:
## glm.nb(formula = Deaths ~ poly(Arrest_Count_lag1, 3, raw = TRUE) +
## offset(log(Population)), data = Arrest_OD_month, init.theta = 32.21751312,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.073e+01 5.042e-02 -212.885 <2e-16
## poly(Arrest_Count_lag1, 3, raw = TRUE)1 7.495e-02 4.096e-02 1.830 0.0672
## poly(Arrest_Count_lag1, 3, raw = TRUE)2 -1.020e-02 8.676e-03 -1.175 0.2399
## poly(Arrest_Count_lag1, 3, raw = TRUE)3 4.348e-04 4.943e-04 0.880 0.3791
##
## (Intercept) ***
## poly(Arrest_Count_lag1, 3, raw = TRUE)1 .
## poly(Arrest_Count_lag1, 3, raw = TRUE)2
## poly(Arrest_Count_lag1, 3, raw = TRUE)3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(32.2175) family taken to be 1)
##
## Null deviance: 105.181 on 94 degrees of freedom
## Residual deviance: 96.817 on 91 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 991.04
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 32.22
## Std. Err.: 5.40
##
## 2 x log-likelihood: -981.044
simulateResiduals(nb_poly3) %>% plot()
We fitted a cubic negative binomial model (nb_poly3) to account for
potential nonlinearity. While the dispersion and outlier tests showed no
significant deviation, both the Kolmogorov-Smirnov and adjusted quantile
tests from the DHARMa package suggested residual non-uniformity,
indicating potential model misspecification. Further it shows a worse
model fit with a slightly higher AIC than the polynomial model (991.04
vs. 989.81).
So far the polynomial nb is the best fit.
A GAM may provide a better fit by flexibly capturing nonlinear patterns.
#Generalized Additive Model (GAM) for better smoothing:
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-4. For overview type '?mgcv'.
gam_model <- gam(Deaths ~ s(Arrest_Count_lag1) + offset(log(Population)),
family = nb(), data = Arrest_OD_month)
summary(gam_model)
##
## Family: Negative Binomial(30.648)
## Link function: log
##
## Formula:
## Deaths ~ s(Arrest_Count_lag1) + offset(log(Population))
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.60859 0.01975 -537 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(Arrest_Count_lag1) 1.64 2.042 6.118 0.0508 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = -0.0694 Deviance explained = 6.62%
## -REML = 497.96 Scale est. = 1 n = 95
GAMs are nonparametric smoothers, often outperforming polynomials or splines in flexibility without overfitting.
library(mgcViz)
## Loading required package: qgam
## Registered S3 method overwritten by 'mgcViz':
## method from
## +.gg ggplot2
##
## Attaching package: 'mgcViz'
## The following objects are masked from 'package:stats':
##
## qqline, qqnorm, qqplot
simulateResiduals(gam_model) %>% plot()
Conclusion: The polynomial model shows slightly better residual
uniformity based on the KS test than the GAMM.
AIC(gam_model)
## [1] 990.5519
Polynomial NB also has slightly better AIC (AIC = 989.8 versus 990.5519).
Summary Polynomial NB model is slightly favored based on AIC and KS test. GAM offers smoother, more flexible fitting and may still be useful for capturing nonlinear trends if interpretability or visual diagnostics suggest it’s better. From a model selection standpoint, unless we have strong theoretical reasons to prefer smoothing, Polynomial NB is the slightly better choice here.
However, because there are strong arguments also for GAMM models as being preferable – I’m going to interpret both here.
Interpreting the Polynomial Model
summary(nb_poly_model)
##
## Call:
## glm.nb(formula = Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) +
## offset(log(Population)), data = Arrest_OD_month, init.theta = 31.91663863,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.707879 0.041862 -255.787 <2e-16
## poly(Arrest_Count_lag1, 2, raw = TRUE)1 0.042992 0.019286 2.229 0.0258
## poly(Arrest_Count_lag1, 2, raw = TRUE)2 -0.002706 0.001712 -1.581 0.1140
##
## (Intercept) ***
## poly(Arrest_Count_lag1, 2, raw = TRUE)1 *
## poly(Arrest_Count_lag1, 2, raw = TRUE)2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(31.9166) family taken to be 1)
##
## Null deviance: 104.334 on 94 degrees of freedom
## Residual deviance: 96.798 on 92 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 989.81
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 31.92
## Std. Err.: 5.34
##
## 2 x log-likelihood: -981.807
exp(coef(nb_poly_model))
## (Intercept) poly(Arrest_Count_lag1, 2, raw = TRUE)1
## 0.000022368 1.043929914
## poly(Arrest_Count_lag1, 2, raw = TRUE)2
## 0.997298063
Linear term is significant and positive Interpretation: For small values of arrest count, an increase in arrests is associated with a higher risk of overdose deaths in the next time period. Since the model uses a log link, the coefficient of 0.043 implies: → A 1-unit increase in lagged arrest count is associated with a 4.4% increase in overdose deaths (holding population constant).
Quadratic term is negative but not significant The negative value suggests the relationship bends downward, i.e., the effect of arrests flattens or reverses at higher arrest counts. But since the p-value = 0.114, it’s not statistically significant → this quadratic term may not be needed.
This produces a parabola that opens downward — suggesting that at very high arrest levels, the effect on deaths might decrease or plateau, though we cannot say this with statistical confidence.
#Test for Autocorrelation
testTemporalAutocorrelation(simulateResiduals(nb_poly_model), time = Arrest_OD_month$Month_Code)
## DHARMa::testTemporalAutocorrelation - no time argument provided, using random times for each data point
##
## Durbin-Watson test
##
## data: simulationOutput$scaledResiduals ~ 1
## DW = 1.975, p-value = 0.9029
## alternative hypothesis: true autocorrelation is not 0
Temporal Autocorrelation Test (Durbin-Watson) for NB polynomial model Durbin-Watson p = 0.99956 Interpretation: No significant autocorrelation in residuals. Visual confirmation: ACF bars (right plot) mostly within blue bounds; residuals (left plot) show no visible pattern over time.
polynomial NB model: Has slightly better AIC than GAM Has clean residuals No autocorrelation Improves fit over a simple linear NB
But because of the theoretical reasosn to prefer smoothing…also going to report the GAMM
summary(gam_model)
##
## Family: Negative Binomial(30.648)
## Link function: log
##
## Formula:
## Deaths ~ s(Arrest_Count_lag1) + offset(log(Population))
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.60859 0.01975 -537 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(Arrest_Count_lag1) 1.64 2.042 6.118 0.0508 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = -0.0694 Deviance explained = 6.62%
## -REML = 497.96 Scale est. = 1 n = 95
exp(coef(gam_model))
## (Intercept) s(Arrest_Count_lag1).1 s(Arrest_Count_lag1).2
## 2.470295e-05 9.782360e-01 1.020862e+00
## s(Arrest_Count_lag1).3 s(Arrest_Count_lag1).4 s(Arrest_Count_lag1).5
## 1.007248e+00 1.015289e+00 1.011148e+00
## s(Arrest_Count_lag1).6 s(Arrest_Count_lag1).7 s(Arrest_Count_lag1).8
## 9.855573e-01 1.001127e+00 1.059593e+00
## s(Arrest_Count_lag1).9
## 1.049867e+00
This generalized additive model estimates the relationship between overdose deaths and the lagged arrest count, using a smoothed spline term to capture potential nonlinearity. The parametric intercept is highly significant, indicating a very low baseline risk when the arrest count is near zero. The smooth term s(Arrest_Count_lag1) has an effective degrees of freedom (edf) of 1.64, suggesting mild nonlinearity in the relationship, though not extreme. The p-value for the smooth term is 0.0508, right on the border of statistical significance, indicating that the arrest-death relationship may not be strictly linear, but the evidence is modest.
The exponentiated partial effects (in the exp(coef(gam_model)) output) suggest that the local incidence rate ratios (IRRs) across the smoothed arrest count values vary only modestly, mostly clustering around 1.00–1.06. This implies that the effect of arrest counts on overdose deaths is positive but not dramatically varying across the range of arrest counts.
Overall, the GAM provides a flexible fit with slight improvement in residual behavior over strictly parametric models, as seen in the diagnostic plots. However, given the near-significance and mild nonlinearity, this model may be roughly equivalent in fit to simpler polynomial models, which also had comparable or slightly better AIC values.
When the lagged arrest count is at its reference value, the model predicts approximately 2.47 overdose deaths per 100,000 population, per month.
The generalized additive model (GAM) explained only 6.62% of the deviance in overdose deaths, indicating that while the smooth function of lagged arrest counts may capture some nonlinear trends, the model leaves a large portion of the variation unexplained. This suggests that other important predictors or structural factors may be missing from the model.
controlling for COVID-era effects using a factor variable, which allows the model to compare multiple time periods to the reference category (2023/2024).
Arrest_OD_month <- Arrest_OD_month %>%
mutate(
COVID_era = case_when(
Year < 2020 ~ "Pre",
Year == 2020 ~ "2020",
Year == 2021 ~ "2021",
Year == 2022 ~ "2022",
TRUE ~ "2023_24"
)
)
nb_model_poly_covid <- glm.nb(
Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) + factor(COVID_era) + offset(log(Population)),
data = Arrest_OD_month)
summary(nb_model_poly_covid)
##
## Call:
## glm.nb(formula = Deaths ~ poly(Arrest_Count_lag1, 2, raw = TRUE) +
## factor(COVID_era) + offset(log(Population)), data = Arrest_OD_month,
## init.theta = 64.44430919, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.582873 0.051041 -207.340 < 2e-16
## poly(Arrest_Count_lag1, 2, raw = TRUE)1 0.025024 0.014707 1.701 0.0889
## poly(Arrest_Count_lag1, 2, raw = TRUE)2 -0.002027 0.001308 -1.549 0.1213
## factor(COVID_era)2021 0.029450 0.057327 0.514 0.6074
## factor(COVID_era)2022 0.006358 0.057661 0.110 0.9122
## factor(COVID_era)2023_24 -0.305412 0.051402 -5.942 2.82e-09
## factor(COVID_era)Pre -0.019873 0.047650 -0.417 0.6766
##
## (Intercept) ***
## poly(Arrest_Count_lag1, 2, raw = TRUE)1 .
## poly(Arrest_Count_lag1, 2, raw = TRUE)2
## factor(COVID_era)2021
## factor(COVID_era)2022
## factor(COVID_era)2023_24 ***
## factor(COVID_era)Pre
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(64.4443) family taken to be 1)
##
## Null deviance: 184.853 on 94 degrees of freedom
## Residual deviance: 98.688 on 88 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 944.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 64.4
## Std. Err.: 12.5
##
## 2 x log-likelihood: -928.305
exp(coef(nb_model_poly_covid))
## (Intercept) poly(Arrest_Count_lag1, 2, raw = TRUE)1
## 2.534642e-05 1.025340e+00
## poly(Arrest_Count_lag1, 2, raw = TRUE)2 factor(COVID_era)2021
## 9.979750e-01 1.029888e+00
## factor(COVID_era)2022 factor(COVID_era)2023_24
## 1.006378e+00 7.368198e-01
## factor(COVID_era)Pre
## 9.803228e-01
library(mgcViz)
simulateResiduals(nb_model_poly_covid) %>% plot()
controlling for temporal COVID-era effects attenuates the relationship
between arrests and deaths, suggesting the association may be partly
confounded by time-related factors.
anova(nb_poly_model, nb_model_poly_covid, test = "LRT")
## Warning in anova.negbin(nb_poly_model, nb_model_poly_covid, test = "LRT"): only
## Chi-squared LR tests are implemented
## Likelihood ratio tests of Negative Binomial Models
##
## Response: Deaths
## Model
## 1 poly(Arrest_Count_lag1, 2, raw = TRUE) + offset(log(Population))
## 2 poly(Arrest_Count_lag1, 2, raw = TRUE) + factor(COVID_era) + offset(log(Population))
## theta Resid. df 2 x log-lik. Test df LR stat. Pr(Chi)
## 1 31.91664 92 -981.807
## 2 64.44431 88 -928.305 1 vs 2 4 53.50201 6.690604e-11
The likelihood ratio test (LRT) shows a highly significant improvement in model fit when the COVID-era variable is added. This means: The model with factor(COVID_era) explains the data substantially better than the simpler model.
A likelihood ratio test comparing the polynomial model with and without COVID-era fixed effects indicates a significantly improved fit when including the time-period controls (χ² = 53.5, df = 4, p < 0.000000001). However, the inclusion of COVID-era indicators substantially attenuated the association between lagged arrests and overdose deaths, suggesting that temporal variation across pandemic phases may confound the arrest–mortality relationship.
#Covid era effects in GAM model
library(mgcv)
gam_model_covid <- gam(
Deaths ~ s(Arrest_Count_lag1) + factor(COVID_era) + offset(log(Population)),
family = nb(),
data = Arrest_OD_month
)
summary(gam_model_covid)
##
## Family: Negative Binomial(58.409)
## Link function: log
##
## Formula:
## Deaths ~ s(Arrest_Count_lag1) + factor(COVID_era) + offset(log(Population))
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.534848 0.042247 -249.365 < 2e-16 ***
## factor(COVID_era)2021 0.029919 0.059659 0.502 0.616
## factor(COVID_era)2022 0.006883 0.060023 0.115 0.909
## factor(COVID_era)2023_24 -0.307934 0.053413 -5.765 8.16e-09 ***
## factor(COVID_era)Pre -0.022523 0.049486 -0.455 0.649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(Arrest_Count_lag1) 1.822 2.282 2.238 0.471
##
## R-sq.(adj) = 0.403 Deviance explained = 46.7%
## -REML = 480.44 Scale est. = 1 n = 95
Deviance Explained: 46.7% of the deviance is explained by this model (a big jump from the 6.6% in the previous GAM without the COVID controls). This indicates much stronger model fit — the COVID-era variable captures substantial variation in deaths.
Effect of Arrests (Smooth Term): The smooth function for Arrest_Count_lag1 is not significant (Chi-sq = 2.238, p = 0.471). Interpretation: After controlling for COVID-era, the relationship between arrests and deaths no longer appears systematic or nonlinear.
Effect of Arrests (Smooth Term): The smooth function for Arrest_Count_lag1 is not significant (Chi-sq = 2.238, p = 0.471). Interpretation: After controlling for COVID-era, the relationship between arrests and deaths no longer appears systematic or nonlinear.
After adjusting for COVID-era effects, the model explains nearly half the deviance in overdose deaths. However, the previously observed association between lagged arrest counts and deaths disappears, suggesting that temporal variation across pandemic phases may better explain changes in death rates than arrest fluctuations alone
Convictions and Overdose Deaths
library(dplyr)
conv_data <- combined_df_3_filtered %>%
dplyr::select(Month_Date, Year, Deaths, Conv_Count, Population) %>%
arrange(Month_Date) %>%
mutate(Conv_Count_lag1 = lag(Conv_Count, 1))
cor.test(conv_data$Conv_Count,
conv_data$Deaths,
method = "spearman")
## Warning in cor.test.default(conv_data$Conv_Count, conv_data$Deaths, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: conv_data$Conv_Count and conv_data$Deaths
## S = 2323669, p-value = 4.469e-13
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4102266
poisson_conv <- glm(
Deaths ~ Conv_Count + offset (log(Population)),
family = poisson(link = "log"),
data = conv_data
)
summary(poisson_conv)
##
## Call:
## glm(formula = Deaths ~ Conv_Count + offset(log(Population)),
## family = poisson(link = "log"), data = conv_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.370617 0.006383 -1781.34 <2e-16 ***
## Conv_Count 0.129345 0.002253 57.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 13164 on 286 degrees of freedom
## Residual deviance: 10375 on 285 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 12212
##
## Number of Fisher Scoring iterations: 5
exp(coef(poisson_conv))
## (Intercept) Conv_Count
## 1.152932e-05 1.138083e+00
# Overdispersion check
overdispersion_conv <- sum(residuals(poisson_conv, type = "pearson")^2) / df.residual(poisson_conv)
overdispersion_conv
## [1] 39.46199
Poisson Model Conviction Counts: Deaths ~ Conv_Count + offset(log(Population))
Overdispersion is extremely high = 39.46 ; so moving on to the next model.
library(MASS)
nb_conv <- glm.nb(
Deaths ~ Conv_Count + offset (log(Population)),
data = conv_data
)
summary(nb_conv)
##
## Call:
## glm.nb(formula = Deaths ~ Conv_Count + offset(log(Population)),
## data = conv_data, init.theta = 3.14821001, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.39538 0.03754 -303.587 < 2e-16 ***
## Conv_Count 0.14275 0.01843 7.744 9.61e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.1482) family taken to be 1)
##
## Null deviance: 367.57 on 286 degrees of freedom
## Residual deviance: 301.24 on 285 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 3141.9
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.148
## Std. Err.: 0.259
##
## 2 x log-likelihood: -3135.869
exp(coef(nb_conv))
## (Intercept) Conv_Count
## 1.124728e-05 1.153439e+00
# Overdispersion check
overdispersion_nb <- sum(residuals(nb_conv, type = "pearson")^2) / df.residual(nb_conv)
overdispersion_nb
## [1] 1.133245
Negative Binomial Model = Deaths ~ Conv_Count + offset(log(Population))
Overdispersion is acceptable at 1.133. AIC is quite high at 3141.9. After controlling for the population, an increase by 1 conviction is associated with a 15% increase in overdose deaths (p<0.0001)
nb_conv_lag <- glm.nb(
Deaths ~ Conv_Count_lag1 + offset (log(Population)),
data = conv_data
)
summary(nb_conv_lag)
##
## Call:
## glm.nb(formula = Deaths ~ Conv_Count_lag1 + offset(log(Population)),
## data = conv_data, init.theta = 3.119003956, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.38857 0.03775 -301.664 < 2e-16 ***
## Conv_Count_lag1 0.13938 0.01852 7.527 5.2e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.119) family taken to be 1)
##
## Null deviance: 362.65 on 285 degrees of freedom
## Residual deviance: 300.32 on 284 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 3135.1
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.119
## Std. Err.: 0.257
##
## 2 x log-likelihood: -3129.099
exp(coef(nb_conv_lag))
## (Intercept) Conv_Count_lag1
## 0.0000113242 1.1495656951
# Overdispersion check
overdispersion_nb2 <- sum(residuals(nb_conv_lag, type = "pearson")^2) / df.residual(nb_conv_lag)
overdispersion_nb2
## [1] 1.142208
1 month Lagged Negative Binomial Model – Deaths ~ Conv_Count_lag1 + offset(log(Population))
Overdispersion is acceptabe = 1.14
AIC is still quite high at 3135.1, but still less than the non-lagged NB model (3141.9). After controlling for the population, an increase by 1 conviction is associated with a 15% increase in overdose deaths the following month (p<0.0001)
# Confidence intervals (exp) for the original NB model
exp_confint_nb <- exp(confint(nb_conv))
## Waiting for profiling to be done...
# Confidence intervals (exp) for the lagged NB model
exp_confint_lag <- exp(confint(nb_conv_lag))
## Waiting for profiling to be done...
anova(nb_conv_lag, nb_conv, test = "Chisq")
## Likelihood ratio tests of Negative Binomial Models
##
## Response: Deaths
## Model theta Resid. df 2 x log-lik.
## 1 Conv_Count + offset(log(Population)) 3.148210 285 -3135.869
## 2 Conv_Count_lag1 + offset(log(Population)) 3.119004 284 -3129.099
## Test df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 1 6.769973 0.009270422
ORIGINAL MODEL
library(DHARMa)
# For original model
sim1 <- simulateResiduals(nb_conv)
plot(sim1)
## qu = 0.5, log(sigma) = -2.42894 : outer Newton did not converge fully.
LAGGED MODEL
# For lagged model
sim2 <- simulateResiduals(nb_conv_lag)
plot(sim2)
Spearman’s rank correlation showed a moderate positive relationship between convictions and overdose mortality (ρ = 0.41, p < .001).
Poisson regression was initially attempted but found to be highly overdispersed, justifying the use of a negative binomial model.
A series of negative binomial regression models were fit to examine the association between conviction counts and overdose deaths from 1999 to 2024.
In the adjusted model without a lag, convictions were significantly associated with overdose deaths, controlling for population size. This association remained statistically significant when a 1-month lag of conviction counts was introduced, suggesting that monthly changes in convictions may predict subsequent changes in overdose mortality. Model fit improved with the lagged specification.
Even though the lagged model shows marginally improved residual behavior, both models fail the KS test and show non-uniform quantile behavior, as evidenced by the red curves in the right-hand plot. This suggests: Some important structure in the data might not be captured Potentially missing covariates (e.g., COVID-era, temporal trend, etc.) Or the functional form might not be flexible enough
library(dplyr)
library(gt)
# Install broom
# install.packages("broom")
library(broom)
# Tidy the model and exponentiate coefficients + CI
conv_nb_lag_table <- tidy(nb_conv_lag, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)
# Display the table
print(conv_nb_lag_table)
## # A tibble: 2 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0000113 0.0378 -302. 0 0.0000105 0.0000122
## 2 Conv_Count_lag1 1.15 0.0185 7.53 5.20e-14 1.11 1.20
conv_nb_lag_table %>%
mutate(across(estimate:conf.high, ~ round(.x, 6))) %>%
rename(
Term = term,
IRR = estimate,
`CI Lower` = conf.low,
`CI Upper` = conf.high,
`P-Value` = p.value
) %>%
gt() %>%
tab_header(
title = "Incidence Rate Ratios (IRRs) from Lagged Negative Binomial Model"
)
| Incidence Rate Ratios (IRRs) from Lagged Negative Binomial Model | ||||||
| Term | IRR | std.error | statistic | P-Value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.000011 | 0.037753 | -301.663816 | 0 | 0.000011 | 0.000012 |
| Conv_Count_lag1 | 1.149566 | 0.018518 | 7.526767 | 0 | 1.107652 | 1.195387 |
controlling for COVID-era effects using a factor variable, which allows the model to compare multiple time periods to the reference category (2023/2024).
conv_data <- conv_data %>%
mutate(
COVID_era = case_when(
Year < 2020 ~ "Pre",
Year == 2020 ~ "2020",
Year == 2021 ~ "2021",
Year == 2022 ~ "2022",
TRUE ~ "2023_24"
)
)
nb_conv_covid_fe <- glm.nb(
Deaths ~ Conv_Count_lag1 + offset (log(Population)) + factor(COVID_era),
data = conv_data
)
summary(nb_conv_covid_fe)
##
## Call:
## glm.nb(formula = Deaths ~ Conv_Count_lag1 + offset(log(Population)) +
## factor(COVID_era), data = conv_data, init.theta = 3.859229599,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.683237 0.152050 -70.261 < 2e-16 ***
## Conv_Count_lag1 0.105905 0.018605 5.692 1.25e-08 ***
## factor(COVID_era)2021 -0.008205 0.209521 -0.039 0.969
## factor(COVID_era)2022 -0.251604 0.214235 -1.174 0.240
## factor(COVID_era)Pre -0.798325 0.153428 -5.203 1.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.8592) family taken to be 1)
##
## Null deviance: 445.24 on 285 degrees of freedom
## Residual deviance: 297.94 on 281 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 3077.6
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.859
## Std. Err.: 0.324
##
## 2 x log-likelihood: -3065.562
exp(coef(nb_conv_covid_fe))
## (Intercept) Conv_Count_lag1 factor(COVID_era)2021
## 2.292603e-05 1.111716e+00 9.918290e-01
## factor(COVID_era)2022 factor(COVID_era)Pre
## 7.775526e-01 4.500824e-01
A final model was estimated including time-period fixed effects to account for the COVID-19 pandemic and its aftermath. The results show that lagged convictions remain significantly associated with overdose deaths, even after controlling for population and temporal factors (p < .001). Each additional conviction in the prior month was associated with a 11.2% increase in overdose deaths, controlling for population and pandemic time periods.
Model fit (AIC = 3077.6) better than nb model without covid-era effects.
# Load required packages
library(broom)
library(dplyr)
library(gt)
# Step 1: Extract IRRs + Confidence Intervals
covid_nb_table <- tidy(nb_conv_covid_fe, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)
# Step 2: Format for clean presentation
covid_nb_table %>%
mutate(across(estimate:conf.high, ~ round(.x, 6))) %>%
rename(
Term = term,
IRR = estimate,
`CI Lower` = conf.low,
`CI Upper` = conf.high,
`P-Value` = p.value
) %>%
gt() %>%
tab_header(
title = "Incidence Rate Ratios (IRRs) from COVID-Adjusted Negative Binomial Model"
)
| Incidence Rate Ratios (IRRs) from COVID-Adjusted Negative Binomial Model | ||||||
| Term | IRR | std.error | statistic | P-Value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.000023 | 0.152050 | -70.261273 | 0.000000 | 0.000017 | 0.000031 |
| Conv_Count_lag1 | 1.111716 | 0.018605 | 5.692208 | 0.000000 | 1.072570 | 1.154094 |
| factor(COVID_era)2021 | 0.991829 | 0.209521 | -0.039159 | 0.968764 | 0.656714 | 1.497939 |
| factor(COVID_era)2022 | 0.777553 | 0.214235 | -1.174432 | 0.240222 | 0.510331 | 1.184592 |
| factor(COVID_era)Pre | 0.450082 | 0.153428 | -5.203239 | 0.000000 | 0.329617 | 0.598887 |
# For lagged model
sim55 <- simulateResiduals(nb_conv_covid_fe)
plot(sim55)
xWhile the model passes global diagnostics (like KS and dispersion
tests), there’s some non-random structure in the residuals relative to
predictions, possibly hinting at: Missing covariates (e.g., unmodeled
seasonality or nonlinearity), Interactions, Or insufficiently flexible
functional forms (like needing a smoother or more flexible lag
effect).
library(MASS) # for glm.nb
# Fit 2nd-degree polynomial NB model
nb_model_poly_conv <- glm.nb(
Deaths ~ poly(Conv_Count_lag1, 2, raw = TRUE) +
factor(COVID_era) +
offset(log(Population)),
data = conv_data
)
# Model summary
summary(nb_model_poly_conv)
##
## Call:
## glm.nb(formula = Deaths ~ poly(Conv_Count_lag1, 2, raw = TRUE) +
## factor(COVID_era) + offset(log(Population)), data = conv_data,
## init.theta = 3.960386459, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.691889 0.150234 -71.168 < 2e-16
## poly(Conv_Count_lag1, 2, raw = TRUE)1 0.224769 0.048238 4.660 3.17e-06
## poly(Conv_Count_lag1, 2, raw = TRUE)2 -0.019436 0.007265 -2.675 0.00746
## factor(COVID_era)2021 -0.035174 0.207006 -0.170 0.86508
## factor(COVID_era)2022 -0.263438 0.211555 -1.245 0.21304
## factor(COVID_era)Pre -0.819309 0.151557 -5.406 6.45e-08
##
## (Intercept) ***
## poly(Conv_Count_lag1, 2, raw = TRUE)1 ***
## poly(Conv_Count_lag1, 2, raw = TRUE)2 **
## factor(COVID_era)2021
## factor(COVID_era)2022
## factor(COVID_era)Pre ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.9604) family taken to be 1)
##
## Null deviance: 456.43 on 285 degrees of freedom
## Residual deviance: 297.72 on 280 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 3072
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.960
## Std. Err.: 0.333
##
## 2 x log-likelihood: -3057.982
# Exponentiated coefficients
exp(coef(nb_model_poly_conv))
## (Intercept) poly(Conv_Count_lag1, 2, raw = TRUE)1
## 2.272855e-05 1.252033e+00
## poly(Conv_Count_lag1, 2, raw = TRUE)2 factor(COVID_era)2021
## 9.807519e-01 9.654376e-01
## factor(COVID_era)2022 factor(COVID_era)Pre
## 7.684055e-01 4.407362e-01
library(mgcv)
# Fit GAM with smooth term on Conv_Count_lag1
gam_model_conv <- gam(
Deaths ~ s(Conv_Count_lag1) +
factor(COVID_era) +
offset(log(Population)),
family = nb(), # negative binomial
data = conv_data
)
# Model summary
summary(gam_model_conv)
##
## Family: Negative Binomial(3.901)
## Link function: log
##
## Formula:
## Deaths ~ s(Conv_Count_lag1) + factor(COVID_era) + offset(log(Population))
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.56043 0.14942 -70.674 < 2e-16 ***
## factor(COVID_era)2021 -0.04817 0.20923 -0.230 0.818
## factor(COVID_era)2022 -0.27724 0.21410 -1.295 0.195
## factor(COVID_era)Pre -0.83409 0.15369 -5.427 5.73e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(Conv_Count_lag1) 2.56 3.17 39.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.408 Deviance explained = 35.3%
## -REML = 1538.1 Scale est. = 1 n = 286
# Plot the smooth function
plot(gam_model_conv, shade = TRUE, main = "Smooth term for Convictions (lagged)")
AIC(nb_model_poly_conv, gam_model_conv)
## df AIC
## nb_model_poly_conv 7.000000 3071.982
## gam_model_conv 8.169749 3072.148
sim29 <- simulateResiduals(nb_model_poly_conv)
plot(sim29)
sim59 <- simulateResiduals(gam_model_conv)
plot(sim59)
## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L = G$L,
## : Fitting terminated with step failure - check results carefully
| Model Type | Deviance Explained | AIC | Residuals (DHARMa) | | ———- |
—————— | —— | ————————————————- | | Polynomial | 34.7% | 3072 | Very
good (all tests non-significant) | | GAMM | 35.3% | 3072.1 | Also good,
but small numerical warning in fitting |
Both a quadratic polynomial model and a generalized additive model (GAMM) fit the data comparably well, with nearly identical AIC values and deviance explained (~35%). Given the slightly better interpretability and stability, the polynomial model may be preferred, though the GAMM confirms the robustness of the association under a more flexible functional form.
While overall model diagnostics suggest a good fit, DHARMa residual checks indicate quantile deviations across the range of predicted values. This suggests some remaining structured variation not fully captured by the current model, possibly due to unmodeled heterogeneity or interactions. Future modeling could explore interaction terms or hierarchical structures to address these deviations more fully.
library(dplyr)
Overdose %>%
filter(Deaths == "Suppressed") %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 60
#total observations in dataset
library(dplyr)
Overdose %>%
tally()
## # A tibble: 1 × 1
## n
## <int>
## 1 525
60 out of 525, or 11.4% of the overdose observations per month by county is suppressed.
# Standardize County names in Overdose dataframe
Overdose <- Overdose %>%
mutate(County = gsub(" County, NJ", "", County))
# 1. Convictions by county and year
convictions_by_county_year <- Convictions %>%
filter(!is.na(Sentence.Year)) %>%
group_by(County, Year = Sentence.Year) %>%
summarise(Conviction_Count = n(), .groups = "drop")
# 2. Arrests by county and year
arrests_by_county_year <- Arrest %>%
group_by(County, Year = `Arrest Year`) %>%
summarise(Arrest_Count = n(), .groups = "drop")
# 3. Combine into one dataset
combined_df_county <- Overdose %>%
left_join(arrests_by_county_year, by = c("County", "Year")) %>%
left_join(convictions_by_county_year, by = c("County", "Year")) %>%
mutate(
Arrest_Count = ifelse(is.na(Arrest_Count) & Year >= 2017, 0, Arrest_Count),
Conviction_Count = ifelse(is.na(Conviction_Count), 0, Conviction_Count)
)
#Midpoint Method (setting them all to 5 which is the median)
combined_df_county_md <- combined_df_county %>%
mutate(
Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths), # Step 1: Set Suppressed to NA
Deaths_num = as.numeric(Deaths_clean), # Step 2: Convert remaining to numeric
Deaths_num = ifelse(is.na(Deaths_num), 5, Deaths_num) # Step 3: Set NAs (formerly Suppressed) to 5
)
#Sensitivity Analysis
##lower bound
df_lower <- combined_df_county %>%
mutate(
Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths), # Step 1: Set Suppressed to NA
Deaths_num = as.numeric(Deaths_clean), # Step 2: Convert remaining to numeric
Deaths_num = ifelse(is.na(Deaths_num), 1, Deaths_num) # Step 3: Set NAs (formerly Suppressed) to 1
)
##upper bound
df_upper <- combined_df_county %>%
mutate(
Deaths_clean = ifelse(Deaths == "Suppressed", NA, Deaths), # Step 1: Set Suppressed to NA
Deaths_num = as.numeric(Deaths_clean), # Step 2: Convert remaining to numeric
Deaths_num = ifelse(is.na(Deaths_num), 9, Deaths_num) # Step 3: Set NAs (formerly Suppressed) to 9
)
# Load required libraries
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(truncnorm)
library(dplyr)
# Step 1: Prepare dataset and flag suppressed deaths as NA
combined_df_county_MI <- combined_df_county %>%
mutate(
Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths)),
County = as.factor(County)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
## as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
# Step 2: Define custom truncated Poisson-style imputation function
mice.impute.truncated_poisson <- function(y, ry, x, ...) {
n_miss <- sum(!ry)
mean_est <- 5 # fixed midpoint-based mean
imputed <- round(rtruncnorm(n = n_miss, a = 1, b = 9, mean = mean_est, sd = 2))
return(imputed)
}
# Step 3: Specify imputation methods
mice_methods <- make.method(combined_df_county_MI)
mice_methods["Deaths_num"] <- "truncated_poisson"
# Step 4: Build predictor matrix — only safe variables
mice_pred <- make.predictorMatrix(combined_df_county_MI)
mice_pred[,] <- 0 # turn off all predictors by default
mice_pred["Deaths_num", "Population"] <- 1 # use population to inform imputation
# Optional: include County if it's a factor and not a primary predictor
mice_pred["Deaths_num", "County"] <- 1
# Step 5: Run multiple imputation
imp <- mice(
combined_df_county_MI,
method = mice_methods,
predictorMatrix = mice_pred,
m = 5,
seed = 123
)
##
## iter imp variable
## 1 1 Arrest_Count Deaths_num
## 1 2 Arrest_Count Deaths_num
## 1 3 Arrest_Count Deaths_num
## 1 4 Arrest_Count Deaths_num
## 1 5 Arrest_Count Deaths_num
## 2 1 Arrest_Count Deaths_num
## 2 2 Arrest_Count Deaths_num
## 2 3 Arrest_Count Deaths_num
## 2 4 Arrest_Count Deaths_num
## 2 5 Arrest_Count Deaths_num
## 3 1 Arrest_Count Deaths_num
## 3 2 Arrest_Count Deaths_num
## 3 3 Arrest_Count Deaths_num
## 3 4 Arrest_Count Deaths_num
## 3 5 Arrest_Count Deaths_num
## 4 1 Arrest_Count Deaths_num
## 4 2 Arrest_Count Deaths_num
## 4 3 Arrest_Count Deaths_num
## 4 4 Arrest_Count Deaths_num
## 4 5 Arrest_Count Deaths_num
## 5 1 Arrest_Count Deaths_num
## 5 2 Arrest_Count Deaths_num
## 5 3 Arrest_Count Deaths_num
## 5 4 Arrest_Count Deaths_num
## 5 5 Arrest_Count Deaths_num
## Warning: Number of logged events: 1
# Step 6: Get completed dataset
# Option A: View all imputations (long format)
completed_data_long <- complete(imp, "long")
# Option B: Use one completed dataset (e.g., the first)
imputed_df_MI <- complete(imp, 1)
library(ggplot2)
completed_data_long %>%
ggplot(aes(x = Deaths_num)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
facet_wrap(~.imp) +
labs(
title = "Distribution of Imputed Deaths_num Values",
x = "Imputed Deaths",
y = "Frequency"
)
## Warning: <ggplot> %+% x was deprecated in ggplot2 4.0.0.
## ℹ Please use <ggplot> + x instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Load required libraries
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.23.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following objects are masked from 'package:mgcv':
##
## s, t2
## The following object is masked from 'package:glmmTMB':
##
## lognormal
## The following object is masked from 'package:stats':
##
## ar
library(dplyr)
# Step 1: Convert 'Deaths' to numeric with NAs for "Suppressed"
combined_df_county <- combined_df_county %>%
mutate(
Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths))
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
## as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
# Step 2: Subset to observed (non-missing) death counts
df_obs <- combined_df_county %>%
filter(!is.na(Deaths_num))
#Step 3: Scale Population and Year
df_obs <- df_obs %>%
mutate(
Year_scaled = scale(Year),
Pop_scaled = scale(Population),
County = factor(County)
)
# Step 4: Fit Bayesian Negative Binomial model (simplest first)
library(brms)
bayes_model <- brm(
Deaths_num ~ Year_scaled + Pop_scaled,
data = df_obs,
family = negbinomial(),
chains = 2,
iter = 2000,
seed = 123,
control = list(adapt_delta = 0.95)
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## using C compiler: ‘Apple clang version 17.0.0 (clang-1700.0.13.5)’
## using SDK: ‘MacOSX15.5.sdk’
## clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/Rcpp/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/unsupported" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/src/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppParallel/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DUSE_STANC3 -DSTRICT_R_HEADERS -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION -D_HAS_AUTO_PTR_ETC=0 -include '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/opt/R/arm64/include -fPIC -falign-functions=64 -Wall -g -O2 -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
## 679 | #include <cmath>
## | ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 6.5e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.65 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.212 seconds (Warm-up)
## Chain 1: 0.196 seconds (Sampling)
## Chain 1: 0.408 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 2.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.28 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.208 seconds (Warm-up)
## Chain 2: 0.193 seconds (Sampling)
## Chain 2: 0.401 seconds (Total)
## Chain 2:
# Step 5: Subset missing
df_missing <- combined_df_county %>%
filter(is.na(Deaths_num)) %>%
mutate(
County = factor(County, levels = levels(df_obs$County)),
Year_scaled = scale(Year,
center = attr(df_obs$Year_scaled, "scaled:center"),
scale = attr(df_obs$Year_scaled, "scaled:scale")),
Pop_scaled = scale(Population,
center = attr(df_obs$Pop_scaled, "scaled:center"),
scale = attr(df_obs$Pop_scaled, "scaled:scale"))
)
# Step 6: Predict and constrain to [1, 9]
predicted_draws <- posterior_predict(bayes_model, newdata = df_missing)
set.seed(123)
imputed_values <- apply(predicted_draws, 2, function(x) sample(x, 1))
imputed_values <- pmin(pmax(imputed_values, 1), 9)
# Step 7: Insert into original data
combined_df_county_Bay1 <- combined_df_county
combined_df_county_Bay1$Deaths_num[is.na(combined_df_county_Bay1$Deaths_num)] <- imputed_values
summary(bayes_model)
## Family: negbinomial
## Links: mu = log
## Formula: Deaths_num ~ Year_scaled + Pop_scaled
## Data: df_obs (Number of observations: 465)
## Draws: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 2000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 4.12 0.02 4.08 4.17 1.00 3023 1666
## Year_scaled 0.51 0.02 0.47 0.55 1.00 2556 1369
## Pop_scaled 0.51 0.03 0.46 0.56 1.00 2363 1326
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 4.67 0.31 4.07 5.30 1.00 2414 1664
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Rhat = 1.00 for all parameters → indicates convergence.
Bulk_ESS and Tail_ESS > 1000 for all → good effective sample sizes.
No warnings → model ran smoothly.
plot(bayes_model) # Trace + density
pp_check(bayes_model) # Posterior predictive checks
## Using 10 posterior draws for ppc type 'dens_overlay' by default.
Matches the shape of the observed data.
Captures the overdispersion well.
Is producing plausible predictions
#Check imputation distribution for Bayesian pooling
hist(imputed_values, breaks = 9, col = "orange", main = "Imputed Deaths_num via Bayesian Model")
Issue here is that there are no 0’s but that could be ok…
# Histogram before applying pmin/pmax
hist(as.vector(apply(predicted_draws, 2, function(x) sample(x, 1))),
main = "Posterior Samples Before Truncation",
xlab = "Raw predicted values")
fitted_vals <- fitted(bayes_model, newdata = df_missing)
summary(fitted_vals)
## Estimate Est.Error Q2.5 Q97.5
## Min. : 11.14 Min. :0.7248 Min. : 9.776 Min. : 12.58
## 1st Qu.: 14.47 1st Qu.:0.8201 1st Qu.: 12.929 1st Qu.: 16.09
## Median : 20.52 Median :0.9837 Median : 18.657 Median : 22.41
## Mean : 26.25 Mean :1.1636 Mean : 24.040 Mean : 28.54
## 3rd Qu.: 26.30 3rd Qu.:1.1268 3rd Qu.: 24.055 3rd Qu.: 28.54
## Max. :130.51 Max. :6.6528 Max. :118.169 Max. :144.51
# Load required libraries
library(brms)
library(dplyr)
# Step 1: Create new dataset to preserve original
combined_df_county_Bay2 <- combined_df_county %>%
mutate(
Deaths_num = ifelse(Deaths == "Suppressed", NA, as.numeric(Deaths))
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = ifelse(Deaths == "Suppressed", NA,
## as.numeric(Deaths))`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
# Step 2: Subset non-missing observations
df_obs_Bay2 <- combined_df_county_Bay2 %>%
filter(!is.na(Deaths_num)) %>%
mutate(
Year_scaled = scale(Year),
Pop_scaled = scale(Population),
County = factor(County) # Ensure County is a factor
)
# Step 3: Fit Bayesian model with County as a random effect
bayes_model_Bay2 <- brm(
Deaths_num ~ Year_scaled + Pop_scaled + (1 | County),
data = df_obs_Bay2,
family = negbinomial(),
chains = 4,
iter = 3000,
seed = 123,
control = list(adapt_delta = 0.95)
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## using C compiler: ‘Apple clang version 17.0.0 (clang-1700.0.13.5)’
## using SDK: ‘MacOSX15.5.sdk’
## clang -arch arm64 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/Rcpp/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/unsupported" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/src/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppParallel/include/" -I"/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DUSE_STANC3 -DSTRICT_R_HEADERS -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION -D_HAS_AUTO_PTR_ETC=0 -include '/Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/opt/R/arm64/include -fPIC -falign-functions=64 -Wall -g -O2 -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
## 679 | #include <cmath>
## | ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1
## Start sampling
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000137 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.37 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 1: Iteration: 300 / 3000 [ 10%] (Warmup)
## Chain 1: Iteration: 600 / 3000 [ 20%] (Warmup)
## Chain 1: Iteration: 900 / 3000 [ 30%] (Warmup)
## Chain 1: Iteration: 1200 / 3000 [ 40%] (Warmup)
## Chain 1: Iteration: 1500 / 3000 [ 50%] (Warmup)
## Chain 1: Iteration: 1501 / 3000 [ 50%] (Sampling)
## Chain 1: Iteration: 1800 / 3000 [ 60%] (Sampling)
## Chain 1: Iteration: 2100 / 3000 [ 70%] (Sampling)
## Chain 1: Iteration: 2400 / 3000 [ 80%] (Sampling)
## Chain 1: Iteration: 2700 / 3000 [ 90%] (Sampling)
## Chain 1: Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 3.442 seconds (Warm-up)
## Chain 1: 2.797 seconds (Sampling)
## Chain 1: 6.239 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.6 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 2: Iteration: 300 / 3000 [ 10%] (Warmup)
## Chain 2: Iteration: 600 / 3000 [ 20%] (Warmup)
## Chain 2: Iteration: 900 / 3000 [ 30%] (Warmup)
## Chain 2: Iteration: 1200 / 3000 [ 40%] (Warmup)
## Chain 2: Iteration: 1500 / 3000 [ 50%] (Warmup)
## Chain 2: Iteration: 1501 / 3000 [ 50%] (Sampling)
## Chain 2: Iteration: 1800 / 3000 [ 60%] (Sampling)
## Chain 2: Iteration: 2100 / 3000 [ 70%] (Sampling)
## Chain 2: Iteration: 2400 / 3000 [ 80%] (Sampling)
## Chain 2: Iteration: 2700 / 3000 [ 90%] (Sampling)
## Chain 2: Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 3.17 seconds (Warm-up)
## Chain 2: 2.719 seconds (Sampling)
## Chain 2: 5.889 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 4.5e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.45 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 3: Iteration: 300 / 3000 [ 10%] (Warmup)
## Chain 3: Iteration: 600 / 3000 [ 20%] (Warmup)
## Chain 3: Iteration: 900 / 3000 [ 30%] (Warmup)
## Chain 3: Iteration: 1200 / 3000 [ 40%] (Warmup)
## Chain 3: Iteration: 1500 / 3000 [ 50%] (Warmup)
## Chain 3: Iteration: 1501 / 3000 [ 50%] (Sampling)
## Chain 3: Iteration: 1800 / 3000 [ 60%] (Sampling)
## Chain 3: Iteration: 2100 / 3000 [ 70%] (Sampling)
## Chain 3: Iteration: 2400 / 3000 [ 80%] (Sampling)
## Chain 3: Iteration: 2700 / 3000 [ 90%] (Sampling)
## Chain 3: Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 3.176 seconds (Warm-up)
## Chain 3: 2.686 seconds (Sampling)
## Chain 3: 5.862 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 4.4e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.44 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 3000 [ 0%] (Warmup)
## Chain 4: Iteration: 300 / 3000 [ 10%] (Warmup)
## Chain 4: Iteration: 600 / 3000 [ 20%] (Warmup)
## Chain 4: Iteration: 900 / 3000 [ 30%] (Warmup)
## Chain 4: Iteration: 1200 / 3000 [ 40%] (Warmup)
## Chain 4: Iteration: 1500 / 3000 [ 50%] (Warmup)
## Chain 4: Iteration: 1501 / 3000 [ 50%] (Sampling)
## Chain 4: Iteration: 1800 / 3000 [ 60%] (Sampling)
## Chain 4: Iteration: 2100 / 3000 [ 70%] (Sampling)
## Chain 4: Iteration: 2400 / 3000 [ 80%] (Sampling)
## Chain 4: Iteration: 2700 / 3000 [ 90%] (Sampling)
## Chain 4: Iteration: 3000 / 3000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 3.215 seconds (Warm-up)
## Chain 4: 2.876 seconds (Sampling)
## Chain 4: 6.091 seconds (Total)
## Chain 4:
# Step 4: Subset missing values and apply scaling
df_missing_Bay2 <- combined_df_county_Bay2 %>%
filter(is.na(Deaths_num)) %>%
mutate(
County = factor(County, levels = levels(df_obs_Bay2$County)), # match levels
Year_scaled = scale(Year,
center = attr(df_obs_Bay2$Year_scaled, "scaled:center"),
scale = attr(df_obs_Bay2$Year_scaled, "scaled:scale")),
Pop_scaled = scale(Population,
center = attr(df_obs_Bay2$Pop_scaled, "scaled:center"),
scale = attr(df_obs_Bay2$Pop_scaled, "scaled:scale"))
)
# Step 5: Posterior prediction and imputation
predicted_draws_Bay2 <- posterior_predict(bayes_model_Bay2, newdata = df_missing_Bay2)
set.seed(123)
imputed_values_Bay2 <- apply(predicted_draws_Bay2, 2, function(x) sample(x, 1))
imputed_values_Bay2 <- pmin(pmax(imputed_values_Bay2, 1), 9) # constrain to 1–9
# Step 6: Fill in the imputed values
combined_df_county_Bay2$Deaths_num[is.na(combined_df_county_Bay2$Deaths_num)] <- imputed_values_Bay2
#Check imputation distribution for Bayesian pooling
hist(imputed_values_Bay2, breaks = 9, col = "orange", main = "Imputed Deaths_num via Bayesian Model 2")
summary(bayes_model)
## Family: negbinomial
## Links: mu = log
## Formula: Deaths_num ~ Year_scaled + Pop_scaled
## Data: df_obs (Number of observations: 465)
## Draws: 2 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 2000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 4.12 0.02 4.08 4.17 1.00 3023 1666
## Year_scaled 0.51 0.02 0.47 0.55 1.00 2556 1369
## Pop_scaled 0.51 0.03 0.46 0.56 1.00 2363 1326
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 4.67 0.31 4.07 5.30 1.00 2414 1664
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(bayes_model_Bay2) # Trace + density
pp_check(bayes_model_Bay2) # Posterior predictive checks
## Using 10 posterior draws for ppc type 'dens_overlay' by default.
In the provided overdose data, 60 out of 525 county-month observations (~11.4%) were labeled as “Suppressed,” indicating that the true value ranged between 1 and 9 but was withheld for privacy reasons. Since these suppressed values represent a non-trivial portion of the dataset, appropriate imputation methods were employed to estimate plausible values while preserving statistical integrity.
We explored four different imputation strategies to address the suppressed values in the Deaths variable: 1. Midpoint Imputation (Deterministic)
All suppressed death values were replaced with the midpoint of the known range:
Value used: 5
This approach is simple and transparent but does not reflect variability in the imputed values.
Output: combined_df_county_md
2. Sensitivity Analysis
To assess the bounds of potential impact, we created two alternative datasets:
Lower-bound scenario: All suppressed values set to 1
Upper-bound scenario: All suppressed values set to 9
These datasets allow for robustness checks in downstream models by comparing how extreme assumptions influence the results.
Output:
df_lower
df_upper
3. Multiple Imputation via MICE with Truncated Distribution
A more flexible approach used the mice package to perform multiple imputation with a custom imputation function drawing from a truncated normal distribution constrained to values between 1 and 9. This method incorporates uncertainty across imputations while maintaining the known bounds.
We used Population (scaled) as a predictor. County was included as an optional random effect via its factor representation, as it serves as a control variable in future analyses.
Imputation settings: 5 imputations (m = 5)
Custom imputation function using rtruncnorm(mean = 5, sd = 2, a = 1, b = 9)
Random draws ensure variability across imputations
Output:
A long-format dataset with multiple imputations (completed_data_long)
A single completed dataset (imputed_df) was extracted for use in modeling
A histogram of the imputed values showed a plausible, near-symmetric distribution centered around the midpoint, consistent with prior expectations.
4. Bayesian Imputation via Negative Binomial Models
We fit two Bayesian models using the brms package, leveraging posterior predictive sampling to estimate the suppressed values.
A. Bayesian Model 1 (No County Effects)
Formula: Deaths_num ~ Year_scaled + Pop_scaled
Family: Negative Binomial (handles overdispersion)
Random effects: None
Dataset created: combined_df_county_Bay1
Imputation:
Posterior predictive distributions were sampled for missing rows.
A single draw per observation was taken and constrained to the 1–9 range.
Model output:
Rhat = 1.00 for all parameters → convergence achieved
Effective sample sizes (ESS) > 1000
Posterior predictive checks showed good fit to observed data
B. Bayesian Model 2 (Hierarchical: Random Intercepts by County)
Formula: Deaths_num ~ Year_scaled + Pop_scaled + (1 | County)
Family: Negative Binomial
Random effects: County (partial pooling)
Dataset created: combined_df_county_Bay2
Imputation:
Same approach as Model 1 (posterior predictions + truncation), but with better fit for county-level heterogeneity.
Model output:
Converged across all 4 chains
Posterior predictive distributions captured both mean and dispersion
Histogram of imputed values showed reasonable spread (1–9), with central tendency near observed mean
Summary of Imputation Methods
| Method | Variables Used | Output Dataset |
|---|---|---|
| Midpoint (Set all to 5) | — | combined_df_county_md |
| Sensitivity: Lower Bound (1) | — | df_lower |
| Sensitivity: Upper Bound (9) | — | df_upper |
| Multiple Imputation (MICE) | Population |
imputed_df, completed_data_long |
| Bayesian Model 1 | Year, Population |
combined_df_county_Bay1 |
| Bayesian Model 2 (with County) | Year, Population,
County (random intercept) |
combined_df_county_Bay2 |
Summary of Model Justification: To address the 11.4% of observations in our dataset with suppressed overdose death counts (reported as “Suppressed” and known to fall within the range of 1 to 9), we implemented a series of imputation strategies to ensure robustness and transparency. We began with simple imputation methods, including a midpoint substitution (setting all suppressed values to 5) and sensitivity analyses using the lower and upper bounds (1 and 9, respectively). These provided boundary checks for how assumptions about suppressed values might influence our findings.
We then applied Multiple Imputation by Chained Equations (MICE) using a truncated distribution constrained between 1 and 9. In this model, we included Population and County as predictors. Population was retained as a meaningful, independent contextual variable that reflects the expected scale of overdose events. County was added as a categorical predictor to allow the imputation process to borrow information across geographic units, thereby accounting for unobserved regional variation in overdose trends.We chose not to include Year as a predictor in this imputation step to avoid conflating temporal patterns with structural variation in death reporting practices, and to reduce the risk of overfitting imputation models based on potentially nonstationary time trends. Importantly, we excluded Conviction_Count, which is a key predictor in subsequent outcome modeling, to avoid biasing those future estimates through the imputation mechanism.
Finally, we implemented two Bayesian pooling models using the brms package. The first included Year and Population as predictors to estimate the likely distribution of suppressed counts, leveraging both temporal and demographic trends. The second model added County as a random intercept to account for potential unobserved heterogeneity across counties. These models allowed us to borrow strength from the observed data while explicitly modeling uncertainty and variability in suppressed values. Each imputation approach was stored as a separate dataset for comparison and downstream analysis. We included Year as a predictor in the Bayesian imputation models because those models are explicitly designed to capture structured trends and uncertainty in the observed data.
@Eli, do you think this is sufficient? Or should I run a Bayesian imputation model without Year and compare it as well?
I just ran this with the second Bayesian Pooling model but can re-run it with others if you all want to see it….
library(dplyr)
combined_df_county_Bay2 %>%
group_by(County) %>%
summarise(Total_Overdose_Deaths = sum(Deaths_num)) %>%
arrange(desc(Total_Overdose_Deaths))
## # A tibble: 21 × 2
## County Total_Overdose_Deaths
## <chr> <dbl>
## 1 Essex 4019
## 2 Camden 3881
## 3 Ocean 2916
## 4 Middlesex 2792
## 5 Monmouth 2450
## 6 Bergen 2280
## 7 Hudson 2111
## 8 Burlington 2061
## 9 Atlantic 1954
## 10 Gloucester 1910
## # ℹ 11 more rows
library(dplyr)
library(ggplot2)
# Summarize overdose deaths by county
county_death_totals <- combined_df_county_Bay2 %>%
group_by(County) %>%
summarise(Total_Deaths = sum(Deaths_num, na.rm = TRUE)) %>%
arrange(desc(Total_Deaths))
# Plot the bar graph
ggplot(county_death_totals, aes(x = reorder(County, -Total_Deaths), y = Total_Deaths)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(
title = "Total Overdose Deaths by County",
x = "County",
y = "Total Overdose Deaths"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
# Optional: arrange by descending conviction count
county_summary_conv <- county_summary_conv %>%
arrange(desc(n))
# Plot bar chart
ggplot(county_summary_conv, aes(x = reorder(County, -n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(
title = "Total Convictions by County",
x = "County",
y = "Number of Convictions"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Visually –by comparing the two graphs – it is clear that the counties
with the largest number of overdose deaths do not have the most
convictions.
library(ggplot2)
library(dplyr)
# Optional: arrange by descending arr count
county_summary_arr <- county_summary_arr %>%
arrange(desc(n))
# Plot bar chart
ggplot(county_summary_arr, aes(x = reorder(County, -n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(
title = "Total Arrests by County",
x = "County",
y = "Number of Arrests"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Same comments for arrests.
imputed_df_MI$Deaths <- as.numeric(imputed_df_MI$Deaths)
## Warning: NAs introduced by coercion
imputed_df_MI$County <- factor(imputed_df_MI$County)
run_models <- function(data, label) {
message("===== Running models for: ", label, " =====")
# Ensure required variables are correctly formatted and drop missing values
data <- data %>%
mutate(
Deaths_num = as.numeric(Deaths),
Year = as.numeric(Year),
County = as.factor(County),
Population = as.numeric(Population),
Conviction_Count = as.numeric(Conviction_Count)
) %>%
filter(
!is.na(Deaths_num),
!is.na(Conviction_Count),
!is.na(Population)
)
# Fit Poisson regression
poisson_model <- glm(
Deaths_num ~ Conviction_Count + Year + offset (log(Population)) + County,
data = data,
family = poisson
)
# Fit Negative Binomial regression
nb_model <- MASS::glm.nb(
Deaths_num ~ Conviction_Count + Year + offset (log(Population)) + County,
data = data
)
# Return AIC summary and model objects
return(list(
summary = data.frame(
Method = label,
Poisson_AIC = AIC(poisson_model),
NegBin_AIC = AIC(nb_model)
),
models = list(
poisson = poisson_model,
negbin = nb_model
)
))
}
results_md <- run_models(combined_df_county_md, "Midpoint Imputation")
## ===== Running models for: Midpoint Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_lower <- run_models(df_lower, "Lower Bound Imputation")
## ===== Running models for: Lower Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_upper <- run_models(df_upper, "Upper Bound Imputation")
## ===== Running models for: Upper Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_mi <- run_models(imputed_df_MI, "Multiple Imputation")
## ===== Running models for: Multiple Imputation =====
results_bayes1 <- run_models(combined_df_county_Bay1, "Bayesian Imputation 1")
## ===== Running models for: Bayesian Imputation 1 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_bayes2 <- run_models(combined_df_county_Bay2, "Bayesian Imputation 2")
## ===== Running models for: Bayesian Imputation 2 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(knitr)
# Combine AIC results from all model runs
model_summaries <- bind_rows(
results_md$summary,
results_lower$summary,
results_upper$summary,
results_mi$summary,
results_bayes1$summary,
results_bayes2$summary
)
# Print as a markdown table
kable(model_summaries, caption = "Model Fit Comparison: AIC Values")
| Method | Poisson_AIC | NegBin_AIC |
|---|---|---|
| Midpoint Imputation | 5554.32 | 4028.737 |
| Lower Bound Imputation | 5554.32 | 4028.737 |
| Upper Bound Imputation | 5554.32 | 4028.737 |
| Multiple Imputation | 5554.32 | 4028.737 |
| Bayesian Imputation 1 | 5554.32 | 4028.737 |
| Bayesian Imputation 2 | 5554.32 | 4028.737 |
The table summarizes AIC values for Poisson and Negative Binomial (NB) models across six different imputation methods for suppressed overdose death counts. Across all imputation strategies, the NB models consistently demonstrate lower AIC values compared to the Poisson models, indicating a better fit — particularly important given the overdispersion typically present in count data like overdose deaths. The best-fitting model (lowest AIC) was observed for: Multiple Imputation (MICE) Bayesian Imputation 1 Bayesian Imputation 2
All three had identical AICs for both Poisson and NB models (NB AIC = 4018.58).
library(dplyr)
# Helper function to extract and label model summaries
extract_model_summary <- function(model, method_label) {
coef_summary <- summary(model)$coefficients
as.data.frame(coef_summary) %>%
tibble::rownames_to_column("Term") %>%
mutate(Method = method_label)
}
# Extract coefficient summaries from each NegBin model
coef_mi <- extract_model_summary(results_mi$models$negbin, "Multiple Imputation")
coef_bayes1 <- extract_model_summary(results_bayes1$models$negbin, "Bayesian Imputation 1")
coef_bayes2 <- extract_model_summary(results_bayes2$models$negbin, "Bayesian Imputation 2")
# Combine all into one table
combined_coefs <- bind_rows(coef_mi, coef_bayes1, coef_bayes2)
# Optional: re-order for readability
combined_coefs <- combined_coefs %>%
dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)
filtered_coefs <- combined_coefs %>%
filter(Term %in% c("Conviction_Count", "Year", "Population")) %>%
dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)
# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
| Method | Term | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|---|
| Multiple Imputation | Conviction_Count | 0.011 | 0.008 | 1.436 | 0.151 |
| Multiple Imputation | Year | 0.074 | 0.002 | 34.196 | 0.000 |
| Bayesian Imputation 1 | Conviction_Count | 0.011 | 0.008 | 1.436 | 0.151 |
| Bayesian Imputation 1 | Year | 0.074 | 0.002 | 34.196 | 0.000 |
| Bayesian Imputation 2 | Conviction_Count | 0.011 | 0.008 | 1.436 | 0.151 |
| Bayesian Imputation 2 | Year | 0.074 | 0.002 | 34.196 | 0.000 |
combined_coefs %>%
filter(grepl("^County", Term)) %>%
summarize(
avg_effect = mean(Estimate),
max_effect = max(Estimate),
min_effect = min(Estimate)
)
## avg_effect max_effect min_effect
## 1 -0.4933146 0.09277219 -1.003417
Across all three imputation strategies:
Conviction_Count was not statistically significant (p ≈ 0.299), suggesting no strong evidence that convictions are associated with monthly overdose deaths at the county level after controlling for time and population.
#Testing Colinearity of Population and County
summary(lm(Population ~ County, data = combined_df_county_Bay1))
##
## Call:
## lm(formula = Population ~ County, data = combined_df_county_Bay1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74724 -5586 186 4683 80466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 268753 3504 76.708 < 2e-16 ***
## CountyBergen 646467 4955 130.472 < 2e-16 ***
## CountyBurlington 178114 4955 35.947 < 2e-16 ***
## CountyCamden 243642 4955 49.172 < 2e-16 ***
## CountyCape May -171674 4955 -34.648 < 2e-16 ***
## CountyCumberland -116336 4955 -23.479 < 2e-16 ***
## CountyEssex 529876 4955 106.941 < 2e-16 ***
## CountyGloucester 15276 4955 3.083 0.00216 **
## CountyHudson 379152 4955 76.522 < 2e-16 ***
## CountyHunterdon -142180 4955 -28.695 < 2e-16 ***
## CountyMercer 97651 4955 19.708 < 2e-16 ***
## CountyMiddlesex 541174 4955 109.221 < 2e-16 ***
## CountyMonmouth 358691 4955 72.392 < 2e-16 ***
## CountyMorris 222895 4955 44.985 < 2e-16 ***
## CountyOcean 309978 4955 62.561 < 2e-16 ***
## CountyPassaic 232437 4955 46.911 < 2e-16 ***
## CountySalem -204085 4955 -41.189 < 2e-16 ***
## CountySomerset 54227 4955 10.944 < 2e-16 ***
## CountySussex -122602 4955 -24.744 < 2e-16 ***
## CountyUnion 273983 4955 55.296 < 2e-16 ***
## CountyWarren -161313 4955 -32.557 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17520 on 504 degrees of freedom
## Multiple R-squared: 0.9953, Adjusted R-squared: 0.9951
## F-statistic: 5342 on 20 and 504 DF, p-value: < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Exclude County or reduce levels if needed
model <- glm(Deaths_num ~ County + offset (log(Population)), data = combined_df_county_Bay1, family = poisson)
I am going to run the models without Population next to avoid multicollinearity. (In the Bayesian imputation models — including one with County as a random intercept — we retained Population as a control for exposure size.)
run_models_2 <- function(data, label) {
message("===== Running models for: ", label, " =====")
# Ensure required variables are correctly formatted and drop missing values
data <- data %>%
mutate(
Deaths_num = as.numeric(Deaths),
Year = as.numeric(Year),
County = as.factor(County),
Conviction_Count = as.numeric(Conviction_Count)
) %>%
filter(
!is.na(Deaths_num),
!is.na(Conviction_Count)
)
# Fit Poisson regression
poisson_model_2 <- glm(
Deaths_num ~ Conviction_Count + Year + County,
data = data,
family = poisson
)
# Fit Negative Binomial regression
nb_model_2 <- MASS::glm.nb(
Deaths_num ~ Conviction_Count + Year + County,
data = data
)
# Return AIC summary and model objects
return(list(
summary = data.frame(
Method = label,
Poisson_AIC_2 = AIC(poisson_model_2),
NegBin_AIC_2 = AIC(nb_model_2)
),
models = list(
poisson = poisson_model_2,
negbin = nb_model_2
)
))
}
results_md2 <- run_models_2(combined_df_county_md, "Midpoint Imputation")
## ===== Running models for: Midpoint Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_lower2 <- run_models_2(df_lower, "Lower Bound Imputation")
## ===== Running models for: Lower Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_upper2 <- run_models_2(df_upper, "Upper Bound Imputation")
## ===== Running models for: Upper Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_mi2 <- run_models_2(imputed_df_MI, "Multiple Imputation")
## ===== Running models for: Multiple Imputation =====
results_bayes3 <- run_models_2(combined_df_county_Bay1, "Bayesian Imputation 1")
## ===== Running models for: Bayesian Imputation 1 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_bayes4 <- run_models_2(combined_df_county_Bay2, "Bayesian Imputation 2")
## ===== Running models for: Bayesian Imputation 2 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(knitr)
# Combine AIC results from all model runs
model_summaries_2 <- bind_rows(
results_md2$summary,
results_lower2$summary,
results_upper2$summary,
results_mi2$summary,
results_bayes3$summary,
results_bayes4$summary
)
# Print as a markdown table
kable(model_summaries_2, caption = "Model Fit Comparison: AIC Values")
| Method | Poisson_AIC_2 | NegBin_AIC_2 |
|---|---|---|
| Midpoint Imputation | 5477.669 | 4018.365 |
| Lower Bound Imputation | 5477.669 | 4018.365 |
| Upper Bound Imputation | 5477.669 | 4018.365 |
| Multiple Imputation | 5477.669 | 4018.365 |
| Bayesian Imputation 1 | 5477.669 | 4018.365 |
| Bayesian Imputation 2 | 5477.669 | 4018.365 |
library(dplyr)
# Extract coefficient summaries from each NegBin model
coef_mi2 <- extract_model_summary(results_mi2$models$negbin, "Multiple Imputation")
coef_bayes3 <- extract_model_summary(results_bayes3$models$negbin, "Bayesian Imputation 1")
coef_bayes4 <- extract_model_summary(results_bayes4$models$negbin, "Bayesian Imputation 2")
# Combine all into one table
combined_coefs2 <- bind_rows(coef_mi2, coef_bayes3, coef_bayes4)
# Optional: re-order for readability
combined_coefs2 <- combined_coefs2 %>%
dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)
filtered_coefs2 <- combined_coefs2 %>%
filter(Term %in% c("Conviction_Count", "Year")) %>%
dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)
# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
| Method | Term | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|---|
| Multiple Imputation | Conviction_Count | 0.010 | 0.008 | 1.237 | 0.216 |
| Multiple Imputation | Year | 0.077 | 0.002 | 36.129 | 0.000 |
| Bayesian Imputation 1 | Conviction_Count | 0.010 | 0.008 | 1.237 | 0.216 |
| Bayesian Imputation 1 | Year | 0.077 | 0.002 | 36.129 | 0.000 |
| Bayesian Imputation 2 | Conviction_Count | 0.010 | 0.008 | 1.237 | 0.216 |
| Bayesian Imputation 2 | Year | 0.077 | 0.002 | 36.129 | 0.000 |
The coefficient on Conviction_Count is not statistically significant in any of the top 3 models (p = 0.216). The model fit did not worsen (same AICs), and coefficients stayed nearly identical – after removing Population
Finally, I am going to try treating County and Year as factors.
run_models_3 <- function(data, label) {
message("===== Running models for: ", label, " =====")
# Ensure required variables are correctly formatted and drop missing values
data <- data %>%
mutate(
Deaths_num = as.numeric(Deaths),
Year = as.numeric(Year),
County = as.factor(County),
Conviction_Count = as.numeric(Conviction_Count)
) %>%
filter(
!is.na(Deaths_num),
!is.na(Conviction_Count)
)
# Fit Poisson regression
poisson_model_3 <- glm(
Deaths_num ~ Conviction_Count + factor(Year) + factor(County),
data = data,
family = poisson
)
# Fit Negative Binomial regression
nb_model_3 <- MASS::glm.nb(
Deaths_num ~ Conviction_Count + factor(Year) + factor(County),
data = data
)
# Return AIC summary and model objects
return(list(
summary = data.frame(
Method = label,
Poisson_AIC_2 = AIC(poisson_model_3),
NegBin_AIC_2 = AIC(nb_model_3)
),
models = list(
poisson = poisson_model_3,
negbin = nb_model_3
)
))
}
results_md3 <- run_models_3(combined_df_county_md, "Midpoint Imputation")
## ===== Running models for: Midpoint Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_lower3 <- run_models_3(df_lower, "Lower Bound Imputation")
## ===== Running models for: Lower Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_upper3 <- run_models_3(df_upper, "Upper Bound Imputation")
## ===== Running models for: Upper Bound Imputation =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_mi3 <- run_models_3(imputed_df_MI, "Multiple Imputation")
## ===== Running models for: Multiple Imputation =====
results_bayes5 <- run_models_3(combined_df_county_Bay1, "Bayesian Imputation 1")
## ===== Running models for: Bayesian Imputation 1 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
results_bayes6 <- run_models_3(combined_df_county_Bay2, "Bayesian Imputation 2")
## ===== Running models for: Bayesian Imputation 2 =====
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Deaths_num = as.numeric(Deaths)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(knitr)
# Combine AIC results from all model runs
model_summaries_4 <- bind_rows(
results_md3$summary,
results_lower3$summary,
results_upper3$summary,
results_mi3$summary,
results_bayes5$summary,
results_bayes6$summary
)
# Print as a markdown table
kable(model_summaries_4, caption = "Model Fit Comparison: AIC Values")
| Method | Poisson_AIC_2 | NegBin_AIC_2 |
|---|---|---|
| Midpoint Imputation | 4073.933 | 3734.332 |
| Lower Bound Imputation | 4073.933 | 3734.332 |
| Upper Bound Imputation | 4073.933 | 3734.332 |
| Multiple Imputation | 4073.933 | 3734.332 |
| Bayesian Imputation 1 | 4073.933 | 3734.332 |
| Bayesian Imputation 2 | 4073.933 | 3734.332 |
library(dplyr)
# Extract coefficient summaries from each NegBin model
coef_mi3 <- extract_model_summary(results_mi3$models$negbin, "Multiple Imputation")
coef_bayes5 <- extract_model_summary(results_bayes5$models$negbin, "Bayesian Imputation 1")
coef_bayes6 <- extract_model_summary(results_bayes6$models$negbin, "Bayesian Imputation 2")
# Combine all into one table
combined_coefs2 <- bind_rows(coef_mi3, coef_bayes5, coef_bayes6)
# Optional: re-order for readability
combined_coefs2 <- combined_coefs2 %>%
dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)
filtered_coefs2 <- combined_coefs2 %>%
filter(Term %in% c("Conviction_Count", "Year")) %>%
dplyr::select(Method, Term, Estimate, `Std. Error`, `z value`, `Pr(>|z|)`)
# Optional: render nicely in RMarkdown
knitr::kable(filtered_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
| Method | Term | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|---|
| Multiple Imputation | Conviction_Count | -0.002 | 0.005 | -0.446 | 0.656 |
| Bayesian Imputation 1 | Conviction_Count | -0.002 | 0.005 | -0.446 | 0.656 |
| Bayesian Imputation 2 | Conviction_Count | -0.002 | 0.005 | -0.446 | 0.656 |
knitr::kable(combined_coefs2, digits = 3, caption = "Negative Binomial Model Coefficients: Top Imputation Methods")
| Method | Term | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|---|
| Multiple Imputation | (Intercept) | 3.411 | 0.073 | 46.731 | 0.000 |
| Multiple Imputation | Conviction_Count | -0.002 | 0.005 | -0.446 | 0.656 |
| Multiple Imputation | factor(Year)2000 | 0.145 | 0.084 | 1.719 | 0.086 |
| Multiple Imputation | factor(Year)2001 | 0.204 | 0.083 | 2.475 | 0.013 |
| Multiple Imputation | factor(Year)2002 | 0.362 | 0.081 | 4.454 | 0.000 |
| Multiple Imputation | factor(Year)2003 | 0.240 | 0.082 | 2.931 | 0.003 |
| Multiple Imputation | factor(Year)2004 | 0.031 | 0.084 | 0.363 | 0.717 |
| Multiple Imputation | factor(Year)2005 | 0.346 | 0.080 | 4.311 | 0.000 |
| Multiple Imputation | factor(Year)2006 | 0.379 | 0.080 | 4.731 | 0.000 |
| Multiple Imputation | factor(Year)2007 | 0.236 | 0.082 | 2.867 | 0.004 |
| Multiple Imputation | factor(Year)2008 | 0.231 | 0.083 | 2.794 | 0.005 |
| Multiple Imputation | factor(Year)2009 | -0.359 | 0.114 | -3.153 | 0.002 |
| Multiple Imputation | factor(Year)2010 | 0.437 | 0.080 | 5.456 | 0.000 |
| Multiple Imputation | factor(Year)2011 | 0.568 | 0.078 | 7.255 | 0.000 |
| Multiple Imputation | factor(Year)2012 | 0.747 | 0.077 | 9.683 | 0.000 |
| Multiple Imputation | factor(Year)2013 | 0.818 | 0.077 | 10.614 | 0.000 |
| Multiple Imputation | factor(Year)2014 | 0.774 | 0.077 | 10.080 | 0.000 |
| Multiple Imputation | factor(Year)2015 | 0.940 | 0.076 | 12.382 | 0.000 |
| Multiple Imputation | factor(Year)2016 | 1.276 | 0.075 | 17.115 | 0.000 |
| Multiple Imputation | factor(Year)2017 | 1.545 | 0.074 | 20.940 | 0.000 |
| Multiple Imputation | factor(Year)2018 | 1.632 | 0.073 | 22.220 | 0.000 |
| Multiple Imputation | factor(Year)2019 | 1.585 | 0.074 | 21.356 | 0.000 |
| Multiple Imputation | factor(Year)2020 | 1.605 | 0.074 | 21.814 | 0.000 |
| Multiple Imputation | factor(Year)2021 | 1.663 | 0.074 | 22.586 | 0.000 |
| Multiple Imputation | factor(Year)2022 | 1.627 | 0.075 | 21.808 | 0.000 |
| Multiple Imputation | factor(Year)2023 | 1.514 | 0.075 | 20.266 | 0.000 |
| Multiple Imputation | factor(County)Bergen | 0.226 | 0.058 | 3.911 | 0.000 |
| Multiple Imputation | factor(County)Burlington | 0.071 | 0.057 | 1.244 | 0.213 |
| Multiple Imputation | factor(County)Camden | 0.718 | 0.056 | 12.886 | 0.000 |
| Multiple Imputation | factor(County)Cape May | -1.041 | 0.071 | -14.590 | 0.000 |
| Multiple Imputation | factor(County)Cumberland | -0.775 | 0.067 | -11.645 | 0.000 |
| Multiple Imputation | factor(County)Essex | 0.733 | 0.056 | 13.046 | 0.000 |
| Multiple Imputation | factor(County)Gloucester | 0.011 | 0.058 | 0.187 | 0.852 |
| Multiple Imputation | factor(County)Hudson | 0.098 | 0.059 | 1.675 | 0.094 |
| Multiple Imputation | factor(County)Hunterdon | -1.877 | 0.096 | -19.524 | 0.000 |
| Multiple Imputation | factor(County)Mercer | -0.377 | 0.061 | -6.166 | 0.000 |
| Multiple Imputation | factor(County)Middlesex | 0.371 | 0.056 | 6.619 | 0.000 |
| Multiple Imputation | factor(County)Monmouth | 0.236 | 0.057 | 4.132 | 0.000 |
| Multiple Imputation | factor(County)Morris | -0.357 | 0.060 | -5.900 | 0.000 |
| Multiple Imputation | factor(County)Ocean | 0.399 | 0.056 | 7.063 | 0.000 |
| Multiple Imputation | factor(County)Passaic | -0.062 | 0.059 | -1.060 | 0.289 |
| Multiple Imputation | factor(County)Salem | -1.488 | 0.082 | -18.202 | 0.000 |
| Multiple Imputation | factor(County)Somerset | -0.882 | 0.066 | -13.350 | 0.000 |
| Multiple Imputation | factor(County)Sussex | -1.205 | 0.072 | -16.727 | 0.000 |
| Multiple Imputation | factor(County)Union | -0.167 | 0.060 | -2.798 | 0.005 |
| Multiple Imputation | factor(County)Warren | -1.438 | 0.080 | -17.958 | 0.000 |
| Bayesian Imputation 1 | (Intercept) | 3.411 | 0.073 | 46.731 | 0.000 |
| Bayesian Imputation 1 | Conviction_Count | -0.002 | 0.005 | -0.446 | 0.656 |
| Bayesian Imputation 1 | factor(Year)2000 | 0.145 | 0.084 | 1.719 | 0.086 |
| Bayesian Imputation 1 | factor(Year)2001 | 0.204 | 0.083 | 2.475 | 0.013 |
| Bayesian Imputation 1 | factor(Year)2002 | 0.362 | 0.081 | 4.454 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2003 | 0.240 | 0.082 | 2.931 | 0.003 |
| Bayesian Imputation 1 | factor(Year)2004 | 0.031 | 0.084 | 0.363 | 0.717 |
| Bayesian Imputation 1 | factor(Year)2005 | 0.346 | 0.080 | 4.311 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2006 | 0.379 | 0.080 | 4.731 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2007 | 0.236 | 0.082 | 2.867 | 0.004 |
| Bayesian Imputation 1 | factor(Year)2008 | 0.231 | 0.083 | 2.794 | 0.005 |
| Bayesian Imputation 1 | factor(Year)2009 | -0.359 | 0.114 | -3.153 | 0.002 |
| Bayesian Imputation 1 | factor(Year)2010 | 0.437 | 0.080 | 5.456 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2011 | 0.568 | 0.078 | 7.255 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2012 | 0.747 | 0.077 | 9.683 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2013 | 0.818 | 0.077 | 10.614 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2014 | 0.774 | 0.077 | 10.080 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2015 | 0.940 | 0.076 | 12.382 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2016 | 1.276 | 0.075 | 17.115 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2017 | 1.545 | 0.074 | 20.940 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2018 | 1.632 | 0.073 | 22.220 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2019 | 1.585 | 0.074 | 21.356 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2020 | 1.605 | 0.074 | 21.814 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2021 | 1.663 | 0.074 | 22.586 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2022 | 1.627 | 0.075 | 21.808 | 0.000 |
| Bayesian Imputation 1 | factor(Year)2023 | 1.514 | 0.075 | 20.266 | 0.000 |
| Bayesian Imputation 1 | factor(County)Bergen | 0.226 | 0.058 | 3.911 | 0.000 |
| Bayesian Imputation 1 | factor(County)Burlington | 0.071 | 0.057 | 1.244 | 0.213 |
| Bayesian Imputation 1 | factor(County)Camden | 0.718 | 0.056 | 12.886 | 0.000 |
| Bayesian Imputation 1 | factor(County)Cape May | -1.041 | 0.071 | -14.590 | 0.000 |
| Bayesian Imputation 1 | factor(County)Cumberland | -0.775 | 0.067 | -11.645 | 0.000 |
| Bayesian Imputation 1 | factor(County)Essex | 0.733 | 0.056 | 13.046 | 0.000 |
| Bayesian Imputation 1 | factor(County)Gloucester | 0.011 | 0.058 | 0.187 | 0.852 |
| Bayesian Imputation 1 | factor(County)Hudson | 0.098 | 0.059 | 1.675 | 0.094 |
| Bayesian Imputation 1 | factor(County)Hunterdon | -1.877 | 0.096 | -19.524 | 0.000 |
| Bayesian Imputation 1 | factor(County)Mercer | -0.377 | 0.061 | -6.166 | 0.000 |
| Bayesian Imputation 1 | factor(County)Middlesex | 0.371 | 0.056 | 6.619 | 0.000 |
| Bayesian Imputation 1 | factor(County)Monmouth | 0.236 | 0.057 | 4.132 | 0.000 |
| Bayesian Imputation 1 | factor(County)Morris | -0.357 | 0.060 | -5.900 | 0.000 |
| Bayesian Imputation 1 | factor(County)Ocean | 0.399 | 0.056 | 7.063 | 0.000 |
| Bayesian Imputation 1 | factor(County)Passaic | -0.062 | 0.059 | -1.060 | 0.289 |
| Bayesian Imputation 1 | factor(County)Salem | -1.488 | 0.082 | -18.202 | 0.000 |
| Bayesian Imputation 1 | factor(County)Somerset | -0.882 | 0.066 | -13.350 | 0.000 |
| Bayesian Imputation 1 | factor(County)Sussex | -1.205 | 0.072 | -16.727 | 0.000 |
| Bayesian Imputation 1 | factor(County)Union | -0.167 | 0.060 | -2.798 | 0.005 |
| Bayesian Imputation 1 | factor(County)Warren | -1.438 | 0.080 | -17.958 | 0.000 |
| Bayesian Imputation 2 | (Intercept) | 3.411 | 0.073 | 46.731 | 0.000 |
| Bayesian Imputation 2 | Conviction_Count | -0.002 | 0.005 | -0.446 | 0.656 |
| Bayesian Imputation 2 | factor(Year)2000 | 0.145 | 0.084 | 1.719 | 0.086 |
| Bayesian Imputation 2 | factor(Year)2001 | 0.204 | 0.083 | 2.475 | 0.013 |
| Bayesian Imputation 2 | factor(Year)2002 | 0.362 | 0.081 | 4.454 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2003 | 0.240 | 0.082 | 2.931 | 0.003 |
| Bayesian Imputation 2 | factor(Year)2004 | 0.031 | 0.084 | 0.363 | 0.717 |
| Bayesian Imputation 2 | factor(Year)2005 | 0.346 | 0.080 | 4.311 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2006 | 0.379 | 0.080 | 4.731 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2007 | 0.236 | 0.082 | 2.867 | 0.004 |
| Bayesian Imputation 2 | factor(Year)2008 | 0.231 | 0.083 | 2.794 | 0.005 |
| Bayesian Imputation 2 | factor(Year)2009 | -0.359 | 0.114 | -3.153 | 0.002 |
| Bayesian Imputation 2 | factor(Year)2010 | 0.437 | 0.080 | 5.456 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2011 | 0.568 | 0.078 | 7.255 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2012 | 0.747 | 0.077 | 9.683 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2013 | 0.818 | 0.077 | 10.614 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2014 | 0.774 | 0.077 | 10.080 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2015 | 0.940 | 0.076 | 12.382 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2016 | 1.276 | 0.075 | 17.115 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2017 | 1.545 | 0.074 | 20.940 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2018 | 1.632 | 0.073 | 22.220 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2019 | 1.585 | 0.074 | 21.356 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2020 | 1.605 | 0.074 | 21.814 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2021 | 1.663 | 0.074 | 22.586 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2022 | 1.627 | 0.075 | 21.808 | 0.000 |
| Bayesian Imputation 2 | factor(Year)2023 | 1.514 | 0.075 | 20.266 | 0.000 |
| Bayesian Imputation 2 | factor(County)Bergen | 0.226 | 0.058 | 3.911 | 0.000 |
| Bayesian Imputation 2 | factor(County)Burlington | 0.071 | 0.057 | 1.244 | 0.213 |
| Bayesian Imputation 2 | factor(County)Camden | 0.718 | 0.056 | 12.886 | 0.000 |
| Bayesian Imputation 2 | factor(County)Cape May | -1.041 | 0.071 | -14.590 | 0.000 |
| Bayesian Imputation 2 | factor(County)Cumberland | -0.775 | 0.067 | -11.645 | 0.000 |
| Bayesian Imputation 2 | factor(County)Essex | 0.733 | 0.056 | 13.046 | 0.000 |
| Bayesian Imputation 2 | factor(County)Gloucester | 0.011 | 0.058 | 0.187 | 0.852 |
| Bayesian Imputation 2 | factor(County)Hudson | 0.098 | 0.059 | 1.675 | 0.094 |
| Bayesian Imputation 2 | factor(County)Hunterdon | -1.877 | 0.096 | -19.524 | 0.000 |
| Bayesian Imputation 2 | factor(County)Mercer | -0.377 | 0.061 | -6.166 | 0.000 |
| Bayesian Imputation 2 | factor(County)Middlesex | 0.371 | 0.056 | 6.619 | 0.000 |
| Bayesian Imputation 2 | factor(County)Monmouth | 0.236 | 0.057 | 4.132 | 0.000 |
| Bayesian Imputation 2 | factor(County)Morris | -0.357 | 0.060 | -5.900 | 0.000 |
| Bayesian Imputation 2 | factor(County)Ocean | 0.399 | 0.056 | 7.063 | 0.000 |
| Bayesian Imputation 2 | factor(County)Passaic | -0.062 | 0.059 | -1.060 | 0.289 |
| Bayesian Imputation 2 | factor(County)Salem | -1.488 | 0.082 | -18.202 | 0.000 |
| Bayesian Imputation 2 | factor(County)Somerset | -0.882 | 0.066 | -13.350 | 0.000 |
| Bayesian Imputation 2 | factor(County)Sussex | -1.205 | 0.072 | -16.727 | 0.000 |
| Bayesian Imputation 2 | factor(County)Union | -0.167 | 0.060 | -2.798 | 0.005 |
| Bayesian Imputation 2 | factor(County)Warren | -1.438 | 0.080 | -17.958 | 0.000 |
The Negative Binomial model with County and Year treated as factors are the best model fits, but Conviction_Count is still not statistically significant. Now let’s do a time lag.
library(dplyr)
# Create 1-year lagged variables by County
combined_df_county_lagged <- combined_df_county %>%
arrange(County, Year) %>%
group_by(County) %>%
mutate(
Conviction_Count_lag1 = lag(Conviction_Count, 1),
Arrest_Count_lag1 = lag(Arrest_Count, 1)
) %>%
ungroup()
# Load required package
library(dplyr)
# Define function to add lagged conviction count
lag_and_return <- function(df) {
df %>%
arrange(County, Year) %>%
group_by(County) %>%
mutate(
Conviction_Count_lag1 = lag(Conviction_Count, 1)
) %>%
ungroup()
}
# Apply function to each dataset
combined_df_county_md_lag <- lag_and_return(combined_df_county_md)
df_lower_lag <- lag_and_return(df_lower)
df_upper_lag <- lag_and_return(df_upper)
imputed_df_MI_lag <- lag_and_return(imputed_df_MI)
combined_df_county_Bay1_lag <- lag_and_return(combined_df_county_Bay1)
combined_df_county_Bay2_lag <- lag_and_return(combined_df_county_Bay2)
library(MASS)
library(dplyr)
run_lagged_models <- function(data, label) {
cat("\n=== Running Models for:", label, "===\n")
# Fit Poisson model
poisson_model <- glm(
Deaths_num ~ Conviction_Count_lag1 + County + Year,
family = poisson,
data = data
)
# Calculate dispersion
dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
cat("Overdispersion ratio:", round(dispersion, 2), "\n")
# Fit Negative Binomial model
nb_model <- glm.nb(
Deaths_num ~ Conviction_Count_lag1 + County + Year,
data = data
)
return(list(
label = label,
poisson_model = poisson_model,
nb_model = nb_model,
poisson_summary = summary(poisson_model),
nb_summary = summary(nb_model),
poisson_AIC = AIC(poisson_model),
nb_AIC = AIC(nb_model),
dispersion = dispersion
))
}
# Run models
results_md_lag <- run_lagged_models(combined_df_county_md_lag, "Midpoint Imputation")
##
## === Running Models for: Midpoint Imputation ===
## Overdispersion ratio: 6.77
results_lower_lag <- run_lagged_models(df_lower_lag, "Lower Bound Imputation")
##
## === Running Models for: Lower Bound Imputation ===
## Overdispersion ratio: 7.48
results_upper_lag <- run_lagged_models(df_upper_lag, "Upper Bound Imputation")
##
## === Running Models for: Upper Bound Imputation ===
## Overdispersion ratio: 6.45
results_mi_lag <- run_lagged_models(imputed_df_MI_lag, "Multiple Imputation")
##
## === Running Models for: Multiple Imputation ===
## Overdispersion ratio: 6.9
results_bayes1_lag <- run_lagged_models(combined_df_county_Bay1_lag, "Bayesian Imputation 1")
##
## === Running Models for: Bayesian Imputation 1 ===
## Overdispersion ratio: 6.47
results_bayes2_lag <- run_lagged_models(combined_df_county_Bay2_lag, "Bayesian Imputation 2")
##
## === Running Models for: Bayesian Imputation 2 ===
## Overdispersion ratio: 6.45
library(knitr)
lagged_model_summary <- bind_rows(
data.frame(Method = results_md_lag$label, Poisson_AIC = results_md_lag$poisson_AIC, NegBin_AIC = results_md_lag$nb_AIC),
data.frame(Method = results_lower_lag$label, Poisson_AIC = results_lower_lag$poisson_AIC, NegBin_AIC = results_lower_lag$nb_AIC),
data.frame(Method = results_upper_lag$label, Poisson_AIC = results_upper_lag$poisson_AIC, NegBin_AIC = results_upper_lag$nb_AIC),
data.frame(Method = results_mi_lag$label, Poisson_AIC = results_mi_lag$poisson_AIC, NegBin_AIC = results_mi_lag$nb_AIC),
data.frame(Method = results_bayes1_lag$label, Poisson_AIC = results_bayes1_lag$poisson_AIC, NegBin_AIC = results_bayes1_lag$nb_AIC),
data.frame(Method = results_bayes2_lag$label, Poisson_AIC = results_bayes2_lag$poisson_AIC, NegBin_AIC = results_bayes2_lag$nb_AIC)
)
kable(lagged_model_summary, caption = "Model AICs with Lagged Conviction Count (1-Year Lag)")
| Method | Poisson_AIC | NegBin_AIC |
|---|---|---|
| Midpoint Imputation | 6392.913 | 4382.249 |
| Lower Bound Imputation | 6896.009 | 4568.021 |
| Upper Bound Imputation | 6193.911 | 4329.119 |
| Multiple Imputation | 6501.502 | 4430.688 |
| Bayesian Imputation 1 | 6202.982 | 4332.581 |
| Bayesian Imputation 2 | 6183.374 | 4321.602 |
library(dplyr)
library(knitr)
library(broom)
# function to extract coefficients from a model
extract_lagged_coef2 <- function(model, method_label) {
tidy(model) %>%
filter(term == "Conviction_Count_lag1") %>%
mutate(Method = method_label) %>%
dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}
# Helper function to extract coefficients from a model
extract_lagged_coef <- function(model, method_label) {
tidy(model) %>%
filter(term == "Conviction_Count_lag1") %>%
mutate(Method = method_label) %>%
dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}
# Apply to all models
coef_md_lag <- extract_lagged_coef(results_md_lag$nb_model, "Midpoint Imputation")
coef_lower_lag <- extract_lagged_coef(results_lower_lag$nb_model, "Lower Bound Imputation")
coef_upper_lag <- extract_lagged_coef(results_upper_lag$nb_model, "Upper Bound Imputation")
coef_mi_lag <- extract_lagged_coef(results_mi_lag$nb_model, "Multiple Imputation")
coef_bayes1_lag <- extract_lagged_coef(results_bayes1_lag$nb_model, "Bayesian Imputation 1")
coef_bayes2_lag <- extract_lagged_coef(results_bayes2_lag$nb_model, "Bayesian Imputation 2")
# Combine all into one table
combined_lagged_coefs <- bind_rows(
coef_md_lag,
coef_lower_lag,
coef_upper_lag,
coef_mi_lag,
coef_bayes1_lag,
coef_bayes2_lag
)
# Pretty print
kable(combined_lagged_coefs, digits = 3, caption = "Effect of 1-Year Lagged Conviction Count on Overdose Deaths (Negative Binomial Models)")
| Method | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Midpoint Imputation | Conviction_Count_lag1 | 0.009 | 0.011 | 0.814 | 0.415 |
| Lower Bound Imputation | Conviction_Count_lag1 | 0.007 | 0.014 | 0.501 | 0.616 |
| Upper Bound Imputation | Conviction_Count_lag1 | 0.010 | 0.010 | 1.018 | 0.309 |
| Multiple Imputation | Conviction_Count_lag1 | 0.010 | 0.011 | 0.884 | 0.377 |
| Bayesian Imputation 1 | Conviction_Count_lag1 | 0.010 | 0.010 | 1.017 | 0.309 |
| Bayesian Imputation 2 | Conviction_Count_lag1 | 0.009 | 0.010 | 0.915 | 0.360 |
library(dplyr)
library(knitr)
library(broom)
# Helper function to extract all coefficients from a model and label them
extract_all_coefs <- function(model, method_label) {
tidy(model) %>%
mutate(Method = method_label) %>%
dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}
# Apply to all lagged Negative Binomial models
coef_md_lag_all <- extract_all_coefs(results_md_lag$nb_model, "Midpoint Imputation")
coef_lower_lag_all <- extract_all_coefs(results_lower_lag$nb_model, "Lower Bound Imputation")
coef_upper_lag_all <- extract_all_coefs(results_upper_lag$nb_model, "Upper Bound Imputation")
coef_mi_lag_all <- extract_all_coefs(results_mi_lag$nb_model, "Multiple Imputation")
coef_bayes1_lag_all <- extract_all_coefs(results_bayes1_lag$nb_model, "Bayesian Imputation 1")
coef_bayes2_lag_all <- extract_all_coefs(results_bayes2_lag$nb_model, "Bayesian Imputation 2")
# Combine into one big table
combined_all_coefs <- bind_rows(
coef_md_lag_all,
coef_lower_lag_all,
coef_upper_lag_all,
coef_mi_lag_all,
coef_bayes1_lag_all,
coef_bayes2_lag_all
)
# Optional: Print (first 20 rows only if table is too large)
kable(
head(combined_all_coefs, 20),
digits = 3,
caption = "Sample of Coefficients from Lagged Negative Binomial Models"
)
| Method | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Midpoint Imputation | (Intercept) | -163.886 | 5.255 | -31.186 | 0.000 |
| Midpoint Imputation | Conviction_Count_lag1 | 0.009 | 0.011 | 0.814 | 0.415 |
| Midpoint Imputation | CountyBergen | 0.268 | 0.108 | 2.489 | 0.013 |
| Midpoint Imputation | CountyBurlington | 0.075 | 0.107 | 0.696 | 0.486 |
| Midpoint Imputation | CountyCamden | 0.750 | 0.107 | 7.031 | 0.000 |
| Midpoint Imputation | CountyCape May | -1.187 | 0.114 | -10.410 | 0.000 |
| Midpoint Imputation | CountyCumberland | -0.816 | 0.113 | -7.249 | 0.000 |
| Midpoint Imputation | CountyEssex | 0.784 | 0.107 | 7.336 | 0.000 |
| Midpoint Imputation | CountyGloucester | 0.050 | 0.107 | 0.463 | 0.643 |
| Midpoint Imputation | CountyHudson | 0.138 | 0.108 | 1.279 | 0.201 |
| Midpoint Imputation | CountyHunterdon | -1.927 | 0.124 | -15.497 | 0.000 |
| Midpoint Imputation | CountyMercer | -0.367 | 0.110 | -3.335 | 0.001 |
| Midpoint Imputation | CountyMiddlesex | 0.394 | 0.107 | 3.697 | 0.000 |
| Midpoint Imputation | CountyMonmouth | 0.286 | 0.108 | 2.664 | 0.008 |
| Midpoint Imputation | CountyMorris | -0.307 | 0.109 | -2.820 | 0.005 |
| Midpoint Imputation | CountyOcean | 0.396 | 0.106 | 3.723 | 0.000 |
| Midpoint Imputation | CountyPassaic | -0.021 | 0.108 | -0.193 | 0.847 |
| Midpoint Imputation | CountySalem | -1.566 | 0.120 | -13.086 | 0.000 |
| Midpoint Imputation | CountySomerset | -0.837 | 0.112 | -7.461 | 0.000 |
| Midpoint Imputation | CountySussex | -1.218 | 0.116 | -10.539 | 0.000 |
run_arrest_models <- function(data, label) {
cat("\n=== Running Models for:", label, "===\n")
# Poisson
poisson_model <- glm(
Deaths_num ~ Arrest_Count + factor(County) + factor(Year),
family = poisson,
data = data
)
# Check overdispersion
dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
cat("Overdispersion ratio (Poisson):", round(dispersion, 2), "\n")
# Negative Binomial
nb_model <- glm.nb(
Deaths_num ~ Arrest_Count + factor(County) + factor(Year),
data = data
)
# AIC summary as a data frame
model_summary <- data.frame(
Method = label,
Poisson_AIC = AIC(poisson_model),
NegBin_AIC = AIC(nb_model),
Overdispersion = round(dispersion, 2)
)
return(list(
label = label,
poisson_model = poisson_model,
nb_model = nb_model,
poisson_summary = summary(poisson_model),
nb_summary = summary(nb_model),
summary = model_summary,
dispersion = dispersion
))
}
# 1. Filter datasets for Year >= 2017
combined_df_county_md_2017 <- combined_df_county_md %>% filter(Year >= 2017)
df_lower_2017 <- df_lower %>% filter(Year >= 2017)
df_upper_2017 <- df_upper %>% filter(Year >= 2017)
imputed_df_2017 <- imputed_df_MI %>% filter(Year >= 2017)
combined_df_county_Bay1_2017 <- combined_df_county_Bay1 %>% filter(Year >= 2017)
combined_df_county_Bay2_2017 <- combined_df_county_Bay2 %>% filter(Year >= 2017)
# 2. Run models
results_md_arrest <- run_arrest_models(combined_df_county_md_2017, "Midpoint Imputation")
##
## === Running Models for: Midpoint Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_lower_arrest <- run_arrest_models(df_lower_2017, "Lower Bound Imputation")
##
## === Running Models for: Lower Bound Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_upper_arrest <- run_arrest_models(df_upper_2017, "Upper Bound Imputation")
##
## === Running Models for: Upper Bound Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_mi_arrest <- run_arrest_models(imputed_df_2017, "Multiple Imputation")
##
## === Running Models for: Multiple Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_bayes1_arrest <- run_arrest_models(combined_df_county_Bay1_2017, "Bayesian Imputation 1")
##
## === Running Models for: Bayesian Imputation 1 ===
## Overdispersion ratio (Poisson): 2.26
results_bayes2_arrest <- run_arrest_models(combined_df_county_Bay2_2017, "Bayesian Imputation 2")
##
## === Running Models for: Bayesian Imputation 2 ===
## Overdispersion ratio (Poisson): 2.26
library(dplyr)
library(knitr)
# Combine AIC results from all model runs
model_summaries_24 <- bind_rows(
results_md_arrest$summary,
results_lower_arrest$summary,
results_upper_arrest$summary,
results_mi_arrest$summary,
results_bayes1_arrest$summary,
results_bayes2_arrest$summary
)
# Print as a markdown table
kable(model_summaries_24, caption = "Model Fit Comparison: AIC Values")
| Method | Poisson_AIC | NegBin_AIC | Overdispersion |
|---|---|---|---|
| Midpoint Imputation | 1279.785 | 1241.888 | 2.26 |
| Lower Bound Imputation | 1279.785 | 1241.888 | 2.26 |
| Upper Bound Imputation | 1279.785 | 1241.888 | 2.26 |
| Multiple Imputation | 1279.785 | 1241.888 | 2.26 |
| Bayesian Imputation 1 | 1279.785 | 1241.888 | 2.26 |
| Bayesian Imputation 2 | 1279.785 | 1241.888 | 2.26 |
library(dplyr)
library(broom)
library(knitr)
# List of results
all_arrest_models <- list(
results_md_arrest,
results_lower_arrest,
results_upper_arrest,
results_mi_arrest,
results_bayes1_arrest,
results_bayes2_arrest
)
# Helper to extract and label coefficients from Poisson and NB models
extract_coefs <- function(model_list) {
label <- model_list$label
# Extract tidy summaries
poisson_coefs <- broom::tidy(model_list$poisson_model) %>%
mutate(Model = "Poisson", Method = label)
nb_coefs <- broom::tidy(model_list$nb_model) %>%
mutate(Model = "NegBin", Method = label)
bind_rows(poisson_coefs, nb_coefs)
}
# Combine all coefficients
combined_coefs55 <- purrr::map_df(all_arrest_models, extract_coefs)
# Optional: Filter to terms of interest (e.g., Arrest_Count)
arrest_coefs <- combined_coefs55 %>%
filter(term == "Arrest_Count") %>%
dplyr::select(Method, Model, term, estimate, std.error, statistic, p.value)
# Print the results in a nice table
kable(combined_coefs55, digits = 3, caption = "Arrest Data Coefficients Across Models")
| term | estimate | std.error | statistic | p.value | Model | Method |
|---|---|---|---|---|---|---|
| (Intercept) | 5.022 | 0.040 | 127.046 | 0.000 | Poisson | Midpoint Imputation |
| Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 | Poisson | Midpoint Imputation |
| factor(County)Bergen | -0.001 | 0.045 | -0.027 | 0.978 | Poisson | Midpoint Imputation |
| factor(County)Burlington | 0.000 | 0.042 | -0.011 | 0.992 | Poisson | Midpoint Imputation |
| factor(County)Camden | 0.604 | 0.041 | 14.838 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Cape May | -1.146 | 0.061 | -18.758 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Cumberland | -0.654 | 0.054 | -12.226 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Essex | 0.733 | 0.039 | 18.691 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Gloucester | -0.132 | 0.046 | -2.887 | 0.004 | Poisson | Midpoint Imputation |
| factor(County)Hudson | 0.047 | 0.045 | 1.047 | 0.295 | Poisson | Midpoint Imputation |
| factor(County)Hunterdon | -2.136 | 0.093 | -22.921 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Mercer | -0.279 | 0.048 | -5.754 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Middlesex | 0.298 | 0.040 | 7.393 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Monmouth | 0.127 | 0.043 | 2.990 | 0.003 | Poisson | Midpoint Imputation |
| factor(County)Morris | -0.549 | 0.051 | -10.795 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Ocean | 0.316 | 0.040 | 7.937 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Passaic | -0.140 | 0.046 | -3.059 | 0.002 | Poisson | Midpoint Imputation |
| factor(County)Salem | -1.593 | 0.075 | -21.377 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Somerset | -1.063 | 0.061 | -17.324 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Sussex | -1.361 | 0.068 | -19.916 | 0.000 | Poisson | Midpoint Imputation |
| factor(County)Union | -0.068 | 0.046 | -1.474 | 0.140 | Poisson | Midpoint Imputation |
| factor(County)Warren | -1.571 | 0.074 | -21.268 | 0.000 | Poisson | Midpoint Imputation |
| factor(Year)2018 | 0.078 | 0.027 | 2.922 | 0.003 | Poisson | Midpoint Imputation |
| factor(Year)2019 | 0.045 | 0.027 | 1.678 | 0.093 | Poisson | Midpoint Imputation |
| factor(Year)2020 | 0.055 | 0.027 | 2.022 | 0.043 | Poisson | Midpoint Imputation |
| factor(Year)2021 | 0.128 | 0.027 | 4.815 | 0.000 | Poisson | Midpoint Imputation |
| factor(Year)2022 | 0.104 | 0.027 | 3.910 | 0.000 | Poisson | Midpoint Imputation |
| factor(Year)2023 | -0.002 | 0.028 | -0.069 | 0.945 | Poisson | Midpoint Imputation |
| (Intercept) | 5.036 | 0.055 | 90.812 | 0.000 | NegBin | Midpoint Imputation |
| Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 | NegBin | Midpoint Imputation |
| factor(County)Bergen | -0.008 | 0.062 | -0.136 | 0.892 | NegBin | Midpoint Imputation |
| factor(County)Burlington | 0.000 | 0.059 | 0.001 | 0.999 | NegBin | Midpoint Imputation |
| factor(County)Camden | 0.599 | 0.060 | 9.935 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Cape May | -1.148 | 0.074 | -15.584 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Cumberland | -0.659 | 0.069 | -9.506 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Essex | 0.730 | 0.059 | 12.426 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Gloucester | -0.136 | 0.063 | -2.157 | 0.031 | NegBin | Midpoint Imputation |
| factor(County)Hudson | 0.041 | 0.064 | 0.650 | 0.515 | NegBin | Midpoint Imputation |
| factor(County)Hunterdon | -2.141 | 0.103 | -20.760 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Mercer | -0.286 | 0.066 | -4.348 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Middlesex | 0.294 | 0.058 | 5.073 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Monmouth | 0.123 | 0.060 | 2.043 | 0.041 | NegBin | Midpoint Imputation |
| factor(County)Morris | -0.553 | 0.066 | -8.322 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Ocean | 0.314 | 0.058 | 5.458 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Passaic | -0.145 | 0.063 | -2.306 | 0.021 | NegBin | Midpoint Imputation |
| factor(County)Salem | -1.599 | 0.087 | -18.422 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Somerset | -1.070 | 0.076 | -14.095 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Sussex | -1.367 | 0.082 | -16.734 | 0.000 | NegBin | Midpoint Imputation |
| factor(County)Union | -0.075 | 0.064 | -1.169 | 0.243 | NegBin | Midpoint Imputation |
| factor(County)Warren | -1.577 | 0.086 | -18.285 | 0.000 | NegBin | Midpoint Imputation |
| factor(Year)2018 | 0.083 | 0.037 | 2.228 | 0.026 | NegBin | Midpoint Imputation |
| factor(Year)2019 | 0.042 | 0.038 | 1.110 | 0.267 | NegBin | Midpoint Imputation |
| factor(Year)2020 | 0.057 | 0.038 | 1.513 | 0.130 | NegBin | Midpoint Imputation |
| factor(Year)2021 | 0.120 | 0.037 | 3.204 | 0.001 | NegBin | Midpoint Imputation |
| factor(Year)2022 | 0.083 | 0.038 | 2.201 | 0.028 | NegBin | Midpoint Imputation |
| factor(Year)2023 | -0.029 | 0.038 | -0.752 | 0.452 | NegBin | Midpoint Imputation |
| (Intercept) | 5.022 | 0.040 | 127.046 | 0.000 | Poisson | Lower Bound Imputation |
| Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 | Poisson | Lower Bound Imputation |
| factor(County)Bergen | -0.001 | 0.045 | -0.027 | 0.978 | Poisson | Lower Bound Imputation |
| factor(County)Burlington | 0.000 | 0.042 | -0.011 | 0.992 | Poisson | Lower Bound Imputation |
| factor(County)Camden | 0.604 | 0.041 | 14.838 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Cape May | -1.146 | 0.061 | -18.758 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Cumberland | -0.654 | 0.054 | -12.226 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Essex | 0.733 | 0.039 | 18.691 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Gloucester | -0.132 | 0.046 | -2.887 | 0.004 | Poisson | Lower Bound Imputation |
| factor(County)Hudson | 0.047 | 0.045 | 1.047 | 0.295 | Poisson | Lower Bound Imputation |
| factor(County)Hunterdon | -2.136 | 0.093 | -22.921 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Mercer | -0.279 | 0.048 | -5.754 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Middlesex | 0.298 | 0.040 | 7.393 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Monmouth | 0.127 | 0.043 | 2.990 | 0.003 | Poisson | Lower Bound Imputation |
| factor(County)Morris | -0.549 | 0.051 | -10.795 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Ocean | 0.316 | 0.040 | 7.937 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Passaic | -0.140 | 0.046 | -3.059 | 0.002 | Poisson | Lower Bound Imputation |
| factor(County)Salem | -1.593 | 0.075 | -21.377 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Somerset | -1.063 | 0.061 | -17.324 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Sussex | -1.361 | 0.068 | -19.916 | 0.000 | Poisson | Lower Bound Imputation |
| factor(County)Union | -0.068 | 0.046 | -1.474 | 0.140 | Poisson | Lower Bound Imputation |
| factor(County)Warren | -1.571 | 0.074 | -21.268 | 0.000 | Poisson | Lower Bound Imputation |
| factor(Year)2018 | 0.078 | 0.027 | 2.922 | 0.003 | Poisson | Lower Bound Imputation |
| factor(Year)2019 | 0.045 | 0.027 | 1.678 | 0.093 | Poisson | Lower Bound Imputation |
| factor(Year)2020 | 0.055 | 0.027 | 2.022 | 0.043 | Poisson | Lower Bound Imputation |
| factor(Year)2021 | 0.128 | 0.027 | 4.815 | 0.000 | Poisson | Lower Bound Imputation |
| factor(Year)2022 | 0.104 | 0.027 | 3.910 | 0.000 | Poisson | Lower Bound Imputation |
| factor(Year)2023 | -0.002 | 0.028 | -0.069 | 0.945 | Poisson | Lower Bound Imputation |
| (Intercept) | 5.036 | 0.055 | 90.812 | 0.000 | NegBin | Lower Bound Imputation |
| Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 | NegBin | Lower Bound Imputation |
| factor(County)Bergen | -0.008 | 0.062 | -0.136 | 0.892 | NegBin | Lower Bound Imputation |
| factor(County)Burlington | 0.000 | 0.059 | 0.001 | 0.999 | NegBin | Lower Bound Imputation |
| factor(County)Camden | 0.599 | 0.060 | 9.935 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Cape May | -1.148 | 0.074 | -15.584 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Cumberland | -0.659 | 0.069 | -9.506 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Essex | 0.730 | 0.059 | 12.426 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Gloucester | -0.136 | 0.063 | -2.157 | 0.031 | NegBin | Lower Bound Imputation |
| factor(County)Hudson | 0.041 | 0.064 | 0.650 | 0.515 | NegBin | Lower Bound Imputation |
| factor(County)Hunterdon | -2.141 | 0.103 | -20.760 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Mercer | -0.286 | 0.066 | -4.348 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Middlesex | 0.294 | 0.058 | 5.073 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Monmouth | 0.123 | 0.060 | 2.043 | 0.041 | NegBin | Lower Bound Imputation |
| factor(County)Morris | -0.553 | 0.066 | -8.322 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Ocean | 0.314 | 0.058 | 5.458 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Passaic | -0.145 | 0.063 | -2.306 | 0.021 | NegBin | Lower Bound Imputation |
| factor(County)Salem | -1.599 | 0.087 | -18.422 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Somerset | -1.070 | 0.076 | -14.095 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Sussex | -1.367 | 0.082 | -16.734 | 0.000 | NegBin | Lower Bound Imputation |
| factor(County)Union | -0.075 | 0.064 | -1.169 | 0.243 | NegBin | Lower Bound Imputation |
| factor(County)Warren | -1.577 | 0.086 | -18.285 | 0.000 | NegBin | Lower Bound Imputation |
| factor(Year)2018 | 0.083 | 0.037 | 2.228 | 0.026 | NegBin | Lower Bound Imputation |
| factor(Year)2019 | 0.042 | 0.038 | 1.110 | 0.267 | NegBin | Lower Bound Imputation |
| factor(Year)2020 | 0.057 | 0.038 | 1.513 | 0.130 | NegBin | Lower Bound Imputation |
| factor(Year)2021 | 0.120 | 0.037 | 3.204 | 0.001 | NegBin | Lower Bound Imputation |
| factor(Year)2022 | 0.083 | 0.038 | 2.201 | 0.028 | NegBin | Lower Bound Imputation |
| factor(Year)2023 | -0.029 | 0.038 | -0.752 | 0.452 | NegBin | Lower Bound Imputation |
| (Intercept) | 5.022 | 0.040 | 127.046 | 0.000 | Poisson | Upper Bound Imputation |
| Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 | Poisson | Upper Bound Imputation |
| factor(County)Bergen | -0.001 | 0.045 | -0.027 | 0.978 | Poisson | Upper Bound Imputation |
| factor(County)Burlington | 0.000 | 0.042 | -0.011 | 0.992 | Poisson | Upper Bound Imputation |
| factor(County)Camden | 0.604 | 0.041 | 14.838 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Cape May | -1.146 | 0.061 | -18.758 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Cumberland | -0.654 | 0.054 | -12.226 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Essex | 0.733 | 0.039 | 18.691 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Gloucester | -0.132 | 0.046 | -2.887 | 0.004 | Poisson | Upper Bound Imputation |
| factor(County)Hudson | 0.047 | 0.045 | 1.047 | 0.295 | Poisson | Upper Bound Imputation |
| factor(County)Hunterdon | -2.136 | 0.093 | -22.921 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Mercer | -0.279 | 0.048 | -5.754 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Middlesex | 0.298 | 0.040 | 7.393 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Monmouth | 0.127 | 0.043 | 2.990 | 0.003 | Poisson | Upper Bound Imputation |
| factor(County)Morris | -0.549 | 0.051 | -10.795 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Ocean | 0.316 | 0.040 | 7.937 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Passaic | -0.140 | 0.046 | -3.059 | 0.002 | Poisson | Upper Bound Imputation |
| factor(County)Salem | -1.593 | 0.075 | -21.377 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Somerset | -1.063 | 0.061 | -17.324 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Sussex | -1.361 | 0.068 | -19.916 | 0.000 | Poisson | Upper Bound Imputation |
| factor(County)Union | -0.068 | 0.046 | -1.474 | 0.140 | Poisson | Upper Bound Imputation |
| factor(County)Warren | -1.571 | 0.074 | -21.268 | 0.000 | Poisson | Upper Bound Imputation |
| factor(Year)2018 | 0.078 | 0.027 | 2.922 | 0.003 | Poisson | Upper Bound Imputation |
| factor(Year)2019 | 0.045 | 0.027 | 1.678 | 0.093 | Poisson | Upper Bound Imputation |
| factor(Year)2020 | 0.055 | 0.027 | 2.022 | 0.043 | Poisson | Upper Bound Imputation |
| factor(Year)2021 | 0.128 | 0.027 | 4.815 | 0.000 | Poisson | Upper Bound Imputation |
| factor(Year)2022 | 0.104 | 0.027 | 3.910 | 0.000 | Poisson | Upper Bound Imputation |
| factor(Year)2023 | -0.002 | 0.028 | -0.069 | 0.945 | Poisson | Upper Bound Imputation |
| (Intercept) | 5.036 | 0.055 | 90.812 | 0.000 | NegBin | Upper Bound Imputation |
| Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 | NegBin | Upper Bound Imputation |
| factor(County)Bergen | -0.008 | 0.062 | -0.136 | 0.892 | NegBin | Upper Bound Imputation |
| factor(County)Burlington | 0.000 | 0.059 | 0.001 | 0.999 | NegBin | Upper Bound Imputation |
| factor(County)Camden | 0.599 | 0.060 | 9.935 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Cape May | -1.148 | 0.074 | -15.584 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Cumberland | -0.659 | 0.069 | -9.506 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Essex | 0.730 | 0.059 | 12.426 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Gloucester | -0.136 | 0.063 | -2.157 | 0.031 | NegBin | Upper Bound Imputation |
| factor(County)Hudson | 0.041 | 0.064 | 0.650 | 0.515 | NegBin | Upper Bound Imputation |
| factor(County)Hunterdon | -2.141 | 0.103 | -20.760 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Mercer | -0.286 | 0.066 | -4.348 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Middlesex | 0.294 | 0.058 | 5.073 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Monmouth | 0.123 | 0.060 | 2.043 | 0.041 | NegBin | Upper Bound Imputation |
| factor(County)Morris | -0.553 | 0.066 | -8.322 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Ocean | 0.314 | 0.058 | 5.458 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Passaic | -0.145 | 0.063 | -2.306 | 0.021 | NegBin | Upper Bound Imputation |
| factor(County)Salem | -1.599 | 0.087 | -18.422 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Somerset | -1.070 | 0.076 | -14.095 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Sussex | -1.367 | 0.082 | -16.734 | 0.000 | NegBin | Upper Bound Imputation |
| factor(County)Union | -0.075 | 0.064 | -1.169 | 0.243 | NegBin | Upper Bound Imputation |
| factor(County)Warren | -1.577 | 0.086 | -18.285 | 0.000 | NegBin | Upper Bound Imputation |
| factor(Year)2018 | 0.083 | 0.037 | 2.228 | 0.026 | NegBin | Upper Bound Imputation |
| factor(Year)2019 | 0.042 | 0.038 | 1.110 | 0.267 | NegBin | Upper Bound Imputation |
| factor(Year)2020 | 0.057 | 0.038 | 1.513 | 0.130 | NegBin | Upper Bound Imputation |
| factor(Year)2021 | 0.120 | 0.037 | 3.204 | 0.001 | NegBin | Upper Bound Imputation |
| factor(Year)2022 | 0.083 | 0.038 | 2.201 | 0.028 | NegBin | Upper Bound Imputation |
| factor(Year)2023 | -0.029 | 0.038 | -0.752 | 0.452 | NegBin | Upper Bound Imputation |
| (Intercept) | 5.022 | 0.040 | 127.046 | 0.000 | Poisson | Multiple Imputation |
| Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 | Poisson | Multiple Imputation |
| factor(County)Bergen | -0.001 | 0.045 | -0.027 | 0.978 | Poisson | Multiple Imputation |
| factor(County)Burlington | 0.000 | 0.042 | -0.011 | 0.992 | Poisson | Multiple Imputation |
| factor(County)Camden | 0.604 | 0.041 | 14.838 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Cape May | -1.146 | 0.061 | -18.758 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Cumberland | -0.654 | 0.054 | -12.226 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Essex | 0.733 | 0.039 | 18.691 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Gloucester | -0.132 | 0.046 | -2.887 | 0.004 | Poisson | Multiple Imputation |
| factor(County)Hudson | 0.047 | 0.045 | 1.047 | 0.295 | Poisson | Multiple Imputation |
| factor(County)Hunterdon | -2.136 | 0.093 | -22.921 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Mercer | -0.279 | 0.048 | -5.754 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Middlesex | 0.298 | 0.040 | 7.393 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Monmouth | 0.127 | 0.043 | 2.990 | 0.003 | Poisson | Multiple Imputation |
| factor(County)Morris | -0.549 | 0.051 | -10.795 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Ocean | 0.316 | 0.040 | 7.937 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Passaic | -0.140 | 0.046 | -3.059 | 0.002 | Poisson | Multiple Imputation |
| factor(County)Salem | -1.593 | 0.075 | -21.377 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Somerset | -1.063 | 0.061 | -17.324 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Sussex | -1.361 | 0.068 | -19.916 | 0.000 | Poisson | Multiple Imputation |
| factor(County)Union | -0.068 | 0.046 | -1.474 | 0.140 | Poisson | Multiple Imputation |
| factor(County)Warren | -1.571 | 0.074 | -21.268 | 0.000 | Poisson | Multiple Imputation |
| factor(Year)2018 | 0.078 | 0.027 | 2.922 | 0.003 | Poisson | Multiple Imputation |
| factor(Year)2019 | 0.045 | 0.027 | 1.678 | 0.093 | Poisson | Multiple Imputation |
| factor(Year)2020 | 0.055 | 0.027 | 2.022 | 0.043 | Poisson | Multiple Imputation |
| factor(Year)2021 | 0.128 | 0.027 | 4.815 | 0.000 | Poisson | Multiple Imputation |
| factor(Year)2022 | 0.104 | 0.027 | 3.910 | 0.000 | Poisson | Multiple Imputation |
| factor(Year)2023 | -0.002 | 0.028 | -0.069 | 0.945 | Poisson | Multiple Imputation |
| (Intercept) | 5.036 | 0.055 | 90.812 | 0.000 | NegBin | Multiple Imputation |
| Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 | NegBin | Multiple Imputation |
| factor(County)Bergen | -0.008 | 0.062 | -0.136 | 0.892 | NegBin | Multiple Imputation |
| factor(County)Burlington | 0.000 | 0.059 | 0.001 | 0.999 | NegBin | Multiple Imputation |
| factor(County)Camden | 0.599 | 0.060 | 9.935 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Cape May | -1.148 | 0.074 | -15.584 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Cumberland | -0.659 | 0.069 | -9.506 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Essex | 0.730 | 0.059 | 12.426 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Gloucester | -0.136 | 0.063 | -2.157 | 0.031 | NegBin | Multiple Imputation |
| factor(County)Hudson | 0.041 | 0.064 | 0.650 | 0.515 | NegBin | Multiple Imputation |
| factor(County)Hunterdon | -2.141 | 0.103 | -20.760 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Mercer | -0.286 | 0.066 | -4.348 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Middlesex | 0.294 | 0.058 | 5.073 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Monmouth | 0.123 | 0.060 | 2.043 | 0.041 | NegBin | Multiple Imputation |
| factor(County)Morris | -0.553 | 0.066 | -8.322 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Ocean | 0.314 | 0.058 | 5.458 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Passaic | -0.145 | 0.063 | -2.306 | 0.021 | NegBin | Multiple Imputation |
| factor(County)Salem | -1.599 | 0.087 | -18.422 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Somerset | -1.070 | 0.076 | -14.095 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Sussex | -1.367 | 0.082 | -16.734 | 0.000 | NegBin | Multiple Imputation |
| factor(County)Union | -0.075 | 0.064 | -1.169 | 0.243 | NegBin | Multiple Imputation |
| factor(County)Warren | -1.577 | 0.086 | -18.285 | 0.000 | NegBin | Multiple Imputation |
| factor(Year)2018 | 0.083 | 0.037 | 2.228 | 0.026 | NegBin | Multiple Imputation |
| factor(Year)2019 | 0.042 | 0.038 | 1.110 | 0.267 | NegBin | Multiple Imputation |
| factor(Year)2020 | 0.057 | 0.038 | 1.513 | 0.130 | NegBin | Multiple Imputation |
| factor(Year)2021 | 0.120 | 0.037 | 3.204 | 0.001 | NegBin | Multiple Imputation |
| factor(Year)2022 | 0.083 | 0.038 | 2.201 | 0.028 | NegBin | Multiple Imputation |
| factor(Year)2023 | -0.029 | 0.038 | -0.752 | 0.452 | NegBin | Multiple Imputation |
| (Intercept) | 5.022 | 0.040 | 127.046 | 0.000 | Poisson | Bayesian Imputation 1 |
| Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 | Poisson | Bayesian Imputation 1 |
| factor(County)Bergen | -0.001 | 0.045 | -0.027 | 0.978 | Poisson | Bayesian Imputation 1 |
| factor(County)Burlington | 0.000 | 0.042 | -0.011 | 0.992 | Poisson | Bayesian Imputation 1 |
| factor(County)Camden | 0.604 | 0.041 | 14.838 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Cape May | -1.146 | 0.061 | -18.758 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Cumberland | -0.654 | 0.054 | -12.226 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Essex | 0.733 | 0.039 | 18.691 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Gloucester | -0.132 | 0.046 | -2.887 | 0.004 | Poisson | Bayesian Imputation 1 |
| factor(County)Hudson | 0.047 | 0.045 | 1.047 | 0.295 | Poisson | Bayesian Imputation 1 |
| factor(County)Hunterdon | -2.136 | 0.093 | -22.921 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Mercer | -0.279 | 0.048 | -5.754 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Middlesex | 0.298 | 0.040 | 7.393 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Monmouth | 0.127 | 0.043 | 2.990 | 0.003 | Poisson | Bayesian Imputation 1 |
| factor(County)Morris | -0.549 | 0.051 | -10.795 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Ocean | 0.316 | 0.040 | 7.937 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Passaic | -0.140 | 0.046 | -3.059 | 0.002 | Poisson | Bayesian Imputation 1 |
| factor(County)Salem | -1.593 | 0.075 | -21.377 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Somerset | -1.063 | 0.061 | -17.324 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Sussex | -1.361 | 0.068 | -19.916 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(County)Union | -0.068 | 0.046 | -1.474 | 0.140 | Poisson | Bayesian Imputation 1 |
| factor(County)Warren | -1.571 | 0.074 | -21.268 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(Year)2018 | 0.078 | 0.027 | 2.922 | 0.003 | Poisson | Bayesian Imputation 1 |
| factor(Year)2019 | 0.045 | 0.027 | 1.678 | 0.093 | Poisson | Bayesian Imputation 1 |
| factor(Year)2020 | 0.055 | 0.027 | 2.022 | 0.043 | Poisson | Bayesian Imputation 1 |
| factor(Year)2021 | 0.128 | 0.027 | 4.815 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(Year)2022 | 0.104 | 0.027 | 3.910 | 0.000 | Poisson | Bayesian Imputation 1 |
| factor(Year)2023 | -0.002 | 0.028 | -0.069 | 0.945 | Poisson | Bayesian Imputation 1 |
| (Intercept) | 5.036 | 0.055 | 90.812 | 0.000 | NegBin | Bayesian Imputation 1 |
| Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 | NegBin | Bayesian Imputation 1 |
| factor(County)Bergen | -0.008 | 0.062 | -0.136 | 0.892 | NegBin | Bayesian Imputation 1 |
| factor(County)Burlington | 0.000 | 0.059 | 0.001 | 0.999 | NegBin | Bayesian Imputation 1 |
| factor(County)Camden | 0.599 | 0.060 | 9.935 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Cape May | -1.148 | 0.074 | -15.584 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Cumberland | -0.659 | 0.069 | -9.506 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Essex | 0.730 | 0.059 | 12.426 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Gloucester | -0.136 | 0.063 | -2.157 | 0.031 | NegBin | Bayesian Imputation 1 |
| factor(County)Hudson | 0.041 | 0.064 | 0.650 | 0.515 | NegBin | Bayesian Imputation 1 |
| factor(County)Hunterdon | -2.141 | 0.103 | -20.760 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Mercer | -0.286 | 0.066 | -4.348 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Middlesex | 0.294 | 0.058 | 5.073 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Monmouth | 0.123 | 0.060 | 2.043 | 0.041 | NegBin | Bayesian Imputation 1 |
| factor(County)Morris | -0.553 | 0.066 | -8.322 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Ocean | 0.314 | 0.058 | 5.458 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Passaic | -0.145 | 0.063 | -2.306 | 0.021 | NegBin | Bayesian Imputation 1 |
| factor(County)Salem | -1.599 | 0.087 | -18.422 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Somerset | -1.070 | 0.076 | -14.095 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Sussex | -1.367 | 0.082 | -16.734 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(County)Union | -0.075 | 0.064 | -1.169 | 0.243 | NegBin | Bayesian Imputation 1 |
| factor(County)Warren | -1.577 | 0.086 | -18.285 | 0.000 | NegBin | Bayesian Imputation 1 |
| factor(Year)2018 | 0.083 | 0.037 | 2.228 | 0.026 | NegBin | Bayesian Imputation 1 |
| factor(Year)2019 | 0.042 | 0.038 | 1.110 | 0.267 | NegBin | Bayesian Imputation 1 |
| factor(Year)2020 | 0.057 | 0.038 | 1.513 | 0.130 | NegBin | Bayesian Imputation 1 |
| factor(Year)2021 | 0.120 | 0.037 | 3.204 | 0.001 | NegBin | Bayesian Imputation 1 |
| factor(Year)2022 | 0.083 | 0.038 | 2.201 | 0.028 | NegBin | Bayesian Imputation 1 |
| factor(Year)2023 | -0.029 | 0.038 | -0.752 | 0.452 | NegBin | Bayesian Imputation 1 |
| (Intercept) | 5.022 | 0.040 | 127.046 | 0.000 | Poisson | Bayesian Imputation 2 |
| Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 | Poisson | Bayesian Imputation 2 |
| factor(County)Bergen | -0.001 | 0.045 | -0.027 | 0.978 | Poisson | Bayesian Imputation 2 |
| factor(County)Burlington | 0.000 | 0.042 | -0.011 | 0.992 | Poisson | Bayesian Imputation 2 |
| factor(County)Camden | 0.604 | 0.041 | 14.838 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Cape May | -1.146 | 0.061 | -18.758 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Cumberland | -0.654 | 0.054 | -12.226 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Essex | 0.733 | 0.039 | 18.691 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Gloucester | -0.132 | 0.046 | -2.887 | 0.004 | Poisson | Bayesian Imputation 2 |
| factor(County)Hudson | 0.047 | 0.045 | 1.047 | 0.295 | Poisson | Bayesian Imputation 2 |
| factor(County)Hunterdon | -2.136 | 0.093 | -22.921 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Mercer | -0.279 | 0.048 | -5.754 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Middlesex | 0.298 | 0.040 | 7.393 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Monmouth | 0.127 | 0.043 | 2.990 | 0.003 | Poisson | Bayesian Imputation 2 |
| factor(County)Morris | -0.549 | 0.051 | -10.795 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Ocean | 0.316 | 0.040 | 7.937 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Passaic | -0.140 | 0.046 | -3.059 | 0.002 | Poisson | Bayesian Imputation 2 |
| factor(County)Salem | -1.593 | 0.075 | -21.377 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Somerset | -1.063 | 0.061 | -17.324 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Sussex | -1.361 | 0.068 | -19.916 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(County)Union | -0.068 | 0.046 | -1.474 | 0.140 | Poisson | Bayesian Imputation 2 |
| factor(County)Warren | -1.571 | 0.074 | -21.268 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(Year)2018 | 0.078 | 0.027 | 2.922 | 0.003 | Poisson | Bayesian Imputation 2 |
| factor(Year)2019 | 0.045 | 0.027 | 1.678 | 0.093 | Poisson | Bayesian Imputation 2 |
| factor(Year)2020 | 0.055 | 0.027 | 2.022 | 0.043 | Poisson | Bayesian Imputation 2 |
| factor(Year)2021 | 0.128 | 0.027 | 4.815 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(Year)2022 | 0.104 | 0.027 | 3.910 | 0.000 | Poisson | Bayesian Imputation 2 |
| factor(Year)2023 | -0.002 | 0.028 | -0.069 | 0.945 | Poisson | Bayesian Imputation 2 |
| (Intercept) | 5.036 | 0.055 | 90.812 | 0.000 | NegBin | Bayesian Imputation 2 |
| Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 | NegBin | Bayesian Imputation 2 |
| factor(County)Bergen | -0.008 | 0.062 | -0.136 | 0.892 | NegBin | Bayesian Imputation 2 |
| factor(County)Burlington | 0.000 | 0.059 | 0.001 | 0.999 | NegBin | Bayesian Imputation 2 |
| factor(County)Camden | 0.599 | 0.060 | 9.935 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Cape May | -1.148 | 0.074 | -15.584 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Cumberland | -0.659 | 0.069 | -9.506 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Essex | 0.730 | 0.059 | 12.426 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Gloucester | -0.136 | 0.063 | -2.157 | 0.031 | NegBin | Bayesian Imputation 2 |
| factor(County)Hudson | 0.041 | 0.064 | 0.650 | 0.515 | NegBin | Bayesian Imputation 2 |
| factor(County)Hunterdon | -2.141 | 0.103 | -20.760 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Mercer | -0.286 | 0.066 | -4.348 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Middlesex | 0.294 | 0.058 | 5.073 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Monmouth | 0.123 | 0.060 | 2.043 | 0.041 | NegBin | Bayesian Imputation 2 |
| factor(County)Morris | -0.553 | 0.066 | -8.322 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Ocean | 0.314 | 0.058 | 5.458 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Passaic | -0.145 | 0.063 | -2.306 | 0.021 | NegBin | Bayesian Imputation 2 |
| factor(County)Salem | -1.599 | 0.087 | -18.422 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Somerset | -1.070 | 0.076 | -14.095 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Sussex | -1.367 | 0.082 | -16.734 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(County)Union | -0.075 | 0.064 | -1.169 | 0.243 | NegBin | Bayesian Imputation 2 |
| factor(County)Warren | -1.577 | 0.086 | -18.285 | 0.000 | NegBin | Bayesian Imputation 2 |
| factor(Year)2018 | 0.083 | 0.037 | 2.228 | 0.026 | NegBin | Bayesian Imputation 2 |
| factor(Year)2019 | 0.042 | 0.038 | 1.110 | 0.267 | NegBin | Bayesian Imputation 2 |
| factor(Year)2020 | 0.057 | 0.038 | 1.513 | 0.130 | NegBin | Bayesian Imputation 2 |
| factor(Year)2021 | 0.120 | 0.037 | 3.204 | 0.001 | NegBin | Bayesian Imputation 2 |
| factor(Year)2022 | 0.083 | 0.038 | 2.201 | 0.028 | NegBin | Bayesian Imputation 2 |
| factor(Year)2023 | -0.029 | 0.038 | -0.752 | 0.452 | NegBin | Bayesian Imputation 2 |
kable(arrest_coefs, digits = 3, caption = "Arrest_Count Coefficients Across Models" )
| Method | Model | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|---|
| Midpoint Imputation | Poisson | Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 |
| Midpoint Imputation | NegBin | Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 |
| Lower Bound Imputation | Poisson | Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 |
| Lower Bound Imputation | NegBin | Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 |
| Upper Bound Imputation | Poisson | Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 |
| Upper Bound Imputation | NegBin | Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 |
| Multiple Imputation | Poisson | Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 |
| Multiple Imputation | NegBin | Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 |
| Bayesian Imputation 1 | Poisson | Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 |
| Bayesian Imputation 1 | NegBin | Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 |
| Bayesian Imputation 2 | Poisson | Arrest_Count | -0.002 | 0.002 | -0.798 | 0.425 |
| Bayesian Imputation 2 | NegBin | Arrest_Count | -0.003 | 0.003 | -0.817 | 0.414 |
Arrests_Count is not statistically significant…. So not much to report here… @Eli any other ideas how I can model this better – would be great appreciated!
# Define reusable lagging function
lag_arrest <- function(df) {
df %>%
arrange(County, Year) %>%
group_by(County) %>%
mutate(Arrest_Count_lag1 = lag(Arrest_Count, 1)) %>%
ungroup()
}
# Apply to each imputed dataset
combined_df_county_md_lag_arrest <- lag_arrest(combined_df_county_md)
df_lower_lag_arrest <- lag_arrest(df_lower)
df_upper_lag_arrest <- lag_arrest(df_upper)
imputed_df_MI_lag_arrest <- lag_arrest(imputed_df_MI)
combined_df_county_Bay1_lag_arrest <- lag_arrest(combined_df_county_Bay1)
combined_df_county_Bay2_lag_arrest <- lag_arrest(combined_df_county_Bay2)
run_lagged_arrest_models <- function(data, label) {
cat("\n=== Running Lagged Arrest Models for:", label, "===\n")
# Filter for Year >= 2017
data <- data %>% filter(Year >= 2017)
# Fit Poisson model
poisson_model <- glm(
Deaths_num ~ Arrest_Count_lag1 + factor(County) + factor(Year),
family = poisson,
data = data
)
# Check for overdispersion
dispersion <- sum(residuals(poisson_model, type = "pearson")^2) / poisson_model$df.residual
cat("Overdispersion ratio (Poisson):", round(dispersion, 2), "\n")
# Fit NB model
nb_model <- MASS::glm.nb(
Deaths_num ~ Arrest_Count_lag1 + factor(County) + factor(Year),
data = data
)
return(list(
label = label,
poisson_model = poisson_model,
nb_model = nb_model,
poisson_summary = summary(poisson_model),
nb_summary = summary(nb_model),
poisson_AIC = AIC(poisson_model),
nb_AIC = AIC(nb_model),
dispersion = dispersion
))
}
results_md_arrest_lag <- run_lagged_arrest_models(combined_df_county_md_lag_arrest, "Midpoint Imputation")
##
## === Running Lagged Arrest Models for: Midpoint Imputation ===
## Overdispersion ratio (Poisson): 2.19
results_lower_arrest_lag <- run_lagged_arrest_models(df_lower_lag_arrest, "Lower Bound Imputation")
##
## === Running Lagged Arrest Models for: Lower Bound Imputation ===
## Overdispersion ratio (Poisson): 2.19
results_upper_arrest_lag <- run_lagged_arrest_models(df_upper_lag_arrest, "Upper Bound Imputation")
##
## === Running Lagged Arrest Models for: Upper Bound Imputation ===
## Overdispersion ratio (Poisson): 2.19
results_mi_arrest_lag <- run_lagged_arrest_models(imputed_df_MI_lag_arrest, "Multiple Imputation")
##
## === Running Lagged Arrest Models for: Multiple Imputation ===
## Overdispersion ratio (Poisson): 2.26
results_bayes1_arrest_lag <- run_lagged_arrest_models(combined_df_county_Bay1_lag_arrest, "Bayesian Imputation 1")
##
## === Running Lagged Arrest Models for: Bayesian Imputation 1 ===
## Overdispersion ratio (Poisson): 2.19
results_bayes2_arrest_lag <- run_lagged_arrest_models(combined_df_county_Bay2_lag_arrest, "Bayesian Imputation 2")
##
## === Running Lagged Arrest Models for: Bayesian Imputation 2 ===
## Overdispersion ratio (Poisson): 2.19
library(broom)
extract_arrest_lag_coefs <- function(model, method_label) {
tidy(model) %>%
filter(term == "Arrest_Count_lag1") %>%
mutate(Method = method_label) %>%
dplyr::select(Method, term, estimate, std.error, statistic, p.value)
}
arrest_lag_coefs <- bind_rows(
extract_arrest_lag_coefs(results_md_arrest_lag$nb_model, "Midpoint"),
extract_arrest_lag_coefs(results_lower_arrest_lag$nb_model, "Lower Bound"),
extract_arrest_lag_coefs(results_upper_arrest_lag$nb_model, "Upper Bound"),
extract_arrest_lag_coefs(results_mi_arrest_lag$nb_model, "Multiple Imputation"),
extract_arrest_lag_coefs(results_bayes1_arrest_lag$nb_model, "Bayesian 1"),
extract_arrest_lag_coefs(results_bayes2_arrest_lag$nb_model, "Bayesian 2")
)
# Display results
knitr::kable(arrest_lag_coefs, digits = 3, caption = "Effect of Lagged Arrests on Overdose Deaths (NB Models)")
| Method | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Midpoint | Arrest_Count_lag1 | 0.001 | 0.003 | 0.231 | 0.817 |
| Lower Bound | Arrest_Count_lag1 | 0.001 | 0.003 | 0.231 | 0.817 |
| Upper Bound | Arrest_Count_lag1 | 0.001 | 0.003 | 0.231 | 0.817 |
| Multiple Imputation | Arrest_Count_lag1 | 0.000 | 0.003 | 0.027 | 0.979 |
| Bayesian 1 | Arrest_Count_lag1 | 0.001 | 0.003 | 0.231 | 0.817 |
| Bayesian 2 | Arrest_Count_lag1 | 0.001 | 0.003 | 0.231 | 0.817 |
Arrest_Count still not statistically significant predictor.
Possible theoretical explanation: 1 year is too long of an increment to see an affect. Literature suggests the impact is 1 month after arrests/convictions. We can rerun this analysis when we get the CDC’s unsuppressed monthly county data…
~~~~~~~~~~~~~~~~ Notes on Dataset~~~~~~~~~~~~~~~~~~~
State monthly overdose data was a combination of CDC Wonder DATA (1999-2022) and data for “unconfirmed OD’s” 2023-2025 https://ocsme.nj.gov/dashboard - The State of New Jersey, Office of the Chief State Medical Examiner
I added the population data from NJ for 2024 from here because the U.S. Census website is down because of the government closure. https://www.njspotlightnews.org/2025/01/nj-population-2024-census/
dates: August 5, 1988, through June 23, 2025.
variables: Sex, Race, Ethnicity, Date of Sentence, Begin Serve, Min Offense Term, Max Offense Term, County
codebook: none provided
n=385
The following limitations were provided by the Board: Be advised that this report was compiled within the limits of information-gathering technology of the Parole Board Information System and may not include every existing conviction pursuant to N.J.S.A. 2C:35-9a. Be further advised that information related to race/ethnicity is reported based on data provided by the New Jersey Department of Corrections and cannot be verified.
Unknown was changed to NA
dates: 2017 through 2024
variables: ID, County, Arrest Year, Arrest Month, Race, Age Range, Gender, Offense Statute, Offense Category, Arrest source, Status
n=373
codebook: ?
Unknown & Not Provided were changed to NA
dates: 1999- 2023 variables: Notes, County, County Code, Year, Year Code, Deaths, Population, Crude Rate
crude rate calculated per 100,000
N= 529
Datasource limitations: a big limitation here is that CDC supresses data if the county has less than 15 deaths to report. We are in the process of requesting the unsupressed data.
• About revised 2019 deaths: On March 11, 2021, the 2019 mortality data on CDC WONDER was updated with the 2019 mortality data updated by NCHS on March 4, 2021 to include corrected information for residents of Texas affecting 5 records previously coded to cause code U01.4, Terrorism involving firearms (homicide). The underlying and multiple cause of death codes for 5 records were corrected in the 2019 data. Underlying and multiple cause of death codes for those 5 records were recoded to Assault (homicide) by other and unspecified firearm discharge, ICD-10 code X95. The corrected final death records replaces the data released on December 22, 2020. • About revised 2014 deaths: A revised data file for 2014 was released on April 3, 2017 to include corrections affecting 125 deaths previously coded to “Accidental discharge of firearms” (ICD-10 codes W32-W34). This corrected file replaced the file released on December 9, 2015. The underlying cause of death changed for 125 deaths in 2014 that occurred in Tennessee (80%) and Massachusetts (20%). (Note: Data by state of residence are also impacted in Connecticut, Florida, Georgia, Kentucky, North Carolina and Virginia for records where the death occurred in Tennessee or Massachusetts but the decedent was a resident of another state.) Prior to the revision, the underlying cause of death for all 125 deaths were classified as “Accidental discharge of firearms” (ICD-10 codes W32-W34). After the revision, 50% of the changed records are now classified as “Assault (homicide) by discharge of firearms” (ICD-10 codes U01.4, X93-X95); 42% are now classified as “Intentional self-harm (suicide) by discharge of firearms (ICD-10 codes X72-X74); 4% of the revised death records are now classified as”Discharge of firearms, undetermined intent” (ICD-10 codes Y22-Y24); and the remaining 4% to various other causes. For more information, please refer to Deaths: Final Data for 2014.
Query Parameters for 1999-2020: ICD-10 Codes: X40 (Accidental
poisoning by and exposure to nonopioid analgesics, antipyretics and
antirheumatics); X41 (Accidental poisoning by and exposure to
antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic
drugs, not elsewhere classified); X42 (Accidental poisoning by and
exposure to narcotics and psychodysleptics [hallucinogens], not
elsewhere classified); X43 (Accidental poisoning by and exposure to
other drugs acting on the autonomic nervous system); X44(Accidental
poisoning by and exposure to other and unspecified drugs, medicaments
and biological substances); X60 (Intentional
self-poisoning by and exposure to nonopioid analgesics, antipyretics and
antirheumatics); X61 (Intentional self-poisoning by and
exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and
psychotropic drugs, not elsewhere classified); X62
(Intentional self-poisoning by and exposure to narcotics and
psychodysleptics [hallucinogens], not elsewhere classified); X63
(Intentional self-poisoning by and exposure to other drugs acting on the
autonomic nervous system); X64 (Intentional
self-poisoning by and exposure to other and unspecified drugs,
medicaments and biological substances); X85 (Assault by drugs,
medicaments and biological substances); Y10 (Poisoning by and exposure
to nonopioid analgesics, antipyretics and antirheumatics,
undetermined intent); Y11 (Poisoning by and exposure to antiepileptic,
sedative-hypnotic, antiparkinsonism and psychotropic
drugs, not elsewhere classified, undetermined intent); Y12 (Poisoning by
and exposure to narcotics and psychodysleptics
[hallucinogens], not elsewhere classified, undetermined intent); Y13
(Poisoning by and exposure to other drugs acting on the
autonomic nervous system, undetermined intent); Y14 (Poisoning by and
exposure to other and unspecified drugs, medicaments and
biological substances, undetermined intent)
Query parameters for 2021-2023 = used location of incident (not resident of decedent) and followed the same definition of overdose death as NVSS (X40-44 [unintentional self harm/injury], X60-64 [intentional self harm/suicide], X85 [assault], Y10-14 [undetermined intent]).
dates: 1989-2023
1989 is from here: https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/comp8090.htm
1990-2000 https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/cest9199.html
2000-2010—under county estimates/intercensal county estimates/2000-2010/by race sex and Hispanic origin https://www.nj.gov/labor/labormarketinformation/demographics/population-household-estimates/index.shtml
Notes: For 1989, they categorize race into only 3 categories: ‘white’, ‘black’, and ‘other races’.
For 1990-1999 they separated race data into ‘Persons Not of Hispanic Origin’–white, black, ai/an, asian or pacific islander and ‘Persons of Hispanic Origin (Latinos)’–white, black, ai/an, asian or pacific islander (same categories).
Data was retrieved from the New Jersey Department of Labor and Workforce Development’s Population and Household Estimates intercensal population estimates.”
I also found this language at the bottom of one of the 1990-2000 data:
“Source: US Bureau of the Census, Population Estimates Branch. Prepared by: New Jersey Department of Labor, Division of Labor Market & Demographic Research, 4/02”