library(readxl)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DHARMa)
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
Convictions <- read_excel("Strict Liability spread sheet.xlsx")
Arrest <- read_excel("NJ DIH Arrest data 2017-2024.xlsx")
Overdose <- read_csv("Underlying Cause of Death 1999-2023.csv", 
    col_types = cols(Notes = col_skip(), 
        Year = col_integer(), `Year Code` = col_integer(), 
        Population = col_integer()))
Race_county_ab <- read_excel("NJ County race demographics_2017-2023_white_black.xlsx", 
    col_types = c("skip", "text", "numeric", 
        "numeric", "numeric", "numeric"))
Arrest <- Arrest %>%
  mutate(
    Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
  )

Convictions <- Convictions  %>%
  mutate(
    Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
  )

Convictions <- Convictions %>%
  mutate(
    Ethnicity = ifelse(Ethnicity %in% c("Unknown", "Not Provided", "NA", ""), NA, Ethnicity)
  )
# In setup chunk
make_data_dictionary <- function(df, n_examples = 3) {
  data.frame(
    Variable = names(df),
    Class    = sapply(df, function(x) paste(class(x), collapse = ", ")),
    Example  = sapply(df, function(x) {
      ex <- unique(x[!is.na(x)])
      ex <- head(ex, n_examples)
      paste(ex, collapse = ", ")
    }),
    stringsAsFactors = FALSE
  )
}
# Create Sentence.Year (year of conviction) variable in Convictions
Convictions$Sentence.Date <- as.Date(Convictions$`Date of Sentence`)
Convictions$Sentence.Year <- as.numeric(format(Convictions$Sentence.Date, "%Y"))

#Data Structure for Datasets

The data dictionaries below list each variable in each dataset, its class, and a few example values.

# Data dictionary for Overdose
knitr::kable(make_data_dictionary(Overdose),
             caption = "Overdose Data Dictionary")
Overdose Data Dictionary
Variable Class Example
County character Atlantic County, NJ, Bergen County, NJ, Burlington County, NJ
County Code numeric 34001, 34003, 34005
Year integer 1999, 2000, 2001
Year Code integer 1999, 2000, 2001
Deaths character 35, 40, 33
Population integer 250432, 880225, 420542
Crude Rate character 14, 4.5, 7.8
# Data dictionary for Convictions
knitr::kable(make_data_dictionary(Convictions),
             caption = "Convictions Data Dictionary")
Convictions Data Dictionary
Variable Class Example
Sex character Male, Female
Race character White, Black, Hispanic, White
Ethnicity character White, Black, Hispanic
Date of Sentence POSIXct, POSIXt 1988-08-05, 1988-11-18, 1989-01-26
Begin Serve POSIXct, POSIXt 1988-08-05, 1988-11-18, 1989-01-26
Min Offense Term character 0, Years Months Days, 5 Years
Max Offense Term character 10 Years, 7 Years, 15 Years
County character Burlington, Hudson, Atlantic
Sentence.Date Date 1988-08-05, 1988-11-18, 1989-01-26
Sentence.Year numeric 1988, 1989, 1991
# Data dictionary for Arrest
knitr::kable(make_data_dictionary(Arrest),
             caption = "Arrest Data Dictionary")
Arrest Data Dictionary
Variable Class Example
ID character XXX3H7XYDB, XXX3H98D8H, XXX7DH3OYE
County character Bergen, Morris, Ocean
Arrest Year numeric 2024, 2023, 2022
Arrest Month character February, July, April
Race character White (incl. Hispanic origin), Black
Age Range character 35-44, 25-34, 18-24
Gender character Male, Female
Offense Statute character 2C:35-9A, 2C:35-9
Offense Category character Drugs
Arrest source character Warrant, Summons, Other
Status character Pending, Dismissed Charges, Guilty

Although the following tables give general data-structure information, note that they do not capture missingness when it is recorded as something other than "NA". Missingness is therefore explored later in the code, after I recode those values.
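The Overdose extract has a related quirk: Deaths and Crude Rate come in as character columns, presumably because CDC WONDER writes text flags (e.g., "Suppressed" or "Unreliable") into small cells. Below is a minimal sketch of how those columns could be coerced to numeric so the flags surface as NA; it is kept in a separate object so the summaries below still describe the raw import, and the flag values themselves are an assumption about the export format.

# Sketch: coerce WONDER's text-flagged columns to numeric; flagged cells become NA.
# Assumption: any non-numeric entry is a suppression/reliability flag, not a count.
Overdose_clean <- Overdose %>%
  mutate(
    Deaths       = suppressWarnings(as.numeric(Deaths)),
    `Crude Rate` = suppressWarnings(as.numeric(`Crude Rate`))
  )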

library(skimr)
skim(Overdose)
Data summary
Name Overdose
Number of rows 525
Number of columns 7
_______________________
Column type frequency:
character 3
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
County 0 1 16 21 0 21 0
Deaths 0 1 2 10 0 175 0
Crude Rate 2 1 1 10 0 249 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
County Code 0 1 34021.0 12.12 34001 34011 34021 34031 34041 ▇▆▆▆▆
Year 0 1 2011.0 7.22 1999 2005 2011 2017 2023 ▇▇▇▇▇
Year Code 0 1 2011.0 7.22 1999 2005 2011 2017 2023 ▇▇▇▇▇
Population 0 1 419484.7 250725.93 62385 150508 447560 597943 957736 ▇▅▇▃▃
library(skimr)
skim(Arrest)
Data summary
Name Arrest
Number of rows 367
Number of columns 11
_______________________
Column type frequency:
character 10
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ID 0 1.00 10 10 0 210 0
County 0 1.00 5 25 0 21 0
Arrest Month 0 1.00 3 9 0 12 0
Race 11 0.97 5 29 0 2 0
Age Range 0 1.00 3 5 0 7 0
Gender 0 1.00 4 6 0 2 0
Offense Statute 0 1.00 7 8 0 2 0
Offense Category 0 1.00 5 5 0 1 0
Arrest source 0 1.00 5 7 0 3 0
Status 0 1.00 5 17 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Arrest Year 0 1 2019.69 2.06 2017 2018 2019 2021 2024 ▇▅▆▂▃
library(skimr)
skim(Convictions)
Data summary
Name Convictions
Number of rows 385
Number of columns 10
_______________________
Column type frequency:
character 6
Date 1
numeric 1
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Sex 0 1.00 4 6 0 2 0
Race 0 1.00 3 22 0 6 0
Ethnicity 53 0.86 5 12 0 4 0
Min Offense Term 0 1.00 1 25 0 44 0
Max Offense Term 0 1.00 7 16 0 19 0
County 0 1.00 5 10 0 20 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Sentence.Date 0 1 1988-08-05 2025-04-25 2020-01-30 159

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Sentence.Year 0 1 2016.18 9.36 1988 2016 2020 2023 2025 ▁▁▁▂▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
Date of Sentence 0 1.00 1988-08-05 2025-04-25 2020-01-30 159
Begin Serve 6 0.98 1988-08-05 2025-04-25 2019-12-20 153

#Missingness in Data

##Racial Categories Across Datasets

library(dplyr)
library(knitr)

# Arrest race categories
arrest_race_table <- Arrest %>%
  count(Race, sort = TRUE) %>%
  rename(`Race Category` = Race, Count = n)

kable(arrest_race_table, caption = "Race Categories in Arrests")
Race Categories in Arrests
Race Category Count
Black 185
White (incl. Hispanic origin) 171
NA 11
# Convictions race categories
convictions_race_table <- Convictions %>%
  count(Race, sort = TRUE) %>%
  rename(`Race Category` = Race, Count = n)

kable(convictions_race_table, caption = "Race Categories in Convictions")
Race Categories in Convictions
Race Category Count
N/A 200
Black 108
White 45
Hispanic, White 20
Hispanic, Unspfd BL/WH 10
Hispanic, Black 2

One of the issues with this data is that it collapses White and Hispanic into one category. They do not provide a White, non-Hispanic category, which would let us see how white people specifically are being treated. Consider saying in the brief that we need more information here, because I suspect that Hispanic persons may be disproportionately charged as well and we will not be able to see that. We also will not be able to see the discrepancy between non-white and white charging rates.

##Missingness in Convictions Race Data: Year? County?

52% of cases are missing a racial category in the Convictions dataset. I next test whether this missingness is time-dependent, e.g., whether race simply was not captured well in the past (like the 1990s). That is not the case: race is missing even for cases sentenced as recently as 2025.

library(dplyr)
library(knitr)

# Step 1: Convert "N/A" to proper NA in Race
Convictions <- Convictions %>%
  mutate(Race = ifelse(Race %in% c("N/A", "", "NA"), NA, Race))

# Step 2: Count missing vs non-missing by Sentence.Year
race_missing_by_year <- Convictions %>%
  group_by(Sentence.Year) %>%
  summarise(
    Missing = sum(is.na(Race)),
    Non_Missing = sum(!is.na(Race)),
    Total       = n(),
    .groups = "drop"
  ) %>%
  mutate(Missing_Pct = round((Missing / Total) * 100, 1))

# Step 3: Print as a nice table
kable(race_missing_by_year, 
      caption = "Missing Race Data in Convictions by Year")
Missing Race Data in Convictions by Year
Sentence.Year Missing Non_Missing Total Missing_Pct
1988 0 3 3 0.0
1989 0 2 2 0.0
1991 0 4 4 0.0
1994 0 6 6 0.0
1995 0 7 7 0.0
1996 0 4 4 0.0
1997 0 3 3 0.0
1998 0 6 6 0.0
1999 0 2 2 0.0
2000 0 7 7 0.0
2001 0 2 2 0.0
2002 2 8 10 20.0
2003 1 1 2 50.0
2004 7 3 10 70.0
2005 0 2 2 0.0
2006 2 2 4 50.0
2007 3 0 3 100.0
2009 1 0 1 100.0
2010 1 0 1 100.0
2011 4 3 7 57.1
2013 4 0 4 100.0
2014 2 0 2 100.0
2015 1 0 1 100.0
2016 10 4 14 71.4
2017 11 12 23 47.8
2018 13 5 18 72.2
2019 25 15 40 62.5
2020 17 5 22 77.3
2021 16 12 28 57.1
2022 27 23 50 54.0
2023 14 33 47 29.8
2024 27 11 38 71.1
2025 12 0 12 100.0

COUNTY: Convictions: Race: Missingness

This finding may be something you want to include in your brief. It looks like certain counties are routinely not collecting race data for DiD charges.
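The county-level table itself is not reproduced here, but a minimal sketch of how it can be generated, mirroring the by-year summary above:

# Sketch: Race missingness in Convictions by County (same approach as the by-year table).
race_missing_by_county <- Convictions %>%
  group_by(County) %>%
  summarise(
    Missing     = sum(is.na(Race)),
    Non_Missing = sum(!is.na(Race)),
    Total       = n(),
    .groups     = "drop"
  ) %>%
  mutate(Missing_Pct = round(Missing / Total * 100, 1)) %>%
  arrange(desc(Missing_Pct))

kable(race_missing_by_county,
      caption = "Missing Race Data in Convictions by County")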

Moving forward: to examine whether racial disparities are occurring in the enforcement of DiD, we will need to use the Arrest data rather than the Convictions data. But before we set the Convictions dataset aside completely, let's look at the Ethnicity variable.

#Ethnicity Data in Convictions Dataset

Before we get into testing racial disparities, we need to see whether the ethnicity data can add to our analysis.

For completeness' sake, I am going to look at the Convictions data, although we already know that more than half of the race data in that dataset is missing.

# Ethnicity summary
ethnicity_summary <- Convictions %>%
  count(Ethnicity) %>%
  mutate(Percent = paste0(round(n / sum(n) * 100, 1), "%"))

kable(ethnicity_summary, col.names = c("Ethnicity", "Count", "Percent"),
      caption = "Table: Ethnicity Distribution in Convictions", align = "lrr")
Table: Ethnicity Distribution in Convictions
Ethnicity Count Percent
Black 14 3.6%
Hispanic 44 11.4%
Not Hispanic 232 60.3%
White 42 10.9%
NA 53 13.8%

What is interesting in the Convictions dataset is that while Race is missing for 52% of cases, Ethnicity is missing for only 13.8%. This suggests that the Race data may exist but simply was not provided to us.

I did run an analysis of the data we do have and found sentencing disparities, BUT it is not usable because 52% of the race data is missing and the missingness is clearly linked to County, so it is not random.
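For reference, a rough sketch of what that sentencing comparison could look like on the non-missing cases, assuming the Max Offense Term strings can be reduced to a numeric year count (e.g., "10 Years" becomes 10). Given the missingness pattern above, any output should be treated as descriptive only.

# Rough sketch (assumption: Max Offense Term usually starts with a numeric year
# count, e.g. "10 Years"); rows that do not parse become NA.
library(stringr)

sentencing_check <- Convictions %>%
  filter(!is.na(Race)) %>%
  mutate(Max_Term_Years = as.numeric(str_extract(`Max Offense Term`, "^\\d+"))) %>%
  group_by(Race) %>%
  summarise(
    n          = n(),
    Median_Max = median(Max_Term_Years, na.rm = TRUE),
    Mean_Max   = mean(Max_Term_Years, na.rm = TRUE),
    .groups    = "drop"
  )

kable(sentencing_check,
      caption = "Sketch: maximum offense term (years) by race, non-missing race only")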

Conclusion: I think you can argue in your brief that we need more data to determine whether there are racial disparities in conviction and sentencing. I believe you said the sentencing argument wasn't going to be used in the current brief, but I can provide it if you need it to argue that we do not have enough data to determine whether sentencing disparities exist by race; the data we do have suggests that they do.

In case it is useful for showing issues with data collection and reporting, I have created a new variable combining Race and Ethnicity from the Convictions data. For example, they report "Not Hispanic" for 146 cases while reporting NA for race.

# Combine Race and Ethnicity into one grouping column

library(dplyr)
# Add column for Race/Ethnicity
Convictions <- Convictions %>%
  mutate(Race_Ethnicity = paste(Race, Ethnicity, sep = " - "))

race_ethnicity_summary <- Convictions %>%
  filter(!is.na(Race_Ethnicity)) %>%
  count(Race_Ethnicity) %>%
  mutate(
    Percent = round(n / sum(n) * 100, 1),
    Label = paste0(Percent, "%")
  )

library(knitr)

kable(race_ethnicity_summary,
      col.names = c("Race + Ethnicity", "Count", "Percent of Charges", "Label"),
      caption = "Table: Distribution of Charges by Combined Race and Ethnicity",
      align = "lrrr")
Table: Distribution of Charges by Combined Race and Ethnicity
Race + Ethnicity Count Percent of Charges Label
Black - Black 7 1.8 1.8%
Black - Hispanic 3 0.8 0.8%
Black - NA 21 5.5 5.5%
Black - Not Hispanic 77 20.0 20%
Hispanic, Black - Hispanic 2 0.5 0.5%
Hispanic, Unspfd BL/WH - Hispanic 7 1.8 1.8%
Hispanic, Unspfd BL/WH - Not Hispanic 3 0.8 0.8%
Hispanic, White - Hispanic 20 5.2 5.2%
NA - Black 6 1.6 1.6%
NA - Hispanic 12 3.1 3.1%
NA - NA 23 6.0 6%
NA - Not Hispanic 146 37.9 37.9%
NA - White 13 3.4 3.4%
White - Black 1 0.3 0.3%
White - NA 9 2.3 2.3%
White - Not Hispanic 6 1.6 1.6%
White - White 29 7.5 7.5%

In conclusion, to test for racial disparities we will need to use the Arrest data because it is more complete.

#Arrest Data: Are there Racial Disparities in Enforcement of DiD?

Unfortunately, although the Arrest data is more complete, it has very few racial categories: just Black or White (with White including Hispanic origin).

kable(arrest_race_table, caption = "Race Categories in Arrests")
Race Categories in Arrests
Race Category Count
Black 185
White (incl. Hispanic origin) 171
NA 11
library(ggplot2)

race_summary_arr <- Arrest %>%
  count(Race) %>%
  mutate(Percent = round(n / sum(n) * 100, 1))

ggplot(race_summary_arr, aes(x = reorder(Race, n), y = n, fill = Race)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.3) +
  labs(title = "Arrests by Race", x = "Race", y = "Count") +
  theme_minimal() +
  theme(legend.position = "none",
  plot.title = element_text(hjust = 0.5)
  )

library(dplyr)
library(stringr)

# 2. Adjust the County names in the Race_county data to match the other datasets
 Race_county_ab <- Race_county_ab %>%
  mutate(
    County = str_replace(County, " County, NJ$", ""), # remove " County, NJ"
    County = str_trim(County)                         # remove extra spaces
  )
 
 head(Race_county_ab)
## # A tibble: 6 × 5
##   County   `County Code`  Year `Black only` `White only`
##   <chr>            <dbl> <dbl>        <dbl>        <dbl>
## 1 Atlantic         34001  2023        38094       159688
## 2 Atlantic         34001  2022        37060       164950
## 3 Atlantic         34001  2021        37629       168911
## 4 Atlantic         34001  2020        37622       167448
## 5 Atlantic         34001  2019        38907       174978
## 6 Atlantic         34001  2018        40319       173409
library(dplyr)
library(tidyr)

# --- Arrests by County-Year-Race ---
arrests_by_race_county_year <- Arrest %>%
  filter(Race %in% c("White (incl. Hispanic origin)", "Black")) %>%
  count(County, `Arrest Year`, Race, name = "Arrest_Count") %>%
  rename(Year = `Arrest Year`) %>%
  mutate(Race = case_when(
    Race == "White (incl. Hispanic origin)" ~ "White",
    Race == "Black" ~ "Black"
  ))

# --- Population by County-Year-Race ---
race_pop_county <- Race_county_ab %>%
  dplyr::select(County, Year, `White only`, `Black only`) %>%
  pivot_longer(
    cols = c(`White only`, `Black only`),
    names_to = "Race",
    values_to = "Population"
  ) %>%
  mutate(Race = case_when(
    Race == "White only" ~ "White",
    Race == "Black only" ~ "Black"
  ))

# --- Merge population with arrests ---
county_race_year <- race_pop_county %>%
  left_join(arrests_by_race_county_year, by = c("County", "Year", "Race")) %>%
  mutate(
    Arrest_Count = replace_na(Arrest_Count, 0),      # fill in 0 where missing
    Arrest_Rate  = (Arrest_Count / Population) * 1e5 # per 100,000
  )

CHI-SQUARE TEST

# Collapse to statewide totals
statewide_data <- county_race_year %>%
  group_by(Race) %>%
  summarise(
    Arrests    = sum(Arrest_Count, na.rm = TRUE),
    Population = sum(Population, na.rm = TRUE),
    .groups    = "drop"
  )

# Chi-square test: are arrests proportional to population?
chisq_result <- chisq.test(
  x = statewide_data$Arrests,
  p = statewide_data$Population / sum(statewide_data$Population)
)

chisq_result
## 
##  Chi-squared test for given probabilities
## 
## data:  statewide_data$Arrests
## X-squared = 271.16, df = 1, p-value < 2.2e-16

Null hypothesis (H₀): Arrests are distributed across White vs Black in proportion to their share of the statewide population.

Chi-squared statistic (X²) = 271.16 → This is very large, meaning the observed arrest distribution is very far from the expected distribution (if arrests followed population proportions).

p-value < 2.2e-16 → Essentially zero. Strong evidence against the null hypothesis.

Arrests do not match the racial composition of the statewide population. The difference is highly statistically significant — it’s extremely unlikely this disparity is due to chance. This means either White or Black residents are being arrested at disproportionately higher rates compared to their share of the population.

Next Step (to interpret direction of disparity)

statewide_data %>%
  mutate(Expected_Arrests = sum(Arrests) * (Population / sum(Population)))
## # A tibble: 2 × 4
##   Race  Arrests Population Expected_Arrests
##   <chr>   <int>      <dbl>            <dbl>
## 1 Black     168    8432720             55.9
## 2 White     157   40560005            269.

Expected arrests (if proportional to population): Black ≈ 56, White ≈ 269

Black people make up a much smaller share of the state's population than White people, but their observed arrests are three times the expected count (168 vs. 56).

The chi-square statistic is extremely large (X² = 271, p < 0.001), meaning the difference between observed and expected arrests is highly statistically significant.

There is strong evidence of racial disparity in arrests statewide. Black residents are disproportionately arrested relative to their population size, while White residents are underrepresented in arrest counts.
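To put a single effect size on the statewide gap, here is a small sketch using base R's poisson.test() to compare the two crude rates (arrests per summed person-years). From the table above the crude rate ratio works out to roughly 5 (Black vs. White); note that this ignores county clustering and year, which the models below account for.

# Sketch: crude statewide rate ratio (Black vs. White) with a 95% CI.
# Populations here are summed over county-years, i.e., person-years at risk.
black <- statewide_data[statewide_data$Race == "Black", ]
white <- statewide_data[statewide_data$Race == "White", ]

poisson.test(x = c(black$Arrests, white$Arrests),
             T = c(black$Population, white$Population))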

#Modeling Race as a predictor

#Poisson Model

I am going to compare several models for fit before interpreting the results. The first is a mixed-effects Poisson regression.

county_race_year <- county_race_year %>%
  mutate(Year_c = Year - 2017)  # set first year as baseline

library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
county_race_year <- county_race_year %>%
  mutate(Race = factor(Race, levels = c("Black", "White")))

# Mixed-effects Poisson regression of arrest counts
model_mixed <- glmer(
  Arrest_Count ~ Race + Year_c + (1 | County),
  offset = log(Population),
  family = poisson,
  data = county_race_year
)

summary(model_mixed)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County)
##    Data: county_race_year
##  Offset: log(Population)
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     869.4     884.1    -430.7     861.4       292 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3907 -0.7982 -0.3709 -0.1283  5.7161 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  County (Intercept) 2.263    1.504   
## Number of obs: 296, groups:  County, 21
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.76380    0.35796 -30.070  < 2e-16 ***
## RaceWhite    -2.18729    0.11823 -18.500  < 2e-16 ***
## Year_c       -0.11743    0.02739  -4.287 1.81e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) RacWht
## RaceWhite -0.148       
## Year_c    -0.192  0.008

Next, we will simulate residuals.

sim_res <- simulateResiduals(fittedModel = model_mixed)
plot(sim_res)

testDispersion(sim_res)

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.065475, p-value = 0.832
## alternative hypothesis: two.sided
testZeroInflation(sim_res)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 1.1949, p-value = 0.096
## alternative hypothesis: two.sided

testDispersion(sim_res): This tests whether the variance in the data is greater than expected under a Poisson model (Poisson assumes mean = variance). p-value = 0.832: the fitted model handles the variance well.

testZeroInflation(sim_res): This tests whether there are more zeros in the data than the model expects. p-value = 0.096; observed-to-expected zero ratio = 1.1949 (i.e., ~19% more zeros than expected), BUT this is not significant based on the p-value.

However, I am still going to run a couple of other models and compare AIC/BIC, especially because of the quantile deviation (KS test and residual-vs-predicted plot).

KS test (QQ plot): p = 0.00133, a significant deviation. Residuals vs. predicted: deviations in the upper quantiles (red bands outside the gray CI). Residuals deviate slightly from uniformity, especially in the upper predictions.

SUMMARY: We assessed model fit using simulated residual diagnostics from the DHARMa package. No evidence of overdispersion was observed (p = 0.832). A borderline non-significant test for zero-inflation (p = 0.096) and a significant quantile deviation (KS test p = 0.0013) suggest some modest model misfit.
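For reference, the KS (uniformity) and outlier statistics quoted above come from the panels of plot(sim_res); they can also be run as standalone DHARMa tests, and the same calls apply to the residual objects for the models fitted below.

# Standalone versions of the tests summarized in the DHARMa plot panels.
testUniformity(sim_res)  # KS test of residual uniformity
testOutliers(sim_res)    # outlier test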

#Negative Binomial model

NEXT MODEL = negative binomial model with the same structure (random intercept for County)

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
model_nb <- glmer.nb(
  Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population)),
  data = county_race_year
)
model_nb
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: Negative Binomial(0.5668)  ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population))
##    Data: county_race_year
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  717.6919  736.1437 -353.8460  707.6919       291 
## Random effects:
##  Groups Name        Std.Dev.
##  County (Intercept) 1.453   
## Number of obs: 296, groups:  County, 21
## Fixed Effects:
## (Intercept)    RaceWhite       Year_c  
##     -10.730       -2.069       -0.141
AIC(model_mixed, model_nb)
##             df      AIC
## model_mixed  4 869.3782
## model_nb     5 717.6919

The negative binomial model performs better than the Poisson model.
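Since the plan above was to check AIC and BIC, the BIC comparison is a one-liner as well (BIC() accepts the same fitted objects).

# BIC comparison to complement the AIC table above (lower is better).
BIC(model_mixed, model_nb)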

Now to plot the residuals….

sim_res2 <- simulateResiduals(fittedModel = model_nb)
plot(sim_res2)

KS Test (Kolmogorov-Smirnov):

p = 0.18672 → Not significant

Interpretation: The residual distribution does not deviate significantly from the expected uniform distribution.

Dispersion Test:

p = 0.296 → Not significant

Interpretation: No evidence of overdispersion

Outlier Test:

p = 0.94 → Not significant

Interpretation: No excessive outliers in residuals.

The NB model seems well-specified in terms of variance, dispersion, and distribution.

testDispersion(sim_res2)

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.058817, p-value = 0.296
## alternative hypothesis: two.sided
testZeroInflation(sim_res2)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 1.0163, p-value = 0.872
## alternative hypothesis: two.sided

Test: testZeroInflation(sim_res2) Result:

p-value = 0.872

Observed-to-expected zero ratio = 1.0163

The test indicates no significant zero-inflation.

Test: testDispersion(sim_res2) Result:

p-value = 0.296

Dispersion = 0.058817

No evidence of overdispersion in the residuals. The model appropriately captures variance in the data.

Model diagnostics were conducted using the DHARMa package in R to assess the fit of the Negative Binomial model. Simulated quantile residuals demonstrated no evidence of systematic deviation, with a non-significant Kolmogorov-Smirnov (KS) test (p = 0.187), indicating that the residual distribution did not differ significantly from the expected uniform distribution. The residuals-versus-predicted plot showed no apparent structure or heteroskedasticity, and the 95% quantile simulation bands fell within acceptable limits, suggesting an adequate model fit.

Tests for overdispersion and outliers confirmed that the model handled variance appropriately. The nonparametric dispersion test yielded a p-value of 0.296, indicating no evidence of residual overdispersion. Additionally, the outlier test was non-significant (p = 0.94), supporting the absence of extreme or influential residuals.

A zero-inflation test was also conducted to determine whether the model underestimated the number of zeros in the data. Results indicated no excess zero-inflation, with a p-value of 0.872 and an observed-to-expected zero ratio of 1.016. The observed number of zeros was well within the simulated distribution under the null hypothesis.

Taken together, these diagnostics indicate that the Negative Binomial model is appropriately specified, with no need for additional correction for overdispersion or zero-inflation. The model can be reliably used for inference regarding treatment effects and associations in the observed count data.

#Zero-Inflated Negative Binomial versus Negative Binomial

We could stop here, but I decided to just go one step further and run a zero-inflated negative binomial model – although all indicators suggest it is unnecessary.

library(glmmTMB)
model_zi <- glmmTMB(
  Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population)),
  ziformula = ~1,
  family = nbinom2,
  data = county_race_year
)
AIC(model_nb, model_zi)
##          df      AIC
## model_nb  5 717.6919
## model_zi  6 718.0269

The negative binomial model without zero-inflation fits the data slightly better according to AIC. There is no evidence that a zero-inflated model meaningfully improves fit — suggesting no significant excess zeros beyond what the negative binomial already accounts for.

Just to be certain, I will run the DHARMa tests.

sim_res3 <- simulateResiduals(fittedModel = model_zi)
plot(sim_res3)

testDispersion(sim_res3)

## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.022091, p-value = 0.296
## alternative hypothesis: two.sided
testZeroInflation(sim_res3)

## 
##  DHARMa zero-inflation test via comparison to expected zeros with
##  simulation under H0 = fitted model
## 
## data:  simulationOutput
## ratioObsSim = 1.0055, p-value = 0.984
## alternative hypothesis: two.sided

DHARMa Diagnostics for ZINB Model
KS Test (QQ plot): No deviation from the expected residual distribution (p = 0.734).
Dispersion Test: No overdispersion detected (p = 0.296).
Outlier Test: No influential outliers detected (p = 0.700).
Residual vs. Predicted Plot: No visible heteroskedasticity or systematic trends.

Zero-Inflation Test: observed-to-expected zeros ratio = 1.0055. No significant zero inflation (p = 0.984).

Even with a zero-inflation component explicitly modeled, the number of observed zeros is almost identical to what the model expects.

This suggests that zero-inflation may not be necessary — the NB model already handled this well.

Dispersion Test: dispersion = 0.022, p = 0.296.

SUMMARY: Model diagnostics for the Zero-Inflated Negative Binomial (ZINB) model indicated excellent fit. Simulated quantile residuals followed a uniform distribution, confirmed by a non-significant Kolmogorov-Smirnov (KS) test (p = 0.734), and residuals were symmetrically distributed around model predictions. There was no evidence of overdispersion (p = 0.296) or extreme outliers (p = 0.700). Zero-inflation diagnostics revealed no statistically significant deviation between observed and expected zeros (p = 0.984; ratio = 1.0055), suggesting that the inclusion of a zero-inflation component was not strictly necessary.

Although the Zero-Inflated Negative Binomial (ZINB) model demonstrated excellent residual diagnostics and no evidence of overdispersion or zero-inflation, comparison of model fit using Akaike Information Criterion (AIC) favored the simpler Negative Binomial (NB) model (AIC = 717.69 vs. 718.03). However, the AIC difference (ΔAIC = 0.34) is well below the conventional threshold of 2, indicating that the two models perform equivalently in terms of fit.
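A small sketch that turns the ΔAIC of about 0.34 into Akaike weights makes the "essentially equivalent" point explicit; the weights come out at roughly 0.54 for the NB model and 0.46 for the ZINB model.

# Sketch: delta-AIC and approximate Akaike weights for the NB vs. ZINB comparison.
aics  <- AIC(model_nb, model_zi)$AIC
delta <- aics - min(aics)
round(exp(-0.5 * delta) / sum(exp(-0.5 * delta)), 3)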

Given that the NB model achieves nearly identical performance with fewer parameters and is more parsimonious, it may be preferred for final interpretation and reporting. The ZINB model did not offer meaningful improvement in fit or capture excess zeros beyond what the NB model already accounted for.

#Negative Binomial Model Results

So, the mixed-effects negative binomial model (random intercept for County) is the best specification for arrest counts by county/race/year.

Now to get the results from that model for interpretation.

model_nb
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: Negative Binomial(0.5668)  ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population))
##    Data: county_race_year
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  717.6919  736.1437 -353.8460  707.6919       291 
## Random effects:
##  Groups Name        Std.Dev.
##  County (Intercept) 1.453   
## Number of obs: 296, groups:  County, 21
## Fixed Effects:
## (Intercept)    RaceWhite       Year_c  
##     -10.730       -2.069       -0.141
exp(fixef(model_nb))
##  (Intercept)    RaceWhite       Year_c 
## 2.188191e-05 1.263374e-01 8.684646e-01
summary(model_nb)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: Negative Binomial(0.5668)  ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population))
##    Data: county_race_year
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     717.7     736.1    -353.8     707.7       291 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.7183 -0.5071 -0.3238 -0.1009  4.6955 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  County (Intercept) 2.111    1.453   
## Number of obs: 296, groups:  County, 21
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.72985    0.39541 -27.136   <2e-16 ***
## RaceWhite    -2.06880    0.23259  -8.895   <2e-16 ***
## Year_c       -0.14103    0.05896  -2.392   0.0168 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) RacWht
## RaceWhite -0.236       
## Year_c    -0.377 -0.087
library(broom.mixed)
tidy(model_nb, effects = "fixed", conf.int = TRUE, conf.method = "Wald", exponentiate = TRUE)
## # A tibble: 3 × 8
##   effect term         estimate  std.error statistic   p.value conf.low conf.high
##   <chr>  <chr>           <dbl>      <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 fixed  (Intercept) 0.0000219 0.00000865    -27.1  3.72e-162  1.01e-5 0.0000475
## 2 fixed  RaceWhite   0.126     0.0294         -8.89 5.86e- 19  8.01e-2 0.199    
## 3 fixed  Year_c      0.868     0.0512         -2.39 1.68e-  2  7.74e-1 0.975

A generalized linear mixed model with a negative binomial distribution was estimated to examine racial disparities in arrest counts under N.J.S.A. 2C:35-9a. The outcome variable was the number of arrests per county, year, and racial group (Black or White), and the model included a log-offset for the race-specific population in each county-year to ensure estimates reflected per-capita arrest rates. The model also included a fixed effect for centered year (to account for time trends) and a random intercept for county (to account for baseline differences in arrest rates across counties). After adjusting for population size (of each racial demographic), county-level differences, and temporal trends, White residents had an arrest rate approximately 87.4% lower than that of Black residents (IRR = 0.126, 95% CI [0.080, 0.199], p < 0.001). In addition, arrest rates declined over time, with each additional year associated with a 13% reduction in arrest rate (IRR = 0.868, 95% CI [0.774, 0.975], p = 0.017). These results suggest strong and statistically significant racial disparities in the enforcement of drug-induced death laws, with Black individuals arrested at substantially higher rates than White individuals, even after accounting for differences in population size and location.
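As a final translation of the coefficients into more intuitive units, here is a small sketch of model-implied arrest rates per 100,000 residents in a "typical" county (random intercept at its mean of zero) for the baseline year 2017. From the fixed effects above these come out at roughly 2.2 per 100,000 for Black residents versus about 0.3 per 100,000 for White residents.

# Sketch: model-implied arrest rates per 100,000 in a typical county (random
# intercept = 0), baseline year 2017 (Year_c = 0).
beta <- fixef(model_nb)
round(1e5 * c(Black = exp(unname(beta["(Intercept)"])),
              White = exp(unname(beta["(Intercept)"] + beta["RaceWhite"]))), 2)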

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

There are 4 datasets: 1. Convictions = this is a dataset from the New Jersey State Parole Board with numerical and demographic information on people sentenced pursuant to N.J.S.A. 2C:35-9a, strict-liability drug-induced death.

dates: August 5, 1988, through June 23, 2025.

variables: Sex, Race, Ethnicity, Date of Sentence, Begin Serve, Min Offense Term, Max Offense Term, County

codebook: none provided

n=385

The following limitations were provided by the Board: Be advised that this report was compiled within the limits of information-gathering technology of the Parole Board Information System and may not include every existing conviction pursuant to N.J.S.A. 2C:35-9a. Be further advised that information related to race/ethnicity is reported based on data provided by the New Jersey Department of Corrections and cannot be verified.

Unknown was changed to NA

  2. Arrests = raw data from the NJ AG's arrest portal: arrests for N.J.S.A. 2C:35-9a strict-liability drug-induced death from 2017 through 2024, by county and race. They have collapsed White and Hispanic into one category, so it has only three racial categories: White (including Hispanic), Black, and unknown.

dates: 2017 through 2024

variables: ID, County, Arrest Year, Arrest Month, Race, Age Range, Gender, Offense Statute, Offense Category, Arrest source, Status

n=373

codebook: ?

Unknown & Not Provided were changed to NA

  3. Overdose = CDC WONDER data, yearly counts of overdose deaths by county

dates: 1999-2023

variables: Notes, County, County Code, Year, Year Code, Deaths, Population, Crude Rate

crude rate calculated per 100,000

N= 529

Data source limitations: a big limitation here is that the CDC suppresses data if the county has fewer than 15 deaths to report. We are in the process of requesting the unsuppressed data.

• About revised 2019 deaths: On March 11, 2021, the 2019 mortality data on CDC WONDER was updated with the 2019 mortality data updated by NCHS on March 4, 2021 to include corrected information for residents of Texas affecting 5 records previously coded to cause code U01.4, Terrorism involving firearms (homicide). The underlying and multiple cause of death codes for 5 records were corrected in the 2019 data. Underlying and multiple cause of death codes for those 5 records were recoded to Assault (homicide) by other and unspecified firearm discharge, ICD-10 code X95. The corrected final death records replaces the data released on December 22, 2020.

• About revised 2014 deaths: A revised data file for 2014 was released on April 3, 2017 to include corrections affecting 125 deaths previously coded to “Accidental discharge of firearms” (ICD-10 codes W32-W34). This corrected file replaced the file released on December 9, 2015. The underlying cause of death changed for 125 deaths in 2014 that occurred in Tennessee (80%) and Massachusetts (20%). (Note: Data by state of residence are also impacted in Connecticut, Florida, Georgia, Kentucky, North Carolina and Virginia for records where the death occurred in Tennessee or Massachusetts but the decedent was a resident of another state.) Prior to the revision, the underlying cause of death for all 125 deaths were classified as “Accidental discharge of firearms” (ICD-10 codes W32-W34). After the revision, 50% of the changed records are now classified as “Assault (homicide) by discharge of firearms” (ICD-10 codes U01.4, X93-X95); 42% are now classified as “Intentional self-harm (suicide) by discharge of firearms” (ICD-10 codes X72-X74); 4% of the revised death records are now classified as “Discharge of firearms, undetermined intent” (ICD-10 codes Y22-Y24); and the remaining 4% to various other causes. For more information, please refer to Deaths: Final Data for 2014.

Query Parameters for 1999-2020: ICD-10 Codes: X40 (Accidental poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics); X41 (Accidental poisoning by and exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic drugs, not elsewhere classified); X42 (Accidental poisoning by and exposure to narcotics and psychodysleptics [hallucinogens], not elsewhere classified); X43 (Accidental poisoning by and exposure to other drugs acting on the autonomic nervous system); X44 (Accidental poisoning by and exposure to other and unspecified drugs, medicaments and biological substances); X60 (Intentional self-poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics); X61 (Intentional self-poisoning by and exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic drugs, not elsewhere classified); X62 (Intentional self-poisoning by and exposure to narcotics and psychodysleptics [hallucinogens], not elsewhere classified); X63 (Intentional self-poisoning by and exposure to other drugs acting on the autonomic nervous system); X64 (Intentional self-poisoning by and exposure to other and unspecified drugs, medicaments and biological substances); X85 (Assault by drugs, medicaments and biological substances); Y10 (Poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics, undetermined intent); Y11 (Poisoning by and exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic drugs, not elsewhere classified, undetermined intent); Y12 (Poisoning by and exposure to narcotics and psychodysleptics [hallucinogens], not elsewhere classified, undetermined intent); Y13 (Poisoning by and exposure to other drugs acting on the autonomic nervous system, undetermined intent); Y14 (Poisoning by and exposure to other and unspecified drugs, medicaments and biological substances, undetermined intent)

Query parameters for 2021-2023 = used location of incident (not residence of decedent) and followed the same definition of overdose death as NVSS (X40-44 [unintentional self-harm/injury], X60-64 [intentional self-harm/suicide], X85 [assault], Y10-14 [undetermined intent]).

  4. Racial Demographics of the Population =

dates: 1989-2023

1989 is from here: https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/comp8090.htm

1990-2000 https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/cest9199.html

2000-2010—under county estimates/intercensal county estimates/2000-2010/by race sex and Hispanic origin https://www.nj.gov/labor/labormarketinformation/demographics/population-household-estimates/index.shtml

2010-2023: https://data.census.gov/table/ACSDT5Y2019.B01001A?t=Race+and+Ethnicity&g=040XX00US34$0500000&moe=false

Notes: For 1989, they categorize race into only 3 categories: ‘white’, ‘black’, and ‘other races’.

For 1990-1999 they separated race data into ‘Persons Not of Hispanic Origin’–white, black, ai/an, asian or pacific islander and ‘Persons of Hispanic Origin (Latinos)’–white, black, ai/an, asian or pacific islander (same categories).

Data was retrieved from the New Jersey Department of Labor and Workforce Development's Population and Household Estimates intercensal population estimates.

I also found this language at the bottom of one of the 1990-2000 data:

“Source: US Bureau of the Census, Population Estimates Branch. Prepared by: New Jersey Department of Labor, Division of Labor Market & Demographic Research, 4/02”

Not included in the analysis below, but available = a. statewide monthly CDC overdose death data: Statewide_data_month_Underlying Cause of Death, 2018-2023, Single Race (2).csv

  b. monthly county overdose data: Underlying Cause of Death, 1999-2020, Single Race.csv; too much data was suppressed for this dataset to be useful; we will revisit this after we get the unsuppressed data from the CDC.