library(readxl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DHARMa)
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
Convictions <- read_excel("Strict Liability spread sheet.xlsx")
Arrest <- read_excel("NJ DIH Arrest data 2017-2024.xlsx")
Overdose <-Underlying_Cause_of_Death_1999_2023 <- read_csv("Underlying Cause of Death 1999-2023.csv",
col_types = cols(Notes = col_skip(),
Year = col_integer(), `Year Code` = col_integer(),
Population = col_integer()))
Race_county_ab <- read_excel("NJ County race demographics_2017-2023_white_black.xlsx",
col_types = c("skip", "text", "numeric",
"numeric", "numeric", "numeric"))
Arrest <- Arrest %>%
mutate(
Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
)
Convictions <- Convictions %>%
mutate(
Race = ifelse(Race %in% c("Unknown", "Not Provided"), NA, Race)
)
Convictions <- Convictions %>%
mutate(
Ethnicity = ifelse(Ethnicity %in% c("Unknown", "Not Provided", "NA", ""), NA, Ethnicity)
)
# In setup chunk
make_data_dictionary <- function(df, n_examples = 3) {
data.frame(
Variable = names(df),
Class = sapply(df, function(x) paste(class(x), collapse = ", ")),
Example = sapply(df, function(x) {
ex <- unique(x[!is.na(x)])
ex <- head(ex, n_examples)
paste(ex, collapse = ", ")
}),
stringsAsFactors = FALSE
)
}
# Create Conviction Year Variable in
Convictions$Sentence.Date <- as.Date(Convictions$`Date of Sentence`)
Convictions$Sentence.Year <- as.numeric(format(Convictions$Sentence.Date, "%Y"))
#Data Structure for Datasets
This provides us with the name of each variable in each dataset and what their structure is.
# Data dictionary for Overdose
knitr::kable(make_data_dictionary(Overdose),
caption = "Overdose Data Dictionary")
| Variable | Class | Example | |
|---|---|---|---|
| County | County | character | Atlantic County, NJ, Bergen County, NJ, Burlington County, NJ |
| County Code | County Code | numeric | 34001, 34003, 34005 |
| Year | Year | integer | 1999, 2000, 2001 |
| Year Code | Year Code | integer | 1999, 2000, 2001 |
| Deaths | Deaths | character | 35, 40, 33 |
| Population | Population | integer | 250432, 880225, 420542 |
| Crude Rate | Crude Rate | character | 14, 4.5, 7.8 |
# Data dictionary for Convictions
knitr::kable(make_data_dictionary(Convictions),
caption = "Convictions Data Dictionary")
| Variable | Class | Example | |
|---|---|---|---|
| Sex | Sex | character | Male, Female |
| Race | Race | character | White, Black, Hispanic, White |
| Ethnicity | Ethnicity | character | White, Black, Hispanic |
| Date of Sentence | Date of Sentence | POSIXct, POSIXt | 1988-08-05, 1988-11-18, 1989-01-26 |
| Begin Serve | Begin Serve | POSIXct, POSIXt | 1988-08-05, 1988-11-18, 1989-01-26 |
| Min Offense Term | Min Offense Term | character | 0, Years Months Days, 5 Years |
| Max Offense Term | Max Offense Term | character | 10 Years, 7 Years, 15 Years |
| County | County | character | Burlington, Hudson, Atlantic |
| Sentence.Date | Sentence.Date | Date | 1988-08-05, 1988-11-18, 1989-01-26 |
| Sentence.Year | Sentence.Year | numeric | 1988, 1989, 1991 |
# Data dictionary for Arrest
knitr::kable(make_data_dictionary(Arrest),
caption = "Arrest Data Dictionary")
| Variable | Class | Example | |
|---|---|---|---|
| ID | ID | character | XXX3H7XYDB, XXX3H98D8H, XXX7DH3OYE |
| County | County | character | Bergen, Morris, Ocean |
| Arrest Year | Arrest Year | numeric | 2024, 2023, 2022 |
| Arrest Month | Arrest Month | character | February, July, April |
| Race | Race | character | White (incl. Hispanic origin), Black |
| Age Range | Age Range | character | 35-44, 25-34, 18-24 |
| Gender | Gender | character | Male, Female |
| Offense Statute | Offense Statute | character | 2C:35-9A, 2C:35-9 |
| Offense Category | Offense Category | character | Drugs |
| Arrest source | Arrest source | character | Warrant, Summons, Other |
| Status | Status | character | Pending, Dismissed Charges, Guilty |
Although the following tables give general data structure info – note that they do not capture missingness where the missingness is noted by something other than “NA”… so missingness is explored later in the code after I correct that.
library(skimr)
skim(Overdose)
| Name | Overdose |
| Number of rows | 525 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| County | 0 | 1 | 16 | 21 | 0 | 21 | 0 |
| Deaths | 0 | 1 | 2 | 10 | 0 | 175 | 0 |
| Crude Rate | 2 | 1 | 1 | 10 | 0 | 249 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| County Code | 0 | 1 | 34021.0 | 12.12 | 34001 | 34011 | 34021 | 34031 | 34041 | ▇▆▆▆▆ |
| Year | 0 | 1 | 2011.0 | 7.22 | 1999 | 2005 | 2011 | 2017 | 2023 | ▇▇▇▇▇ |
| Year Code | 0 | 1 | 2011.0 | 7.22 | 1999 | 2005 | 2011 | 2017 | 2023 | ▇▇▇▇▇ |
| Population | 0 | 1 | 419484.7 | 250725.93 | 62385 | 150508 | 447560 | 597943 | 957736 | ▇▅▇▃▃ |
library(skimr)
skim(Arrest)
| Name | Arrest |
| Number of rows | 367 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ID | 0 | 1.00 | 10 | 10 | 0 | 210 | 0 |
| County | 0 | 1.00 | 5 | 25 | 0 | 21 | 0 |
| Arrest Month | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
| Race | 11 | 0.97 | 5 | 29 | 0 | 2 | 0 |
| Age Range | 0 | 1.00 | 3 | 5 | 0 | 7 | 0 |
| Gender | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| Offense Statute | 0 | 1.00 | 7 | 8 | 0 | 2 | 0 |
| Offense Category | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
| Arrest source | 0 | 1.00 | 5 | 7 | 0 | 3 | 0 |
| Status | 0 | 1.00 | 5 | 17 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Arrest Year | 0 | 1 | 2019.69 | 2.06 | 2017 | 2018 | 2019 | 2021 | 2024 | ▇▅▆▂▃ |
library(skimr)
skim(Convictions)
| Name | Convictions |
| Number of rows | 385 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| Date | 1 |
| numeric | 1 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Sex | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| Race | 0 | 1.00 | 3 | 22 | 0 | 6 | 0 |
| Ethnicity | 53 | 0.86 | 5 | 12 | 0 | 4 | 0 |
| Min Offense Term | 0 | 1.00 | 1 | 25 | 0 | 44 | 0 |
| Max Offense Term | 0 | 1.00 | 7 | 16 | 0 | 19 | 0 |
| County | 0 | 1.00 | 5 | 10 | 0 | 20 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Sentence.Date | 0 | 1 | 1988-08-05 | 2025-04-25 | 2020-01-30 | 159 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Sentence.Year | 0 | 1 | 2016.18 | 9.36 | 1988 | 2016 | 2020 | 2023 | 2025 | ▁▁▁▂▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Date of Sentence | 0 | 1.00 | 1988-08-05 | 2025-04-25 | 2020-01-30 | 159 |
| Begin Serve | 6 | 0.98 | 1988-08-05 | 2025-04-25 | 2019-12-20 | 153 |
#Missingness in Data ##Racial Categories Across Datasets
library(dplyr)
library(knitr)
# Arrest race categories
arrest_race_table <- Arrest %>%
count(Race, sort = TRUE) %>%
rename(`Race Category` = Race, Count = n)
kable(arrest_race_table, caption = "Race Categories in Arrests")
| Race Category | Count |
|---|---|
| Black | 185 |
| White (incl. Hispanic origin) | 171 |
| NA | 11 |
# Convictions race categories
convictions_race_table <- Convictions %>%
count(Race, sort = TRUE) %>%
rename(`Race Category` = Race, Count = n)
kable(convictions_race_table, caption = "Race Categories in Convictions")
| Race Category | Count |
|---|---|
| N/A | 200 |
| Black | 108 |
| White | 45 |
| Hispanic, White | 20 |
| Hispanic, Unspfd BL/WH | 10 |
| Hispanic, Black | 2 |
One of the issues with this data is that it includes White/Hispanic in one category. They do not provide us with White - Non-Hispanic, which would allow us to really see how they are treating white people. Consider saying we need more information here in the brief, because I suspect that Hispanic persons may be disproprionately charged as well and we won’t be able to see that. And, we also won’t be able to see the discrepancy between non-white/white charging rates.
##Missingness in Convictions Race Data: Year? County?
52% of cases are missing a racial category in the Convictions dataset. I next test if this missingness is time-dependent, i.e. if for e.g. they did not do a good job of capturing race in the past (like the 1990s). And, that is not the case… it seems like it is relevant even for 2025…
library(dplyr)
library(knitr)
# Step 1: Convert "N/A" to proper NA in Race
Convictions <- Convictions %>%
mutate(Race = ifelse(Race %in% c("N/A", "", "NA"), NA, Race))
# Step 2: Count missing vs non-missing by Sentence.Year
race_missing_by_year <- Convictions %>%
group_by(Sentence.Year) %>%
summarise(
Missing = sum(is.na(Race)),
Non_Missing = sum(!is.na(Race)),
Total = n(),
.groups = "drop"
) %>%
mutate(Missing_Pct = round((Missing / Total) * 100, 1))
# Step 3: Print as a nice table
kable(race_missing_by_year,
caption = "Missing Race Data in Convictions by Year")
| Sentence.Year | Missing | Non_Missing | Total | Missing_Pct |
|---|---|---|---|---|
| 1988 | 0 | 3 | 3 | 0.0 |
| 1989 | 0 | 2 | 2 | 0.0 |
| 1991 | 0 | 4 | 4 | 0.0 |
| 1994 | 0 | 6 | 6 | 0.0 |
| 1995 | 0 | 7 | 7 | 0.0 |
| 1996 | 0 | 4 | 4 | 0.0 |
| 1997 | 0 | 3 | 3 | 0.0 |
| 1998 | 0 | 6 | 6 | 0.0 |
| 1999 | 0 | 2 | 2 | 0.0 |
| 2000 | 0 | 7 | 7 | 0.0 |
| 2001 | 0 | 2 | 2 | 0.0 |
| 2002 | 2 | 8 | 10 | 20.0 |
| 2003 | 1 | 1 | 2 | 50.0 |
| 2004 | 7 | 3 | 10 | 70.0 |
| 2005 | 0 | 2 | 2 | 0.0 |
| 2006 | 2 | 2 | 4 | 50.0 |
| 2007 | 3 | 0 | 3 | 100.0 |
| 2009 | 1 | 0 | 1 | 100.0 |
| 2010 | 1 | 0 | 1 | 100.0 |
| 2011 | 4 | 3 | 7 | 57.1 |
| 2013 | 4 | 0 | 4 | 100.0 |
| 2014 | 2 | 0 | 2 | 100.0 |
| 2015 | 1 | 0 | 1 | 100.0 |
| 2016 | 10 | 4 | 14 | 71.4 |
| 2017 | 11 | 12 | 23 | 47.8 |
| 2018 | 13 | 5 | 18 | 72.2 |
| 2019 | 25 | 15 | 40 | 62.5 |
| 2020 | 17 | 5 | 22 | 77.3 |
| 2021 | 16 | 12 | 28 | 57.1 |
| 2022 | 27 | 23 | 50 | 54.0 |
| 2023 | 14 | 33 | 47 | 29.8 |
| 2024 | 27 | 11 | 38 | 71.1 |
| 2025 | 12 | 0 | 12 | 100.0 |
COUNTY: Convictions: Race: Missingness
This finding may be something you want to include in your brief. It looks like certain counties are rountinely not collecting race data for DiD charges.
Moving forward: For us to look to see if racial disparities are occurring in the enforcement of DiD, we will need to use the Arrest data and cannot use the Convictions data, but before we completely put the Convictions Dataset aside, let’s look at the ethnicity variable.
#Ethnicity Data in Convictions Dataset
Before we getting into testing racial disparaties – we need to see if the ethnicity data can add to our analysis.
For completions sake, I am going to look at the Convictions data, although we already know that more than half of the race data in the convictions dataset is missing.
# Ethnicity summary
ethnicity_summary <- Convictions %>%
count(Ethnicity) %>%
mutate(Percent = paste0(round(n / sum(n) * 100, 1), "%"))
kable(ethnicity_summary, col.names = c("Ethnicity", "Count", "Percent"),
caption = "Table: Ethnicity Distribution in Convictions", align = "lrr")
| Ethnicity | Count | Percent |
|---|---|---|
| Black | 14 | 3.6% |
| Hispanic | 44 | 11.4% |
| Not Hispanic | 232 | 60.3% |
| White | 42 | 10.9% |
| NA | 53 | 13.8% |
What is interesting in looking at the Convictions dataset is that while Race was missing for 52% of the dataset, Ethnicity was only missing for 13.8% of the sample. This suggests that the Race data was not provided to us.
I did do an analysis of the data that we do have and found sentencing disparities BUT it is useless because 52% of the data is missing and it is clearly linked to County, so it is not random.
Conclusion I think that you can argue in your brief that we need more data to determine both whether there are racial disparities in conviction and sentencing. I believe you said that the sentencing argument wasn’t going to be used in the current brief but I can provide you with it, if you need it to argue that we do not have enough data to determine whether or not sentencing disparaties exist by race – but that data that we do have, suggests that there are sentencing disparities by race.
In case this is useful for you to show issues with data collection and reporting, I have created a new variable called Race and Ethnicity from the Convictions data. For e.g., they report Not Hispanic for 146 cases but report NA for race….
# Combine Race and Ethnicity into one grouping column
library(dplyr)
# Add column for Race/Ethnicity
Convictions <- Convictions %>%
mutate(Race_Ethnicity = paste(Race, Ethnicity, sep = " - "))
race_ethnicity_summary <- Convictions %>%
filter(!is.na(Race_Ethnicity)) %>%
count(Race_Ethnicity) %>%
mutate(
Percent = round(n / sum(n) * 100, 1),
Label = paste0(Percent, "%")
)
library(knitr)
kable(race_ethnicity_summary,
col.names = c("Race + Ethnicity", "Count", "Percent of Charges", "Label"),
caption = "Table: Distribution of Charges by Combined Race and Ethnicity",
align = "lrrr")
| Race + Ethnicity | Count | Percent of Charges | Label |
|---|---|---|---|
| Black - Black | 7 | 1.8 | 1.8% |
| Black - Hispanic | 3 | 0.8 | 0.8% |
| Black - NA | 21 | 5.5 | 5.5% |
| Black - Not Hispanic | 77 | 20.0 | 20% |
| Hispanic, Black - Hispanic | 2 | 0.5 | 0.5% |
| Hispanic, Unspfd BL/WH - Hispanic | 7 | 1.8 | 1.8% |
| Hispanic, Unspfd BL/WH - Not Hispanic | 3 | 0.8 | 0.8% |
| Hispanic, White - Hispanic | 20 | 5.2 | 5.2% |
| NA - Black | 6 | 1.6 | 1.6% |
| NA - Hispanic | 12 | 3.1 | 3.1% |
| NA - NA | 23 | 6.0 | 6% |
| NA - Not Hispanic | 146 | 37.9 | 37.9% |
| NA - White | 13 | 3.4 | 3.4% |
| White - Black | 1 | 0.3 | 0.3% |
| White - NA | 9 | 2.3 | 2.3% |
| White - Not Hispanic | 6 | 1.6 | 1.6% |
| White - White | 29 | 7.5 | 7.5% |
In Conclusion to test for racial disparities, we will need to use the Arrest data because it is more complete.
#Arrest Data: Are there Racial Disparities in Enforcement of DiD?
Unfortunately, although the Arrest data is more complete, it has very few racial categories. Just Black or White.
kable(arrest_race_table, caption = "Race Categories in Arrests")
| Race Category | Count |
|---|---|
| Black | 185 |
| White (incl. Hispanic origin) | 171 |
| NA | 11 |
library(ggplot2)
race_summary_arr <- Arrest %>%
count(Race) %>%
mutate(Percent = round(n / sum(n) * 100, 1))
ggplot(race_summary_arr, aes(x = reorder(Race, n), y = n, fill = Race)) +
geom_col() +
geom_text(aes(label = n), vjust = -0.3) +
labs(title = "Arrests by Race", x = "Race", y = "Count") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5)
)
library(dplyr)
library(stringr)
# 2. Adjust the County names in the Race_county data to match the other datasets
Race_county_ab <- Race_county_ab %>%
mutate(
County = str_replace(County, " County, NJ$", ""), # remove " County, NJ"
County = str_trim(County) # remove extra spaces
)
head(Race_county_ab)
## # A tibble: 6 × 5
## County `County Code` Year `Black only` `White only`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Atlantic 34001 2023 38094 159688
## 2 Atlantic 34001 2022 37060 164950
## 3 Atlantic 34001 2021 37629 168911
## 4 Atlantic 34001 2020 37622 167448
## 5 Atlantic 34001 2019 38907 174978
## 6 Atlantic 34001 2018 40319 173409
library(dplyr)
library(tidyr)
# --- Arrests by County-Year-Race ---
arrests_by_race_county_year <- Arrest %>%
filter(Race %in% c("White (incl. Hispanic origin)", "Black")) %>%
count(County, `Arrest Year`, Race, name = "Arrest_Count") %>%
rename(Year = `Arrest Year`) %>%
mutate(Race = case_when(
Race == "White (incl. Hispanic origin)" ~ "White",
Race == "Black" ~ "Black"
))
# --- Population by County-Year-Race ---
race_pop_county <- Race_county_ab %>%
dplyr::select(County, Year, `White only`, `Black only`) %>%
pivot_longer(
cols = c(`White only`, `Black only`),
names_to = "Race",
values_to = "Population"
) %>%
mutate(Race = case_when(
Race == "White only" ~ "White",
Race == "Black only" ~ "Black"
))
# --- Merge population with arrests ---
county_race_year <- race_pop_county %>%
left_join(arrests_by_race_county_year, by = c("County", "Year", "Race")) %>%
mutate(
Arrest_Count = replace_na(Arrest_Count, 0), # fill in 0 where missing
Arrest_Rate = (Arrest_Count / Population) * 1e5 # per 100,000
)
CHI-SQUARE TEST
# Collapse to statewide totals
statewide_data <- county_race_year %>%
group_by(Race) %>%
summarise(
Arrests = sum(Arrest_Count, na.rm = TRUE),
Population = sum(Population, na.rm = TRUE),
.groups = "drop"
)
# Chi-square test: are arrests proportional to population?
chisq_result <- chisq.test(
x = statewide_data$Arrests,
p = statewide_data$Population / sum(statewide_data$Population)
)
chisq_result
##
## Chi-squared test for given probabilities
##
## data: statewide_data$Arrests
## X-squared = 271.16, df = 1, p-value < 2.2e-16
Null hypothesis (H₀): Arrests are distributed across White vs Black in proportion to their share of the statewide population.
Chi-squared statistic (X²) = 271.16 → This is very large, meaning the observed arrest distribution is very far from the expected distribution (if arrests followed population proportions).
p-value < 2.2e-16 → Essentially zero. Strong evidence against the null hypothesis.
Arrests do not match the racial composition of the statewide population. The difference is highly statistically significant — it’s extremely unlikely this disparity is due to chance. This means either White or Black residents are being arrested at disproportionately higher rates compared to their share of the population.
Next Step (to interpret direction of disparity)
statewide_data %>%
mutate(Expected_Arrests = sum(Arrests) * (Population / sum(Population)))
## # A tibble: 2 × 4
## Race Arrests Population Expected_Arrests
## <chr> <int> <dbl> <dbl>
## 1 Black 168 8432720 55.9
## 2 White 157 40560005 269.
Expected arrests (if proportional to population): Black ≈ 56, White ≈ 269
Black people make up a much smaller share of the state’s population than White people, but their observed arrests are three times higher than expected (168 vs. 56).
The chi-square statistic is extremely large (X² = 271, p < 0.001), meaning the difference between observed and expected arrests is highly statistically significant
There is strong evidence of racial disparity in arrests statewide. Black residents are disproportionately arrested relative to their population size, while White residents are underrepresented in arrest counts.
#Modeling Race as a predictor
#Poisson Model
I am going to compare several model for model fit before I interpret the results. First is a mixed-effects poisson regression.
county_race_year <- county_race_year %>%
mutate(Year_c = Year - 2017) # set first year as baseline
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
county_race_year <- county_race_year %>%
mutate(Race = factor(Race, levels = c("Black", "White")))
# Mixed-effects Poisson regression of arrest counts
model_mixed <- glmer(
Arrest_Count ~ Race + Year_c + (1 | County),
offset = log(Population),
family = poisson,
data = county_race_year
)
summary(model_mixed)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County)
## Data: county_race_year
## Offset: log(Population)
##
## AIC BIC logLik -2*log(L) df.resid
## 869.4 884.1 -430.7 861.4 292
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3907 -0.7982 -0.3709 -0.1283 5.7161
##
## Random effects:
## Groups Name Variance Std.Dev.
## County (Intercept) 2.263 1.504
## Number of obs: 296, groups: County, 21
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.76380 0.35796 -30.070 < 2e-16 ***
## RaceWhite -2.18729 0.11823 -18.500 < 2e-16 ***
## Year_c -0.11743 0.02739 -4.287 1.81e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) RacWht
## RaceWhite -0.148
## Year_c -0.192 0.008
Next, we will simulate residuals.
sim_res <- simulateResiduals(fittedModel = model_mixed)
plot(sim_res)
testDispersion(sim_res)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.065475, p-value = 0.832
## alternative hypothesis: two.sided
testZeroInflation(sim_res)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 1.1949, p-value = 0.096
## alternative hypothesis: two.sided
testDispersion(sim_res) This tests whether variance in data is greater than expected under possion model ( Poisson assumes mean = variance). p-value = 0.832 The fitted model handles variance well.
testZeroInflation(sim_res) This tests whether there are more zeros in data than the model expects. p-value = 0.096 Observed-to-expected zero ratio: 1.1949 (i.e., ~19% more zeros than expected) BUT – this is not significant based on the p-value.
However, I am still going to run a couple other models and test AIC/BIC, especially because of the Quantile Deviation (KS test & residual vs predicted plot).
KS test (QQ plot): p = 0.00133 → significant deviation Residuals vs. predicted: Deviations in upper quantiles (red bands outside gray CI) Residuals deviate slightly from uniformity, especially in upper predictions.
SUMMARY: We assessed model fit using simulated residual diagnostics from the DHARMa package. No evidence of overdispersion was observed (p = 0.832). A borderline non-significant test for zero-inflation (p = 0.096) and a significant quantile deviation (KS test p = 0.0013) suggest some modest model misfit.
#Negative Binomial model
NEXT MODEL = negative binomial model – fixed effects = County
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model_nb <- glmer.nb(
Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population)),
data = county_race_year
)
model_nb
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: Negative Binomial(0.5668) ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population))
## Data: county_race_year
## AIC BIC logLik -2*log(L) df.resid
## 717.6919 736.1437 -353.8460 707.6919 291
## Random effects:
## Groups Name Std.Dev.
## County (Intercept) 1.453
## Number of obs: 296, groups: County, 21
## Fixed Effects:
## (Intercept) RaceWhite Year_c
## -10.730 -2.069 -0.141
AIC(model_mixed, model_nb)
## df AIC
## model_mixed 4 869.3782
## model_nb 5 717.6919
The negative bionomial model performs better than the poisson model.
Now to plot the residuals….
sim_res2 <- simulateResiduals(fittedModel = model_nb)
plot(sim_res2)
KS Test (Kolmogorov-Smirnov):
p = 0.18672 → Not significant
Interpretation: The residual distribution does not deviate significantly from the expected uniform distribution.
Dispersion Test:
p = 0.296 → Not significant
Interpretation: No evidence of overdispersion
Outlier Test:
p = 0.94 → Not significant
Interpretation: No excessive outliers in residuals.
The NB model seems well-specified in terms of variance, dispersion, and distribution.
testDispersion(sim_res2)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.058817, p-value = 0.296
## alternative hypothesis: two.sided
testZeroInflation(sim_res2)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 1.0163, p-value = 0.872
## alternative hypothesis: two.sided
Test: testZeroInflation(sim_res) Result:
p-value = 0.872
Observed-to-expected zero ratio = 1.0163
The test indicates no significant zero-inflation.
Test: testDispersion(sim_res) Result:
p-value = 0.296
Dispersion = 0.058817
No evidence of overdispersion in the residuals. The model appropriately captures variance in the data.
Model diagnostics were conducted using the DHARMa package in R to assess the fit of the Negative Binomial model. Simulated quantile residuals demonstrated no evidence of systematic deviation, with a non-significant Kolmogorov-Smirnov (KS) test (p = 0.187), indicating that the residual distribution did not differ significantly from the expected uniform distribution. The residuals-versus-predicted plot showed no apparent structure or heteroskedasticity, and the 95% quantile simulation bands fell within acceptable limits, suggesting an adequate model fit.
Tests for overdispersion and outliers confirmed that the model handled variance appropriately. The nonparametric dispersion test yielded a p-value of 0.296, indicating no evidence of residual overdispersion. Additionally, the outlier test was non-significant (p = 0.94), supporting the absence of extreme or influential residuals.
A zero-inflation test was also conducted to determine whether the model underestimated the number of zeros in the data. Results indicated no excess zero-inflation, with a p-value of 0.872 and an observed-to-expected zero ratio of 1.016. The observed number of zeros was well within the simulated distribution under the null hypothesis.
Taken together, these diagnostics indicate that the Negative Binomial model is appropriately specified, with no need for additional correction for overdispersion or zero-inflation. The model can be reliably used for inference regarding treatment effects and associations in the observed count data.
#Zero-Inflated Negative Binomial versus Negative Binomial
We could stop here, but I decided to just go one step further and run a zero-inflated negative binomial model – although all indicators suggest it is unnecessary.
library(glmmTMB)
model_zi <- glmmTMB(
Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population)),
ziformula = ~1,
family = nbinom2,
data = county_race_year
)
AIC(model_nb, model_zi)
## df AIC
## model_nb 5 717.6919
## model_zi 6 718.0269
The negative binomial model without zero-inflation fits the data slightly better according to AIC. There is no evidence that a zero-inflated model meaningfully improves fit — suggesting no significant excess zeros beyond what the negative binomial already accounts for.
Just to be certain I will run the Dharma tests.
sim_res3 <- simulateResiduals(fittedModel = model_zi)
plot(sim_res3)
testDispersion(sim_res3)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.022091, p-value = 0.296
## alternative hypothesis: two.sided
testZeroInflation(sim_res3)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = 1.0055, p-value = 0.984
## alternative hypothesis: two.sided
DHARMa Diagnostics for ZINB Model KS Test (QQ plot): No deviation from expected residual distribution. p = 0.734 Dispersion Test: No overdispersion detected. p = 0.296 Outlier Test: No influential outliers detected. p = 0.700 Residual vs. Predicted Plot: No visible heteroskedasticity or systematic trends.
Zero-Inflation Test Observed-to-expected zeros ratio = 1.0055 No significant zero inflation. (p = 0.984)
Even with a zero-inflation component explicitly modeled, the number of observed zeros is almost identical to what the model expects.
This suggests that zero-inflation may not be necessary — the NB model already handled this well.
Dispersion Test Dispersion = 0.022, p = 0.296
SUMMARY: Model diagnostics for the Zero-Inflated Negative Binomial (ZINB) model indicated excellent fit. Simulated quantile residuals followed a uniform distribution, confirmed by a non-significant Kolmogorov-Smirnov (KS) test (p = 0.734), and residuals were symmetrically distributed around model predictions. There was no evidence of overdispersion (p = 0.296) or extreme outliers (p = 0.700).Zero-inflation diagnostics revealed no statistically significant deviation between observed and expected zeros (p = 0.984; ratio = 1.0055), suggesting that the inclusion of a zero-inflation component was not strictly necessary.
Although the Zero-Inflated Negative Binomial (ZINB) model demonstrated excellent residual diagnostics and no evidence of overdispersion or zero-inflation, comparison of model fit using Akaike Information Criterion (AIC) favored the simpler Negative Binomial (NB) model (AIC = 717.69 vs. 718.03). However, the AIC difference (ΔAIC = 0.34) is well below the conventional threshold of 2, indicating that the two models perform equivalently in terms of fit.
Given that the NB model achieves nearly identical performance with fewer parameters and is more parsimonious, it may be preferred for final interpretation and reporting. The ZINB model did not offer meaningful improvement in fit or capture excess zeros beyond what the NB model already accounted for.
#Negative Binomial Fixed Effect Model Results So, negative binomial fixed effect model is best specification for arrest counts by county/race/year.
Now to get the results from that model for interpretation.
model_nb
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: Negative Binomial(0.5668) ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population))
## Data: county_race_year
## AIC BIC logLik -2*log(L) df.resid
## 717.6919 736.1437 -353.8460 707.6919 291
## Random effects:
## Groups Name Std.Dev.
## County (Intercept) 1.453
## Number of obs: 296, groups: County, 21
## Fixed Effects:
## (Intercept) RaceWhite Year_c
## -10.730 -2.069 -0.141
exp(fixef(model_nb))
## (Intercept) RaceWhite Year_c
## 2.188191e-05 1.263374e-01 8.684646e-01
summary(model_nb)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: Negative Binomial(0.5668) ( log )
## Formula: Arrest_Count ~ Race + Year_c + (1 | County) + offset(log(Population))
## Data: county_race_year
##
## AIC BIC logLik -2*log(L) df.resid
## 717.7 736.1 -353.8 707.7 291
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.7183 -0.5071 -0.3238 -0.1009 4.6955
##
## Random effects:
## Groups Name Variance Std.Dev.
## County (Intercept) 2.111 1.453
## Number of obs: 296, groups: County, 21
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.72985 0.39541 -27.136 <2e-16 ***
## RaceWhite -2.06880 0.23259 -8.895 <2e-16 ***
## Year_c -0.14103 0.05896 -2.392 0.0168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) RacWht
## RaceWhite -0.236
## Year_c -0.377 -0.087
library(broom.mixed)
tidy(model_nb, effects = "fixed", conf.int = TRUE, conf.method = "Wald", exponentiate = TRUE)
## # A tibble: 3 × 8
## effect term estimate std.error statistic p.value conf.low conf.high
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed (Intercept) 0.0000219 0.00000865 -27.1 3.72e-162 1.01e-5 0.0000475
## 2 fixed RaceWhite 0.126 0.0294 -8.89 5.86e- 19 8.01e-2 0.199
## 3 fixed Year_c 0.868 0.0512 -2.39 1.68e- 2 7.74e-1 0.975
A generalized linear mixed model with a negative binomial distribution was estimated to examine racial disparities in arrest counts under N.J.S.A. 2C:35‑9a. The outcome variable was the number of arrests per county, year, and racial group (Black or White), and the model included a log-offset for the race-specific population in each county-year to ensure estimates reflected per-capita arrest rates. The model also included a fixed effect for centered year (to account for time trends) and a random intercept for county (to account for baseline differences in arrest rates across counties). After adjusting for population size, county-level differences, and temporal trends, the results indicated that White residents had a significantly lower arrest rate than Black residents. White residents had an arrest rate approximately 87.4% lower than Black residents (IRR = 0.126, 95% CI 0.080–0.199, p < 0.001), after adjusting for population size (of each racial demographic), time trends, and county‑level variation. In addition, arrest rates declined over time, with each additional year associated with a 13% reduction in arrest rate (IRR = 0.868, 95% CI [0.774, 0.975], p = 0.017). These results suggest strong and statistically significant racial disparities in the enforcement of drug-induced death laws, with Black individuals arrested at substantially higher rates than white individuals, even after accounting for differences in population size and location.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are 4 datasets: 1. Convictions = this a dataset from the New Jersey State Parole Board with numerical and demographic information of people sentenced pursuant to N.J.S.A. 2C:35-9a strict-liability drug-induced death.
dates: August 5, 1988, through June 23, 2025.
variables: Sex, Race, Ethnicity, Date of Sentence, Begin Serve, Min Offense Term, Max Offense Term, County
codebook: none provided
n=385
The following limitations were provided by the Board: Be advised that this report was compiled within the limits of information-gathering technology of the Parole Board Information System and may not include every existing conviction pursuant to N.J.S.A. 2C:35-9a. Be further advised that information related to race/ethnicity is reported based on data provided by the New Jersey Department of Corrections and cannot be verified.
Unknown was changed to NA
dates: 2017 through 2024
variables: ID, County, Arrest Year, Arrest Month, Race, Age Range, Gender, Offense Statute, Offense Category, Arrest source, Status
n=373
codebook: ?
Unknown & Not Provided were changed to NA
dates: 1999- 2023 variables: Notes, County, County Code, Year, Year Code, Deaths, Population, Crude Rate
crude rate calculated per 100,000
N= 529
Datasource limitations: a big limitation here is that CDC supresses data if the county has less than 15 deaths to report. We are in the process of requesting the unsupressed data.
• About revised 2019 deaths: On March 11, 2021, the 2019 mortality data on CDC WONDER was updated with the 2019 mortality data updated by NCHS on March 4, 2021 to include corrected information for residents of Texas affecting 5 records previously coded to cause code U01.4, Terrorism involving firearms (homicide). The underlying and multiple cause of death codes for 5 records were corrected in the 2019 data. Underlying and multiple cause of death codes for those 5 records were recoded to Assault (homicide) by other and unspecified firearm discharge, ICD-10 code X95. The corrected final death records replaces the data released on December 22, 2020. • About revised 2014 deaths: A revised data file for 2014 was released on April 3, 2017 to include corrections affecting 125 deaths previously coded to “Accidental discharge of firearms” (ICD-10 codes W32-W34). This corrected file replaced the file released on December 9, 2015. The underlying cause of death changed for 125 deaths in 2014 that occurred in Tennessee (80%) and Massachusetts (20%). (Note: Data by state of residence are also impacted in Connecticut, Florida, Georgia, Kentucky, North Carolina and Virginia for records where the death occurred in Tennessee or Massachusetts but the decedent was a resident of another state.) Prior to the revision, the underlying cause of death for all 125 deaths were classified as “Accidental discharge of firearms” (ICD-10 codes W32-W34). After the revision, 50% of the changed records are now classified as “Assault (homicide) by discharge of firearms” (ICD-10 codes U01.4, X93-X95); 42% are now classified as “Intentional self-harm (suicide) by discharge of firearms (ICD-10 codes X72-X74); 4% of the revised death records are now classified as”Discharge of firearms, undetermined intent” (ICD-10 codes Y22-Y24); and the remaining 4% to various other causes. For more information, please refer to Deaths: Final Data for 2014.
Query Parameters for 1999-2020: ICD-10 Codes: X40 (Accidental
poisoning by and exposure to nonopioid analgesics, antipyretics and
antirheumatics); X41 (Accidental poisoning by and exposure to
antiepileptic, sedative-hypnotic, antiparkinsonism and psychotropic
drugs, not elsewhere classified); X42 (Accidental poisoning by and
exposure to narcotics and psychodysleptics [hallucinogens], not
elsewhere classified); X43 (Accidental poisoning by and exposure to
other drugs acting on the autonomic nervous system); X44(Accidental
poisoning by and exposure to other and unspecified drugs, medicaments
and biological substances); X60 (Intentional
self-poisoning by and exposure to nonopioid analgesics, antipyretics and
antirheumatics); X61 (Intentional self-poisoning by and
exposure to antiepileptic, sedative-hypnotic, antiparkinsonism and
psychotropic drugs, not elsewhere classified); X62
(Intentional self-poisoning by and exposure to narcotics and
psychodysleptics [hallucinogens], not elsewhere classified); X63
(Intentional self-poisoning by and exposure to other drugs acting on the
autonomic nervous system); X64 (Intentional
self-poisoning by and exposure to other and unspecified drugs,
medicaments and biological substances); X85 (Assault by drugs,
medicaments and biological substances); Y10 (Poisoning by and exposure
to nonopioid analgesics, antipyretics and antirheumatics,
undetermined intent); Y11 (Poisoning by and exposure to antiepileptic,
sedative-hypnotic, antiparkinsonism and psychotropic
drugs, not elsewhere classified, undetermined intent); Y12 (Poisoning by
and exposure to narcotics and psychodysleptics
[hallucinogens], not elsewhere classified, undetermined intent); Y13
(Poisoning by and exposure to other drugs acting on the
autonomic nervous system, undetermined intent); Y14 (Poisoning by and
exposure to other and unspecified drugs, medicaments and
biological substances, undetermined intent)
Query parameters for 2021-2023 = used location of incident (not resident of decedent) and followed the same definition of overdose death as NVSS (X40-44 [unintentional self harm/injury], X60-64 [intentional self harm/suicide], X85 [assault], Y10-14 [undetermined intent]).
dates: 1989-2023
1989 is from here: https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/comp8090.htm
1990-2000 https://nj.gov/labor/labormarketinformation/assets/PDFs/dmograph/est/cest9199.html
2000-2010—under county estimates/intercensal county estimates/2000-2010/by race sex and Hispanic origin https://www.nj.gov/labor/labormarketinformation/demographics/population-household-estimates/index.shtml
Notes: For 1989, they categorize race into only 3 categories: ‘white’, ‘black’, and ‘other races’.
For 1990-1999 they separated race data into ‘Persons Not of Hispanic Origin’–white, black, ai/an, asian or pacific islander and ‘Persons of Hispanic Origin (Latinos)’–white, black, ai/an, asian or pacific islander (same categories).
Data was retrieved from the New Jersey Department of Labor and Workforce Development’s Population and Household Estimates intercensal population estimates.”
I also found this language at the bottom of one of the 1990-2000 data:
“Source: US Bureau of the Census, Population Estimates Branch. Prepared by: New Jersey Department of Labor, Division of Labor Market & Demographic Research, 4/02”
Not included in the analysis below, but available = a. statewide monthly CDC overdose death data: Statewide_data_month_Underlying Cause of Death, 2018-2023, Single Race (2).csv