DIH April 2026

library(googlesheets4)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(scales)
library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(DT)
library(ggplot2)

DIH_DATA <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1OS4-QPBoffGKsxfqPzW3LyLRoxga2j3wLYPPxXtnUHw/edit?usp=sharing",
  col_types = "c"
)

## ℹ Suitable tokens found in the cache, associated with these emails:

## • 'katie.mccreedy17@gmail.com'

## • 'mccreedy.k@husky.neu.edu'

##   Defaulting to the first email.

## ! Using an auto-discovered, cached token.

##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.

##   See gargle's "Non-interactive auth" vignette for more details:

##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>

## ℹ The googlesheets4 package is using a cached token for
##   'katie.mccreedy17@gmail.com'.

## ✔ Reading from "DIH Data 2026".

## ✔ Range 'Data'.

only run this after refreshing dataset from google sheets

#DIH_DATA <- DIH_DATA %>% # select(-Sentence Quartile)

#cleaning + vars

#make sentence quart var
DIH_DATA <- DIH_DATA %>%
  mutate(
    Sentence_Quartile_calc = ntile(Sentence, 4)
  )

# numeric
DIH_DATA$Year_Charged  <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code    <- as.numeric(DIH_DATA$Rural_Code)

DIH_DATA$Accused_Age  <- as.numeric(DIH_DATA$Accused_Age)

## Warning: NAs introduced by coercion

DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)

## Warning: NAs introduced by coercion

DIH_DATA <- DIH_DATA %>%
  filter(`Date Incident` >= as.Date("1910-01-01") | is.na(`Date Incident`))

DIH_DATA <- DIH_DATA %>%
  mutate(
    Plea = na_if(Plea, "NA"),
    Plea = case_when(
      str_detect(tolower(Plea), "guilty") & !str_detect(tolower(Plea), "lesser") ~ "Guilty",
      str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
      str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
      str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
      str_detect(tolower(Plea), "pending") ~ "Pending/Awaiting",
      Plea %in% c("State", "X", "Other") ~ NA_character_,
      TRUE ~ Plea
    ),
    Plea = case_when(
      Plea == "Judge ruled it was not murder" ~ NA_character_,
      Plea == "Pleads not guily" ~ "Guilty",
      Plea == "Pled guity to DIH charges" ~ "Guilty",
      TRUE ~ Plea
    )
  )

# substance cleaning
DIH_DATA <- DIH_DATA %>%
  mutate(
    Substance = tolower(Substance),
    Substance = na_if(Substance, "na"),
    Substance_clean = case_when(
      str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
      str_detect(Substance, "fentanyl") ~ "Fentanyl",
      str_detect(Substance, "heroin") ~ "Heroin",
      str_detect(Substance, "meth") ~ "Methamphetamine",
      str_detect(Substance, "cocaine") ~ "Cocaine",
      str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
      str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
      TRUE ~ "Other"
    )
  )

# analytic sample
dih_analytic <- DIH_DATA %>%
  filter(Year_Charged >= 2011 & Year_Charged <= 2021) %>%
  filter(!is.na(Rural_Code))

# county distribution
county_dist <- dih_analytic %>%
  distinct(County_Fips, Rural_Code) %>%
  count(Rural_Code) %>%
  mutate(county_pct = n / sum(n) * 100)

# DIH distribution
dih_dist <- dih_analytic %>%
  count(Rural_Code) %>%
  mutate(dih_pct = n / sum(n) * 100)

# build Table 2
table2 <- county_dist %>%
  left_join(dih_dist, by = "Rural_Code") %>%
  arrange(Rural_Code) %>%
  mutate(
    Classification = case_when(
      Rural_Code == 1 ~ "Metro (large)",
      Rural_Code == 2 ~ "Metro (medium)",
      Rural_Code == 3 ~ "Metro (small)",
      Rural_Code == 4 ~ "Suburban (large adjacent)",
      Rural_Code == 5 ~ "Suburban (small adjacent)",
      Rural_Code == 6 ~ "Suburban (non-adjacent)",
      Rural_Code == 7 ~ "Rural (large)",
      Rural_Code == 8 ~ "Rural (medium)",
      Rural_Code == 9 ~ "Rural (small/isolated)"
    ),
    Counties_pct = round(county_pct, 2),
    DIH_pct = round(dih_pct, 2)
  ) %>%
  select(Rural_Code, Classification, Counties_pct, DIH_pct)

# optional display
table1(~ Counties_pct + DIH_pct | factor(Rural_Code), data = table2)

	1 (N=1)	2 (N=1)	3 (N=1)	4 (N=1)	5 (N=1)	6 (N=1)	7 (N=1)	8 (N=1)	9 (N=1)	Overall (N=9)
Counties_pct
Mean (SD)	28.4 (NA)	20.1 (NA)	15.7 (NA)	9.99 (NA)	2.59 (NA)	11.0 (NA)	4.28 (NA)	4.80 (NA)	3.11 (NA)	11.1 (8.84)
Median [Min, Max]	28.4 [28.4, 28.4]	20.1 [20.1, 20.1]	15.7 [15.7, 15.7]	9.99 [9.99, 9.99]	2.59 [2.59, 2.59]	11.0 [11.0, 11.0]	4.28 [4.28, 4.28]	4.80 [4.80, 4.80]	3.11 [3.11, 3.11]	9.99 [2.59, 28.4]
DIH_pct
Mean (SD)	39.4 (NA)	25.9 (NA)	14.7 (NA)	7.13 (NA)	1.20 (NA)	6.14 (NA)	2.29 (NA)	1.94 (NA)	1.20 (NA)	11.1 (13.4)
Median [Min, Max]	39.4 [39.4, 39.4]	25.9 [25.9, 25.9]	14.7 [14.7, 14.7]	7.13 [7.13, 7.13]	1.20 [1.20, 1.20]	6.14 [6.14, 6.14]	2.29 [2.29, 2.29]	1.94 [1.94, 1.94]	1.20 [1.20, 1.20]	6.14 [1.20, 39.4]

# view
table2

## # A tibble: 9 × 4
##   Rural_Code Classification            Counties_pct DIH_pct
##        <dbl> <chr>                            <dbl>   <dbl>
## 1          1 Metro (large)                    28.4    39.4 
## 2          2 Metro (medium)                   20.1    25.9 
## 3          3 Metro (small)                    15.7    14.7 
## 4          4 Suburban (large adjacent)         9.99    7.13
## 5          5 Suburban (small adjacent)         2.59    1.2 
## 6          6 Suburban (non-adjacent)          11.0     6.14
## 7          7 Rural (large)                     4.28    2.29
## 8          8 Rural (medium)                    4.8     1.94
## 9          9 Rural (small/isolated)            3.11    1.2

# ===== hist =====
# Cases over time
DIH_DATA %>%
  filter(!is.na(`Date Incident`)) %>%
  count(`Date Incident`) %>%
  ggplot(aes(x = `Date Incident`, y = n)) +
  geom_line() +
  labs(title = "Number of Cases Over Time", x = "Date", y = "Count")

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Bar charts
ggplot(DIH_DATA, aes(x = Accused_Ethnicity)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(DIH_DATA, aes(x = Accused_Sex)) +
  geom_bar()

ggplot(DIH_DATA, aes(x = Accused_Race)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Accused age density
ggplot(DIH_DATA, aes(x = Accused_Age)) +
  geom_density()

## Warning: Removed 2222 rows containing non-finite outside the scale range
## (`stat_density()`).

# Deceased characteristics
ggplot(DIH_DATA, aes(x = Deceased_Race)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(DIH_DATA, aes(x = Deceased_Ethnicity)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(DIH_DATA, aes(x = Deceased_Sex)) +
  geom_bar()

# Rural code
ggplot(DIH_DATA, aes(x = factor(Rural_Code))) +
  geom_bar()

# Plea
ggplot(DIH_DATA, aes(x = Plea)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ===== Deceased Age Distribution =====
ggplot(DIH_DATA, aes(x = Deceased_Age)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of Deceased Age",
    x = "Age",
    y = "Count"
  )

## Warning: Removed 544 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(DIH_DATA, aes(x = Deceased_Age)) +
  geom_density() +
  labs(
    title = "Density of Deceased Age",
    x = "Age",
    y = "Density"
  )

## Warning: Removed 544 rows containing non-finite outside the scale range
## (`stat_density()`).

# ===== Urbanicity Distribution =====

DIH_DATA <- DIH_DATA %>%
  mutate(
    Urbanicity = case_when(
      Rural_Code %in% 1:3 ~ "Metropolitan",
      Rural_Code %in% 4:6 ~ "Suburban",
      Rural_Code %in% 7:9 ~ "Rural",
      TRUE ~ NA_character_
    )
  )


ggplot(DIH_DATA %>% filter(!is.na(Urbanicity)),
       aes(x = Urbanicity)) +
  geom_bar() +
  labs(
    title = "Distribution of Urbanicity",
    x = "Urbanicity",
    y = "Count"
  )

# ===== Urbanicity Proportion =====
ggplot(DIH_DATA %>% filter(!is.na(Urbanicity)),
       aes(x = Urbanicity)) +
  geom_bar(aes(y = after_stat(prop), group = 1)) +
  labs(
    title = "Proportion of Urbanicity",
    x = "Urbanicity",
    y = "Proportion"
  )

# ===== State Distribution =====
ggplot(DIH_DATA %>% filter(!is.na(State)),
       aes(x = State)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution of States",
    x = "State",
    y = "Count"
  )

# ===== State Proportion =====
ggplot(DIH_DATA %>% filter(!is.na(State)),
       aes(x = State)) +
  geom_bar(aes(y = after_stat(prop), group = 1)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Proportion of States",
    x = "State",
    y = "Proportion"
  )

# ===== Rural Code Distribution =====
ggplot(DIH_DATA %>% filter(!is.na(Rural_Code)),
       aes(x = factor(Rural_Code))) +
  geom_bar() +
  labs(
    title = "Distribution of Rural Code",
    x = "RUCC",
    y = "Count"
  )

# ===== Rural Code Proportion =====
ggplot(DIH_DATA %>% filter(!is.na(Rural_Code)),
       aes(x = factor(Rural_Code))) +
  geom_bar(aes(y = after_stat(prop), group = 1)) +
  labs(
    title = "Proportion of Rural Code",
    x = "RUCC",
    y = "Proportion"
  )

DIH April 2026

Katie McCreedy

2026-04-16

only run this after refreshing dataset from google sheets