library(googlesheets4)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(scales)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(ggplot2)
DIH_DATA <- read_sheet(
"https://docs.google.com/spreadsheets/d/1OS4-QPBoffGKsxfqPzW3LyLRoxga2j3wLYPPxXtnUHw/edit?usp=sharing",
col_types = "c"
)
## ℹ Suitable tokens found in the cache, associated with these emails:
## • 'katie.mccreedy17@gmail.com'
## • 'mccreedy.k@husky.neu.edu'
## Defaulting to the first email.
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for
## 'katie.mccreedy17@gmail.com'.
## ✔ Reading from "DIH Data 2026".
## ✔ Range 'Data'.
#DIH_DATA <- DIH_DATA %>% #
select(-Sentence Quartile)
#cleaning + vars
#make sentence quart var
DIH_DATA <- DIH_DATA %>%
mutate(
Sentence_Quartile_calc = ntile(Sentence, 4)
)
# numeric
DIH_DATA$Year_Charged <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code <- as.numeric(DIH_DATA$Rural_Code)
DIH_DATA$Accused_Age <- as.numeric(DIH_DATA$Accused_Age)
## Warning: NAs introduced by coercion
DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)
## Warning: NAs introduced by coercion
DIH_DATA <- DIH_DATA %>%
filter(`Date Incident` >= as.Date("1910-01-01") | is.na(`Date Incident`))
DIH_DATA <- DIH_DATA %>%
mutate(
Plea = na_if(Plea, "NA"),
Plea = case_when(
str_detect(tolower(Plea), "guilty") & !str_detect(tolower(Plea), "lesser") ~ "Guilty",
str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
str_detect(tolower(Plea), "pending") ~ "Pending/Awaiting",
Plea %in% c("State", "X", "Other") ~ NA_character_,
TRUE ~ Plea
),
Plea = case_when(
Plea == "Judge ruled it was not murder" ~ NA_character_,
Plea == "Pleads not guily" ~ "Guilty",
Plea == "Pled guity to DIH charges" ~ "Guilty",
TRUE ~ Plea
)
)
# substance cleaning
DIH_DATA <- DIH_DATA %>%
mutate(
Substance = tolower(Substance),
Substance = na_if(Substance, "na"),
Substance_clean = case_when(
str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
str_detect(Substance, "fentanyl") ~ "Fentanyl",
str_detect(Substance, "heroin") ~ "Heroin",
str_detect(Substance, "meth") ~ "Methamphetamine",
str_detect(Substance, "cocaine") ~ "Cocaine",
str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
TRUE ~ "Other"
)
)
# analytic sample
dih_analytic <- DIH_DATA %>%
filter(Year_Charged >= 2011 & Year_Charged <= 2021) %>%
filter(!is.na(Rural_Code))
# county distribution
county_dist <- dih_analytic %>%
distinct(County_Fips, Rural_Code) %>%
count(Rural_Code) %>%
mutate(county_pct = n / sum(n) * 100)
# DIH distribution
dih_dist <- dih_analytic %>%
count(Rural_Code) %>%
mutate(dih_pct = n / sum(n) * 100)
# build Table 2
table2 <- county_dist %>%
left_join(dih_dist, by = "Rural_Code") %>%
arrange(Rural_Code) %>%
mutate(
Classification = case_when(
Rural_Code == 1 ~ "Metro (large)",
Rural_Code == 2 ~ "Metro (medium)",
Rural_Code == 3 ~ "Metro (small)",
Rural_Code == 4 ~ "Suburban (large adjacent)",
Rural_Code == 5 ~ "Suburban (small adjacent)",
Rural_Code == 6 ~ "Suburban (non-adjacent)",
Rural_Code == 7 ~ "Rural (large)",
Rural_Code == 8 ~ "Rural (medium)",
Rural_Code == 9 ~ "Rural (small/isolated)"
),
Counties_pct = round(county_pct, 2),
DIH_pct = round(dih_pct, 2)
) %>%
select(Rural_Code, Classification, Counties_pct, DIH_pct)
# optional display
table1(~ Counties_pct + DIH_pct | factor(Rural_Code), data = table2)
| 1 (N=1) |
2 (N=1) |
3 (N=1) |
4 (N=1) |
5 (N=1) |
6 (N=1) |
7 (N=1) |
8 (N=1) |
9 (N=1) |
Overall (N=9) |
|
|---|---|---|---|---|---|---|---|---|---|---|
| Counties_pct | ||||||||||
| Mean (SD) | 28.4 (NA) | 20.1 (NA) | 15.7 (NA) | 9.99 (NA) | 2.59 (NA) | 11.0 (NA) | 4.28 (NA) | 4.80 (NA) | 3.11 (NA) | 11.1 (8.84) |
| Median [Min, Max] | 28.4 [28.4, 28.4] | 20.1 [20.1, 20.1] | 15.7 [15.7, 15.7] | 9.99 [9.99, 9.99] | 2.59 [2.59, 2.59] | 11.0 [11.0, 11.0] | 4.28 [4.28, 4.28] | 4.80 [4.80, 4.80] | 3.11 [3.11, 3.11] | 9.99 [2.59, 28.4] |
| DIH_pct | ||||||||||
| Mean (SD) | 39.4 (NA) | 25.9 (NA) | 14.7 (NA) | 7.13 (NA) | 1.20 (NA) | 6.14 (NA) | 2.29 (NA) | 1.94 (NA) | 1.20 (NA) | 11.1 (13.4) |
| Median [Min, Max] | 39.4 [39.4, 39.4] | 25.9 [25.9, 25.9] | 14.7 [14.7, 14.7] | 7.13 [7.13, 7.13] | 1.20 [1.20, 1.20] | 6.14 [6.14, 6.14] | 2.29 [2.29, 2.29] | 1.94 [1.94, 1.94] | 1.20 [1.20, 1.20] | 6.14 [1.20, 39.4] |
# view
table2
## # A tibble: 9 × 4
## Rural_Code Classification Counties_pct DIH_pct
## <dbl> <chr> <dbl> <dbl>
## 1 1 Metro (large) 28.4 39.4
## 2 2 Metro (medium) 20.1 25.9
## 3 3 Metro (small) 15.7 14.7
## 4 4 Suburban (large adjacent) 9.99 7.13
## 5 5 Suburban (small adjacent) 2.59 1.2
## 6 6 Suburban (non-adjacent) 11.0 6.14
## 7 7 Rural (large) 4.28 2.29
## 8 8 Rural (medium) 4.8 1.94
## 9 9 Rural (small/isolated) 3.11 1.2
# ===== hist =====
# Cases over time
DIH_DATA %>%
filter(!is.na(`Date Incident`)) %>%
count(`Date Incident`) %>%
ggplot(aes(x = `Date Incident`, y = n)) +
geom_line() +
labs(title = "Number of Cases Over Time", x = "Date", y = "Count")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
# Bar charts
ggplot(DIH_DATA, aes(x = Accused_Ethnicity)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(DIH_DATA, aes(x = Accused_Sex)) +
geom_bar()
ggplot(DIH_DATA, aes(x = Accused_Race)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Accused age density
ggplot(DIH_DATA, aes(x = Accused_Age)) +
geom_density()
## Warning: Removed 2222 rows containing non-finite outside the scale range
## (`stat_density()`).
# Deceased characteristics
ggplot(DIH_DATA, aes(x = Deceased_Race)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(DIH_DATA, aes(x = Deceased_Ethnicity)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(DIH_DATA, aes(x = Deceased_Sex)) +
geom_bar()
# Rural code
ggplot(DIH_DATA, aes(x = factor(Rural_Code))) +
geom_bar()
# Plea
ggplot(DIH_DATA, aes(x = Plea)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ===== Deceased Age Distribution =====
ggplot(DIH_DATA, aes(x = Deceased_Age)) +
geom_histogram(bins = 30) +
labs(
title = "Distribution of Deceased Age",
x = "Age",
y = "Count"
)
## Warning: Removed 544 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(DIH_DATA, aes(x = Deceased_Age)) +
geom_density() +
labs(
title = "Density of Deceased Age",
x = "Age",
y = "Density"
)
## Warning: Removed 544 rows containing non-finite outside the scale range
## (`stat_density()`).
# ===== Urbanicity Distribution =====
DIH_DATA <- DIH_DATA %>%
mutate(
Urbanicity = case_when(
Rural_Code %in% 1:3 ~ "Metropolitan",
Rural_Code %in% 4:6 ~ "Suburban",
Rural_Code %in% 7:9 ~ "Rural",
TRUE ~ NA_character_
)
)
ggplot(DIH_DATA %>% filter(!is.na(Urbanicity)),
aes(x = Urbanicity)) +
geom_bar() +
labs(
title = "Distribution of Urbanicity",
x = "Urbanicity",
y = "Count"
)
# ===== Urbanicity Proportion =====
ggplot(DIH_DATA %>% filter(!is.na(Urbanicity)),
aes(x = Urbanicity)) +
geom_bar(aes(y = after_stat(prop), group = 1)) +
labs(
title = "Proportion of Urbanicity",
x = "Urbanicity",
y = "Proportion"
)
# ===== State Distribution =====
ggplot(DIH_DATA %>% filter(!is.na(State)),
aes(x = State)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Distribution of States",
x = "State",
y = "Count"
)
# ===== State Proportion =====
ggplot(DIH_DATA %>% filter(!is.na(State)),
aes(x = State)) +
geom_bar(aes(y = after_stat(prop), group = 1)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Proportion of States",
x = "State",
y = "Proportion"
)
# ===== Rural Code Distribution =====
ggplot(DIH_DATA %>% filter(!is.na(Rural_Code)),
aes(x = factor(Rural_Code))) +
geom_bar() +
labs(
title = "Distribution of Rural Code",
x = "RUCC",
y = "Count"
)
# ===== Rural Code Proportion =====
ggplot(DIH_DATA %>% filter(!is.na(Rural_Code)),
aes(x = factor(Rural_Code))) +
geom_bar(aes(y = after_stat(prop), group = 1)) +
labs(
title = "Proportion of Rural Code",
x = "RUCC",
y = "Proportion"
)