#Data isnt full cleaned so loading everything as text and then will specify column type
library(readxl)
DIH_DATA <- read_excel("/Users/taleedel-sabawi/Library/CloudStorage/Dropbox/1-Research/Leo DIH Paper/DIH DATA.xlsx",
  sheet = "Clean Data",
   col_types = "text"
)

## New names:
## • `` -> `...1`

#Assigning columns manually to data type
DIH_DATA$Year_Charged  <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code    <- as.numeric(DIH_DATA$Rural_Code)
DIH_DATA$Sentence_Quartile <- as.numeric(DIH_DATA$Sentence_Quartile)

# Dates
#These first dates are in excel serial data structure...
DIH_DATA$`Date Incident` <- as.Date(as.numeric(DIH_DATA$`Date Incident`), origin = "1899-12-30")

## Warning in as.Date(as.numeric(DIH_DATA$`Date Incident`), origin =
## "1899-12-30"): NAs introduced by coercion

DIH_DATA$`Date Charge` <- as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30")

## Warning in as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30"):
## NAs introduced by coercion

# Ages
DIH_DATA$Accused_Age  <- as.numeric(DIH_DATA$Accused_Age)

## Warning: NAs introduced by coercion

DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)

## Warning: NAs introduced by coercion

DIH_DATA$Sentence_clean <- as.numeric(DIH_DATA$Sentence_clean)

#checking data structure
str(DIH_DATA)

## tibble [3,264 × 32] (S3: tbl_df/tbl/data.frame)
##  $ ...1              : chr [1:3264] "1678" "2622" "1514" "320" ...
##  $ URL               : chr [1:3264] "http://www.nydailynews.com/news/crime/pain-relieving-patch-kills-harlem-girl-6-foster-mother-charged-article-1.328165" "http://whdh.com/news/woman-boyfriend-charged-in-fatal-cocaine-overdose-of-ohio-boy-9/" "http://www.capitalgazette.com/news/for_the_record/ac-cn-delvalle-overdose-verdict-20180620-story.html" "http://www.thisweeknews.com/news/20180719/fourth-person-expected-to-be-charged-in-connection-with-grandview-ove"| __truncated__ ...
##  $ Court             : chr [1:3264] NA NA "Baltimore County Court" NA ...
##  $ Rural_Code        : num [1:3264] 1 2 1 1 1 1 1 5 7 2 ...
##  $ County_Fips       : chr [1:3264] "36061" "39099" "24005" "39049" ...
##  $ County_FullName   : chr [1:3264] "New York County" "Mahoning County" "Baltimore County" "Franklin County" ...
##  $ County_Name       : chr [1:3264] "New York" "Mahoning" "Baltimore" "Franklin" ...
##  $ State             : chr [1:3264] "New York" "Ohio" "Maryland" "Ohio" ...
##  $ Year_Charged      : num [1:3264] 2004 2018 2018 2018 2011 ...
##  $ Date Charge       : Date[1:3264], format: "2004-12-25" "2018-01-05" ...
##  $ Date Incident     : Date[1:3264], format: "2004-12-15" "2017-12-26" ...
##  $ Accused_Name      : chr [1:3264] "Joanne Alvarez" "Raenell Allen" "Jason Patton Baker" "Benjamin Bussey" ...
##  $ Sentence_clean    : num [1:3264] NA NA NA NA NA NA NA NA NA NA ...
##  $ Sentence          : chr [1:3264] NA NA NA NA ...
##  $ Sentence_Quartile : num [1:3264] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Charge            : chr [1:3264] "Reckless/Negligent Homicide" "Involuntary Manslaughter" "Manslaughter" "Involuntary Manslaughter" ...
##  $ Accused_Race      : chr [1:3264] NA "White" "White" "White" ...
##  $ Accused_Ethnicity : chr [1:3264] "Hispanic/Latino" NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
##  $ Accused_Sex       : chr [1:3264] "Female" "Female" "Male" "Male" ...
##  $ Accused_Age       : num [1:3264] NA NA 46 19 NA NA NA NA NA NA ...
##  $ Accused_City      : chr [1:3264] "East Harlem" "Youngstown" "Millersville" "Dublin" ...
##  $ Accused_State     : chr [1:3264] "New York" "Ohio" "Pennsylvania" "Ohio" ...
##  $ Court_Type        : chr [1:3264] "State" "State" "State" "State" ...
##  $ Plea              : chr [1:3264] "Guilty" NA "Not Guilty" "Guilty to a Lesser Charge" ...
##  $ Deceased_Name     : chr [1:3264] "Taylor Webster" NA "Josiah Christopher Klaes" "Haleah Myers" ...
##  $ Deceased_Race     : chr [1:3264] "White" NA "White" "White" ...
##  $ Deceased_Ethnicity: chr [1:3264] NA NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
##  $ Deceased_Age      : num [1:3264] 6 9 16 17 17 18 18 18 18 18 ...
##  $ Deceased_Sex      : chr [1:3264] "Female" "Male" "Male" "Female" ...
##  $ Substance         : chr [1:3264] "Fentanyl (Analog)" NA "Fentanyl (Analog)" "Fentanyl (Analog)" ...
##  $ Lawyer            : chr [1:3264] NA NA "do'connell@opd.state.md.us" NA ...
##  $ Relationship      : chr [1:3264] "Caretaker/Family/Friend/Partner/Co-User" "Caretaker/Family/Friend/Partner/Co-User" "Dealer/Buyer" "Caretaker/Family/Friend/Partner/Co-User" ...

#Looking to see what Sentence contains
sort(unique(DIH_DATA$Sentence))

##   [1] "0.1052511416"                   "0.16164383560000001"           
##   [3] "0.16666666669999999"            "0.21917808220000001"           
##   [5] "0.25"                           "0.32500000000000001"           
##   [7] "0.33300000000000002"            "0.33333333329999998"           
##   [9] "0.375"                          "0.4"                           
##  [11] "0.40500000000000003"            "0.41666666670000002"           
##  [13] "0.5"                            "0.57260273969999997"           
##  [15] "0.58333333330000003"            "0.66666666669999997"           
##  [17] "0.75"                           "0.82199999999999995"           
##  [19] "0.83333333330000003"            "0.91666666669999997"           
##  [21] "0.95833333330000003"            "1"                             
##  [23] "1 year"                         "1.002"                         
##  [25] "1.25"                           "1.3"                           
##  [27] "1.3333333329999999"             "1.4166666670000001"            
##  [29] "1.5"                            "1.5-3"                         
##  [31] "1.583"                          "1.66"                          
##  [33] "1.6666666670000001"             "1.7"                           
##  [35] "1.75"                           "1.8"                           
##  [37] "1.9166666670000001"             "1.917"                         
##  [39] "1.92"                           "1.97"                          
##  [41] "10"                             "10.16"                         
##  [43] "10.41"                          "10.5"                          
##  [45] "10.67"                          "10.8"                          
##  [47] "10.83"                          "10.83333333"                   
##  [49] "11"                             "11 years"                      
##  [51] "11.1"                           "11.16666667"                   
##  [53] "11.25"                          "11.5"                          
##  [55] "11.83"                          "11.9"                          
##  [57] "113"                            "12"                            
##  [59] "12 years"                       "12.25"                         
##  [61] "12.5"                           "12.75"                         
##  [63] "124"                            "13"                            
##  [65] "13.33"                          "13.5"                          
##  [67] "14"                             "14.5"                          
##  [69] "143"                            "15"                            
##  [71] "15.5"                           "15.67"                         
##  [73] "16"                             "16.5"                          
##  [75] "17"                             "17.5"                          
##  [77] "18"                             "18.25"                         
##  [79] "18.666666670000001"             "180"                           
##  [81] "19"                             "19.25"                         
##  [83] "19.583333329999999"             "2"                             
##  [85] "2.25"                           "2.3333333330000001"            
##  [87] "2.4"                            "2.5"                           
##  [89] "2.75"                           "20"                            
##  [91] "20.5"                           "20.833333329999999"            
##  [93] "21"                             "22"                            
##  [95] "22.5"                           "23"                            
##  [97] "24"                             "25"                            
##  [99] "25 years"                       "25 years to life"              
## [101] "25.5"                           "26"                            
## [103] "26.83"                          "27"                            
## [105] "27.083333329999999"             "28"                            
## [107] "29"                             "3"                             
## [109] "3 to 9"                         "3.08"                          
## [111] "3.0833333330000001"             "3.25"                          
## [113] "3.3"                            "3.33"                          
## [115] "3.3333333330000001"             "3.4"                           
## [117] "3.4166666669999999"             "3.5"                           
## [119] "3.6666666669999999"             "3.83"                          
## [121] "3.8333333330000001"             "3.92"                          
## [123] "30"                             "30.416666670000001"            
## [125] "31"                             "32"                            
## [127] "33"                             "33.25"                         
## [129] "34"                             "35"                            
## [131] "36"                             "37"                            
## [133] "38"                             "4"                             
## [135] "4.0833333329999997"             "4.17"                          
## [137] "4.25"                           "4.4166666670000003"            
## [139] "4.5"                            "4.6666666670000003"            
## [141] "4.75"                           "4.8"                           
## [143] "4.83"                           "4.8330000000000002"            
## [145] "4.8333333329999997"             "4.9000000000000004"            
## [147] "40"                             "42"                            
## [149] "46"                             "47"                            
## [151] "48"                             "48.33"                         
## [153] "5"                              "5 years"                       
## [155] "5.0599999999999996"             "5.25"                          
## [157] "5.5"                            "5.6666666670000003"            
## [159] "5.75"                           "5.83"                          
## [161] "5.9166666670000003"             "5.92"                          
## [163] "50"                             "52"                            
## [165] "54"                             "55"                            
## [167] "6"                              "6.0833333329999997"            
## [169] "6.1666666670000003"             "6.25"                          
## [171] "6.5"                            "6.67"                          
## [173] "6.7"                            "6.75"                          
## [175] "60"                             "62"                            
## [177] "63"                             "65"                            
## [179] "7"                              "7 to 14 years"                 
## [181] "7.25"                           "7.33"                          
## [183] "7.5"                            "7.6666666670000003"            
## [185] "72"                             "75"                            
## [187] "8"                              "8.08"                          
## [189] "8.2191780820000004E-2"          "8.3299999999999999E-2"         
## [191] "8.33"                           "8.3330000000000002"            
## [193] "8.3333333329999995E-2"          "8.3333333330000006"            
## [195] "8.5"                            "8.67"                          
## [197] "84"                             "9"                             
## [199] "9.33"                           "9.5"                           
## [201] "9.5-20"                         "9.9169999999999998"            
## [203] "90"                             "96"                            
## [205] "Face 40 years"                  "Faces 6 to 30 years"           
## [207] "Faces up to 30 years in prison" "Faces up to 40 years"          
## [209] "Life"                           "pending"                       
## [211] "Pending"                        "Up to 20 years"                
## [213] "Up to 4 years"                  "Up to 40 years"

@Leo how was this sentencing data gathered? The above “up to 40 years” do not sound like an actual sentence, but rather something a reporter said… please advise.

I have manually gone into the data and created a new variable called “Sentence_clean”

For now I will drop anything that says “Face” or “Up to” from the dataset because it seems they are erroneous.

In it, I changed “1 year” to 1 etc.. Made anything “Faces” or “Up to” as NA. Changed 1 instance of 25 years to life to 25. If there was a range then I took the average. For e.g. 1.5-3 years…2.25 years

I changed all “life” sentences to 100 years.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
summary(DIH_DATA$Sentence_clean)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##   0.08219   4.00000   8.00000  12.44014  15.00000 180.00000      1183

sum(!is.na(DIH_DATA$Sentence_clean))

## [1] 2081

What this shows is that not all of these are actually sentences, but what the person was “facing” aka the longest sentence that could occur. @Leo I would like to see the codebook. Were folks told to record actual sentence length or statements by reporters saying things like the person is “facing this amount of time”. How were the sentences recorded in the data?

The current draft of the article says that sentencing information was available for 2,489 of 3,266 cases with a recorded disposition.

Yet here there are only 2,081 observations for sentence years…. where did the other 400 observations come from?

#General Summary of All Variables

summary(DIH_DATA)

##      ...1               URL               Court             Rural_Code   
##  Length:3264        Length:3264        Length:3264        Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.000  
##                                                           Mean   :2.474  
##                                                           3rd Qu.:3.000  
##                                                           Max.   :9.000  
##                                                           NA's   :112    
##  County_Fips        County_FullName    County_Name           State          
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Year_Charged   Date Charge         Date Incident        Accused_Name      
##  Min.   :1974   Min.   :1899-12-30   Min.   :1899-12-30   Length:3264       
##  1st Qu.:2015   1st Qu.:2014-02-26   1st Qu.:2013-10-21   Class :character  
##  Median :2017   Median :2017-01-14   Median :2016-08-01   Mode  :character  
##  Mean   :2016   Mean   :2007-11-19   Mean   :2009-09-03                     
##  3rd Qu.:2018   3rd Qu.:2018-07-08   3rd Qu.:2018-01-28                     
##  Max.   :2026   Max.   :2026-08-01   Max.   :2022-02-15                     
##                 NA's   :14           NA's   :11                             
##  Sentence_clean        Sentence         Sentence_Quartile    Charge         
##  Min.   :  0.08219   Length:3264        Min.   :1.000     Length:3264       
##  1st Qu.:  4.00000   Class :character   1st Qu.:2.000     Class :character  
##  Median :  8.00000   Mode  :character   Median :3.000     Mode  :character  
##  Mean   : 12.44014                      Mean   :2.499                       
##  3rd Qu.: 15.00000                      3rd Qu.:3.000                       
##  Max.   :180.00000                      Max.   :4.000                       
##  NA's   :1183                           NA's   :1041                        
##  Accused_Race       Accused_Ethnicity  Accused_Sex         Accused_Age   
##  Length:3264        Length:3264        Length:3264        Min.   :17.00  
##  Class :character   Class :character   Class :character   1st Qu.:27.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :32.00  
##                                                           Mean   :34.34  
##                                                           3rd Qu.:40.00  
##                                                           Max.   :87.00  
##                                                           NA's   :2224   
##  Accused_City       Accused_State       Court_Type            Plea          
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Deceased_Name      Deceased_Race      Deceased_Ethnicity  Deceased_Age  
##  Length:3264        Length:3264        Length:3264        Min.   : 0.00  
##  Class :character   Class :character   Class :character   1st Qu.:23.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :28.00  
##                                                           Mean   :29.63  
##                                                           3rd Qu.:35.00  
##                                                           Max.   :91.00  
##                                                           NA's   :547    
##  Deceased_Sex        Substance            Lawyer          Relationship      
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##

Missingness

colSums(is.na(DIH_DATA))

##               ...1                URL              Court         Rural_Code 
##                  0                103                999                112 
##        County_Fips    County_FullName        County_Name              State 
##                 66                  5                  6                  0 
##       Year_Charged        Date Charge      Date Incident       Accused_Name 
##                  0                 14                 11                  7 
##     Sentence_clean           Sentence  Sentence_Quartile             Charge 
##               1183                906               1041                  1 
##       Accused_Race  Accused_Ethnicity        Accused_Sex        Accused_Age 
##                461               2286                 17               2224 
##       Accused_City      Accused_State         Court_Type               Plea 
##                181                 65                 72                907 
##      Deceased_Name      Deceased_Race Deceased_Ethnicity       Deceased_Age 
##                522               1103               2198                547 
##       Deceased_Sex          Substance             Lawyer       Relationship 
##                220                617               2459                276

#Creating Freq table function to reuse later
freq_table <- function(data, var) {
  data %>%
    count({{ var }}) %>%
    mutate(percent = n / sum(n) * 100) %>%
    arrange(desc(n))
}

#Creating Bar chart function for categorical variables to reuse later
library(ggplot2)
library(dplyr)

bar_plot_cat <- function(data, var) {
  ggplot(data, aes(x = {{ var }}, fill = {{ var }})) +
    geom_bar() +
    geom_text(
      stat = "count",
      aes(label = ..count..),
      vjust = -0.3
    ) +
    labs(
      x = deparse(substitute(var)),
      y = "Count",
      title = paste("Distribution of", deparse(substitute(var)))
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    guides(fill = "none")
}

#Creating histogram function
hist_plot <- function(data, var) {
  ggplot(data, aes(x = {{ var }})) +
    geom_histogram(bins = 30, fill = "steelblue") +
    scale_x_continuous(breaks = seq(0, 200, by = 10)) +  # 👈 more numbers
    labs(
      x = deparse(substitute(var)),
      y = "Count",
      title = paste("Distribution of", deparse(substitute(var)))
    )
}

#creating density plot function
density_plot <- function(data, var) {
  ggplot(data, aes(x = {{ var }})) +
    geom_density(fill = "steelblue", alpha = 0.5) +
    labs(
      x = deparse(substitute(var)),
      y = "Density",
      title = paste("Density of", deparse(substitute(var)))
    )
}

#🎯 Outcome variable: Sentence_clean Mean : 12.44
Median : 8
Max : 180
NA’s : 1183

Interpretation: • Right-skewed distribution (mean > median, max = 180) • Likely long tail (e.g., very long sentences or life approximations) • ⚠️ ~36% missing (1183 / 3264)

Usable sample =

sum(!is.na(DIH_DATA$Sentence_clean))

## [1] 2081

hist_plot(DIH_DATA, Sentence_clean)

## Warning: Removed 1183 rows containing non-finite outside the scale range
## (`stat_bin()`).

density_plot(DIH_DATA, Sentence_clean)

## Warning: Removed 1183 rows containing non-finite outside the scale range
## (`stat_density()`).

Will need to explore missingness to see if it is random.

#📅 Dates Range: 1899 → 2022

1899 will need to be removed from the dataset. As will 1905

DIH_DATA <- DIH_DATA %>%
  filter(`Date Incident` >= as.Date("1910-01-01") | is.na(`Date Incident`))

sort(unique(DIH_DATA$`Date Incident`))[1:20]

##  [1] "1974-09-23" "1982-03-05" "1987-10-18" "1989-10-04" "1991-12-03"
##  [6] "1992-06-01" "1994-02-04" "1994-09-03" "1995-06-30" "1995-08-09"
## [11] "1999-03-01" "1999-04-28" "1999-09-24" "1999-11-24" "2000-01-10"
## [16] "2000-09-16" "2001-02-03" "2001-02-08" "2001-04-01" "2001-04-10"

library(dplyr)
library(ggplot2)

DIH_DATA %>%
  filter(!is.na(`Date Incident`)) %>%
  count(`Date Incident`) %>%
  ggplot(aes(x = `Date Incident`, y = n)) +
  geom_line() +
  labs(
    title = "Number of Cases Over Time",
    x = "Date",
    y = "Count"
  )

#👤 Accused characteristics ## Accused Age Mean: 34
Median: 32
NA’s: 2224 Over 68% missing → major limitation

density_plot(DIH_DATA, Accused_Age)

## Warning: Removed 2095 rows containing non-finite outside the scale range
## (`stat_density()`).

## • Accused_Ethnicity

freq_table(DIH_DATA,Accused_Ethnicity)

## # A tibble: 7 × 3
##   Accused_Ethnicity       n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2150 69.5   
## 2 Not Hispanic/Latino   704 22.8   
## 3 Hispanic/Latino       143  4.62  
## 4 NA                     93  3.01  
## 5 Not HIspanic/Latino     1  0.0323
## 6 White                   1  0.0323
## 7 not Hispanic/Latino     1  0.0323

Cleaning typos in data…

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Ethnicity = str_to_title(Accused_Ethnicity),   # fix casing
    Accused_Ethnicity = str_replace_all(Accused_Ethnicity, "Hispanic", "Hispanic"), # normalize
    Accused_Ethnicity = na_if(Accused_Ethnicity, "Na"),     # fix "NA"
    
    Accused_Ethnicity = case_when(
      Accused_Ethnicity %in% c("Not Hispanic/Latino", "Not Hispanic/Latino") ~ "Not Hispanic/Latino",
      Accused_Ethnicity %in% c("Hispanic/Latino") ~ "Hispanic/Latino",
      Accused_Ethnicity == "White" ~ NA_character_,  # wrong variable → set to NA
      TRUE ~ Accused_Ethnicity
    )
  )

bar_plot_cat(DIH_DATA, Accused_Ethnicity)

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

• Accused_Sex

bar_plot_cat(DIH_DATA, Accused_Sex)

freq_table(DIH_DATA, Accused_Sex)

## # A tibble: 5 × 3
##   Accused_Sex     n percent
##   <chr>       <int>   <dbl>
## 1 Male         2257 73.0   
## 2 Female        818 26.4   
## 3 <NA>           13  0.420 
## 4 NA              3  0.0970
## 5 feMale          2  0.0647

Fixing typos in the data again…

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Sex = str_to_title(Accused_Sex),   # fixes feMale → Female
    Accused_Sex = na_if(Accused_Sex, "Na")     # fixes "NA" string → real NA
  )

• Accused_Race

freq_table(DIH_DATA, Accused_Race)

## # A tibble: 11 × 3
##    Accused_Race                                                        n percent
##    <chr>                                                           <int>   <dbl>
##  1 White                                                            1972 63.8   
##  2 Black                                                             601 19.4   
##  3 <NA>                                                              429 13.9   
##  4 NA                                                                 36  1.16  
##  5 Unknown POC                                                        25  0.808 
##  6 Asian                                                              14  0.453 
##  7 Native American or Alaskan Native                                  10  0.323 
##  8 Hispanic/Latino                                                     3  0.0970
##  9 Arabic                                                              1  0.0323
## 10 Manufacture, Delivery, or Possession With Intent to Manufactur…     1  0.0323
## 11 white                                                               1  0.0323

“white” → should be “White” • “NA” → should be real NA • “Arabic” → you already decided → “White” • that long drug charge string → clearly wrong variable → should be NA

•   "Unknown POC" → not a usable category → ❌
•   "Hispanic/Latino" → ethnicity, not race → ❌

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Race = na_if(Accused_Race, "NA"),
    Accused_Race = case_when(
      Accused_Race %in% c("white", "White") ~ "White",
      Accused_Race %in% c("black", "Black") ~ "Black",
      Accused_Race == "Arabic" ~ "White",
      Accused_Race %in% c("Unknown POC", "Hispanic/Latino", "X") ~ NA_character_,
      str_detect(Accused_Race, "Manufacture") ~ NA_character_,
      TRUE ~ Accused_Race
    )
  )

freq_table(DIH_DATA, Accused_Race)

## # A tibble: 5 × 3
##   Accused_Race                          n percent
##   <chr>                             <int>   <dbl>
## 1 White                              1974  63.8  
## 2 Black                               601  19.4  
## 3 <NA>                                494  16.0  
## 4 Asian                                14   0.453
## 5 Native American or Alaskan Native    10   0.323

bar_plot_cat(DIH_DATA, Accused_Race)

#👤 Deceased characteristics

Deceased Age

Range: 0-91 Mean: 29.6
Median: 28
NA’s: 547 Only about 17% missingness here – which makes sense as news articles will focus on the age of the deceased, something all readers will want to know… Was there really a 91 year old who was the deceased?

deceased under 18 =>

sum(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)

## [1] 255

percent of sample (after dropping those missing) 18 or under =>

mean(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)

## [1] 0.09673748

DIH_DATA %>%
  filter(Deceased_Age <= 18) %>%
  count(Deceased_Age, sort = TRUE)

## # A tibble: 40 × 2
##    Deceased_Age     n
##           <dbl> <int>
##  1           18    66
##  2           17    33
##  3           16    24
##  4            2    17
##  5            1    15
##  6           15    15
##  7            3    11
##  8           14     8
##  9            5     7
## 10            6     5
## # ℹ 30 more rows

percent of sample 2 or younger =>

mean(DIH_DATA$Deceased_Age <= 2, na.rm = TRUE)* 100

## [1] 2.579666

Quartiles for 18 and younger =>

quantile(
  DIH_DATA$Deceased_Age[DIH_DATA$Deceased_Age <= 18],
  probs = c(0.25, 0.5, 0.75),
  na.rm = TRUE
)

## 25% 50% 75% 
##   2  15  18

Interested in the breakdown of deceased by age under 18, creating groups and graphing. =>

library(dplyr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    age_group = case_when(
      Deceased_Age >= 0  & Deceased_Age <= 1  ~ "0–1",
      Deceased_Age >= 2  & Deceased_Age <= 3  ~ "2–3",
      Deceased_Age >= 4  & Deceased_Age <= 5  ~ "4–5",
      Deceased_Age >= 6  & Deceased_Age <= 7  ~ "6–7",
      Deceased_Age >= 8  & Deceased_Age <= 9  ~ "8–9",
      Deceased_Age >= 10 & Deceased_Age <= 11 ~ "10–11",
      Deceased_Age >= 12 & Deceased_Age <= 13 ~ "12–13",
      Deceased_Age >= 14 & Deceased_Age <= 15 ~ "14–15",
      Deceased_Age >= 16 & Deceased_Age <= 17 ~ "16–17",
      Deceased_Age == 18                      ~ "18",
      TRUE ~ NA_character_
    )
  )
  
#Ordered

DIH_DATA$age_group <- factor(
  DIH_DATA$age_group,
  levels = c("0–1","2–3","4–5","6–7","8–9","10–11","12–13","14–15","16–17","18")
)

#Plot
library(ggplot2)

ggplot(
  DIH_DATA %>% filter(Deceased_Age <= 18, !is.na(age_group)),
  aes(x = age_group, fill = age_group)
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Deceased Age (≤ 18)",
    x = "Age Group",
    y = "Count"
  ) +
  guides(fill = "none")   # removes legend if you don’t want it

freq_table(DIH_DATA, Deceased_Age)

## # A tibble: 95 × 3
##    Deceased_Age     n percent
##           <dbl> <int>   <dbl>
##  1           NA   457   14.8 
##  2           28   135    4.36
##  3           21   133    4.30
##  4           26   132    4.27
##  5           23   124    4.01
##  6           25   120    3.88
##  7           24   117    3.78
##  8           30   114    3.69
##  9           22   109    3.52
## 10           27   106    3.43
## # ℹ 85 more rows

freq_table(DIH_DATA, age_group)

## # A tibble: 11 × 3
##    age_group     n percent
##    <fct>     <int>   <dbl>
##  1 <NA>       2847 92.0   
##  2 18           66  2.13  
##  3 16–17        57  1.84  
##  4 0–1          42  1.36  
##  5 2–3          28  0.905 
##  6 14–15        23  0.744 
##  7 4–5           9  0.291 
##  8 10–11         7  0.226 
##  9 6–7           6  0.194 
## 10 12–13         5  0.162 
## 11 8–9           3  0.0970

• Deceased_Race

freq_table(DIH_DATA, Deceased_Race)

## # A tibble: 12 × 3
##    Deceased_Race                         n percent
##    <chr>                             <int>   <dbl>
##  1 White                              1920 62.1   
##  2 <NA>                                983 31.8   
##  3 Black                                86  2.78  
##  4 NA                                   64  2.07  
##  5 Unknown POC                          17  0.550 
##  6 Asian                                10  0.323 
##  7 Native American or Alaskan Native     6  0.194 
##  8 Hispanic/Latino                       3  0.0970
##  9 Arabic                                1  0.0323
## 10 Thomas McGuinness                     1  0.0323
## 11 Unknown                               1  0.0323
## 12 X                                     1  0.0323

Need to fix these… Arabic is changed to White. Thomas McGuinness changed to NA. Fix the NAs, X and Unknown

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Race = na_if(Deceased_Race, "NA"),
    Deceased_Race = case_when(
      Deceased_Race == "Arabic" ~ "White",
      Deceased_Race == "Thomas McGuinness" ~ NA_character_,
      Deceased_Race %in% c("Unknown", "X") ~ NA_character_,
      TRUE ~ Deceased_Race
    )
  )

freq_table(DIH_DATA, Deceased_Race)

## # A tibble: 7 × 3
##   Deceased_Race                         n percent
##   <chr>                             <int>   <dbl>
## 1 White                              1921 62.1   
## 2 <NA>                               1050 33.9   
## 3 Black                                86  2.78  
## 4 Unknown POC                          17  0.550 
## 5 Asian                                10  0.323 
## 6 Native American or Alaskan Native     6  0.194 
## 7 Hispanic/Latino                       3  0.0970

bar_plot_cat(DIH_DATA, Deceased_Race)

• Deceased_Ethnicity

freq_table(DIH_DATA, Deceased_Ethnicity)

## # A tibble: 8 × 3
##   Deceased_Ethnicity      n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2061 66.6   
## 2 Not Hispanic/Latino   816 26.4   
## 3 NA                    113  3.65  
## 4 Hispanic/Latino        97  3.14  
## 5 Hispanic Latino         2  0.0647
## 6 White                   2  0.0647
## 7 Not Hispanic/latino     1  0.0323
## 8 X                       1  0.0323

Again, fixing the data…

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Ethnicity = na_if(Deceased_Ethnicity, "NA"),
    Deceased_Ethnicity = case_when(
      Deceased_Ethnicity == "Hispanic Latino" ~ "Hispanic/Latino",
      Deceased_Ethnicity == "Not Hispanic/latino" ~ "Not Hispanic/Latino",
      Deceased_Ethnicity == "White" ~ NA_character_,
      Deceased_Ethnicity == "X" ~ NA_character_,
      TRUE ~ Deceased_Ethnicity
    )
  )

freq_table(DIH_DATA, Deceased_Ethnicity)

## # A tibble: 3 × 3
##   Deceased_Ethnicity      n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2177   70.4 
## 2 Not Hispanic/Latino   817   26.4 
## 3 Hispanic/Latino        99    3.20

bar_plot_cat(DIH_DATA, Deceased_Ethnicity)

Large missingness

• Deceased_Sex

freq_table(DIH_DATA, Deceased_Sex)

## # A tibble: 13 × 3
##    Deceased_Sex     n percent
##    <chr>        <int>   <dbl>
##  1 Male          1849 59.8   
##  2 Female        1050 33.9   
##  3 <NA>           156  5.04  
##  4 NA              23  0.744 
##  5 male             5  0.162 
##  6 Famale           2  0.0647
##  7 female           2  0.0647
##  8 27               1  0.0323
##  9 Both Female      1  0.0323
## 10 Both male        1  0.0323
## 11 Female, Male     1  0.0323
## 12 Other            1  0.0323
## 13 X                1  0.0323

changing both to be NA

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Sex = case_when(
      Deceased_Sex %in% c("NA","27","X","Other",
                         "Both Female","Both male","Female, Male", "Both") ~ NA_character_,
      Deceased_Sex %in% c("male","Male") ~ "Male",
      Deceased_Sex %in% c("female","Female","Famale") ~ "Female",
      TRUE ~ Deceased_Sex
    )
  )

freq_table(DIH_DATA, Deceased_Sex)

## # A tibble: 3 × 3
##   Deceased_Sex     n percent
##   <chr>        <int>   <dbl>
## 1 Male          1854   59.9 
## 2 Female        1054   34.1 
## 3 <NA>           185    5.98

#🌍 Rural Code NA’s: 112

ggplot(
  DIH_DATA %>% filter(!is.na(Rural_Code)),
  aes(x = factor(Rural_Code), fill = factor(Rural_Code))
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Rural_Code",
    x = "Rural Code",
    y = "Count"
  ) +
  guides(fill = "none")

Creating urbanacity variable

DIH_DATA <- DIH_DATA %>%
  mutate(
    Urbanicity = case_when(
      Rural_Code %in% 1:3 ~ "Metropolitan",
      Rural_Code %in% 4:6 ~ "Suburban",
      Rural_Code %in% 7:9 ~ "Rural",
      TRUE ~ NA_character_
    )
  )

#Ordering for plotting
DIH_DATA$Urbanicity <- factor(
  DIH_DATA$Urbanicity,
  levels = c("Metropolitan", "Suburban", "Rural")
)

ggplot(
  DIH_DATA %>% filter(!is.na(Urbanicity)),
  aes(x = Urbanicity, fill = Urbanicity)
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Urbanicity",
    x = "Urbanicity",
    y = "Count"
  ) +
  guides(fill = "none")

Would be interesting to see state by urbancity…

ggplot(
  DIH_DATA %>% filter(!is.na(State), !is.na(Urbanicity)),
  aes(x = State, fill = Urbanicity)
) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Urbanicity by State",
    x = "State",
    y = "Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

I guess this could just be a function of how rural the state is…but we could test further…

#📍State

There is a lowercase pennsylvania in the dataset…fixing it…

DIH_DATA %>%
  filter(State == "pennsylvania")

## # A tibble: 1 × 34
##   ...1  URL       Court Rural_Code County_Fips County_FullName County_Name State
##   <chr> <chr>     <chr>      <dbl> <chr>       <chr>           <chr>       <chr>
## 1 540   3 senten… <NA>           2 42071       Lancaster Coun… Lancaster   penn…
## # ℹ 26 more variables: Year_Charged <dbl>, `Date Charge` <date>,
## #   `Date Incident` <date>, Accused_Name <chr>, Sentence_clean <dbl>,
## #   Sentence <chr>, Sentence_Quartile <dbl>, Charge <chr>, Accused_Race <chr>,
## #   Accused_Ethnicity <chr>, Accused_Sex <chr>, Accused_Age <dbl>,
## #   Accused_City <chr>, Accused_State <chr>, Court_Type <chr>, Plea <chr>,
## #   Deceased_Name <chr>, Deceased_Race <chr>, Deceased_Ethnicity <chr>,
## #   Deceased_Age <dbl>, Deceased_Sex <chr>, Substance <chr>, Lawyer <chr>, …

DIH_DATA <- DIH_DATA %>%
  mutate(
    State = ifelse(State == "pennsylvania", "Pennsylvania", State)
  )

bar_plot_cat(DIH_DATA, State)

freq_table(DIH_DATA, State)

## # A tibble: 50 × 3
##    State              n percent
##    <chr>          <int>   <dbl>
##  1 Pennsylvania     560   18.1 
##  2 Wisconsin        338   10.9 
##  3 Ohio             332   10.7 
##  4 Illinois         288    9.31
##  5 Florida          113    3.65
##  6 Minnesota        113    3.65
##  7 New Jersey       113    3.65
##  8 North Carolina   110    3.56
##  9 New York         103    3.33
## 10 Michigan          94    3.04
## # ℹ 40 more rows

#⚖️ Sentence Quartile Mean: 2.5
Range: 1–4
NA’s: 1041

sum(!is.na(DIH_DATA$Sentence_Quartile))

## [1] 2118

Somehow there are more observations that have Quartiles than actual Sentences… @Katie & @Leo … any idea what happened here? Coding errors? I have created a table below with the observations thave have a response for Sentence Quartile but not Sentences…

problem_cases <- DIH_DATA %>%
  filter(!is.na(Sentence_Quartile) & is.na(Sentence))

library(dplyr)
library(DT)

problem_cases <- DIH_DATA %>%
  filter(!is.na(Sentence_Quartile) & is.na(Sentence))

datatable(
  problem_cases,
  options = list(
    pageLength = 25,
    scrollX = TRUE
  ),
  filter = "top",
  rownames = FALSE
)

I am not going to use this variable until we figure out what is going on here…

#⚖️ Legal / case variables ## • Charge

freq_table(DIH_DATA,Charge)

## # A tibble: 31 × 3
##    Charge                                                              n percent
##    <chr>                                                           <int>   <dbl>
##  1 Drug Delivery/Distribution Resulting in Death                     580   18.8 
##  2 Involuntary Manslaughter                                          459   14.8 
##  3 Reckless/Negligent Homicide                                       414   13.4 
##  4 Murder (2nd or 3rd Degree)                                        334   10.8 
##  5 Drug Induced Homicide                                             295    9.54
##  6 Manslaughter                                                      238    7.69
##  7 Delivery/Distribution of Heroin/Fentanyl Resulting in Death       143    4.62
##  8 Delivery/Distribution Of a Controlled Substance Causing/Result…   111    3.59
##  9 Murder (1st Degree)                                                95    3.07
## 10 Other (Please Specify in Notes)                                    87    2.81
## # ℹ 21 more rows

@Leo, can you plese let me know how you would like these categorized? I do not have the previous code that you used so I do not know what previous decisions were made…

• Plea

freq_table(DIH_DATA,Plea)

## # A tibble: 32 × 3
##    Plea                              n percent
##    <chr>                         <int>   <dbl>
##  1 Guilty                         1679  54.3  
##  2 <NA>                            843  27.3  
##  3 Not Guilty                      196   6.34 
##  4 NA                              181   5.85 
##  5 Guilty of a Lesser Charge        43   1.39 
##  6 No Contest                       40   1.29 
##  7 Guilty to a Lesser Charge        30   0.970
##  8 Awaiting Plea Hearing            28   0.905
##  9 Pleaded Guilty                   13   0.420
## 10 Guilty to a Lesser DIH Charge    12   0.388
## # ℹ 22 more rows

Fixed the data as follows…

DIH_DATA <- DIH_DATA %>%
  mutate(
    Plea = na_if(Plea, "NA"),
    Plea = case_when(
      str_detect(tolower(Plea), "guilty") & !str_detect(tolower(Plea), "lesser") ~ "Guilty",
      str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
      str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
      str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
      str_detect(tolower(Plea), "pending") ~ "Pending/Awaiting",
      Plea %in% c("State", "X", "Other") ~ NA_character_,
      TRUE ~ Plea
    )
  )

DIH_DATA <- DIH_DATA %>%
  mutate(
    Plea = case_when(
      Plea == "Judge ruled it was not murder" ~ NA_character_,
      Plea == "Pleads not guily" ~ "Guilty",
      Plea == "Pled guity to DIH charges" ~ "Guilty",
      TRUE ~ Plea
    )
  )

freq_table(DIH_DATA, Plea)

## # A tibble: 6 × 3
##   Plea                        n percent
##   <chr>                   <int>   <dbl>
## 1 Guilty                   1907 61.7   
## 2 <NA>                     1026 33.2   
## 3 Guilty to Lesser Charge    85  2.75  
## 4 No Contest                 46  1.49  
## 5 Awaiting Plea Hearing      28  0.905 
## 6 Alford Plea                 1  0.0323

bar_plot_cat(DIH_DATA, Plea)

Do we have data on convictions? Versus just pleas

• Court_Type

freq_table(DIH_DATA,Court_Type)

## # A tibble: 6 × 3
##   Court_Type         n percent
##   <chr>          <int>   <dbl>
## 1 State           2678 86.6   
## 2 Federal          332 10.7   
## 3 <NA>              69  2.23  
## 4 NA                11  0.356 
## 5 federal            2  0.0647
## 6 Fayette County     1  0.0323

DIH_DATA <- DIH_DATA %>%
  mutate(
    Court_Type = na_if(Court_Type, "NA"),
    Court_Type = case_when(
      Court_Type == "federal" ~ "Federal",
      Court_Type == "Fayette County" ~ NA_character_,
      TRUE ~ Court_Type
    )
  )

freq_table(DIH_DATA, Court_Type)

## # A tibble: 3 × 3
##   Court_Type     n percent
##   <chr>      <int>   <dbl>
## 1 State       2678   86.6 
## 2 Federal      334   10.8 
## 3 <NA>          81    2.62

🔗 Case context

• Relationship

freq_table(DIH_DATA, Relationship)

## # A tibble: 6 × 3
##   Relationship                                n percent
##   <chr>                                   <int>   <dbl>
## 1 Dealer/Buyer                             2011  65.0  
## 2 Caretaker/Family/Friend/Partner/Co-User   744  24.1  
## 3 <NA>                                      248   8.02 
## 4 Doctor/Patient                             57   1.84 
## 5 NA                                         25   0.808
## 6 Dealer/buyer                                8   0.259

Fixing typos.

DIH_DATA <- DIH_DATA %>%
  mutate(
    Relationship = na_if(Relationship, "NA"),
    Relationship = case_when(
      Relationship %in% c("Dealer/buyer", "Dealer/Buyer") ~ "Dealer/Buyer",
      TRUE ~ Relationship
    )
  )

freq_table(DIH_DATA, Relationship)

## # A tibble: 4 × 3
##   Relationship                                n percent
##   <chr>                                   <int>   <dbl>
## 1 Dealer/Buyer                             2019   65.3 
## 2 Caretaker/Family/Friend/Partner/Co-User   744   24.1 
## 3 <NA>                                      273    8.83
## 4 Doctor/Patient                             57    1.84

• Substance

freq_table(DIH_DATA, Substance)

## # A tibble: 39 × 3
##    Substance                                    n percent
##    <chr>                                    <int>   <dbl>
##  1 Heroin                                     907   29.3 
##  2 <NA>                                       563   18.2 
##  3 Fentanyl (Analog)                          444   14.4 
##  4 Fentanyl (Analog) and Heroin               397   12.8 
##  5 Prescription Opioid                        121    3.91
##  6 Mixed Drugs Not Listed (Comment in Cell)   104    3.36
##  7 Methadone                                   75    2.42
##  8 Methamphetamine                             64    2.07
##  9 Other Drug (Comment in Cell)                62    2.00
## 10 Fentanyl Analog                             47    1.52
## # ℹ 29 more rows

Cleaning data as follows…

library(stringr)
library(dplyr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Substance = tolower(Substance),
    Substance = na_if(Substance, "na"),
    
    Substance_clean = case_when(
      str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
      str_detect(Substance, "fentanyl") ~ "Fentanyl",
      str_detect(Substance, "heroin") ~ "Heroin",
      str_detect(Substance, "meth") ~ "Methamphetamine",
      str_detect(Substance, "cocaine") ~ "Cocaine",
      str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
      str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
      str_detect(Substance, "xanax|benzo") ~ "Other",
      str_detect(Substance, "ecstasy") ~ "Other",
      str_detect(Substance, "benadryl") ~ "Other",
      str_detect(Substance, "bullskin") ~ NA_character_,
      TRUE ~ "Other"
    )
  )

freq_table(DIH_DATA, Substance_clean)

## # A tibble: 9 × 3
##   Substance_clean       n percent
##   <chr>             <int>   <dbl>
## 1 Heroin              908 29.4   
## 2 Other               647 20.9   
## 3 Fentanyl            645 20.9   
## 4 Fentanyl + Heroin   417 13.5   
## 5 Methamphetamine     152  4.91  
## 6 Other Opioid        145  4.69  
## 7 Polysubstance       104  3.36  
## 8 Cocaine              74  2.39  
## 9 <NA>                  1  0.0323

bar_plot_cat(DIH_DATA, Substance_clean)

DIH Media

Taleed El-Sabawi

2026-04-13