library(googlesheets4)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(scales)
library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(DT)
library(ggplot2)

#Data isnt full cleaned so loading everything as text and then will specify column type
library(googlesheets4)
library(dplyr)
DIH_DATA <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1OS4-QPBoffGKsxfqPzW3LyLRoxga2j3wLYPPxXtnUHw/edit?usp=sharing",
  col_types = "c"
)

## ! Using an auto-discovered, cached token.

##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.

##   See gargle's "Non-interactive auth" vignette for more details:

##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>

## ℹ The googlesheets4 package is using a cached token for
##   'taleedsabawi@gmail.com'.

## ✔ Reading from "DIH Data 2026".

## ✔ Range 'Data'.

#Assigning columns manually to data type
DIH_DATA$Year_Charged  <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code    <- as.numeric(DIH_DATA$Rural_Code)

# Dates
#These first dates are in excel serial data structure...
DIH_DATA$`Date Incident` <- as.Date(as.numeric(DIH_DATA$`Date Incident`), origin = "1899-12-30")

## Warning in as.Date(as.numeric(DIH_DATA$`Date Incident`), origin =
## "1899-12-30"): NAs introduced by coercion

DIH_DATA$`Date Charge` <- as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30")

## Warning in as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30"):
## NAs introduced by coercion

# Ages
DIH_DATA$Accused_Age  <- as.numeric(DIH_DATA$Accused_Age)

## Warning: NAs introduced by coercion

DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)

## Warning: NAs introduced by coercion

#checking data structure
str(DIH_DATA)

## tibble [3,264 × 31] (S3: tbl_df/tbl/data.frame)
##  $ ID                : chr [1:3264] "1678" "2622" "1514" "320" ...
##  $ URL               : chr [1:3264] "http://www.nydailynews.com/news/crime/pain-relieving-patch-kills-harlem-girl-6-foster-mother-charged-article-1.328165" "http://whdh.com/news/woman-boyfriend-charged-in-fatal-cocaine-overdose-of-ohio-boy-9/" "http://www.capitalgazette.com/news/for_the_record/ac-cn-delvalle-overdose-verdict-20180620-story.html" "http://www.thisweeknews.com/news/20180719/fourth-person-expected-to-be-charged-in-connection-with-grandview-ove"| __truncated__ ...
##  $ Court             : chr [1:3264] NA NA "Baltimore County Court" NA ...
##  $ Rural_Code        : num [1:3264] 1 2 1 1 1 1 1 5 7 2 ...
##  $ County_Fips       : chr [1:3264] "36061" "39099" "24005" "39049" ...
##  $ County_FullName   : chr [1:3264] "New York County" "Mahoning County" "Baltimore County" "Franklin County" ...
##  $ County_Name       : chr [1:3264] "New York" "Mahoning" "Baltimore" "Franklin" ...
##  $ State             : chr [1:3264] "New York" "Ohio" "Maryland" "Ohio" ...
##  $ Year_Charged      : num [1:3264] 2004 2018 2018 2018 2011 ...
##  $ Date Charge       : Date[1:3264], format: NA NA ...
##  $ Date Incident     : Date[1:3264], format: NA NA ...
##  $ Accused_Name      : chr [1:3264] "Joanne Alvarez" "Raenell Allen" "Jason Patton Baker" "Benjamin Bussey" ...
##  $ Sentence          : chr [1:3264] NA NA NA NA ...
##  $ Sentence Quartile : chr [1:3264] NA NA NA NA ...
##  $ Charge            : chr [1:3264] "Reckless/Negligent Homicide" "Involuntary Manslaughter" "Manslaughter" "Involuntary Manslaughter" ...
##  $ Accused_Race      : chr [1:3264] NA "White" "White" "White" ...
##  $ Accused_Ethnicity : chr [1:3264] "Hispanic/Latino" NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
##  $ Accused_Sex       : chr [1:3264] "Female" "Female" "Male" "Male" ...
##  $ Accused_Age       : num [1:3264] NA NA 46 19 NA NA NA NA NA NA ...
##  $ Accused_City      : chr [1:3264] "East Harlem" "Youngstown" "Millersville" "Dublin" ...
##  $ Accused_State     : chr [1:3264] "New York" "Ohio" "Pennsylvania" "Ohio" ...
##  $ Court_Type        : chr [1:3264] "State" "State" "State" "State" ...
##  $ Plea              : chr [1:3264] "Guilty" NA "Not Guilty" "Guilty to a Lesser Charge" ...
##  $ Deceased_Name     : chr [1:3264] "Taylor Webster" NA "Josiah Christopher Klaes" "Haleah Myers" ...
##  $ Deceased_Race     : chr [1:3264] "White" NA "White" "White" ...
##  $ Deceased_Ethnicity: chr [1:3264] NA NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
##  $ Deceased_Age      : num [1:3264] 6 9 16 17 17 18 18 18 18 18 ...
##  $ Deceased_Sex      : chr [1:3264] "Female" "Male" "Male" "Female" ...
##  $ Substance         : chr [1:3264] "Fentanyl (Analog)" NA "Fentanyl (Analog)" "Fentanyl (Analog)" ...
##  $ Lawyer            : chr [1:3264] NA NA "do'connell@opd.state.md.us" NA ...
##  $ Relationship      : chr [1:3264] "Caretaker/Family/Friend/Partner/Co-User" "Caretaker/Family/Friend/Partner/Co-User" "Dealer/Buyer" "Caretaker/Family/Friend/Partner/Co-User" ...

#Looking to see what Sentence contains
sort(unique(DIH_DATA$Sentence))

##   [1] "0.08219178082" "0.0833"        "0.08333333333" "0.1052511416" 
##   [5] "0.1616438356"  "0.1666666667"  "0.2191780822"  "0.25"         
##   [9] "0.325"         "0.333"         "0.3333333333"  "0.375"        
##  [13] "0.4"           "0.405"         "0.4166666667"  "0.5"          
##  [17] "0.5726027397"  "0.5833333333"  "0.6666666667"  "0.75"         
##  [21] "0.822"         "0.8333333333"  "0.9166666667"  "0.9583333333" 
##  [25] "1"             "1.002"         "1.25"          "1.3"          
##  [29] "1.333333333"   "1.416666667"   "1.5"           "1.583"        
##  [33] "1.66"          "1.666666667"   "1.7"           "1.75"         
##  [37] "1.8"           "1.916666667"   "1.917"         "1.92"         
##  [41] "1.97"          "10"            "10.16"         "10.41"        
##  [45] "10.5"          "10.67"         "10.8"          "10.83"        
##  [49] "10.83333333"   "11"            "11.1"          "11.16666667"  
##  [53] "11.25"         "11.5"          "11.83"         "11.9"         
##  [57] "113"           "12"            "12.25"         "12.5"         
##  [61] "12.75"         "124"           "13"            "13.33"        
##  [65] "13.5"          "14"            "14.5"          "143"          
##  [69] "15"            "15.5"          "15.67"         "16"           
##  [73] "16.5"          "17"            "17.5"          "18"           
##  [77] "18.25"         "18.66666667"   "180"           "19"           
##  [81] "19.25"         "19.58333333"   "2"             "2.25"         
##  [85] "2.333333333"   "2.4"           "2.5"           "2.75"         
##  [89] "20"            "20.5"          "20.83333333"   "21"           
##  [93] "22"            "22.5"          "23"            "24"           
##  [97] "25"            "25.5"          "26"            "26.83"        
## [101] "27"            "27.08333333"   "28"            "29"           
## [105] "3"             "3.08"          "3.083333333"   "3.25"         
## [109] "3.3"           "3.33"          "3.333333333"   "3.4"          
## [113] "3.416666667"   "3.5"           "3.666666667"   "3.83"         
## [117] "3.833333333"   "3.92"          "30"            "30.41666667"  
## [121] "31"            "32"            "33"            "33.25"        
## [125] "34"            "35"            "36"            "37"           
## [129] "38"            "4"             "4.083333333"   "4.17"         
## [133] "4.25"          "4.416666667"   "4.5"           "4.666666667"  
## [137] "4.75"          "4.8"           "4.83"          "4.833"        
## [141] "4.833333333"   "4.9"           "40"            "42"           
## [145] "46"            "47"            "48"            "48.33"        
## [149] "5"             "5.06"          "5.25"          "5.5"          
## [153] "5.666666667"   "5.75"          "5.83"          "5.916666667"  
## [157] "5.92"          "50"            "52"            "54"           
## [161] "55"            "6"             "6.083333333"   "6.166666667"  
## [165] "6.25"          "6.5"           "6.67"          "6.7"          
## [169] "6.75"          "60"            "62"            "63"           
## [173] "65"            "7"             "7.25"          "7.33"         
## [177] "7.5"           "7.666666667"   "72"            "75"           
## [181] "8"             "8.08"          "8.33"          "8.333"        
## [185] "8.333333333"   "8.5"           "8.67"          "84"           
## [189] "9"             "9.33"          "9.5"           "9.917"        
## [193] "90"            "96"            "Life"          "pending"      
## [197] "Pending"

I have manually gone into the data and created a new variable called “Sentence_num” – which makes Sentence a continuous numeric variable. Just for visualization sake…

For now I will drop anything that says “pending” or “Pending” from the dataset and replace them with NA

I changed all “life” sentences to 100 years, just for th

DIH_DATA <- DIH_DATA %>%
  mutate(
    Sentence_num = na_if(Sentence, "Pending"),
    Sentence_num = na_if(Sentence, "pending"),
    Sentence_num = as.numeric(Sentence)
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sentence_num = as.numeric(Sentence)`.
## Caused by warning:
## ! NAs introduced by coercion

#make sentence quart var
DIH_DATA <- DIH_DATA %>%
  mutate(
    Sentence_Quartile_calc = ntile(Sentence_num, 4)
  )

library(dplyr)
library(stringr)
summary(DIH_DATA$Sentence_num)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##   0.08219   4.00000   8.00000  11.54717  15.00000 180.00000      1209

sum(!is.na(DIH_DATA$Sentence))

## [1] 2348

sum(!is.na(DIH_DATA$Sentence_num))

## [1] 2055

All observations 3,364 Observations with sentencing data 2,055

#General Summary of All Variables

summary(DIH_DATA)

##       ID                URL               Court             Rural_Code   
##  Length:3264        Length:3264        Length:3264        Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.000  
##                                                           Mean   :2.474  
##                                                           3rd Qu.:3.000  
##                                                           Max.   :9.000  
##                                                           NA's   :112    
##  County_Fips        County_FullName    County_Name           State          
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Year_Charged   Date Charge   Date Incident  Accused_Name      
##  Min.   :1974   Min.   :NA     Min.   :NA     Length:3264       
##  1st Qu.:2015   1st Qu.:NA     1st Qu.:NA     Class :character  
##  Median :2017   Median :NA     Median :NA     Mode  :character  
##  Mean   :2016   Mean   :NaN    Mean   :NaN                      
##  3rd Qu.:2018   3rd Qu.:NA     3rd Qu.:NA                       
##  Max.   :2026   Max.   :NA     Max.   :NA                       
##                 NA's   :3264   NA's   :3264                     
##    Sentence         Sentence Quartile     Charge          Accused_Race      
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Accused_Ethnicity  Accused_Sex         Accused_Age    Accused_City      
##  Length:3264        Length:3264        Min.   :17.00   Length:3264       
##  Class :character   Class :character   1st Qu.:27.00   Class :character  
##  Mode  :character   Mode  :character   Median :32.00   Mode  :character  
##                                        Mean   :34.34                     
##                                        3rd Qu.:40.00                     
##                                        Max.   :87.00                     
##                                        NA's   :2224                      
##  Accused_State       Court_Type            Plea           Deceased_Name     
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Deceased_Race      Deceased_Ethnicity  Deceased_Age   Deceased_Sex      
##  Length:3264        Length:3264        Min.   : 0.00   Length:3264       
##  Class :character   Class :character   1st Qu.:22.25   Class :character  
##  Mode  :character   Mode  :character   Median :28.00   Mode  :character  
##                                        Mean   :29.62                     
##                                        3rd Qu.:35.00                     
##                                        Max.   :91.00                     
##                                        NA's   :546                       
##   Substance            Lawyer          Relationship        Sentence_num      
##  Length:3264        Length:3264        Length:3264        Min.   :  0.08219  
##  Class :character   Class :character   Class :character   1st Qu.:  4.00000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  8.00000  
##                                                           Mean   : 11.54717  
##                                                           3rd Qu.: 15.00000  
##                                                           Max.   :180.00000  
##                                                           NA's   :1209       
##  Sentence_Quartile_calc
##  Min.   :1.000         
##  1st Qu.:1.500         
##  Median :2.000         
##  Mean   :2.499         
##  3rd Qu.:3.000         
##  Max.   :4.000         
##  NA's   :1209

Missingness

colSums(is.na(DIH_DATA))

##                     ID                    URL                  Court 
##                      0                    103                    999 
##             Rural_Code            County_Fips        County_FullName 
##                    112                      0                      5 
##            County_Name                  State           Year_Charged 
##                      2                      0                      0 
##            Date Charge          Date Incident           Accused_Name 
##                   3264                   3264                      7 
##               Sentence      Sentence Quartile                 Charge 
##                    916                   3264                      1 
##           Accused_Race      Accused_Ethnicity            Accused_Sex 
##                    461                   2286                     17 
##            Accused_Age           Accused_City          Accused_State 
##                   2224                    181                     65 
##             Court_Type                   Plea          Deceased_Name 
##                     72                    907                    522 
##          Deceased_Race     Deceased_Ethnicity           Deceased_Age 
##                   1103                   2198                    546 
##           Deceased_Sex              Substance                 Lawyer 
##                    220                    617                   2459 
##           Relationship           Sentence_num Sentence_Quartile_calc 
##                    276                   1209                   1209

#Creating Freq table function to reuse later
freq_table <- function(data, var) {
  data %>%
    count({{ var }}) %>%
    mutate(percent = n / sum(n) * 100) %>%
    arrange(desc(n))
}

#Creating Bar chart function for categorical variables to reuse later
library(ggplot2)
library(dplyr)

bar_plot_cat <- function(data, var) {
  ggplot(data, aes(x = {{ var }}, fill = {{ var }})) +
    geom_bar() +
    geom_text(
      stat = "count",
      aes(label = ..count..),
      vjust = -0.3
    ) +
    labs(
      x = deparse(substitute(var)),
      y = "Count",
      title = paste("Distribution of", deparse(substitute(var)))
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    guides(fill = "none")
}

#Creating histogram function
hist_plot <- function(data, var) {
  ggplot(data, aes(x = {{ var }})) +
    geom_histogram(bins = 30, fill = "steelblue") +
    scale_x_continuous(breaks = seq(0, 200, by = 10)) +  # 👈 more numbers
    labs(
      x = deparse(substitute(var)),
      y = "Count",
      title = paste("Distribution of", deparse(substitute(var)))
    )
}

#creating density plot function
density_plot <- function(data, var) {
  ggplot(data, aes(x = {{ var }})) +
    geom_density(fill = "steelblue", alpha = 0.5) +
    labs(
      x = deparse(substitute(var)),
      y = "Density",
      title = paste("Density of", deparse(substitute(var)))
    )
}

hist_plot(DIH_DATA, Sentence_num)

## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_bin()`).

Will need to explore missingness to see if it is random.

#📅Date Charged

freq_table (DIH_DATA, Year_Charged)

## # A tibble: 34 × 3
##    Year_Charged     n percent
##           <dbl> <int>   <dbl>
##  1         2017   560   17.2 
##  2         2018   532   16.3 
##  3         2016   384   11.8 
##  4         2019   326    9.99
##  5         2015   266    8.15
##  6         2020   263    8.06
##  7         2014   213    6.53
##  8         2013   120    3.68
##  9         2021    99    3.03
## 10         2012    88    2.70
## # ℹ 24 more rows

library(dplyr)
library(ggplot2)

DIH_DATA %>%
  filter(!is.na(Year_Charged)) %>%
  mutate(Year_Charged = as.numeric(Year_Charged)) %>%
  count(Year_Charged) %>%
  ggplot(aes(x = Year_Charged, y = n)) +
  geom_line() +
  labs(
    title = "Number of Cases Over Time",
    x = "Year",
    y = "Count"
  )

#👤Accused characteristics ## Accused Age

density_plot(DIH_DATA, Accused_Age)

## Warning: Removed 2224 rows containing non-finite outside the scale range
## (`stat_density()`).

## • Accused_Ethnicity

freq_table(DIH_DATA,Accused_Ethnicity)

## # A tibble: 7 × 3
##   Accused_Ethnicity       n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2286 70.0   
## 2 Not Hispanic/Latino   734 22.5   
## 3 Hispanic/Latino       147  4.50  
## 4 NA                     94  2.88  
## 5 Not HIspanic/Latino     1  0.0306
## 6 White                   1  0.0306
## 7 not Hispanic/Latino     1  0.0306

Cleaning typos in data…

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Ethnicity = str_to_title(Accused_Ethnicity),   # fix casing
    Accused_Ethnicity = str_replace_all(Accused_Ethnicity, "Hispanic", "Hispanic"), # normalize
    Accused_Ethnicity = na_if(Accused_Ethnicity, "Na"),     # fix "NA"
    
    Accused_Ethnicity = case_when(
      Accused_Ethnicity %in% c("Not Hispanic/Latino", "Not Hispanic/Latino") ~ "Not Hispanic/Latino",
      Accused_Ethnicity %in% c("Hispanic/Latino") ~ "Hispanic/Latino",
      Accused_Ethnicity == "White" ~ NA_character_,  # wrong variable → set to NA
      TRUE ~ Accused_Ethnicity
    )
  )

bar_plot_cat(DIH_DATA, Accused_Ethnicity)

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

• Accused_Sex

bar_plot_cat(DIH_DATA, Accused_Sex)

freq_table(DIH_DATA, Accused_Sex)

## # A tibble: 5 × 3
##   Accused_Sex     n percent
##   <chr>       <int>   <dbl>
## 1 Male         2388 73.2   
## 2 Female        853 26.1   
## 3 <NA>           17  0.521 
## 4 NA              3  0.0919
## 5 feMale          3  0.0919

Fixing typos in the data again…

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Sex = str_to_title(Accused_Sex),   # fixes feMale → Female
    Accused_Sex = na_if(Accused_Sex, "Na")     # fixes "NA" string → real NA
  )

• Accused_Race

freq_table(DIH_DATA, Accused_Race)

## # A tibble: 12 × 3
##    Accused_Race                                                        n percent
##    <chr>                                                           <int>   <dbl>
##  1 White                                                            2067 63.3   
##  2 Black                                                             641 19.6   
##  3 <NA>                                                              461 14.1   
##  4 NA                                                                 37  1.13  
##  5 Unknown POC                                                        25  0.766 
##  6 Asian                                                              16  0.490 
##  7 Native American or Alaskan Native                                  10  0.306 
##  8 Hispanic/Latino                                                     3  0.0919
##  9 Arabic                                                              1  0.0306
## 10 Indian American                                                     1  0.0306
## 11 Manufacture, Delivery, or Possession With Intent to Manufactur…     1  0.0306
## 12 white                                                               1  0.0306

“white” → should be “White” • “NA” → should be real NA • “Arabic” → you already decided → “White” • that long drug charge string → clearly wrong variable → should be NA

•   "Unknown POC" → not a usable category → ❌
•   "Hispanic/Latino" → ethnicity, not race → ❌

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Race = na_if(Accused_Race, "NA"),
    Accused_Race = case_when(
      Accused_Race %in% c("white", "White") ~ "White",
      Accused_Race %in% c("black", "Black") ~ "Black",
      Accused_Race == "Arabic" ~ "White",
      Accused_Race %in% c("Unknown POC", "Hispanic/Latino", "X") ~ NA_character_,
      str_detect(Accused_Race, "Manufacture") ~ NA_character_,
      TRUE ~ Accused_Race
    )
  )

freq_table(DIH_DATA, Accused_Race)

## # A tibble: 6 × 3
##   Accused_Race                          n percent
##   <chr>                             <int>   <dbl>
## 1 White                              2069 63.4   
## 2 Black                               641 19.6   
## 3 <NA>                                527 16.1   
## 4 Asian                                16  0.490 
## 5 Native American or Alaskan Native    10  0.306 
## 6 Indian American                       1  0.0306

bar_plot_cat(DIH_DATA, Accused_Race)

##Race/Ethnicity Variable

DIH_DATA <- DIH_DATA %>%
  mutate(
    Race_Ethnicity = case_when(
      Accused_Ethnicity == "Hispanic/Latino" ~ "Hispanic",
      Accused_Race == "White" ~ "White (Non-Hispanic)",
      Accused_Race == "Black" ~ "Black (Non-Hispanic)",
      Accused_Race == "Asian" ~ "Asian",
      Accused_Race == "Native American or Alaskan Native" ~ "Native American",
      TRUE ~ NA_character_
    )
  )

DIH_DATA <- DIH_DATA %>%
  mutate(
    Race_Ethnicity = case_when(
      Race_Ethnicity %in% c("Asian", "Native American") ~ "Other",
      TRUE ~ Race_Ethnicity
    )
  )

freq_table(DIH_DATA,  Race_Ethnicity)

## # A tibble: 5 × 3
##   Race_Ethnicity           n percent
##   <chr>                <int>   <dbl>
## 1 White (Non-Hispanic)  1993  61.1  
## 2 Black (Non-Hispanic)   629  19.3  
## 3 <NA>                   472  14.5  
## 4 Hispanic               147   4.50 
## 5 Other                   23   0.705

bar_plot_cat(DIH_DATA, Race_Ethnicity)

#👤Deceased characteristics

Deceased Age

Was there really a 91 year old who was the deceased?

deceased under 18 =>

sum(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)

## [1] 262

percent of sample (after dropping those missing) 18 or under =>

mean(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)

## [1] 0.09639441

DIH_DATA %>%
  filter(Deceased_Age <= 18) %>%
  count(Deceased_Age, sort = TRUE)

## # A tibble: 19 × 2
##    Deceased_Age     n
##           <dbl> <int>
##  1           18    68
##  2           17    37
##  3            1    29
##  4           16    24
##  5            2    21
##  6            0    19
##  7           15    15
##  8            3    11
##  9           14     8
## 10            5     7
## 11            6     5
## 12           10     5
## 13           13     3
## 14            4     2
## 15            9     2
## 16           11     2
## 17           12     2
## 18            7     1
## 19            8     1

percent of sample 2 or younger =>

mean(DIH_DATA$Deceased_Age <= 2, na.rm = TRUE)* 100

## [1] 2.538631

Quartiles for 18 and younger =>

quantile(
  DIH_DATA$Deceased_Age[DIH_DATA$Deceased_Age <= 18],
  probs = c(0.25, 0.5, 0.75),
  na.rm = TRUE
)

## 25% 50% 75% 
##   2  15  18

Interested in the breakdown of deceased by age under 18, creating groups and graphing. =>

library(dplyr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    age_group = case_when(
      Deceased_Age >= 0  & Deceased_Age <= 1  ~ "0–1",
      Deceased_Age >= 2  & Deceased_Age <= 3  ~ "2–3",
      Deceased_Age >= 4  & Deceased_Age <= 5  ~ "4–5",
      Deceased_Age >= 6  & Deceased_Age <= 7  ~ "6–7",
      Deceased_Age >= 8  & Deceased_Age <= 9  ~ "8–9",
      Deceased_Age >= 10 & Deceased_Age <= 11 ~ "10–11",
      Deceased_Age >= 12 & Deceased_Age <= 13 ~ "12–13",
      Deceased_Age >= 14 & Deceased_Age <= 15 ~ "14–15",
      Deceased_Age >= 16 & Deceased_Age <= 17 ~ "16–17",
      Deceased_Age == 18                      ~ "18",
      TRUE ~ NA_character_
    )
  )
  
#Ordered

DIH_DATA$age_group <- factor(
  DIH_DATA$age_group,
  levels = c("0–1","2–3","4–5","6–7","8–9","10–11","12–13","14–15","16–17","18")
)

#Plot
library(ggplot2)

ggplot(
  DIH_DATA %>% filter(Deceased_Age <= 18, !is.na(age_group)),
  aes(x = age_group, fill = age_group)
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Deceased Age (≤ 18)",
    x = "Age Group",
    y = "Count"
  ) +
  guides(fill = "none")   # removes legend if you don’t want it

freq_table(DIH_DATA, Deceased_Age)

## # A tibble: 75 × 3
##    Deceased_Age     n percent
##           <dbl> <int>   <dbl>
##  1           NA   546   16.7 
##  2           26   141    4.32
##  3           28   137    4.20
##  4           21   134    4.11
##  5           23   126    3.86
##  6           25   122    3.74
##  7           24   121    3.71
##  8           30   116    3.55
##  9           22   114    3.49
## 10           27   106    3.25
## # ℹ 65 more rows

freq_table(DIH_DATA, age_group)

## # A tibble: 11 × 3
##    age_group     n percent
##    <fct>     <int>   <dbl>
##  1 <NA>       3002 92.0   
##  2 18           68  2.08  
##  3 16–17        61  1.87  
##  4 0–1          48  1.47  
##  5 2–3          32  0.980 
##  6 14–15        23  0.705 
##  7 4–5           9  0.276 
##  8 10–11         7  0.214 
##  9 6–7           6  0.184 
## 10 12–13         5  0.153 
## 11 8–9           3  0.0919

• Deceased_Race

freq_table(DIH_DATA, Deceased_Race)

## # A tibble: 13 × 3
##    Deceased_Race                         n percent
##    <chr>                             <int>   <dbl>
##  1 White                              1964 60.2   
##  2 <NA>                               1103 33.8   
##  3 Black                                90  2.76  
##  4 NA                                   66  2.02  
##  5 Unknown POC                          17  0.521 
##  6 Asian                                10  0.306 
##  7 Native American or Alaskan Native     6  0.184 
##  8 Hispanic/Latino                       3  0.0919
##  9 Arabic                                1  0.0306
## 10 Multiple Deceased                     1  0.0306
## 11 Thomas McGuinness                     1  0.0306
## 12 Unknown                               1  0.0306
## 13 X                                     1  0.0306

Need to fix these… Arabic is changed to White. Thomas McGuinness changed to NA. Fix the NAs, X and Unknown

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Race = na_if(Deceased_Race, "NA"),
    Deceased_Race = case_when(
      Deceased_Race == "Arabic" ~ "White",
      Deceased_Race == "Thomas McGuinness" ~ NA_character_,
      Deceased_Race %in% c("Unknown", "X") ~ NA_character_,
      TRUE ~ Deceased_Race
    )
  )

freq_table(DIH_DATA, Deceased_Race)

## # A tibble: 8 × 3
##   Deceased_Race                         n percent
##   <chr>                             <int>   <dbl>
## 1 White                              1965 60.2   
## 2 <NA>                               1172 35.9   
## 3 Black                                90  2.76  
## 4 Unknown POC                          17  0.521 
## 5 Asian                                10  0.306 
## 6 Native American or Alaskan Native     6  0.184 
## 7 Hispanic/Latino                       3  0.0919
## 8 Multiple Deceased                     1  0.0306

bar_plot_cat(DIH_DATA, Deceased_Race)

• Deceased_Ethnicity

freq_table(DIH_DATA, Deceased_Ethnicity)

## # A tibble: 9 × 3
##   Deceased_Ethnicity      n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2198 67.3   
## 2 Not Hispanic/Latino   844 25.9   
## 3 NA                    116  3.55  
## 4 Hispanic/Latino        99  3.03  
## 5 Hispanic Latino         2  0.0613
## 6 White                   2  0.0613
## 7 Multiple Deceased       1  0.0306
## 8 Not Hispanic/latino     1  0.0306
## 9 X                       1  0.0306

Again, fixing the data…

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Ethnicity = na_if(Deceased_Ethnicity, "NA"),
    Deceased_Ethnicity = case_when(
      Deceased_Ethnicity == "Hispanic Latino" ~ "Hispanic/Latino",
      Deceased_Ethnicity == "Not Hispanic/latino" ~ "Not Hispanic/Latino",
      Deceased_Ethnicity == "White" ~ NA_character_,
      Deceased_Ethnicity == "X" ~ NA_character_,
      TRUE ~ Deceased_Ethnicity
    )
  )

freq_table(DIH_DATA, Deceased_Ethnicity)

## # A tibble: 4 × 3
##   Deceased_Ethnicity      n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2317 71.0   
## 2 Not Hispanic/Latino   845 25.9   
## 3 Hispanic/Latino       101  3.09  
## 4 Multiple Deceased       1  0.0306

bar_plot_cat(DIH_DATA, Deceased_Ethnicity)

Large missingness

• Deceased_Sex

freq_table(DIH_DATA, Deceased_Sex)

## # A tibble: 14 × 3
##    Deceased_Sex          n percent
##    <chr>             <int>   <dbl>
##  1 Male               1912 58.6   
##  2 Female             1090 33.4   
##  3 <NA>                220  6.74  
##  4 NA                   25  0.766 
##  5 male                  6  0.184 
##  6 Famale                2  0.0613
##  7 female                2  0.0613
##  8 27                    1  0.0306
##  9 Both Female           1  0.0306
## 10 Both male             1  0.0306
## 11 Female, Male          1  0.0306
## 12 Multiple Deceased     1  0.0306
## 13 Other                 1  0.0306
## 14 X                     1  0.0306

changing both to be NA

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Sex = case_when(
      Deceased_Sex %in% c("NA","27","X","Other",
                         "Both Female","Both male","Female, Male", "Both") ~ NA_character_,
      Deceased_Sex %in% c("male","Male") ~ "Male",
      Deceased_Sex %in% c("female","Female","Famale") ~ "Female",
      TRUE ~ Deceased_Sex
    )
  )

freq_table(DIH_DATA, Deceased_Sex)

## # A tibble: 4 × 3
##   Deceased_Sex          n percent
##   <chr>             <int>   <dbl>
## 1 Male               1918 58.8   
## 2 Female             1094 33.5   
## 3 <NA>                251  7.69  
## 4 Multiple Deceased     1  0.0306

#🌍Rural Code NA’s: 112

ggplot(
  DIH_DATA %>% filter(!is.na(Rural_Code)),
  aes(x = factor(Rural_Code), fill = factor(Rural_Code))
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Rural_Code",
    x = "Rural Code",
    y = "Count"
  ) +
  guides(fill = "none")

Creating urbanacity variable

DIH_DATA <- DIH_DATA %>%
  mutate(
    Urbanicity = case_when(
      Rural_Code %in% 1:3 ~ "Metropolitan",
      Rural_Code %in% 4:6 ~ "Suburban",
      Rural_Code %in% 7:9 ~ "Rural",
      TRUE ~ NA_character_
    )
  )

#Ordering for plotting
DIH_DATA$Urbanicity <- factor(
  DIH_DATA$Urbanicity,
  levels = c("Metropolitan", "Suburban", "Rural")
)

ggplot(
  DIH_DATA %>% filter(!is.na(Urbanicity)),
  aes(x = Urbanicity, fill = Urbanicity)
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Urbanicity",
    x = "Urbanicity",
    y = "Count"
  ) +
  guides(fill = "none")

Would be interesting to see state by urbancity…

ggplot(
  DIH_DATA %>% filter(!is.na(State), !is.na(Urbanicity)),
  aes(x = State, fill = Urbanicity)
) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Urbanicity by State",
    x = "State",
    y = "Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

I guess this could just be a function of how rural the state is…but we could test further…

#📍State

There is a lowercase pennsylvania in the dataset…fixing it…

DIH_DATA %>%
  filter(State == "pennsylvania")

## # A tibble: 1 × 36
##   ID    URL       Court Rural_Code County_Fips County_FullName County_Name State
##   <chr> <chr>     <chr>      <dbl> <chr>       <chr>           <chr>       <chr>
## 1 540   3 senten… <NA>           2 42071       Lancaster Coun… Lancaster   penn…
## # ℹ 28 more variables: Year_Charged <dbl>, `Date Charge` <date>,
## #   `Date Incident` <date>, Accused_Name <chr>, Sentence <chr>,
## #   `Sentence Quartile` <chr>, Charge <chr>, Accused_Race <chr>,
## #   Accused_Ethnicity <chr>, Accused_Sex <chr>, Accused_Age <dbl>,
## #   Accused_City <chr>, Accused_State <chr>, Court_Type <chr>, Plea <chr>,
## #   Deceased_Name <chr>, Deceased_Race <chr>, Deceased_Ethnicity <chr>,
## #   Deceased_Age <dbl>, Deceased_Sex <chr>, Substance <chr>, Lawyer <chr>, …

DIH_DATA <- DIH_DATA %>%
  mutate(
    State = ifelse(State == "pennsylvania", "Pennsylvania", State)
  )

bar_plot_cat(DIH_DATA, State)

freq_table(DIH_DATA, State)

## # A tibble: 50 × 3
##    State              n percent
##    <chr>          <int>   <dbl>
##  1 Pennsylvania     578   17.7 
##  2 Ohio             354   10.8 
##  3 Wisconsin        351   10.8 
##  4 Illinois         299    9.16
##  5 Florida          125    3.83
##  6 Minnesota        116    3.55
##  7 New Jersey       113    3.46
##  8 New York         112    3.43
##  9 North Carolina   111    3.40
## 10 Michigan          99    3.03
## # ℹ 40 more rows

#⚖️ Sentence Quartile

sum(!is.na(DIH_DATA$Sentence_Quartile_calc))

## [1] 2055

#⚖️ Legal / case variables ## • Charge

freq_table(DIH_DATA,Charge)

## # A tibble: 32 × 3
##    Charge                                                              n percent
##    <chr>                                                           <int>   <dbl>
##  1 Drug Delivery/Distribution Resulting in Death                     610   18.7 
##  2 Involuntary Manslaughter                                          479   14.7 
##  3 Reckless/Negligent Homicide                                       432   13.2 
##  4 Murder (2nd or 3rd Degree)                                        342   10.5 
##  5 Drug Induced Homicide                                             306    9.38
##  6 Manslaughter                                                      252    7.72
##  7 Delivery/Distribution of Heroin/Fentanyl Resulting in Death       151    4.63
##  8 Delivery/Distribution Of a Controlled Substance Causing/Result…   122    3.74
##  9 Other (Please Specify in Notes)                                   101    3.09
## 10 Murder (1st Degree)                                                99    3.03
## # ℹ 22 more rows

@Katie Flagging this – still don’t know how these were categorized…

• Plea

freq_table(DIH_DATA,Plea)

## # A tibble: 37 × 3
##    Plea                              n percent
##    <chr>                         <int>   <dbl>
##  1 Guilty                         1756  53.8  
##  2 <NA>                            907  27.8  
##  3 Not Guilty                      205   6.28 
##  4 NA                              188   5.76 
##  5 Guilty of a Lesser Charge        46   1.41 
##  6 No Contest                       42   1.29 
##  7 Awaiting Plea Hearing            31   0.950
##  8 Guilty to a Lesser Charge        31   0.950
##  9 Pleaded Guilty                   13   0.398
## 10 Guilty to a Lesser DIH Charge    12   0.368
## # ℹ 27 more rows

Fixed the data as follows…

DIH_DATA <- DIH_DATA %>%
  mutate(
    Plea = na_if(Plea, "NA"),
    Plea = case_when(
      str_detect(tolower(Plea), "guilty") & !str_detect(tolower(Plea), "lesser") ~ "Guilty",
      str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
      str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
      str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
      str_detect(tolower(Plea), "pending") ~ "Pending/Awaiting",
      Plea %in% c("State", "X", "Other") ~ NA_character_,
      TRUE ~ Plea
    )
  )

DIH_DATA <- DIH_DATA %>%
  mutate(
    Plea = case_when(
      Plea == "Judge ruled it was not murder" ~ NA_character_,
      Plea == "Pleads not guily" ~ "Guilty",
      Plea == "Pled guity to DIH charges" ~ "Guilty",
      TRUE ~ Plea
    )
  )

freq_table(DIH_DATA, Plea)

## # A tibble: 6 × 3
##   Plea                        n percent
##   <chr>                   <int>   <dbl>
## 1 Guilty                   1996 61.2   
## 2 <NA>                     1097 33.6   
## 3 Guilty to Lesser Charge    91  2.79  
## 4 No Contest                 48  1.47  
## 5 Awaiting Plea Hearing      31  0.950 
## 6 Alford Plea                 1  0.0306

bar_plot_cat(DIH_DATA, Plea)

Do we have data on verdicts? Versus just pleas

• Court_Type

freq_table(DIH_DATA,Court_Type)

## # A tibble: 6 × 3
##   Court_Type         n percent
##   <chr>          <int>   <dbl>
## 1 State           2789 85.4   
## 2 Federal          388 11.9   
## 3 <NA>              72  2.21  
## 4 NA                12  0.368 
## 5 federal            2  0.0613
## 6 Fayette County     1  0.0306

DIH_DATA <- DIH_DATA %>%
  mutate(
    Court_Type = na_if(Court_Type, "NA"),
    Court_Type = case_when(
      Court_Type == "federal" ~ "Federal",
      Court_Type == "Fayette County" ~ NA_character_,
      TRUE ~ Court_Type
    )
  )

freq_table(DIH_DATA, Court_Type)

## # A tibble: 3 × 3
##   Court_Type     n percent
##   <chr>      <int>   <dbl>
## 1 State       2789   85.4 
## 2 Federal      390   11.9 
## 3 <NA>          85    2.60

🔗 Case context

• Relationship

freq_table(DIH_DATA, Relationship)

## # A tibble: 6 × 3
##   Relationship                                n percent
##   <chr>                                   <int>   <dbl>
## 1 Dealer/Buyer                             2097  64.2  
## 2 Caretaker/Family/Friend/Partner/Co-User   768  23.5  
## 3 <NA>                                      276   8.46 
## 4 Doctor/Patient                             88   2.70 
## 5 NA                                         26   0.797
## 6 Dealer/buyer                                9   0.276

Fixing typos.

DIH_DATA <- DIH_DATA %>%
  mutate(
    Relationship = na_if(Relationship, "NA"),
    Relationship = case_when(
      Relationship %in% c("Dealer/buyer", "Dealer/Buyer") ~ "Dealer/Buyer",
      TRUE ~ Relationship
    )
  )

freq_table(DIH_DATA, Relationship)

## # A tibble: 4 × 3
##   Relationship                                n percent
##   <chr>                                   <int>   <dbl>
## 1 Dealer/Buyer                             2106   64.5 
## 2 Caretaker/Family/Friend/Partner/Co-User   768   23.5 
## 3 <NA>                                      302    9.25
## 4 Doctor/Patient                             88    2.70

• Substance

freq_table(DIH_DATA, Substance)

## # A tibble: 39 × 3
##    Substance                                    n percent
##    <chr>                                    <int>   <dbl>
##  1 Heroin                                     944   28.9 
##  2 <NA>                                       617   18.9 
##  3 Fentanyl (Analog)                          459   14.1 
##  4 Fentanyl (Analog) and Heroin               408   12.5 
##  5 Prescription Opioid                        137    4.20
##  6 Mixed Drugs Not Listed (Comment in Cell)   113    3.46
##  7 Methadone                                   80    2.45
##  8 Other Drug (Comment in Cell)                68    2.08
##  9 Methamphetamine                             66    2.02
## 10 Fentanyl Analog                             47    1.44
## # ℹ 29 more rows

Cleaning data as follows…

library(stringr)
library(dplyr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Substance = tolower(Substance),
    Substance = na_if(Substance, "na"),
    
    Substance_clean = case_when(
      str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
      str_detect(Substance, "fentanyl") ~ "Fentanyl",
      str_detect(Substance, "heroin") ~ "Heroin",
      str_detect(Substance, "meth") ~ "Methamphetamine",
      str_detect(Substance, "cocaine") ~ "Cocaine",
      str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
      str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
      str_detect(Substance, "xanax|benzo") ~ "Other",
      str_detect(Substance, "ecstasy") ~ "Other",
      str_detect(Substance, "benadryl") ~ "Other",
      str_detect(Substance, "bullskin") ~ NA_character_,
      TRUE ~ "Other"
    )
  )

freq_table(DIH_DATA, Substance_clean)

## # A tibble: 9 × 3
##   Substance_clean       n percent
##   <chr>             <int>   <dbl>
## 1 Heroin              945 29.0   
## 2 Other               707 21.7   
## 3 Fentanyl            670 20.5   
## 4 Fentanyl + Heroin   429 13.1   
## 5 Other Opioid        162  4.96  
## 6 Methamphetamine     161  4.93  
## 7 Polysubstance       113  3.46  
## 8 Cocaine              76  2.33  
## 9 <NA>                  1  0.0306

bar_plot_cat(DIH_DATA, Substance_clean)

library(dplyr)
library(tidyr)

options(scipen = 999)

numeric_summary <- DIH_DATA %>%
  summarise(
    across(
      c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
      list(
        nonmissing = ~sum(!is.na(.)),
        missing = ~sum(is.na(.)),
        mean = ~mean(., na.rm = TRUE),
        sd = ~sd(., na.rm = TRUE),
        median = ~median(., na.rm = TRUE),
        min = ~min(., na.rm = TRUE),
        max = ~max(., na.rm = TRUE)
      )
    )
  ) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", "Statistic"),
    names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
  ) %>%
  pivot_wider(names_from = Statistic, values_from = value)

numeric_summary <- numeric_summary %>%
  mutate(
    min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
    median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
    max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
  )


numeric_summary

## # A tibble: 5 × 8
##   Variable     nonmissing missing    mean    sd median min           max  
##   <chr>             <dbl>   <dbl>   <dbl> <dbl> <chr>  <chr>         <chr>
## 1 Rural_Code         3152     112    2.47  1.89 2      1             9    
## 2 Year_Charged       3264       0 2016.    4.02 2017   1974          2026 
## 3 Accused_Age        1040    2224   34.3  10.3  32     17            87   
## 4 Deceased_Age       2718     546   29.6  11.5  28     0             91   
## 5 Sentence_num       2055    1209   11.5  12.1  8      0.08219178082 180

library(dplyr)
library(purrr)
library(tidyr)
library(gt)

cat_vars <- c(
  "State",
  "Charge",
  "Accused_Race",
  "Accused_Ethnicity",
  "Accused_Sex",
  "Accused_State",
  "Court_Type",
  "Plea",
  "Deceased_Race",
  "Deceased_Ethnicity",
  "Deceased_Sex",
  "Substance",
  "Relationship",
  "Urbanicity",
  "Sentence_Quartile_calc"
)

categorical_summary <- map_dfr(cat_vars, function(v) {
  x <- DIH_DATA[[v]]
  tab <- sort(table(x, useNA = "no"), decreasing = TRUE)

  tibble(
    Variable = v,
    nonmissing = sum(!is.na(x)),
    missing = sum(is.na(x)),
    n_categories = dplyr::n_distinct(x, na.rm = TRUE),
    most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
    most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
    most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
  )
})

categorical_summary

## # A tibble: 15 × 7
##    Variable            nonmissing missing n_categories most_common most_common_n
##    <chr>                    <int>   <int>        <int> <chr>               <int>
##  1 State                     3264       0           50 Pennsylvan…           578
##  2 Charge                    3263       1           31 Drug Deliv…           610
##  3 Accused_Race              2737     527            5 White                2069
##  4 Accused_Ethnicity          883    2381            2 Not Hispan…           736
##  5 Accused_Sex               3244      20            2 Male                 2388
##  6 Accused_State             3199      65           59 Pennsylvan…           549
##  7 Court_Type                3179      85            2 State                2789
##  8 Plea                      2167    1097            5 Guilty               1996
##  9 Deceased_Race             2092    1172            7 White                1965
## 10 Deceased_Ethnicity         947    2317            3 Not Hispan…           845
## 11 Deceased_Sex              3013     251            3 Male                 1918
## 12 Substance                 2641     623           36 heroin                945
## 13 Relationship              2962     302            3 Dealer/Buy…          2106
## 14 Urbanicity                3152     112            3 Metropolit…          2525
## 15 Sentence_Quartile_…       2055    1209            4 1                     514
## # ℹ 1 more variable: most_common_pct <dbl>

Accused_State has 59 states so it needs to be cleaned if we want to use that variable… not going to bother right now because we aren’t using it

#Bivariate Relationships ##Numeric vs. Numer

cor.test(
  DIH_DATA$Sentence_num,
  DIH_DATA$Accused_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Accused_Age, :
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Sentence_num and DIH_DATA$Accused_Age
## S = 12670082, p-value = 0.005528
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1314717

cor.test(
  DIH_DATA$Sentence_num,
  DIH_DATA$Deceased_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Deceased_Age, :
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Sentence_num and DIH_DATA$Deceased_Age
## S = 930447121, p-value = 0.2802
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.02575955

ggplot(DIH_DATA, aes(x = Accused_Age, y = Sentence_num)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2820 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2820 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(DIH_DATA, aes(x = Deceased_Age, y = Sentence_num)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1505 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1505 rows containing missing values or values outside the scale range
## (`geom_point()`).

outcome variable: • heavily skewed • heteroskedastic (variance increases with age for accused and Spread increases in the middle age range (~20–50) for deceased) • outliers present

This violates assumptions of:
•   OLS regression
•   Pearson correlation

So used Spearman correlation -- @Katie ... any thoughts on other tests to run here?
Given the skewed distribution of sentence length and the presence of outliers, Spearman’s rank correlation was used.

###Sentence length by Sex

DIH_DATA %>%
  group_by(Accused_Sex) %>%
  summarise(
    n = sum(!is.na(Sentence_num)),
    mean_sentence = mean(Sentence_num, na.rm = TRUE),
    median_sentence = median(Sentence_num, na.rm = TRUE)
  )

## # A tibble: 3 × 4
##   Accused_Sex     n mean_sentence median_sentence
##   <chr>       <int>         <dbl>           <dbl>
## 1 Female        560          9.83               6
## 2 Male         1486         12.2                9
## 3 <NA>            9          6.86               5

ggplot(DIH_DATA, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
  geom_boxplot() +
  guides(fill = "none")

## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

t.test(Sentence_num ~ Accused_Sex, data = DIH_DATA)

## 
##  Welch Two Sample t-test
## 
## data:  Sentence_num by Accused_Sex
## t = -4.1076, df = 1071.3, p-value = 0.00004303
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -3.535442 -1.249629
## sample estimates:
## mean in group Female   mean in group Male 
##             9.830096            12.222631

Comparison of Sentence Length by Accused Sex

A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -4.11, p < 0.001). • Mean sentence (Female): 9.83 • Mean sentence (Male): 12.22 • Mean difference: approximately 2.39 units higher for males • 95% CI for the difference: [-3.54, -1.25]

These results suggest that, on average, male defendants receive longer sentences than female defendants.

However, several important caveats should be noted: • The distribution of sentence length is highly skewed with substantial outliers, particularly among male defendants. • Because of this skewness, the mean may not fully represent the typical case, and results should be interpreted cautiously. • A nonparametric alternative (e.g., Mann–Whitney U test) or analysis using transformed data (e.g., log(sentence)) may provide a more robust assessment.

Additionally, the boxplot visualization shows: • Greater variability in sentence length among males • More extreme high-value outliers in the male group • Substantial overlap between groups despite the difference in means

Finally, the plotting warning indicates that 1,209 observations were removed due to missing or non-finite values, which may affect representativeness and should be investigated further.

let’s discuss if I should run these additional tests or if we are satisfied with these findings

##Sentence_Quartile Bivariates on full dataset

cor.test(
  DIH_DATA$Accused_Age,
  DIH_DATA$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Accused_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Accused_Age and DIH_DATA$Sentence_Quartile_calc
## S = 12484370, p-value = 0.00232
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1442022

cor.test(
  DIH_DATA$Deceased_Age,
  DIH_DATA$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Deceased_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Deceased_Age and DIH_DATA$Sentence_Quartile_calc
## S = 902691085, p-value = 0.8393
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## 0.004839738

###SentQ+Charge

kruskal.test(Sentence_Quartile_calc ~ Charge, data = DIH_DATA)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 258.17, df = 28, p-value <
## 0.00000000000000022

Again this wont mean much until we clean the Charge variable

###SentQ+Relationship Tests whether distributions differ across groups

kruskal.test(Sentence_Quartile_calc ~ Relationship, data = DIH_DATA)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 40.074, df = 2, p-value = 0.000000001986

table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)

##                                          
##                                             1   2   3   4
##   Caretaker/Family/Friend/Partner/Co-User 165 131 116  93
##   Dealer/Buyer                            291 339 353 363
##   Doctor/Patient                            5  10  10  22

tab2 <- table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)
chisq.test(tab2)

## 
##  Pearson's Chi-squared test
## 
## data:  tab2
## X-squared = 44.016, df = 6, p-value = 0.00000007337

#Analytic Sample

@Katie we will need to include a justification in the methods section if it is not already in there about why we restricted the data to this time frame.

dih_analytic <- DIH_DATA %>%
  filter(Year_Charged >= 2011 & Year_Charged <= 2021) %>%
  filter(!is.na(Rural_Code))

Hmmmm if we are going to restrict the analytic data sample by years… then perhaps we need to run descriptives here again with the analytic sample to include in the methods section?

library(dplyr)
library(tidyr)

options(scipen = 999)

numeric_summary_B <- dih_analytic %>%
  summarise(
    across(
      c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
      list(
        nonmissing = ~sum(!is.na(.)),
        missing = ~sum(is.na(.)),
        mean = ~mean(., na.rm = TRUE),
        sd = ~sd(., na.rm = TRUE),
        median = ~median(., na.rm = TRUE),
        min = ~min(., na.rm = TRUE),
        max = ~max(., na.rm = TRUE)
      )
    )
  ) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", "Statistic"),
    names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
  ) %>%
  pivot_wider(names_from = Statistic, values_from = value)

numeric_summary_B <- numeric_summary_B %>%
  mutate(
    min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
    median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
    max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
  )


numeric_summary_B

## # A tibble: 5 × 8
##   Variable     nonmissing missing    mean    sd median min           max  
##   <chr>             <dbl>   <dbl>   <dbl> <dbl> <chr>  <chr>         <chr>
## 1 Rural_Code         2838       0    2.49  1.88 2      1             9    
## 2 Year_Charged       2838       0 2017.    2.36 2017   2011          2021 
## 3 Accused_Age         976    1862   34.3  10.4  32     17            87   
## 4 Deceased_Age       2375     463   29.9  11.5  28     0             91   
## 5 Sentence_num       1772    1066   11.4  11.8  8      0.08219178082 180

library(dplyr)
library(purrr)
library(tidyr)
library(gt)

categorical_summary2 <- map_dfr(cat_vars, function(v) {
  x <- dih_analytic[[v]]
  tab <- sort(table(x, useNA = "no"), decreasing = TRUE)

  tibble(
    Variable = v,
    nonmissing = sum(!is.na(x)),
    missing = sum(is.na(x)),
    n_categories = dplyr::n_distinct(x, na.rm = TRUE),
    most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
    most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
    most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
  )
})

categorical_summary2

## # A tibble: 15 × 7
##    Variable            nonmissing missing n_categories most_common most_common_n
##    <chr>                    <int>   <int>        <int> <chr>               <int>
##  1 State                     2838       0           47 Pennsylvan…           553
##  2 Charge                    2837       1           30 Drug Deliv…           571
##  3 Accused_Race              2441     397            5 White                1819
##  4 Accused_Ethnicity          829    2009            2 Not Hispan…           697
##  5 Accused_Sex               2823      15            2 Male                 2082
##  6 Accused_State             2777      61           58 Pennsylvan…           525
##  7 Court_Type                2760      78            2 State                2436
##  8 Plea                      1835    1003            5 Guilty               1688
##  9 Deceased_Race             1867     971            7 White                1750
## 10 Deceased_Ethnicity         873    1965            3 Not Hispan…           781
## 11 Deceased_Sex              2632     206            3 Male                 1685
## 12 Substance                 2317     521           34 heroin                823
## 13 Relationship              2576     262            3 Dealer/Buy…          1883
## 14 Urbanicity                2838       0            3 Metropolit…          2274
## 15 Sentence_Quartile_…       1772    1066            4 2                     457
## # ℹ 1 more variable: most_common_pct <dbl>

##Testing Bivariate Relationships in Analytic Sample - Outcome is Sentence_num

cor.test(
  dih_analytic$Sentence_num,
  dih_analytic$Accused_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Accused_Age, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Sentence_num and dih_analytic$Accused_Age
## S = 10194801, p-value = 0.003246
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1441686

cor.test(
  dih_analytic$Sentence_num,
  dih_analytic$Deceased_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Deceased_Age, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Sentence_num and dih_analytic$Deceased_Age
## S = 601445934, p-value = 0.1897
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.03369232

cor.test(
  dih_analytic$Sentence_num,
  dih_analytic$Year_Charged,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Year_Charged, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Sentence_num and dih_analytic$Year_Charged
## S = 903777791, p-value = 0.285
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02541011

dih_analytic %>%
  group_by(Accused_Sex) %>%
  summarise(
    n = sum(!is.na(Sentence_num)),
    mean_sentence = mean(Sentence_num, na.rm = TRUE),
    median_sentence = median(Sentence_num, na.rm = TRUE)
  )

## # A tibble: 3 × 4
##   Accused_Sex     n mean_sentence median_sentence
##   <chr>       <int>         <dbl>           <dbl>
## 1 Female        479         10.0                6
## 2 Male         1286         11.9                9
## 3 <NA>            7          7.25               5

ggplot(dih_analytic, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
  geom_boxplot() +
  guides(fill = "none")

## Warning: Removed 1066 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

t.test(Sentence_num ~ Accused_Sex, data = dih_analytic)

## 
##  Welch Two Sample t-test
## 
## data:  Sentence_num by Accused_Sex
## t = -2.9574, df = 831.36, p-value = 0.00319
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -3.1560701 -0.6379943
## sample estimates:
## mean in group Female   mean in group Male 
##             10.00579             11.90282

Comparison of Sentence Length by Accused Sex (2017–2021 Subset)

A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants for cases from 2017–2021. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -2.96, p = 0.003). • Mean sentence (Female): 10.01 • Mean sentence (Male): 11.90 • Mean difference: approximately 1.9 units higher for males • 95% CI for the difference: [-3.16, -0.64]

These results suggest that, within this time period, male defendants receive longer sentences on average than female defendants.

Additional descriptive statistics support this pattern: • Median sentence (Female): 6 • Median sentence (Male): 9

This indicates that the difference is also present at the median level, not just driven by extreme values.

However, several important considerations apply: • Sentence length remains highly skewed with notable outliers, particularly among male defendants. • Although the mean difference is statistically significant, the magnitude of the difference is modest relative to the overall variability in sentencing. • The boxplot shows substantial overlap between the distributions for males and females. • A nonparametric test (e.g., Mann–Whitney U) or analysis using transformed sentence length may provide a more robust assessment.

Additionally, the plotting warning indicates that 1,066 observations were removed due to missing or non-finite values, which should be examined to ensure no systematic bias in the analyzed subset.

summary(aov(Sentence_num ~ Urbanicity, data = dih_analytic))

##               Df Sum Sq Mean Sq F value Pr(>F)
## Urbanicity     2    408   204.0   1.463  0.232
## Residuals   1769 246685   139.4               
## 1066 observations deleted due to missingness

summary(aov(Sentence_num ~ Charge, data = dih_analytic))

##               Df Sum Sq Mean Sq F value         Pr(>F)    
## Charge        27  13356   494.7   3.691 0.000000000635 ***
## Residuals   1744 233737   134.0                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1066 observations deleted due to missingness

This suggest the sentence length varies by charge… but we need information about how charges were grouped before we can run this analysis properly

summary(aov(Sentence_num ~ Relationship, data = dih_analytic))

##                Df Sum Sq Mean Sq F value   Pr(>F)    
## Relationship    2   2048  1024.0   7.471 0.000589 ***
## Residuals    1632 223687   137.1                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1203 observations deleted due to missingness

##Testing Bivariate Relationships in Analytic Sample: Outcome is Sentencing Quartile.

###SentQ+DecAge

cor.test(
  dih_analytic$Accused_Age,
  dih_analytic$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Accused_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Accused_Age and dih_analytic$Sentence_Quartile_calc
## S = 10081186, p-value = 0.001687
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1537063

cor.test(
  dih_analytic$Deceased_Age,
  dih_analytic$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Deceased_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Deceased_Age and dih_analytic$Sentence_Quartile_calc
## S = 582264805, p-value = 0.9775
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##           rho 
## -0.0007261225

###SentQ+Yr

cor.test(
  dih_analytic$Year_Charged,
  dih_analytic$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Year_Charged,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Year_Charged and dih_analytic$Sentence_Quartile_calc
## S = 904918982, p-value = 0.309
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02417951

Later years are associated with higher sentence quartiles ρ ≈ 0.28 → moderate (not strong)

Over time, cases are more likely to fall into higher relative sentence bins

###SentQ+Charge

kruskal.test(Sentence_Quartile_calc ~ Charge, data = dih_analytic)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 235.81, df = 27, p-value <
## 0.00000000000000022

Again this wont mean much until we clean the Charge variable

###SentQ+Relationship Tests whether distributions differ across groups

kruskal.test(Sentence_Quartile_calc ~ Relationship, data = dih_analytic)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 27.177, df = 2, p-value = 0.000001255

table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)

##                                          
##                                             1   2   3   4
##   Caretaker/Family/Friend/Partner/Co-User 124 112  91  73
##   Dealer/Buyer                            265 310 309 318
##   Doctor/Patient                            4   5  10  14

tab22 <- table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)
chisq.test(tab22)

## 
##  Pearson's Chi-squared test
## 
## data:  tab22
## X-squared = 27.92, df = 6, p-value = 0.00009729

###SentQ+Plea

tab_plea <- table(dih_analytic$Plea, dih_analytic$Sentence_Quartile_calc)
tab_plea

##                          
##                             1   2   3   4
##   Alford Plea               0   0   1   0
##   Awaiting Plea Hearing     0   0   0   1
##   Guilty                  360 355 351 350
##   Guilty to Lesser Charge  18  28  11   9
##   No Contest               16   7   7   5

Well that is a problem! why do we have “awaiting plea hearing” for ones that we have sentences for?

DIH_DATA %>%
  filter(
    Plea == "Awaiting Plea Hearing" &
    (!is.na(Sentence_num) | !is.na(Sentence_Quartile_calc))
  ) %>%
  summarise(n = n())

## # A tibble: 1 × 1
##       n
##   <int>
## 1     1

DIH_DATA %>%
  filter(Plea == "Awaiting Plea Hearing") %>%
  count(Sentence_num, Sentence_Quartile_calc, sort = TRUE)

## # A tibble: 2 × 3
##   Sentence_num Sentence_Quartile_calc     n
##          <dbl>                  <int> <int>
## 1           NA                     NA    30
## 2           36                      4     1

DIH_DATA %>%
  filter(Plea == "Awaiting Plea Hearing") %>%
  count(Sentence, Sentence_Quartile_calc, sort = TRUE)

## # A tibble: 3 × 3
##   Sentence Sentence_Quartile_calc     n
##   <chr>                     <int> <int>
## 1 Pending                      NA    28
## 2 pending                      NA     2
## 3 36                            4     1

#Logit Model Sent Q

#Other Relationships

Sex

“Do cases involving women tend to occur when the deceased is younger?”

wilcox.test(Deceased_Age ~ Accused_Sex, data = DIH_DATA)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Deceased_Age by Accused_Sex
## W = 761528, p-value = 0.06442
## alternative hypothesis: true location shift is not equal to 0

DIH_DATA %>%
  group_by(Accused_Sex) %>%
  summarise(
    n = sum(!is.na(Deceased_Age)),
    mean_age = mean(Deceased_Age, na.rm = TRUE),
    median_age = median(Deceased_Age, na.rm = TRUE)
  )

## # A tibble: 3 × 4
##   Accused_Sex     n mean_age median_age
##   <chr>       <int>    <dbl>      <dbl>
## 1 Female        741     29.8         30
## 2 Male         1965     29.6         28
## 3 <NA>           12     24           23

ggplot(DIH_DATA, aes(x = Accused_Sex, y = Deceased_Age, fill = Accused_Sex)) +
  geom_boxplot() +
  guides(fill = "none")

## Warning: Removed 546 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

“Are younger victims more likely to involve female defendants?”

glm(I(Accused_Sex == "Female") ~ Deceased_Age,
    data = DIH_DATA,
    family = binomial)

## 
## Call:  glm(formula = I(Accused_Sex == "Female") ~ Deceased_Age, family = binomial, 
##     data = DIH_DATA)
## 
## Coefficients:
##  (Intercept)  Deceased_Age  
##    -1.011494      0.001221  
## 
## Degrees of Freedom: 2705 Total (i.e. Null);  2704 Residual
##   (558 observations deleted due to missingness)
## Null Deviance:       3177 
## Residual Deviance: 3177  AIC: 3181

There is no strong evidence that cases involving female defendants are associated with younger (or older) victims.

##Child cases

child_data <- DIH_DATA %>%
  filter(!is.na(age_group))

child_data %>%
  group_by(age_group, Accused_Sex) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(age_group) %>%
  mutate(prop = n / sum(n))

## # A tibble: 22 × 4
## # Groups:   age_group [10]
##    age_group Accused_Sex     n  prop
##    <fct>     <chr>       <int> <dbl>
##  1 0–1       Female         30 0.625
##  2 0–1       Male           18 0.375
##  3 2–3       Female         22 0.688
##  4 2–3       Male           10 0.312
##  5 4–5       Female          6 0.667
##  6 4–5       Male            3 0.333
##  7 6–7       Female          1 0.167
##  8 6–7       Male            5 0.833
##  9 8–9       Female          2 0.667
## 10 8–9       Male            1 0.333
## # ℹ 12 more rows

ggplot(child_data, aes(x = age_group, fill = Accused_Sex)) +
  geom_bar(position = "fill") +
  labs(
    x = "Child Age Group",
    y = "Proportion",
    fill = "Accused Sex"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This is pretty striking.

Going to change the age groupings

DIH_DATA <- DIH_DATA %>%
  mutate(
    age_group2 = case_when(
      Deceased_Age <= 5 ~ "0–5",
      Deceased_Age <= 12 ~ "6–12",
      Deceased_Age <= 17 ~ "13–17",
      TRUE ~ NA_character_
    )
  )

DIH_DATA %>%
  filter(!is.na(age_group2)) %>%
  group_by(age_group2, Accused_Sex) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(age_group2) %>%
  mutate(percent = 100 * n / sum(n))

## # A tibble: 7 × 4
## # Groups:   age_group2 [3]
##   age_group2 Accused_Sex     n percent
##   <chr>      <chr>       <int>   <dbl>
## 1 0–5        Female         58   65.2 
## 2 0–5        Male           31   34.8 
## 3 13–17      Female         20   23.0 
## 4 13–17      Male           64   73.6 
## 5 13–17      <NA>            3    3.45
## 6 6–12       Female          9   50   
## 7 6–12       Male            9   50

tab <- DIH_DATA %>%
  filter(!is.na(age_group2)) %>%
  count(age_group2, Accused_Sex) %>%
  tidyr::pivot_wider(
    names_from = Accused_Sex,
    values_from = n,
    values_fill = 0
  )

tab_test <- table(
  DIH_DATA$age_group2,
  DIH_DATA$Accused_Sex
)

tab_test

##        
##         Female Male
##   0–5       58   31
##   13–17     20   64
##   6–12       9    9

chisq.test(tab_test)

## 
##  Pearson's Chi-squared test
## 
## data:  tab_test
## X-squared = 29.963, df = 2, p-value = 0.0000003116

chisq.test(tab_test)$expected

##        
##            Female      Male
##   0–5   40.539267 48.460733
##   13–17 38.261780 45.738220
##   6–12   8.198953  9.801047

No expected counts are less than 0 so we are good.

DIH_DATA <- DIH_DATA %>%
  mutate(female = ifelse(Accused_Sex == "Female", 1, 0))

library(DescTools)

CochranArmitageTest(
  table(DIH_DATA$female, DIH_DATA$age_group2)
)

## 
##  Cochran-Armitage test for trend
## 
## data:  table(DIH_DATA$female, DIH_DATA$age_group2)
## Z = 3.7259, dim = 3, p-value = 0.0001946
## alternative hypothesis: two.sided

model <- glm(
  I(Accused_Sex == "Female") ~ age_group2,
  data = DIH_DATA,
  family = binomial
)

summary(model)

## 
## Call:
## glm(formula = I(Accused_Sex == "Female") ~ age_group2, family = binomial, 
##     data = DIH_DATA)
## 
## Coefficients:
##                 Estimate Std. Error z value    Pr(>|z|)    
## (Intercept)       0.6265     0.2225   2.816     0.00487 ** 
## age_group213–17  -1.7896     0.3393  -5.274 0.000000133 ***
## age_group26–12   -0.6265     0.5213  -1.202     0.22945    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 263.27  on 190  degrees of freedom
## Residual deviance: 232.22  on 188  degrees of freedom
##   (3073 observations deleted due to missingness)
## AIC: 238.22
## 
## Number of Fisher Scoring iterations: 4

Are younger victims more likely to involve female defendants? (Age Group Analysis)

Cases were grouped into three age categories (0–5, 6–12, 13–17), and the distribution of defendant sex across these groups was examined.

Descriptive results show a clear pattern: • Ages 0–5: • Female defendants: 65.2% • Male defendants: 34.8% • Ages 6–12: • Female defendants: 50.0% • Male defendants: 50.0% • Ages 13–17: • Female defendants: 23.0% • Male defendants: 73.6%

This suggests that female defendants are substantially overrepresented in cases involving the youngest victims (ages 0–5), while male defendants dominate cases involving older minors (13–17).

A chi-square test of independence confirms that this association is statistically significant: • χ²(2) = 29.96, p < 0.001

This indicates that the distribution of defendant sex differs meaningfully across age groups.

Trend Analysis

A Cochran–Armitage trend test was also conducted to assess whether there is a systematic trend across ordered age categories. Results from the logistic regression model (with age group as a predictor) further support this: • Compared to ages 0–5 (reference group), cases involving ages 13–17 show a large and statistically significant decrease in the likelihood of a female defendant (p < 0.001) • The 6–12 group does not differ significantly from the youngest group (p = 0.23)

This indicates a strong downward trend:

As victim age increases, the likelihood that the defendant is female decreases substantially, particularly when comparing very young children (0–5) to older adolescents (13–17).

Though I stopped the analysis here – we would need to consider whether to include other control variables in this regression if we ant to report it

##Sex & Relationship

tab_rel_sex <- table(DIH_DATA$Relationship, DIH_DATA$Accused_Sex)
tab_rel_sex

##                                          
##                                           Female Male
##   Caretaker/Family/Friend/Partner/Co-User    304  459
##   Dealer/Buyer                               461 1637
##   Doctor/Patient                              13   75

chisq.test(tab_rel_sex)

## 
##  Pearson's Chi-squared test
## 
## data:  tab_rel_sex
## X-squared = 98.285, df = 2, p-value < 0.00000000000000022

chisq.test(tab_rel_sex)$expected

##                                          
##                                              Female       Male
##   Caretaker/Family/Friend/Partner/Co-User 201.29332  561.70668
##   Dealer/Buyer                            553.49067 1544.50933
##   Doctor/Patient                           23.21601   64.78399

No cells less than 5

prop.table(tab_rel_sex, margin = 1)

##                                          
##                                              Female      Male
##   Caretaker/Family/Friend/Partner/Co-User 0.3984273 0.6015727
##   Dealer/Buyer                            0.2197331 0.7802669
##   Doctor/Patient                          0.1477273 0.8522727

ggplot(DIH_DATA, aes(x = Relationship, fill = Accused_Sex)) +
  geom_bar(position = "fill") +
  labs(
    y = "Proportion",
    fill = "Accused Sex"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Urbancity & State

table(DIH_DATA$Urbanicity, DIH_DATA$State)

##               
##                Alabama Alaska Arizona Arkansas California Colorado Connecticut
##   Metropolitan       9      0      13        1         76       14           0
##   Suburban           3      0       0        0          3       10           0
##   Rural              0      0       0        0          2        1           0
##               
##                Delaware Florida Georgia Idaho Illinois Indiana Iowa Kansas
##   Metropolitan        3     121      18     5      238      44   23      3
##   Suburban            0       3       2     0       26       9    3      1
##   Rural               0       0       1     2       26       0    2      3
##               
##                Kentucky Louisana Louisiana Maine Maryland Massachusetts
##   Metropolitan       25        0        47     5       46            26
##   Suburban            1        0         3     3        7             0
##   Rural               6        0         0     6        0             1
##               
##                Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada
##   Metropolitan       72        84           1       18       8        1     14
##   Suburban           13        24           1        9       3        0      2
##   Rural              13         5           0       10       0        0      0
##               
##                New Hampshire New Jersey New Mexico New York North Carolina
##   Metropolitan            26        108          3       90             74
##   Suburban                12          0          0       13             30
##   Rural                    0          0          0        6              5
##               
##                North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island
##   Metropolitan           14  296       17     17          466            2
##   Suburban                1   49        8      0           88            0
##   Rural                   2    6        2      0           23            0
##               
##                South Carolina South Dakota Tennessee Texas Utah Vermont
##   Metropolitan             13            3        69    23   13       1
##   Suburban                  2            2         7     4    0       8
##   Rural                     0            1         2     0    4       5
##               
##                Virginia Washington West Virginia Wisconsin Wyoming
##   Metropolitan       45         54            18       252       6
##   Suburban            7         17             5        67       1
##   Rural               4          0             7        32       3

ggplot(DIH_DATA, aes(x = State, fill = Urbanicity)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

DIH Media

Taleed El-Sabawi

2026-04-13