library(googlesheets4)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(scales)
library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(DT)
library(ggplot2)

#Data isnt full cleaned so loading everything as text and then will specify column type
library(googlesheets4)
library(dplyr)
DIH_DATA <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1OS4-QPBoffGKsxfqPzW3LyLRoxga2j3wLYPPxXtnUHw/edit?usp=sharing",
  col_types = "c"
)

## ! Using an auto-discovered, cached token.

##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.

##   See gargle's "Non-interactive auth" vignette for more details:

##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>

## ℹ The googlesheets4 package is using a cached token for
##   'taleedsabawi@gmail.com'.

## ✔ Reading from "DIH Data 2026".

## ✔ Range 'Data'.

#Assigning columns manually to data type
DIH_DATA$Year_Charged  <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code    <- as.numeric(DIH_DATA$Rural_Code)

# Dates
#These first dates are in excel serial data structure...
DIH_DATA$`Date Incident` <- as.Date(as.numeric(DIH_DATA$`Date Incident`), origin = "1899-12-30")

## Warning in as.Date(as.numeric(DIH_DATA$`Date Incident`), origin =
## "1899-12-30"): NAs introduced by coercion

DIH_DATA$`Date Charge` <- as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30")

## Warning in as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30"):
## NAs introduced by coercion

# Ages
DIH_DATA$Accused_Age  <- as.numeric(DIH_DATA$Accused_Age)

## Warning: NAs introduced by coercion

DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)

## Warning: NAs introduced by coercion

#checking data structure
str(DIH_DATA)

## tibble [3,264 × 31] (S3: tbl_df/tbl/data.frame)
##  $ ID                : chr [1:3264] "1678" "2622" "1514" "320" ...
##  $ URL               : chr [1:3264] "http://www.nydailynews.com/news/crime/pain-relieving-patch-kills-harlem-girl-6-foster-mother-charged-article-1.328165" "http://whdh.com/news/woman-boyfriend-charged-in-fatal-cocaine-overdose-of-ohio-boy-9/" "http://www.capitalgazette.com/news/for_the_record/ac-cn-delvalle-overdose-verdict-20180620-story.html" "http://www.thisweeknews.com/news/20180719/fourth-person-expected-to-be-charged-in-connection-with-grandview-ove"| __truncated__ ...
##  $ Court             : chr [1:3264] NA NA "Baltimore County Court" NA ...
##  $ Rural_Code        : num [1:3264] 1 2 1 1 1 1 1 5 7 2 ...
##  $ County_Fips       : chr [1:3264] "36061" "39099" "24005" "39049" ...
##  $ County_FullName   : chr [1:3264] "New York County" "Mahoning County" "Baltimore County" "Franklin County" ...
##  $ County_Name       : chr [1:3264] "New York" "Mahoning" "Baltimore" "Franklin" ...
##  $ State             : chr [1:3264] "New York" "Ohio" "Maryland" "Ohio" ...
##  $ Year_Charged      : num [1:3264] 2004 2018 2018 2018 2011 ...
##  $ Date Charge       : Date[1:3264], format: NA NA ...
##  $ Date Incident     : Date[1:3264], format: NA NA ...
##  $ Accused_Name      : chr [1:3264] "Joanne Alvarez" "Raenell Allen" "Jason Patton Baker" "Benjamin Bussey" ...
##  $ Sentence          : chr [1:3264] NA NA NA NA ...
##  $ Sentence Quartile : chr [1:3264] NA NA NA NA ...
##  $ Charge            : chr [1:3264] "Reckless/Negligent Homicide" "Involuntary Manslaughter" "Manslaughter" "Involuntary Manslaughter" ...
##  $ Accused_Race      : chr [1:3264] NA "White" "White" "White" ...
##  $ Accused_Ethnicity : chr [1:3264] "Hispanic/Latino" NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
##  $ Accused_Sex       : chr [1:3264] "Female" "Female" "Male" "Male" ...
##  $ Accused_Age       : num [1:3264] NA NA 46 19 NA NA NA NA NA NA ...
##  $ Accused_City      : chr [1:3264] "East Harlem" "Youngstown" "Millersville" "Dublin" ...
##  $ Accused_State     : chr [1:3264] "New York" "Ohio" "Pennsylvania" "Ohio" ...
##  $ Court_Type        : chr [1:3264] "State" "State" "State" "State" ...
##  $ Plea              : chr [1:3264] "Guilty" NA "Not Guilty" "Guilty to a Lesser Charge" ...
##  $ Deceased_Name     : chr [1:3264] "Taylor Webster" NA "Josiah Christopher Klaes" "Haleah Myers" ...
##  $ Deceased_Race     : chr [1:3264] "White" NA "White" "White" ...
##  $ Deceased_Ethnicity: chr [1:3264] NA NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
##  $ Deceased_Age      : num [1:3264] 6 9 16 17 17 18 18 18 18 18 ...
##  $ Deceased_Sex      : chr [1:3264] "Female" "Male" "Male" "Female" ...
##  $ Substance         : chr [1:3264] "Fentanyl (Analog)" NA "Fentanyl (Analog)" "Fentanyl (Analog)" ...
##  $ Lawyer            : chr [1:3264] NA NA "do'connell@opd.state.md.us" NA ...
##  $ Relationship      : chr [1:3264] "Caretaker/Family/Friend/Partner/Co-User" "Caretaker/Family/Friend/Partner/Co-User" "Dealer/Buyer" "Caretaker/Family/Friend/Partner/Co-User" ...

#Looking to see what Sentence contains
sort(unique(DIH_DATA$Sentence))

##   [1] "0.08219178082" "0.0833"        "0.08333333333" "0.1052511416" 
##   [5] "0.1616438356"  "0.1666666667"  "0.2191780822"  "0.25"         
##   [9] "0.325"         "0.333"         "0.3333333333"  "0.375"        
##  [13] "0.4"           "0.405"         "0.4166666667"  "0.5"          
##  [17] "0.5726027397"  "0.5833333333"  "0.6666666667"  "0.75"         
##  [21] "0.822"         "0.8333333333"  "0.9166666667"  "0.9583333333" 
##  [25] "1"             "1.002"         "1.25"          "1.3"          
##  [29] "1.333333333"   "1.416666667"   "1.5"           "1.583"        
##  [33] "1.66"          "1.666666667"   "1.7"           "1.75"         
##  [37] "1.8"           "1.916666667"   "1.917"         "1.92"         
##  [41] "1.97"          "10"            "10.16"         "10.41"        
##  [45] "10.5"          "10.67"         "10.8"          "10.83"        
##  [49] "10.83333333"   "11"            "11.1"          "11.16666667"  
##  [53] "11.25"         "11.5"          "11.83"         "11.9"         
##  [57] "113"           "12"            "12.25"         "12.5"         
##  [61] "12.75"         "124"           "13"            "13.33"        
##  [65] "13.5"          "14"            "14.5"          "143"          
##  [69] "15"            "15.5"          "15.67"         "16"           
##  [73] "16.5"          "17"            "17.5"          "18"           
##  [77] "18.25"         "18.66666667"   "180"           "19"           
##  [81] "19.25"         "19.58333333"   "2"             "2.25"         
##  [85] "2.333333333"   "2.4"           "2.5"           "2.75"         
##  [89] "20"            "20.5"          "20.83333333"   "21"           
##  [93] "22"            "22.5"          "23"            "24"           
##  [97] "25"            "25.5"          "26"            "26.83"        
## [101] "27"            "27.08333333"   "28"            "29"           
## [105] "3"             "3.08"          "3.083333333"   "3.25"         
## [109] "3.3"           "3.33"          "3.333333333"   "3.4"          
## [113] "3.416666667"   "3.5"           "3.666666667"   "3.83"         
## [117] "3.833333333"   "3.92"          "30"            "30.41666667"  
## [121] "31"            "32"            "33"            "33.25"        
## [125] "34"            "35"            "36"            "37"           
## [129] "38"            "4"             "4.083333333"   "4.17"         
## [133] "4.25"          "4.416666667"   "4.5"           "4.666666667"  
## [137] "4.75"          "4.8"           "4.83"          "4.833"        
## [141] "4.833333333"   "4.9"           "40"            "42"           
## [145] "46"            "47"            "48"            "48.33"        
## [149] "5"             "5.06"          "5.25"          "5.5"          
## [153] "5.666666667"   "5.75"          "5.83"          "5.916666667"  
## [157] "5.92"          "50"            "52"            "54"           
## [161] "55"            "6"             "6.083333333"   "6.166666667"  
## [165] "6.25"          "6.5"           "6.67"          "6.7"          
## [169] "6.75"          "60"            "62"            "63"           
## [173] "65"            "7"             "7.25"          "7.33"         
## [177] "7.5"           "7.666666667"   "72"            "75"           
## [181] "8"             "8.08"          "8.33"          "8.333"        
## [185] "8.333333333"   "8.5"           "8.67"          "84"           
## [189] "9"             "9.33"          "9.5"           "9.917"        
## [193] "90"            "96"            "Life"          "pending"      
## [197] "Pending"

I have manually gone into the data and created a new variable called “Sentence_num” – which makes Sentence a continuous numeric variable. Just for visualization sake…

For now I will drop anything that says “pending” or “Pending” from the dataset and replace them with NA

I changed all “life” sentences to 100 years, just for th

DIH_DATA <- DIH_DATA %>%
  mutate(
    Sentence_num = na_if(Sentence, "Pending"),
    Sentence_num = na_if(Sentence, "pending"),
    Sentence_num = as.numeric(Sentence)
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sentence_num = as.numeric(Sentence)`.
## Caused by warning:
## ! NAs introduced by coercion

library(dplyr)
library(stringr)
summary(DIH_DATA$Sentence_num)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##   0.08219   4.00000   8.00000  11.54717  15.00000 180.00000      1209

sum(!is.na(DIH_DATA$Sentence))

## [1] 2348

sum(!is.na(DIH_DATA$Sentence_num))

## [1] 2055

All observations 3,364 Observations with sentencing data 2,055

#General Summary of All Variables

summary(DIH_DATA)

##       ID                URL               Court             Rural_Code   
##  Length:3264        Length:3264        Length:3264        Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.000  
##                                                           Mean   :2.474  
##                                                           3rd Qu.:3.000  
##                                                           Max.   :9.000  
##                                                           NA's   :112    
##  County_Fips        County_FullName    County_Name           State          
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Year_Charged   Date Charge   Date Incident  Accused_Name      
##  Min.   :1974   Min.   :NA     Min.   :NA     Length:3264       
##  1st Qu.:2015   1st Qu.:NA     1st Qu.:NA     Class :character  
##  Median :2017   Median :NA     Median :NA     Mode  :character  
##  Mean   :2016   Mean   :NaN    Mean   :NaN                      
##  3rd Qu.:2018   3rd Qu.:NA     3rd Qu.:NA                       
##  Max.   :2026   Max.   :NA     Max.   :NA                       
##                 NA's   :3264   NA's   :3264                     
##    Sentence         Sentence Quartile     Charge          Accused_Race      
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Accused_Ethnicity  Accused_Sex         Accused_Age    Accused_City      
##  Length:3264        Length:3264        Min.   :17.00   Length:3264       
##  Class :character   Class :character   1st Qu.:27.00   Class :character  
##  Mode  :character   Mode  :character   Median :32.00   Mode  :character  
##                                        Mean   :34.34                     
##                                        3rd Qu.:40.00                     
##                                        Max.   :87.00                     
##                                        NA's   :2224                      
##  Accused_State       Court_Type            Plea           Deceased_Name     
##  Length:3264        Length:3264        Length:3264        Length:3264       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Deceased_Race      Deceased_Ethnicity  Deceased_Age   Deceased_Sex      
##  Length:3264        Length:3264        Min.   : 0.00   Length:3264       
##  Class :character   Class :character   1st Qu.:22.25   Class :character  
##  Mode  :character   Mode  :character   Median :28.00   Mode  :character  
##                                        Mean   :29.62                     
##                                        3rd Qu.:35.00                     
##                                        Max.   :91.00                     
##                                        NA's   :546                       
##   Substance            Lawyer          Relationship        Sentence_num      
##  Length:3264        Length:3264        Length:3264        Min.   :  0.08219  
##  Class :character   Class :character   Class :character   1st Qu.:  4.00000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  8.00000  
##                                                           Mean   : 11.54717  
##                                                           3rd Qu.: 15.00000  
##                                                           Max.   :180.00000  
##                                                           NA's   :1209

Missingness

colSums(is.na(DIH_DATA))

##                 ID                URL              Court         Rural_Code 
##                  0                103                999                112 
##        County_Fips    County_FullName        County_Name              State 
##                  0                  5                  2                  0 
##       Year_Charged        Date Charge      Date Incident       Accused_Name 
##                  0               3264               3264                  7 
##           Sentence  Sentence Quartile             Charge       Accused_Race 
##                916               3264                  1                461 
##  Accused_Ethnicity        Accused_Sex        Accused_Age       Accused_City 
##               2286                 17               2224                181 
##      Accused_State         Court_Type               Plea      Deceased_Name 
##                 65                 72                907                522 
##      Deceased_Race Deceased_Ethnicity       Deceased_Age       Deceased_Sex 
##               1103               2198                546                220 
##          Substance             Lawyer       Relationship       Sentence_num 
##                617               2459                276               1209

#Creating Freq table function to reuse later
freq_table <- function(data, var) {
  data %>%
    count({{ var }}) %>%
    mutate(percent = n / sum(n) * 100) %>%
    arrange(desc(n))
}

#Creating Bar chart function for categorical variables to reuse later
library(ggplot2)
library(dplyr)

bar_plot_cat <- function(data, var) {
  ggplot(data, aes(x = {{ var }}, fill = {{ var }})) +
    geom_bar() +
    geom_text(
      stat = "count",
      aes(label = ..count..),
      vjust = -0.3
    ) +
    labs(
      x = deparse(substitute(var)),
      y = "Count",
      title = paste("Distribution of", deparse(substitute(var)))
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    guides(fill = "none")
}

#Creating histogram function
hist_plot <- function(data, var) {
  ggplot(data, aes(x = {{ var }})) +
    geom_histogram(bins = 30, fill = "steelblue") +
    scale_x_continuous(breaks = seq(0, 200, by = 10)) +  # 👈 more numbers
    labs(
      x = deparse(substitute(var)),
      y = "Count",
      title = paste("Distribution of", deparse(substitute(var)))
    )
}

#creating density plot function
density_plot <- function(data, var) {
  ggplot(data, aes(x = {{ var }})) +
    geom_density(fill = "steelblue", alpha = 0.5) +
    labs(
      x = deparse(substitute(var)),
      y = "Density",
      title = paste("Density of", deparse(substitute(var)))
    )
}

hist_plot(DIH_DATA, Sentence_num)

## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_bin()`).

Will need to explore missingness to see if it is random.

#📅Date Charged

freq_table (DIH_DATA, Year_Charged)

## # A tibble: 34 × 3
##    Year_Charged     n percent
##           <dbl> <int>   <dbl>
##  1         2017   560   17.2 
##  2         2018   532   16.3 
##  3         2016   384   11.8 
##  4         2019   326    9.99
##  5         2015   266    8.15
##  6         2020   263    8.06
##  7         2014   213    6.53
##  8         2013   120    3.68
##  9         2021    99    3.03
## 10         2012    88    2.70
## # ℹ 24 more rows

sum(DIH_DATA$Year_Charged <= 2010, na.rm = TRUE)

## [1] 305

library(dplyr)
library(ggplot2)

DIH_DATA %>%
  filter(!is.na(Year_Charged)) %>%
  mutate(Year_Charged = as.numeric(Year_Charged)) %>%
  count(Year_Charged) %>%
  ggplot(aes(x = Year_Charged, y = n)) +
  geom_line() +
  labs(
    title = "Number of Cases Over Time",
    x = "Year",
    y = "Count"
  )

#👤Accused characteristics ## Accused Age

density_plot(DIH_DATA, Accused_Age)

## Warning: Removed 2224 rows containing non-finite outside the scale range
## (`stat_density()`).

## • Accused_Ethnicity

freq_table(DIH_DATA,Accused_Ethnicity)

## # A tibble: 7 × 3
##   Accused_Ethnicity       n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2286 70.0   
## 2 Not Hispanic/Latino   734 22.5   
## 3 Hispanic/Latino       147  4.50  
## 4 NA                     94  2.88  
## 5 Not HIspanic/Latino     1  0.0306
## 6 White                   1  0.0306
## 7 not Hispanic/Latino     1  0.0306

Cleaning typos in data…

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Ethnicity = str_to_title(Accused_Ethnicity),   # fix casing
    Accused_Ethnicity = str_replace_all(Accused_Ethnicity, "Hispanic", "Hispanic"), # normalize
    Accused_Ethnicity = na_if(Accused_Ethnicity, "Na"),     # fix "NA"
    
    Accused_Ethnicity = case_when(
      Accused_Ethnicity %in% c("Not Hispanic/Latino", "Not Hispanic/Latino") ~ "Not Hispanic/Latino",
      Accused_Ethnicity %in% c("Hispanic/Latino") ~ "Hispanic/Latino",
      Accused_Ethnicity == "White" ~ NA_character_,  # wrong variable → set to NA
      TRUE ~ Accused_Ethnicity
    )
  )

bar_plot_cat(DIH_DATA, Accused_Ethnicity)

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

• Accused_Sex

bar_plot_cat(DIH_DATA, Accused_Sex)

freq_table(DIH_DATA, Accused_Sex)

## # A tibble: 5 × 3
##   Accused_Sex     n percent
##   <chr>       <int>   <dbl>
## 1 Male         2388 73.2   
## 2 Female        853 26.1   
## 3 <NA>           17  0.521 
## 4 NA              3  0.0919
## 5 feMale          3  0.0919

Fixing typos in the data again…

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Sex = str_to_title(Accused_Sex),   # fixes feMale → Female
    Accused_Sex = na_if(Accused_Sex, "Na")     # fixes "NA" string → real NA
  )

• Accused_Race

freq_table(DIH_DATA, Accused_Race)

## # A tibble: 12 × 3
##    Accused_Race                                                        n percent
##    <chr>                                                           <int>   <dbl>
##  1 White                                                            2067 63.3   
##  2 Black                                                             641 19.6   
##  3 <NA>                                                              461 14.1   
##  4 NA                                                                 37  1.13  
##  5 Unknown POC                                                        25  0.766 
##  6 Asian                                                              16  0.490 
##  7 Native American or Alaskan Native                                  10  0.306 
##  8 Hispanic/Latino                                                     3  0.0919
##  9 Arabic                                                              1  0.0306
## 10 Indian American                                                     1  0.0306
## 11 Manufacture, Delivery, or Possession With Intent to Manufactur…     1  0.0306
## 12 white                                                               1  0.0306

“white” → should be “White” • “NA” → should be real NA • “Arabic” → you already decided → “White” • that long drug charge string → clearly wrong variable → should be NA

•   "Unknown POC" → not a usable category → ❌
•   "Hispanic/Latino" → ethnicity, not race → ❌

DIH_DATA <- DIH_DATA %>%
  mutate(
    Accused_Race = na_if(Accused_Race, "NA"),
    Accused_Race = case_when(
      Accused_Race %in% c("white", "White") ~ "White",
      Accused_Race %in% c("black", "Black") ~ "Black",
      Accused_Race == "Arabic" ~ "White",
      Accused_Race %in% c("Unknown POC", "Hispanic/Latino", "X") ~ NA_character_,
      str_detect(Accused_Race, "Manufacture") ~ NA_character_,
      TRUE ~ Accused_Race
    )
  )

freq_table(DIH_DATA, Accused_Race)

## # A tibble: 6 × 3
##   Accused_Race                          n percent
##   <chr>                             <int>   <dbl>
## 1 White                              2069 63.4   
## 2 Black                               641 19.6   
## 3 <NA>                                527 16.1   
## 4 Asian                                16  0.490 
## 5 Native American or Alaskan Native    10  0.306 
## 6 Indian American                       1  0.0306

bar_plot_cat(DIH_DATA, Accused_Race)

##Race/Ethnicity Variable

DIH_DATA <- DIH_DATA %>%
  mutate(
    Race_Ethnicity = case_when(
      Accused_Ethnicity == "Hispanic/Latino" ~ "Hispanic",
      Accused_Race == "White" ~ "White (Non-Hispanic)",
      Accused_Race == "Black" ~ "Black (Non-Hispanic)",
      Accused_Race == "Asian" ~ "Asian",
      Accused_Race == "Native American or Alaskan Native" ~ "Native American",
      TRUE ~ NA_character_
    )
  )

DIH_DATA <- DIH_DATA %>%
  mutate(
    Race_Ethnicity = case_when(
      Race_Ethnicity %in% c("Asian", "Native American") ~ "Other",
      TRUE ~ Race_Ethnicity
    )
  )

freq_table(DIH_DATA,  Race_Ethnicity)

## # A tibble: 5 × 3
##   Race_Ethnicity           n percent
##   <chr>                <int>   <dbl>
## 1 White (Non-Hispanic)  1993  61.1  
## 2 Black (Non-Hispanic)   629  19.3  
## 3 <NA>                   472  14.5  
## 4 Hispanic               147   4.50 
## 5 Other                   23   0.705

bar_plot_cat(DIH_DATA, Race_Ethnicity)

#👤Deceased characteristics

Deceased Age

Was there really a 91 year old who was the deceased?

deceased under 18 =>

sum(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)

## [1] 262

percent of sample (after dropping those missing) 18 or under =>

mean(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)

## [1] 0.09639441

DIH_DATA %>%
  filter(Deceased_Age <= 18) %>%
  count(Deceased_Age, sort = TRUE)

## # A tibble: 19 × 2
##    Deceased_Age     n
##           <dbl> <int>
##  1           18    68
##  2           17    37
##  3            1    29
##  4           16    24
##  5            2    21
##  6            0    19
##  7           15    15
##  8            3    11
##  9           14     8
## 10            5     7
## 11            6     5
## 12           10     5
## 13           13     3
## 14            4     2
## 15            9     2
## 16           11     2
## 17           12     2
## 18            7     1
## 19            8     1

percent of sample 2 or younger =>

mean(DIH_DATA$Deceased_Age <= 2, na.rm = TRUE)* 100

## [1] 2.538631

Quartiles for 18 and younger =>

quantile(
  DIH_DATA$Deceased_Age[DIH_DATA$Deceased_Age <= 18],
  probs = c(0.25, 0.5, 0.75),
  na.rm = TRUE
)

## 25% 50% 75% 
##   2  15  18

Interested in the breakdown of deceased by age under 18, creating groups and graphing. =>

library(dplyr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    age_group = case_when(
      Deceased_Age >= 0  & Deceased_Age <= 1  ~ "0–1",
      Deceased_Age >= 2  & Deceased_Age <= 3  ~ "2–3",
      Deceased_Age >= 4  & Deceased_Age <= 5  ~ "4–5",
      Deceased_Age >= 6  & Deceased_Age <= 7  ~ "6–7",
      Deceased_Age >= 8  & Deceased_Age <= 9  ~ "8–9",
      Deceased_Age >= 10 & Deceased_Age <= 11 ~ "10–11",
      Deceased_Age >= 12 & Deceased_Age <= 13 ~ "12–13",
      Deceased_Age >= 14 & Deceased_Age <= 15 ~ "14–15",
      Deceased_Age >= 16 & Deceased_Age <= 17 ~ "16–17",
      Deceased_Age >= 18 ~ "18+",
      TRUE ~ NA_character_
    )
  )
  
#Ordered

DIH_DATA$age_group <- factor(
  DIH_DATA$age_group,
  levels = c("0–1","2–3","4–5","6–7","8–9","10–11","12–13","14–15","16–17","18+")
)

#Plot
library(ggplot2)

ggplot(
  DIH_DATA %>% filter(Deceased_Age <= 18, !is.na(age_group)),
  aes(x = age_group, fill = age_group)
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Deceased Age (≤ 18)",
    x = "Age Group",
    y = "Count"
  ) +
  guides(fill = "none")   # removes legend if you don’t want it

freq_table(DIH_DATA, Deceased_Age)

## # A tibble: 75 × 3
##    Deceased_Age     n percent
##           <dbl> <int>   <dbl>
##  1           NA   546   16.7 
##  2           26   141    4.32
##  3           28   137    4.20
##  4           21   134    4.11
##  5           23   126    3.86
##  6           25   122    3.74
##  7           24   121    3.71
##  8           30   116    3.55
##  9           22   114    3.49
## 10           27   106    3.25
## # ℹ 65 more rows

freq_table(DIH_DATA, age_group)

## # A tibble: 11 × 3
##    age_group     n percent
##    <fct>     <int>   <dbl>
##  1 18+        2524 77.3   
##  2 <NA>        546 16.7   
##  3 16–17        61  1.87  
##  4 0–1          48  1.47  
##  5 2–3          32  0.980 
##  6 14–15        23  0.705 
##  7 4–5           9  0.276 
##  8 10–11         7  0.214 
##  9 6–7           6  0.184 
## 10 12–13         5  0.153 
## 11 8–9           3  0.0919

creating a child variable

DIH_DATA <- DIH_DATA %>%

  mutate(

    child = ifelse(Deceased_Age < 18, 1, 0)

  )

DIH_DATA <- DIH_DATA %>%

  mutate(

    child = ifelse(is.na(Deceased_Age), NA,

                   ifelse(Deceased_Age < 18, 1, 0))

  )

DIH_DATA <- DIH_DATA %>%
  mutate(
    child = factor(child, levels = c(0, 1),
                   labels = c("Adult (18+)", "Child (<18)"))
  )


freq_table(DIH_DATA, child)

## # A tibble: 3 × 3
##   child           n percent
##   <fct>       <int>   <dbl>
## 1 Adult (18+)  2524   77.3 
## 2 <NA>          546   16.7 
## 3 Child (<18)   194    5.94

• Deceased_Race

freq_table(DIH_DATA, Deceased_Race)

## # A tibble: 13 × 3
##    Deceased_Race                         n percent
##    <chr>                             <int>   <dbl>
##  1 White                              1964 60.2   
##  2 <NA>                               1103 33.8   
##  3 Black                                90  2.76  
##  4 NA                                   66  2.02  
##  5 Unknown POC                          17  0.521 
##  6 Asian                                10  0.306 
##  7 Native American or Alaskan Native     6  0.184 
##  8 Hispanic/Latino                       3  0.0919
##  9 Arabic                                1  0.0306
## 10 Multiple Deceased                     1  0.0306
## 11 Thomas McGuinness                     1  0.0306
## 12 Unknown                               1  0.0306
## 13 X                                     1  0.0306

Need to fix these… Arabic is changed to White. Thomas McGuinness changed to NA. Fix the NAs, X and Unknown

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Race = na_if(Deceased_Race, "NA"),
    Deceased_Race = case_when(
      Deceased_Race == "Arabic" ~ "White",
      Deceased_Race == "Thomas McGuinness" ~ NA_character_,
      Deceased_Race %in% c("Unknown", "X") ~ NA_character_,
      TRUE ~ Deceased_Race
    )
  )

freq_table(DIH_DATA, Deceased_Race)

## # A tibble: 8 × 3
##   Deceased_Race                         n percent
##   <chr>                             <int>   <dbl>
## 1 White                              1965 60.2   
## 2 <NA>                               1172 35.9   
## 3 Black                                90  2.76  
## 4 Unknown POC                          17  0.521 
## 5 Asian                                10  0.306 
## 6 Native American or Alaskan Native     6  0.184 
## 7 Hispanic/Latino                       3  0.0919
## 8 Multiple Deceased                     1  0.0306

bar_plot_cat(DIH_DATA, Deceased_Race)

• Deceased_Ethnicity

freq_table(DIH_DATA, Deceased_Ethnicity)

## # A tibble: 9 × 3
##   Deceased_Ethnicity      n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2198 67.3   
## 2 Not Hispanic/Latino   844 25.9   
## 3 NA                    116  3.55  
## 4 Hispanic/Latino        99  3.03  
## 5 Hispanic Latino         2  0.0613
## 6 White                   2  0.0613
## 7 Multiple Deceased       1  0.0306
## 8 Not Hispanic/latino     1  0.0306
## 9 X                       1  0.0306

Again, fixing the data…

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Ethnicity = na_if(Deceased_Ethnicity, "NA"),
    Deceased_Ethnicity = case_when(
      Deceased_Ethnicity == "Hispanic Latino" ~ "Hispanic/Latino",
      Deceased_Ethnicity == "Not Hispanic/latino" ~ "Not Hispanic/Latino",
      Deceased_Ethnicity == "White" ~ NA_character_,
      Deceased_Ethnicity == "X" ~ NA_character_,
      TRUE ~ Deceased_Ethnicity
    )
  )

freq_table(DIH_DATA, Deceased_Ethnicity)

## # A tibble: 4 × 3
##   Deceased_Ethnicity      n percent
##   <chr>               <int>   <dbl>
## 1 <NA>                 2317 71.0   
## 2 Not Hispanic/Latino   845 25.9   
## 3 Hispanic/Latino       101  3.09  
## 4 Multiple Deceased       1  0.0306

bar_plot_cat(DIH_DATA, Deceased_Ethnicity)

Large missingness

• Deceased_Sex

freq_table(DIH_DATA, Deceased_Sex)

## # A tibble: 14 × 3
##    Deceased_Sex          n percent
##    <chr>             <int>   <dbl>
##  1 Male               1912 58.6   
##  2 Female             1090 33.4   
##  3 <NA>                220  6.74  
##  4 NA                   25  0.766 
##  5 male                  6  0.184 
##  6 Famale                2  0.0613
##  7 female                2  0.0613
##  8 27                    1  0.0306
##  9 Both Female           1  0.0306
## 10 Both male             1  0.0306
## 11 Female, Male          1  0.0306
## 12 Multiple Deceased     1  0.0306
## 13 Other                 1  0.0306
## 14 X                     1  0.0306

changing both to be NA

DIH_DATA <- DIH_DATA %>%
  mutate(
    Deceased_Sex = case_when(
      Deceased_Sex %in% c("NA","27","X","Other",
                         "Both Female","Both male","Female, Male", "Both") ~ NA_character_,
      Deceased_Sex %in% c("male","Male") ~ "Male",
      Deceased_Sex %in% c("female","Female","Famale") ~ "Female",
      TRUE ~ Deceased_Sex
    )
  )

freq_table(DIH_DATA, Deceased_Sex)

## # A tibble: 4 × 3
##   Deceased_Sex          n percent
##   <chr>             <int>   <dbl>
## 1 Male               1918 58.8   
## 2 Female             1094 33.5   
## 3 <NA>                251  7.69  
## 4 Multiple Deceased     1  0.0306

#🌍Rural Code NA’s: 112

ggplot(
  DIH_DATA %>% filter(!is.na(Rural_Code)),
  aes(x = factor(Rural_Code), fill = factor(Rural_Code))
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Rural_Code",
    x = "Rural Code",
    y = "Count"
  ) +
  guides(fill = "none")

Creating urbanacity variable

DIH_DATA <- DIH_DATA %>%
  mutate(
    Urbanicity = case_when(
      Rural_Code %in% 1:3 ~ "Metropolitan",
      Rural_Code %in% 4:6 ~ "Suburban",
      Rural_Code %in% 7:9 ~ "Rural",
      TRUE ~ NA_character_
    )
  )

#Ordering for plotting
DIH_DATA$Urbanicity <- factor(
  DIH_DATA$Urbanicity,
  levels = c("Metropolitan", "Suburban", "Rural")
)

ggplot(
  DIH_DATA %>% filter(!is.na(Urbanicity)),
  aes(x = Urbanicity, fill = Urbanicity)
) +
  geom_bar() +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.3
  ) +
  labs(
    title = "Distribution of Urbanicity",
    x = "Urbanicity",
    y = "Count"
  ) +
  guides(fill = "none")

Would be interesting to see state by urbancity…

ggplot(
  DIH_DATA %>% filter(!is.na(State), !is.na(Urbanicity)),
  aes(x = State, fill = Urbanicity)
) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Urbanicity by State",
    x = "State",
    y = "Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

I guess this could just be a function of how rural the state is…but we could test further…

#📍State

There is a lowercase pennsylvania in the dataset…fixing it…

DIH_DATA %>%
  filter(State == "pennsylvania")

## # A tibble: 1 × 36
##   ID    URL       Court Rural_Code County_Fips County_FullName County_Name State
##   <chr> <chr>     <chr>      <dbl> <chr>       <chr>           <chr>       <chr>
## 1 540   3 senten… <NA>           2 42071       Lancaster Coun… Lancaster   penn…
## # ℹ 28 more variables: Year_Charged <dbl>, `Date Charge` <date>,
## #   `Date Incident` <date>, Accused_Name <chr>, Sentence <chr>,
## #   `Sentence Quartile` <chr>, Charge <chr>, Accused_Race <chr>,
## #   Accused_Ethnicity <chr>, Accused_Sex <chr>, Accused_Age <dbl>,
## #   Accused_City <chr>, Accused_State <chr>, Court_Type <chr>, Plea <chr>,
## #   Deceased_Name <chr>, Deceased_Race <chr>, Deceased_Ethnicity <chr>,
## #   Deceased_Age <dbl>, Deceased_Sex <chr>, Substance <chr>, Lawyer <chr>, …

DIH_DATA <- DIH_DATA %>%
  mutate(
    State = ifelse(State == "pennsylvania", "Pennsylvania", State)
  )

bar_plot_cat(DIH_DATA, State)

freq_table(DIH_DATA, State)

## # A tibble: 50 × 3
##    State              n percent
##    <chr>          <int>   <dbl>
##  1 Pennsylvania     578   17.7 
##  2 Ohio             354   10.8 
##  3 Wisconsin        351   10.8 
##  4 Illinois         299    9.16
##  5 Florida          125    3.83
##  6 Minnesota        116    3.55
##  7 New Jersey       113    3.46
##  8 New York         112    3.43
##  9 North Carolina   111    3.40
## 10 Michigan          99    3.03
## # ℹ 40 more rows

#⚖️ Legal / case variables ## • Charge

freq_table(DIH_DATA,Charge)

## # A tibble: 32 × 3
##    Charge                                                              n percent
##    <chr>                                                           <int>   <dbl>
##  1 Drug Delivery/Distribution Resulting in Death                     610   18.7 
##  2 Involuntary Manslaughter                                          479   14.7 
##  3 Reckless/Negligent Homicide                                       432   13.2 
##  4 Murder (2nd or 3rd Degree)                                        342   10.5 
##  5 Drug Induced Homicide                                             306    9.38
##  6 Manslaughter                                                      252    7.72
##  7 Delivery/Distribution of Heroin/Fentanyl Resulting in Death       151    4.63
##  8 Delivery/Distribution Of a Controlled Substance Causing/Result…   122    3.74
##  9 Other (Please Specify in Notes)                                   101    3.09
## 10 Murder (1st Degree)                                                99    3.03
## # ℹ 22 more rows

@Katie Flagging this – still don’t know how these were categorized…

I am going to go ahead and group these charges and provide a justification for doing so, but if you all disagree, I can change them…

DIH_DATA <- DIH_DATA %>%
  mutate(
    Charge_Group = case_when(
      
      # 1. Core DIH (highest)
      str_detect(tolower(Charge), "drug.*death|controlled substance homicide|drug induced homicide") &
      !str_detect(tolower(Charge), "conspiracy|possess|intent") ~ "Drug-Induced Homicide",
      
      # 2. Distribution / possession with intent
      str_detect(tolower(Charge), "possess|intent|trafficking") &
      str_detect(tolower(Charge), "death") ~ "Distribution/Possession Causing Death",
      
      # 3. Conspiracy
      str_detect(tolower(Charge), "conspiracy") ~ "Conspiracy",
      
      # 4. Murder
      str_detect(tolower(Charge), "murder|felony murder") ~ "Murder",
      
      # 5. Manslaughter
      str_detect(tolower(Charge), "manslaughter") ~ "Manslaughter",
      
      # 6. Negligent
      str_detect(tolower(Charge), "negligent|reckless") ~ "Negligent/Reckless",
      
      # Missing / junk
      Charge %in% c("Pending", "Unknown") ~ NA_character_,
      
      TRUE ~ "Other"
    )
  )

Charge categories were consolidated based on relative levels of legal culpability and the nature of the defendant’s causal connection to the decedent’s death, distinguishing between direct drug-induced homicide offenses, distribution-related offenses, conspiracy-based liability, and traditional homicide and negligence-based charges.

• Plea

freq_table(DIH_DATA,Plea)

## # A tibble: 37 × 3
##    Plea                              n percent
##    <chr>                         <int>   <dbl>
##  1 Guilty                         1756  53.8  
##  2 <NA>                            907  27.8  
##  3 Not Guilty                      205   6.28 
##  4 NA                              188   5.76 
##  5 Guilty of a Lesser Charge        46   1.41 
##  6 No Contest                       42   1.29 
##  7 Awaiting Plea Hearing            31   0.950
##  8 Guilty to a Lesser Charge        31   0.950
##  9 Pleaded Guilty                   13   0.398
## 10 Guilty to a Lesser DIH Charge    12   0.368
## # ℹ 27 more rows

Fixed the data as follows… inlcuding changed “pending” or awaiting plea hearing to NA.

DIH_DATA <- DIH_DATA %>%
  mutate(
    Plea = na_if(Plea, "NA"),
    Plea = case_when(
      Plea %in% c("State", "X", "Other", "Awaiting Plea Hearing", "Pending/Awaiting", "pending") ~ NA_character_,
      Plea == "Judge ruled it was not murder" ~ NA_character_,
      Plea == "Pleads not guily" ~ "Guilty",
      Plea == "Pled guity to DIH charges" ~ "Guilty",
      str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
      str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
      str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
      str_detect(tolower(Plea), "guilty") ~ "Guilty",
      TRUE ~ Plea
    )
  )

freq_table(DIH_DATA, Plea)

## # A tibble: 6 × 3
##   Plea                        n percent
##   <chr>                   <int>   <dbl>
## 1 Guilty                   1781 54.6   
## 2 <NA>                     1128 34.6   
## 3 Not Guilty                215  6.59  
## 4 Guilty to Lesser Charge    90  2.76  
## 5 No Contest                 49  1.50  
## 6 Alford Plea                 1  0.0306

bar_plot_cat(DIH_DATA, Plea)

Do we have data on verdicts? Versus just pleas

#⚖️ Sentence Quartile

#make sentence quart var
DIH_DATA <- DIH_DATA %>%
  mutate(
    Sentence_Quartile_calc = ntile(Sentence_num, 4)
  )

sum(!is.na(DIH_DATA$Sentence_Quartile_calc))

## [1] 2055

• Court_Type

freq_table(DIH_DATA,Court_Type)

## # A tibble: 6 × 3
##   Court_Type         n percent
##   <chr>          <int>   <dbl>
## 1 State           2789 85.4   
## 2 Federal          388 11.9   
## 3 <NA>              72  2.21  
## 4 NA                12  0.368 
## 5 federal            2  0.0613
## 6 Fayette County     1  0.0306

DIH_DATA <- DIH_DATA %>%
  mutate(
    Court_Type = na_if(Court_Type, "NA"),
    Court_Type = case_when(
      Court_Type == "federal" ~ "Federal",
      Court_Type == "Fayette County" ~ NA_character_,
      TRUE ~ Court_Type
    )
  )

freq_table(DIH_DATA, Court_Type)

## # A tibble: 3 × 3
##   Court_Type     n percent
##   <chr>      <int>   <dbl>
## 1 State       2789   85.4 
## 2 Federal      390   11.9 
## 3 <NA>          85    2.60

🔗 Case context

• Relationship

freq_table(DIH_DATA, Relationship)

## # A tibble: 6 × 3
##   Relationship                                n percent
##   <chr>                                   <int>   <dbl>
## 1 Dealer/Buyer                             2097  64.2  
## 2 Caretaker/Family/Friend/Partner/Co-User   768  23.5  
## 3 <NA>                                      276   8.46 
## 4 Doctor/Patient                             88   2.70 
## 5 NA                                         26   0.797
## 6 Dealer/buyer                                9   0.276

Fixing typos.

DIH_DATA <- DIH_DATA %>%
  mutate(
    Relationship = na_if(Relationship, "NA"),
    Relationship = case_when(
      Relationship %in% c("Dealer/buyer", "Dealer/Buyer") ~ "Dealer/Buyer",
      TRUE ~ Relationship
    )
  )

freq_table(DIH_DATA, Relationship)

## # A tibble: 4 × 3
##   Relationship                                n percent
##   <chr>                                   <int>   <dbl>
## 1 Dealer/Buyer                             2106   64.5 
## 2 Caretaker/Family/Friend/Partner/Co-User   768   23.5 
## 3 <NA>                                      302    9.25
## 4 Doctor/Patient                             88    2.70

• Substance

freq_table(DIH_DATA, Substance)

## # A tibble: 39 × 3
##    Substance                                    n percent
##    <chr>                                    <int>   <dbl>
##  1 Heroin                                     944   28.9 
##  2 <NA>                                       617   18.9 
##  3 Fentanyl (Analog)                          459   14.1 
##  4 Fentanyl (Analog) and Heroin               408   12.5 
##  5 Prescription Opioid                        137    4.20
##  6 Mixed Drugs Not Listed (Comment in Cell)   113    3.46
##  7 Methadone                                   80    2.45
##  8 Other Drug (Comment in Cell)                68    2.08
##  9 Methamphetamine                             66    2.02
## 10 Fentanyl Analog                             47    1.44
## # ℹ 29 more rows

Cleaning data as follows…

library(stringr)
library(dplyr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Substance = tolower(Substance),
    Substance = na_if(Substance, "na"),
    
    Substance_clean = case_when(
      str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
      str_detect(Substance, "fentanyl") ~ "Fentanyl",
      str_detect(Substance, "heroin") ~ "Heroin",
      str_detect(Substance, "meth") ~ "Methamphetamine",
      str_detect(Substance, "cocaine") ~ "Cocaine",
      str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
      str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
      str_detect(Substance, "xanax|benzo") ~ "Other",
      str_detect(Substance, "ecstasy") ~ "Other",
      str_detect(Substance, "benadryl") ~ "Other",
      str_detect(Substance, "bullskin") ~ NA_character_,
      TRUE ~ "Other"
    )
  )

freq_table(DIH_DATA, Substance_clean)

## # A tibble: 9 × 3
##   Substance_clean       n percent
##   <chr>             <int>   <dbl>
## 1 Heroin              945 29.0   
## 2 Other               707 21.7   
## 3 Fentanyl            670 20.5   
## 4 Fentanyl + Heroin   429 13.1   
## 5 Other Opioid        162  4.96  
## 6 Methamphetamine     161  4.93  
## 7 Polysubstance       113  3.46  
## 8 Cocaine              76  2.33  
## 9 <NA>                  1  0.0306

bar_plot_cat(DIH_DATA, Substance_clean)

Collapsing Substance below, lmk if you have any issues with the way I collapsed the categories; Benzos has 10 or less so collapsing with “other”

library(dplyr)
library(stringr)

DIH_DATA <- DIH_DATA %>%
  mutate(
    Substance_Group = case_when(
      
    is.na(Substance) ~ NA_character_, 
      # Polysubstance (multiple drugs mentioned)
      str_detect(tolower(Substance), " and |-|multiple") ~ "Polysubstance",
      
      # Fentanyl (very important to isolate)
      str_detect(tolower(Substance), "fentanyl") ~ "Fentanyl",
      
      # Heroin / opioids (non-fentanyl)
      str_detect(tolower(Substance), "heroin|opioid|methadone|prescription") ~ "Opioid (non-fentanyl)",
      
      # Stimulants
      str_detect(tolower(Substance), "methamphetamine|cocaine") ~ "Stimulant",
      
      # Missing / junk
      Substance %in% c("Unknown", "Pending") ~ NA_character_,
      
      TRUE ~ "Other"
    )
  )

freq_table(DIH_DATA, Substance_Group)

## # A tibble: 6 × 3
##   Substance_Group           n percent
##   <chr>                 <int>   <dbl>
## 1 Opioid (non-fentanyl)  1162   35.6 
## 2 Polysubstance           674   20.6 
## 3 <NA>                    623   19.1 
## 4 Fentanyl                507   15.5 
## 5 Other                   200    6.13
## 6 Stimulant                98    3.00

library(dplyr)
library(tidyr)

options(scipen = 999)

numeric_summary <- DIH_DATA %>%
  summarise(
    across(
      c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
      list(
        nonmissing = ~sum(!is.na(.)),
        missing = ~sum(is.na(.)),
        mean = ~mean(., na.rm = TRUE),
        sd = ~sd(., na.rm = TRUE),
        median = ~median(., na.rm = TRUE),
        min = ~min(., na.rm = TRUE),
        max = ~max(., na.rm = TRUE)
      )
    )
  ) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", "Statistic"),
    names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
  ) %>%
  pivot_wider(names_from = Statistic, values_from = value)

numeric_summary <- numeric_summary %>%
  mutate(
    min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
    median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
    max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
  )


numeric_summary

## # A tibble: 5 × 8
##   Variable     nonmissing missing    mean    sd median min           max  
##   <chr>             <dbl>   <dbl>   <dbl> <dbl> <chr>  <chr>         <chr>
## 1 Rural_Code         3152     112    2.47  1.89 2      1             9    
## 2 Year_Charged       3264       0 2016.    4.02 2017   1974          2026 
## 3 Accused_Age        1040    2224   34.3  10.3  32     17            87   
## 4 Deceased_Age       2718     546   29.6  11.5  28     0             91   
## 5 Sentence_num       2055    1209   11.5  12.1  8      0.08219178082 180

library(dplyr)
library(purrr)
library(tidyr)
library(gt)

cat_vars <- c(
  "State",
  "Charge",
  "Accused_Race",
  "Accused_Ethnicity",
  "Accused_Sex",
  "Accused_State",
  "Court_Type",
  "Plea",
  "Deceased_Race",
  "Deceased_Ethnicity",
  "Deceased_Sex",
  "Substance",
  "Relationship",
  "Urbanicity",
  "Sentence_Quartile_calc"
)

categorical_summary <- map_dfr(cat_vars, function(v) {
  x <- DIH_DATA[[v]]
  tab <- sort(table(x, useNA = "no"), decreasing = TRUE)

  tibble(
    Variable = v,
    nonmissing = sum(!is.na(x)),
    missing = sum(is.na(x)),
    n_categories = dplyr::n_distinct(x, na.rm = TRUE),
    most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
    most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
    most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
  )
})

categorical_summary

## # A tibble: 15 × 7
##    Variable            nonmissing missing n_categories most_common most_common_n
##    <chr>                    <int>   <int>        <int> <chr>               <int>
##  1 State                     3264       0           50 Pennsylvan…           578
##  2 Charge                    3263       1           31 Drug Deliv…           610
##  3 Accused_Race              2737     527            5 White                2069
##  4 Accused_Ethnicity          883    2381            2 Not Hispan…           736
##  5 Accused_Sex               3244      20            2 Male                 2388
##  6 Accused_State             3199      65           59 Pennsylvan…           549
##  7 Court_Type                3179      85            2 State                2789
##  8 Plea                      2136    1128            5 Guilty               1781
##  9 Deceased_Race             2092    1172            7 White                1965
## 10 Deceased_Ethnicity         947    2317            3 Not Hispan…           845
## 11 Deceased_Sex              3013     251            3 Male                 1918
## 12 Substance                 2641     623           36 heroin                945
## 13 Relationship              2962     302            3 Dealer/Buy…          2106
## 14 Urbanicity                3152     112            3 Metropolit…          2525
## 15 Sentence_Quartile_…       2055    1209            4 1                     514
## # ℹ 1 more variable: most_common_pct <dbl>

Accused_State has 59 states so it needs to be cleaned if we want to use that variable… not going to bother right now because we aren’t using it

#Bivariate Relationships ##Numeric vs. Numer

cor.test(
  DIH_DATA$Sentence_num,
  DIH_DATA$Accused_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Accused_Age, :
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Sentence_num and DIH_DATA$Accused_Age
## S = 12670082, p-value = 0.005528
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1314717

cor.test(
  DIH_DATA$Sentence_num,
  DIH_DATA$Deceased_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Deceased_Age, :
## Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Sentence_num and DIH_DATA$Deceased_Age
## S = 930447121, p-value = 0.2802
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.02575955

ggplot(DIH_DATA, aes(x = Accused_Age, y = Sentence_num)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2820 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2820 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(DIH_DATA, aes(x = Deceased_Age, y = Sentence_num)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1505 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1505 rows containing missing values or values outside the scale range
## (`geom_point()`).

outcome variable: • heavily skewed • heteroskedastic (variance increases with age for accused and Spread increases in the middle age range (~20–50) for deceased) • outliers present

This violates assumptions of:
•   OLS regression
•   Pearson correlation

So used Spearman correlation -- @Katie ... any thoughts on other tests to run here?
Given the skewed distribution of sentence length and the presence of outliers, Spearman’s rank correlation was used.

###Sentence length by Sex

DIH_DATA %>%
  group_by(Accused_Sex) %>%
  summarise(
    n = sum(!is.na(Sentence_num)),
    mean_sentence = mean(Sentence_num, na.rm = TRUE),
    median_sentence = median(Sentence_num, na.rm = TRUE)
  )

## # A tibble: 3 × 4
##   Accused_Sex     n mean_sentence median_sentence
##   <chr>       <int>         <dbl>           <dbl>
## 1 Female        560          9.83               6
## 2 Male         1486         12.2                9
## 3 <NA>            9          6.86               5

ggplot(DIH_DATA, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
  geom_boxplot() +
  guides(fill = "none")

## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

t.test(Sentence_num ~ Accused_Sex, data = DIH_DATA)

## 
##  Welch Two Sample t-test
## 
## data:  Sentence_num by Accused_Sex
## t = -4.1076, df = 1071.3, p-value = 0.00004303
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -3.535442 -1.249629
## sample estimates:
## mean in group Female   mean in group Male 
##             9.830096            12.222631

Comparison of Sentence Length by Accused Sex

A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -4.11, p < 0.001). • Mean sentence (Female): 9.83 • Mean sentence (Male): 12.22 • Mean difference: approximately 2.39 units higher for males • 95% CI for the difference: [-3.54, -1.25]

These results suggest that, on average, male defendants receive longer sentences than female defendants.

However, several important caveats should be noted: • The distribution of sentence length is highly skewed with substantial outliers, particularly among male defendants. • Because of this skewness, the mean may not fully represent the typical case, and results should be interpreted cautiously. • A nonparametric alternative (e.g., Mann–Whitney U test) or analysis using transformed data (e.g., log(sentence)) may provide a more robust assessment.

Additionally, the boxplot visualization shows: • Greater variability in sentence length among males • More extreme high-value outliers in the male group • Substantial overlap between groups despite the difference in means

Finally, the plotting warning indicates that 1,209 observations were removed due to missing or non-finite values, which may affect representativeness and should be investigated further.

let’s discuss if I should run these additional tests or if we are satisfied with these findings

##Sentence_Quartile Bivariates on full dataset

cor.test(
  DIH_DATA$Accused_Age,
  DIH_DATA$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Accused_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Accused_Age and DIH_DATA$Sentence_Quartile_calc
## S = 12484370, p-value = 0.00232
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1442022

cor.test(
  DIH_DATA$Deceased_Age,
  DIH_DATA$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(DIH_DATA$Deceased_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  DIH_DATA$Deceased_Age and DIH_DATA$Sentence_Quartile_calc
## S = 902691085, p-value = 0.8393
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## 0.004839738

###SentQ+Charge

kruskal.test(Sentence_Quartile_calc ~ Charge, data = DIH_DATA)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 258.17, df = 28, p-value <
## 0.00000000000000022

Again this wont mean much until we clean the Charge variable

###SentQ+Relationship Tests whether distributions differ across groups

kruskal.test(Sentence_Quartile_calc ~ Relationship, data = DIH_DATA)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 40.074, df = 2, p-value = 0.000000001986

table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)

##                                          
##                                             1   2   3   4
##   Caretaker/Family/Friend/Partner/Co-User 165 131 116  93
##   Dealer/Buyer                            291 339 353 363
##   Doctor/Patient                            5  10  10  22

tab2 <- table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)
chisq.test(tab2)

## 
##  Pearson's Chi-squared test
## 
## data:  tab2
## X-squared = 44.016, df = 6, p-value = 0.00000007337

#Analytic Sample

@Katie we will need to include a justification in the methods section if it is not already in there about why we restricted the data to this time frame.

dih_analytic <- DIH_DATA %>%
  filter(Year_Charged >= 2011 & Year_Charged <= 2021) %>%
  filter(!is.na(Rural_Code))

Hmmmm if we are going to restrict the analytic data sample by years… then perhaps we need to run descriptives here again with the analytic sample to include in the methods section?

library(dplyr)
library(tidyr)

options(scipen = 999)

numeric_summary_B <- dih_analytic %>%
  summarise(
    across(
      c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
      list(
        nonmissing = ~sum(!is.na(.)),
        missing = ~sum(is.na(.)),
        mean = ~mean(., na.rm = TRUE),
        sd = ~sd(., na.rm = TRUE),
        median = ~median(., na.rm = TRUE),
        min = ~min(., na.rm = TRUE),
        max = ~max(., na.rm = TRUE)
      )
    )
  ) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", "Statistic"),
    names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
  ) %>%
  pivot_wider(names_from = Statistic, values_from = value)

numeric_summary_B <- numeric_summary_B %>%
  mutate(
    min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
    median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
    max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
  )


numeric_summary_B

## # A tibble: 5 × 8
##   Variable     nonmissing missing    mean    sd median min           max  
##   <chr>             <dbl>   <dbl>   <dbl> <dbl> <chr>  <chr>         <chr>
## 1 Rural_Code         2838       0    2.49  1.88 2      1             9    
## 2 Year_Charged       2838       0 2017.    2.36 2017   2011          2021 
## 3 Accused_Age         976    1862   34.3  10.4  32     17            87   
## 4 Deceased_Age       2375     463   29.9  11.5  28     0             91   
## 5 Sentence_num       1772    1066   11.4  11.8  8      0.08219178082 180

library(dplyr)
library(purrr)
library(tidyr)
library(gt)

categorical_summary2 <- map_dfr(cat_vars, function(v) {
  x <- dih_analytic[[v]]
  tab <- sort(table(x, useNA = "no"), decreasing = TRUE)

  tibble(
    Variable = v,
    nonmissing = sum(!is.na(x)),
    missing = sum(is.na(x)),
    n_categories = dplyr::n_distinct(x, na.rm = TRUE),
    most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
    most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
    most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
  )
})

categorical_summary2

## # A tibble: 15 × 7
##    Variable            nonmissing missing n_categories most_common most_common_n
##    <chr>                    <int>   <int>        <int> <chr>               <int>
##  1 State                     2838       0           47 Pennsylvan…           553
##  2 Charge                    2837       1           30 Drug Deliv…           571
##  3 Accused_Race              2441     397            5 White                1819
##  4 Accused_Ethnicity          829    2009            2 Not Hispan…           697
##  5 Accused_Sex               2823      15            2 Male                 2082
##  6 Accused_State             2777      61           58 Pennsylvan…           525
##  7 Court_Type                2760      78            2 State                2436
##  8 Plea                      1813    1025            5 Guilty               1497
##  9 Deceased_Race             1867     971            7 White                1750
## 10 Deceased_Ethnicity         873    1965            3 Not Hispan…           781
## 11 Deceased_Sex              2632     206            3 Male                 1685
## 12 Substance                 2317     521           34 heroin                823
## 13 Relationship              2576     262            3 Dealer/Buy…          1883
## 14 Urbanicity                2838       0            3 Metropolit…          2274
## 15 Sentence_Quartile_…       1772    1066            4 2                     457
## # ℹ 1 more variable: most_common_pct <dbl>

##Testing Bivariate Relationships in Analytic Sample - Outcome is Sentence_num

cor.test(
  dih_analytic$Sentence_num,
  dih_analytic$Accused_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Accused_Age, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Sentence_num and dih_analytic$Accused_Age
## S = 10194801, p-value = 0.003246
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1441686

cor.test(
  dih_analytic$Sentence_num,
  dih_analytic$Deceased_Age,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Deceased_Age, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Sentence_num and dih_analytic$Deceased_Age
## S = 601445934, p-value = 0.1897
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.03369232

cor.test(
  dih_analytic$Sentence_num,
  dih_analytic$Year_Charged,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Year_Charged, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Sentence_num and dih_analytic$Year_Charged
## S = 903777791, p-value = 0.285
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02541011

dih_analytic %>%
  group_by(Accused_Sex) %>%
  summarise(
    n = sum(!is.na(Sentence_num)),
    mean_sentence = mean(Sentence_num, na.rm = TRUE),
    median_sentence = median(Sentence_num, na.rm = TRUE)
  )

## # A tibble: 3 × 4
##   Accused_Sex     n mean_sentence median_sentence
##   <chr>       <int>         <dbl>           <dbl>
## 1 Female        479         10.0                6
## 2 Male         1286         11.9                9
## 3 <NA>            7          7.25               5

ggplot(dih_analytic, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
  geom_boxplot() +
  guides(fill = "none")

## Warning: Removed 1066 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

t.test(Sentence_num ~ Accused_Sex, data = dih_analytic)

## 
##  Welch Two Sample t-test
## 
## data:  Sentence_num by Accused_Sex
## t = -2.9574, df = 831.36, p-value = 0.00319
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -3.1560701 -0.6379943
## sample estimates:
## mean in group Female   mean in group Male 
##             10.00579             11.90282

Comparison of Sentence Length by Accused Sex (2017–2021 Subset)

A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants for cases from 2017–2021. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -2.96, p = 0.003). • Mean sentence (Female): 10.01 • Mean sentence (Male): 11.90 • Mean difference: approximately 1.9 units higher for males • 95% CI for the difference: [-3.16, -0.64]

These results suggest that, within this time period, male defendants receive longer sentences on average than female defendants.

Additional descriptive statistics support this pattern: • Median sentence (Female): 6 • Median sentence (Male): 9

This indicates that the difference is also present at the median level, not just driven by extreme values.

However, several important considerations apply: • Sentence length remains highly skewed with notable outliers, particularly among male defendants. • Although the mean difference is statistically significant, the magnitude of the difference is modest relative to the overall variability in sentencing. • The boxplot shows substantial overlap between the distributions for males and females. • A nonparametric test (e.g., Mann–Whitney U) or analysis using transformed sentence length may provide a more robust assessment.

Additionally, the plotting warning indicates that 1,066 observations were removed due to missing or non-finite values, which should be examined to ensure no systematic bias in the analyzed subset.

summary(aov(Sentence_num ~ Urbanicity, data = dih_analytic))

##               Df Sum Sq Mean Sq F value Pr(>F)
## Urbanicity     2    408   204.0   1.463  0.232
## Residuals   1769 246685   139.4               
## 1066 observations deleted due to missingness

summary(aov(Sentence_num ~ Charge, data = dih_analytic))

##               Df Sum Sq Mean Sq F value         Pr(>F)    
## Charge        27  13356   494.7   3.691 0.000000000635 ***
## Residuals   1744 233737   134.0                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1066 observations deleted due to missingness

This suggest the sentence length varies by charge… but we need information about how charges were grouped before we can run this analysis properly

summary(aov(Sentence_num ~ Relationship, data = dih_analytic))

##                Df Sum Sq Mean Sq F value   Pr(>F)    
## Relationship    2   2048  1024.0   7.471 0.000589 ***
## Residuals    1632 223687   137.1                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1203 observations deleted due to missingness

##Testing Bivariate Relationships in Analytic Sample: Outcome is Sentencing Quartile.

###SentQ+DecAge

cor.test(
  dih_analytic$Accused_Age,
  dih_analytic$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Accused_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Accused_Age and dih_analytic$Sentence_Quartile_calc
## S = 10081186, p-value = 0.001687
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1537063

cor.test(
  dih_analytic$Deceased_Age,
  dih_analytic$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Deceased_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Deceased_Age and dih_analytic$Sentence_Quartile_calc
## S = 582264805, p-value = 0.9775
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##           rho 
## -0.0007261225

###SentQ+Yr

cor.test(
  dih_analytic$Year_Charged,
  dih_analytic$Sentence_Quartile_calc,
  method = "spearman",
  use = "complete.obs"
)

## Warning in cor.test.default(dih_analytic$Year_Charged,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  dih_analytic$Year_Charged and dih_analytic$Sentence_Quartile_calc
## S = 904918982, p-value = 0.309
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02417951

Later years are associated with higher sentence quartiles ρ ≈ 0.28 → moderate (not strong)

Over time, cases are more likely to fall into higher relative sentence bins

###SentQ+Charge

kruskal.test(Sentence_Quartile_calc ~ Charge, data = dih_analytic)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 235.81, df = 27, p-value <
## 0.00000000000000022

Again this wont mean much until we clean the Charge variable

###SentQ+Relationship Tests whether distributions differ across groups

kruskal.test(Sentence_Quartile_calc ~ Relationship, data = dih_analytic)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 27.177, df = 2, p-value = 0.000001255

table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)

##                                          
##                                             1   2   3   4
##   Caretaker/Family/Friend/Partner/Co-User 124 112  91  73
##   Dealer/Buyer                            265 310 309 318
##   Doctor/Patient                            4   5  10  14

tab22 <- table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)
chisq.test(tab22)

## 
##  Pearson's Chi-squared test
## 
## data:  tab22
## X-squared = 27.92, df = 6, p-value = 0.00009729

###SentQ+Plea

tab_plea <- table(dih_analytic$Plea, dih_analytic$Sentence_Quartile_calc)
tab_plea

##                          
##                             1   2   3   4
##   Alford Plea               0   0   1   0
##   Guilty                  353 341 335 321
##   Guilty to Lesser Charge  18  27  11   9
##   No Contest               16   8   7   5
##   Not Guilty                7  14  16  29

Fixed the issue with pending/awaiting trial showing up in the analytic dataset.

DIH_DATA %>%
  filter(
    Plea == "Awaiting Plea Hearing" &
    (!is.na(Sentence_num) | !is.na(Sentence_Quartile_calc))
  ) %>%
  summarise(n = n())

## # A tibble: 1 × 1
##       n
##   <int>
## 1     0

DIH_DATA %>%
  filter(Plea == "Awaiting Plea Hearing") %>%
  count(Sentence_num, Sentence_Quartile_calc, sort = TRUE)

## # A tibble: 0 × 3
## # ℹ 3 variables: Sentence_num <dbl>, Sentence_Quartile_calc <int>, n <int>

DIH_DATA %>%
  filter(Plea == "Awaiting Plea Hearing") %>%
  count(Sentence, Sentence_Quartile_calc, sort = TRUE)

## # A tibble: 0 × 3
## # ℹ 3 variables: Sentence <chr>, Sentence_Quartile_calc <int>, n <int>

#Logit Model Sent Q Analytic Sample

Decided not to include Race/Ethnicity of deceased because of large missingness in the data (over 2,000 observations missing)

Accused Ethnicity also has large missingness with over 2,000 missing, so rather than use the Race_Ethnicity Variable, I am just using the Accused_Race, which has 397 missing.

I decided to put in a child variable instead of the Deceased Age for theoretical reasons.

#Making Sure Variable is Ordered
dih_analytic <- dih_analytic %>%
  mutate(Sentence_Quartile_calc = ordered(Sentence_Quartile_calc))

#refit all models on the same complete-case dataset so ANOVA will work later
model_data <- dih_analytic %>%
  dplyr::select(
    Sentence_Quartile_calc,
    Charge_Group,
    Relationship,
    Urbanicity,
    Accused_Race,
    Accused_Sex,
    Court_Type,
    Plea,
    Substance_Group,
    child
  ) %>%
  filter(complete.cases(.))

model_data only has 906 observations….

what if I did this on the full dataset instead of just the analytic?

#refit all models on the same complete-case dataset so ANOVA will work later
model_data2 <- DIH_DATA %>%
  dplyr::select(
    Sentence_Quartile_calc,
    Charge_Group,
    Relationship,
    Urbanicity,
    Accused_Race,
    Accused_Sex,
    Court_Type,
    Plea,
    Substance_Group,
    child
  ) %>%
  filter(complete.cases(.))

We get an extra 93 observations…

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

model_full <- polr(
  as.factor(Sentence_Quartile_calc) ~ Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child,
  data = model_data,
  Hess = TRUE
)


summary(model_full)

## Call:
## polr(formula = as.factor(Sentence_Quartile_calc) ~ Charge_Group + 
##     Relationship + Urbanicity + Accused_Race + Accused_Sex + 
##     Court_Type + Plea + Substance_Group + child, data = model_data, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                       Value Std. Error  t value
## Charge_GroupDistribution/Possession Causing Death -0.806485     1.0139 -0.79546
## Charge_GroupDrug-Induced Homicide                  0.122844     0.5722  0.21470
## Charge_GroupManslaughter                          -0.677037     0.5781 -1.17106
## Charge_GroupMurder                                 0.739672     0.5962  1.24068
## Charge_GroupNegligent/Reckless                    -1.023815     0.5885 -1.73982
## Charge_GroupOther                                  0.445379     0.5787  0.76961
## RelationshipDealer/Buyer                           0.140116     0.1546  0.90655
## RelationshipDoctor/Patient                         1.150077     0.5622  2.04585
## UrbanicitySuburban                                 0.075172     0.1875  0.40095
## UrbanicityRural                                    0.552541     0.2800  1.97365
## Accused_RaceBlack                                  0.278980     0.7744  0.36024
## Accused_RaceNative American or Alaskan Native     -0.689710     1.2418 -0.55541
## Accused_RaceWhite                                 -0.219247     0.7661 -0.28619
## Accused_SexMale                                    0.265375     0.1440  1.84232
## Court_TypeState                                   -0.755750     0.2445 -3.09081
## PleaGuilty                                        -0.451714     1.5077 -0.29960
## PleaGuilty to Lesser Charge                       -1.326889     1.5284 -0.86818
## PleaNo Contest                                    -1.248193     1.5550 -0.80268
## PleaNot Guilty                                     0.536800     1.5355  0.34960
## Substance_GroupOpioid (non-fentanyl)              -0.220652     0.1758 -1.25478
## Substance_GroupOther                               0.003549     0.3183  0.01115
## Substance_GroupPolysubstance                       0.089881     0.1975  0.45519
## Substance_GroupStimulant                           0.293040     0.3397  0.86276
## childChild (<18)                                   0.293834     0.2743  1.07136
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -2.3100  1.7988    -1.2842
## 2|3 -0.9300  1.7970    -0.5175
## 3|4  0.3683  1.7967     0.2050
## 
## Residual Deviance: 2318.509 
## AIC: 2372.509

#testing model specification
library(brant)
brant(model_full)

## ------------------------------------------------------------------------------------ 
## Test for                     X2  df  probability 
## ------------------------------------------------------------------------------------ 
## Omnibus                          17.11   48  1
## Charge_GroupDistribution/Possession Causing Death    0   2   1
## Charge_GroupDrug-Induced Homicide            0.76    2   0.68
## Charge_GroupManslaughter             0.07    2   0.97
## Charge_GroupMurder                   0.26    2   0.88
## Charge_GroupNegligent/Reckless           1.51    2   0.47
## Charge_GroupOther                    0.61    2   0.74
## RelationshipDealer/Buyer             3.96    2   0.14
## RelationshipDoctor/Patient               1.62    2   0.45
## UrbanicitySuburban                   1.97    2   0.37
## UrbanicityRural                  0.58    2   0.75
## Accused_RaceBlack                    0.4 2   0.82
## Accused_RaceNative American or Alaskan Native    0   2   1
## Accused_RaceWhite                    0.4 2   0.82
## Accused_SexMale                  1.57    2   0.46
## Court_TypeState                  1.08    2   0.58
## PleaGuilty                       0   2   1
## PleaGuilty to Lesser Charge              0   2   1
## PleaNo Contest                       0   2   1
## PleaNot Guilty                       0   2   1
## Substance_GroupOpioid (non-fentanyl)     3.35    2   0.19
## Substance_GroupOther                 2.97    2   0.23
## Substance_GroupPolysubstance             1.75    2   0.42
## Substance_GroupStimulant             6.35    2   0.04
## childChild (<18)                 0.14    2   0.93
## ------------------------------------------------------------------------------------ 
## 
## H0: Parallel Regression Assumption holds

## Warning in brant(model_full): 201065 combinations in table(dv,ivs) do not
## occur. Because of that, the test results might be invalid.

FAIL to reject the proportional odds assumption

data are too sparse across combinations of predictors and outcome levels

many category combinations have zero observations
the test is unstable / unreliable

This is an ordinal logistic regression with the proportional odds assumption

It assumes: The effect of each predictor is the same across all cutpoints of the outcome (aka the sentencing quartile). Effect of Charge_Group is the SAME in all 3 comparisons

Going to start dropping variables and comparing model fit.

# Drop one variable at a time
model_no_substance <- update(model_full, . ~ . - Substance_Group)
model_no_urban    <- update(model_full, . ~ . - Urbanicity)
model_no_race     <- update(model_full, . ~ . - Accused_Race)
model_no_sex      <- update(model_full, . ~ . - Accused_Sex)
model_no_court    <- update(model_full, . ~ . - Court_Type)
model_no_plea     <- update(model_full, . ~ . - Plea)
model_no_child    <- update(model_full, . ~ . - child)
model_no_relationship    <- update(model_full, . ~ . - Relationship)
model_no_charge <- update(model_full, . ~ . - Charge_Group)

Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child

AIC(
  model_full,
  model_no_substance,
  model_no_urban,
  model_no_race,
  model_no_sex,
  model_no_court,
  model_no_plea,
  model_no_child,
  model_no_relationship,
  model_no_charge
)

##                       df      AIC
## model_full            27 2372.509
## model_no_substance    23 2370.662
## model_no_urban        25 2372.431
## model_no_race         24 2376.382
## model_no_sex          26 2373.909
## model_no_court        26 2380.190
## model_no_plea         23 2389.495
## model_no_child        26 2371.655
## model_no_relationship 25 2373.027
## model_no_charge       21 2445.748

To evaluate model parsimony and identify predictors contributing meaningfully to model fit, a series of nested ordinal logistic regression models were compared using Akaike Information Criterion (AIC).

anova(model_no_substance, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1                   Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1       883   2324.662                                
## 2       879   2318.509 1 vs 2     4 6.152442 0.1880494

anova(model_no_urban, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1              Charge_Group + Relationship + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1       881   2322.431                                
## 2       879   2318.509 1 vs 2     2 3.922153 0.1407069

anova(model_no_race, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1                Charge_Group + Relationship + Urbanicity + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.    Pr(Chi)
## 1       882   2328.382                                 
## 2       879   2318.509 1 vs 2     3 9.872421 0.01968232

anova(model_no_sex, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1               Charge_Group + Relationship + Urbanicity + Accused_Race + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.    Pr(Chi)
## 1       880   2321.909                                 
## 2       879   2318.509 1 vs 2     1 3.400056 0.06519422

anova(model_no_court, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1              Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.     Pr(Chi)
## 1       880   2328.190                                  
## 2       879   2318.509 1 vs 2     1 9.680627 0.001862211

anova(model_no_plea, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1        Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.      Pr(Chi)
## 1       883   2343.495                                   
## 2       879   2318.509 1 vs 2     4 24.98588 0.0000506397

anova(model_no_child, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1         Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1       880   2319.655                                
## 2       879   2318.509 1 vs 2     1 1.145306 0.2845339

 anova(model_no_relationship,model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1                Charge_Group + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1       881   2323.027                                
## 2       879   2318.509 1 vs 2     2 4.517587 0.1044765

   anova(model_no_charge, model_full)

## Likelihood ratio tests of ordinal regression models
## 
## Response: as.factor(Sentence_Quartile_calc)
##                                                                                                                 Model
## 1                Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Resid. df Resid. Dev   Test    Df LR stat.                  Pr(Chi)
## 1       885   2403.748                                               
## 2       879   2318.509 1 vs 2     6 85.23858 0.0000000000000003330669

The variables that clearly improve model fit are:

Charge_Group — strongest contributor, p < .001
Plea — p < .001
Court_Type — p = .002
Accused_Race — p = .020

Variables that do not clearly improve fit:

Substance_Group — p = .188
Urbanicity — p = .141
child — p = .285
Relationship — p = .104
Accused_Sex — borderline, p = .065

Ok going to run the defensible model then include sensitivity checks…

model_core <- polr(
  ordered(Sentence_Quartile_calc) ~ 
    Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data,
  Hess = TRUE
)
summary(model_core)

## Call:
## polr(formula = ordered(Sentence_Quartile_calc) ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race, data = model_data, Hess = TRUE)
## 
## Coefficients:
##                                                      Value Std. Error  t value
## Charge_GroupDistribution/Possession Causing Death -0.92348     0.9929 -0.93006
## Charge_GroupDrug-Induced Homicide                  0.05331     0.5672  0.09399
## Charge_GroupManslaughter                          -0.68925     0.5745 -1.19976
## Charge_GroupMurder                                 0.66275     0.5922  1.11904
## Charge_GroupNegligent/Reckless                    -1.10969     0.5847 -1.89782
## Charge_GroupOther                                  0.42304     0.5724  0.73907
## PleaGuilty                                        -0.77141     1.4919 -0.51708
## PleaGuilty to Lesser Charge                       -1.55796     1.5133 -1.02953
## PleaNo Contest                                    -1.46579     1.5395 -0.95215
## PleaNot Guilty                                     0.27038     1.5220  0.17765
## Court_TypeState                                   -0.87979     0.2390 -3.68081
## Accused_RaceBlack                                  0.03106     0.7608  0.04082
## Accused_RaceNative American or Alaskan Native     -0.57072     1.2293 -0.46427
## Accused_RaceWhite                                 -0.51803     0.7529 -0.68808
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -3.3409  1.7604    -1.8978
## 2|3 -1.9856  1.7576    -1.1297
## 3|4 -0.7050  1.7565    -0.4013
## 
## Residual Deviance: 2336.71 
## AIC: 2370.71

brant(model_core)

## ------------------------------------------------------------------------------------ 
## Test for                     X2  df  probability 
## ------------------------------------------------------------------------------------ 
## Omnibus                          -67.9   28  1
## Charge_GroupDistribution/Possession Causing Death    0.07    2   0.97
## Charge_GroupDrug-Induced Homicide            0.7 2   0.71
## Charge_GroupManslaughter             0.07    2   0.96
## Charge_GroupMurder                   0.32    2   0.85
## Charge_GroupNegligent/Reckless           1.48    2   0.48
## Charge_GroupOther                    0.47    2   0.79
## PleaGuilty                       0   2   1
## PleaGuilty to Lesser Charge              0   2   1
## PleaNo Contest                       0   2   1
## PleaNot Guilty                       0   2   1
## Court_TypeState                  1.76    2   0.41
## Accused_RaceBlack                    0.27    2   0.87
## Accused_RaceNative American or Alaskan Native    0   2   1
## Accused_RaceWhite                    0.4 2   0.82
## ------------------------------------------------------------------------------------ 
## 
## H0: Parallel Regression Assumption holds

## Warning in brant(model_core): 983 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.

library(MASS)

# Sensitivity 1: add Accused_Sex
model_sens_sex <- update(model_core, . ~ . + Accused_Sex)

# Sensitivity 2: add Relationship
model_sens_relationship <- update(model_core, . ~ . + Relationship)

# Sensitivity 3: add child
model_sens_child <- update(model_core, . ~ . + child)

# Sensitivity 4: add all theoretically interesting covariates
model_sens_all <- update(
  model_core,
  . ~ . + Accused_Sex + Relationship + child
)

AIC(
  model_core,
  model_sens_sex,
  model_sens_relationship,
  model_sens_child,
  model_sens_all
)

##                         df      AIC
## model_core              17 2370.710
## model_sens_sex          18 2369.473
## model_sens_relationship 19 2371.068
## model_sens_child        18 2372.214
## model_sens_all          21 2370.847

anova(model_core, model_sens_sex)

## Likelihood ratio tests of ordinal regression models
## 
## Response: ordered(Sentence_Quartile_calc)
##                                                           Model Resid. df
## 1               Charge_Group + Plea + Court_Type + Accused_Race       889
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Accused_Sex       888
##   Resid. Dev   Test    Df LR stat.    Pr(Chi)
## 1   2336.710                                 
## 2   2333.473 1 vs 2     1 3.237434 0.07197326

anova(model_core, model_sens_relationship)

## Likelihood ratio tests of ordinal regression models
## 
## Response: ordered(Sentence_Quartile_calc)
##                                                            Model Resid. df
## 1                Charge_Group + Plea + Court_Type + Accused_Race       889
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Relationship       887
##   Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1   2336.710                                
## 2   2333.068 1 vs 2     2 3.642627 0.1618131

anova(model_core, model_sens_child)

## Likelihood ratio tests of ordinal regression models
## 
## Response: ordered(Sentence_Quartile_calc)
##                                                     Model Resid. df Resid. Dev
## 1         Charge_Group + Plea + Court_Type + Accused_Race       889   2336.710
## 2 Charge_Group + Plea + Court_Type + Accused_Race + child       888   2336.214
##     Test    Df LR stat.   Pr(Chi)
## 1                                
## 2 1 vs 2     1 0.496671 0.4809665

anova(model_core, model_sens_all)

## Likelihood ratio tests of ordinal regression models
## 
## Response: ordered(Sentence_Quartile_calc)
##                                                                                  Model
## 1                                      Charge_Group + Plea + Court_Type + Accused_Race
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Accused_Sex + Relationship + child
##   Resid. df Resid. Dev   Test    Df LR stat.    Pr(Chi)
## 1       889   2336.710                                 
## 2       885   2328.847 1 vs 2     4 7.863004 0.09672744

Sensitivity models adding Accused_Sex, Relationship, and child did not substantially improve model fit compared with the core model. Although adding Accused_Sex slightly reduced AIC, the likelihood ratio test was marginal, so the more parsimonious core model was retained.

Because complete-case modeling substantially reduced the analytic sample, models were re-estimated using a more parsimonious set of predictors with lower missingness. Variables with substantial missingness were evaluated in sensitivity analyses rather than included in the primary model.

colSums(is.na(dih_analytic %>%
  dplyr::select(
    Sentence_Quartile_calc, Charge_Group, Plea, Court_Type,
    Accused_Race, Accused_Sex, Relationship, child,
    Substance_Group, Urbanicity
  )))

## Sentence_Quartile_calc           Charge_Group                   Plea 
##                   1066                      7                   1025 
##             Court_Type           Accused_Race            Accused_Sex 
##                     78                    397                     15 
##           Relationship                  child        Substance_Group 
##                    262                    463                    521 
##             Urbanicity 
##                      0

model_data_core <- dih_analytic %>%

  dplyr::select(

    Sentence_Quartile_calc,

    Charge_Group,

    Plea,

    Court_Type,
    
    Accused_Race

  ) %>%

  filter(complete.cases(.))

model_core2 <- MASS::polr(
  Sentence_Quartile_calc ~ 
    Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data_core,
  Hess = TRUE
)
summary(model_core2)

## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race, data = model_data_core, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                      Value Std. Error   t value
## Charge_GroupDistribution/Possession Causing Death -0.30000     0.7810 -0.384108
## Charge_GroupDrug-Induced Homicide                  0.03233     0.3852  0.083925
## Charge_GroupManslaughter                          -0.70066     0.3919 -1.788051
## Charge_GroupMurder                                 0.58097     0.4087  1.421552
## Charge_GroupNegligent/Reckless                    -0.59518     0.3999 -1.488226
## Charge_GroupOther                                  0.42410     0.3841  1.104219
## PleaGuilty                                        -0.80994     1.4857 -0.545172
## PleaGuilty to Lesser Charge                       -1.41279     1.5015 -0.940891
## PleaNo Contest                                    -1.42517     1.5221 -0.936301
## PleaNot Guilty                                     0.01495     1.5036  0.009941
## Court_TypeState                                   -1.02898     0.1917 -5.368195
## Accused_RaceBlack                                  0.12947     0.6514  0.198763
## Accused_RaceIndian American                       -1.92695     1.6535 -1.165392
## Accused_RaceNative American or Alaskan Native     -0.69366     1.1604 -0.597791
## Accused_RaceWhite                                 -0.49990     0.6456 -0.774291
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -3.3585  1.6594    -2.0239
## 2|3 -2.1151  1.6577    -1.2759
## 3|4 -0.8777  1.6569    -0.5297
## 
## Residual Deviance: 3455.907 
## AIC: 3491.907

Plea looks unstable because the reference category is probably Alford Plea with only 1 case. That’s why the standard errors are huge (~1.5). Dropping or recoding Alford Plea to NA, then refit.

model_data_core <- model_data_core %>%
  mutate(
    Plea = ifelse(Plea == "Alford Plea", NA, Plea)
  ) %>%
  filter(complete.cases(.)) %>%
  droplevels()

Also, Accused_Race has tiny categories like Native American; collapsing these before modeling:

model_data_core <- model_data_core %>%
  mutate(
    Accused_Race = case_when(
      Accused_Race %in% c("White", "Black") ~ Accused_Race,
      TRUE ~ "Other"
    ),
    Accused_Race = factor(Accused_Race)
  )

model_core2 <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data_core,
  Hess = TRUE
)
summary(model_core2)

## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race, data = model_data_core, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                     Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.2303     0.7782 -0.2959
## Charge_GroupDrug-Induced Homicide                  0.1073     0.3793  0.2828
## Charge_GroupManslaughter                          -0.6270     0.3862 -1.6238
## Charge_GroupMurder                                 0.6503     0.4031  1.6135
## Charge_GroupNegligent/Reckless                    -0.5247     0.3941 -1.3313
## Charge_GroupOther                                  0.4973     0.3780  1.3154
## PleaGuilty to Lesser Charge                       -0.6008     0.2345 -2.5618
## PleaNo Contest                                    -0.6142     0.3343 -1.8375
## PleaNot Guilty                                     0.8200     0.2481  3.3053
## Court_TypeState                                   -1.0305     0.1916 -5.3792
## Accused_RaceOther                                 -0.5091     0.5159 -0.9868
## Accused_RaceWhite                                 -0.6292     0.1224 -5.1401
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -2.6065  0.3546    -7.3505
## 2|3 -1.3635  0.3495    -3.9008
## 3|4 -0.1305  0.3476    -0.3756
## 
## Residual Deviance: 3454.928 
## AIC: 3484.928

Key interpretation:

PleaNot Guilty is positive (0.82), meaning not-guilty pleas are associated with higher odds of being in a higher sentence quartile compared with the reference plea group.
PleaGuilty to Lesser Charge is negative (-0.60), meaning lower odds of higher sentence quartile.
Court_TypeState is strongly negative (-1.03), so state cases have lower odds of higher sentence quartile than federal cases.
Accused_RaceWhite is negative (-0.63), indicating lower odds of higher sentence quartile compared with the reference race group.
Charge_Group coefficients are not especially strong here after adjustment.

NEXT STEPS: 1. exp(coef(model_core2)) and interpret 2. RUN Model on full dataset (not analytic)

The primary models were estimated using the analytic sample restricted to the study period. As a sensitivity analysis, models were re-estimated using the full dataset to assess whether results were robust to inclusion of earlier cases.

RUN linear model instead of the quartile

#Logit Model on Full Dataset

Making adjustments to the model to reflect the changes made to plea and accused race above…

model_data_all <- DIH_DATA %>%
  dplyr::select(
    Sentence_Quartile_calc,
    Charge_Group,
    Plea,
    Court_Type,
    Accused_Race
  ) %>%
  mutate(
    Plea = ifelse(Plea == "Alford Plea", NA, Plea),
    Accused_Race = case_when(
      Accused_Race %in% c("White", "Black") ~ Accused_Race,
      TRUE ~ "Other"
    ),
    Accused_Race = factor(Accused_Race),
    Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
  ) %>%
  filter(complete.cases(.)) %>%
  droplevels()

DIH_DATA <- DIH_DATA %>%
  mutate(
    Sentence_num = case_when(
      Sentence %in% c("Pending", "pending") ~ NA_character_,
      TRUE ~ Sentence
    ),
    Sentence_num = as.numeric(Sentence_num)
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sentence_num = as.numeric(Sentence_num)`.
## Caused by warning:
## ! NAs introduced by coercion

model_core2_all <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data_all,
  Hess = TRUE
)

summary(model_core2_all)

## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race, data = model_data_all, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                     Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.3739     0.6497 -0.5755
## Charge_GroupDrug-Induced Homicide                 -0.1255     0.2987 -0.4202
## Charge_GroupManslaughter                          -0.8265     0.3045 -2.7143
## Charge_GroupMurder                                 0.4005     0.3246  1.2338
## Charge_GroupNegligent/Reckless                    -0.7391     0.3149 -2.3471
## Charge_GroupOther                                  0.3819     0.2970  1.2858
## PleaGuilty to Lesser Charge                       -0.6233     0.2192 -2.8439
## PleaNo Contest                                    -0.5119     0.2838 -1.8039
## PleaNot Guilty                                     0.8357     0.2282  3.6628
## Court_TypeState                                   -0.9628     0.1542 -6.2450
## Accused_RaceOther                                 -0.3508     0.1526 -2.2991
## Accused_RaceWhite                                 -0.6148     0.1172 -5.2450
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -2.7595  0.2900    -9.5159
## 2|3 -1.5549  0.2857    -5.4425
## 3|4 -0.3035  0.2830    -1.0722
## 
## Residual Deviance: 4623.239 
## AIC: 4653.239

nrow(model_data_core)

## [1] 1323

nrow(model_data_all)

## [1] 1773

Looks like we gain about 400 observations from doing the full dataset.

coef(summary(model_core2))

##                                                        Value Std. Error
## Charge_GroupDistribution/Possession Causing Death -0.2302897  0.7782069
## Charge_GroupDrug-Induced Homicide                  0.1072662  0.3793125
## Charge_GroupManslaughter                          -0.6270398  0.3861507
## Charge_GroupMurder                                 0.6503113  0.4030527
## Charge_GroupNegligent/Reckless                    -0.5247348  0.3941440
## Charge_GroupOther                                  0.4972592  0.3780309
## PleaGuilty to Lesser Charge                       -0.6007545  0.2345075
## PleaNo Contest                                    -0.6142136  0.3342613
## PleaNot Guilty                                     0.8200252  0.2480926
## Court_TypeState                                   -1.0305461  0.1915814
## Accused_RaceOther                                 -0.5090729  0.5158819
## Accused_RaceWhite                                 -0.6292061  0.1224101
## 1|2                                               -2.6065429  0.3546067
## 2|3                                               -1.3635059  0.3495413
## 3|4                                               -0.1305438  0.3475875
##                                                      t value
## Charge_GroupDistribution/Possession Causing Death -0.2959235
## Charge_GroupDrug-Induced Homicide                  0.2827912
## Charge_GroupManslaughter                          -1.6238215
## Charge_GroupMurder                                 1.6134648
## Charge_GroupNegligent/Reckless                    -1.3313277
## Charge_GroupOther                                  1.3153931
## PleaGuilty to Lesser Charge                       -2.5617712
## PleaNo Contest                                    -1.8375254
## PleaNot Guilty                                     3.3053194
## Court_TypeState                                   -5.3791554
## Accused_RaceOther                                 -0.9868012
## Accused_RaceWhite                                 -5.1401498
## 1|2                                               -7.3505172
## 2|3                                               -3.9008431
## 3|4                                               -0.3755712

coef(summary(model_core2_all))

##                                                        Value Std. Error
## Charge_GroupDistribution/Possession Causing Death -0.3739163  0.6496682
## Charge_GroupDrug-Induced Homicide                 -0.1255201  0.2987132
## Charge_GroupManslaughter                          -0.8264920  0.3044950
## Charge_GroupMurder                                 0.4004812  0.3245876
## Charge_GroupNegligent/Reckless                    -0.7390633  0.3148842
## Charge_GroupOther                                  0.3818782  0.2969966
## PleaGuilty to Lesser Charge                       -0.6233344  0.2191799
## PleaNo Contest                                    -0.5119005  0.2837773
## PleaNot Guilty                                     0.8356956  0.2281593
## Court_TypeState                                   -0.9628109  0.1541731
## Accused_RaceOther                                 -0.3508181  0.1525918
## Accused_RaceWhite                                 -0.6148201  0.1172203
## 1|2                                               -2.7595002  0.2899890
## 2|3                                               -1.5548866  0.2856953
## 3|4                                               -0.3034814  0.2830489
##                                                      t value
## Charge_GroupDistribution/Possession Causing Death -0.5755498
## Charge_GroupDrug-Induced Homicide                 -0.4202027
## Charge_GroupManslaughter                          -2.7143042
## Charge_GroupMurder                                 1.2338155
## Charge_GroupNegligent/Reckless                    -2.3470953
## Charge_GroupOther                                  1.2857998
## PleaGuilty to Lesser Charge                       -2.8439399
## PleaNo Contest                                    -1.8038812
## PleaNot Guilty                                     3.6627727
## Court_TypeState                                   -6.2450013
## Accused_RaceOther                                 -2.2990622
## Accused_RaceWhite                                 -5.2449953
## 1|2                                               -9.5158779
## 2|3                                               -5.4424645
## 3|4                                               -1.0721870

The full-dataset sensitivity model used 1,773 cases compared with 1,323 in the analytic sample. The main findings were largely consistent across both models: not guilty pleas were associated with higher sentence quartiles, guilty-to-lesser pleas with lower quartiles, state cases with lower quartiles than federal cases, and White defendants with lower sentence quartiles relative to the reference group.

In the full dataset, some Charge_Group effects became stronger, especially manslaughter and negligent/reckless charges, but the overall direction of the core legal/process variables remained stable.

Full Model with Relationship

Now testing the full dataset with relationship added baxk in…

model_data_all_rel <- DIH_DATA %>%
  dplyr::select(
    Sentence_Quartile_calc,
    Charge_Group,
    Plea,
    Court_Type,
    Accused_Race,
    Relationship
  ) %>%
  mutate(
    Plea = ifelse(Plea == "Alford Plea", NA, Plea),
    Accused_Race = case_when(
      Accused_Race %in% c("White", "Black") ~ Accused_Race,
      TRUE ~ "Other"
    ),
    Accused_Race = factor(Accused_Race),
    Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
  ) %>%
  filter(complete.cases(.)) %>%
  droplevels()

model_core2_all_rel_base <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data_all_rel,
  Hess = TRUE
)

model_core2_all_rel <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race + Relationship,
  data = model_data_all_rel,
  Hess = TRUE
)

summary(model_core2_all_rel)

## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race + Relationship, data = model_data_all_rel, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                     Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.7070     0.6850 -1.0322
## Charge_GroupDrug-Induced Homicide                 -0.1858     0.3170 -0.5859
## Charge_GroupManslaughter                          -0.9158     0.3240 -2.8263
## Charge_GroupMurder                                 0.3407     0.3439  0.9909
## Charge_GroupNegligent/Reckless                    -0.7932     0.3336 -2.3778
## Charge_GroupOther                                  0.2840     0.3144  0.9031
## PleaGuilty to Lesser Charge                       -0.6621     0.2276 -2.9088
## PleaNo Contest                                    -0.4195     0.2886 -1.4536
## PleaNot Guilty                                     0.7611     0.2422  3.1430
## Court_TypeState                                   -0.9372     0.1656 -5.6585
## Accused_RaceOther                                 -0.2269     0.1611 -1.4087
## Accused_RaceWhite                                 -0.5666     0.1243 -4.5596
## RelationshipDealer/Buyer                           0.2222     0.1060  2.0957
## RelationshipDoctor/Patient                         0.5751     0.3243  1.7734
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -2.6356  0.3234    -8.1501
## 2|3 -1.4012  0.3195    -4.3854
## 3|4 -0.1366  0.3170    -0.4309
## 
## Residual Deviance: 4253.50 
## AIC: 4287.50

AIC(model_core2_all_rel_base, model_core2_all_rel)

##                          df      AIC
## model_core2_all_rel_base 15 4289.635
## model_core2_all_rel      17 4287.500

anova(model_core2_all_rel_base, model_core2_all_rel)

## Likelihood ratio tests of ordinal regression models
## 
## Response: Sentence_Quartile_calc
##                                                            Model Resid. df
## 1                Charge_Group + Plea + Court_Type + Accused_Race      1623
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Relationship      1621
##   Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1   4259.635                                
## 2   4253.500 1 vs 2     2 6.134865 0.0465405

nrow(model_data_all_rel)

## [1] 1638

In the full-dataset sensitivity model, adding Relationship modestly improved model fit (LR χ² = 6.13, df = 2, p = .047), with dealer/buyer cases associated with higher sentence quartiles relative to the reference group. This suggests relationship category may matter, although the effect was weaker and less consistent than the core legal-process variables. Significant Charge_GroupManslaughter t = -2.83 # significant Charge_GroupNegligent/Reckless t = -2.38 # significant PleaGuilty to Lesser Charge t = -2.91 # significant PleaNot Guilty t = 3.14 # significant Court_TypeState t = -5.66 # significant Accused_RaceWhite t = -4.56 # significant RelationshipDealer/Buyer t = 2.10 # significant-ish

Not Significant Distribution/Possession Drug-Induced Homicide Murder Other charge No Contest Accused_RaceOther RelationshipDoctor/Patient

For the t value in polr(), the usual rule of thumb is:

|t| > 1.96 → approximately significant at p < .05
|t| > 2.58 → approximately significant at p < .01
|t| > 3.29 → approximately significant at p < .001

Full Model with Child

model_data_all_child <- DIH_DATA %>%
  dplyr::select(
    Sentence_Quartile_calc,
    Charge_Group,
    Plea,
    Court_Type,
    Accused_Race,
    child
  ) %>%
  mutate(
    Plea = ifelse(Plea == "Alford Plea", NA, Plea),
    Accused_Race = case_when(
      Accused_Race %in% c("White", "Black") ~ Accused_Race,
      TRUE ~ "Other"
    ),
    Accused_Race = factor(Accused_Race),
    Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
  ) %>%
  filter(complete.cases(.)) %>%
  droplevels()

model_all_child_base <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data_all_child,
  Hess = TRUE
)

model_all_child <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race + child,
  data = model_data_all_child,
  Hess = TRUE
)

summary(model_all_child)

## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race + child, data = model_data_all_child, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                     Value Std. Error  t value
## Charge_GroupDistribution/Possession Causing Death -0.5217     0.7520 -0.69373
## Charge_GroupDrug-Induced Homicide                  0.1429     0.3811  0.37486
## Charge_GroupManslaughter                          -0.6396     0.3850 -1.66129
## Charge_GroupMurder                                 0.6808     0.4059  1.67718
## Charge_GroupNegligent/Reckless                    -0.5401     0.3938 -1.37136
## Charge_GroupOther                                  0.6858     0.3828  1.79167
## PleaGuilty to Lesser Charge                       -0.5397     0.2428 -2.22304
## PleaNo Contest                                    -0.3160     0.2902 -1.08921
## PleaNot Guilty                                     0.8989     0.2555  3.51791
## Court_TypeState                                   -0.8934     0.1734 -5.15298
## Accused_RaceOther                                 -0.3589     0.1687 -2.12684
## Accused_RaceWhite                                 -0.5916     0.1258 -4.70273
## childChild (<18)                                   0.0169     0.1889  0.08947
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -2.3867  0.3740    -6.3818
## 2|3 -1.1776  0.3706    -3.1780
## 3|4  0.0898  0.3693     0.2431
## 
## Residual Deviance: 3983.899 
## AIC: 4015.899

AIC(model_all_child_base, model_all_child)

##                      df      AIC
## model_all_child_base 15 4013.907
## model_all_child      16 4015.899

anova(model_all_child_base, model_all_child)

## Likelihood ratio tests of ordinal regression models
## 
## Response: Sentence_Quartile_calc
##                                                     Model Resid. df Resid. Dev
## 1         Charge_Group + Plea + Court_Type + Accused_Race      1511   3983.907
## 2 Charge_Group + Plea + Court_Type + Accused_Race + child      1510   3983.899
##     Test    Df    LR stat.   Pr(Chi)
## 1                                   
## 2 1 vs 2     1 0.008005908 0.9287038

table(model_data_all_child$child)

## 
## Adult (18+) Child (<18) 
##        1421         105

Adding a child-victim indicator did not improve model fit in the full-dataset sensitivity model and was not associated with sentence quartile after adjustment. Given the small number of child cases in the model sample and the lack of improvement in AIC or likelihood ratio testing, this variable was not retained in the final model.

Back to Logit Analytic Model with Relationship

model_data_core_rel <- dih_analytic %>%
  dplyr::select(
    Sentence_Quartile_calc,
    Charge_Group,
    Plea,
    Court_Type,
    Accused_Race,
    Relationship
  ) %>%
  mutate(
    Plea = ifelse(Plea == "Alford Plea", NA, Plea),
    Accused_Race = case_when(
      Accused_Race %in% c("White", "Black") ~ Accused_Race,
      TRUE ~ "Other"
    ),
    Accused_Race = factor(Accused_Race),
    Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
  ) %>%
  filter(complete.cases(.)) %>%
  droplevels()

model_core2_rel_base <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
  data = model_data_core_rel,
  Hess = TRUE
)

model_core2_rel <- MASS::polr(
  Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race + Relationship,
  data = model_data_core_rel,
  Hess = TRUE
)

summary(model_core2_rel)

## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group + 
##     Plea + Court_Type + Accused_Race + Relationship, data = model_data_core_rel, 
##     Hess = TRUE)
## 
## Coefficients:
##                                                     Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.3549     0.7063 -0.5025
## Charge_GroupDrug-Induced Homicide                 -0.1482     0.3379 -0.4387
## Charge_GroupManslaughter                          -0.9348     0.3463 -2.6993
## Charge_GroupMurder                                 0.2982     0.3659  0.8150
## Charge_GroupNegligent/Reckless                    -0.7804     0.3554 -2.1954
## Charge_GroupOther                                  0.3020     0.3346  0.9028
## PleaGuilty to Lesser Charge                       -0.6619     0.2362 -2.8017
## PleaNo Contest                                    -0.4348     0.3288 -1.3224
## PleaNot Guilty                                     0.7651     0.2508  3.0513
## Court_TypeState                                   -0.9456     0.1825 -5.1806
## Accused_RaceOther                                 -0.1132     0.1777 -0.6369
## Accused_RaceWhite                                 -0.6063     0.1299 -4.6662
## RelationshipDealer/Buyer                           0.1335     0.1175  1.1360
## RelationshipDoctor/Patient                         0.3142     0.3878  0.8102
## 
## Intercepts:
##     Value   Std. Error t value
## 1|2 -2.7100  0.3434    -7.8914
## 2|3 -1.4340  0.3386    -4.2355
## 3|4 -0.1730  0.3359    -0.5152
## 
## Residual Deviance: 3621.323 
## AIC: 3655.323

AIC(model_core2_rel_base, model_core2_rel)

##                      df      AIC
## model_core2_rel_base 15 3652.930
## model_core2_rel      17 3655.323

anova(model_core2_rel_base, model_core2_rel)

## Likelihood ratio tests of ordinal regression models
## 
## Response: Sentence_Quartile_calc
##                                                            Model Resid. df
## 1                Charge_Group + Plea + Court_Type + Accused_Race      1381
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Relationship      1379
##   Resid. Dev   Test    Df LR stat.   Pr(Chi)
## 1   3622.930                                
## 2   3621.323 1 vs 2     2 1.607377 0.4476747

nrow(model_data_core_rel)

## [1] 1396

In the analytic sample, adding Relationship did not improve model fit and the relationship categories were not statistically significant predictors of sentence quartile. Although Relationship improved fit modestly in the full-dataset sensitivity model, it was not retained in the primary analytic model.

Note to self This is interesting and makes me wonder what the difference is between the full dataset and the analytic dataset…

#Linear Model

running a linear model with Sentence_num as the outcome, but plain OLS may be shaky because Sentence_num is highly skewed with big outliers.

OLS on sentence length for easy interpretation
OLS on log sentence length as a robustness check

lm_data_full <- DIH_DATA %>%
  dplyr::select(
    Sentence_num,
    Charge_Group,
    Relationship,
    Urbanicity,
    Accused_Race,
    Accused_Sex,
    Court_Type,
    Plea,
    Substance_Group,
    child
  ) %>%
  mutate(
    Plea = ifelse(Plea == "Alford Plea", NA, Plea),
    Accused_Race = case_when(
      Accused_Race %in% c("White", "Black") ~ Accused_Race,
      TRUE ~ "Other"
    ),
    Accused_Race = factor(Accused_Race)
  ) %>%
  filter(complete.cases(.)) %>%
  droplevels()

##full OLS

lm_full <- lm(
  Sentence_num ~ Charge_Group + Relationship + Urbanicity +
    Accused_Race + Accused_Sex + Court_Type + Plea +
    Substance_Group + child,
  data = lm_data_full
)

summary(lm_full)

## 
## Call:
## lm(formula = Sentence_num ~ Charge_Group + Relationship + Urbanicity + 
##     Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + 
##     child, data = lm_data_full)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.294  -5.756  -2.273   2.840 171.643 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                       13.96682    2.94009   4.750
## Charge_GroupDistribution/Possession Causing Death -1.30623    4.99499  -0.262
## Charge_GroupDrug-Induced Homicide                 -0.41040    2.73009  -0.150
## Charge_GroupManslaughter                          -3.79114    2.77920  -1.364
## Charge_GroupMurder                                 3.13818    2.90087   1.082
## Charge_GroupNegligent/Reckless                    -2.80906    2.83305  -0.992
## Charge_GroupOther                                  1.07790    2.74977   0.392
## RelationshipDealer/Buyer                           0.76682    0.83050   0.923
## RelationshipDoctor/Patient                         7.24098    2.58396   2.802
## UrbanicitySuburban                                 0.07022    1.01722   0.069
## UrbanicityRural                                    1.21458    1.47217   0.825
## Accused_RaceOther                                 -1.53658    1.28789  -1.193
## Accused_RaceWhite                                 -2.24824    0.96984  -2.318
## Accused_SexMale                                    1.21500    0.77854   1.561
## Court_TypeState                                   -2.10031    1.25634  -1.672
## PleaGuilty to Lesser Charge                       -3.33070    1.66830  -1.996
## PleaNo Contest                                    -3.12979    2.12461  -1.473
## PleaNot Guilty                                     7.12540    1.82914   3.896
## Substance_GroupOpioid (non-fentanyl)              -0.43382    0.96691  -0.449
## Substance_GroupOther                               1.45654    1.59582   0.913
## Substance_GroupPolysubstance                      -0.56932    1.10647  -0.515
## Substance_GroupStimulant                           0.02411    1.89557   0.013
## childChild (<18)                                   3.41459    1.46113   2.337
##                                                    Pr(>|t|)    
## (Intercept)                                       0.0000023 ***
## Charge_GroupDistribution/Possession Causing Death  0.793750    
## Charge_GroupDrug-Induced Homicide                  0.880537    
## Charge_GroupManslaughter                           0.172811    
## Charge_GroupMurder                                 0.279575    
## Charge_GroupNegligent/Reckless                     0.321645    
## Charge_GroupOther                                  0.695138    
## RelationshipDealer/Buyer                           0.356041    
## RelationshipDoctor/Patient                         0.005163 ** 
## UrbanicitySuburban                                 0.944974    
## UrbanicityRural                                    0.409536    
## Accused_RaceOther                                  0.233088    
## Accused_RaceWhite                                  0.020623 *  
## Accused_SexMale                                    0.118900    
## Court_TypeState                                    0.094853 .  
## PleaGuilty to Lesser Charge                        0.046129 *  
## PleaNo Contest                                     0.141006    
## PleaNot Guilty                                     0.000104 ***
## Substance_GroupOpioid (non-fentanyl)               0.653757    
## Substance_GroupOther                               0.361589    
## Substance_GroupPolysubstance                       0.606976    
## Substance_GroupStimulant                           0.989852    
## childChild (<18)                                   0.019620 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.22 on 1102 degrees of freedom
## Multiple R-squared:  0.0957, Adjusted R-squared:  0.07765 
## F-statistic: 5.301 on 22 and 1102 DF,  p-value: 0.00000000000005156

##log-sentence model:

lm_data_full <- lm_data_full %>%
  mutate(log_sentence = log(Sentence_num))

lm_full_log <- lm(
  log_sentence ~ Charge_Group + Relationship + Urbanicity +
    Accused_Race + Accused_Sex + Court_Type + Plea +
    Substance_Group + child,
  data = lm_data_full
)

summary(lm_full_log)

## 
## Call:
## lm(formula = log_sentence ~ Charge_Group + Relationship + Urbanicity + 
##     Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + 
##     child, data = lm_data_full)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4569 -0.4568  0.0613  0.5605  3.5886 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                        2.22514    0.24100   9.233
## Charge_GroupDistribution/Possession Causing Death -0.08361    0.40943  -0.204
## Charge_GroupDrug-Induced Homicide                  0.20422    0.22378   0.913
## Charge_GroupManslaughter                          -0.19413    0.22781  -0.852
## Charge_GroupMurder                                 0.49091    0.23778   2.065
## Charge_GroupNegligent/Reckless                    -0.16341    0.23222  -0.704
## Charge_GroupOther                                  0.30933    0.22540   1.372
## RelationshipDealer/Buyer                           0.13902    0.06808   2.042
## RelationshipDoctor/Patient                         0.47673    0.21180   2.251
## UrbanicitySuburban                                 0.01887    0.08338   0.226
## UrbanicityRural                                    0.11915    0.12067   0.987
## Accused_RaceOther                                 -0.11120    0.10557  -1.053
## Accused_RaceWhite                                 -0.24895    0.07950  -3.132
## Accused_SexMale                                    0.14029    0.06382   2.198
## Court_TypeState                                   -0.38672    0.10298  -3.755
## PleaGuilty to Lesser Charge                       -0.35728    0.13675  -2.613
## PleaNo Contest                                    -0.20093    0.17415  -1.154
## PleaNot Guilty                                     0.47306    0.14993   3.155
## Substance_GroupOpioid (non-fentanyl)              -0.10106    0.07926  -1.275
## Substance_GroupOther                              -0.03781    0.13081  -0.289
## Substance_GroupPolysubstance                       0.04251    0.09070   0.469
## Substance_GroupStimulant                           0.18146    0.15538   1.168
## childChild (<18)                                   0.22084    0.11977   1.844
##                                                               Pr(>|t|)    
## (Intercept)                                       < 0.0000000000000002 ***
## Charge_GroupDistribution/Possession Causing Death             0.838225    
## Charge_GroupDrug-Induced Homicide                             0.361655    
## Charge_GroupManslaughter                                      0.394315    
## Charge_GroupMurder                                            0.039198 *  
## Charge_GroupNegligent/Reckless                                0.481792    
## Charge_GroupOther                                             0.170226    
## RelationshipDealer/Buyer                                      0.041379 *  
## RelationshipDoctor/Patient                                    0.024594 *  
## UrbanicitySuburban                                            0.820974    
## UrbanicityRural                                               0.323678    
## Accused_RaceOther                                             0.292391    
## Accused_RaceWhite                                             0.001785 ** 
## Accused_SexMale                                               0.028136 *  
## Court_TypeState                                               0.000182 ***
## PleaGuilty to Lesser Charge                                   0.009107 ** 
## PleaNo Contest                                                0.248856    
## PleaNot Guilty                                                0.001647 ** 
## Substance_GroupOpioid (non-fentanyl)                          0.202563    
## Substance_GroupOther                                          0.772615    
## Substance_GroupPolysubstance                                  0.639406    
## Substance_GroupStimulant                                      0.243115    
## childChild (<18)                                              0.065465 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9196 on 1102 degrees of freedom
## Multiple R-squared:  0.1529, Adjusted R-squared:  0.136 
## F-statistic:  9.04 on 22 and 1102 DF,  p-value: < 0.00000000000000022

#Check Sample Size
nrow(lm_data_full)

## [1] 1125

#Check Diagnostics
par(mfrow = c(2, 2))
plot(lm_full)

par(mfrow = c(2, 2))
plot(lm_full_log)

The raw OLS model has assumption problems: the residuals are highly skewed, the Q-Q plot shows major non-normality, and a few extreme sentences are very influential. So I would not rely on the raw sentence model as the main linear model.

The log-sentence model is better. Its diagnostics look more reasonable, and it explains more variance: Raw sentence model: Adjusted R² = .078 Log sentence model: Adjusted R² = .136

Substantively, the log model suggests higher sentences for:

Murder
Dealer/Buyer
Doctor/Patient
Male defendants
Not Guilty pleas

And lower sentences for:

White defendants
State cases
Guilty to Lesser Charge pleas

child is marginal in the log model (p = .065), so I would not treat it as a clear predictor, but it is worth noting as suggestive.

Because sentence length was highly skewed with extreme outliers, the raw OLS model showed substantial assumption violations. A log-transformed sentence model fit the data better and produced results generally consistent with the ordinal models, particularly for plea, court type, race, and relationship. Therefore, the linear models are best treated as sensitivity analyses rather than the primary specification.

Some predictors appear significant in the log sentence model but not in the sentence quartile model because the log model preserves more information about sentence length, while the quartile model reduces the outcome to four ordered categories. These findings suggest that some variables may affect the magnitude of sentence length without consistently shifting cases into higher quartiles.

Because quartiling collapses continuous sentence values into four categories, it can obscure meaningful within-quartile differences and reduce statistical power. The log model therefore provides a more sensitive test of factors associated with sentence length, while the quartile model can be retained as a robustness check.

Why:

robust to outliers
non-linear categorization
checks whether findings depend on exact sentence magnitudes

###Nested Loglinear Model

lm_no_charge <- update(lm_full_log, . ~ . - Charge_Group)
lm_no_relationship <- update(lm_full_log, . ~ . - Relationship)
lm_no_urban <- update(lm_full_log, . ~ . - Urbanicity)
lm_no_race <- update(lm_full_log, . ~ . - Accused_Race)
lm_no_sex <- update(lm_full_log, . ~ . - Accused_Sex)
lm_no_court <- update(lm_full_log, . ~ . - Court_Type)
lm_no_plea <- update(lm_full_log, . ~ . - Plea)
lm_no_substance <- update(lm_full_log, . ~ . - Substance_Group)
lm_no_child <- update(lm_full_log, . ~ . - child)

AIC(
  lm_full_log,
  lm_no_charge,
  lm_no_relationship,
  lm_no_urban,
  lm_no_race,
  lm_no_sex,
  lm_no_court,
  lm_no_plea,
  lm_no_substance,
  lm_no_child
)

##                    df      AIC
## lm_full_log        24 3028.897
## lm_no_charge       18 3088.240
## lm_no_relationship 22 3032.548
## lm_no_urban        22 3025.908
## lm_no_race         22 3035.815
## lm_no_sex          23 3031.819
## lm_no_court        23 3041.201
## lm_no_plea         21 3042.154
## lm_no_substance    20 3028.288
## lm_no_child        23 3030.362

anova(lm_no_charge, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Relationship + Urbanicity + Accused_Race + Accused_Sex + 
##     Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F             Pr(>F)    
## 1   1108 993.03                                           
## 2   1102 932.01  6    61.019 12.025 0.0000000000004068 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(lm_no_relationship, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Urbanicity + Accused_Race + Accused_Sex + 
##     Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   1104 938.37                              
## 2   1102 932.01  2    6.3605 3.7603 0.02358 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(lm_no_urban, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Accused_Race + Accused_Sex + 
##     Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1104 932.85                           
## 2   1102 932.01  2   0.83796 0.4954 0.6095

anova(lm_no_race, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Sex + 
##     Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)   
## 1   1104 941.10                               
## 2   1102 932.01  2    9.0892 5.3735 0.00476 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(lm_no_sex, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   1103 936.10                              
## 2   1102 932.01  1     4.087 4.8325 0.02814 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(lm_no_court, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   1103 943.94                                  
## 2   1102 932.01  1    11.927 14.102 0.0001822 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(lm_no_plea, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   1105 948.10                                  
## 2   1102 932.01  3    16.091 6.3419 0.0002906 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(lm_no_substance, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1   1106 938.16                          
## 2   1102 932.01  4    6.1437 1.816 0.1234

anova(lm_no_child, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq   F  Pr(>F)  
## 1   1103 934.89                           
## 2   1102 932.01  1    2.8755 3.4 0.06546 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Keep in the model:

Charge_Group — strongest contributor, p < .001
Plea — p < .001
Court_Type — p < .001
Accused_Race — p = .005
Accused_Sex — p = .028
Relationship — p = .024

Do not retain unless theoretically required:

Urbanicity — p = .61
Substance_Group — p = .12
child — marginal, p = .065

AIC tells the same general story. Dropping Urbanicity and Substance_Group slightly improves or barely changes AIC, so they are good candidates to remove. Dropping child also only modestly worsens AIC and is not conventionally significant.

#simplified model
lm_final_log <- lm(
  log_sentence ~ Charge_Group + Relationship + Accused_Race +
    Accused_Sex + Court_Type + Plea,
  data = lm_data_full
)

summary(lm_final_log)

## 
## Call:
## lm(formula = log_sentence ~ Charge_Group + Relationship + Accused_Race + 
##     Accused_Sex + Court_Type + Plea, data = lm_data_full)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5232 -0.4506  0.0602  0.5662  3.5260 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                        2.24514    0.23265   9.650
## Charge_GroupDistribution/Possession Causing Death -0.10962    0.40931  -0.268
## Charge_GroupDrug-Induced Homicide                  0.18322    0.22350   0.820
## Charge_GroupManslaughter                          -0.19477    0.22736  -0.857
## Charge_GroupMurder                                 0.49697    0.23729   2.094
## Charge_GroupNegligent/Reckless                    -0.18811    0.23188  -0.811
## Charge_GroupOther                                  0.30112    0.22481   1.339
## RelationshipDealer/Buyer                           0.11102    0.06569   1.690
## RelationshipDoctor/Patient                         0.41970    0.20996   1.999
## Accused_RaceOther                                 -0.12014    0.10496  -1.145
## Accused_RaceWhite                                 -0.24841    0.07890  -3.149
## Accused_SexMale                                    0.13860    0.06390   2.169
## Court_TypeState                                   -0.39130    0.10297  -3.800
## PleaGuilty to Lesser Charge                       -0.31520    0.13599  -2.318
## PleaNo Contest                                    -0.17085    0.17381  -0.983
## PleaNot Guilty                                     0.50398    0.14897   3.383
##                                                               Pr(>|t|)    
## (Intercept)                                       < 0.0000000000000002 ***
## Charge_GroupDistribution/Possession Causing Death             0.788896    
## Charge_GroupDrug-Induced Homicide                             0.412524    
## Charge_GroupManslaughter                                      0.391823    
## Charge_GroupMurder                                            0.036457 *  
## Charge_GroupNegligent/Reckless                                0.417411    
## Charge_GroupOther                                             0.180701    
## RelationshipDealer/Buyer                                      0.091283 .  
## RelationshipDoctor/Patient                                    0.045861 *  
## Accused_RaceOther                                             0.252583    
## Accused_RaceWhite                                             0.001685 ** 
## Accused_SexMale                                               0.030295 *  
## Court_TypeState                                               0.000152 ***
## PleaGuilty to Lesser Charge                                   0.020638 *  
## PleaNo Contest                                                0.325846    
## PleaNot Guilty                                                0.000742 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9216 on 1109 degrees of freedom
## Multiple R-squared:  0.1438, Adjusted R-squared:  0.1323 
## F-statistic: 12.42 on 15 and 1109 DF,  p-value: < 0.00000000000000022

AIC(lm_full_log, lm_final_log)

##              df      AIC
## lm_full_log  24 3028.897
## lm_final_log 17 3026.825

anova(lm_final_log, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Accused_Race + Accused_Sex + 
##     Court_Type + Plea
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1109 941.95                           
## 2   1102 932.01  7    9.9346 1.6781 0.1105

Interpretation:

The simplified log model fits slightly better than the full model by AIC:
- full: 3028.90
- final: 3026.83
The ANOVA comparing the simplified model to the full model is not significant:
- p = .1105

A simplified log-linear model excluding Urbanicity, Substance_Group, and child provided a slightly better AIC than the full model and did not significantly worsen model fit in the nested model comparison. The final model retained Charge_Group, Relationship, Accused_Race, Accused_Sex, Court_Type, and Plea.

Modeling Rationale: After estimating the full log-linear model, predictors were evaluated using drop-one-variable comparisons. Variables that did not meaningfully improve fit were removed together, and the reduced model was compared against the full model. The reduced model had a slightly lower AIC and did not significantly worsen fit, supporting retention of the more parsimonious specification.

Can do this all on anlaytic sampel if we would like

###Colinearity Tests

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

car::vif(lm_final_log)

##                  GVIF Df GVIF^(1/(2*Df))
## Charge_Group 1.542000  6        1.036749
## Relationship 1.190607  2        1.044581
## Accused_Race 1.191073  2        1.044683
## Accused_Sex  1.100737  1        1.049160
## Court_Type   1.425678  1        1.194018
## Plea         1.105746  3        1.016894

vif_vals <- car::vif(lm_final_log)
vif_vals

##                  GVIF Df GVIF^(1/(2*Df))
## Charge_Group 1.542000  6        1.036749
## Relationship 1.190607  2        1.044581
## Accused_Race 1.191073  2        1.044683
## Accused_Sex  1.100737  1        1.049160
## Court_Type   1.425678  1        1.194018
## Plea         1.105746  3        1.016894

vif_vals[, "GVIF^(1/(2*Df))"]

## Charge_Group Relationship Accused_Race  Accused_Sex   Court_Type         Plea 
##     1.036749     1.044581     1.044683     1.049160     1.194018     1.016894

Rule of thumb:

< 2 = fine
2–5 = watch
5 = possible concern

Multicollinearity was assessed using generalized variance inflation factors. Adjusted GVIF values were all close to 1, indicating no evidence of problematic collinearity among predictors in the final log-linear model.

###Model Diagnostics

par(mfrow = c(2, 2))
plot(lm_final_log)

Residuals vs Fitted What we want:

no strong curve
residuals roughly centered around 0
no dramatic funnel shape

we have:

mild heteroskedasticity still present
but much improved compared with the raw sentence model
no major nonlinear pattern

This is the weakest assumption area.

The tails deviate from the line, especially lower quantiles.

That means:

residuals are not perfectly normal
but this is extremely common in sentencing/criminal justice data

OLS does not require perfectly normal residuals for unbiased coefficients. With n > 1000, inference is usually fairly robust.

The key question is whether the model is “good enough,” not perfect.

And compared with the raw model:

this is substantially better
the severe skewness was reduced a lot

That supports the log transformation.

Scale-Location

This checks homoskedasticity.

we still have:

slight downward trend
mild unequal variance

But again:

much better than the raw sentence model
not catastrophic

to strengthen the analysis further, we can use robust standard errors.

Residuals vs Leverage We have:

a few moderate leverage points
but nothing screaming catastrophic influence

No obvious Cook’s distance disaster.

Overall Assessment Strengths:

log transformation materially improved assumptions
no collinearity
residual patterns are acceptable
model fit improved
theoretically interpretable
retains continuous sentence information

Weaknesses (which are normal and reportable):

residuals are not perfectly normal
mild heteroskedasticity remains
sentencing data inherently contain outliers/skew

Narrative: Because sentence length was highly skewed, analyses were conducted using the natural log of sentence length. Diagnostic plots indicated that the log transformation substantially improved model assumptions, reducing skewness and heteroskedasticity relative to models using raw sentence length. Residual deviations from normality remained modest but were considered acceptable given the large sample size and robustness of linear regression estimates.

###Robust Standard Errors

library(sandwich)
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

coeftest(
  lm_final_log,
  vcov = vcovHC(lm_final_log, type = "HC3")
)

## 
## t test of coefficients:
## 
##                                                    Estimate Std. Error t value
## (Intercept)                                        2.245144   0.294277  7.6294
## Charge_GroupDistribution/Possession Causing Death -0.109618   0.530832 -0.2065
## Charge_GroupDrug-Induced Homicide                  0.183217   0.318445  0.5753
## Charge_GroupManslaughter                          -0.194770   0.323092 -0.6028
## Charge_GroupMurder                                 0.496971   0.328255  1.5140
## Charge_GroupNegligent/Reckless                    -0.188106   0.325981 -0.5770
## Charge_GroupOther                                  0.301125   0.308827  0.9751
## RelationshipDealer/Buyer                           0.111017   0.070158  1.5824
## RelationshipDoctor/Patient                         0.419699   0.199195  2.1070
## Accused_RaceOther                                 -0.120144   0.097332 -1.2344
## Accused_RaceWhite                                 -0.248411   0.072323 -3.4348
## Accused_SexMale                                    0.138596   0.068523  2.0226
## Court_TypeState                                   -0.391304   0.095821 -4.0837
## PleaGuilty to Lesser Charge                       -0.315203   0.134271 -2.3475
## PleaNo Contest                                    -0.170848   0.158813 -1.0758
## PleaNot Guilty                                     0.503975   0.136168  3.7011
##                                                              Pr(>|t|)    
## (Intercept)                                       0.00000000000005062 ***
## Charge_GroupDistribution/Possession Causing Death           0.8364359    
## Charge_GroupDrug-Induced Homicide                           0.5651718    
## Charge_GroupManslaughter                                    0.5467432    
## Charge_GroupMurder                                          0.1303166    
## Charge_GroupNegligent/Reckless                              0.5640249    
## Charge_GroupOther                                           0.3297425    
## RelationshipDealer/Buyer                                    0.1138459    
## RelationshipDoctor/Patient                                  0.0353440 *  
## Accused_RaceOther                                           0.2173269    
## Accused_RaceWhite                                           0.0006150 ***
## Accused_SexMale                                             0.0433506 *  
## Court_TypeState                                   0.00004750706714125 ***
## PleaGuilty to Lesser Charge                                 0.0190738 *  
## PleaNo Contest                                              0.2822593    
## PleaNot Guilty                                              0.0002252 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

RelationshipDoctor/Patient p = .035 Accused_RaceWhite p < .001 Accused_SexMale p = .043 Court_TypeState p < .001 PleaGuilty to Lesser Charge p = .019 PleaNot Guilty p < .001

No longer significant: Charge_GroupMurder RelationshipDealer/Buyer

Summary: After using heteroskedasticity-robust standard errors, the associations for plea, court type, accused race, accused sex, and doctor/patient relationship remained statistically significant. Some charge-group and dealer/buyer relationship effects were attenuated, suggesting those findings are less robust.

###Correcting Reference Groups & Re-running everything

coef_table <- data.frame(
  term = names(coef(lm_final_log)),
  estimate = coef(lm_final_log),
  percent_change = (exp(coef(lm_final_log)) - 1) * 100
)

coef_table

##                                                                                                term
## (Intercept)                                                                             (Intercept)
## Charge_GroupDistribution/Possession Causing Death Charge_GroupDistribution/Possession Causing Death
## Charge_GroupDrug-Induced Homicide                                 Charge_GroupDrug-Induced Homicide
## Charge_GroupManslaughter                                                   Charge_GroupManslaughter
## Charge_GroupMurder                                                               Charge_GroupMurder
## Charge_GroupNegligent/Reckless                                       Charge_GroupNegligent/Reckless
## Charge_GroupOther                                                                 Charge_GroupOther
## RelationshipDealer/Buyer                                                   RelationshipDealer/Buyer
## RelationshipDoctor/Patient                                               RelationshipDoctor/Patient
## Accused_RaceOther                                                                 Accused_RaceOther
## Accused_RaceWhite                                                                 Accused_RaceWhite
## Accused_SexMale                                                                     Accused_SexMale
## Court_TypeState                                                                     Court_TypeState
## PleaGuilty to Lesser Charge                                             PleaGuilty to Lesser Charge
## PleaNo Contest                                                                       PleaNo Contest
## PleaNot Guilty                                                                       PleaNot Guilty
##                                                     estimate percent_change
## (Intercept)                                        2.2451438      844.17732
## Charge_GroupDistribution/Possession Causing Death -0.1096183      -10.38239
## Charge_GroupDrug-Induced Homicide                  0.1832172       20.10753
## Charge_GroupManslaughter                          -0.1947702      -17.69762
## Charge_GroupMurder                                 0.4969711       64.37351
## Charge_GroupNegligent/Reckless                    -0.1881061      -17.14732
## Charge_GroupOther                                  0.3011248       35.13780
## RelationshipDealer/Buyer                           0.1110171       11.74140
## RelationshipDoctor/Patient                         0.4196992       52.15038
## Accused_RaceOther                                 -0.1201441      -11.32073
## Accused_RaceWhite                                 -0.2484112      -21.99609
## Accused_SexMale                                    0.1385962       14.86602
## Court_TypeState                                   -0.3913044      -32.38257
## PleaGuilty to Lesser Charge                       -0.3152035      -27.03596
## PleaNo Contest                                    -0.1708477      -15.70501
## PleaNot Guilty                                     0.5039755       65.52888

(exp(0.4197) - 1) * 100   # Doctor/Patient

## [1] 52.1505

(exp(-0.2484) - 1) * 100  # White

## [1] -21.99521

(exp(0.1386) - 1) * 100   # Male

## [1] 14.86645

(exp(-0.3913) - 1) * 100  # State

## [1] -32.38227

(exp(-0.3152) - 1) * 100  # Guilty to lesser charge

## [1] -27.03571

(exp(0.5040) - 1) * 100   # Not guilty

## [1] 65.53294

#Checking Reference Groups
levels(lm_data_full$Relationship)

## NULL

levels(lm_data_full$Accused_Race)

## [1] "Black" "Other" "White"

levels(lm_data_full$Accused_Sex)

## NULL

levels(lm_data_full$Court_Type)

## NULL

levels(lm_data_full$Plea)

## NULL

levels(lm_data_full$Charge_Group)

## NULL

unique(lm_data_full$Relationship)

## [1] "Caretaker/Family/Friend/Partner/Co-User"
## [2] "Dealer/Buyer"                           
## [3] "Doctor/Patient"

unique(lm_data_full$Accused_Sex)

## [1] "Female" "Male"

unique(lm_data_full$Court_Type)

## [1] "State"   "Federal"

unique(lm_data_full$Plea)

## [1] "Guilty"                  "No Contest"             
## [3] "Guilty to Lesser Charge" "Not Guilty"

unique(lm_data_full$Charge_Group)

## [1] "Manslaughter"                         
## [2] "Drug-Induced Homicide"                
## [3] "Conspiracy"                           
## [4] "Murder"                               
## [5] "Other"                                
## [6] "Negligent/Reckless"                   
## [7] "Distribution/Possession Causing Death"

#explicitly convert and set reference groups:
lm_data_full <- lm_data_full %>%
  mutate(
    Relationship = factor(Relationship),
    Accused_Sex = factor(Accused_Sex),
    Court_Type = factor(Court_Type),
    Plea = factor(Plea),
    Charge_Group = factor(Charge_Group)
  )

levels(lm_data_full$Relationship)

## [1] "Caretaker/Family/Friend/Partner/Co-User"
## [2] "Dealer/Buyer"                           
## [3] "Doctor/Patient"

levels(lm_data_full$Accused_Sex)

## [1] "Female" "Male"

levels(lm_data_full$Court_Type)

## [1] "Federal" "State"

levels(lm_data_full$Plea)

## [1] "Guilty"                  "Guilty to Lesser Charge"
## [3] "No Contest"              "Not Guilty"

levels(lm_data_full$Charge_Group)

## [1] "Conspiracy"                           
## [2] "Distribution/Possession Causing Death"
## [3] "Drug-Induced Homicide"                
## [4] "Manslaughter"                         
## [5] "Murder"                               
## [6] "Negligent/Reckless"                   
## [7] "Other"

NOW I need to rerun the final model again with the correct reference categories…

lm_final_log <- lm(
  log_sentence ~ 
    Charge_Group +
    Relationship +
    Accused_Race +
    Accused_Sex +
    Court_Type +
    Plea,
  data = lm_data_full
)

summary(lm_final_log)

## 
## Call:
## lm(formula = log_sentence ~ Charge_Group + Relationship + Accused_Race + 
##     Accused_Sex + Court_Type + Plea, data = lm_data_full)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5232 -0.4506  0.0602  0.5662  3.5260 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                        2.24514    0.23265   9.650
## Charge_GroupDistribution/Possession Causing Death -0.10962    0.40931  -0.268
## Charge_GroupDrug-Induced Homicide                  0.18322    0.22350   0.820
## Charge_GroupManslaughter                          -0.19477    0.22736  -0.857
## Charge_GroupMurder                                 0.49697    0.23729   2.094
## Charge_GroupNegligent/Reckless                    -0.18811    0.23188  -0.811
## Charge_GroupOther                                  0.30112    0.22481   1.339
## RelationshipDealer/Buyer                           0.11102    0.06569   1.690
## RelationshipDoctor/Patient                         0.41970    0.20996   1.999
## Accused_RaceOther                                 -0.12014    0.10496  -1.145
## Accused_RaceWhite                                 -0.24841    0.07890  -3.149
## Accused_SexMale                                    0.13860    0.06390   2.169
## Court_TypeState                                   -0.39130    0.10297  -3.800
## PleaGuilty to Lesser Charge                       -0.31520    0.13599  -2.318
## PleaNo Contest                                    -0.17085    0.17381  -0.983
## PleaNot Guilty                                     0.50398    0.14897   3.383
##                                                               Pr(>|t|)    
## (Intercept)                                       < 0.0000000000000002 ***
## Charge_GroupDistribution/Possession Causing Death             0.788896    
## Charge_GroupDrug-Induced Homicide                             0.412524    
## Charge_GroupManslaughter                                      0.391823    
## Charge_GroupMurder                                            0.036457 *  
## Charge_GroupNegligent/Reckless                                0.417411    
## Charge_GroupOther                                             0.180701    
## RelationshipDealer/Buyer                                      0.091283 .  
## RelationshipDoctor/Patient                                    0.045861 *  
## Accused_RaceOther                                             0.252583    
## Accused_RaceWhite                                             0.001685 ** 
## Accused_SexMale                                               0.030295 *  
## Court_TypeState                                               0.000152 ***
## PleaGuilty to Lesser Charge                                   0.020638 *  
## PleaNo Contest                                                0.325846    
## PleaNot Guilty                                                0.000742 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9216 on 1109 degrees of freedom
## Multiple R-squared:  0.1438, Adjusted R-squared:  0.1323 
## F-statistic: 12.42 on 15 and 1109 DF,  p-value: < 0.00000000000000022

Robust standard errors

library(lmtest)
library(sandwich)

coeftest(
  lm_final_log,
  vcov = vcovHC(lm_final_log, type = "HC3")
)

## 
## t test of coefficients:
## 
##                                                    Estimate Std. Error t value
## (Intercept)                                        2.245144   0.294277  7.6294
## Charge_GroupDistribution/Possession Causing Death -0.109618   0.530832 -0.2065
## Charge_GroupDrug-Induced Homicide                  0.183217   0.318445  0.5753
## Charge_GroupManslaughter                          -0.194770   0.323092 -0.6028
## Charge_GroupMurder                                 0.496971   0.328255  1.5140
## Charge_GroupNegligent/Reckless                    -0.188106   0.325981 -0.5770
## Charge_GroupOther                                  0.301125   0.308827  0.9751
## RelationshipDealer/Buyer                           0.111017   0.070158  1.5824
## RelationshipDoctor/Patient                         0.419699   0.199195  2.1070
## Accused_RaceOther                                 -0.120144   0.097332 -1.2344
## Accused_RaceWhite                                 -0.248411   0.072323 -3.4348
## Accused_SexMale                                    0.138596   0.068523  2.0226
## Court_TypeState                                   -0.391304   0.095821 -4.0837
## PleaGuilty to Lesser Charge                       -0.315203   0.134271 -2.3475
## PleaNo Contest                                    -0.170848   0.158813 -1.0758
## PleaNot Guilty                                     0.503975   0.136168  3.7011
##                                                              Pr(>|t|)    
## (Intercept)                                       0.00000000000005062 ***
## Charge_GroupDistribution/Possession Causing Death           0.8364359    
## Charge_GroupDrug-Induced Homicide                           0.5651718    
## Charge_GroupManslaughter                                    0.5467432    
## Charge_GroupMurder                                          0.1303166    
## Charge_GroupNegligent/Reckless                              0.5640249    
## Charge_GroupOther                                           0.3297425    
## RelationshipDealer/Buyer                                    0.1138459    
## RelationshipDoctor/Patient                                  0.0353440 *  
## Accused_RaceOther                                           0.2173269    
## Accused_RaceWhite                                           0.0006150 ***
## Accused_SexMale                                             0.0433506 *  
## Court_TypeState                                   0.00004750706714125 ***
## PleaGuilty to Lesser Charge                                 0.0190738 *  
## PleaNo Contest                                              0.2822593    
## PleaNot Guilty                                              0.0002252 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

####VIF / multicollinearity

library(car)

car::vif(lm_final_log)

##                  GVIF Df GVIF^(1/(2*Df))
## Charge_Group 1.542000  6        1.036749
## Relationship 1.190607  2        1.044581
## Accused_Race 1.191073  2        1.044683
## Accused_Sex  1.100737  1        1.049160
## Court_Type   1.425678  1        1.194018
## Plea         1.105746  3        1.016894

vif_vals <- car::vif(lm_final_log)

vif_vals[, "GVIF^(1/(2*Df))"]

## Charge_Group Relationship Accused_Race  Accused_Sex   Court_Type         Plea 
##     1.036749     1.044581     1.044683     1.049160     1.194018     1.016894

####Diagnostics Plot

par(mfrow = c(2,2))
plot(lm_final_log)

###AIC comparison with the larger/full model

AIC(lm_full_log, lm_final_log)

##              df      AIC
## lm_full_log  24 3028.897
## lm_final_log 17 3026.825

###Nested model comparison

anova(lm_final_log, lm_full_log)

## Analysis of Variance Table
## 
## Model 1: log_sentence ~ Charge_Group + Relationship + Accused_Race + Accused_Sex + 
##     Court_Type + Plea
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race + 
##     Accused_Sex + Court_Type + Plea + Substance_Group + child
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1109 941.95                           
## 2   1102 932.01  7    9.9346 1.6781 0.1105

###Percent-change interpretation table

coef_table <- data.frame(
  term = names(coef(lm_final_log)),
  estimate = coef(lm_final_log),
  percent_change = (exp(coef(lm_final_log)) - 1) * 100
)

coef_table

##                                                                                                term
## (Intercept)                                                                             (Intercept)
## Charge_GroupDistribution/Possession Causing Death Charge_GroupDistribution/Possession Causing Death
## Charge_GroupDrug-Induced Homicide                                 Charge_GroupDrug-Induced Homicide
## Charge_GroupManslaughter                                                   Charge_GroupManslaughter
## Charge_GroupMurder                                                               Charge_GroupMurder
## Charge_GroupNegligent/Reckless                                       Charge_GroupNegligent/Reckless
## Charge_GroupOther                                                                 Charge_GroupOther
## RelationshipDealer/Buyer                                                   RelationshipDealer/Buyer
## RelationshipDoctor/Patient                                               RelationshipDoctor/Patient
## Accused_RaceOther                                                                 Accused_RaceOther
## Accused_RaceWhite                                                                 Accused_RaceWhite
## Accused_SexMale                                                                     Accused_SexMale
## Court_TypeState                                                                     Court_TypeState
## PleaGuilty to Lesser Charge                                             PleaGuilty to Lesser Charge
## PleaNo Contest                                                                       PleaNo Contest
## PleaNot Guilty                                                                       PleaNot Guilty
##                                                     estimate percent_change
## (Intercept)                                        2.2451438      844.17732
## Charge_GroupDistribution/Possession Causing Death -0.1096183      -10.38239
## Charge_GroupDrug-Induced Homicide                  0.1832172       20.10753
## Charge_GroupManslaughter                          -0.1947702      -17.69762
## Charge_GroupMurder                                 0.4969711       64.37351
## Charge_GroupNegligent/Reckless                    -0.1881061      -17.14732
## Charge_GroupOther                                  0.3011248       35.13780
## RelationshipDealer/Buyer                           0.1110171       11.74140
## RelationshipDoctor/Patient                         0.4196992       52.15038
## Accused_RaceOther                                 -0.1201441      -11.32073
## Accused_RaceWhite                                 -0.2484112      -21.99609
## Accused_SexMale                                    0.1385962       14.86602
## Court_TypeState                                   -0.3913044      -32.38257
## PleaGuilty to Lesser Charge                       -0.3152035      -27.03596
## PleaNo Contest                                    -0.1708477      -15.70501
## PleaNot Guilty                                     0.5039755       65.52888

###confidence intervals

confint(lm_final_log)

##                                                         2.5 %      97.5 %
## (Intercept)                                        1.78865806  2.70162955
## Charge_GroupDistribution/Possession Causing Death -0.91273621  0.69349957
## Charge_GroupDrug-Induced Homicide                 -0.25531088  0.62174528
## Charge_GroupManslaughter                          -0.64087949  0.25133919
## Charge_GroupMurder                                 0.03137742  0.96256487
## Charge_GroupNegligent/Reckless                    -0.64307802  0.26686581
## Charge_GroupOther                                 -0.13998336  0.74223295
## RelationshipDealer/Buyer                          -0.01786457  0.23989871
## RelationshipDoctor/Patient                         0.00772919  0.83166921
## Accused_RaceOther                                 -0.32608143  0.08579329
## Accused_RaceWhite                                 -0.40321562 -0.09360686
## Accused_SexMale                                    0.01321998  0.26397245
## Court_TypeState                                   -0.59334216 -0.18926669
## PleaGuilty to Lesser Charge                       -0.58202866 -0.04837829
## PleaNo Contest                                    -0.51188347  0.17018802
## PleaNot Guilty                                     0.21168700  0.79626399

exponentiated

exp(confint(lm_final_log))

##                                                       2.5 %     97.5 %
## (Intercept)                                       5.9814204 14.9039988
## Charge_GroupDistribution/Possession Causing Death 0.4014243  2.0007049
## Charge_GroupDrug-Induced Homicide                 0.7746756  1.8621752
## Charge_GroupManslaughter                          0.5268289  1.2857461
## Charge_GroupMurder                                1.0318749  2.6184037
## Charge_GroupNegligent/Reckless                    0.5256719  1.3058652
## Charge_GroupOther                                 0.8693727  2.1006209
## RelationshipDealer/Buyer                          0.9822941  1.2711204
## RelationshipDoctor/Patient                        1.0077591  2.2971500
## Accused_RaceOther                                 0.7217464  1.0895811
## Accused_RaceWhite                                 0.6681680  0.9106407
## Accused_SexMale                                   1.0133077  1.3020923
## Court_TypeState                                   0.5524777  0.8275658
## PleaGuilty to Lesser Charge                       0.5587637  0.9527733
## PleaNo Contest                                    0.5993656  1.1855277
## PleaNot Guilty                                    1.2357610  2.2172418

###Clean Model Tables & Interpretion

library(broom)

tidy(lm_final_log, conf.int = TRUE)

## # A tibble: 16 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)             2.25     0.233      9.65  3.25e-21  1.79       2.70  
##  2 Charge_GroupDistrib…   -0.110    0.409     -0.268 7.89e- 1 -0.913      0.693 
##  3 Charge_GroupDrug-In…    0.183    0.223      0.820 4.13e- 1 -0.255      0.622 
##  4 Charge_GroupManslau…   -0.195    0.227     -0.857 3.92e- 1 -0.641      0.251 
##  5 Charge_GroupMurder      0.497    0.237      2.09  3.65e- 2  0.0314     0.963 
##  6 Charge_GroupNeglige…   -0.188    0.232     -0.811 4.17e- 1 -0.643      0.267 
##  7 Charge_GroupOther       0.301    0.225      1.34  1.81e- 1 -0.140      0.742 
##  8 RelationshipDealer/…    0.111    0.0657     1.69  9.13e- 2 -0.0179     0.240 
##  9 RelationshipDoctor/…    0.420    0.210      2.00  4.59e- 2  0.00773    0.832 
## 10 Accused_RaceOther      -0.120    0.105     -1.14  2.53e- 1 -0.326      0.0858
## 11 Accused_RaceWhite      -0.248    0.0789    -3.15  1.68e- 3 -0.403     -0.0936
## 12 Accused_SexMale         0.139    0.0639     2.17  3.03e- 2  0.0132     0.264 
## 13 Court_TypeState        -0.391    0.103     -3.80  1.52e- 4 -0.593     -0.189 
## 14 PleaGuilty to Lesse…   -0.315    0.136     -2.32  2.06e- 2 -0.582     -0.0484
## 15 PleaNo Contest         -0.171    0.174     -0.983 3.26e- 1 -0.512      0.170 
## 16 PleaNot Guilty          0.504    0.149      3.38  7.42e- 4  0.212      0.796

with robust SE

library(broom)

robust_results <- broom::tidy(
  coeftest(
    lm_final_log,
    vcov = vcovHC(lm_final_log, type = "HC3")
  )
)

robust_results

## # A tibble: 16 × 5
##    term                                    estimate std.error statistic  p.value
##    <chr>                                      <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                                2.25     0.294      7.63  5.06e-14
##  2 Charge_GroupDistribution/Possession Ca…   -0.110    0.531     -0.207 8.36e- 1
##  3 Charge_GroupDrug-Induced Homicide          0.183    0.318      0.575 5.65e- 1
##  4 Charge_GroupManslaughter                  -0.195    0.323     -0.603 5.47e- 1
##  5 Charge_GroupMurder                         0.497    0.328      1.51  1.30e- 1
##  6 Charge_GroupNegligent/Reckless            -0.188    0.326     -0.577 5.64e- 1
##  7 Charge_GroupOther                          0.301    0.309      0.975 3.30e- 1
##  8 RelationshipDealer/Buyer                   0.111    0.0702     1.58  1.14e- 1
##  9 RelationshipDoctor/Patient                 0.420    0.199      2.11  3.53e- 2
## 10 Accused_RaceOther                         -0.120    0.0973    -1.23  2.17e- 1
## 11 Accused_RaceWhite                         -0.248    0.0723    -3.43  6.15e- 4
## 12 Accused_SexMale                            0.139    0.0685     2.02  4.34e- 2
## 13 Court_TypeState                           -0.391    0.0958    -4.08  4.75e- 5
## 14 PleaGuilty to Lesser Charge               -0.315    0.134     -2.35  1.91e- 2
## 15 PleaNo Contest                            -0.171    0.159     -1.08  2.82e- 1
## 16 PleaNot Guilty                             0.504    0.136      3.70  2.25e- 4

The final log-linear model indicated that several predictors were associated with sentence length after adjustment. Compared with caretaker/family/friend/partner/co-user cases, doctor/patient cases were associated with approximately 52% longer sentences (β = 0.420, p = .035). Male defendants received approximately 15% longer sentences than female defendants (β = 0.139, p = .043). White defendants received approximately 22% shorter sentences than Black defendants (β = -0.248, p < .001). State court cases were associated with approximately 32% shorter sentences than federal cases (β = -0.391, p < .001). Compared with guilty pleas, guilty pleas to a lesser charge were associated with approximately 27% shorter sentences (β = -0.315, p = .019), while not guilty pleas were associated with approximately 66% longer sentences (β = 0.504, p < .001).

Because the dependent variable was log-transformed, exponentiated coefficients can be interpreted as multiplicative changes in sentence length. Confidence intervals that do not cross 1 indicate statistically significant associations. In the final model, doctor/patient cases were associated with longer sentences than caretaker/family/friend/partner/co-user cases, with sentences estimated to be between 1.01 and 2.30 times as long. White defendants received shorter sentences than Black defendants, with sentences estimated to be about 0.67 to 0.91 times as long, or roughly 9% to 33% shorter. Male defendants received longer sentences than female defendants, with sentences estimated to be 1.01 to 1.30 times as long. State court cases received shorter sentences than federal cases, with sentences estimated to be 0.55 to 0.83 times as long, or about 17% to 45% shorter. Guilty pleas to a lesser charge were associated with shorter sentences than guilty pleas, with sentences estimated to be 0.56 to 0.95 times as long, while not guilty pleas were associated with longer sentences, with sentences estimated to be 1.24 to 2.22 times as long.

Model Diagnostics:

Variance inflation factors (VIFs) indicated no evidence of problematic multicollinearity among predictors. All adjusted GVIF values were well below conventional thresholds for concern (all GVIF^(1/(2*Df)) < 1.2), suggesting the included predictors were not highly correlated.

Diagnostic plots suggested acceptable model fit following log transformation of sentence length. Residual variance was substantially improved compared with the untransformed model, though minor deviations from normality remained in the tails. Robust standard errors were therefore used to account for potential heteroskedasticity. No evidence of problematic multicollinearity or influential observations was detected.

Model Selection: The simplified log-linear model had a slightly lower AIC than the larger model (3026.83 vs. 3028.90), indicating a modest improvement in parsimony. A nested model comparison showed that removing Urbanicity, Substance_Group, and child did not significantly worsen model fit (F = 1.68, p = .111). Therefore, the reduced model was retained as the final specification.

library(modelsummary)

modelsummary(
  lm_final_log,
  exponentiate = TRUE
)

	(1)
(Intercept)	9.442
	(2.197)
Charge_GroupDistribution/Possession Causing Death	0.896
	(0.367)
Charge_GroupDrug-Induced Homicide	1.201
	(0.268)
Charge_GroupManslaughter	0.823
	(0.187)
Charge_GroupMurder	1.644
	(0.390)
Charge_GroupNegligent/Reckless	0.829
	(0.192)
Charge_GroupOther	1.351
	(0.304)
RelationshipDealer/Buyer	1.117
	(0.073)
RelationshipDoctor/Patient	1.522
	(0.319)
Accused_RaceOther	0.887
	(0.093)
Accused_RaceWhite	0.780
	(0.062)
Accused_SexMale	1.149
	(0.073)
Court_TypeState	0.676
	(0.070)
PleaGuilty to Lesser Charge	0.730
	(0.099)
PleaNo Contest	0.843
	(0.147)
PleaNot Guilty	1.655
	(0.247)
Num.Obs.	1125
R2	0.144
R2 Adj.	0.132
AIC	3026.8
BIC	3112.3
Log.Lik.	-1496.412
F	12.422
RMSE	0.92

modelsummary(
  lm_final_log,
  vcov = "HC3",
  exponentiate = TRUE
)

	(1)
(Intercept)	9.442
	(2.778)
Charge_GroupDistribution/Possession Causing Death	0.896
	(0.476)
Charge_GroupDrug-Induced Homicide	1.201
	(0.382)
Charge_GroupManslaughter	0.823
	(0.266)
Charge_GroupMurder	1.644
	(0.540)
Charge_GroupNegligent/Reckless	0.829
	(0.270)
Charge_GroupOther	1.351
	(0.417)
RelationshipDealer/Buyer	1.117
	(0.078)
RelationshipDoctor/Patient	1.522
	(0.303)
Accused_RaceOther	0.887
	(0.086)
Accused_RaceWhite	0.780
	(0.056)
Accused_SexMale	1.149
	(0.079)
Court_TypeState	0.676
	(0.065)
PleaGuilty to Lesser Charge	0.730
	(0.098)
PleaNo Contest	0.843
	(0.134)
PleaNot Guilty	1.655
	(0.225)
Num.Obs.	1125
R2	0.144
R2 Adj.	0.132
AIC	3026.8
BIC	3112.3
Log.Lik.	-1496.412
F	15.759
RMSE	0.92
Std.Errors	HC3

library(pandoc)
modelsummary(
  lm_final_log,
  vcov = "HC3",
  exponentiate = TRUE,
  output = "table1.docx"
)

#Other Relationships

Sex

“Do cases involving women tend to occur when the deceased is younger?”

wilcox.test(Deceased_Age ~ Accused_Sex, data = DIH_DATA)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Deceased_Age by Accused_Sex
## W = 761528, p-value = 0.06442
## alternative hypothesis: true location shift is not equal to 0

DIH_DATA %>%
  group_by(Accused_Sex) %>%
  summarise(
    n = sum(!is.na(Deceased_Age)),
    mean_age = mean(Deceased_Age, na.rm = TRUE),
    median_age = median(Deceased_Age, na.rm = TRUE)
  )

## # A tibble: 3 × 4
##   Accused_Sex     n mean_age median_age
##   <chr>       <int>    <dbl>      <dbl>
## 1 Female        741     29.8         30
## 2 Male         1965     29.6         28
## 3 <NA>           12     24           23

ggplot(DIH_DATA, aes(x = Accused_Sex, y = Deceased_Age, fill = Accused_Sex)) +
  geom_boxplot() +
  guides(fill = "none")

## Warning: Removed 546 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

“Are younger victims more likely to involve female defendants?”

glm(I(Accused_Sex == "Female") ~ Deceased_Age,
    data = DIH_DATA,
    family = binomial)

## 
## Call:  glm(formula = I(Accused_Sex == "Female") ~ Deceased_Age, family = binomial, 
##     data = DIH_DATA)
## 
## Coefficients:
##  (Intercept)  Deceased_Age  
##    -1.011494      0.001221  
## 
## Degrees of Freedom: 2705 Total (i.e. Null);  2704 Residual
##   (558 observations deleted due to missingness)
## Null Deviance:       3177 
## Residual Deviance: 3177  AIC: 3181

There is no strong evidence that cases involving female defendants are associated with younger (or older) victims.

##Child cases

child_data <- DIH_DATA %>%
  filter(!is.na(age_group))

child_data %>%
  group_by(age_group, Accused_Sex) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(age_group) %>%
  mutate(prop = n / sum(n))

## # A tibble: 23 × 4
## # Groups:   age_group [10]
##    age_group Accused_Sex     n  prop
##    <fct>     <chr>       <int> <dbl>
##  1 0–1       Female         30 0.625
##  2 0–1       Male           18 0.375
##  3 2–3       Female         22 0.688
##  4 2–3       Male           10 0.312
##  5 4–5       Female          6 0.667
##  6 4–5       Male            3 0.333
##  7 6–7       Female          1 0.167
##  8 6–7       Male            5 0.833
##  9 8–9       Female          2 0.667
## 10 8–9       Male            1 0.333
## # ℹ 13 more rows

ggplot(child_data, aes(x = age_group, fill = Accused_Sex)) +
  geom_bar(position = "fill") +
  labs(
    x = "Child Age Group",
    y = "Proportion",
    fill = "Accused Sex"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This is pretty striking.

Going to change the age groupings

DIH_DATA <- DIH_DATA %>%
  mutate(
    age_group2 = case_when(
      Deceased_Age <= 5 ~ "0–5",
      Deceased_Age <= 12 ~ "6–12",
      Deceased_Age <= 17 ~ "13–17",
      TRUE ~ NA_character_
    )
  )

DIH_DATA %>%
  filter(!is.na(age_group2)) %>%
  group_by(age_group2, Accused_Sex) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(age_group2) %>%
  mutate(percent = 100 * n / sum(n))

## # A tibble: 7 × 4
## # Groups:   age_group2 [3]
##   age_group2 Accused_Sex     n percent
##   <chr>      <chr>       <int>   <dbl>
## 1 0–5        Female         58   65.2 
## 2 0–5        Male           31   34.8 
## 3 13–17      Female         20   23.0 
## 4 13–17      Male           64   73.6 
## 5 13–17      <NA>            3    3.45
## 6 6–12       Female          9   50   
## 7 6–12       Male            9   50

tab <- DIH_DATA %>%
  filter(!is.na(age_group2)) %>%
  count(age_group2, Accused_Sex) %>%
  tidyr::pivot_wider(
    names_from = Accused_Sex,
    values_from = n,
    values_fill = 0
  )

tab_test <- table(
  DIH_DATA$age_group2,
  DIH_DATA$Accused_Sex
)

tab_test

##        
##         Female Male
##   0–5       58   31
##   13–17     20   64
##   6–12       9    9

chisq.test(tab_test)

## 
##  Pearson's Chi-squared test
## 
## data:  tab_test
## X-squared = 29.963, df = 2, p-value = 0.0000003116

chisq.test(tab_test)$expected

##        
##            Female      Male
##   0–5   40.539267 48.460733
##   13–17 38.261780 45.738220
##   6–12   8.198953  9.801047

No expected counts are less than 0 so we are good.

DIH_DATA <- DIH_DATA %>%
  mutate(female = ifelse(Accused_Sex == "Female", 1, 0))

library(DescTools)

## 
## Attaching package: 'DescTools'

## The following objects are masked from 'package:modelsummary':
## 
##     Format, Mean, Median, N, SD, Var

## The following object is masked from 'package:car':
## 
##     Recode

CochranArmitageTest(
  table(DIH_DATA$female, DIH_DATA$age_group2)
)

## 
##  Cochran-Armitage test for trend
## 
## data:  table(DIH_DATA$female, DIH_DATA$age_group2)
## Z = 3.7259, dim = 3, p-value = 0.0001946
## alternative hypothesis: two.sided

model <- glm(
  I(Accused_Sex == "Female") ~ age_group2,
  data = DIH_DATA,
  family = binomial
)

summary(model)

## 
## Call:
## glm(formula = I(Accused_Sex == "Female") ~ age_group2, family = binomial, 
##     data = DIH_DATA)
## 
## Coefficients:
##                 Estimate Std. Error z value    Pr(>|z|)    
## (Intercept)       0.6265     0.2225   2.816     0.00487 ** 
## age_group213–17  -1.7896     0.3393  -5.274 0.000000133 ***
## age_group26–12   -0.6265     0.5213  -1.202     0.22945    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 263.27  on 190  degrees of freedom
## Residual deviance: 232.22  on 188  degrees of freedom
##   (3073 observations deleted due to missingness)
## AIC: 238.22
## 
## Number of Fisher Scoring iterations: 4

Are younger victims more likely to involve female defendants? (Age Group Analysis)

Cases were grouped into three age categories (0–5, 6–12, 13–17), and the distribution of defendant sex across these groups was examined.

Descriptive results show a clear pattern: • Ages 0–5: • Female defendants: 65.2% • Male defendants: 34.8% • Ages 6–12: • Female defendants: 50.0% • Male defendants: 50.0% • Ages 13–17: • Female defendants: 23.0% • Male defendants: 73.6%

This suggests that female defendants are substantially overrepresented in cases involving the youngest victims (ages 0–5), while male defendants dominate cases involving older minors (13–17).

A chi-square test of independence confirms that this association is statistically significant: • χ²(2) = 29.96, p < 0.001

This indicates that the distribution of defendant sex differs meaningfully across age groups.

Trend Analysis

A Cochran–Armitage trend test was also conducted to assess whether there is a systematic trend across ordered age categories. Results from the logistic regression model (with age group as a predictor) further support this: • Compared to ages 0–5 (reference group), cases involving ages 13–17 show a large and statistically significant decrease in the likelihood of a female defendant (p < 0.001) • The 6–12 group does not differ significantly from the youngest group (p = 0.23)

This indicates a strong downward trend:

As victim age increases, the likelihood that the defendant is female decreases substantially, particularly when comparing very young children (0–5) to older adolescents (13–17).

Though I stopped the analysis here – we would need to consider whether to include other control variables in this regression if we ant to report it

##Sex & Relationship

tab_rel_sex <- table(DIH_DATA$Relationship, DIH_DATA$Accused_Sex)
tab_rel_sex

##                                          
##                                           Female Male
##   Caretaker/Family/Friend/Partner/Co-User    304  459
##   Dealer/Buyer                               461 1637
##   Doctor/Patient                              13   75

chisq.test(tab_rel_sex)

## 
##  Pearson's Chi-squared test
## 
## data:  tab_rel_sex
## X-squared = 98.285, df = 2, p-value < 0.00000000000000022

chisq.test(tab_rel_sex)$expected

##                                          
##                                              Female       Male
##   Caretaker/Family/Friend/Partner/Co-User 201.29332  561.70668
##   Dealer/Buyer                            553.49067 1544.50933
##   Doctor/Patient                           23.21601   64.78399

No cells less than 5

prop.table(tab_rel_sex, margin = 1)

##                                          
##                                              Female      Male
##   Caretaker/Family/Friend/Partner/Co-User 0.3984273 0.6015727
##   Dealer/Buyer                            0.2197331 0.7802669
##   Doctor/Patient                          0.1477273 0.8522727

ggplot(DIH_DATA, aes(x = Relationship, fill = Accused_Sex)) +
  geom_bar(position = "fill") +
  labs(
    y = "Proportion",
    fill = "Accused Sex"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Urbancity & State

table(DIH_DATA$Urbanicity, DIH_DATA$State)

##               
##                Alabama Alaska Arizona Arkansas California Colorado Connecticut
##   Metropolitan       9      0      13        1         76       14           0
##   Suburban           3      0       0        0          3       10           0
##   Rural              0      0       0        0          2        1           0
##               
##                Delaware Florida Georgia Idaho Illinois Indiana Iowa Kansas
##   Metropolitan        3     121      18     5      238      44   23      3
##   Suburban            0       3       2     0       26       9    3      1
##   Rural               0       0       1     2       26       0    2      3
##               
##                Kentucky Louisana Louisiana Maine Maryland Massachusetts
##   Metropolitan       25        0        47     5       46            26
##   Suburban            1        0         3     3        7             0
##   Rural               6        0         0     6        0             1
##               
##                Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada
##   Metropolitan       72        84           1       18       8        1     14
##   Suburban           13        24           1        9       3        0      2
##   Rural              13         5           0       10       0        0      0
##               
##                New Hampshire New Jersey New Mexico New York North Carolina
##   Metropolitan            26        108          3       90             74
##   Suburban                12          0          0       13             30
##   Rural                    0          0          0        6              5
##               
##                North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island
##   Metropolitan           14  296       17     17          466            2
##   Suburban                1   49        8      0           88            0
##   Rural                   2    6        2      0           23            0
##               
##                South Carolina South Dakota Tennessee Texas Utah Vermont
##   Metropolitan             13            3        69    23   13       1
##   Suburban                  2            2         7     4    0       8
##   Rural                     0            1         2     0    4       5
##               
##                Virginia Washington West Virginia Wisconsin Wyoming
##   Metropolitan       45         54            18       252       6
##   Suburban            7         17             5        67       1
##   Rural               4          0             7        32       3

ggplot(DIH_DATA, aes(x = State, fill = Urbanicity)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

DIH Media

Taleed El-Sabawi

2026-04-13