library(googlesheets4)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(scales)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 4.0.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(ggplot2)
#Data isnt full cleaned so loading everything as text and then will specify column type
library(googlesheets4)
library(dplyr)
DIH_DATA <- read_sheet(
"https://docs.google.com/spreadsheets/d/1OS4-QPBoffGKsxfqPzW3LyLRoxga2j3wLYPPxXtnUHw/edit?usp=sharing",
col_types = "c"
)
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for
## 'taleedsabawi@gmail.com'.
## ✔ Reading from "DIH Data 2026".
## ✔ Range 'Data'.
#Assigning columns manually to data type
DIH_DATA$Year_Charged <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code <- as.numeric(DIH_DATA$Rural_Code)
# Dates
#These first dates are in excel serial data structure...
DIH_DATA$`Date Incident` <- as.Date(as.numeric(DIH_DATA$`Date Incident`), origin = "1899-12-30")
## Warning in as.Date(as.numeric(DIH_DATA$`Date Incident`), origin =
## "1899-12-30"): NAs introduced by coercion
DIH_DATA$`Date Charge` <- as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30")
## Warning in as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30"):
## NAs introduced by coercion
# Ages
DIH_DATA$Accused_Age <- as.numeric(DIH_DATA$Accused_Age)
## Warning: NAs introduced by coercion
DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)
## Warning: NAs introduced by coercion
#checking data structure
str(DIH_DATA)
## tibble [3,264 × 31] (S3: tbl_df/tbl/data.frame)
## $ ID : chr [1:3264] "1678" "2622" "1514" "320" ...
## $ URL : chr [1:3264] "http://www.nydailynews.com/news/crime/pain-relieving-patch-kills-harlem-girl-6-foster-mother-charged-article-1.328165" "http://whdh.com/news/woman-boyfriend-charged-in-fatal-cocaine-overdose-of-ohio-boy-9/" "http://www.capitalgazette.com/news/for_the_record/ac-cn-delvalle-overdose-verdict-20180620-story.html" "http://www.thisweeknews.com/news/20180719/fourth-person-expected-to-be-charged-in-connection-with-grandview-ove"| __truncated__ ...
## $ Court : chr [1:3264] NA NA "Baltimore County Court" NA ...
## $ Rural_Code : num [1:3264] 1 2 1 1 1 1 1 5 7 2 ...
## $ County_Fips : chr [1:3264] "36061" "39099" "24005" "39049" ...
## $ County_FullName : chr [1:3264] "New York County" "Mahoning County" "Baltimore County" "Franklin County" ...
## $ County_Name : chr [1:3264] "New York" "Mahoning" "Baltimore" "Franklin" ...
## $ State : chr [1:3264] "New York" "Ohio" "Maryland" "Ohio" ...
## $ Year_Charged : num [1:3264] 2004 2018 2018 2018 2011 ...
## $ Date Charge : Date[1:3264], format: NA NA ...
## $ Date Incident : Date[1:3264], format: NA NA ...
## $ Accused_Name : chr [1:3264] "Joanne Alvarez" "Raenell Allen" "Jason Patton Baker" "Benjamin Bussey" ...
## $ Sentence : chr [1:3264] NA NA NA NA ...
## $ Sentence Quartile : chr [1:3264] NA NA NA NA ...
## $ Charge : chr [1:3264] "Reckless/Negligent Homicide" "Involuntary Manslaughter" "Manslaughter" "Involuntary Manslaughter" ...
## $ Accused_Race : chr [1:3264] NA "White" "White" "White" ...
## $ Accused_Ethnicity : chr [1:3264] "Hispanic/Latino" NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
## $ Accused_Sex : chr [1:3264] "Female" "Female" "Male" "Male" ...
## $ Accused_Age : num [1:3264] NA NA 46 19 NA NA NA NA NA NA ...
## $ Accused_City : chr [1:3264] "East Harlem" "Youngstown" "Millersville" "Dublin" ...
## $ Accused_State : chr [1:3264] "New York" "Ohio" "Pennsylvania" "Ohio" ...
## $ Court_Type : chr [1:3264] "State" "State" "State" "State" ...
## $ Plea : chr [1:3264] "Guilty" NA "Not Guilty" "Guilty to a Lesser Charge" ...
## $ Deceased_Name : chr [1:3264] "Taylor Webster" NA "Josiah Christopher Klaes" "Haleah Myers" ...
## $ Deceased_Race : chr [1:3264] "White" NA "White" "White" ...
## $ Deceased_Ethnicity: chr [1:3264] NA NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
## $ Deceased_Age : num [1:3264] 6 9 16 17 17 18 18 18 18 18 ...
## $ Deceased_Sex : chr [1:3264] "Female" "Male" "Male" "Female" ...
## $ Substance : chr [1:3264] "Fentanyl (Analog)" NA "Fentanyl (Analog)" "Fentanyl (Analog)" ...
## $ Lawyer : chr [1:3264] NA NA "do'connell@opd.state.md.us" NA ...
## $ Relationship : chr [1:3264] "Caretaker/Family/Friend/Partner/Co-User" "Caretaker/Family/Friend/Partner/Co-User" "Dealer/Buyer" "Caretaker/Family/Friend/Partner/Co-User" ...
#Looking to see what Sentence contains
sort(unique(DIH_DATA$Sentence))
## [1] "0.08219178082" "0.0833" "0.08333333333" "0.1052511416"
## [5] "0.1616438356" "0.1666666667" "0.2191780822" "0.25"
## [9] "0.325" "0.333" "0.3333333333" "0.375"
## [13] "0.4" "0.405" "0.4166666667" "0.5"
## [17] "0.5726027397" "0.5833333333" "0.6666666667" "0.75"
## [21] "0.822" "0.8333333333" "0.9166666667" "0.9583333333"
## [25] "1" "1.002" "1.25" "1.3"
## [29] "1.333333333" "1.416666667" "1.5" "1.583"
## [33] "1.66" "1.666666667" "1.7" "1.75"
## [37] "1.8" "1.916666667" "1.917" "1.92"
## [41] "1.97" "10" "10.16" "10.41"
## [45] "10.5" "10.67" "10.8" "10.83"
## [49] "10.83333333" "11" "11.1" "11.16666667"
## [53] "11.25" "11.5" "11.83" "11.9"
## [57] "113" "12" "12.25" "12.5"
## [61] "12.75" "124" "13" "13.33"
## [65] "13.5" "14" "14.5" "143"
## [69] "15" "15.5" "15.67" "16"
## [73] "16.5" "17" "17.5" "18"
## [77] "18.25" "18.66666667" "180" "19"
## [81] "19.25" "19.58333333" "2" "2.25"
## [85] "2.333333333" "2.4" "2.5" "2.75"
## [89] "20" "20.5" "20.83333333" "21"
## [93] "22" "22.5" "23" "24"
## [97] "25" "25.5" "26" "26.83"
## [101] "27" "27.08333333" "28" "29"
## [105] "3" "3.08" "3.083333333" "3.25"
## [109] "3.3" "3.33" "3.333333333" "3.4"
## [113] "3.416666667" "3.5" "3.666666667" "3.83"
## [117] "3.833333333" "3.92" "30" "30.41666667"
## [121] "31" "32" "33" "33.25"
## [125] "34" "35" "36" "37"
## [129] "38" "4" "4.083333333" "4.17"
## [133] "4.25" "4.416666667" "4.5" "4.666666667"
## [137] "4.75" "4.8" "4.83" "4.833"
## [141] "4.833333333" "4.9" "40" "42"
## [145] "46" "47" "48" "48.33"
## [149] "5" "5.06" "5.25" "5.5"
## [153] "5.666666667" "5.75" "5.83" "5.916666667"
## [157] "5.92" "50" "52" "54"
## [161] "55" "6" "6.083333333" "6.166666667"
## [165] "6.25" "6.5" "6.67" "6.7"
## [169] "6.75" "60" "62" "63"
## [173] "65" "7" "7.25" "7.33"
## [177] "7.5" "7.666666667" "72" "75"
## [181] "8" "8.08" "8.33" "8.333"
## [185] "8.333333333" "8.5" "8.67" "84"
## [189] "9" "9.33" "9.5" "9.917"
## [193] "90" "96" "Life" "pending"
## [197] "Pending"
I have manually gone into the data and created a new variable called “Sentence_num” – which makes Sentence a continuous numeric variable. Just for visualization sake…
For now I will drop anything that says “pending” or “Pending” from the dataset and replace them with NA
I changed all “life” sentences to 100 years, just for th
DIH_DATA <- DIH_DATA %>%
mutate(
Sentence_num = na_if(Sentence, "Pending"),
Sentence_num = na_if(Sentence, "pending"),
Sentence_num = as.numeric(Sentence)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sentence_num = as.numeric(Sentence)`.
## Caused by warning:
## ! NAs introduced by coercion
library(dplyr)
library(stringr)
summary(DIH_DATA$Sentence_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.08219 4.00000 8.00000 11.54717 15.00000 180.00000 1209
sum(!is.na(DIH_DATA$Sentence))
## [1] 2348
sum(!is.na(DIH_DATA$Sentence_num))
## [1] 2055
All observations 3,364 Observations with sentencing data 2,055
#General Summary of All Variables
summary(DIH_DATA)
## ID URL Court Rural_Code
## Length:3264 Length:3264 Length:3264 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Mode :character Median :2.000
## Mean :2.474
## 3rd Qu.:3.000
## Max. :9.000
## NA's :112
## County_Fips County_FullName County_Name State
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Year_Charged Date Charge Date Incident Accused_Name
## Min. :1974 Min. :NA Min. :NA Length:3264
## 1st Qu.:2015 1st Qu.:NA 1st Qu.:NA Class :character
## Median :2017 Median :NA Median :NA Mode :character
## Mean :2016 Mean :NaN Mean :NaN
## 3rd Qu.:2018 3rd Qu.:NA 3rd Qu.:NA
## Max. :2026 Max. :NA Max. :NA
## NA's :3264 NA's :3264
## Sentence Sentence Quartile Charge Accused_Race
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Accused_Ethnicity Accused_Sex Accused_Age Accused_City
## Length:3264 Length:3264 Min. :17.00 Length:3264
## Class :character Class :character 1st Qu.:27.00 Class :character
## Mode :character Mode :character Median :32.00 Mode :character
## Mean :34.34
## 3rd Qu.:40.00
## Max. :87.00
## NA's :2224
## Accused_State Court_Type Plea Deceased_Name
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Deceased_Race Deceased_Ethnicity Deceased_Age Deceased_Sex
## Length:3264 Length:3264 Min. : 0.00 Length:3264
## Class :character Class :character 1st Qu.:22.25 Class :character
## Mode :character Mode :character Median :28.00 Mode :character
## Mean :29.62
## 3rd Qu.:35.00
## Max. :91.00
## NA's :546
## Substance Lawyer Relationship Sentence_num
## Length:3264 Length:3264 Length:3264 Min. : 0.08219
## Class :character Class :character Class :character 1st Qu.: 4.00000
## Mode :character Mode :character Mode :character Median : 8.00000
## Mean : 11.54717
## 3rd Qu.: 15.00000
## Max. :180.00000
## NA's :1209
Missingness
colSums(is.na(DIH_DATA))
## ID URL Court Rural_Code
## 0 103 999 112
## County_Fips County_FullName County_Name State
## 0 5 2 0
## Year_Charged Date Charge Date Incident Accused_Name
## 0 3264 3264 7
## Sentence Sentence Quartile Charge Accused_Race
## 916 3264 1 461
## Accused_Ethnicity Accused_Sex Accused_Age Accused_City
## 2286 17 2224 181
## Accused_State Court_Type Plea Deceased_Name
## 65 72 907 522
## Deceased_Race Deceased_Ethnicity Deceased_Age Deceased_Sex
## 1103 2198 546 220
## Substance Lawyer Relationship Sentence_num
## 617 2459 276 1209
#Creating Freq table function to reuse later
freq_table <- function(data, var) {
data %>%
count({{ var }}) %>%
mutate(percent = n / sum(n) * 100) %>%
arrange(desc(n))
}
#Creating Bar chart function for categorical variables to reuse later
library(ggplot2)
library(dplyr)
bar_plot_cat <- function(data, var) {
ggplot(data, aes(x = {{ var }}, fill = {{ var }})) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
x = deparse(substitute(var)),
y = "Count",
title = paste("Distribution of", deparse(substitute(var)))
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
}
#Creating histogram function
hist_plot <- function(data, var) {
ggplot(data, aes(x = {{ var }})) +
geom_histogram(bins = 30, fill = "steelblue") +
scale_x_continuous(breaks = seq(0, 200, by = 10)) + # 👈 more numbers
labs(
x = deparse(substitute(var)),
y = "Count",
title = paste("Distribution of", deparse(substitute(var)))
)
}
#creating density plot function
density_plot <- function(data, var) {
ggplot(data, aes(x = {{ var }})) +
geom_density(fill = "steelblue", alpha = 0.5) +
labs(
x = deparse(substitute(var)),
y = "Density",
title = paste("Density of", deparse(substitute(var)))
)
}
hist_plot(DIH_DATA, Sentence_num)
## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_bin()`).
Will need to explore missingness to see if it is random.
#📅Date Charged
freq_table (DIH_DATA, Year_Charged)
## # A tibble: 34 × 3
## Year_Charged n percent
## <dbl> <int> <dbl>
## 1 2017 560 17.2
## 2 2018 532 16.3
## 3 2016 384 11.8
## 4 2019 326 9.99
## 5 2015 266 8.15
## 6 2020 263 8.06
## 7 2014 213 6.53
## 8 2013 120 3.68
## 9 2021 99 3.03
## 10 2012 88 2.70
## # ℹ 24 more rows
sum(DIH_DATA$Year_Charged <= 2010, na.rm = TRUE)
## [1] 305
library(dplyr)
library(ggplot2)
DIH_DATA %>%
filter(!is.na(Year_Charged)) %>%
mutate(Year_Charged = as.numeric(Year_Charged)) %>%
count(Year_Charged) %>%
ggplot(aes(x = Year_Charged, y = n)) +
geom_line() +
labs(
title = "Number of Cases Over Time",
x = "Year",
y = "Count"
)
#👤Accused characteristics ## Accused Age
density_plot(DIH_DATA, Accused_Age)
## Warning: Removed 2224 rows containing non-finite outside the scale range
## (`stat_density()`).
## • Accused_Ethnicity
freq_table(DIH_DATA,Accused_Ethnicity)
## # A tibble: 7 × 3
## Accused_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2286 70.0
## 2 Not Hispanic/Latino 734 22.5
## 3 Hispanic/Latino 147 4.50
## 4 NA 94 2.88
## 5 Not HIspanic/Latino 1 0.0306
## 6 White 1 0.0306
## 7 not Hispanic/Latino 1 0.0306
Cleaning typos in data…
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Ethnicity = str_to_title(Accused_Ethnicity), # fix casing
Accused_Ethnicity = str_replace_all(Accused_Ethnicity, "Hispanic", "Hispanic"), # normalize
Accused_Ethnicity = na_if(Accused_Ethnicity, "Na"), # fix "NA"
Accused_Ethnicity = case_when(
Accused_Ethnicity %in% c("Not Hispanic/Latino", "Not Hispanic/Latino") ~ "Not Hispanic/Latino",
Accused_Ethnicity %in% c("Hispanic/Latino") ~ "Hispanic/Latino",
Accused_Ethnicity == "White" ~ NA_character_, # wrong variable → set to NA
TRUE ~ Accused_Ethnicity
)
)
bar_plot_cat(DIH_DATA, Accused_Ethnicity)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
bar_plot_cat(DIH_DATA, Accused_Sex)
freq_table(DIH_DATA, Accused_Sex)
## # A tibble: 5 × 3
## Accused_Sex n percent
## <chr> <int> <dbl>
## 1 Male 2388 73.2
## 2 Female 853 26.1
## 3 <NA> 17 0.521
## 4 NA 3 0.0919
## 5 feMale 3 0.0919
Fixing typos in the data again…
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Sex = str_to_title(Accused_Sex), # fixes feMale → Female
Accused_Sex = na_if(Accused_Sex, "Na") # fixes "NA" string → real NA
)
freq_table(DIH_DATA, Accused_Race)
## # A tibble: 12 × 3
## Accused_Race n percent
## <chr> <int> <dbl>
## 1 White 2067 63.3
## 2 Black 641 19.6
## 3 <NA> 461 14.1
## 4 NA 37 1.13
## 5 Unknown POC 25 0.766
## 6 Asian 16 0.490
## 7 Native American or Alaskan Native 10 0.306
## 8 Hispanic/Latino 3 0.0919
## 9 Arabic 1 0.0306
## 10 Indian American 1 0.0306
## 11 Manufacture, Delivery, or Possession With Intent to Manufactur… 1 0.0306
## 12 white 1 0.0306
“white” → should be “White” • “NA” → should be real NA • “Arabic” → you already decided → “White” • that long drug charge string → clearly wrong variable → should be NA
• "Unknown POC" → not a usable category → ❌
• "Hispanic/Latino" → ethnicity, not race → ❌
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Race = na_if(Accused_Race, "NA"),
Accused_Race = case_when(
Accused_Race %in% c("white", "White") ~ "White",
Accused_Race %in% c("black", "Black") ~ "Black",
Accused_Race == "Arabic" ~ "White",
Accused_Race %in% c("Unknown POC", "Hispanic/Latino", "X") ~ NA_character_,
str_detect(Accused_Race, "Manufacture") ~ NA_character_,
TRUE ~ Accused_Race
)
)
freq_table(DIH_DATA, Accused_Race)
## # A tibble: 6 × 3
## Accused_Race n percent
## <chr> <int> <dbl>
## 1 White 2069 63.4
## 2 Black 641 19.6
## 3 <NA> 527 16.1
## 4 Asian 16 0.490
## 5 Native American or Alaskan Native 10 0.306
## 6 Indian American 1 0.0306
bar_plot_cat(DIH_DATA, Accused_Race)
##Race/Ethnicity Variable
DIH_DATA <- DIH_DATA %>%
mutate(
Race_Ethnicity = case_when(
Accused_Ethnicity == "Hispanic/Latino" ~ "Hispanic",
Accused_Race == "White" ~ "White (Non-Hispanic)",
Accused_Race == "Black" ~ "Black (Non-Hispanic)",
Accused_Race == "Asian" ~ "Asian",
Accused_Race == "Native American or Alaskan Native" ~ "Native American",
TRUE ~ NA_character_
)
)
DIH_DATA <- DIH_DATA %>%
mutate(
Race_Ethnicity = case_when(
Race_Ethnicity %in% c("Asian", "Native American") ~ "Other",
TRUE ~ Race_Ethnicity
)
)
freq_table(DIH_DATA, Race_Ethnicity)
## # A tibble: 5 × 3
## Race_Ethnicity n percent
## <chr> <int> <dbl>
## 1 White (Non-Hispanic) 1993 61.1
## 2 Black (Non-Hispanic) 629 19.3
## 3 <NA> 472 14.5
## 4 Hispanic 147 4.50
## 5 Other 23 0.705
bar_plot_cat(DIH_DATA, Race_Ethnicity)
#👤Deceased characteristics
Was there really a 91 year old who was the deceased?
deceased under 18 =>
sum(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)
## [1] 262
percent of sample (after dropping those missing) 18 or under =>
mean(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)
## [1] 0.09639441
DIH_DATA %>%
filter(Deceased_Age <= 18) %>%
count(Deceased_Age, sort = TRUE)
## # A tibble: 19 × 2
## Deceased_Age n
## <dbl> <int>
## 1 18 68
## 2 17 37
## 3 1 29
## 4 16 24
## 5 2 21
## 6 0 19
## 7 15 15
## 8 3 11
## 9 14 8
## 10 5 7
## 11 6 5
## 12 10 5
## 13 13 3
## 14 4 2
## 15 9 2
## 16 11 2
## 17 12 2
## 18 7 1
## 19 8 1
percent of sample 2 or younger =>
mean(DIH_DATA$Deceased_Age <= 2, na.rm = TRUE)* 100
## [1] 2.538631
Quartiles for 18 and younger =>
quantile(
DIH_DATA$Deceased_Age[DIH_DATA$Deceased_Age <= 18],
probs = c(0.25, 0.5, 0.75),
na.rm = TRUE
)
## 25% 50% 75%
## 2 15 18
Interested in the breakdown of deceased by age under 18, creating groups and graphing. =>
library(dplyr)
DIH_DATA <- DIH_DATA %>%
mutate(
age_group = case_when(
Deceased_Age >= 0 & Deceased_Age <= 1 ~ "0–1",
Deceased_Age >= 2 & Deceased_Age <= 3 ~ "2–3",
Deceased_Age >= 4 & Deceased_Age <= 5 ~ "4–5",
Deceased_Age >= 6 & Deceased_Age <= 7 ~ "6–7",
Deceased_Age >= 8 & Deceased_Age <= 9 ~ "8–9",
Deceased_Age >= 10 & Deceased_Age <= 11 ~ "10–11",
Deceased_Age >= 12 & Deceased_Age <= 13 ~ "12–13",
Deceased_Age >= 14 & Deceased_Age <= 15 ~ "14–15",
Deceased_Age >= 16 & Deceased_Age <= 17 ~ "16–17",
Deceased_Age >= 18 ~ "18+",
TRUE ~ NA_character_
)
)
#Ordered
DIH_DATA$age_group <- factor(
DIH_DATA$age_group,
levels = c("0–1","2–3","4–5","6–7","8–9","10–11","12–13","14–15","16–17","18+")
)
#Plot
library(ggplot2)
ggplot(
DIH_DATA %>% filter(Deceased_Age <= 18, !is.na(age_group)),
aes(x = age_group, fill = age_group)
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Deceased Age (≤ 18)",
x = "Age Group",
y = "Count"
) +
guides(fill = "none") # removes legend if you don’t want it
freq_table(DIH_DATA, Deceased_Age)
## # A tibble: 75 × 3
## Deceased_Age n percent
## <dbl> <int> <dbl>
## 1 NA 546 16.7
## 2 26 141 4.32
## 3 28 137 4.20
## 4 21 134 4.11
## 5 23 126 3.86
## 6 25 122 3.74
## 7 24 121 3.71
## 8 30 116 3.55
## 9 22 114 3.49
## 10 27 106 3.25
## # ℹ 65 more rows
freq_table(DIH_DATA, age_group)
## # A tibble: 11 × 3
## age_group n percent
## <fct> <int> <dbl>
## 1 18+ 2524 77.3
## 2 <NA> 546 16.7
## 3 16–17 61 1.87
## 4 0–1 48 1.47
## 5 2–3 32 0.980
## 6 14–15 23 0.705
## 7 4–5 9 0.276
## 8 10–11 7 0.214
## 9 6–7 6 0.184
## 10 12–13 5 0.153
## 11 8–9 3 0.0919
creating a child variable
DIH_DATA <- DIH_DATA %>%
mutate(
child = ifelse(Deceased_Age < 18, 1, 0)
)
DIH_DATA <- DIH_DATA %>%
mutate(
child = ifelse(is.na(Deceased_Age), NA,
ifelse(Deceased_Age < 18, 1, 0))
)
DIH_DATA <- DIH_DATA %>%
mutate(
child = factor(child, levels = c(0, 1),
labels = c("Adult (18+)", "Child (<18)"))
)
freq_table(DIH_DATA, child)
## # A tibble: 3 × 3
## child n percent
## <fct> <int> <dbl>
## 1 Adult (18+) 2524 77.3
## 2 <NA> 546 16.7
## 3 Child (<18) 194 5.94
freq_table(DIH_DATA, Deceased_Race)
## # A tibble: 13 × 3
## Deceased_Race n percent
## <chr> <int> <dbl>
## 1 White 1964 60.2
## 2 <NA> 1103 33.8
## 3 Black 90 2.76
## 4 NA 66 2.02
## 5 Unknown POC 17 0.521
## 6 Asian 10 0.306
## 7 Native American or Alaskan Native 6 0.184
## 8 Hispanic/Latino 3 0.0919
## 9 Arabic 1 0.0306
## 10 Multiple Deceased 1 0.0306
## 11 Thomas McGuinness 1 0.0306
## 12 Unknown 1 0.0306
## 13 X 1 0.0306
Need to fix these… Arabic is changed to White. Thomas McGuinness changed to NA. Fix the NAs, X and Unknown
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Race = na_if(Deceased_Race, "NA"),
Deceased_Race = case_when(
Deceased_Race == "Arabic" ~ "White",
Deceased_Race == "Thomas McGuinness" ~ NA_character_,
Deceased_Race %in% c("Unknown", "X") ~ NA_character_,
TRUE ~ Deceased_Race
)
)
freq_table(DIH_DATA, Deceased_Race)
## # A tibble: 8 × 3
## Deceased_Race n percent
## <chr> <int> <dbl>
## 1 White 1965 60.2
## 2 <NA> 1172 35.9
## 3 Black 90 2.76
## 4 Unknown POC 17 0.521
## 5 Asian 10 0.306
## 6 Native American or Alaskan Native 6 0.184
## 7 Hispanic/Latino 3 0.0919
## 8 Multiple Deceased 1 0.0306
bar_plot_cat(DIH_DATA, Deceased_Race)
freq_table(DIH_DATA, Deceased_Ethnicity)
## # A tibble: 9 × 3
## Deceased_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2198 67.3
## 2 Not Hispanic/Latino 844 25.9
## 3 NA 116 3.55
## 4 Hispanic/Latino 99 3.03
## 5 Hispanic Latino 2 0.0613
## 6 White 2 0.0613
## 7 Multiple Deceased 1 0.0306
## 8 Not Hispanic/latino 1 0.0306
## 9 X 1 0.0306
Again, fixing the data…
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Ethnicity = na_if(Deceased_Ethnicity, "NA"),
Deceased_Ethnicity = case_when(
Deceased_Ethnicity == "Hispanic Latino" ~ "Hispanic/Latino",
Deceased_Ethnicity == "Not Hispanic/latino" ~ "Not Hispanic/Latino",
Deceased_Ethnicity == "White" ~ NA_character_,
Deceased_Ethnicity == "X" ~ NA_character_,
TRUE ~ Deceased_Ethnicity
)
)
freq_table(DIH_DATA, Deceased_Ethnicity)
## # A tibble: 4 × 3
## Deceased_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2317 71.0
## 2 Not Hispanic/Latino 845 25.9
## 3 Hispanic/Latino 101 3.09
## 4 Multiple Deceased 1 0.0306
bar_plot_cat(DIH_DATA, Deceased_Ethnicity)
Large missingness
freq_table(DIH_DATA, Deceased_Sex)
## # A tibble: 14 × 3
## Deceased_Sex n percent
## <chr> <int> <dbl>
## 1 Male 1912 58.6
## 2 Female 1090 33.4
## 3 <NA> 220 6.74
## 4 NA 25 0.766
## 5 male 6 0.184
## 6 Famale 2 0.0613
## 7 female 2 0.0613
## 8 27 1 0.0306
## 9 Both Female 1 0.0306
## 10 Both male 1 0.0306
## 11 Female, Male 1 0.0306
## 12 Multiple Deceased 1 0.0306
## 13 Other 1 0.0306
## 14 X 1 0.0306
changing both to be NA
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Sex = case_when(
Deceased_Sex %in% c("NA","27","X","Other",
"Both Female","Both male","Female, Male", "Both") ~ NA_character_,
Deceased_Sex %in% c("male","Male") ~ "Male",
Deceased_Sex %in% c("female","Female","Famale") ~ "Female",
TRUE ~ Deceased_Sex
)
)
freq_table(DIH_DATA, Deceased_Sex)
## # A tibble: 4 × 3
## Deceased_Sex n percent
## <chr> <int> <dbl>
## 1 Male 1918 58.8
## 2 Female 1094 33.5
## 3 <NA> 251 7.69
## 4 Multiple Deceased 1 0.0306
#🌍Rural Code NA’s: 112
ggplot(
DIH_DATA %>% filter(!is.na(Rural_Code)),
aes(x = factor(Rural_Code), fill = factor(Rural_Code))
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Rural_Code",
x = "Rural Code",
y = "Count"
) +
guides(fill = "none")
Creating urbanacity variable
DIH_DATA <- DIH_DATA %>%
mutate(
Urbanicity = case_when(
Rural_Code %in% 1:3 ~ "Metropolitan",
Rural_Code %in% 4:6 ~ "Suburban",
Rural_Code %in% 7:9 ~ "Rural",
TRUE ~ NA_character_
)
)
#Ordering for plotting
DIH_DATA$Urbanicity <- factor(
DIH_DATA$Urbanicity,
levels = c("Metropolitan", "Suburban", "Rural")
)
ggplot(
DIH_DATA %>% filter(!is.na(Urbanicity)),
aes(x = Urbanicity, fill = Urbanicity)
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Urbanicity",
x = "Urbanicity",
y = "Count"
) +
guides(fill = "none")
Would be interesting to see state by urbancity…
ggplot(
DIH_DATA %>% filter(!is.na(State), !is.na(Urbanicity)),
aes(x = State, fill = Urbanicity)
) +
geom_bar(position = "fill") +
labs(
title = "Proportion of Urbanicity by State",
x = "State",
y = "Proportion"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
I guess this could just be a function of how rural the state is…but we
could test further…
#📍State
There is a lowercase pennsylvania in the dataset…fixing it…
DIH_DATA %>%
filter(State == "pennsylvania")
## # A tibble: 1 × 36
## ID URL Court Rural_Code County_Fips County_FullName County_Name State
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 540 3 senten… <NA> 2 42071 Lancaster Coun… Lancaster penn…
## # ℹ 28 more variables: Year_Charged <dbl>, `Date Charge` <date>,
## # `Date Incident` <date>, Accused_Name <chr>, Sentence <chr>,
## # `Sentence Quartile` <chr>, Charge <chr>, Accused_Race <chr>,
## # Accused_Ethnicity <chr>, Accused_Sex <chr>, Accused_Age <dbl>,
## # Accused_City <chr>, Accused_State <chr>, Court_Type <chr>, Plea <chr>,
## # Deceased_Name <chr>, Deceased_Race <chr>, Deceased_Ethnicity <chr>,
## # Deceased_Age <dbl>, Deceased_Sex <chr>, Substance <chr>, Lawyer <chr>, …
DIH_DATA <- DIH_DATA %>%
mutate(
State = ifelse(State == "pennsylvania", "Pennsylvania", State)
)
bar_plot_cat(DIH_DATA, State)
freq_table(DIH_DATA, State)
## # A tibble: 50 × 3
## State n percent
## <chr> <int> <dbl>
## 1 Pennsylvania 578 17.7
## 2 Ohio 354 10.8
## 3 Wisconsin 351 10.8
## 4 Illinois 299 9.16
## 5 Florida 125 3.83
## 6 Minnesota 116 3.55
## 7 New Jersey 113 3.46
## 8 New York 112 3.43
## 9 North Carolina 111 3.40
## 10 Michigan 99 3.03
## # ℹ 40 more rows
#⚖️ Legal / case variables ## • Charge
freq_table(DIH_DATA,Charge)
## # A tibble: 32 × 3
## Charge n percent
## <chr> <int> <dbl>
## 1 Drug Delivery/Distribution Resulting in Death 610 18.7
## 2 Involuntary Manslaughter 479 14.7
## 3 Reckless/Negligent Homicide 432 13.2
## 4 Murder (2nd or 3rd Degree) 342 10.5
## 5 Drug Induced Homicide 306 9.38
## 6 Manslaughter 252 7.72
## 7 Delivery/Distribution of Heroin/Fentanyl Resulting in Death 151 4.63
## 8 Delivery/Distribution Of a Controlled Substance Causing/Result… 122 3.74
## 9 Other (Please Specify in Notes) 101 3.09
## 10 Murder (1st Degree) 99 3.03
## # ℹ 22 more rows
@Katie Flagging this – still don’t know how these were categorized…
I am going to go ahead and group these charges and provide a justification for doing so, but if you all disagree, I can change them…
DIH_DATA <- DIH_DATA %>%
mutate(
Charge_Group = case_when(
# 1. Core DIH (highest)
str_detect(tolower(Charge), "drug.*death|controlled substance homicide|drug induced homicide") &
!str_detect(tolower(Charge), "conspiracy|possess|intent") ~ "Drug-Induced Homicide",
# 2. Distribution / possession with intent
str_detect(tolower(Charge), "possess|intent|trafficking") &
str_detect(tolower(Charge), "death") ~ "Distribution/Possession Causing Death",
# 3. Conspiracy
str_detect(tolower(Charge), "conspiracy") ~ "Conspiracy",
# 4. Murder
str_detect(tolower(Charge), "murder|felony murder") ~ "Murder",
# 5. Manslaughter
str_detect(tolower(Charge), "manslaughter") ~ "Manslaughter",
# 6. Negligent
str_detect(tolower(Charge), "negligent|reckless") ~ "Negligent/Reckless",
# Missing / junk
Charge %in% c("Pending", "Unknown") ~ NA_character_,
TRUE ~ "Other"
)
)
Charge categories were consolidated based on relative levels of legal culpability and the nature of the defendant’s causal connection to the decedent’s death, distinguishing between direct drug-induced homicide offenses, distribution-related offenses, conspiracy-based liability, and traditional homicide and negligence-based charges.
freq_table(DIH_DATA,Plea)
## # A tibble: 37 × 3
## Plea n percent
## <chr> <int> <dbl>
## 1 Guilty 1756 53.8
## 2 <NA> 907 27.8
## 3 Not Guilty 205 6.28
## 4 NA 188 5.76
## 5 Guilty of a Lesser Charge 46 1.41
## 6 No Contest 42 1.29
## 7 Awaiting Plea Hearing 31 0.950
## 8 Guilty to a Lesser Charge 31 0.950
## 9 Pleaded Guilty 13 0.398
## 10 Guilty to a Lesser DIH Charge 12 0.368
## # ℹ 27 more rows
Fixed the data as follows… inlcuding changed “pending” or awaiting plea hearing to NA.
DIH_DATA <- DIH_DATA %>%
mutate(
Plea = na_if(Plea, "NA"),
Plea = case_when(
Plea %in% c("State", "X", "Other", "Awaiting Plea Hearing", "Pending/Awaiting", "pending") ~ NA_character_,
Plea == "Judge ruled it was not murder" ~ NA_character_,
Plea == "Pleads not guily" ~ "Guilty",
Plea == "Pled guity to DIH charges" ~ "Guilty",
str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
str_detect(tolower(Plea), "guilty") ~ "Guilty",
TRUE ~ Plea
)
)
freq_table(DIH_DATA, Plea)
## # A tibble: 6 × 3
## Plea n percent
## <chr> <int> <dbl>
## 1 Guilty 1781 54.6
## 2 <NA> 1128 34.6
## 3 Not Guilty 215 6.59
## 4 Guilty to Lesser Charge 90 2.76
## 5 No Contest 49 1.50
## 6 Alford Plea 1 0.0306
bar_plot_cat(DIH_DATA, Plea)
Do we have data on verdicts? Versus just pleas
#⚖️ Sentence Quartile
#make sentence quart var
DIH_DATA <- DIH_DATA %>%
mutate(
Sentence_Quartile_calc = ntile(Sentence_num, 4)
)
sum(!is.na(DIH_DATA$Sentence_Quartile_calc))
## [1] 2055
freq_table(DIH_DATA,Court_Type)
## # A tibble: 6 × 3
## Court_Type n percent
## <chr> <int> <dbl>
## 1 State 2789 85.4
## 2 Federal 388 11.9
## 3 <NA> 72 2.21
## 4 NA 12 0.368
## 5 federal 2 0.0613
## 6 Fayette County 1 0.0306
DIH_DATA <- DIH_DATA %>%
mutate(
Court_Type = na_if(Court_Type, "NA"),
Court_Type = case_when(
Court_Type == "federal" ~ "Federal",
Court_Type == "Fayette County" ~ NA_character_,
TRUE ~ Court_Type
)
)
freq_table(DIH_DATA, Court_Type)
## # A tibble: 3 × 3
## Court_Type n percent
## <chr> <int> <dbl>
## 1 State 2789 85.4
## 2 Federal 390 11.9
## 3 <NA> 85 2.60
freq_table(DIH_DATA, Relationship)
## # A tibble: 6 × 3
## Relationship n percent
## <chr> <int> <dbl>
## 1 Dealer/Buyer 2097 64.2
## 2 Caretaker/Family/Friend/Partner/Co-User 768 23.5
## 3 <NA> 276 8.46
## 4 Doctor/Patient 88 2.70
## 5 NA 26 0.797
## 6 Dealer/buyer 9 0.276
Fixing typos.
DIH_DATA <- DIH_DATA %>%
mutate(
Relationship = na_if(Relationship, "NA"),
Relationship = case_when(
Relationship %in% c("Dealer/buyer", "Dealer/Buyer") ~ "Dealer/Buyer",
TRUE ~ Relationship
)
)
freq_table(DIH_DATA, Relationship)
## # A tibble: 4 × 3
## Relationship n percent
## <chr> <int> <dbl>
## 1 Dealer/Buyer 2106 64.5
## 2 Caretaker/Family/Friend/Partner/Co-User 768 23.5
## 3 <NA> 302 9.25
## 4 Doctor/Patient 88 2.70
freq_table(DIH_DATA, Substance)
## # A tibble: 39 × 3
## Substance n percent
## <chr> <int> <dbl>
## 1 Heroin 944 28.9
## 2 <NA> 617 18.9
## 3 Fentanyl (Analog) 459 14.1
## 4 Fentanyl (Analog) and Heroin 408 12.5
## 5 Prescription Opioid 137 4.20
## 6 Mixed Drugs Not Listed (Comment in Cell) 113 3.46
## 7 Methadone 80 2.45
## 8 Other Drug (Comment in Cell) 68 2.08
## 9 Methamphetamine 66 2.02
## 10 Fentanyl Analog 47 1.44
## # ℹ 29 more rows
Cleaning data as follows…
library(stringr)
library(dplyr)
DIH_DATA <- DIH_DATA %>%
mutate(
Substance = tolower(Substance),
Substance = na_if(Substance, "na"),
Substance_clean = case_when(
str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
str_detect(Substance, "fentanyl") ~ "Fentanyl",
str_detect(Substance, "heroin") ~ "Heroin",
str_detect(Substance, "meth") ~ "Methamphetamine",
str_detect(Substance, "cocaine") ~ "Cocaine",
str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
str_detect(Substance, "xanax|benzo") ~ "Other",
str_detect(Substance, "ecstasy") ~ "Other",
str_detect(Substance, "benadryl") ~ "Other",
str_detect(Substance, "bullskin") ~ NA_character_,
TRUE ~ "Other"
)
)
freq_table(DIH_DATA, Substance_clean)
## # A tibble: 9 × 3
## Substance_clean n percent
## <chr> <int> <dbl>
## 1 Heroin 945 29.0
## 2 Other 707 21.7
## 3 Fentanyl 670 20.5
## 4 Fentanyl + Heroin 429 13.1
## 5 Other Opioid 162 4.96
## 6 Methamphetamine 161 4.93
## 7 Polysubstance 113 3.46
## 8 Cocaine 76 2.33
## 9 <NA> 1 0.0306
bar_plot_cat(DIH_DATA, Substance_clean)
Collapsing Substance below, lmk if you have any issues with the way I
collapsed the categories; Benzos has 10 or less so collapsing with
“other”
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Substance_Group = case_when(
is.na(Substance) ~ NA_character_,
# Polysubstance (multiple drugs mentioned)
str_detect(tolower(Substance), " and |-|multiple") ~ "Polysubstance",
# Fentanyl (very important to isolate)
str_detect(tolower(Substance), "fentanyl") ~ "Fentanyl",
# Heroin / opioids (non-fentanyl)
str_detect(tolower(Substance), "heroin|opioid|methadone|prescription") ~ "Opioid (non-fentanyl)",
# Stimulants
str_detect(tolower(Substance), "methamphetamine|cocaine") ~ "Stimulant",
# Missing / junk
Substance %in% c("Unknown", "Pending") ~ NA_character_,
TRUE ~ "Other"
)
)
freq_table(DIH_DATA, Substance_Group)
## # A tibble: 6 × 3
## Substance_Group n percent
## <chr> <int> <dbl>
## 1 Opioid (non-fentanyl) 1162 35.6
## 2 Polysubstance 674 20.6
## 3 <NA> 623 19.1
## 4 Fentanyl 507 15.5
## 5 Other 200 6.13
## 6 Stimulant 98 3.00
library(dplyr)
library(tidyr)
options(scipen = 999)
numeric_summary <- DIH_DATA %>%
summarise(
across(
c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
list(
nonmissing = ~sum(!is.na(.)),
missing = ~sum(is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
)
)
) %>%
pivot_longer(
everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
) %>%
pivot_wider(names_from = Statistic, values_from = value)
numeric_summary <- numeric_summary %>%
mutate(
min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
)
numeric_summary
## # A tibble: 5 × 8
## Variable nonmissing missing mean sd median min max
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Rural_Code 3152 112 2.47 1.89 2 1 9
## 2 Year_Charged 3264 0 2016. 4.02 2017 1974 2026
## 3 Accused_Age 1040 2224 34.3 10.3 32 17 87
## 4 Deceased_Age 2718 546 29.6 11.5 28 0 91
## 5 Sentence_num 2055 1209 11.5 12.1 8 0.08219178082 180
library(dplyr)
library(purrr)
library(tidyr)
library(gt)
cat_vars <- c(
"State",
"Charge",
"Accused_Race",
"Accused_Ethnicity",
"Accused_Sex",
"Accused_State",
"Court_Type",
"Plea",
"Deceased_Race",
"Deceased_Ethnicity",
"Deceased_Sex",
"Substance",
"Relationship",
"Urbanicity",
"Sentence_Quartile_calc"
)
categorical_summary <- map_dfr(cat_vars, function(v) {
x <- DIH_DATA[[v]]
tab <- sort(table(x, useNA = "no"), decreasing = TRUE)
tibble(
Variable = v,
nonmissing = sum(!is.na(x)),
missing = sum(is.na(x)),
n_categories = dplyr::n_distinct(x, na.rm = TRUE),
most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
)
})
categorical_summary
## # A tibble: 15 × 7
## Variable nonmissing missing n_categories most_common most_common_n
## <chr> <int> <int> <int> <chr> <int>
## 1 State 3264 0 50 Pennsylvan… 578
## 2 Charge 3263 1 31 Drug Deliv… 610
## 3 Accused_Race 2737 527 5 White 2069
## 4 Accused_Ethnicity 883 2381 2 Not Hispan… 736
## 5 Accused_Sex 3244 20 2 Male 2388
## 6 Accused_State 3199 65 59 Pennsylvan… 549
## 7 Court_Type 3179 85 2 State 2789
## 8 Plea 2136 1128 5 Guilty 1781
## 9 Deceased_Race 2092 1172 7 White 1965
## 10 Deceased_Ethnicity 947 2317 3 Not Hispan… 845
## 11 Deceased_Sex 3013 251 3 Male 1918
## 12 Substance 2641 623 36 heroin 945
## 13 Relationship 2962 302 3 Dealer/Buy… 2106
## 14 Urbanicity 3152 112 3 Metropolit… 2525
## 15 Sentence_Quartile_… 2055 1209 4 1 514
## # ℹ 1 more variable: most_common_pct <dbl>
Accused_State has 59 states so it needs to be cleaned if we want to use that variable… not going to bother right now because we aren’t using it
#Bivariate Relationships ##Numeric vs. Numer
cor.test(
DIH_DATA$Sentence_num,
DIH_DATA$Accused_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Accused_Age, :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Sentence_num and DIH_DATA$Accused_Age
## S = 12670082, p-value = 0.005528
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1314717
cor.test(
DIH_DATA$Sentence_num,
DIH_DATA$Deceased_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Deceased_Age, :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Sentence_num and DIH_DATA$Deceased_Age
## S = 930447121, p-value = 0.2802
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.02575955
ggplot(DIH_DATA, aes(x = Accused_Age, y = Sentence_num)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2820 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2820 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(DIH_DATA, aes(x = Deceased_Age, y = Sentence_num)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1505 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1505 rows containing missing values or values outside the scale range
## (`geom_point()`).
outcome variable: • heavily skewed • heteroskedastic (variance increases with age for accused and Spread increases in the middle age range (~20–50) for deceased) • outliers present
This violates assumptions of:
• OLS regression
• Pearson correlation
So used Spearman correlation -- @Katie ... any thoughts on other tests to run here?
Given the skewed distribution of sentence length and the presence of outliers, Spearman’s rank correlation was used.
###Sentence length by Sex
DIH_DATA %>%
group_by(Accused_Sex) %>%
summarise(
n = sum(!is.na(Sentence_num)),
mean_sentence = mean(Sentence_num, na.rm = TRUE),
median_sentence = median(Sentence_num, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Accused_Sex n mean_sentence median_sentence
## <chr> <int> <dbl> <dbl>
## 1 Female 560 9.83 6
## 2 Male 1486 12.2 9
## 3 <NA> 9 6.86 5
ggplot(DIH_DATA, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
geom_boxplot() +
guides(fill = "none")
## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
t.test(Sentence_num ~ Accused_Sex, data = DIH_DATA)
##
## Welch Two Sample t-test
##
## data: Sentence_num by Accused_Sex
## t = -4.1076, df = 1071.3, p-value = 0.00004303
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -3.535442 -1.249629
## sample estimates:
## mean in group Female mean in group Male
## 9.830096 12.222631
Comparison of Sentence Length by Accused Sex
A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -4.11, p < 0.001). • Mean sentence (Female): 9.83 • Mean sentence (Male): 12.22 • Mean difference: approximately 2.39 units higher for males • 95% CI for the difference: [-3.54, -1.25]
These results suggest that, on average, male defendants receive longer sentences than female defendants.
However, several important caveats should be noted: • The distribution of sentence length is highly skewed with substantial outliers, particularly among male defendants. • Because of this skewness, the mean may not fully represent the typical case, and results should be interpreted cautiously. • A nonparametric alternative (e.g., Mann–Whitney U test) or analysis using transformed data (e.g., log(sentence)) may provide a more robust assessment.
Additionally, the boxplot visualization shows: • Greater variability in sentence length among males • More extreme high-value outliers in the male group • Substantial overlap between groups despite the difference in means
Finally, the plotting warning indicates that 1,209 observations were removed due to missing or non-finite values, which may affect representativeness and should be investigated further.
let’s discuss if I should run these additional tests or if we are satisfied with these findings
##Sentence_Quartile Bivariates on full dataset
cor.test(
DIH_DATA$Accused_Age,
DIH_DATA$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Accused_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Accused_Age and DIH_DATA$Sentence_Quartile_calc
## S = 12484370, p-value = 0.00232
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1442022
cor.test(
DIH_DATA$Deceased_Age,
DIH_DATA$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Deceased_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Deceased_Age and DIH_DATA$Sentence_Quartile_calc
## S = 902691085, p-value = 0.8393
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.004839738
###SentQ+Charge
kruskal.test(Sentence_Quartile_calc ~ Charge, data = DIH_DATA)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 258.17, df = 28, p-value <
## 0.00000000000000022
Again this wont mean much until we clean the Charge variable
###SentQ+Relationship Tests whether distributions differ across groups
kruskal.test(Sentence_Quartile_calc ~ Relationship, data = DIH_DATA)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 40.074, df = 2, p-value = 0.000000001986
table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)
##
## 1 2 3 4
## Caretaker/Family/Friend/Partner/Co-User 165 131 116 93
## Dealer/Buyer 291 339 353 363
## Doctor/Patient 5 10 10 22
tab2 <- table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)
chisq.test(tab2)
##
## Pearson's Chi-squared test
##
## data: tab2
## X-squared = 44.016, df = 6, p-value = 0.00000007337
#Analytic Sample
@Katie we will need to include a justification in the methods section if it is not already in there about why we restricted the data to this time frame.
dih_analytic <- DIH_DATA %>%
filter(Year_Charged >= 2011 & Year_Charged <= 2021) %>%
filter(!is.na(Rural_Code))
Hmmmm if we are going to restrict the analytic data sample by years… then perhaps we need to run descriptives here again with the analytic sample to include in the methods section?
library(dplyr)
library(tidyr)
options(scipen = 999)
numeric_summary_B <- dih_analytic %>%
summarise(
across(
c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
list(
nonmissing = ~sum(!is.na(.)),
missing = ~sum(is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
)
)
) %>%
pivot_longer(
everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
) %>%
pivot_wider(names_from = Statistic, values_from = value)
numeric_summary_B <- numeric_summary_B %>%
mutate(
min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
)
numeric_summary_B
## # A tibble: 5 × 8
## Variable nonmissing missing mean sd median min max
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Rural_Code 2838 0 2.49 1.88 2 1 9
## 2 Year_Charged 2838 0 2017. 2.36 2017 2011 2021
## 3 Accused_Age 976 1862 34.3 10.4 32 17 87
## 4 Deceased_Age 2375 463 29.9 11.5 28 0 91
## 5 Sentence_num 1772 1066 11.4 11.8 8 0.08219178082 180
library(dplyr)
library(purrr)
library(tidyr)
library(gt)
categorical_summary2 <- map_dfr(cat_vars, function(v) {
x <- dih_analytic[[v]]
tab <- sort(table(x, useNA = "no"), decreasing = TRUE)
tibble(
Variable = v,
nonmissing = sum(!is.na(x)),
missing = sum(is.na(x)),
n_categories = dplyr::n_distinct(x, na.rm = TRUE),
most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
)
})
categorical_summary2
## # A tibble: 15 × 7
## Variable nonmissing missing n_categories most_common most_common_n
## <chr> <int> <int> <int> <chr> <int>
## 1 State 2838 0 47 Pennsylvan… 553
## 2 Charge 2837 1 30 Drug Deliv… 571
## 3 Accused_Race 2441 397 5 White 1819
## 4 Accused_Ethnicity 829 2009 2 Not Hispan… 697
## 5 Accused_Sex 2823 15 2 Male 2082
## 6 Accused_State 2777 61 58 Pennsylvan… 525
## 7 Court_Type 2760 78 2 State 2436
## 8 Plea 1813 1025 5 Guilty 1497
## 9 Deceased_Race 1867 971 7 White 1750
## 10 Deceased_Ethnicity 873 1965 3 Not Hispan… 781
## 11 Deceased_Sex 2632 206 3 Male 1685
## 12 Substance 2317 521 34 heroin 823
## 13 Relationship 2576 262 3 Dealer/Buy… 1883
## 14 Urbanicity 2838 0 3 Metropolit… 2274
## 15 Sentence_Quartile_… 1772 1066 4 2 457
## # ℹ 1 more variable: most_common_pct <dbl>
##Testing Bivariate Relationships in Analytic Sample - Outcome is Sentence_num
cor.test(
dih_analytic$Sentence_num,
dih_analytic$Accused_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Accused_Age, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Sentence_num and dih_analytic$Accused_Age
## S = 10194801, p-value = 0.003246
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1441686
cor.test(
dih_analytic$Sentence_num,
dih_analytic$Deceased_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Deceased_Age, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Sentence_num and dih_analytic$Deceased_Age
## S = 601445934, p-value = 0.1897
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.03369232
cor.test(
dih_analytic$Sentence_num,
dih_analytic$Year_Charged,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Year_Charged, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Sentence_num and dih_analytic$Year_Charged
## S = 903777791, p-value = 0.285
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.02541011
dih_analytic %>%
group_by(Accused_Sex) %>%
summarise(
n = sum(!is.na(Sentence_num)),
mean_sentence = mean(Sentence_num, na.rm = TRUE),
median_sentence = median(Sentence_num, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Accused_Sex n mean_sentence median_sentence
## <chr> <int> <dbl> <dbl>
## 1 Female 479 10.0 6
## 2 Male 1286 11.9 9
## 3 <NA> 7 7.25 5
ggplot(dih_analytic, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
geom_boxplot() +
guides(fill = "none")
## Warning: Removed 1066 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
t.test(Sentence_num ~ Accused_Sex, data = dih_analytic)
##
## Welch Two Sample t-test
##
## data: Sentence_num by Accused_Sex
## t = -2.9574, df = 831.36, p-value = 0.00319
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -3.1560701 -0.6379943
## sample estimates:
## mean in group Female mean in group Male
## 10.00579 11.90282
Comparison of Sentence Length by Accused Sex (2017–2021 Subset)
A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants for cases from 2017–2021. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -2.96, p = 0.003). • Mean sentence (Female): 10.01 • Mean sentence (Male): 11.90 • Mean difference: approximately 1.9 units higher for males • 95% CI for the difference: [-3.16, -0.64]
These results suggest that, within this time period, male defendants receive longer sentences on average than female defendants.
Additional descriptive statistics support this pattern: • Median sentence (Female): 6 • Median sentence (Male): 9
This indicates that the difference is also present at the median level, not just driven by extreme values.
However, several important considerations apply: • Sentence length remains highly skewed with notable outliers, particularly among male defendants. • Although the mean difference is statistically significant, the magnitude of the difference is modest relative to the overall variability in sentencing. • The boxplot shows substantial overlap between the distributions for males and females. • A nonparametric test (e.g., Mann–Whitney U) or analysis using transformed sentence length may provide a more robust assessment.
Additionally, the plotting warning indicates that 1,066 observations were removed due to missing or non-finite values, which should be examined to ensure no systematic bias in the analyzed subset.
summary(aov(Sentence_num ~ Urbanicity, data = dih_analytic))
## Df Sum Sq Mean Sq F value Pr(>F)
## Urbanicity 2 408 204.0 1.463 0.232
## Residuals 1769 246685 139.4
## 1066 observations deleted due to missingness
summary(aov(Sentence_num ~ Charge, data = dih_analytic))
## Df Sum Sq Mean Sq F value Pr(>F)
## Charge 27 13356 494.7 3.691 0.000000000635 ***
## Residuals 1744 233737 134.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1066 observations deleted due to missingness
This suggest the sentence length varies by charge… but we need information about how charges were grouped before we can run this analysis properly
summary(aov(Sentence_num ~ Relationship, data = dih_analytic))
## Df Sum Sq Mean Sq F value Pr(>F)
## Relationship 2 2048 1024.0 7.471 0.000589 ***
## Residuals 1632 223687 137.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1203 observations deleted due to missingness
##Testing Bivariate Relationships in Analytic Sample: Outcome is Sentencing Quartile.
###SentQ+DecAge
cor.test(
dih_analytic$Accused_Age,
dih_analytic$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Accused_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Accused_Age and dih_analytic$Sentence_Quartile_calc
## S = 10081186, p-value = 0.001687
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1537063
cor.test(
dih_analytic$Deceased_Age,
dih_analytic$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Deceased_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Deceased_Age and dih_analytic$Sentence_Quartile_calc
## S = 582264805, p-value = 0.9775
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.0007261225
###SentQ+Yr
cor.test(
dih_analytic$Year_Charged,
dih_analytic$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Year_Charged,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Year_Charged and dih_analytic$Sentence_Quartile_calc
## S = 904918982, p-value = 0.309
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.02417951
Later years are associated with higher sentence quartiles ρ ≈ 0.28 → moderate (not strong)
Over time, cases are more likely to fall into higher relative sentence bins
###SentQ+Charge
kruskal.test(Sentence_Quartile_calc ~ Charge, data = dih_analytic)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 235.81, df = 27, p-value <
## 0.00000000000000022
Again this wont mean much until we clean the Charge variable
###SentQ+Relationship Tests whether distributions differ across groups
kruskal.test(Sentence_Quartile_calc ~ Relationship, data = dih_analytic)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 27.177, df = 2, p-value = 0.000001255
table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)
##
## 1 2 3 4
## Caretaker/Family/Friend/Partner/Co-User 124 112 91 73
## Dealer/Buyer 265 310 309 318
## Doctor/Patient 4 5 10 14
tab22 <- table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)
chisq.test(tab22)
##
## Pearson's Chi-squared test
##
## data: tab22
## X-squared = 27.92, df = 6, p-value = 0.00009729
###SentQ+Plea
tab_plea <- table(dih_analytic$Plea, dih_analytic$Sentence_Quartile_calc)
tab_plea
##
## 1 2 3 4
## Alford Plea 0 0 1 0
## Guilty 353 341 335 321
## Guilty to Lesser Charge 18 27 11 9
## No Contest 16 8 7 5
## Not Guilty 7 14 16 29
Fixed the issue with pending/awaiting trial showing up in the analytic dataset.
DIH_DATA %>%
filter(
Plea == "Awaiting Plea Hearing" &
(!is.na(Sentence_num) | !is.na(Sentence_Quartile_calc))
) %>%
summarise(n = n())
## # A tibble: 1 × 1
## n
## <int>
## 1 0
DIH_DATA %>%
filter(Plea == "Awaiting Plea Hearing") %>%
count(Sentence_num, Sentence_Quartile_calc, sort = TRUE)
## # A tibble: 0 × 3
## # ℹ 3 variables: Sentence_num <dbl>, Sentence_Quartile_calc <int>, n <int>
DIH_DATA %>%
filter(Plea == "Awaiting Plea Hearing") %>%
count(Sentence, Sentence_Quartile_calc, sort = TRUE)
## # A tibble: 0 × 3
## # ℹ 3 variables: Sentence <chr>, Sentence_Quartile_calc <int>, n <int>
#Logit Model Sent Q Analytic Sample
Decided not to include Race/Ethnicity of deceased because of large missingness in the data (over 2,000 observations missing)
Accused Ethnicity also has large missingness with over 2,000 missing, so rather than use the Race_Ethnicity Variable, I am just using the Accused_Race, which has 397 missing.
I decided to put in a child variable instead of the Deceased Age for theoretical reasons.
#Making Sure Variable is Ordered
dih_analytic <- dih_analytic %>%
mutate(Sentence_Quartile_calc = ordered(Sentence_Quartile_calc))
#refit all models on the same complete-case dataset so ANOVA will work later
model_data <- dih_analytic %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Relationship,
Urbanicity,
Accused_Race,
Accused_Sex,
Court_Type,
Plea,
Substance_Group,
child
) %>%
filter(complete.cases(.))
model_data only has 906 observations….
what if I did this on the full dataset instead of just the analytic?
#refit all models on the same complete-case dataset so ANOVA will work later
model_data2 <- DIH_DATA %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Relationship,
Urbanicity,
Accused_Race,
Accused_Sex,
Court_Type,
Plea,
Substance_Group,
child
) %>%
filter(complete.cases(.))
We get an extra 93 observations…
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model_full <- polr(
as.factor(Sentence_Quartile_calc) ~ Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child,
data = model_data,
Hess = TRUE
)
summary(model_full)
## Call:
## polr(formula = as.factor(Sentence_Quartile_calc) ~ Charge_Group +
## Relationship + Urbanicity + Accused_Race + Accused_Sex +
## Court_Type + Plea + Substance_Group + child, data = model_data,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.806485 1.0139 -0.79546
## Charge_GroupDrug-Induced Homicide 0.122844 0.5722 0.21470
## Charge_GroupManslaughter -0.677037 0.5781 -1.17106
## Charge_GroupMurder 0.739672 0.5962 1.24068
## Charge_GroupNegligent/Reckless -1.023815 0.5885 -1.73982
## Charge_GroupOther 0.445379 0.5787 0.76961
## RelationshipDealer/Buyer 0.140116 0.1546 0.90655
## RelationshipDoctor/Patient 1.150077 0.5622 2.04585
## UrbanicitySuburban 0.075172 0.1875 0.40095
## UrbanicityRural 0.552541 0.2800 1.97365
## Accused_RaceBlack 0.278980 0.7744 0.36024
## Accused_RaceNative American or Alaskan Native -0.689710 1.2418 -0.55541
## Accused_RaceWhite -0.219247 0.7661 -0.28619
## Accused_SexMale 0.265375 0.1440 1.84232
## Court_TypeState -0.755750 0.2445 -3.09081
## PleaGuilty -0.451714 1.5077 -0.29960
## PleaGuilty to Lesser Charge -1.326889 1.5284 -0.86818
## PleaNo Contest -1.248193 1.5550 -0.80268
## PleaNot Guilty 0.536800 1.5355 0.34960
## Substance_GroupOpioid (non-fentanyl) -0.220652 0.1758 -1.25478
## Substance_GroupOther 0.003549 0.3183 0.01115
## Substance_GroupPolysubstance 0.089881 0.1975 0.45519
## Substance_GroupStimulant 0.293040 0.3397 0.86276
## childChild (<18) 0.293834 0.2743 1.07136
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.3100 1.7988 -1.2842
## 2|3 -0.9300 1.7970 -0.5175
## 3|4 0.3683 1.7967 0.2050
##
## Residual Deviance: 2318.509
## AIC: 2372.509
#testing model specification
library(brant)
brant(model_full)
## ------------------------------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------------------------------
## Omnibus 17.11 48 1
## Charge_GroupDistribution/Possession Causing Death 0 2 1
## Charge_GroupDrug-Induced Homicide 0.76 2 0.68
## Charge_GroupManslaughter 0.07 2 0.97
## Charge_GroupMurder 0.26 2 0.88
## Charge_GroupNegligent/Reckless 1.51 2 0.47
## Charge_GroupOther 0.61 2 0.74
## RelationshipDealer/Buyer 3.96 2 0.14
## RelationshipDoctor/Patient 1.62 2 0.45
## UrbanicitySuburban 1.97 2 0.37
## UrbanicityRural 0.58 2 0.75
## Accused_RaceBlack 0.4 2 0.82
## Accused_RaceNative American or Alaskan Native 0 2 1
## Accused_RaceWhite 0.4 2 0.82
## Accused_SexMale 1.57 2 0.46
## Court_TypeState 1.08 2 0.58
## PleaGuilty 0 2 1
## PleaGuilty to Lesser Charge 0 2 1
## PleaNo Contest 0 2 1
## PleaNot Guilty 0 2 1
## Substance_GroupOpioid (non-fentanyl) 3.35 2 0.19
## Substance_GroupOther 2.97 2 0.23
## Substance_GroupPolysubstance 1.75 2 0.42
## Substance_GroupStimulant 6.35 2 0.04
## childChild (<18) 0.14 2 0.93
## ------------------------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(model_full): 201065 combinations in table(dv,ivs) do not
## occur. Because of that, the test results might be invalid.
FAIL to reject the proportional odds assumption
data are too sparse across combinations of predictors and outcome levels
This is an ordinal logistic regression with the proportional odds assumption
It assumes: The effect of each predictor is the same across all cutpoints of the outcome (aka the sentencing quartile). Effect of Charge_Group is the SAME in all 3 comparisons
Going to start dropping variables and comparing model fit.
# Drop one variable at a time
model_no_substance <- update(model_full, . ~ . - Substance_Group)
model_no_urban <- update(model_full, . ~ . - Urbanicity)
model_no_race <- update(model_full, . ~ . - Accused_Race)
model_no_sex <- update(model_full, . ~ . - Accused_Sex)
model_no_court <- update(model_full, . ~ . - Court_Type)
model_no_plea <- update(model_full, . ~ . - Plea)
model_no_child <- update(model_full, . ~ . - child)
model_no_relationship <- update(model_full, . ~ . - Relationship)
model_no_charge <- update(model_full, . ~ . - Charge_Group)
Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
AIC(
model_full,
model_no_substance,
model_no_urban,
model_no_race,
model_no_sex,
model_no_court,
model_no_plea,
model_no_child,
model_no_relationship,
model_no_charge
)
## df AIC
## model_full 27 2372.509
## model_no_substance 23 2370.662
## model_no_urban 25 2372.431
## model_no_race 24 2376.382
## model_no_sex 26 2373.909
## model_no_court 26 2380.190
## model_no_plea 23 2389.495
## model_no_child 26 2371.655
## model_no_relationship 25 2373.027
## model_no_charge 21 2445.748
To evaluate model parsimony and identify predictors contributing meaningfully to model fit, a series of nested ordinal logistic regression models were compared using Akaike Information Criterion (AIC).
anova(model_no_substance, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 883 2324.662
## 2 879 2318.509 1 vs 2 4 6.152442 0.1880494
anova(model_no_urban, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 881 2322.431
## 2 879 2318.509 1 vs 2 2 3.922153 0.1407069
anova(model_no_race, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Urbanicity + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 882 2328.382
## 2 879 2318.509 1 vs 2 3 9.872421 0.01968232
anova(model_no_sex, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Urbanicity + Accused_Race + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 880 2321.909
## 2 879 2318.509 1 vs 2 1 3.400056 0.06519422
anova(model_no_court, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 880 2328.190
## 2 879 2318.509 1 vs 2 1 9.680627 0.001862211
anova(model_no_plea, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 883 2343.495
## 2 879 2318.509 1 vs 2 4 24.98588 0.0000506397
anova(model_no_child, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 880 2319.655
## 2 879 2318.509 1 vs 2 1 1.145306 0.2845339
anova(model_no_relationship,model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 881 2323.027
## 2 879 2318.509 1 vs 2 2 4.517587 0.1044765
anova(model_no_charge, model_full)
## Likelihood ratio tests of ordinal regression models
##
## Response: as.factor(Sentence_Quartile_calc)
## Model
## 1 Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## 2 Charge_Group + Relationship + Urbanicity + Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 885 2403.748
## 2 879 2318.509 1 vs 2 6 85.23858 0.0000000000000003330669
The variables that clearly improve model fit are:
Variables that do not clearly improve fit:
Ok going to run the defensible model then include sensitivity checks…
model_core <- polr(
ordered(Sentence_Quartile_calc) ~
Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data,
Hess = TRUE
)
summary(model_core)
## Call:
## polr(formula = ordered(Sentence_Quartile_calc) ~ Charge_Group +
## Plea + Court_Type + Accused_Race, data = model_data, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.92348 0.9929 -0.93006
## Charge_GroupDrug-Induced Homicide 0.05331 0.5672 0.09399
## Charge_GroupManslaughter -0.68925 0.5745 -1.19976
## Charge_GroupMurder 0.66275 0.5922 1.11904
## Charge_GroupNegligent/Reckless -1.10969 0.5847 -1.89782
## Charge_GroupOther 0.42304 0.5724 0.73907
## PleaGuilty -0.77141 1.4919 -0.51708
## PleaGuilty to Lesser Charge -1.55796 1.5133 -1.02953
## PleaNo Contest -1.46579 1.5395 -0.95215
## PleaNot Guilty 0.27038 1.5220 0.17765
## Court_TypeState -0.87979 0.2390 -3.68081
## Accused_RaceBlack 0.03106 0.7608 0.04082
## Accused_RaceNative American or Alaskan Native -0.57072 1.2293 -0.46427
## Accused_RaceWhite -0.51803 0.7529 -0.68808
##
## Intercepts:
## Value Std. Error t value
## 1|2 -3.3409 1.7604 -1.8978
## 2|3 -1.9856 1.7576 -1.1297
## 3|4 -0.7050 1.7565 -0.4013
##
## Residual Deviance: 2336.71
## AIC: 2370.71
brant(model_core)
## ------------------------------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------------------------------
## Omnibus -67.9 28 1
## Charge_GroupDistribution/Possession Causing Death 0.07 2 0.97
## Charge_GroupDrug-Induced Homicide 0.7 2 0.71
## Charge_GroupManslaughter 0.07 2 0.96
## Charge_GroupMurder 0.32 2 0.85
## Charge_GroupNegligent/Reckless 1.48 2 0.48
## Charge_GroupOther 0.47 2 0.79
## PleaGuilty 0 2 1
## PleaGuilty to Lesser Charge 0 2 1
## PleaNo Contest 0 2 1
## PleaNot Guilty 0 2 1
## Court_TypeState 1.76 2 0.41
## Accused_RaceBlack 0.27 2 0.87
## Accused_RaceNative American or Alaskan Native 0 2 1
## Accused_RaceWhite 0.4 2 0.82
## ------------------------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(model_core): 983 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.
library(MASS)
# Sensitivity 1: add Accused_Sex
model_sens_sex <- update(model_core, . ~ . + Accused_Sex)
# Sensitivity 2: add Relationship
model_sens_relationship <- update(model_core, . ~ . + Relationship)
# Sensitivity 3: add child
model_sens_child <- update(model_core, . ~ . + child)
# Sensitivity 4: add all theoretically interesting covariates
model_sens_all <- update(
model_core,
. ~ . + Accused_Sex + Relationship + child
)
AIC(
model_core,
model_sens_sex,
model_sens_relationship,
model_sens_child,
model_sens_all
)
## df AIC
## model_core 17 2370.710
## model_sens_sex 18 2369.473
## model_sens_relationship 19 2371.068
## model_sens_child 18 2372.214
## model_sens_all 21 2370.847
anova(model_core, model_sens_sex)
## Likelihood ratio tests of ordinal regression models
##
## Response: ordered(Sentence_Quartile_calc)
## Model Resid. df
## 1 Charge_Group + Plea + Court_Type + Accused_Race 889
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Accused_Sex 888
## Resid. Dev Test Df LR stat. Pr(Chi)
## 1 2336.710
## 2 2333.473 1 vs 2 1 3.237434 0.07197326
anova(model_core, model_sens_relationship)
## Likelihood ratio tests of ordinal regression models
##
## Response: ordered(Sentence_Quartile_calc)
## Model Resid. df
## 1 Charge_Group + Plea + Court_Type + Accused_Race 889
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Relationship 887
## Resid. Dev Test Df LR stat. Pr(Chi)
## 1 2336.710
## 2 2333.068 1 vs 2 2 3.642627 0.1618131
anova(model_core, model_sens_child)
## Likelihood ratio tests of ordinal regression models
##
## Response: ordered(Sentence_Quartile_calc)
## Model Resid. df Resid. Dev
## 1 Charge_Group + Plea + Court_Type + Accused_Race 889 2336.710
## 2 Charge_Group + Plea + Court_Type + Accused_Race + child 888 2336.214
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 1 0.496671 0.4809665
anova(model_core, model_sens_all)
## Likelihood ratio tests of ordinal regression models
##
## Response: ordered(Sentence_Quartile_calc)
## Model
## 1 Charge_Group + Plea + Court_Type + Accused_Race
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Accused_Sex + Relationship + child
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 889 2336.710
## 2 885 2328.847 1 vs 2 4 7.863004 0.09672744
Sensitivity models adding Accused_Sex, Relationship, and child did not substantially improve model fit compared with the core model. Although adding Accused_Sex slightly reduced AIC, the likelihood ratio test was marginal, so the more parsimonious core model was retained.
Because complete-case modeling substantially reduced the analytic sample, models were re-estimated using a more parsimonious set of predictors with lower missingness. Variables with substantial missingness were evaluated in sensitivity analyses rather than included in the primary model.
colSums(is.na(dih_analytic %>%
dplyr::select(
Sentence_Quartile_calc, Charge_Group, Plea, Court_Type,
Accused_Race, Accused_Sex, Relationship, child,
Substance_Group, Urbanicity
)))
## Sentence_Quartile_calc Charge_Group Plea
## 1066 7 1025
## Court_Type Accused_Race Accused_Sex
## 78 397 15
## Relationship child Substance_Group
## 262 463 521
## Urbanicity
## 0
model_data_core <- dih_analytic %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Plea,
Court_Type,
Accused_Race
) %>%
filter(complete.cases(.))
model_core2 <- MASS::polr(
Sentence_Quartile_calc ~
Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data_core,
Hess = TRUE
)
summary(model_core2)
## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group +
## Plea + Court_Type + Accused_Race, data = model_data_core,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.30000 0.7810 -0.384108
## Charge_GroupDrug-Induced Homicide 0.03233 0.3852 0.083925
## Charge_GroupManslaughter -0.70066 0.3919 -1.788051
## Charge_GroupMurder 0.58097 0.4087 1.421552
## Charge_GroupNegligent/Reckless -0.59518 0.3999 -1.488226
## Charge_GroupOther 0.42410 0.3841 1.104219
## PleaGuilty -0.80994 1.4857 -0.545172
## PleaGuilty to Lesser Charge -1.41279 1.5015 -0.940891
## PleaNo Contest -1.42517 1.5221 -0.936301
## PleaNot Guilty 0.01495 1.5036 0.009941
## Court_TypeState -1.02898 0.1917 -5.368195
## Accused_RaceBlack 0.12947 0.6514 0.198763
## Accused_RaceIndian American -1.92695 1.6535 -1.165392
## Accused_RaceNative American or Alaskan Native -0.69366 1.1604 -0.597791
## Accused_RaceWhite -0.49990 0.6456 -0.774291
##
## Intercepts:
## Value Std. Error t value
## 1|2 -3.3585 1.6594 -2.0239
## 2|3 -2.1151 1.6577 -1.2759
## 3|4 -0.8777 1.6569 -0.5297
##
## Residual Deviance: 3455.907
## AIC: 3491.907
Plea looks unstable because the reference category is probably Alford Plea with only 1 case. That’s why the standard errors are huge (~1.5). Dropping or recoding Alford Plea to NA, then refit.
model_data_core <- model_data_core %>%
mutate(
Plea = ifelse(Plea == "Alford Plea", NA, Plea)
) %>%
filter(complete.cases(.)) %>%
droplevels()
Also, Accused_Race has tiny categories like Native American; collapsing these before modeling:
model_data_core <- model_data_core %>%
mutate(
Accused_Race = case_when(
Accused_Race %in% c("White", "Black") ~ Accused_Race,
TRUE ~ "Other"
),
Accused_Race = factor(Accused_Race)
)
model_core2 <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data_core,
Hess = TRUE
)
summary(model_core2)
## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group +
## Plea + Court_Type + Accused_Race, data = model_data_core,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.2303 0.7782 -0.2959
## Charge_GroupDrug-Induced Homicide 0.1073 0.3793 0.2828
## Charge_GroupManslaughter -0.6270 0.3862 -1.6238
## Charge_GroupMurder 0.6503 0.4031 1.6135
## Charge_GroupNegligent/Reckless -0.5247 0.3941 -1.3313
## Charge_GroupOther 0.4973 0.3780 1.3154
## PleaGuilty to Lesser Charge -0.6008 0.2345 -2.5618
## PleaNo Contest -0.6142 0.3343 -1.8375
## PleaNot Guilty 0.8200 0.2481 3.3053
## Court_TypeState -1.0305 0.1916 -5.3792
## Accused_RaceOther -0.5091 0.5159 -0.9868
## Accused_RaceWhite -0.6292 0.1224 -5.1401
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.6065 0.3546 -7.3505
## 2|3 -1.3635 0.3495 -3.9008
## 3|4 -0.1305 0.3476 -0.3756
##
## Residual Deviance: 3454.928
## AIC: 3484.928
Key interpretation:
NEXT STEPS: 1. exp(coef(model_core2)) and interpret 2. RUN Model on full dataset (not analytic)
The primary models were estimated using the analytic sample restricted to the study period. As a sensitivity analysis, models were re-estimated using the full dataset to assess whether results were robust to inclusion of earlier cases.
#Logit Model on Full Dataset
Making adjustments to the model to reflect the changes made to plea and accused race above…
model_data_all <- DIH_DATA %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Plea,
Court_Type,
Accused_Race
) %>%
mutate(
Plea = ifelse(Plea == "Alford Plea", NA, Plea),
Accused_Race = case_when(
Accused_Race %in% c("White", "Black") ~ Accused_Race,
TRUE ~ "Other"
),
Accused_Race = factor(Accused_Race),
Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
) %>%
filter(complete.cases(.)) %>%
droplevels()
DIH_DATA <- DIH_DATA %>%
mutate(
Sentence_num = case_when(
Sentence %in% c("Pending", "pending") ~ NA_character_,
TRUE ~ Sentence
),
Sentence_num = as.numeric(Sentence_num)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sentence_num = as.numeric(Sentence_num)`.
## Caused by warning:
## ! NAs introduced by coercion
model_core2_all <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data_all,
Hess = TRUE
)
summary(model_core2_all)
## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group +
## Plea + Court_Type + Accused_Race, data = model_data_all,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.3739 0.6497 -0.5755
## Charge_GroupDrug-Induced Homicide -0.1255 0.2987 -0.4202
## Charge_GroupManslaughter -0.8265 0.3045 -2.7143
## Charge_GroupMurder 0.4005 0.3246 1.2338
## Charge_GroupNegligent/Reckless -0.7391 0.3149 -2.3471
## Charge_GroupOther 0.3819 0.2970 1.2858
## PleaGuilty to Lesser Charge -0.6233 0.2192 -2.8439
## PleaNo Contest -0.5119 0.2838 -1.8039
## PleaNot Guilty 0.8357 0.2282 3.6628
## Court_TypeState -0.9628 0.1542 -6.2450
## Accused_RaceOther -0.3508 0.1526 -2.2991
## Accused_RaceWhite -0.6148 0.1172 -5.2450
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.7595 0.2900 -9.5159
## 2|3 -1.5549 0.2857 -5.4425
## 3|4 -0.3035 0.2830 -1.0722
##
## Residual Deviance: 4623.239
## AIC: 4653.239
nrow(model_data_core)
## [1] 1323
nrow(model_data_all)
## [1] 1773
Looks like we gain about 400 observations from doing the full dataset.
coef(summary(model_core2))
## Value Std. Error
## Charge_GroupDistribution/Possession Causing Death -0.2302897 0.7782069
## Charge_GroupDrug-Induced Homicide 0.1072662 0.3793125
## Charge_GroupManslaughter -0.6270398 0.3861507
## Charge_GroupMurder 0.6503113 0.4030527
## Charge_GroupNegligent/Reckless -0.5247348 0.3941440
## Charge_GroupOther 0.4972592 0.3780309
## PleaGuilty to Lesser Charge -0.6007545 0.2345075
## PleaNo Contest -0.6142136 0.3342613
## PleaNot Guilty 0.8200252 0.2480926
## Court_TypeState -1.0305461 0.1915814
## Accused_RaceOther -0.5090729 0.5158819
## Accused_RaceWhite -0.6292061 0.1224101
## 1|2 -2.6065429 0.3546067
## 2|3 -1.3635059 0.3495413
## 3|4 -0.1305438 0.3475875
## t value
## Charge_GroupDistribution/Possession Causing Death -0.2959235
## Charge_GroupDrug-Induced Homicide 0.2827912
## Charge_GroupManslaughter -1.6238215
## Charge_GroupMurder 1.6134648
## Charge_GroupNegligent/Reckless -1.3313277
## Charge_GroupOther 1.3153931
## PleaGuilty to Lesser Charge -2.5617712
## PleaNo Contest -1.8375254
## PleaNot Guilty 3.3053194
## Court_TypeState -5.3791554
## Accused_RaceOther -0.9868012
## Accused_RaceWhite -5.1401498
## 1|2 -7.3505172
## 2|3 -3.9008431
## 3|4 -0.3755712
coef(summary(model_core2_all))
## Value Std. Error
## Charge_GroupDistribution/Possession Causing Death -0.3739163 0.6496682
## Charge_GroupDrug-Induced Homicide -0.1255201 0.2987132
## Charge_GroupManslaughter -0.8264920 0.3044950
## Charge_GroupMurder 0.4004812 0.3245876
## Charge_GroupNegligent/Reckless -0.7390633 0.3148842
## Charge_GroupOther 0.3818782 0.2969966
## PleaGuilty to Lesser Charge -0.6233344 0.2191799
## PleaNo Contest -0.5119005 0.2837773
## PleaNot Guilty 0.8356956 0.2281593
## Court_TypeState -0.9628109 0.1541731
## Accused_RaceOther -0.3508181 0.1525918
## Accused_RaceWhite -0.6148201 0.1172203
## 1|2 -2.7595002 0.2899890
## 2|3 -1.5548866 0.2856953
## 3|4 -0.3034814 0.2830489
## t value
## Charge_GroupDistribution/Possession Causing Death -0.5755498
## Charge_GroupDrug-Induced Homicide -0.4202027
## Charge_GroupManslaughter -2.7143042
## Charge_GroupMurder 1.2338155
## Charge_GroupNegligent/Reckless -2.3470953
## Charge_GroupOther 1.2857998
## PleaGuilty to Lesser Charge -2.8439399
## PleaNo Contest -1.8038812
## PleaNot Guilty 3.6627727
## Court_TypeState -6.2450013
## Accused_RaceOther -2.2990622
## Accused_RaceWhite -5.2449953
## 1|2 -9.5158779
## 2|3 -5.4424645
## 3|4 -1.0721870
The full-dataset sensitivity model used 1,773 cases compared with 1,323 in the analytic sample. The main findings were largely consistent across both models: not guilty pleas were associated with higher sentence quartiles, guilty-to-lesser pleas with lower quartiles, state cases with lower quartiles than federal cases, and White defendants with lower sentence quartiles relative to the reference group.
In the full dataset, some Charge_Group effects became stronger, especially manslaughter and negligent/reckless charges, but the overall direction of the core legal/process variables remained stable.
Now testing the full dataset with relationship added baxk in…
model_data_all_rel <- DIH_DATA %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Plea,
Court_Type,
Accused_Race,
Relationship
) %>%
mutate(
Plea = ifelse(Plea == "Alford Plea", NA, Plea),
Accused_Race = case_when(
Accused_Race %in% c("White", "Black") ~ Accused_Race,
TRUE ~ "Other"
),
Accused_Race = factor(Accused_Race),
Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
) %>%
filter(complete.cases(.)) %>%
droplevels()
model_core2_all_rel_base <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data_all_rel,
Hess = TRUE
)
model_core2_all_rel <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race + Relationship,
data = model_data_all_rel,
Hess = TRUE
)
summary(model_core2_all_rel)
## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group +
## Plea + Court_Type + Accused_Race + Relationship, data = model_data_all_rel,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.7070 0.6850 -1.0322
## Charge_GroupDrug-Induced Homicide -0.1858 0.3170 -0.5859
## Charge_GroupManslaughter -0.9158 0.3240 -2.8263
## Charge_GroupMurder 0.3407 0.3439 0.9909
## Charge_GroupNegligent/Reckless -0.7932 0.3336 -2.3778
## Charge_GroupOther 0.2840 0.3144 0.9031
## PleaGuilty to Lesser Charge -0.6621 0.2276 -2.9088
## PleaNo Contest -0.4195 0.2886 -1.4536
## PleaNot Guilty 0.7611 0.2422 3.1430
## Court_TypeState -0.9372 0.1656 -5.6585
## Accused_RaceOther -0.2269 0.1611 -1.4087
## Accused_RaceWhite -0.5666 0.1243 -4.5596
## RelationshipDealer/Buyer 0.2222 0.1060 2.0957
## RelationshipDoctor/Patient 0.5751 0.3243 1.7734
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.6356 0.3234 -8.1501
## 2|3 -1.4012 0.3195 -4.3854
## 3|4 -0.1366 0.3170 -0.4309
##
## Residual Deviance: 4253.50
## AIC: 4287.50
AIC(model_core2_all_rel_base, model_core2_all_rel)
## df AIC
## model_core2_all_rel_base 15 4289.635
## model_core2_all_rel 17 4287.500
anova(model_core2_all_rel_base, model_core2_all_rel)
## Likelihood ratio tests of ordinal regression models
##
## Response: Sentence_Quartile_calc
## Model Resid. df
## 1 Charge_Group + Plea + Court_Type + Accused_Race 1623
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Relationship 1621
## Resid. Dev Test Df LR stat. Pr(Chi)
## 1 4259.635
## 2 4253.500 1 vs 2 2 6.134865 0.0465405
nrow(model_data_all_rel)
## [1] 1638
In the full-dataset sensitivity model, adding Relationship modestly improved model fit (LR χ² = 6.13, df = 2, p = .047), with dealer/buyer cases associated with higher sentence quartiles relative to the reference group. This suggests relationship category may matter, although the effect was weaker and less consistent than the core legal-process variables. Significant Charge_GroupManslaughter t = -2.83 # significant Charge_GroupNegligent/Reckless t = -2.38 # significant PleaGuilty to Lesser Charge t = -2.91 # significant PleaNot Guilty t = 3.14 # significant Court_TypeState t = -5.66 # significant Accused_RaceWhite t = -4.56 # significant RelationshipDealer/Buyer t = 2.10 # significant-ish
Not Significant Distribution/Possession Drug-Induced Homicide Murder Other charge No Contest Accused_RaceOther RelationshipDoctor/Patient
For the t value in polr(), the usual rule of thumb is:
model_data_all_child <- DIH_DATA %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Plea,
Court_Type,
Accused_Race,
child
) %>%
mutate(
Plea = ifelse(Plea == "Alford Plea", NA, Plea),
Accused_Race = case_when(
Accused_Race %in% c("White", "Black") ~ Accused_Race,
TRUE ~ "Other"
),
Accused_Race = factor(Accused_Race),
Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
) %>%
filter(complete.cases(.)) %>%
droplevels()
model_all_child_base <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data_all_child,
Hess = TRUE
)
model_all_child <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race + child,
data = model_data_all_child,
Hess = TRUE
)
summary(model_all_child)
## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group +
## Plea + Court_Type + Accused_Race + child, data = model_data_all_child,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.5217 0.7520 -0.69373
## Charge_GroupDrug-Induced Homicide 0.1429 0.3811 0.37486
## Charge_GroupManslaughter -0.6396 0.3850 -1.66129
## Charge_GroupMurder 0.6808 0.4059 1.67718
## Charge_GroupNegligent/Reckless -0.5401 0.3938 -1.37136
## Charge_GroupOther 0.6858 0.3828 1.79167
## PleaGuilty to Lesser Charge -0.5397 0.2428 -2.22304
## PleaNo Contest -0.3160 0.2902 -1.08921
## PleaNot Guilty 0.8989 0.2555 3.51791
## Court_TypeState -0.8934 0.1734 -5.15298
## Accused_RaceOther -0.3589 0.1687 -2.12684
## Accused_RaceWhite -0.5916 0.1258 -4.70273
## childChild (<18) 0.0169 0.1889 0.08947
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.3867 0.3740 -6.3818
## 2|3 -1.1776 0.3706 -3.1780
## 3|4 0.0898 0.3693 0.2431
##
## Residual Deviance: 3983.899
## AIC: 4015.899
AIC(model_all_child_base, model_all_child)
## df AIC
## model_all_child_base 15 4013.907
## model_all_child 16 4015.899
anova(model_all_child_base, model_all_child)
## Likelihood ratio tests of ordinal regression models
##
## Response: Sentence_Quartile_calc
## Model Resid. df Resid. Dev
## 1 Charge_Group + Plea + Court_Type + Accused_Race 1511 3983.907
## 2 Charge_Group + Plea + Court_Type + Accused_Race + child 1510 3983.899
## Test Df LR stat. Pr(Chi)
## 1
## 2 1 vs 2 1 0.008005908 0.9287038
table(model_data_all_child$child)
##
## Adult (18+) Child (<18)
## 1421 105
Adding a child-victim indicator did not improve model fit in the full-dataset sensitivity model and was not associated with sentence quartile after adjustment. Given the small number of child cases in the model sample and the lack of improvement in AIC or likelihood ratio testing, this variable was not retained in the final model.
model_data_core_rel <- dih_analytic %>%
dplyr::select(
Sentence_Quartile_calc,
Charge_Group,
Plea,
Court_Type,
Accused_Race,
Relationship
) %>%
mutate(
Plea = ifelse(Plea == "Alford Plea", NA, Plea),
Accused_Race = case_when(
Accused_Race %in% c("White", "Black") ~ Accused_Race,
TRUE ~ "Other"
),
Accused_Race = factor(Accused_Race),
Sentence_Quartile_calc = ordered(Sentence_Quartile_calc)
) %>%
filter(complete.cases(.)) %>%
droplevels()
model_core2_rel_base <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race,
data = model_data_core_rel,
Hess = TRUE
)
model_core2_rel <- MASS::polr(
Sentence_Quartile_calc ~ Charge_Group + Plea + Court_Type + Accused_Race + Relationship,
data = model_data_core_rel,
Hess = TRUE
)
summary(model_core2_rel)
## Call:
## MASS::polr(formula = Sentence_Quartile_calc ~ Charge_Group +
## Plea + Court_Type + Accused_Race + Relationship, data = model_data_core_rel,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Charge_GroupDistribution/Possession Causing Death -0.3549 0.7063 -0.5025
## Charge_GroupDrug-Induced Homicide -0.1482 0.3379 -0.4387
## Charge_GroupManslaughter -0.9348 0.3463 -2.6993
## Charge_GroupMurder 0.2982 0.3659 0.8150
## Charge_GroupNegligent/Reckless -0.7804 0.3554 -2.1954
## Charge_GroupOther 0.3020 0.3346 0.9028
## PleaGuilty to Lesser Charge -0.6619 0.2362 -2.8017
## PleaNo Contest -0.4348 0.3288 -1.3224
## PleaNot Guilty 0.7651 0.2508 3.0513
## Court_TypeState -0.9456 0.1825 -5.1806
## Accused_RaceOther -0.1132 0.1777 -0.6369
## Accused_RaceWhite -0.6063 0.1299 -4.6662
## RelationshipDealer/Buyer 0.1335 0.1175 1.1360
## RelationshipDoctor/Patient 0.3142 0.3878 0.8102
##
## Intercepts:
## Value Std. Error t value
## 1|2 -2.7100 0.3434 -7.8914
## 2|3 -1.4340 0.3386 -4.2355
## 3|4 -0.1730 0.3359 -0.5152
##
## Residual Deviance: 3621.323
## AIC: 3655.323
AIC(model_core2_rel_base, model_core2_rel)
## df AIC
## model_core2_rel_base 15 3652.930
## model_core2_rel 17 3655.323
anova(model_core2_rel_base, model_core2_rel)
## Likelihood ratio tests of ordinal regression models
##
## Response: Sentence_Quartile_calc
## Model Resid. df
## 1 Charge_Group + Plea + Court_Type + Accused_Race 1381
## 2 Charge_Group + Plea + Court_Type + Accused_Race + Relationship 1379
## Resid. Dev Test Df LR stat. Pr(Chi)
## 1 3622.930
## 2 3621.323 1 vs 2 2 1.607377 0.4476747
nrow(model_data_core_rel)
## [1] 1396
In the analytic sample, adding Relationship did not improve model fit and the relationship categories were not statistically significant predictors of sentence quartile. Although Relationship improved fit modestly in the full-dataset sensitivity model, it was not retained in the primary analytic model.
Note to self This is interesting and makes me wonder what the difference is between the full dataset and the analytic dataset…
#Linear Model
running a linear model with Sentence_num as the outcome, but plain OLS may be shaky because Sentence_num is highly skewed with big outliers.
lm_data_full <- DIH_DATA %>%
dplyr::select(
Sentence_num,
Charge_Group,
Relationship,
Urbanicity,
Accused_Race,
Accused_Sex,
Court_Type,
Plea,
Substance_Group,
child
) %>%
mutate(
Plea = ifelse(Plea == "Alford Plea", NA, Plea),
Accused_Race = case_when(
Accused_Race %in% c("White", "Black") ~ Accused_Race,
TRUE ~ "Other"
),
Accused_Race = factor(Accused_Race)
) %>%
filter(complete.cases(.)) %>%
droplevels()
##full OLS
lm_full <- lm(
Sentence_num ~ Charge_Group + Relationship + Urbanicity +
Accused_Race + Accused_Sex + Court_Type + Plea +
Substance_Group + child,
data = lm_data_full
)
summary(lm_full)
##
## Call:
## lm(formula = Sentence_num ~ Charge_Group + Relationship + Urbanicity +
## Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group +
## child, data = lm_data_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.294 -5.756 -2.273 2.840 171.643
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 13.96682 2.94009 4.750
## Charge_GroupDistribution/Possession Causing Death -1.30623 4.99499 -0.262
## Charge_GroupDrug-Induced Homicide -0.41040 2.73009 -0.150
## Charge_GroupManslaughter -3.79114 2.77920 -1.364
## Charge_GroupMurder 3.13818 2.90087 1.082
## Charge_GroupNegligent/Reckless -2.80906 2.83305 -0.992
## Charge_GroupOther 1.07790 2.74977 0.392
## RelationshipDealer/Buyer 0.76682 0.83050 0.923
## RelationshipDoctor/Patient 7.24098 2.58396 2.802
## UrbanicitySuburban 0.07022 1.01722 0.069
## UrbanicityRural 1.21458 1.47217 0.825
## Accused_RaceOther -1.53658 1.28789 -1.193
## Accused_RaceWhite -2.24824 0.96984 -2.318
## Accused_SexMale 1.21500 0.77854 1.561
## Court_TypeState -2.10031 1.25634 -1.672
## PleaGuilty to Lesser Charge -3.33070 1.66830 -1.996
## PleaNo Contest -3.12979 2.12461 -1.473
## PleaNot Guilty 7.12540 1.82914 3.896
## Substance_GroupOpioid (non-fentanyl) -0.43382 0.96691 -0.449
## Substance_GroupOther 1.45654 1.59582 0.913
## Substance_GroupPolysubstance -0.56932 1.10647 -0.515
## Substance_GroupStimulant 0.02411 1.89557 0.013
## childChild (<18) 3.41459 1.46113 2.337
## Pr(>|t|)
## (Intercept) 0.0000023 ***
## Charge_GroupDistribution/Possession Causing Death 0.793750
## Charge_GroupDrug-Induced Homicide 0.880537
## Charge_GroupManslaughter 0.172811
## Charge_GroupMurder 0.279575
## Charge_GroupNegligent/Reckless 0.321645
## Charge_GroupOther 0.695138
## RelationshipDealer/Buyer 0.356041
## RelationshipDoctor/Patient 0.005163 **
## UrbanicitySuburban 0.944974
## UrbanicityRural 0.409536
## Accused_RaceOther 0.233088
## Accused_RaceWhite 0.020623 *
## Accused_SexMale 0.118900
## Court_TypeState 0.094853 .
## PleaGuilty to Lesser Charge 0.046129 *
## PleaNo Contest 0.141006
## PleaNot Guilty 0.000104 ***
## Substance_GroupOpioid (non-fentanyl) 0.653757
## Substance_GroupOther 0.361589
## Substance_GroupPolysubstance 0.606976
## Substance_GroupStimulant 0.989852
## childChild (<18) 0.019620 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.22 on 1102 degrees of freedom
## Multiple R-squared: 0.0957, Adjusted R-squared: 0.07765
## F-statistic: 5.301 on 22 and 1102 DF, p-value: 0.00000000000005156
##log-sentence model:
lm_data_full <- lm_data_full %>%
mutate(log_sentence = log(Sentence_num))
lm_full_log <- lm(
log_sentence ~ Charge_Group + Relationship + Urbanicity +
Accused_Race + Accused_Sex + Court_Type + Plea +
Substance_Group + child,
data = lm_data_full
)
summary(lm_full_log)
##
## Call:
## lm(formula = log_sentence ~ Charge_Group + Relationship + Urbanicity +
## Accused_Race + Accused_Sex + Court_Type + Plea + Substance_Group +
## child, data = lm_data_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4569 -0.4568 0.0613 0.5605 3.5886
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 2.22514 0.24100 9.233
## Charge_GroupDistribution/Possession Causing Death -0.08361 0.40943 -0.204
## Charge_GroupDrug-Induced Homicide 0.20422 0.22378 0.913
## Charge_GroupManslaughter -0.19413 0.22781 -0.852
## Charge_GroupMurder 0.49091 0.23778 2.065
## Charge_GroupNegligent/Reckless -0.16341 0.23222 -0.704
## Charge_GroupOther 0.30933 0.22540 1.372
## RelationshipDealer/Buyer 0.13902 0.06808 2.042
## RelationshipDoctor/Patient 0.47673 0.21180 2.251
## UrbanicitySuburban 0.01887 0.08338 0.226
## UrbanicityRural 0.11915 0.12067 0.987
## Accused_RaceOther -0.11120 0.10557 -1.053
## Accused_RaceWhite -0.24895 0.07950 -3.132
## Accused_SexMale 0.14029 0.06382 2.198
## Court_TypeState -0.38672 0.10298 -3.755
## PleaGuilty to Lesser Charge -0.35728 0.13675 -2.613
## PleaNo Contest -0.20093 0.17415 -1.154
## PleaNot Guilty 0.47306 0.14993 3.155
## Substance_GroupOpioid (non-fentanyl) -0.10106 0.07926 -1.275
## Substance_GroupOther -0.03781 0.13081 -0.289
## Substance_GroupPolysubstance 0.04251 0.09070 0.469
## Substance_GroupStimulant 0.18146 0.15538 1.168
## childChild (<18) 0.22084 0.11977 1.844
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## Charge_GroupDistribution/Possession Causing Death 0.838225
## Charge_GroupDrug-Induced Homicide 0.361655
## Charge_GroupManslaughter 0.394315
## Charge_GroupMurder 0.039198 *
## Charge_GroupNegligent/Reckless 0.481792
## Charge_GroupOther 0.170226
## RelationshipDealer/Buyer 0.041379 *
## RelationshipDoctor/Patient 0.024594 *
## UrbanicitySuburban 0.820974
## UrbanicityRural 0.323678
## Accused_RaceOther 0.292391
## Accused_RaceWhite 0.001785 **
## Accused_SexMale 0.028136 *
## Court_TypeState 0.000182 ***
## PleaGuilty to Lesser Charge 0.009107 **
## PleaNo Contest 0.248856
## PleaNot Guilty 0.001647 **
## Substance_GroupOpioid (non-fentanyl) 0.202563
## Substance_GroupOther 0.772615
## Substance_GroupPolysubstance 0.639406
## Substance_GroupStimulant 0.243115
## childChild (<18) 0.065465 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9196 on 1102 degrees of freedom
## Multiple R-squared: 0.1529, Adjusted R-squared: 0.136
## F-statistic: 9.04 on 22 and 1102 DF, p-value: < 0.00000000000000022
#Check Sample Size
nrow(lm_data_full)
## [1] 1125
#Check Diagnostics
par(mfrow = c(2, 2))
plot(lm_full)
par(mfrow = c(2, 2))
plot(lm_full_log)
The raw OLS model has assumption problems: the residuals are highly
skewed, the Q-Q plot shows major non-normality, and a few extreme
sentences are very influential. So I would not rely on the raw sentence
model as the main linear model.
The log-sentence model is better. Its diagnostics look more reasonable, and it explains more variance: Raw sentence model: Adjusted R² = .078 Log sentence model: Adjusted R² = .136
Substantively, the log model suggests higher sentences for:
And lower sentences for:
child is marginal in the log model (p = .065), so I would not treat it as a clear predictor, but it is worth noting as suggestive.
Because sentence length was highly skewed with extreme outliers, the raw OLS model showed substantial assumption violations. A log-transformed sentence model fit the data better and produced results generally consistent with the ordinal models, particularly for plea, court type, race, and relationship. Therefore, the linear models are best treated as sensitivity analyses rather than the primary specification.
Some predictors appear significant in the log sentence model but not in the sentence quartile model because the log model preserves more information about sentence length, while the quartile model reduces the outcome to four ordered categories. These findings suggest that some variables may affect the magnitude of sentence length without consistently shifting cases into higher quartiles.
Because quartiling collapses continuous sentence values into four categories, it can obscure meaningful within-quartile differences and reduce statistical power. The log model therefore provides a more sensitive test of factors associated with sentence length, while the quartile model can be retained as a robustness check.
Why:
###Nested Loglinear Model
lm_no_charge <- update(lm_full_log, . ~ . - Charge_Group)
lm_no_relationship <- update(lm_full_log, . ~ . - Relationship)
lm_no_urban <- update(lm_full_log, . ~ . - Urbanicity)
lm_no_race <- update(lm_full_log, . ~ . - Accused_Race)
lm_no_sex <- update(lm_full_log, . ~ . - Accused_Sex)
lm_no_court <- update(lm_full_log, . ~ . - Court_Type)
lm_no_plea <- update(lm_full_log, . ~ . - Plea)
lm_no_substance <- update(lm_full_log, . ~ . - Substance_Group)
lm_no_child <- update(lm_full_log, . ~ . - child)
AIC(
lm_full_log,
lm_no_charge,
lm_no_relationship,
lm_no_urban,
lm_no_race,
lm_no_sex,
lm_no_court,
lm_no_plea,
lm_no_substance,
lm_no_child
)
## df AIC
## lm_full_log 24 3028.897
## lm_no_charge 18 3088.240
## lm_no_relationship 22 3032.548
## lm_no_urban 22 3025.908
## lm_no_race 22 3035.815
## lm_no_sex 23 3031.819
## lm_no_court 23 3041.201
## lm_no_plea 21 3042.154
## lm_no_substance 20 3028.288
## lm_no_child 23 3030.362
anova(lm_no_charge, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Relationship + Urbanicity + Accused_Race + Accused_Sex +
## Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1108 993.03
## 2 1102 932.01 6 61.019 12.025 0.0000000000004068 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm_no_relationship, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Urbanicity + Accused_Race + Accused_Sex +
## Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1104 938.37
## 2 1102 932.01 2 6.3605 3.7603 0.02358 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm_no_urban, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Accused_Race + Accused_Sex +
## Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1104 932.85
## 2 1102 932.01 2 0.83796 0.4954 0.6095
anova(lm_no_race, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Sex +
## Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1104 941.10
## 2 1102 932.01 2 9.0892 5.3735 0.00476 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm_no_sex, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Court_Type + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1103 936.10
## 2 1102 932.01 1 4.087 4.8325 0.02814 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm_no_court, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Plea + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1103 943.94
## 2 1102 932.01 1 11.927 14.102 0.0001822 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm_no_plea, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Substance_Group + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1105 948.10
## 2 1102 932.01 3 16.091 6.3419 0.0002906 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm_no_substance, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + child
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1106 938.16
## 2 1102 932.01 4 6.1437 1.816 0.1234
anova(lm_no_child, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1103 934.89
## 2 1102 932.01 1 2.8755 3.4 0.06546 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Keep in the model:
Do not retain unless theoretically required:
AIC tells the same general story. Dropping Urbanicity and Substance_Group slightly improves or barely changes AIC, so they are good candidates to remove. Dropping child also only modestly worsens AIC and is not conventionally significant.
#simplified model
lm_final_log <- lm(
log_sentence ~ Charge_Group + Relationship + Accused_Race +
Accused_Sex + Court_Type + Plea,
data = lm_data_full
)
summary(lm_final_log)
##
## Call:
## lm(formula = log_sentence ~ Charge_Group + Relationship + Accused_Race +
## Accused_Sex + Court_Type + Plea, data = lm_data_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5232 -0.4506 0.0602 0.5662 3.5260
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 2.24514 0.23265 9.650
## Charge_GroupDistribution/Possession Causing Death -0.10962 0.40931 -0.268
## Charge_GroupDrug-Induced Homicide 0.18322 0.22350 0.820
## Charge_GroupManslaughter -0.19477 0.22736 -0.857
## Charge_GroupMurder 0.49697 0.23729 2.094
## Charge_GroupNegligent/Reckless -0.18811 0.23188 -0.811
## Charge_GroupOther 0.30112 0.22481 1.339
## RelationshipDealer/Buyer 0.11102 0.06569 1.690
## RelationshipDoctor/Patient 0.41970 0.20996 1.999
## Accused_RaceOther -0.12014 0.10496 -1.145
## Accused_RaceWhite -0.24841 0.07890 -3.149
## Accused_SexMale 0.13860 0.06390 2.169
## Court_TypeState -0.39130 0.10297 -3.800
## PleaGuilty to Lesser Charge -0.31520 0.13599 -2.318
## PleaNo Contest -0.17085 0.17381 -0.983
## PleaNot Guilty 0.50398 0.14897 3.383
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## Charge_GroupDistribution/Possession Causing Death 0.788896
## Charge_GroupDrug-Induced Homicide 0.412524
## Charge_GroupManslaughter 0.391823
## Charge_GroupMurder 0.036457 *
## Charge_GroupNegligent/Reckless 0.417411
## Charge_GroupOther 0.180701
## RelationshipDealer/Buyer 0.091283 .
## RelationshipDoctor/Patient 0.045861 *
## Accused_RaceOther 0.252583
## Accused_RaceWhite 0.001685 **
## Accused_SexMale 0.030295 *
## Court_TypeState 0.000152 ***
## PleaGuilty to Lesser Charge 0.020638 *
## PleaNo Contest 0.325846
## PleaNot Guilty 0.000742 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9216 on 1109 degrees of freedom
## Multiple R-squared: 0.1438, Adjusted R-squared: 0.1323
## F-statistic: 12.42 on 15 and 1109 DF, p-value: < 0.00000000000000022
AIC(lm_full_log, lm_final_log)
## df AIC
## lm_full_log 24 3028.897
## lm_final_log 17 3026.825
anova(lm_final_log, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Accused_Race + Accused_Sex +
## Court_Type + Plea
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1109 941.95
## 2 1102 932.01 7 9.9346 1.6781 0.1105
Interpretation:
A simplified log-linear model excluding Urbanicity, Substance_Group, and child provided a slightly better AIC than the full model and did not significantly worsen model fit in the nested model comparison. The final model retained Charge_Group, Relationship, Accused_Race, Accused_Sex, Court_Type, and Plea.
Modeling Rationale: After estimating the full log-linear model, predictors were evaluated using drop-one-variable comparisons. Variables that did not meaningfully improve fit were removed together, and the reduced model was compared against the full model. The reduced model had a slightly lower AIC and did not significantly worsen fit, supporting retention of the more parsimonious specification.
###Colinearity Tests
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
car::vif(lm_final_log)
## GVIF Df GVIF^(1/(2*Df))
## Charge_Group 1.542000 6 1.036749
## Relationship 1.190607 2 1.044581
## Accused_Race 1.191073 2 1.044683
## Accused_Sex 1.100737 1 1.049160
## Court_Type 1.425678 1 1.194018
## Plea 1.105746 3 1.016894
vif_vals <- car::vif(lm_final_log)
vif_vals
## GVIF Df GVIF^(1/(2*Df))
## Charge_Group 1.542000 6 1.036749
## Relationship 1.190607 2 1.044581
## Accused_Race 1.191073 2 1.044683
## Accused_Sex 1.100737 1 1.049160
## Court_Type 1.425678 1 1.194018
## Plea 1.105746 3 1.016894
vif_vals[, "GVIF^(1/(2*Df))"]
## Charge_Group Relationship Accused_Race Accused_Sex Court_Type Plea
## 1.036749 1.044581 1.044683 1.049160 1.194018 1.016894
Rule of thumb:
Multicollinearity was assessed using generalized variance inflation factors. Adjusted GVIF values were all close to 1, indicating no evidence of problematic collinearity among predictors in the final log-linear model.
###Model Diagnostics
par(mfrow = c(2, 2))
plot(lm_final_log)
we have:
The tails deviate from the line, especially lower quantiles.
That means:
OLS does not require perfectly normal residuals for unbiased coefficients. With n > 1000, inference is usually fairly robust.
The key question is whether the model is “good enough,” not perfect.
And compared with the raw model:
That supports the log transformation.
This checks homoskedasticity.
we still have:
But again:
to strengthen the analysis further, we can use robust standard errors.
No obvious Cook’s distance disaster.
Overall Assessment Strengths:
Weaknesses (which are normal and reportable):
Narrative: Because sentence length was highly skewed, analyses were conducted using the natural log of sentence length. Diagnostic plots indicated that the log transformation substantially improved model assumptions, reducing skewness and heteroskedasticity relative to models using raw sentence length. Residual deviations from normality remained modest but were considered acceptable given the large sample size and robustness of linear regression estimates.
###Robust Standard Errors
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
coeftest(
lm_final_log,
vcov = vcovHC(lm_final_log, type = "HC3")
)
##
## t test of coefficients:
##
## Estimate Std. Error t value
## (Intercept) 2.245144 0.294277 7.6294
## Charge_GroupDistribution/Possession Causing Death -0.109618 0.530832 -0.2065
## Charge_GroupDrug-Induced Homicide 0.183217 0.318445 0.5753
## Charge_GroupManslaughter -0.194770 0.323092 -0.6028
## Charge_GroupMurder 0.496971 0.328255 1.5140
## Charge_GroupNegligent/Reckless -0.188106 0.325981 -0.5770
## Charge_GroupOther 0.301125 0.308827 0.9751
## RelationshipDealer/Buyer 0.111017 0.070158 1.5824
## RelationshipDoctor/Patient 0.419699 0.199195 2.1070
## Accused_RaceOther -0.120144 0.097332 -1.2344
## Accused_RaceWhite -0.248411 0.072323 -3.4348
## Accused_SexMale 0.138596 0.068523 2.0226
## Court_TypeState -0.391304 0.095821 -4.0837
## PleaGuilty to Lesser Charge -0.315203 0.134271 -2.3475
## PleaNo Contest -0.170848 0.158813 -1.0758
## PleaNot Guilty 0.503975 0.136168 3.7011
## Pr(>|t|)
## (Intercept) 0.00000000000005062 ***
## Charge_GroupDistribution/Possession Causing Death 0.8364359
## Charge_GroupDrug-Induced Homicide 0.5651718
## Charge_GroupManslaughter 0.5467432
## Charge_GroupMurder 0.1303166
## Charge_GroupNegligent/Reckless 0.5640249
## Charge_GroupOther 0.3297425
## RelationshipDealer/Buyer 0.1138459
## RelationshipDoctor/Patient 0.0353440 *
## Accused_RaceOther 0.2173269
## Accused_RaceWhite 0.0006150 ***
## Accused_SexMale 0.0433506 *
## Court_TypeState 0.00004750706714125 ***
## PleaGuilty to Lesser Charge 0.0190738 *
## PleaNo Contest 0.2822593
## PleaNot Guilty 0.0002252 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RelationshipDoctor/Patient p = .035 Accused_RaceWhite p < .001 Accused_SexMale p = .043 Court_TypeState p < .001 PleaGuilty to Lesser Charge p = .019 PleaNot Guilty p < .001
No longer significant: Charge_GroupMurder RelationshipDealer/Buyer
Summary: After using heteroskedasticity-robust standard errors, the associations for plea, court type, accused race, accused sex, and doctor/patient relationship remained statistically significant. Some charge-group and dealer/buyer relationship effects were attenuated, suggesting those findings are less robust.
###Correcting Reference Groups & Re-running everything
coef_table <- data.frame(
term = names(coef(lm_final_log)),
estimate = coef(lm_final_log),
percent_change = (exp(coef(lm_final_log)) - 1) * 100
)
coef_table
## term
## (Intercept) (Intercept)
## Charge_GroupDistribution/Possession Causing Death Charge_GroupDistribution/Possession Causing Death
## Charge_GroupDrug-Induced Homicide Charge_GroupDrug-Induced Homicide
## Charge_GroupManslaughter Charge_GroupManslaughter
## Charge_GroupMurder Charge_GroupMurder
## Charge_GroupNegligent/Reckless Charge_GroupNegligent/Reckless
## Charge_GroupOther Charge_GroupOther
## RelationshipDealer/Buyer RelationshipDealer/Buyer
## RelationshipDoctor/Patient RelationshipDoctor/Patient
## Accused_RaceOther Accused_RaceOther
## Accused_RaceWhite Accused_RaceWhite
## Accused_SexMale Accused_SexMale
## Court_TypeState Court_TypeState
## PleaGuilty to Lesser Charge PleaGuilty to Lesser Charge
## PleaNo Contest PleaNo Contest
## PleaNot Guilty PleaNot Guilty
## estimate percent_change
## (Intercept) 2.2451438 844.17732
## Charge_GroupDistribution/Possession Causing Death -0.1096183 -10.38239
## Charge_GroupDrug-Induced Homicide 0.1832172 20.10753
## Charge_GroupManslaughter -0.1947702 -17.69762
## Charge_GroupMurder 0.4969711 64.37351
## Charge_GroupNegligent/Reckless -0.1881061 -17.14732
## Charge_GroupOther 0.3011248 35.13780
## RelationshipDealer/Buyer 0.1110171 11.74140
## RelationshipDoctor/Patient 0.4196992 52.15038
## Accused_RaceOther -0.1201441 -11.32073
## Accused_RaceWhite -0.2484112 -21.99609
## Accused_SexMale 0.1385962 14.86602
## Court_TypeState -0.3913044 -32.38257
## PleaGuilty to Lesser Charge -0.3152035 -27.03596
## PleaNo Contest -0.1708477 -15.70501
## PleaNot Guilty 0.5039755 65.52888
(exp(0.4197) - 1) * 100 # Doctor/Patient
## [1] 52.1505
(exp(-0.2484) - 1) * 100 # White
## [1] -21.99521
(exp(0.1386) - 1) * 100 # Male
## [1] 14.86645
(exp(-0.3913) - 1) * 100 # State
## [1] -32.38227
(exp(-0.3152) - 1) * 100 # Guilty to lesser charge
## [1] -27.03571
(exp(0.5040) - 1) * 100 # Not guilty
## [1] 65.53294
#Checking Reference Groups
levels(lm_data_full$Relationship)
## NULL
levels(lm_data_full$Accused_Race)
## [1] "Black" "Other" "White"
levels(lm_data_full$Accused_Sex)
## NULL
levels(lm_data_full$Court_Type)
## NULL
levels(lm_data_full$Plea)
## NULL
levels(lm_data_full$Charge_Group)
## NULL
unique(lm_data_full$Relationship)
## [1] "Caretaker/Family/Friend/Partner/Co-User"
## [2] "Dealer/Buyer"
## [3] "Doctor/Patient"
unique(lm_data_full$Accused_Sex)
## [1] "Female" "Male"
unique(lm_data_full$Court_Type)
## [1] "State" "Federal"
unique(lm_data_full$Plea)
## [1] "Guilty" "No Contest"
## [3] "Guilty to Lesser Charge" "Not Guilty"
unique(lm_data_full$Charge_Group)
## [1] "Manslaughter"
## [2] "Drug-Induced Homicide"
## [3] "Conspiracy"
## [4] "Murder"
## [5] "Other"
## [6] "Negligent/Reckless"
## [7] "Distribution/Possession Causing Death"
#explicitly convert and set reference groups:
lm_data_full <- lm_data_full %>%
mutate(
Relationship = factor(Relationship),
Accused_Sex = factor(Accused_Sex),
Court_Type = factor(Court_Type),
Plea = factor(Plea),
Charge_Group = factor(Charge_Group)
)
levels(lm_data_full$Relationship)
## [1] "Caretaker/Family/Friend/Partner/Co-User"
## [2] "Dealer/Buyer"
## [3] "Doctor/Patient"
levels(lm_data_full$Accused_Sex)
## [1] "Female" "Male"
levels(lm_data_full$Court_Type)
## [1] "Federal" "State"
levels(lm_data_full$Plea)
## [1] "Guilty" "Guilty to Lesser Charge"
## [3] "No Contest" "Not Guilty"
levels(lm_data_full$Charge_Group)
## [1] "Conspiracy"
## [2] "Distribution/Possession Causing Death"
## [3] "Drug-Induced Homicide"
## [4] "Manslaughter"
## [5] "Murder"
## [6] "Negligent/Reckless"
## [7] "Other"
NOW I need to rerun the final model again with the correct reference categories…
lm_final_log <- lm(
log_sentence ~
Charge_Group +
Relationship +
Accused_Race +
Accused_Sex +
Court_Type +
Plea,
data = lm_data_full
)
summary(lm_final_log)
##
## Call:
## lm(formula = log_sentence ~ Charge_Group + Relationship + Accused_Race +
## Accused_Sex + Court_Type + Plea, data = lm_data_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5232 -0.4506 0.0602 0.5662 3.5260
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 2.24514 0.23265 9.650
## Charge_GroupDistribution/Possession Causing Death -0.10962 0.40931 -0.268
## Charge_GroupDrug-Induced Homicide 0.18322 0.22350 0.820
## Charge_GroupManslaughter -0.19477 0.22736 -0.857
## Charge_GroupMurder 0.49697 0.23729 2.094
## Charge_GroupNegligent/Reckless -0.18811 0.23188 -0.811
## Charge_GroupOther 0.30112 0.22481 1.339
## RelationshipDealer/Buyer 0.11102 0.06569 1.690
## RelationshipDoctor/Patient 0.41970 0.20996 1.999
## Accused_RaceOther -0.12014 0.10496 -1.145
## Accused_RaceWhite -0.24841 0.07890 -3.149
## Accused_SexMale 0.13860 0.06390 2.169
## Court_TypeState -0.39130 0.10297 -3.800
## PleaGuilty to Lesser Charge -0.31520 0.13599 -2.318
## PleaNo Contest -0.17085 0.17381 -0.983
## PleaNot Guilty 0.50398 0.14897 3.383
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## Charge_GroupDistribution/Possession Causing Death 0.788896
## Charge_GroupDrug-Induced Homicide 0.412524
## Charge_GroupManslaughter 0.391823
## Charge_GroupMurder 0.036457 *
## Charge_GroupNegligent/Reckless 0.417411
## Charge_GroupOther 0.180701
## RelationshipDealer/Buyer 0.091283 .
## RelationshipDoctor/Patient 0.045861 *
## Accused_RaceOther 0.252583
## Accused_RaceWhite 0.001685 **
## Accused_SexMale 0.030295 *
## Court_TypeState 0.000152 ***
## PleaGuilty to Lesser Charge 0.020638 *
## PleaNo Contest 0.325846
## PleaNot Guilty 0.000742 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9216 on 1109 degrees of freedom
## Multiple R-squared: 0.1438, Adjusted R-squared: 0.1323
## F-statistic: 12.42 on 15 and 1109 DF, p-value: < 0.00000000000000022
library(lmtest)
library(sandwich)
coeftest(
lm_final_log,
vcov = vcovHC(lm_final_log, type = "HC3")
)
##
## t test of coefficients:
##
## Estimate Std. Error t value
## (Intercept) 2.245144 0.294277 7.6294
## Charge_GroupDistribution/Possession Causing Death -0.109618 0.530832 -0.2065
## Charge_GroupDrug-Induced Homicide 0.183217 0.318445 0.5753
## Charge_GroupManslaughter -0.194770 0.323092 -0.6028
## Charge_GroupMurder 0.496971 0.328255 1.5140
## Charge_GroupNegligent/Reckless -0.188106 0.325981 -0.5770
## Charge_GroupOther 0.301125 0.308827 0.9751
## RelationshipDealer/Buyer 0.111017 0.070158 1.5824
## RelationshipDoctor/Patient 0.419699 0.199195 2.1070
## Accused_RaceOther -0.120144 0.097332 -1.2344
## Accused_RaceWhite -0.248411 0.072323 -3.4348
## Accused_SexMale 0.138596 0.068523 2.0226
## Court_TypeState -0.391304 0.095821 -4.0837
## PleaGuilty to Lesser Charge -0.315203 0.134271 -2.3475
## PleaNo Contest -0.170848 0.158813 -1.0758
## PleaNot Guilty 0.503975 0.136168 3.7011
## Pr(>|t|)
## (Intercept) 0.00000000000005062 ***
## Charge_GroupDistribution/Possession Causing Death 0.8364359
## Charge_GroupDrug-Induced Homicide 0.5651718
## Charge_GroupManslaughter 0.5467432
## Charge_GroupMurder 0.1303166
## Charge_GroupNegligent/Reckless 0.5640249
## Charge_GroupOther 0.3297425
## RelationshipDealer/Buyer 0.1138459
## RelationshipDoctor/Patient 0.0353440 *
## Accused_RaceOther 0.2173269
## Accused_RaceWhite 0.0006150 ***
## Accused_SexMale 0.0433506 *
## Court_TypeState 0.00004750706714125 ***
## PleaGuilty to Lesser Charge 0.0190738 *
## PleaNo Contest 0.2822593
## PleaNot Guilty 0.0002252 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####VIF / multicollinearity
library(car)
car::vif(lm_final_log)
## GVIF Df GVIF^(1/(2*Df))
## Charge_Group 1.542000 6 1.036749
## Relationship 1.190607 2 1.044581
## Accused_Race 1.191073 2 1.044683
## Accused_Sex 1.100737 1 1.049160
## Court_Type 1.425678 1 1.194018
## Plea 1.105746 3 1.016894
vif_vals <- car::vif(lm_final_log)
vif_vals[, "GVIF^(1/(2*Df))"]
## Charge_Group Relationship Accused_Race Accused_Sex Court_Type Plea
## 1.036749 1.044581 1.044683 1.049160 1.194018 1.016894
####Diagnostics Plot
par(mfrow = c(2,2))
plot(lm_final_log)
###AIC comparison with the larger/full model
AIC(lm_full_log, lm_final_log)
## df AIC
## lm_full_log 24 3028.897
## lm_final_log 17 3026.825
###Nested model comparison
anova(lm_final_log, lm_full_log)
## Analysis of Variance Table
##
## Model 1: log_sentence ~ Charge_Group + Relationship + Accused_Race + Accused_Sex +
## Court_Type + Plea
## Model 2: log_sentence ~ Charge_Group + Relationship + Urbanicity + Accused_Race +
## Accused_Sex + Court_Type + Plea + Substance_Group + child
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1109 941.95
## 2 1102 932.01 7 9.9346 1.6781 0.1105
###Percent-change interpretation table
coef_table <- data.frame(
term = names(coef(lm_final_log)),
estimate = coef(lm_final_log),
percent_change = (exp(coef(lm_final_log)) - 1) * 100
)
coef_table
## term
## (Intercept) (Intercept)
## Charge_GroupDistribution/Possession Causing Death Charge_GroupDistribution/Possession Causing Death
## Charge_GroupDrug-Induced Homicide Charge_GroupDrug-Induced Homicide
## Charge_GroupManslaughter Charge_GroupManslaughter
## Charge_GroupMurder Charge_GroupMurder
## Charge_GroupNegligent/Reckless Charge_GroupNegligent/Reckless
## Charge_GroupOther Charge_GroupOther
## RelationshipDealer/Buyer RelationshipDealer/Buyer
## RelationshipDoctor/Patient RelationshipDoctor/Patient
## Accused_RaceOther Accused_RaceOther
## Accused_RaceWhite Accused_RaceWhite
## Accused_SexMale Accused_SexMale
## Court_TypeState Court_TypeState
## PleaGuilty to Lesser Charge PleaGuilty to Lesser Charge
## PleaNo Contest PleaNo Contest
## PleaNot Guilty PleaNot Guilty
## estimate percent_change
## (Intercept) 2.2451438 844.17732
## Charge_GroupDistribution/Possession Causing Death -0.1096183 -10.38239
## Charge_GroupDrug-Induced Homicide 0.1832172 20.10753
## Charge_GroupManslaughter -0.1947702 -17.69762
## Charge_GroupMurder 0.4969711 64.37351
## Charge_GroupNegligent/Reckless -0.1881061 -17.14732
## Charge_GroupOther 0.3011248 35.13780
## RelationshipDealer/Buyer 0.1110171 11.74140
## RelationshipDoctor/Patient 0.4196992 52.15038
## Accused_RaceOther -0.1201441 -11.32073
## Accused_RaceWhite -0.2484112 -21.99609
## Accused_SexMale 0.1385962 14.86602
## Court_TypeState -0.3913044 -32.38257
## PleaGuilty to Lesser Charge -0.3152035 -27.03596
## PleaNo Contest -0.1708477 -15.70501
## PleaNot Guilty 0.5039755 65.52888
###confidence intervals
confint(lm_final_log)
## 2.5 % 97.5 %
## (Intercept) 1.78865806 2.70162955
## Charge_GroupDistribution/Possession Causing Death -0.91273621 0.69349957
## Charge_GroupDrug-Induced Homicide -0.25531088 0.62174528
## Charge_GroupManslaughter -0.64087949 0.25133919
## Charge_GroupMurder 0.03137742 0.96256487
## Charge_GroupNegligent/Reckless -0.64307802 0.26686581
## Charge_GroupOther -0.13998336 0.74223295
## RelationshipDealer/Buyer -0.01786457 0.23989871
## RelationshipDoctor/Patient 0.00772919 0.83166921
## Accused_RaceOther -0.32608143 0.08579329
## Accused_RaceWhite -0.40321562 -0.09360686
## Accused_SexMale 0.01321998 0.26397245
## Court_TypeState -0.59334216 -0.18926669
## PleaGuilty to Lesser Charge -0.58202866 -0.04837829
## PleaNo Contest -0.51188347 0.17018802
## PleaNot Guilty 0.21168700 0.79626399
exponentiated
exp(confint(lm_final_log))
## 2.5 % 97.5 %
## (Intercept) 5.9814204 14.9039988
## Charge_GroupDistribution/Possession Causing Death 0.4014243 2.0007049
## Charge_GroupDrug-Induced Homicide 0.7746756 1.8621752
## Charge_GroupManslaughter 0.5268289 1.2857461
## Charge_GroupMurder 1.0318749 2.6184037
## Charge_GroupNegligent/Reckless 0.5256719 1.3058652
## Charge_GroupOther 0.8693727 2.1006209
## RelationshipDealer/Buyer 0.9822941 1.2711204
## RelationshipDoctor/Patient 1.0077591 2.2971500
## Accused_RaceOther 0.7217464 1.0895811
## Accused_RaceWhite 0.6681680 0.9106407
## Accused_SexMale 1.0133077 1.3020923
## Court_TypeState 0.5524777 0.8275658
## PleaGuilty to Lesser Charge 0.5587637 0.9527733
## PleaNo Contest 0.5993656 1.1855277
## PleaNot Guilty 1.2357610 2.2172418
###Clean Model Tables & Interpretion
library(broom)
tidy(lm_final_log, conf.int = TRUE)
## # A tibble: 16 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.25 0.233 9.65 3.25e-21 1.79 2.70
## 2 Charge_GroupDistrib… -0.110 0.409 -0.268 7.89e- 1 -0.913 0.693
## 3 Charge_GroupDrug-In… 0.183 0.223 0.820 4.13e- 1 -0.255 0.622
## 4 Charge_GroupManslau… -0.195 0.227 -0.857 3.92e- 1 -0.641 0.251
## 5 Charge_GroupMurder 0.497 0.237 2.09 3.65e- 2 0.0314 0.963
## 6 Charge_GroupNeglige… -0.188 0.232 -0.811 4.17e- 1 -0.643 0.267
## 7 Charge_GroupOther 0.301 0.225 1.34 1.81e- 1 -0.140 0.742
## 8 RelationshipDealer/… 0.111 0.0657 1.69 9.13e- 2 -0.0179 0.240
## 9 RelationshipDoctor/… 0.420 0.210 2.00 4.59e- 2 0.00773 0.832
## 10 Accused_RaceOther -0.120 0.105 -1.14 2.53e- 1 -0.326 0.0858
## 11 Accused_RaceWhite -0.248 0.0789 -3.15 1.68e- 3 -0.403 -0.0936
## 12 Accused_SexMale 0.139 0.0639 2.17 3.03e- 2 0.0132 0.264
## 13 Court_TypeState -0.391 0.103 -3.80 1.52e- 4 -0.593 -0.189
## 14 PleaGuilty to Lesse… -0.315 0.136 -2.32 2.06e- 2 -0.582 -0.0484
## 15 PleaNo Contest -0.171 0.174 -0.983 3.26e- 1 -0.512 0.170
## 16 PleaNot Guilty 0.504 0.149 3.38 7.42e- 4 0.212 0.796
with robust SE
library(broom)
robust_results <- broom::tidy(
coeftest(
lm_final_log,
vcov = vcovHC(lm_final_log, type = "HC3")
)
)
robust_results
## # A tibble: 16 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.25 0.294 7.63 5.06e-14
## 2 Charge_GroupDistribution/Possession Ca… -0.110 0.531 -0.207 8.36e- 1
## 3 Charge_GroupDrug-Induced Homicide 0.183 0.318 0.575 5.65e- 1
## 4 Charge_GroupManslaughter -0.195 0.323 -0.603 5.47e- 1
## 5 Charge_GroupMurder 0.497 0.328 1.51 1.30e- 1
## 6 Charge_GroupNegligent/Reckless -0.188 0.326 -0.577 5.64e- 1
## 7 Charge_GroupOther 0.301 0.309 0.975 3.30e- 1
## 8 RelationshipDealer/Buyer 0.111 0.0702 1.58 1.14e- 1
## 9 RelationshipDoctor/Patient 0.420 0.199 2.11 3.53e- 2
## 10 Accused_RaceOther -0.120 0.0973 -1.23 2.17e- 1
## 11 Accused_RaceWhite -0.248 0.0723 -3.43 6.15e- 4
## 12 Accused_SexMale 0.139 0.0685 2.02 4.34e- 2
## 13 Court_TypeState -0.391 0.0958 -4.08 4.75e- 5
## 14 PleaGuilty to Lesser Charge -0.315 0.134 -2.35 1.91e- 2
## 15 PleaNo Contest -0.171 0.159 -1.08 2.82e- 1
## 16 PleaNot Guilty 0.504 0.136 3.70 2.25e- 4
The final log-linear model indicated that several predictors were associated with sentence length after adjustment. Compared with caretaker/family/friend/partner/co-user cases, doctor/patient cases were associated with approximately 52% longer sentences (β = 0.420, p = .035). Male defendants received approximately 15% longer sentences than female defendants (β = 0.139, p = .043). White defendants received approximately 22% shorter sentences than Black defendants (β = -0.248, p < .001). State court cases were associated with approximately 32% shorter sentences than federal cases (β = -0.391, p < .001). Compared with guilty pleas, guilty pleas to a lesser charge were associated with approximately 27% shorter sentences (β = -0.315, p = .019), while not guilty pleas were associated with approximately 66% longer sentences (β = 0.504, p < .001).
Because the dependent variable was log-transformed, exponentiated coefficients can be interpreted as multiplicative changes in sentence length. Confidence intervals that do not cross 1 indicate statistically significant associations. In the final model, doctor/patient cases were associated with longer sentences than caretaker/family/friend/partner/co-user cases, with sentences estimated to be between 1.01 and 2.30 times as long. White defendants received shorter sentences than Black defendants, with sentences estimated to be about 0.67 to 0.91 times as long, or roughly 9% to 33% shorter. Male defendants received longer sentences than female defendants, with sentences estimated to be 1.01 to 1.30 times as long. State court cases received shorter sentences than federal cases, with sentences estimated to be 0.55 to 0.83 times as long, or about 17% to 45% shorter. Guilty pleas to a lesser charge were associated with shorter sentences than guilty pleas, with sentences estimated to be 0.56 to 0.95 times as long, while not guilty pleas were associated with longer sentences, with sentences estimated to be 1.24 to 2.22 times as long.
Model Diagnostics:
Variance inflation factors (VIFs) indicated no evidence of problematic multicollinearity among predictors. All adjusted GVIF values were well below conventional thresholds for concern (all GVIF^(1/(2*Df)) < 1.2), suggesting the included predictors were not highly correlated.
Diagnostic plots suggested acceptable model fit following log transformation of sentence length. Residual variance was substantially improved compared with the untransformed model, though minor deviations from normality remained in the tails. Robust standard errors were therefore used to account for potential heteroskedasticity. No evidence of problematic multicollinearity or influential observations was detected.
Model Selection: The simplified log-linear model had a slightly lower AIC than the larger model (3026.83 vs. 3028.90), indicating a modest improvement in parsimony. A nested model comparison showed that removing Urbanicity, Substance_Group, and child did not significantly worsen model fit (F = 1.68, p = .111). Therefore, the reduced model was retained as the final specification.
library(modelsummary)
modelsummary(
lm_final_log,
exponentiate = TRUE
)
| (1) | |
|---|---|
| (Intercept) | 9.442 |
| (2.197) | |
| Charge_GroupDistribution/Possession Causing Death | 0.896 |
| (0.367) | |
| Charge_GroupDrug-Induced Homicide | 1.201 |
| (0.268) | |
| Charge_GroupManslaughter | 0.823 |
| (0.187) | |
| Charge_GroupMurder | 1.644 |
| (0.390) | |
| Charge_GroupNegligent/Reckless | 0.829 |
| (0.192) | |
| Charge_GroupOther | 1.351 |
| (0.304) | |
| RelationshipDealer/Buyer | 1.117 |
| (0.073) | |
| RelationshipDoctor/Patient | 1.522 |
| (0.319) | |
| Accused_RaceOther | 0.887 |
| (0.093) | |
| Accused_RaceWhite | 0.780 |
| (0.062) | |
| Accused_SexMale | 1.149 |
| (0.073) | |
| Court_TypeState | 0.676 |
| (0.070) | |
| PleaGuilty to Lesser Charge | 0.730 |
| (0.099) | |
| PleaNo Contest | 0.843 |
| (0.147) | |
| PleaNot Guilty | 1.655 |
| (0.247) | |
| Num.Obs. | 1125 |
| R2 | 0.144 |
| R2 Adj. | 0.132 |
| AIC | 3026.8 |
| BIC | 3112.3 |
| Log.Lik. | -1496.412 |
| F | 12.422 |
| RMSE | 0.92 |
modelsummary(
lm_final_log,
vcov = "HC3",
exponentiate = TRUE
)
| (1) | |
|---|---|
| (Intercept) | 9.442 |
| (2.778) | |
| Charge_GroupDistribution/Possession Causing Death | 0.896 |
| (0.476) | |
| Charge_GroupDrug-Induced Homicide | 1.201 |
| (0.382) | |
| Charge_GroupManslaughter | 0.823 |
| (0.266) | |
| Charge_GroupMurder | 1.644 |
| (0.540) | |
| Charge_GroupNegligent/Reckless | 0.829 |
| (0.270) | |
| Charge_GroupOther | 1.351 |
| (0.417) | |
| RelationshipDealer/Buyer | 1.117 |
| (0.078) | |
| RelationshipDoctor/Patient | 1.522 |
| (0.303) | |
| Accused_RaceOther | 0.887 |
| (0.086) | |
| Accused_RaceWhite | 0.780 |
| (0.056) | |
| Accused_SexMale | 1.149 |
| (0.079) | |
| Court_TypeState | 0.676 |
| (0.065) | |
| PleaGuilty to Lesser Charge | 0.730 |
| (0.098) | |
| PleaNo Contest | 0.843 |
| (0.134) | |
| PleaNot Guilty | 1.655 |
| (0.225) | |
| Num.Obs. | 1125 |
| R2 | 0.144 |
| R2 Adj. | 0.132 |
| AIC | 3026.8 |
| BIC | 3112.3 |
| Log.Lik. | -1496.412 |
| F | 15.759 |
| RMSE | 0.92 |
| Std.Errors | HC3 |
library(pandoc)
modelsummary(
lm_final_log,
vcov = "HC3",
exponentiate = TRUE,
output = "table1.docx"
)
#Other Relationships
“Do cases involving women tend to occur when the deceased is younger?”
wilcox.test(Deceased_Age ~ Accused_Sex, data = DIH_DATA)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Deceased_Age by Accused_Sex
## W = 761528, p-value = 0.06442
## alternative hypothesis: true location shift is not equal to 0
DIH_DATA %>%
group_by(Accused_Sex) %>%
summarise(
n = sum(!is.na(Deceased_Age)),
mean_age = mean(Deceased_Age, na.rm = TRUE),
median_age = median(Deceased_Age, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Accused_Sex n mean_age median_age
## <chr> <int> <dbl> <dbl>
## 1 Female 741 29.8 30
## 2 Male 1965 29.6 28
## 3 <NA> 12 24 23
ggplot(DIH_DATA, aes(x = Accused_Sex, y = Deceased_Age, fill = Accused_Sex)) +
geom_boxplot() +
guides(fill = "none")
## Warning: Removed 546 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
“Are younger victims more likely to involve female defendants?”
glm(I(Accused_Sex == "Female") ~ Deceased_Age,
data = DIH_DATA,
family = binomial)
##
## Call: glm(formula = I(Accused_Sex == "Female") ~ Deceased_Age, family = binomial,
## data = DIH_DATA)
##
## Coefficients:
## (Intercept) Deceased_Age
## -1.011494 0.001221
##
## Degrees of Freedom: 2705 Total (i.e. Null); 2704 Residual
## (558 observations deleted due to missingness)
## Null Deviance: 3177
## Residual Deviance: 3177 AIC: 3181
There is no strong evidence that cases involving female defendants are associated with younger (or older) victims.
##Child cases
child_data <- DIH_DATA %>%
filter(!is.na(age_group))
child_data %>%
group_by(age_group, Accused_Sex) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(age_group) %>%
mutate(prop = n / sum(n))
## # A tibble: 23 × 4
## # Groups: age_group [10]
## age_group Accused_Sex n prop
## <fct> <chr> <int> <dbl>
## 1 0–1 Female 30 0.625
## 2 0–1 Male 18 0.375
## 3 2–3 Female 22 0.688
## 4 2–3 Male 10 0.312
## 5 4–5 Female 6 0.667
## 6 4–5 Male 3 0.333
## 7 6–7 Female 1 0.167
## 8 6–7 Male 5 0.833
## 9 8–9 Female 2 0.667
## 10 8–9 Male 1 0.333
## # ℹ 13 more rows
ggplot(child_data, aes(x = age_group, fill = Accused_Sex)) +
geom_bar(position = "fill") +
labs(
x = "Child Age Group",
y = "Proportion",
fill = "Accused Sex"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This is pretty striking.
Going to change the age groupings
DIH_DATA <- DIH_DATA %>%
mutate(
age_group2 = case_when(
Deceased_Age <= 5 ~ "0–5",
Deceased_Age <= 12 ~ "6–12",
Deceased_Age <= 17 ~ "13–17",
TRUE ~ NA_character_
)
)
DIH_DATA %>%
filter(!is.na(age_group2)) %>%
group_by(age_group2, Accused_Sex) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(age_group2) %>%
mutate(percent = 100 * n / sum(n))
## # A tibble: 7 × 4
## # Groups: age_group2 [3]
## age_group2 Accused_Sex n percent
## <chr> <chr> <int> <dbl>
## 1 0–5 Female 58 65.2
## 2 0–5 Male 31 34.8
## 3 13–17 Female 20 23.0
## 4 13–17 Male 64 73.6
## 5 13–17 <NA> 3 3.45
## 6 6–12 Female 9 50
## 7 6–12 Male 9 50
tab <- DIH_DATA %>%
filter(!is.na(age_group2)) %>%
count(age_group2, Accused_Sex) %>%
tidyr::pivot_wider(
names_from = Accused_Sex,
values_from = n,
values_fill = 0
)
tab_test <- table(
DIH_DATA$age_group2,
DIH_DATA$Accused_Sex
)
tab_test
##
## Female Male
## 0–5 58 31
## 13–17 20 64
## 6–12 9 9
chisq.test(tab_test)
##
## Pearson's Chi-squared test
##
## data: tab_test
## X-squared = 29.963, df = 2, p-value = 0.0000003116
chisq.test(tab_test)$expected
##
## Female Male
## 0–5 40.539267 48.460733
## 13–17 38.261780 45.738220
## 6–12 8.198953 9.801047
No expected counts are less than 0 so we are good.
DIH_DATA <- DIH_DATA %>%
mutate(female = ifelse(Accused_Sex == "Female", 1, 0))
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:modelsummary':
##
## Format, Mean, Median, N, SD, Var
## The following object is masked from 'package:car':
##
## Recode
CochranArmitageTest(
table(DIH_DATA$female, DIH_DATA$age_group2)
)
##
## Cochran-Armitage test for trend
##
## data: table(DIH_DATA$female, DIH_DATA$age_group2)
## Z = 3.7259, dim = 3, p-value = 0.0001946
## alternative hypothesis: two.sided
model <- glm(
I(Accused_Sex == "Female") ~ age_group2,
data = DIH_DATA,
family = binomial
)
summary(model)
##
## Call:
## glm(formula = I(Accused_Sex == "Female") ~ age_group2, family = binomial,
## data = DIH_DATA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6265 0.2225 2.816 0.00487 **
## age_group213–17 -1.7896 0.3393 -5.274 0.000000133 ***
## age_group26–12 -0.6265 0.5213 -1.202 0.22945
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 263.27 on 190 degrees of freedom
## Residual deviance: 232.22 on 188 degrees of freedom
## (3073 observations deleted due to missingness)
## AIC: 238.22
##
## Number of Fisher Scoring iterations: 4
Are younger victims more likely to involve female defendants? (Age Group Analysis)
Cases were grouped into three age categories (0–5, 6–12, 13–17), and the distribution of defendant sex across these groups was examined.
Descriptive results show a clear pattern: • Ages 0–5: • Female defendants: 65.2% • Male defendants: 34.8% • Ages 6–12: • Female defendants: 50.0% • Male defendants: 50.0% • Ages 13–17: • Female defendants: 23.0% • Male defendants: 73.6%
This suggests that female defendants are substantially overrepresented in cases involving the youngest victims (ages 0–5), while male defendants dominate cases involving older minors (13–17).
A chi-square test of independence confirms that this association is statistically significant: • χ²(2) = 29.96, p < 0.001
This indicates that the distribution of defendant sex differs meaningfully across age groups.
Trend Analysis
A Cochran–Armitage trend test was also conducted to assess whether there is a systematic trend across ordered age categories. Results from the logistic regression model (with age group as a predictor) further support this: • Compared to ages 0–5 (reference group), cases involving ages 13–17 show a large and statistically significant decrease in the likelihood of a female defendant (p < 0.001) • The 6–12 group does not differ significantly from the youngest group (p = 0.23)
This indicates a strong downward trend:
As victim age increases, the likelihood that the defendant is female decreases substantially, particularly when comparing very young children (0–5) to older adolescents (13–17).
Though I stopped the analysis here – we would need to consider whether to include other control variables in this regression if we ant to report it
##Sex & Relationship
tab_rel_sex <- table(DIH_DATA$Relationship, DIH_DATA$Accused_Sex)
tab_rel_sex
##
## Female Male
## Caretaker/Family/Friend/Partner/Co-User 304 459
## Dealer/Buyer 461 1637
## Doctor/Patient 13 75
chisq.test(tab_rel_sex)
##
## Pearson's Chi-squared test
##
## data: tab_rel_sex
## X-squared = 98.285, df = 2, p-value < 0.00000000000000022
chisq.test(tab_rel_sex)$expected
##
## Female Male
## Caretaker/Family/Friend/Partner/Co-User 201.29332 561.70668
## Dealer/Buyer 553.49067 1544.50933
## Doctor/Patient 23.21601 64.78399
No cells less than 5
prop.table(tab_rel_sex, margin = 1)
##
## Female Male
## Caretaker/Family/Friend/Partner/Co-User 0.3984273 0.6015727
## Dealer/Buyer 0.2197331 0.7802669
## Doctor/Patient 0.1477273 0.8522727
ggplot(DIH_DATA, aes(x = Relationship, fill = Accused_Sex)) +
geom_bar(position = "fill") +
labs(
y = "Proportion",
fill = "Accused Sex"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
##Urbancity & State
table(DIH_DATA$Urbanicity, DIH_DATA$State)
##
## Alabama Alaska Arizona Arkansas California Colorado Connecticut
## Metropolitan 9 0 13 1 76 14 0
## Suburban 3 0 0 0 3 10 0
## Rural 0 0 0 0 2 1 0
##
## Delaware Florida Georgia Idaho Illinois Indiana Iowa Kansas
## Metropolitan 3 121 18 5 238 44 23 3
## Suburban 0 3 2 0 26 9 3 1
## Rural 0 0 1 2 26 0 2 3
##
## Kentucky Louisana Louisiana Maine Maryland Massachusetts
## Metropolitan 25 0 47 5 46 26
## Suburban 1 0 3 3 7 0
## Rural 6 0 0 6 0 1
##
## Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada
## Metropolitan 72 84 1 18 8 1 14
## Suburban 13 24 1 9 3 0 2
## Rural 13 5 0 10 0 0 0
##
## New Hampshire New Jersey New Mexico New York North Carolina
## Metropolitan 26 108 3 90 74
## Suburban 12 0 0 13 30
## Rural 0 0 0 6 5
##
## North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island
## Metropolitan 14 296 17 17 466 2
## Suburban 1 49 8 0 88 0
## Rural 2 6 2 0 23 0
##
## South Carolina South Dakota Tennessee Texas Utah Vermont
## Metropolitan 13 3 69 23 13 1
## Suburban 2 2 7 4 0 8
## Rural 0 1 2 0 4 5
##
## Virginia Washington West Virginia Wisconsin Wyoming
## Metropolitan 45 54 18 252 6
## Suburban 7 17 5 67 1
## Rural 4 0 7 32 3
ggplot(DIH_DATA, aes(x = State, fill = Urbanicity)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))