library(googlesheets4)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(scales)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 4.0.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(ggplot2)
#Data isnt full cleaned so loading everything as text and then will specify column type
library(googlesheets4)
library(dplyr)
DIH_DATA <- read_sheet(
"https://docs.google.com/spreadsheets/d/1OS4-QPBoffGKsxfqPzW3LyLRoxga2j3wLYPPxXtnUHw/edit?usp=sharing",
col_types = "c"
)
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for
## 'taleedsabawi@gmail.com'.
## ✔ Reading from "DIH Data 2026".
## ✔ Range 'Data'.
#Assigning columns manually to data type
DIH_DATA$Year_Charged <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code <- as.numeric(DIH_DATA$Rural_Code)
# Dates
#These first dates are in excel serial data structure...
DIH_DATA$`Date Incident` <- as.Date(as.numeric(DIH_DATA$`Date Incident`), origin = "1899-12-30")
## Warning in as.Date(as.numeric(DIH_DATA$`Date Incident`), origin =
## "1899-12-30"): NAs introduced by coercion
DIH_DATA$`Date Charge` <- as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30")
## Warning in as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30"):
## NAs introduced by coercion
# Ages
DIH_DATA$Accused_Age <- as.numeric(DIH_DATA$Accused_Age)
## Warning: NAs introduced by coercion
DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)
## Warning: NAs introduced by coercion
#checking data structure
str(DIH_DATA)
## tibble [3,264 × 31] (S3: tbl_df/tbl/data.frame)
## $ ID : chr [1:3264] "1678" "2622" "1514" "320" ...
## $ URL : chr [1:3264] "http://www.nydailynews.com/news/crime/pain-relieving-patch-kills-harlem-girl-6-foster-mother-charged-article-1.328165" "http://whdh.com/news/woman-boyfriend-charged-in-fatal-cocaine-overdose-of-ohio-boy-9/" "http://www.capitalgazette.com/news/for_the_record/ac-cn-delvalle-overdose-verdict-20180620-story.html" "http://www.thisweeknews.com/news/20180719/fourth-person-expected-to-be-charged-in-connection-with-grandview-ove"| __truncated__ ...
## $ Court : chr [1:3264] NA NA "Baltimore County Court" NA ...
## $ Rural_Code : num [1:3264] 1 2 1 1 1 1 1 5 7 2 ...
## $ County_Fips : chr [1:3264] "36061" "39099" "24005" "39049" ...
## $ County_FullName : chr [1:3264] "New York County" "Mahoning County" "Baltimore County" "Franklin County" ...
## $ County_Name : chr [1:3264] "New York" "Mahoning" "Baltimore" "Franklin" ...
## $ State : chr [1:3264] "New York" "Ohio" "Maryland" "Ohio" ...
## $ Year_Charged : num [1:3264] 2004 2018 2018 2018 2011 ...
## $ Date Charge : Date[1:3264], format: NA NA ...
## $ Date Incident : Date[1:3264], format: NA NA ...
## $ Accused_Name : chr [1:3264] "Joanne Alvarez" "Raenell Allen" "Jason Patton Baker" "Benjamin Bussey" ...
## $ Sentence : chr [1:3264] NA NA NA NA ...
## $ Sentence Quartile : chr [1:3264] NA NA NA NA ...
## $ Charge : chr [1:3264] "Reckless/Negligent Homicide" "Involuntary Manslaughter" "Manslaughter" "Involuntary Manslaughter" ...
## $ Accused_Race : chr [1:3264] NA "White" "White" "White" ...
## $ Accused_Ethnicity : chr [1:3264] "Hispanic/Latino" NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
## $ Accused_Sex : chr [1:3264] "Female" "Female" "Male" "Male" ...
## $ Accused_Age : num [1:3264] NA NA 46 19 NA NA NA NA NA NA ...
## $ Accused_City : chr [1:3264] "East Harlem" "Youngstown" "Millersville" "Dublin" ...
## $ Accused_State : chr [1:3264] "New York" "Ohio" "Pennsylvania" "Ohio" ...
## $ Court_Type : chr [1:3264] "State" "State" "State" "State" ...
## $ Plea : chr [1:3264] "Guilty" NA "Not Guilty" "Guilty to a Lesser Charge" ...
## $ Deceased_Name : chr [1:3264] "Taylor Webster" NA "Josiah Christopher Klaes" "Haleah Myers" ...
## $ Deceased_Race : chr [1:3264] "White" NA "White" "White" ...
## $ Deceased_Ethnicity: chr [1:3264] NA NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
## $ Deceased_Age : num [1:3264] 6 9 16 17 17 18 18 18 18 18 ...
## $ Deceased_Sex : chr [1:3264] "Female" "Male" "Male" "Female" ...
## $ Substance : chr [1:3264] "Fentanyl (Analog)" NA "Fentanyl (Analog)" "Fentanyl (Analog)" ...
## $ Lawyer : chr [1:3264] NA NA "do'connell@opd.state.md.us" NA ...
## $ Relationship : chr [1:3264] "Caretaker/Family/Friend/Partner/Co-User" "Caretaker/Family/Friend/Partner/Co-User" "Dealer/Buyer" "Caretaker/Family/Friend/Partner/Co-User" ...
#Looking to see what Sentence contains
sort(unique(DIH_DATA$Sentence))
## [1] "0.08219178082" "0.0833" "0.08333333333" "0.1052511416"
## [5] "0.1616438356" "0.1666666667" "0.2191780822" "0.25"
## [9] "0.325" "0.333" "0.3333333333" "0.375"
## [13] "0.4" "0.405" "0.4166666667" "0.5"
## [17] "0.5726027397" "0.5833333333" "0.6666666667" "0.75"
## [21] "0.822" "0.8333333333" "0.9166666667" "0.9583333333"
## [25] "1" "1.002" "1.25" "1.3"
## [29] "1.333333333" "1.416666667" "1.5" "1.583"
## [33] "1.66" "1.666666667" "1.7" "1.75"
## [37] "1.8" "1.916666667" "1.917" "1.92"
## [41] "1.97" "10" "10.16" "10.41"
## [45] "10.5" "10.67" "10.8" "10.83"
## [49] "10.83333333" "11" "11.1" "11.16666667"
## [53] "11.25" "11.5" "11.83" "11.9"
## [57] "113" "12" "12.25" "12.5"
## [61] "12.75" "124" "13" "13.33"
## [65] "13.5" "14" "14.5" "143"
## [69] "15" "15.5" "15.67" "16"
## [73] "16.5" "17" "17.5" "18"
## [77] "18.25" "18.66666667" "180" "19"
## [81] "19.25" "19.58333333" "2" "2.25"
## [85] "2.333333333" "2.4" "2.5" "2.75"
## [89] "20" "20.5" "20.83333333" "21"
## [93] "22" "22.5" "23" "24"
## [97] "25" "25.5" "26" "26.83"
## [101] "27" "27.08333333" "28" "29"
## [105] "3" "3.08" "3.083333333" "3.25"
## [109] "3.3" "3.33" "3.333333333" "3.4"
## [113] "3.416666667" "3.5" "3.666666667" "3.83"
## [117] "3.833333333" "3.92" "30" "30.41666667"
## [121] "31" "32" "33" "33.25"
## [125] "34" "35" "36" "37"
## [129] "38" "4" "4.083333333" "4.17"
## [133] "4.25" "4.416666667" "4.5" "4.666666667"
## [137] "4.75" "4.8" "4.83" "4.833"
## [141] "4.833333333" "4.9" "40" "42"
## [145] "46" "47" "48" "48.33"
## [149] "5" "5.06" "5.25" "5.5"
## [153] "5.666666667" "5.75" "5.83" "5.916666667"
## [157] "5.92" "50" "52" "54"
## [161] "55" "6" "6.083333333" "6.166666667"
## [165] "6.25" "6.5" "6.67" "6.7"
## [169] "6.75" "60" "62" "63"
## [173] "65" "7" "7.25" "7.33"
## [177] "7.5" "7.666666667" "72" "75"
## [181] "8" "8.08" "8.33" "8.333"
## [185] "8.333333333" "8.5" "8.67" "84"
## [189] "9" "9.33" "9.5" "9.917"
## [193] "90" "96" "Life" "pending"
## [197] "Pending"
I have manually gone into the data and created a new variable called “Sentence_num” – which makes Sentence a continuous numeric variable. Just for visualization sake…
For now I will drop anything that says “pending” or “Pending” from the dataset and replace them with NA
I changed all “life” sentences to 100 years, just for th
DIH_DATA <- DIH_DATA %>%
mutate(
Sentence_num = na_if(Sentence, "Pending"),
Sentence_num = na_if(Sentence, "pending"),
Sentence_num = as.numeric(Sentence)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sentence_num = as.numeric(Sentence)`.
## Caused by warning:
## ! NAs introduced by coercion
#make sentence quart var
DIH_DATA <- DIH_DATA %>%
mutate(
Sentence_Quartile_calc = ntile(Sentence_num, 4)
)
library(dplyr)
library(stringr)
summary(DIH_DATA$Sentence_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.08219 4.00000 8.00000 11.54717 15.00000 180.00000 1209
sum(!is.na(DIH_DATA$Sentence))
## [1] 2348
sum(!is.na(DIH_DATA$Sentence_num))
## [1] 2055
All observations 3,364 Observations with sentencing data 2,055
#General Summary of All Variables
summary(DIH_DATA)
## ID URL Court Rural_Code
## Length:3264 Length:3264 Length:3264 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Mode :character Median :2.000
## Mean :2.474
## 3rd Qu.:3.000
## Max. :9.000
## NA's :112
## County_Fips County_FullName County_Name State
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Year_Charged Date Charge Date Incident Accused_Name
## Min. :1974 Min. :NA Min. :NA Length:3264
## 1st Qu.:2015 1st Qu.:NA 1st Qu.:NA Class :character
## Median :2017 Median :NA Median :NA Mode :character
## Mean :2016 Mean :NaN Mean :NaN
## 3rd Qu.:2018 3rd Qu.:NA 3rd Qu.:NA
## Max. :2026 Max. :NA Max. :NA
## NA's :3264 NA's :3264
## Sentence Sentence Quartile Charge Accused_Race
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Accused_Ethnicity Accused_Sex Accused_Age Accused_City
## Length:3264 Length:3264 Min. :17.00 Length:3264
## Class :character Class :character 1st Qu.:27.00 Class :character
## Mode :character Mode :character Median :32.00 Mode :character
## Mean :34.34
## 3rd Qu.:40.00
## Max. :87.00
## NA's :2224
## Accused_State Court_Type Plea Deceased_Name
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Deceased_Race Deceased_Ethnicity Deceased_Age Deceased_Sex
## Length:3264 Length:3264 Min. : 0.00 Length:3264
## Class :character Class :character 1st Qu.:22.25 Class :character
## Mode :character Mode :character Median :28.00 Mode :character
## Mean :29.62
## 3rd Qu.:35.00
## Max. :91.00
## NA's :546
## Substance Lawyer Relationship Sentence_num
## Length:3264 Length:3264 Length:3264 Min. : 0.08219
## Class :character Class :character Class :character 1st Qu.: 4.00000
## Mode :character Mode :character Mode :character Median : 8.00000
## Mean : 11.54717
## 3rd Qu.: 15.00000
## Max. :180.00000
## NA's :1209
## Sentence_Quartile_calc
## Min. :1.000
## 1st Qu.:1.500
## Median :2.000
## Mean :2.499
## 3rd Qu.:3.000
## Max. :4.000
## NA's :1209
Missingness
colSums(is.na(DIH_DATA))
## ID URL Court
## 0 103 999
## Rural_Code County_Fips County_FullName
## 112 0 5
## County_Name State Year_Charged
## 2 0 0
## Date Charge Date Incident Accused_Name
## 3264 3264 7
## Sentence Sentence Quartile Charge
## 916 3264 1
## Accused_Race Accused_Ethnicity Accused_Sex
## 461 2286 17
## Accused_Age Accused_City Accused_State
## 2224 181 65
## Court_Type Plea Deceased_Name
## 72 907 522
## Deceased_Race Deceased_Ethnicity Deceased_Age
## 1103 2198 546
## Deceased_Sex Substance Lawyer
## 220 617 2459
## Relationship Sentence_num Sentence_Quartile_calc
## 276 1209 1209
#Creating Freq table function to reuse later
freq_table <- function(data, var) {
data %>%
count({{ var }}) %>%
mutate(percent = n / sum(n) * 100) %>%
arrange(desc(n))
}
#Creating Bar chart function for categorical variables to reuse later
library(ggplot2)
library(dplyr)
bar_plot_cat <- function(data, var) {
ggplot(data, aes(x = {{ var }}, fill = {{ var }})) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
x = deparse(substitute(var)),
y = "Count",
title = paste("Distribution of", deparse(substitute(var)))
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
}
#Creating histogram function
hist_plot <- function(data, var) {
ggplot(data, aes(x = {{ var }})) +
geom_histogram(bins = 30, fill = "steelblue") +
scale_x_continuous(breaks = seq(0, 200, by = 10)) + # 👈 more numbers
labs(
x = deparse(substitute(var)),
y = "Count",
title = paste("Distribution of", deparse(substitute(var)))
)
}
#creating density plot function
density_plot <- function(data, var) {
ggplot(data, aes(x = {{ var }})) +
geom_density(fill = "steelblue", alpha = 0.5) +
labs(
x = deparse(substitute(var)),
y = "Density",
title = paste("Density of", deparse(substitute(var)))
)
}
hist_plot(DIH_DATA, Sentence_num)
## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_bin()`).
Will need to explore missingness to see if it is random.
#📅Date Charged
freq_table (DIH_DATA, Year_Charged)
## # A tibble: 34 × 3
## Year_Charged n percent
## <dbl> <int> <dbl>
## 1 2017 560 17.2
## 2 2018 532 16.3
## 3 2016 384 11.8
## 4 2019 326 9.99
## 5 2015 266 8.15
## 6 2020 263 8.06
## 7 2014 213 6.53
## 8 2013 120 3.68
## 9 2021 99 3.03
## 10 2012 88 2.70
## # ℹ 24 more rows
library(dplyr)
library(ggplot2)
DIH_DATA %>%
filter(!is.na(Year_Charged)) %>%
mutate(Year_Charged = as.numeric(Year_Charged)) %>%
count(Year_Charged) %>%
ggplot(aes(x = Year_Charged, y = n)) +
geom_line() +
labs(
title = "Number of Cases Over Time",
x = "Year",
y = "Count"
)
#👤Accused characteristics ## Accused Age
density_plot(DIH_DATA, Accused_Age)
## Warning: Removed 2224 rows containing non-finite outside the scale range
## (`stat_density()`).
## • Accused_Ethnicity
freq_table(DIH_DATA,Accused_Ethnicity)
## # A tibble: 7 × 3
## Accused_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2286 70.0
## 2 Not Hispanic/Latino 734 22.5
## 3 Hispanic/Latino 147 4.50
## 4 NA 94 2.88
## 5 Not HIspanic/Latino 1 0.0306
## 6 White 1 0.0306
## 7 not Hispanic/Latino 1 0.0306
Cleaning typos in data…
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Ethnicity = str_to_title(Accused_Ethnicity), # fix casing
Accused_Ethnicity = str_replace_all(Accused_Ethnicity, "Hispanic", "Hispanic"), # normalize
Accused_Ethnicity = na_if(Accused_Ethnicity, "Na"), # fix "NA"
Accused_Ethnicity = case_when(
Accused_Ethnicity %in% c("Not Hispanic/Latino", "Not Hispanic/Latino") ~ "Not Hispanic/Latino",
Accused_Ethnicity %in% c("Hispanic/Latino") ~ "Hispanic/Latino",
Accused_Ethnicity == "White" ~ NA_character_, # wrong variable → set to NA
TRUE ~ Accused_Ethnicity
)
)
bar_plot_cat(DIH_DATA, Accused_Ethnicity)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
bar_plot_cat(DIH_DATA, Accused_Sex)
freq_table(DIH_DATA, Accused_Sex)
## # A tibble: 5 × 3
## Accused_Sex n percent
## <chr> <int> <dbl>
## 1 Male 2388 73.2
## 2 Female 853 26.1
## 3 <NA> 17 0.521
## 4 NA 3 0.0919
## 5 feMale 3 0.0919
Fixing typos in the data again…
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Sex = str_to_title(Accused_Sex), # fixes feMale → Female
Accused_Sex = na_if(Accused_Sex, "Na") # fixes "NA" string → real NA
)
freq_table(DIH_DATA, Accused_Race)
## # A tibble: 12 × 3
## Accused_Race n percent
## <chr> <int> <dbl>
## 1 White 2067 63.3
## 2 Black 641 19.6
## 3 <NA> 461 14.1
## 4 NA 37 1.13
## 5 Unknown POC 25 0.766
## 6 Asian 16 0.490
## 7 Native American or Alaskan Native 10 0.306
## 8 Hispanic/Latino 3 0.0919
## 9 Arabic 1 0.0306
## 10 Indian American 1 0.0306
## 11 Manufacture, Delivery, or Possession With Intent to Manufactur… 1 0.0306
## 12 white 1 0.0306
“white” → should be “White” • “NA” → should be real NA • “Arabic” → you already decided → “White” • that long drug charge string → clearly wrong variable → should be NA
• "Unknown POC" → not a usable category → ❌
• "Hispanic/Latino" → ethnicity, not race → ❌
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Race = na_if(Accused_Race, "NA"),
Accused_Race = case_when(
Accused_Race %in% c("white", "White") ~ "White",
Accused_Race %in% c("black", "Black") ~ "Black",
Accused_Race == "Arabic" ~ "White",
Accused_Race %in% c("Unknown POC", "Hispanic/Latino", "X") ~ NA_character_,
str_detect(Accused_Race, "Manufacture") ~ NA_character_,
TRUE ~ Accused_Race
)
)
freq_table(DIH_DATA, Accused_Race)
## # A tibble: 6 × 3
## Accused_Race n percent
## <chr> <int> <dbl>
## 1 White 2069 63.4
## 2 Black 641 19.6
## 3 <NA> 527 16.1
## 4 Asian 16 0.490
## 5 Native American or Alaskan Native 10 0.306
## 6 Indian American 1 0.0306
bar_plot_cat(DIH_DATA, Accused_Race)
##Race/Ethnicity Variable
DIH_DATA <- DIH_DATA %>%
mutate(
Race_Ethnicity = case_when(
Accused_Ethnicity == "Hispanic/Latino" ~ "Hispanic",
Accused_Race == "White" ~ "White (Non-Hispanic)",
Accused_Race == "Black" ~ "Black (Non-Hispanic)",
Accused_Race == "Asian" ~ "Asian",
Accused_Race == "Native American or Alaskan Native" ~ "Native American",
TRUE ~ NA_character_
)
)
DIH_DATA <- DIH_DATA %>%
mutate(
Race_Ethnicity = case_when(
Race_Ethnicity %in% c("Asian", "Native American") ~ "Other",
TRUE ~ Race_Ethnicity
)
)
freq_table(DIH_DATA, Race_Ethnicity)
## # A tibble: 5 × 3
## Race_Ethnicity n percent
## <chr> <int> <dbl>
## 1 White (Non-Hispanic) 1993 61.1
## 2 Black (Non-Hispanic) 629 19.3
## 3 <NA> 472 14.5
## 4 Hispanic 147 4.50
## 5 Other 23 0.705
bar_plot_cat(DIH_DATA, Race_Ethnicity)
#👤Deceased characteristics
Was there really a 91 year old who was the deceased?
deceased under 18 =>
sum(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)
## [1] 262
percent of sample (after dropping those missing) 18 or under =>
mean(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)
## [1] 0.09639441
DIH_DATA %>%
filter(Deceased_Age <= 18) %>%
count(Deceased_Age, sort = TRUE)
## # A tibble: 19 × 2
## Deceased_Age n
## <dbl> <int>
## 1 18 68
## 2 17 37
## 3 1 29
## 4 16 24
## 5 2 21
## 6 0 19
## 7 15 15
## 8 3 11
## 9 14 8
## 10 5 7
## 11 6 5
## 12 10 5
## 13 13 3
## 14 4 2
## 15 9 2
## 16 11 2
## 17 12 2
## 18 7 1
## 19 8 1
percent of sample 2 or younger =>
mean(DIH_DATA$Deceased_Age <= 2, na.rm = TRUE)* 100
## [1] 2.538631
Quartiles for 18 and younger =>
quantile(
DIH_DATA$Deceased_Age[DIH_DATA$Deceased_Age <= 18],
probs = c(0.25, 0.5, 0.75),
na.rm = TRUE
)
## 25% 50% 75%
## 2 15 18
Interested in the breakdown of deceased by age under 18, creating groups and graphing. =>
library(dplyr)
DIH_DATA <- DIH_DATA %>%
mutate(
age_group = case_when(
Deceased_Age >= 0 & Deceased_Age <= 1 ~ "0–1",
Deceased_Age >= 2 & Deceased_Age <= 3 ~ "2–3",
Deceased_Age >= 4 & Deceased_Age <= 5 ~ "4–5",
Deceased_Age >= 6 & Deceased_Age <= 7 ~ "6–7",
Deceased_Age >= 8 & Deceased_Age <= 9 ~ "8–9",
Deceased_Age >= 10 & Deceased_Age <= 11 ~ "10–11",
Deceased_Age >= 12 & Deceased_Age <= 13 ~ "12–13",
Deceased_Age >= 14 & Deceased_Age <= 15 ~ "14–15",
Deceased_Age >= 16 & Deceased_Age <= 17 ~ "16–17",
Deceased_Age == 18 ~ "18",
TRUE ~ NA_character_
)
)
#Ordered
DIH_DATA$age_group <- factor(
DIH_DATA$age_group,
levels = c("0–1","2–3","4–5","6–7","8–9","10–11","12–13","14–15","16–17","18")
)
#Plot
library(ggplot2)
ggplot(
DIH_DATA %>% filter(Deceased_Age <= 18, !is.na(age_group)),
aes(x = age_group, fill = age_group)
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Deceased Age (≤ 18)",
x = "Age Group",
y = "Count"
) +
guides(fill = "none") # removes legend if you don’t want it
freq_table(DIH_DATA, Deceased_Age)
## # A tibble: 75 × 3
## Deceased_Age n percent
## <dbl> <int> <dbl>
## 1 NA 546 16.7
## 2 26 141 4.32
## 3 28 137 4.20
## 4 21 134 4.11
## 5 23 126 3.86
## 6 25 122 3.74
## 7 24 121 3.71
## 8 30 116 3.55
## 9 22 114 3.49
## 10 27 106 3.25
## # ℹ 65 more rows
freq_table(DIH_DATA, age_group)
## # A tibble: 11 × 3
## age_group n percent
## <fct> <int> <dbl>
## 1 <NA> 3002 92.0
## 2 18 68 2.08
## 3 16–17 61 1.87
## 4 0–1 48 1.47
## 5 2–3 32 0.980
## 6 14–15 23 0.705
## 7 4–5 9 0.276
## 8 10–11 7 0.214
## 9 6–7 6 0.184
## 10 12–13 5 0.153
## 11 8–9 3 0.0919
freq_table(DIH_DATA, Deceased_Race)
## # A tibble: 13 × 3
## Deceased_Race n percent
## <chr> <int> <dbl>
## 1 White 1964 60.2
## 2 <NA> 1103 33.8
## 3 Black 90 2.76
## 4 NA 66 2.02
## 5 Unknown POC 17 0.521
## 6 Asian 10 0.306
## 7 Native American or Alaskan Native 6 0.184
## 8 Hispanic/Latino 3 0.0919
## 9 Arabic 1 0.0306
## 10 Multiple Deceased 1 0.0306
## 11 Thomas McGuinness 1 0.0306
## 12 Unknown 1 0.0306
## 13 X 1 0.0306
Need to fix these… Arabic is changed to White. Thomas McGuinness changed to NA. Fix the NAs, X and Unknown
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Race = na_if(Deceased_Race, "NA"),
Deceased_Race = case_when(
Deceased_Race == "Arabic" ~ "White",
Deceased_Race == "Thomas McGuinness" ~ NA_character_,
Deceased_Race %in% c("Unknown", "X") ~ NA_character_,
TRUE ~ Deceased_Race
)
)
freq_table(DIH_DATA, Deceased_Race)
## # A tibble: 8 × 3
## Deceased_Race n percent
## <chr> <int> <dbl>
## 1 White 1965 60.2
## 2 <NA> 1172 35.9
## 3 Black 90 2.76
## 4 Unknown POC 17 0.521
## 5 Asian 10 0.306
## 6 Native American or Alaskan Native 6 0.184
## 7 Hispanic/Latino 3 0.0919
## 8 Multiple Deceased 1 0.0306
bar_plot_cat(DIH_DATA, Deceased_Race)
freq_table(DIH_DATA, Deceased_Ethnicity)
## # A tibble: 9 × 3
## Deceased_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2198 67.3
## 2 Not Hispanic/Latino 844 25.9
## 3 NA 116 3.55
## 4 Hispanic/Latino 99 3.03
## 5 Hispanic Latino 2 0.0613
## 6 White 2 0.0613
## 7 Multiple Deceased 1 0.0306
## 8 Not Hispanic/latino 1 0.0306
## 9 X 1 0.0306
Again, fixing the data…
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Ethnicity = na_if(Deceased_Ethnicity, "NA"),
Deceased_Ethnicity = case_when(
Deceased_Ethnicity == "Hispanic Latino" ~ "Hispanic/Latino",
Deceased_Ethnicity == "Not Hispanic/latino" ~ "Not Hispanic/Latino",
Deceased_Ethnicity == "White" ~ NA_character_,
Deceased_Ethnicity == "X" ~ NA_character_,
TRUE ~ Deceased_Ethnicity
)
)
freq_table(DIH_DATA, Deceased_Ethnicity)
## # A tibble: 4 × 3
## Deceased_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2317 71.0
## 2 Not Hispanic/Latino 845 25.9
## 3 Hispanic/Latino 101 3.09
## 4 Multiple Deceased 1 0.0306
bar_plot_cat(DIH_DATA, Deceased_Ethnicity)
Large missingness
freq_table(DIH_DATA, Deceased_Sex)
## # A tibble: 14 × 3
## Deceased_Sex n percent
## <chr> <int> <dbl>
## 1 Male 1912 58.6
## 2 Female 1090 33.4
## 3 <NA> 220 6.74
## 4 NA 25 0.766
## 5 male 6 0.184
## 6 Famale 2 0.0613
## 7 female 2 0.0613
## 8 27 1 0.0306
## 9 Both Female 1 0.0306
## 10 Both male 1 0.0306
## 11 Female, Male 1 0.0306
## 12 Multiple Deceased 1 0.0306
## 13 Other 1 0.0306
## 14 X 1 0.0306
changing both to be NA
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Sex = case_when(
Deceased_Sex %in% c("NA","27","X","Other",
"Both Female","Both male","Female, Male", "Both") ~ NA_character_,
Deceased_Sex %in% c("male","Male") ~ "Male",
Deceased_Sex %in% c("female","Female","Famale") ~ "Female",
TRUE ~ Deceased_Sex
)
)
freq_table(DIH_DATA, Deceased_Sex)
## # A tibble: 4 × 3
## Deceased_Sex n percent
## <chr> <int> <dbl>
## 1 Male 1918 58.8
## 2 Female 1094 33.5
## 3 <NA> 251 7.69
## 4 Multiple Deceased 1 0.0306
#🌍Rural Code NA’s: 112
ggplot(
DIH_DATA %>% filter(!is.na(Rural_Code)),
aes(x = factor(Rural_Code), fill = factor(Rural_Code))
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Rural_Code",
x = "Rural Code",
y = "Count"
) +
guides(fill = "none")
Creating urbanacity variable
DIH_DATA <- DIH_DATA %>%
mutate(
Urbanicity = case_when(
Rural_Code %in% 1:3 ~ "Metropolitan",
Rural_Code %in% 4:6 ~ "Suburban",
Rural_Code %in% 7:9 ~ "Rural",
TRUE ~ NA_character_
)
)
#Ordering for plotting
DIH_DATA$Urbanicity <- factor(
DIH_DATA$Urbanicity,
levels = c("Metropolitan", "Suburban", "Rural")
)
ggplot(
DIH_DATA %>% filter(!is.na(Urbanicity)),
aes(x = Urbanicity, fill = Urbanicity)
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Urbanicity",
x = "Urbanicity",
y = "Count"
) +
guides(fill = "none")
Would be interesting to see state by urbancity…
ggplot(
DIH_DATA %>% filter(!is.na(State), !is.na(Urbanicity)),
aes(x = State, fill = Urbanicity)
) +
geom_bar(position = "fill") +
labs(
title = "Proportion of Urbanicity by State",
x = "State",
y = "Proportion"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
I guess this could just be a function of how rural the state is…but we
could test further…
#📍State
There is a lowercase pennsylvania in the dataset…fixing it…
DIH_DATA %>%
filter(State == "pennsylvania")
## # A tibble: 1 × 36
## ID URL Court Rural_Code County_Fips County_FullName County_Name State
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 540 3 senten… <NA> 2 42071 Lancaster Coun… Lancaster penn…
## # ℹ 28 more variables: Year_Charged <dbl>, `Date Charge` <date>,
## # `Date Incident` <date>, Accused_Name <chr>, Sentence <chr>,
## # `Sentence Quartile` <chr>, Charge <chr>, Accused_Race <chr>,
## # Accused_Ethnicity <chr>, Accused_Sex <chr>, Accused_Age <dbl>,
## # Accused_City <chr>, Accused_State <chr>, Court_Type <chr>, Plea <chr>,
## # Deceased_Name <chr>, Deceased_Race <chr>, Deceased_Ethnicity <chr>,
## # Deceased_Age <dbl>, Deceased_Sex <chr>, Substance <chr>, Lawyer <chr>, …
DIH_DATA <- DIH_DATA %>%
mutate(
State = ifelse(State == "pennsylvania", "Pennsylvania", State)
)
bar_plot_cat(DIH_DATA, State)
freq_table(DIH_DATA, State)
## # A tibble: 50 × 3
## State n percent
## <chr> <int> <dbl>
## 1 Pennsylvania 578 17.7
## 2 Ohio 354 10.8
## 3 Wisconsin 351 10.8
## 4 Illinois 299 9.16
## 5 Florida 125 3.83
## 6 Minnesota 116 3.55
## 7 New Jersey 113 3.46
## 8 New York 112 3.43
## 9 North Carolina 111 3.40
## 10 Michigan 99 3.03
## # ℹ 40 more rows
#⚖️ Sentence Quartile
sum(!is.na(DIH_DATA$Sentence_Quartile_calc))
## [1] 2055
#⚖️ Legal / case variables ## • Charge
freq_table(DIH_DATA,Charge)
## # A tibble: 32 × 3
## Charge n percent
## <chr> <int> <dbl>
## 1 Drug Delivery/Distribution Resulting in Death 610 18.7
## 2 Involuntary Manslaughter 479 14.7
## 3 Reckless/Negligent Homicide 432 13.2
## 4 Murder (2nd or 3rd Degree) 342 10.5
## 5 Drug Induced Homicide 306 9.38
## 6 Manslaughter 252 7.72
## 7 Delivery/Distribution of Heroin/Fentanyl Resulting in Death 151 4.63
## 8 Delivery/Distribution Of a Controlled Substance Causing/Result… 122 3.74
## 9 Other (Please Specify in Notes) 101 3.09
## 10 Murder (1st Degree) 99 3.03
## # ℹ 22 more rows
@Katie Flagging this – still don’t know how these were categorized…
freq_table(DIH_DATA,Plea)
## # A tibble: 37 × 3
## Plea n percent
## <chr> <int> <dbl>
## 1 Guilty 1756 53.8
## 2 <NA> 907 27.8
## 3 Not Guilty 205 6.28
## 4 NA 188 5.76
## 5 Guilty of a Lesser Charge 46 1.41
## 6 No Contest 42 1.29
## 7 Awaiting Plea Hearing 31 0.950
## 8 Guilty to a Lesser Charge 31 0.950
## 9 Pleaded Guilty 13 0.398
## 10 Guilty to a Lesser DIH Charge 12 0.368
## # ℹ 27 more rows
Fixed the data as follows…
DIH_DATA <- DIH_DATA %>%
mutate(
Plea = na_if(Plea, "NA"),
Plea = case_when(
str_detect(tolower(Plea), "guilty") & !str_detect(tolower(Plea), "lesser") ~ "Guilty",
str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
str_detect(tolower(Plea), "pending") ~ "Pending/Awaiting",
Plea %in% c("State", "X", "Other") ~ NA_character_,
TRUE ~ Plea
)
)
DIH_DATA <- DIH_DATA %>%
mutate(
Plea = case_when(
Plea == "Judge ruled it was not murder" ~ NA_character_,
Plea == "Pleads not guily" ~ "Guilty",
Plea == "Pled guity to DIH charges" ~ "Guilty",
TRUE ~ Plea
)
)
freq_table(DIH_DATA, Plea)
## # A tibble: 6 × 3
## Plea n percent
## <chr> <int> <dbl>
## 1 Guilty 1996 61.2
## 2 <NA> 1097 33.6
## 3 Guilty to Lesser Charge 91 2.79
## 4 No Contest 48 1.47
## 5 Awaiting Plea Hearing 31 0.950
## 6 Alford Plea 1 0.0306
bar_plot_cat(DIH_DATA, Plea)
Do we have data on verdicts? Versus just pleas
freq_table(DIH_DATA,Court_Type)
## # A tibble: 6 × 3
## Court_Type n percent
## <chr> <int> <dbl>
## 1 State 2789 85.4
## 2 Federal 388 11.9
## 3 <NA> 72 2.21
## 4 NA 12 0.368
## 5 federal 2 0.0613
## 6 Fayette County 1 0.0306
DIH_DATA <- DIH_DATA %>%
mutate(
Court_Type = na_if(Court_Type, "NA"),
Court_Type = case_when(
Court_Type == "federal" ~ "Federal",
Court_Type == "Fayette County" ~ NA_character_,
TRUE ~ Court_Type
)
)
freq_table(DIH_DATA, Court_Type)
## # A tibble: 3 × 3
## Court_Type n percent
## <chr> <int> <dbl>
## 1 State 2789 85.4
## 2 Federal 390 11.9
## 3 <NA> 85 2.60
freq_table(DIH_DATA, Relationship)
## # A tibble: 6 × 3
## Relationship n percent
## <chr> <int> <dbl>
## 1 Dealer/Buyer 2097 64.2
## 2 Caretaker/Family/Friend/Partner/Co-User 768 23.5
## 3 <NA> 276 8.46
## 4 Doctor/Patient 88 2.70
## 5 NA 26 0.797
## 6 Dealer/buyer 9 0.276
Fixing typos.
DIH_DATA <- DIH_DATA %>%
mutate(
Relationship = na_if(Relationship, "NA"),
Relationship = case_when(
Relationship %in% c("Dealer/buyer", "Dealer/Buyer") ~ "Dealer/Buyer",
TRUE ~ Relationship
)
)
freq_table(DIH_DATA, Relationship)
## # A tibble: 4 × 3
## Relationship n percent
## <chr> <int> <dbl>
## 1 Dealer/Buyer 2106 64.5
## 2 Caretaker/Family/Friend/Partner/Co-User 768 23.5
## 3 <NA> 302 9.25
## 4 Doctor/Patient 88 2.70
freq_table(DIH_DATA, Substance)
## # A tibble: 39 × 3
## Substance n percent
## <chr> <int> <dbl>
## 1 Heroin 944 28.9
## 2 <NA> 617 18.9
## 3 Fentanyl (Analog) 459 14.1
## 4 Fentanyl (Analog) and Heroin 408 12.5
## 5 Prescription Opioid 137 4.20
## 6 Mixed Drugs Not Listed (Comment in Cell) 113 3.46
## 7 Methadone 80 2.45
## 8 Other Drug (Comment in Cell) 68 2.08
## 9 Methamphetamine 66 2.02
## 10 Fentanyl Analog 47 1.44
## # ℹ 29 more rows
Cleaning data as follows…
library(stringr)
library(dplyr)
DIH_DATA <- DIH_DATA %>%
mutate(
Substance = tolower(Substance),
Substance = na_if(Substance, "na"),
Substance_clean = case_when(
str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
str_detect(Substance, "fentanyl") ~ "Fentanyl",
str_detect(Substance, "heroin") ~ "Heroin",
str_detect(Substance, "meth") ~ "Methamphetamine",
str_detect(Substance, "cocaine") ~ "Cocaine",
str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
str_detect(Substance, "xanax|benzo") ~ "Other",
str_detect(Substance, "ecstasy") ~ "Other",
str_detect(Substance, "benadryl") ~ "Other",
str_detect(Substance, "bullskin") ~ NA_character_,
TRUE ~ "Other"
)
)
freq_table(DIH_DATA, Substance_clean)
## # A tibble: 9 × 3
## Substance_clean n percent
## <chr> <int> <dbl>
## 1 Heroin 945 29.0
## 2 Other 707 21.7
## 3 Fentanyl 670 20.5
## 4 Fentanyl + Heroin 429 13.1
## 5 Other Opioid 162 4.96
## 6 Methamphetamine 161 4.93
## 7 Polysubstance 113 3.46
## 8 Cocaine 76 2.33
## 9 <NA> 1 0.0306
bar_plot_cat(DIH_DATA, Substance_clean)
library(dplyr)
library(tidyr)
options(scipen = 999)
numeric_summary <- DIH_DATA %>%
summarise(
across(
c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
list(
nonmissing = ~sum(!is.na(.)),
missing = ~sum(is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
)
)
) %>%
pivot_longer(
everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
) %>%
pivot_wider(names_from = Statistic, values_from = value)
numeric_summary <- numeric_summary %>%
mutate(
min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
)
numeric_summary
## # A tibble: 5 × 8
## Variable nonmissing missing mean sd median min max
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Rural_Code 3152 112 2.47 1.89 2 1 9
## 2 Year_Charged 3264 0 2016. 4.02 2017 1974 2026
## 3 Accused_Age 1040 2224 34.3 10.3 32 17 87
## 4 Deceased_Age 2718 546 29.6 11.5 28 0 91
## 5 Sentence_num 2055 1209 11.5 12.1 8 0.08219178082 180
library(dplyr)
library(purrr)
library(tidyr)
library(gt)
cat_vars <- c(
"State",
"Charge",
"Accused_Race",
"Accused_Ethnicity",
"Accused_Sex",
"Accused_State",
"Court_Type",
"Plea",
"Deceased_Race",
"Deceased_Ethnicity",
"Deceased_Sex",
"Substance",
"Relationship",
"Urbanicity",
"Sentence_Quartile_calc"
)
categorical_summary <- map_dfr(cat_vars, function(v) {
x <- DIH_DATA[[v]]
tab <- sort(table(x, useNA = "no"), decreasing = TRUE)
tibble(
Variable = v,
nonmissing = sum(!is.na(x)),
missing = sum(is.na(x)),
n_categories = dplyr::n_distinct(x, na.rm = TRUE),
most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
)
})
categorical_summary
## # A tibble: 15 × 7
## Variable nonmissing missing n_categories most_common most_common_n
## <chr> <int> <int> <int> <chr> <int>
## 1 State 3264 0 50 Pennsylvan… 578
## 2 Charge 3263 1 31 Drug Deliv… 610
## 3 Accused_Race 2737 527 5 White 2069
## 4 Accused_Ethnicity 883 2381 2 Not Hispan… 736
## 5 Accused_Sex 3244 20 2 Male 2388
## 6 Accused_State 3199 65 59 Pennsylvan… 549
## 7 Court_Type 3179 85 2 State 2789
## 8 Plea 2167 1097 5 Guilty 1996
## 9 Deceased_Race 2092 1172 7 White 1965
## 10 Deceased_Ethnicity 947 2317 3 Not Hispan… 845
## 11 Deceased_Sex 3013 251 3 Male 1918
## 12 Substance 2641 623 36 heroin 945
## 13 Relationship 2962 302 3 Dealer/Buy… 2106
## 14 Urbanicity 3152 112 3 Metropolit… 2525
## 15 Sentence_Quartile_… 2055 1209 4 1 514
## # ℹ 1 more variable: most_common_pct <dbl>
Accused_State has 59 states so it needs to be cleaned if we want to use that variable… not going to bother right now because we aren’t using it
#Bivariate Relationships ##Numeric vs. Numer
cor.test(
DIH_DATA$Sentence_num,
DIH_DATA$Accused_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Accused_Age, :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Sentence_num and DIH_DATA$Accused_Age
## S = 12670082, p-value = 0.005528
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1314717
cor.test(
DIH_DATA$Sentence_num,
DIH_DATA$Deceased_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Sentence_num, DIH_DATA$Deceased_Age, :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Sentence_num and DIH_DATA$Deceased_Age
## S = 930447121, p-value = 0.2802
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.02575955
ggplot(DIH_DATA, aes(x = Accused_Age, y = Sentence_num)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2820 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2820 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(DIH_DATA, aes(x = Deceased_Age, y = Sentence_num)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1505 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1505 rows containing missing values or values outside the scale range
## (`geom_point()`).
outcome variable: • heavily skewed • heteroskedastic (variance increases with age for accused and Spread increases in the middle age range (~20–50) for deceased) • outliers present
This violates assumptions of:
• OLS regression
• Pearson correlation
So used Spearman correlation -- @Katie ... any thoughts on other tests to run here?
Given the skewed distribution of sentence length and the presence of outliers, Spearman’s rank correlation was used.
###Sentence length by Sex
DIH_DATA %>%
group_by(Accused_Sex) %>%
summarise(
n = sum(!is.na(Sentence_num)),
mean_sentence = mean(Sentence_num, na.rm = TRUE),
median_sentence = median(Sentence_num, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Accused_Sex n mean_sentence median_sentence
## <chr> <int> <dbl> <dbl>
## 1 Female 560 9.83 6
## 2 Male 1486 12.2 9
## 3 <NA> 9 6.86 5
ggplot(DIH_DATA, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
geom_boxplot() +
guides(fill = "none")
## Warning: Removed 1209 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
t.test(Sentence_num ~ Accused_Sex, data = DIH_DATA)
##
## Welch Two Sample t-test
##
## data: Sentence_num by Accused_Sex
## t = -4.1076, df = 1071.3, p-value = 0.00004303
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -3.535442 -1.249629
## sample estimates:
## mean in group Female mean in group Male
## 9.830096 12.222631
Comparison of Sentence Length by Accused Sex
A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -4.11, p < 0.001). • Mean sentence (Female): 9.83 • Mean sentence (Male): 12.22 • Mean difference: approximately 2.39 units higher for males • 95% CI for the difference: [-3.54, -1.25]
These results suggest that, on average, male defendants receive longer sentences than female defendants.
However, several important caveats should be noted: • The distribution of sentence length is highly skewed with substantial outliers, particularly among male defendants. • Because of this skewness, the mean may not fully represent the typical case, and results should be interpreted cautiously. • A nonparametric alternative (e.g., Mann–Whitney U test) or analysis using transformed data (e.g., log(sentence)) may provide a more robust assessment.
Additionally, the boxplot visualization shows: • Greater variability in sentence length among males • More extreme high-value outliers in the male group • Substantial overlap between groups despite the difference in means
Finally, the plotting warning indicates that 1,209 observations were removed due to missing or non-finite values, which may affect representativeness and should be investigated further.
let’s discuss if I should run these additional tests or if we are satisfied with these findings
##Sentence_Quartile Bivariates on full dataset
cor.test(
DIH_DATA$Accused_Age,
DIH_DATA$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Accused_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Accused_Age and DIH_DATA$Sentence_Quartile_calc
## S = 12484370, p-value = 0.00232
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1442022
cor.test(
DIH_DATA$Deceased_Age,
DIH_DATA$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(DIH_DATA$Deceased_Age,
## DIH_DATA$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: DIH_DATA$Deceased_Age and DIH_DATA$Sentence_Quartile_calc
## S = 902691085, p-value = 0.8393
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.004839738
###SentQ+Charge
kruskal.test(Sentence_Quartile_calc ~ Charge, data = DIH_DATA)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 258.17, df = 28, p-value <
## 0.00000000000000022
Again this wont mean much until we clean the Charge variable
###SentQ+Relationship Tests whether distributions differ across groups
kruskal.test(Sentence_Quartile_calc ~ Relationship, data = DIH_DATA)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 40.074, df = 2, p-value = 0.000000001986
table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)
##
## 1 2 3 4
## Caretaker/Family/Friend/Partner/Co-User 165 131 116 93
## Dealer/Buyer 291 339 353 363
## Doctor/Patient 5 10 10 22
tab2 <- table(DIH_DATA$Relationship, DIH_DATA$Sentence_Quartile_calc)
chisq.test(tab2)
##
## Pearson's Chi-squared test
##
## data: tab2
## X-squared = 44.016, df = 6, p-value = 0.00000007337
#Analytic Sample
@Katie we will need to include a justification in the methods section if it is not already in there about why we restricted the data to this time frame.
dih_analytic <- DIH_DATA %>%
filter(Year_Charged >= 2011 & Year_Charged <= 2021) %>%
filter(!is.na(Rural_Code))
Hmmmm if we are going to restrict the analytic data sample by years… then perhaps we need to run descriptives here again with the analytic sample to include in the methods section?
library(dplyr)
library(tidyr)
options(scipen = 999)
numeric_summary_B <- dih_analytic %>%
summarise(
across(
c(Rural_Code, Year_Charged, Accused_Age, Deceased_Age, Sentence_num),
list(
nonmissing = ~sum(!is.na(.)),
missing = ~sum(is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
)
)
) %>%
pivot_longer(
everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "^(.*)_(nonmissing|missing|mean|sd|median|min|max)$"
) %>%
pivot_wider(names_from = Statistic, values_from = value)
numeric_summary_B <- numeric_summary_B %>%
mutate(
min = ifelse(Variable == "Year_Charged", formatC(min, format = "f", digits = 0), as.character(min)),
median = ifelse(Variable == "Year_Charged", formatC(median, format = "f", digits = 0), as.character(median)),
max = ifelse(Variable == "Year_Charged", formatC(max, format = "f", digits = 0), as.character(max))
)
numeric_summary_B
## # A tibble: 5 × 8
## Variable nonmissing missing mean sd median min max
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Rural_Code 2838 0 2.49 1.88 2 1 9
## 2 Year_Charged 2838 0 2017. 2.36 2017 2011 2021
## 3 Accused_Age 976 1862 34.3 10.4 32 17 87
## 4 Deceased_Age 2375 463 29.9 11.5 28 0 91
## 5 Sentence_num 1772 1066 11.4 11.8 8 0.08219178082 180
library(dplyr)
library(purrr)
library(tidyr)
library(gt)
categorical_summary2 <- map_dfr(cat_vars, function(v) {
x <- dih_analytic[[v]]
tab <- sort(table(x, useNA = "no"), decreasing = TRUE)
tibble(
Variable = v,
nonmissing = sum(!is.na(x)),
missing = sum(is.na(x)),
n_categories = dplyr::n_distinct(x, na.rm = TRUE),
most_common = if (length(tab) > 0) names(tab)[1] else NA_character_,
most_common_n = if (length(tab) > 0) as.integer(tab[1]) else NA_integer_,
most_common_pct = if (length(tab) > 0) round(100 * as.integer(tab[1]) / sum(!is.na(x)), 2) else NA_real_
)
})
categorical_summary2
## # A tibble: 15 × 7
## Variable nonmissing missing n_categories most_common most_common_n
## <chr> <int> <int> <int> <chr> <int>
## 1 State 2838 0 47 Pennsylvan… 553
## 2 Charge 2837 1 30 Drug Deliv… 571
## 3 Accused_Race 2441 397 5 White 1819
## 4 Accused_Ethnicity 829 2009 2 Not Hispan… 697
## 5 Accused_Sex 2823 15 2 Male 2082
## 6 Accused_State 2777 61 58 Pennsylvan… 525
## 7 Court_Type 2760 78 2 State 2436
## 8 Plea 1835 1003 5 Guilty 1688
## 9 Deceased_Race 1867 971 7 White 1750
## 10 Deceased_Ethnicity 873 1965 3 Not Hispan… 781
## 11 Deceased_Sex 2632 206 3 Male 1685
## 12 Substance 2317 521 34 heroin 823
## 13 Relationship 2576 262 3 Dealer/Buy… 1883
## 14 Urbanicity 2838 0 3 Metropolit… 2274
## 15 Sentence_Quartile_… 1772 1066 4 2 457
## # ℹ 1 more variable: most_common_pct <dbl>
##Testing Bivariate Relationships in Analytic Sample - Outcome is Sentence_num
cor.test(
dih_analytic$Sentence_num,
dih_analytic$Accused_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Accused_Age, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Sentence_num and dih_analytic$Accused_Age
## S = 10194801, p-value = 0.003246
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1441686
cor.test(
dih_analytic$Sentence_num,
dih_analytic$Deceased_Age,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Deceased_Age, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Sentence_num and dih_analytic$Deceased_Age
## S = 601445934, p-value = 0.1897
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.03369232
cor.test(
dih_analytic$Sentence_num,
dih_analytic$Year_Charged,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Sentence_num,
## dih_analytic$Year_Charged, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Sentence_num and dih_analytic$Year_Charged
## S = 903777791, p-value = 0.285
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.02541011
dih_analytic %>%
group_by(Accused_Sex) %>%
summarise(
n = sum(!is.na(Sentence_num)),
mean_sentence = mean(Sentence_num, na.rm = TRUE),
median_sentence = median(Sentence_num, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Accused_Sex n mean_sentence median_sentence
## <chr> <int> <dbl> <dbl>
## 1 Female 479 10.0 6
## 2 Male 1286 11.9 9
## 3 <NA> 7 7.25 5
ggplot(dih_analytic, aes(x = Accused_Sex, y = Sentence_num, fill = Accused_Sex)) +
geom_boxplot() +
guides(fill = "none")
## Warning: Removed 1066 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
t.test(Sentence_num ~ Accused_Sex, data = dih_analytic)
##
## Welch Two Sample t-test
##
## data: Sentence_num by Accused_Sex
## t = -2.9574, df = 831.36, p-value = 0.00319
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -3.1560701 -0.6379943
## sample estimates:
## mean in group Female mean in group Male
## 10.00579 11.90282
Comparison of Sentence Length by Accused Sex (2017–2021 Subset)
A Welch two-sample t-test was conducted to compare average sentence length between female and male defendants for cases from 2017–2021. The results indicate a statistically significant difference in mean sentence length between the two groups (t = -2.96, p = 0.003). • Mean sentence (Female): 10.01 • Mean sentence (Male): 11.90 • Mean difference: approximately 1.9 units higher for males • 95% CI for the difference: [-3.16, -0.64]
These results suggest that, within this time period, male defendants receive longer sentences on average than female defendants.
Additional descriptive statistics support this pattern: • Median sentence (Female): 6 • Median sentence (Male): 9
This indicates that the difference is also present at the median level, not just driven by extreme values.
However, several important considerations apply: • Sentence length remains highly skewed with notable outliers, particularly among male defendants. • Although the mean difference is statistically significant, the magnitude of the difference is modest relative to the overall variability in sentencing. • The boxplot shows substantial overlap between the distributions for males and females. • A nonparametric test (e.g., Mann–Whitney U) or analysis using transformed sentence length may provide a more robust assessment.
Additionally, the plotting warning indicates that 1,066 observations were removed due to missing or non-finite values, which should be examined to ensure no systematic bias in the analyzed subset.
summary(aov(Sentence_num ~ Urbanicity, data = dih_analytic))
## Df Sum Sq Mean Sq F value Pr(>F)
## Urbanicity 2 408 204.0 1.463 0.232
## Residuals 1769 246685 139.4
## 1066 observations deleted due to missingness
summary(aov(Sentence_num ~ Charge, data = dih_analytic))
## Df Sum Sq Mean Sq F value Pr(>F)
## Charge 27 13356 494.7 3.691 0.000000000635 ***
## Residuals 1744 233737 134.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1066 observations deleted due to missingness
This suggest the sentence length varies by charge… but we need information about how charges were grouped before we can run this analysis properly
summary(aov(Sentence_num ~ Relationship, data = dih_analytic))
## Df Sum Sq Mean Sq F value Pr(>F)
## Relationship 2 2048 1024.0 7.471 0.000589 ***
## Residuals 1632 223687 137.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1203 observations deleted due to missingness
##Testing Bivariate Relationships in Analytic Sample: Outcome is Sentencing Quartile.
###SentQ+DecAge
cor.test(
dih_analytic$Accused_Age,
dih_analytic$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Accused_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Accused_Age and dih_analytic$Sentence_Quartile_calc
## S = 10081186, p-value = 0.001687
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1537063
cor.test(
dih_analytic$Deceased_Age,
dih_analytic$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Deceased_Age,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Deceased_Age and dih_analytic$Sentence_Quartile_calc
## S = 582264805, p-value = 0.9775
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.0007261225
###SentQ+Yr
cor.test(
dih_analytic$Year_Charged,
dih_analytic$Sentence_Quartile_calc,
method = "spearman",
use = "complete.obs"
)
## Warning in cor.test.default(dih_analytic$Year_Charged,
## dih_analytic$Sentence_Quartile_calc, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dih_analytic$Year_Charged and dih_analytic$Sentence_Quartile_calc
## S = 904918982, p-value = 0.309
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.02417951
Later years are associated with higher sentence quartiles ρ ≈ 0.28 → moderate (not strong)
Over time, cases are more likely to fall into higher relative sentence bins
###SentQ+Charge
kruskal.test(Sentence_Quartile_calc ~ Charge, data = dih_analytic)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Charge
## Kruskal-Wallis chi-squared = 235.81, df = 27, p-value <
## 0.00000000000000022
Again this wont mean much until we clean the Charge variable
###SentQ+Relationship Tests whether distributions differ across groups
kruskal.test(Sentence_Quartile_calc ~ Relationship, data = dih_analytic)
##
## Kruskal-Wallis rank sum test
##
## data: Sentence_Quartile_calc by Relationship
## Kruskal-Wallis chi-squared = 27.177, df = 2, p-value = 0.000001255
table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)
##
## 1 2 3 4
## Caretaker/Family/Friend/Partner/Co-User 124 112 91 73
## Dealer/Buyer 265 310 309 318
## Doctor/Patient 4 5 10 14
tab22 <- table(dih_analytic$Relationship, dih_analytic$Sentence_Quartile_calc)
chisq.test(tab22)
##
## Pearson's Chi-squared test
##
## data: tab22
## X-squared = 27.92, df = 6, p-value = 0.00009729
###SentQ+Plea
tab_plea <- table(dih_analytic$Plea, dih_analytic$Sentence_Quartile_calc)
tab_plea
##
## 1 2 3 4
## Alford Plea 0 0 1 0
## Awaiting Plea Hearing 0 0 0 1
## Guilty 360 355 351 350
## Guilty to Lesser Charge 18 28 11 9
## No Contest 16 7 7 5
Well that is a problem! why do we have “awaiting plea hearing” for ones that we have sentences for?
DIH_DATA %>%
filter(
Plea == "Awaiting Plea Hearing" &
(!is.na(Sentence_num) | !is.na(Sentence_Quartile_calc))
) %>%
summarise(n = n())
## # A tibble: 1 × 1
## n
## <int>
## 1 1
DIH_DATA %>%
filter(Plea == "Awaiting Plea Hearing") %>%
count(Sentence_num, Sentence_Quartile_calc, sort = TRUE)
## # A tibble: 2 × 3
## Sentence_num Sentence_Quartile_calc n
## <dbl> <int> <int>
## 1 NA NA 30
## 2 36 4 1
DIH_DATA %>%
filter(Plea == "Awaiting Plea Hearing") %>%
count(Sentence, Sentence_Quartile_calc, sort = TRUE)
## # A tibble: 3 × 3
## Sentence Sentence_Quartile_calc n
## <chr> <int> <int>
## 1 Pending NA 28
## 2 pending NA 2
## 3 36 4 1
#Logit Model Sent Q
#Other Relationships
“Do cases involving women tend to occur when the deceased is younger?”
wilcox.test(Deceased_Age ~ Accused_Sex, data = DIH_DATA)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Deceased_Age by Accused_Sex
## W = 761528, p-value = 0.06442
## alternative hypothesis: true location shift is not equal to 0
DIH_DATA %>%
group_by(Accused_Sex) %>%
summarise(
n = sum(!is.na(Deceased_Age)),
mean_age = mean(Deceased_Age, na.rm = TRUE),
median_age = median(Deceased_Age, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Accused_Sex n mean_age median_age
## <chr> <int> <dbl> <dbl>
## 1 Female 741 29.8 30
## 2 Male 1965 29.6 28
## 3 <NA> 12 24 23
ggplot(DIH_DATA, aes(x = Accused_Sex, y = Deceased_Age, fill = Accused_Sex)) +
geom_boxplot() +
guides(fill = "none")
## Warning: Removed 546 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
“Are younger victims more likely to involve female defendants?”
glm(I(Accused_Sex == "Female") ~ Deceased_Age,
data = DIH_DATA,
family = binomial)
##
## Call: glm(formula = I(Accused_Sex == "Female") ~ Deceased_Age, family = binomial,
## data = DIH_DATA)
##
## Coefficients:
## (Intercept) Deceased_Age
## -1.011494 0.001221
##
## Degrees of Freedom: 2705 Total (i.e. Null); 2704 Residual
## (558 observations deleted due to missingness)
## Null Deviance: 3177
## Residual Deviance: 3177 AIC: 3181
There is no strong evidence that cases involving female defendants are associated with younger (or older) victims.
##Child cases
child_data <- DIH_DATA %>%
filter(!is.na(age_group))
child_data %>%
group_by(age_group, Accused_Sex) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(age_group) %>%
mutate(prop = n / sum(n))
## # A tibble: 22 × 4
## # Groups: age_group [10]
## age_group Accused_Sex n prop
## <fct> <chr> <int> <dbl>
## 1 0–1 Female 30 0.625
## 2 0–1 Male 18 0.375
## 3 2–3 Female 22 0.688
## 4 2–3 Male 10 0.312
## 5 4–5 Female 6 0.667
## 6 4–5 Male 3 0.333
## 7 6–7 Female 1 0.167
## 8 6–7 Male 5 0.833
## 9 8–9 Female 2 0.667
## 10 8–9 Male 1 0.333
## # ℹ 12 more rows
ggplot(child_data, aes(x = age_group, fill = Accused_Sex)) +
geom_bar(position = "fill") +
labs(
x = "Child Age Group",
y = "Proportion",
fill = "Accused Sex"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This is pretty striking.
Going to change the age groupings
DIH_DATA <- DIH_DATA %>%
mutate(
age_group2 = case_when(
Deceased_Age <= 5 ~ "0–5",
Deceased_Age <= 12 ~ "6–12",
Deceased_Age <= 17 ~ "13–17",
TRUE ~ NA_character_
)
)
DIH_DATA %>%
filter(!is.na(age_group2)) %>%
group_by(age_group2, Accused_Sex) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(age_group2) %>%
mutate(percent = 100 * n / sum(n))
## # A tibble: 7 × 4
## # Groups: age_group2 [3]
## age_group2 Accused_Sex n percent
## <chr> <chr> <int> <dbl>
## 1 0–5 Female 58 65.2
## 2 0–5 Male 31 34.8
## 3 13–17 Female 20 23.0
## 4 13–17 Male 64 73.6
## 5 13–17 <NA> 3 3.45
## 6 6–12 Female 9 50
## 7 6–12 Male 9 50
tab <- DIH_DATA %>%
filter(!is.na(age_group2)) %>%
count(age_group2, Accused_Sex) %>%
tidyr::pivot_wider(
names_from = Accused_Sex,
values_from = n,
values_fill = 0
)
tab_test <- table(
DIH_DATA$age_group2,
DIH_DATA$Accused_Sex
)
tab_test
##
## Female Male
## 0–5 58 31
## 13–17 20 64
## 6–12 9 9
chisq.test(tab_test)
##
## Pearson's Chi-squared test
##
## data: tab_test
## X-squared = 29.963, df = 2, p-value = 0.0000003116
chisq.test(tab_test)$expected
##
## Female Male
## 0–5 40.539267 48.460733
## 13–17 38.261780 45.738220
## 6–12 8.198953 9.801047
No expected counts are less than 0 so we are good.
DIH_DATA <- DIH_DATA %>%
mutate(female = ifelse(Accused_Sex == "Female", 1, 0))
library(DescTools)
CochranArmitageTest(
table(DIH_DATA$female, DIH_DATA$age_group2)
)
##
## Cochran-Armitage test for trend
##
## data: table(DIH_DATA$female, DIH_DATA$age_group2)
## Z = 3.7259, dim = 3, p-value = 0.0001946
## alternative hypothesis: two.sided
model <- glm(
I(Accused_Sex == "Female") ~ age_group2,
data = DIH_DATA,
family = binomial
)
summary(model)
##
## Call:
## glm(formula = I(Accused_Sex == "Female") ~ age_group2, family = binomial,
## data = DIH_DATA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6265 0.2225 2.816 0.00487 **
## age_group213–17 -1.7896 0.3393 -5.274 0.000000133 ***
## age_group26–12 -0.6265 0.5213 -1.202 0.22945
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 263.27 on 190 degrees of freedom
## Residual deviance: 232.22 on 188 degrees of freedom
## (3073 observations deleted due to missingness)
## AIC: 238.22
##
## Number of Fisher Scoring iterations: 4
Are younger victims more likely to involve female defendants? (Age Group Analysis)
Cases were grouped into three age categories (0–5, 6–12, 13–17), and the distribution of defendant sex across these groups was examined.
Descriptive results show a clear pattern: • Ages 0–5: • Female defendants: 65.2% • Male defendants: 34.8% • Ages 6–12: • Female defendants: 50.0% • Male defendants: 50.0% • Ages 13–17: • Female defendants: 23.0% • Male defendants: 73.6%
This suggests that female defendants are substantially overrepresented in cases involving the youngest victims (ages 0–5), while male defendants dominate cases involving older minors (13–17).
A chi-square test of independence confirms that this association is statistically significant: • χ²(2) = 29.96, p < 0.001
This indicates that the distribution of defendant sex differs meaningfully across age groups.
Trend Analysis
A Cochran–Armitage trend test was also conducted to assess whether there is a systematic trend across ordered age categories. Results from the logistic regression model (with age group as a predictor) further support this: • Compared to ages 0–5 (reference group), cases involving ages 13–17 show a large and statistically significant decrease in the likelihood of a female defendant (p < 0.001) • The 6–12 group does not differ significantly from the youngest group (p = 0.23)
This indicates a strong downward trend:
As victim age increases, the likelihood that the defendant is female decreases substantially, particularly when comparing very young children (0–5) to older adolescents (13–17).
Though I stopped the analysis here – we would need to consider whether to include other control variables in this regression if we ant to report it
##Sex & Relationship
tab_rel_sex <- table(DIH_DATA$Relationship, DIH_DATA$Accused_Sex)
tab_rel_sex
##
## Female Male
## Caretaker/Family/Friend/Partner/Co-User 304 459
## Dealer/Buyer 461 1637
## Doctor/Patient 13 75
chisq.test(tab_rel_sex)
##
## Pearson's Chi-squared test
##
## data: tab_rel_sex
## X-squared = 98.285, df = 2, p-value < 0.00000000000000022
chisq.test(tab_rel_sex)$expected
##
## Female Male
## Caretaker/Family/Friend/Partner/Co-User 201.29332 561.70668
## Dealer/Buyer 553.49067 1544.50933
## Doctor/Patient 23.21601 64.78399
No cells less than 5
prop.table(tab_rel_sex, margin = 1)
##
## Female Male
## Caretaker/Family/Friend/Partner/Co-User 0.3984273 0.6015727
## Dealer/Buyer 0.2197331 0.7802669
## Doctor/Patient 0.1477273 0.8522727
ggplot(DIH_DATA, aes(x = Relationship, fill = Accused_Sex)) +
geom_bar(position = "fill") +
labs(
y = "Proportion",
fill = "Accused Sex"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
##Urbancity & State
table(DIH_DATA$Urbanicity, DIH_DATA$State)
##
## Alabama Alaska Arizona Arkansas California Colorado Connecticut
## Metropolitan 9 0 13 1 76 14 0
## Suburban 3 0 0 0 3 10 0
## Rural 0 0 0 0 2 1 0
##
## Delaware Florida Georgia Idaho Illinois Indiana Iowa Kansas
## Metropolitan 3 121 18 5 238 44 23 3
## Suburban 0 3 2 0 26 9 3 1
## Rural 0 0 1 2 26 0 2 3
##
## Kentucky Louisana Louisiana Maine Maryland Massachusetts
## Metropolitan 25 0 47 5 46 26
## Suburban 1 0 3 3 7 0
## Rural 6 0 0 6 0 1
##
## Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada
## Metropolitan 72 84 1 18 8 1 14
## Suburban 13 24 1 9 3 0 2
## Rural 13 5 0 10 0 0 0
##
## New Hampshire New Jersey New Mexico New York North Carolina
## Metropolitan 26 108 3 90 74
## Suburban 12 0 0 13 30
## Rural 0 0 0 6 5
##
## North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island
## Metropolitan 14 296 17 17 466 2
## Suburban 1 49 8 0 88 0
## Rural 2 6 2 0 23 0
##
## South Carolina South Dakota Tennessee Texas Utah Vermont
## Metropolitan 13 3 69 23 13 1
## Suburban 2 2 7 4 0 8
## Rural 0 1 2 0 4 5
##
## Virginia Washington West Virginia Wisconsin Wyoming
## Metropolitan 45 54 18 252 6
## Suburban 7 17 5 67 1
## Rural 4 0 7 32 3
ggplot(DIH_DATA, aes(x = State, fill = Urbanicity)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))