#Data isnt full cleaned so loading everything as text and then will specify column type
library(readxl)
DIH_DATA <- read_excel("/Users/taleedel-sabawi/Library/CloudStorage/Dropbox/1-Research/Leo DIH Paper/DIH DATA.xlsx",
sheet = "Clean Data",
col_types = "text"
)
## New names:
## • `` -> `...1`
#Assigning columns manually to data type
DIH_DATA$Year_Charged <- as.numeric(DIH_DATA$Year_Charged)
DIH_DATA$Rural_Code <- as.numeric(DIH_DATA$Rural_Code)
DIH_DATA$Sentence_Quartile <- as.numeric(DIH_DATA$Sentence_Quartile)
# Dates
#These first dates are in excel serial data structure...
DIH_DATA$`Date Incident` <- as.Date(as.numeric(DIH_DATA$`Date Incident`), origin = "1899-12-30")
## Warning in as.Date(as.numeric(DIH_DATA$`Date Incident`), origin =
## "1899-12-30"): NAs introduced by coercion
DIH_DATA$`Date Charge` <- as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30")
## Warning in as.Date(as.numeric(DIH_DATA$`Date Charge`), origin = "1899-12-30"):
## NAs introduced by coercion
# Ages
DIH_DATA$Accused_Age <- as.numeric(DIH_DATA$Accused_Age)
## Warning: NAs introduced by coercion
DIH_DATA$Deceased_Age <- as.numeric(DIH_DATA$Deceased_Age)
## Warning: NAs introduced by coercion
DIH_DATA$Sentence_clean <- as.numeric(DIH_DATA$Sentence_clean)
#checking data structure
str(DIH_DATA)
## tibble [3,264 × 32] (S3: tbl_df/tbl/data.frame)
## $ ...1 : chr [1:3264] "1678" "2622" "1514" "320" ...
## $ URL : chr [1:3264] "http://www.nydailynews.com/news/crime/pain-relieving-patch-kills-harlem-girl-6-foster-mother-charged-article-1.328165" "http://whdh.com/news/woman-boyfriend-charged-in-fatal-cocaine-overdose-of-ohio-boy-9/" "http://www.capitalgazette.com/news/for_the_record/ac-cn-delvalle-overdose-verdict-20180620-story.html" "http://www.thisweeknews.com/news/20180719/fourth-person-expected-to-be-charged-in-connection-with-grandview-ove"| __truncated__ ...
## $ Court : chr [1:3264] NA NA "Baltimore County Court" NA ...
## $ Rural_Code : num [1:3264] 1 2 1 1 1 1 1 5 7 2 ...
## $ County_Fips : chr [1:3264] "36061" "39099" "24005" "39049" ...
## $ County_FullName : chr [1:3264] "New York County" "Mahoning County" "Baltimore County" "Franklin County" ...
## $ County_Name : chr [1:3264] "New York" "Mahoning" "Baltimore" "Franklin" ...
## $ State : chr [1:3264] "New York" "Ohio" "Maryland" "Ohio" ...
## $ Year_Charged : num [1:3264] 2004 2018 2018 2018 2011 ...
## $ Date Charge : Date[1:3264], format: "2004-12-25" "2018-01-05" ...
## $ Date Incident : Date[1:3264], format: "2004-12-15" "2017-12-26" ...
## $ Accused_Name : chr [1:3264] "Joanne Alvarez" "Raenell Allen" "Jason Patton Baker" "Benjamin Bussey" ...
## $ Sentence_clean : num [1:3264] NA NA NA NA NA NA NA NA NA NA ...
## $ Sentence : chr [1:3264] NA NA NA NA ...
## $ Sentence_Quartile : num [1:3264] 1 1 1 1 1 1 1 1 1 1 ...
## $ Charge : chr [1:3264] "Reckless/Negligent Homicide" "Involuntary Manslaughter" "Manslaughter" "Involuntary Manslaughter" ...
## $ Accused_Race : chr [1:3264] NA "White" "White" "White" ...
## $ Accused_Ethnicity : chr [1:3264] "Hispanic/Latino" NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
## $ Accused_Sex : chr [1:3264] "Female" "Female" "Male" "Male" ...
## $ Accused_Age : num [1:3264] NA NA 46 19 NA NA NA NA NA NA ...
## $ Accused_City : chr [1:3264] "East Harlem" "Youngstown" "Millersville" "Dublin" ...
## $ Accused_State : chr [1:3264] "New York" "Ohio" "Pennsylvania" "Ohio" ...
## $ Court_Type : chr [1:3264] "State" "State" "State" "State" ...
## $ Plea : chr [1:3264] "Guilty" NA "Not Guilty" "Guilty to a Lesser Charge" ...
## $ Deceased_Name : chr [1:3264] "Taylor Webster" NA "Josiah Christopher Klaes" "Haleah Myers" ...
## $ Deceased_Race : chr [1:3264] "White" NA "White" "White" ...
## $ Deceased_Ethnicity: chr [1:3264] NA NA "Not Hispanic/Latino" "Not Hispanic/Latino" ...
## $ Deceased_Age : num [1:3264] 6 9 16 17 17 18 18 18 18 18 ...
## $ Deceased_Sex : chr [1:3264] "Female" "Male" "Male" "Female" ...
## $ Substance : chr [1:3264] "Fentanyl (Analog)" NA "Fentanyl (Analog)" "Fentanyl (Analog)" ...
## $ Lawyer : chr [1:3264] NA NA "do'connell@opd.state.md.us" NA ...
## $ Relationship : chr [1:3264] "Caretaker/Family/Friend/Partner/Co-User" "Caretaker/Family/Friend/Partner/Co-User" "Dealer/Buyer" "Caretaker/Family/Friend/Partner/Co-User" ...
#Looking to see what Sentence contains
sort(unique(DIH_DATA$Sentence))
## [1] "0.1052511416" "0.16164383560000001"
## [3] "0.16666666669999999" "0.21917808220000001"
## [5] "0.25" "0.32500000000000001"
## [7] "0.33300000000000002" "0.33333333329999998"
## [9] "0.375" "0.4"
## [11] "0.40500000000000003" "0.41666666670000002"
## [13] "0.5" "0.57260273969999997"
## [15] "0.58333333330000003" "0.66666666669999997"
## [17] "0.75" "0.82199999999999995"
## [19] "0.83333333330000003" "0.91666666669999997"
## [21] "0.95833333330000003" "1"
## [23] "1 year" "1.002"
## [25] "1.25" "1.3"
## [27] "1.3333333329999999" "1.4166666670000001"
## [29] "1.5" "1.5-3"
## [31] "1.583" "1.66"
## [33] "1.6666666670000001" "1.7"
## [35] "1.75" "1.8"
## [37] "1.9166666670000001" "1.917"
## [39] "1.92" "1.97"
## [41] "10" "10.16"
## [43] "10.41" "10.5"
## [45] "10.67" "10.8"
## [47] "10.83" "10.83333333"
## [49] "11" "11 years"
## [51] "11.1" "11.16666667"
## [53] "11.25" "11.5"
## [55] "11.83" "11.9"
## [57] "113" "12"
## [59] "12 years" "12.25"
## [61] "12.5" "12.75"
## [63] "124" "13"
## [65] "13.33" "13.5"
## [67] "14" "14.5"
## [69] "143" "15"
## [71] "15.5" "15.67"
## [73] "16" "16.5"
## [75] "17" "17.5"
## [77] "18" "18.25"
## [79] "18.666666670000001" "180"
## [81] "19" "19.25"
## [83] "19.583333329999999" "2"
## [85] "2.25" "2.3333333330000001"
## [87] "2.4" "2.5"
## [89] "2.75" "20"
## [91] "20.5" "20.833333329999999"
## [93] "21" "22"
## [95] "22.5" "23"
## [97] "24" "25"
## [99] "25 years" "25 years to life"
## [101] "25.5" "26"
## [103] "26.83" "27"
## [105] "27.083333329999999" "28"
## [107] "29" "3"
## [109] "3 to 9" "3.08"
## [111] "3.0833333330000001" "3.25"
## [113] "3.3" "3.33"
## [115] "3.3333333330000001" "3.4"
## [117] "3.4166666669999999" "3.5"
## [119] "3.6666666669999999" "3.83"
## [121] "3.8333333330000001" "3.92"
## [123] "30" "30.416666670000001"
## [125] "31" "32"
## [127] "33" "33.25"
## [129] "34" "35"
## [131] "36" "37"
## [133] "38" "4"
## [135] "4.0833333329999997" "4.17"
## [137] "4.25" "4.4166666670000003"
## [139] "4.5" "4.6666666670000003"
## [141] "4.75" "4.8"
## [143] "4.83" "4.8330000000000002"
## [145] "4.8333333329999997" "4.9000000000000004"
## [147] "40" "42"
## [149] "46" "47"
## [151] "48" "48.33"
## [153] "5" "5 years"
## [155] "5.0599999999999996" "5.25"
## [157] "5.5" "5.6666666670000003"
## [159] "5.75" "5.83"
## [161] "5.9166666670000003" "5.92"
## [163] "50" "52"
## [165] "54" "55"
## [167] "6" "6.0833333329999997"
## [169] "6.1666666670000003" "6.25"
## [171] "6.5" "6.67"
## [173] "6.7" "6.75"
## [175] "60" "62"
## [177] "63" "65"
## [179] "7" "7 to 14 years"
## [181] "7.25" "7.33"
## [183] "7.5" "7.6666666670000003"
## [185] "72" "75"
## [187] "8" "8.08"
## [189] "8.2191780820000004E-2" "8.3299999999999999E-2"
## [191] "8.33" "8.3330000000000002"
## [193] "8.3333333329999995E-2" "8.3333333330000006"
## [195] "8.5" "8.67"
## [197] "84" "9"
## [199] "9.33" "9.5"
## [201] "9.5-20" "9.9169999999999998"
## [203] "90" "96"
## [205] "Face 40 years" "Faces 6 to 30 years"
## [207] "Faces up to 30 years in prison" "Faces up to 40 years"
## [209] "Life" "pending"
## [211] "Pending" "Up to 20 years"
## [213] "Up to 4 years" "Up to 40 years"
@Leo how was this sentencing data gathered? The above “up to 40 years” do not sound like an actual sentence, but rather something a reporter said… please advise.
I have manually gone into the data and created a new variable called “Sentence_clean”
For now I will drop anything that says “Face” or “Up to” from the dataset because it seems they are erroneous.
In it, I changed “1 year” to 1 etc.. Made anything “Faces” or “Up to” as NA. Changed 1 instance of 25 years to life to 25. If there was a range then I took the average. For e.g. 1.5-3 years…2.25 years
I changed all “life” sentences to 100 years.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
summary(DIH_DATA$Sentence_clean)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.08219 4.00000 8.00000 12.44014 15.00000 180.00000 1183
sum(!is.na(DIH_DATA$Sentence_clean))
## [1] 2081
What this shows is that not all of these are actually sentences, but what the person was “facing” aka the longest sentence that could occur. @Leo I would like to see the codebook. Were folks told to record actual sentence length or statements by reporters saying things like the person is “facing this amount of time”. How were the sentences recorded in the data?
The current draft of the article says that sentencing information was available for 2,489 of 3,266 cases with a recorded disposition.
Yet here there are only 2,081 observations for sentence years…. where did the other 400 observations come from?
#General Summary of All Variables
summary(DIH_DATA)
## ...1 URL Court Rural_Code
## Length:3264 Length:3264 Length:3264 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Mode :character Median :2.000
## Mean :2.474
## 3rd Qu.:3.000
## Max. :9.000
## NA's :112
## County_Fips County_FullName County_Name State
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Year_Charged Date Charge Date Incident Accused_Name
## Min. :1974 Min. :1899-12-30 Min. :1899-12-30 Length:3264
## 1st Qu.:2015 1st Qu.:2014-02-26 1st Qu.:2013-10-21 Class :character
## Median :2017 Median :2017-01-14 Median :2016-08-01 Mode :character
## Mean :2016 Mean :2007-11-19 Mean :2009-09-03
## 3rd Qu.:2018 3rd Qu.:2018-07-08 3rd Qu.:2018-01-28
## Max. :2026 Max. :2026-08-01 Max. :2022-02-15
## NA's :14 NA's :11
## Sentence_clean Sentence Sentence_Quartile Charge
## Min. : 0.08219 Length:3264 Min. :1.000 Length:3264
## 1st Qu.: 4.00000 Class :character 1st Qu.:2.000 Class :character
## Median : 8.00000 Mode :character Median :3.000 Mode :character
## Mean : 12.44014 Mean :2.499
## 3rd Qu.: 15.00000 3rd Qu.:3.000
## Max. :180.00000 Max. :4.000
## NA's :1183 NA's :1041
## Accused_Race Accused_Ethnicity Accused_Sex Accused_Age
## Length:3264 Length:3264 Length:3264 Min. :17.00
## Class :character Class :character Class :character 1st Qu.:27.00
## Mode :character Mode :character Mode :character Median :32.00
## Mean :34.34
## 3rd Qu.:40.00
## Max. :87.00
## NA's :2224
## Accused_City Accused_State Court_Type Plea
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Deceased_Name Deceased_Race Deceased_Ethnicity Deceased_Age
## Length:3264 Length:3264 Length:3264 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.:23.00
## Mode :character Mode :character Mode :character Median :28.00
## Mean :29.63
## 3rd Qu.:35.00
## Max. :91.00
## NA's :547
## Deceased_Sex Substance Lawyer Relationship
## Length:3264 Length:3264 Length:3264 Length:3264
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
Missingness
colSums(is.na(DIH_DATA))
## ...1 URL Court Rural_Code
## 0 103 999 112
## County_Fips County_FullName County_Name State
## 66 5 6 0
## Year_Charged Date Charge Date Incident Accused_Name
## 0 14 11 7
## Sentence_clean Sentence Sentence_Quartile Charge
## 1183 906 1041 1
## Accused_Race Accused_Ethnicity Accused_Sex Accused_Age
## 461 2286 17 2224
## Accused_City Accused_State Court_Type Plea
## 181 65 72 907
## Deceased_Name Deceased_Race Deceased_Ethnicity Deceased_Age
## 522 1103 2198 547
## Deceased_Sex Substance Lawyer Relationship
## 220 617 2459 276
#Creating Freq table function to reuse later
freq_table <- function(data, var) {
data %>%
count({{ var }}) %>%
mutate(percent = n / sum(n) * 100) %>%
arrange(desc(n))
}
#Creating Bar chart function for categorical variables to reuse later
library(ggplot2)
library(dplyr)
bar_plot_cat <- function(data, var) {
ggplot(data, aes(x = {{ var }}, fill = {{ var }})) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
x = deparse(substitute(var)),
y = "Count",
title = paste("Distribution of", deparse(substitute(var)))
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
}
#Creating histogram function
hist_plot <- function(data, var) {
ggplot(data, aes(x = {{ var }})) +
geom_histogram(bins = 30, fill = "steelblue") +
scale_x_continuous(breaks = seq(0, 200, by = 10)) + # 👈 more numbers
labs(
x = deparse(substitute(var)),
y = "Count",
title = paste("Distribution of", deparse(substitute(var)))
)
}
#creating density plot function
density_plot <- function(data, var) {
ggplot(data, aes(x = {{ var }})) +
geom_density(fill = "steelblue", alpha = 0.5) +
labs(
x = deparse(substitute(var)),
y = "Density",
title = paste("Density of", deparse(substitute(var)))
)
}
#🎯 Outcome variable: Sentence_clean Mean : 12.44
Median : 8
Max : 180
NA’s : 1183
Interpretation: • Right-skewed distribution (mean > median, max = 180) • Likely long tail (e.g., very long sentences or life approximations) • ⚠️ ~36% missing (1183 / 3264)
Usable sample =
sum(!is.na(DIH_DATA$Sentence_clean))
## [1] 2081
hist_plot(DIH_DATA, Sentence_clean)
## Warning: Removed 1183 rows containing non-finite outside the scale range
## (`stat_bin()`).
density_plot(DIH_DATA, Sentence_clean)
## Warning: Removed 1183 rows containing non-finite outside the scale range
## (`stat_density()`).
Will need to explore missingness to see if it is random.
#📅 Dates Range: 1899 → 2022
1899 will need to be removed from the dataset. As will 1905
DIH_DATA <- DIH_DATA %>%
filter(`Date Incident` >= as.Date("1910-01-01") | is.na(`Date Incident`))
sort(unique(DIH_DATA$`Date Incident`))[1:20]
## [1] "1974-09-23" "1982-03-05" "1987-10-18" "1989-10-04" "1991-12-03"
## [6] "1992-06-01" "1994-02-04" "1994-09-03" "1995-06-30" "1995-08-09"
## [11] "1999-03-01" "1999-04-28" "1999-09-24" "1999-11-24" "2000-01-10"
## [16] "2000-09-16" "2001-02-03" "2001-02-08" "2001-04-01" "2001-04-10"
library(dplyr)
library(ggplot2)
DIH_DATA %>%
filter(!is.na(`Date Incident`)) %>%
count(`Date Incident`) %>%
ggplot(aes(x = `Date Incident`, y = n)) +
geom_line() +
labs(
title = "Number of Cases Over Time",
x = "Date",
y = "Count"
)
#👤 Accused characteristics ## Accused Age Mean: 34
Median: 32
NA’s: 2224 Over 68% missing → major limitation
density_plot(DIH_DATA, Accused_Age)
## Warning: Removed 2095 rows containing non-finite outside the scale range
## (`stat_density()`).
## • Accused_Ethnicity
freq_table(DIH_DATA,Accused_Ethnicity)
## # A tibble: 7 × 3
## Accused_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2150 69.5
## 2 Not Hispanic/Latino 704 22.8
## 3 Hispanic/Latino 143 4.62
## 4 NA 93 3.01
## 5 Not HIspanic/Latino 1 0.0323
## 6 White 1 0.0323
## 7 not Hispanic/Latino 1 0.0323
Cleaning typos in data…
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Ethnicity = str_to_title(Accused_Ethnicity), # fix casing
Accused_Ethnicity = str_replace_all(Accused_Ethnicity, "Hispanic", "Hispanic"), # normalize
Accused_Ethnicity = na_if(Accused_Ethnicity, "Na"), # fix "NA"
Accused_Ethnicity = case_when(
Accused_Ethnicity %in% c("Not Hispanic/Latino", "Not Hispanic/Latino") ~ "Not Hispanic/Latino",
Accused_Ethnicity %in% c("Hispanic/Latino") ~ "Hispanic/Latino",
Accused_Ethnicity == "White" ~ NA_character_, # wrong variable → set to NA
TRUE ~ Accused_Ethnicity
)
)
bar_plot_cat(DIH_DATA, Accused_Ethnicity)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
bar_plot_cat(DIH_DATA, Accused_Sex)
freq_table(DIH_DATA, Accused_Sex)
## # A tibble: 5 × 3
## Accused_Sex n percent
## <chr> <int> <dbl>
## 1 Male 2257 73.0
## 2 Female 818 26.4
## 3 <NA> 13 0.420
## 4 NA 3 0.0970
## 5 feMale 2 0.0647
Fixing typos in the data again…
library(dplyr)
library(stringr)
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Sex = str_to_title(Accused_Sex), # fixes feMale → Female
Accused_Sex = na_if(Accused_Sex, "Na") # fixes "NA" string → real NA
)
freq_table(DIH_DATA, Accused_Race)
## # A tibble: 11 × 3
## Accused_Race n percent
## <chr> <int> <dbl>
## 1 White 1972 63.8
## 2 Black 601 19.4
## 3 <NA> 429 13.9
## 4 NA 36 1.16
## 5 Unknown POC 25 0.808
## 6 Asian 14 0.453
## 7 Native American or Alaskan Native 10 0.323
## 8 Hispanic/Latino 3 0.0970
## 9 Arabic 1 0.0323
## 10 Manufacture, Delivery, or Possession With Intent to Manufactur… 1 0.0323
## 11 white 1 0.0323
“white” → should be “White” • “NA” → should be real NA • “Arabic” → you already decided → “White” • that long drug charge string → clearly wrong variable → should be NA
• "Unknown POC" → not a usable category → ❌
• "Hispanic/Latino" → ethnicity, not race → ❌
DIH_DATA <- DIH_DATA %>%
mutate(
Accused_Race = na_if(Accused_Race, "NA"),
Accused_Race = case_when(
Accused_Race %in% c("white", "White") ~ "White",
Accused_Race %in% c("black", "Black") ~ "Black",
Accused_Race == "Arabic" ~ "White",
Accused_Race %in% c("Unknown POC", "Hispanic/Latino", "X") ~ NA_character_,
str_detect(Accused_Race, "Manufacture") ~ NA_character_,
TRUE ~ Accused_Race
)
)
freq_table(DIH_DATA, Accused_Race)
## # A tibble: 5 × 3
## Accused_Race n percent
## <chr> <int> <dbl>
## 1 White 1974 63.8
## 2 Black 601 19.4
## 3 <NA> 494 16.0
## 4 Asian 14 0.453
## 5 Native American or Alaskan Native 10 0.323
bar_plot_cat(DIH_DATA, Accused_Race)
#👤 Deceased characteristics
Range: 0-91 Mean: 29.6
Median: 28
NA’s: 547 Only about 17% missingness here – which makes sense as news
articles will focus on the age of the deceased, something all readers
will want to know… Was there really a 91 year old who was the
deceased?
deceased under 18 =>
sum(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)
## [1] 255
percent of sample (after dropping those missing) 18 or under =>
mean(DIH_DATA$Deceased_Age <= 18, na.rm = TRUE)
## [1] 0.09673748
DIH_DATA %>%
filter(Deceased_Age <= 18) %>%
count(Deceased_Age, sort = TRUE)
## # A tibble: 40 × 2
## Deceased_Age n
## <dbl> <int>
## 1 18 66
## 2 17 33
## 3 16 24
## 4 2 17
## 5 1 15
## 6 15 15
## 7 3 11
## 8 14 8
## 9 5 7
## 10 6 5
## # ℹ 30 more rows
percent of sample 2 or younger =>
mean(DIH_DATA$Deceased_Age <= 2, na.rm = TRUE)* 100
## [1] 2.579666
Quartiles for 18 and younger =>
quantile(
DIH_DATA$Deceased_Age[DIH_DATA$Deceased_Age <= 18],
probs = c(0.25, 0.5, 0.75),
na.rm = TRUE
)
## 25% 50% 75%
## 2 15 18
Interested in the breakdown of deceased by age under 18, creating groups and graphing. =>
library(dplyr)
DIH_DATA <- DIH_DATA %>%
mutate(
age_group = case_when(
Deceased_Age >= 0 & Deceased_Age <= 1 ~ "0–1",
Deceased_Age >= 2 & Deceased_Age <= 3 ~ "2–3",
Deceased_Age >= 4 & Deceased_Age <= 5 ~ "4–5",
Deceased_Age >= 6 & Deceased_Age <= 7 ~ "6–7",
Deceased_Age >= 8 & Deceased_Age <= 9 ~ "8–9",
Deceased_Age >= 10 & Deceased_Age <= 11 ~ "10–11",
Deceased_Age >= 12 & Deceased_Age <= 13 ~ "12–13",
Deceased_Age >= 14 & Deceased_Age <= 15 ~ "14–15",
Deceased_Age >= 16 & Deceased_Age <= 17 ~ "16–17",
Deceased_Age == 18 ~ "18",
TRUE ~ NA_character_
)
)
#Ordered
DIH_DATA$age_group <- factor(
DIH_DATA$age_group,
levels = c("0–1","2–3","4–5","6–7","8–9","10–11","12–13","14–15","16–17","18")
)
#Plot
library(ggplot2)
ggplot(
DIH_DATA %>% filter(Deceased_Age <= 18, !is.na(age_group)),
aes(x = age_group, fill = age_group)
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Deceased Age (≤ 18)",
x = "Age Group",
y = "Count"
) +
guides(fill = "none") # removes legend if you don’t want it
freq_table(DIH_DATA, Deceased_Age)
## # A tibble: 95 × 3
## Deceased_Age n percent
## <dbl> <int> <dbl>
## 1 NA 457 14.8
## 2 28 135 4.36
## 3 21 133 4.30
## 4 26 132 4.27
## 5 23 124 4.01
## 6 25 120 3.88
## 7 24 117 3.78
## 8 30 114 3.69
## 9 22 109 3.52
## 10 27 106 3.43
## # ℹ 85 more rows
freq_table(DIH_DATA, age_group)
## # A tibble: 11 × 3
## age_group n percent
## <fct> <int> <dbl>
## 1 <NA> 2847 92.0
## 2 18 66 2.13
## 3 16–17 57 1.84
## 4 0–1 42 1.36
## 5 2–3 28 0.905
## 6 14–15 23 0.744
## 7 4–5 9 0.291
## 8 10–11 7 0.226
## 9 6–7 6 0.194
## 10 12–13 5 0.162
## 11 8–9 3 0.0970
freq_table(DIH_DATA, Deceased_Race)
## # A tibble: 12 × 3
## Deceased_Race n percent
## <chr> <int> <dbl>
## 1 White 1920 62.1
## 2 <NA> 983 31.8
## 3 Black 86 2.78
## 4 NA 64 2.07
## 5 Unknown POC 17 0.550
## 6 Asian 10 0.323
## 7 Native American or Alaskan Native 6 0.194
## 8 Hispanic/Latino 3 0.0970
## 9 Arabic 1 0.0323
## 10 Thomas McGuinness 1 0.0323
## 11 Unknown 1 0.0323
## 12 X 1 0.0323
Need to fix these… Arabic is changed to White. Thomas McGuinness changed to NA. Fix the NAs, X and Unknown
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Race = na_if(Deceased_Race, "NA"),
Deceased_Race = case_when(
Deceased_Race == "Arabic" ~ "White",
Deceased_Race == "Thomas McGuinness" ~ NA_character_,
Deceased_Race %in% c("Unknown", "X") ~ NA_character_,
TRUE ~ Deceased_Race
)
)
freq_table(DIH_DATA, Deceased_Race)
## # A tibble: 7 × 3
## Deceased_Race n percent
## <chr> <int> <dbl>
## 1 White 1921 62.1
## 2 <NA> 1050 33.9
## 3 Black 86 2.78
## 4 Unknown POC 17 0.550
## 5 Asian 10 0.323
## 6 Native American or Alaskan Native 6 0.194
## 7 Hispanic/Latino 3 0.0970
bar_plot_cat(DIH_DATA, Deceased_Race)
freq_table(DIH_DATA, Deceased_Ethnicity)
## # A tibble: 8 × 3
## Deceased_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2061 66.6
## 2 Not Hispanic/Latino 816 26.4
## 3 NA 113 3.65
## 4 Hispanic/Latino 97 3.14
## 5 Hispanic Latino 2 0.0647
## 6 White 2 0.0647
## 7 Not Hispanic/latino 1 0.0323
## 8 X 1 0.0323
Again, fixing the data…
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Ethnicity = na_if(Deceased_Ethnicity, "NA"),
Deceased_Ethnicity = case_when(
Deceased_Ethnicity == "Hispanic Latino" ~ "Hispanic/Latino",
Deceased_Ethnicity == "Not Hispanic/latino" ~ "Not Hispanic/Latino",
Deceased_Ethnicity == "White" ~ NA_character_,
Deceased_Ethnicity == "X" ~ NA_character_,
TRUE ~ Deceased_Ethnicity
)
)
freq_table(DIH_DATA, Deceased_Ethnicity)
## # A tibble: 3 × 3
## Deceased_Ethnicity n percent
## <chr> <int> <dbl>
## 1 <NA> 2177 70.4
## 2 Not Hispanic/Latino 817 26.4
## 3 Hispanic/Latino 99 3.20
bar_plot_cat(DIH_DATA, Deceased_Ethnicity)
Large missingness
freq_table(DIH_DATA, Deceased_Sex)
## # A tibble: 13 × 3
## Deceased_Sex n percent
## <chr> <int> <dbl>
## 1 Male 1849 59.8
## 2 Female 1050 33.9
## 3 <NA> 156 5.04
## 4 NA 23 0.744
## 5 male 5 0.162
## 6 Famale 2 0.0647
## 7 female 2 0.0647
## 8 27 1 0.0323
## 9 Both Female 1 0.0323
## 10 Both male 1 0.0323
## 11 Female, Male 1 0.0323
## 12 Other 1 0.0323
## 13 X 1 0.0323
changing both to be NA
DIH_DATA <- DIH_DATA %>%
mutate(
Deceased_Sex = case_when(
Deceased_Sex %in% c("NA","27","X","Other",
"Both Female","Both male","Female, Male", "Both") ~ NA_character_,
Deceased_Sex %in% c("male","Male") ~ "Male",
Deceased_Sex %in% c("female","Female","Famale") ~ "Female",
TRUE ~ Deceased_Sex
)
)
freq_table(DIH_DATA, Deceased_Sex)
## # A tibble: 3 × 3
## Deceased_Sex n percent
## <chr> <int> <dbl>
## 1 Male 1854 59.9
## 2 Female 1054 34.1
## 3 <NA> 185 5.98
#🌍 Rural Code NA’s: 112
ggplot(
DIH_DATA %>% filter(!is.na(Rural_Code)),
aes(x = factor(Rural_Code), fill = factor(Rural_Code))
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Rural_Code",
x = "Rural Code",
y = "Count"
) +
guides(fill = "none")
Creating urbanacity variable
DIH_DATA <- DIH_DATA %>%
mutate(
Urbanicity = case_when(
Rural_Code %in% 1:3 ~ "Metropolitan",
Rural_Code %in% 4:6 ~ "Suburban",
Rural_Code %in% 7:9 ~ "Rural",
TRUE ~ NA_character_
)
)
#Ordering for plotting
DIH_DATA$Urbanicity <- factor(
DIH_DATA$Urbanicity,
levels = c("Metropolitan", "Suburban", "Rural")
)
ggplot(
DIH_DATA %>% filter(!is.na(Urbanicity)),
aes(x = Urbanicity, fill = Urbanicity)
) +
geom_bar() +
geom_text(
stat = "count",
aes(label = ..count..),
vjust = -0.3
) +
labs(
title = "Distribution of Urbanicity",
x = "Urbanicity",
y = "Count"
) +
guides(fill = "none")
Would be interesting to see state by urbancity…
ggplot(
DIH_DATA %>% filter(!is.na(State), !is.na(Urbanicity)),
aes(x = State, fill = Urbanicity)
) +
geom_bar(position = "fill") +
labs(
title = "Proportion of Urbanicity by State",
x = "State",
y = "Proportion"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
I guess this could just be a function of how rural the state is…but we
could test further…
#📍State
There is a lowercase pennsylvania in the dataset…fixing it…
DIH_DATA %>%
filter(State == "pennsylvania")
## # A tibble: 1 × 34
## ...1 URL Court Rural_Code County_Fips County_FullName County_Name State
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 540 3 senten… <NA> 2 42071 Lancaster Coun… Lancaster penn…
## # ℹ 26 more variables: Year_Charged <dbl>, `Date Charge` <date>,
## # `Date Incident` <date>, Accused_Name <chr>, Sentence_clean <dbl>,
## # Sentence <chr>, Sentence_Quartile <dbl>, Charge <chr>, Accused_Race <chr>,
## # Accused_Ethnicity <chr>, Accused_Sex <chr>, Accused_Age <dbl>,
## # Accused_City <chr>, Accused_State <chr>, Court_Type <chr>, Plea <chr>,
## # Deceased_Name <chr>, Deceased_Race <chr>, Deceased_Ethnicity <chr>,
## # Deceased_Age <dbl>, Deceased_Sex <chr>, Substance <chr>, Lawyer <chr>, …
DIH_DATA <- DIH_DATA %>%
mutate(
State = ifelse(State == "pennsylvania", "Pennsylvania", State)
)
bar_plot_cat(DIH_DATA, State)
freq_table(DIH_DATA, State)
## # A tibble: 50 × 3
## State n percent
## <chr> <int> <dbl>
## 1 Pennsylvania 560 18.1
## 2 Wisconsin 338 10.9
## 3 Ohio 332 10.7
## 4 Illinois 288 9.31
## 5 Florida 113 3.65
## 6 Minnesota 113 3.65
## 7 New Jersey 113 3.65
## 8 North Carolina 110 3.56
## 9 New York 103 3.33
## 10 Michigan 94 3.04
## # ℹ 40 more rows
#⚖️ Sentence Quartile Mean: 2.5
Range: 1–4
NA’s: 1041
sum(!is.na(DIH_DATA$Sentence_Quartile))
## [1] 2118
Somehow there are more observations that have Quartiles than actual Sentences… @Katie & @Leo … any idea what happened here? Coding errors? I have created a table below with the observations thave have a response for Sentence Quartile but not Sentences…
problem_cases <- DIH_DATA %>%
filter(!is.na(Sentence_Quartile) & is.na(Sentence))
library(dplyr)
library(DT)
problem_cases <- DIH_DATA %>%
filter(!is.na(Sentence_Quartile) & is.na(Sentence))
datatable(
problem_cases,
options = list(
pageLength = 25,
scrollX = TRUE
),
filter = "top",
rownames = FALSE
)
I am not going to use this variable until we figure out what is going on here…
#⚖️ Legal / case variables ## • Charge
freq_table(DIH_DATA,Charge)
## # A tibble: 31 × 3
## Charge n percent
## <chr> <int> <dbl>
## 1 Drug Delivery/Distribution Resulting in Death 580 18.8
## 2 Involuntary Manslaughter 459 14.8
## 3 Reckless/Negligent Homicide 414 13.4
## 4 Murder (2nd or 3rd Degree) 334 10.8
## 5 Drug Induced Homicide 295 9.54
## 6 Manslaughter 238 7.69
## 7 Delivery/Distribution of Heroin/Fentanyl Resulting in Death 143 4.62
## 8 Delivery/Distribution Of a Controlled Substance Causing/Result… 111 3.59
## 9 Murder (1st Degree) 95 3.07
## 10 Other (Please Specify in Notes) 87 2.81
## # ℹ 21 more rows
@Leo, can you plese let me know how you would like these categorized? I do not have the previous code that you used so I do not know what previous decisions were made…
freq_table(DIH_DATA,Plea)
## # A tibble: 32 × 3
## Plea n percent
## <chr> <int> <dbl>
## 1 Guilty 1679 54.3
## 2 <NA> 843 27.3
## 3 Not Guilty 196 6.34
## 4 NA 181 5.85
## 5 Guilty of a Lesser Charge 43 1.39
## 6 No Contest 40 1.29
## 7 Guilty to a Lesser Charge 30 0.970
## 8 Awaiting Plea Hearing 28 0.905
## 9 Pleaded Guilty 13 0.420
## 10 Guilty to a Lesser DIH Charge 12 0.388
## # ℹ 22 more rows
Fixed the data as follows…
DIH_DATA <- DIH_DATA %>%
mutate(
Plea = na_if(Plea, "NA"),
Plea = case_when(
str_detect(tolower(Plea), "guilty") & !str_detect(tolower(Plea), "lesser") ~ "Guilty",
str_detect(tolower(Plea), "lesser") ~ "Guilty to Lesser Charge",
str_detect(tolower(Plea), "no contest|nolo") ~ "No Contest",
str_detect(tolower(Plea), "not guilty") ~ "Not Guilty",
str_detect(tolower(Plea), "pending") ~ "Pending/Awaiting",
Plea %in% c("State", "X", "Other") ~ NA_character_,
TRUE ~ Plea
)
)
DIH_DATA <- DIH_DATA %>%
mutate(
Plea = case_when(
Plea == "Judge ruled it was not murder" ~ NA_character_,
Plea == "Pleads not guily" ~ "Guilty",
Plea == "Pled guity to DIH charges" ~ "Guilty",
TRUE ~ Plea
)
)
freq_table(DIH_DATA, Plea)
## # A tibble: 6 × 3
## Plea n percent
## <chr> <int> <dbl>
## 1 Guilty 1907 61.7
## 2 <NA> 1026 33.2
## 3 Guilty to Lesser Charge 85 2.75
## 4 No Contest 46 1.49
## 5 Awaiting Plea Hearing 28 0.905
## 6 Alford Plea 1 0.0323
bar_plot_cat(DIH_DATA, Plea)
Do we have data on convictions? Versus just pleas
freq_table(DIH_DATA,Court_Type)
## # A tibble: 6 × 3
## Court_Type n percent
## <chr> <int> <dbl>
## 1 State 2678 86.6
## 2 Federal 332 10.7
## 3 <NA> 69 2.23
## 4 NA 11 0.356
## 5 federal 2 0.0647
## 6 Fayette County 1 0.0323
DIH_DATA <- DIH_DATA %>%
mutate(
Court_Type = na_if(Court_Type, "NA"),
Court_Type = case_when(
Court_Type == "federal" ~ "Federal",
Court_Type == "Fayette County" ~ NA_character_,
TRUE ~ Court_Type
)
)
freq_table(DIH_DATA, Court_Type)
## # A tibble: 3 × 3
## Court_Type n percent
## <chr> <int> <dbl>
## 1 State 2678 86.6
## 2 Federal 334 10.8
## 3 <NA> 81 2.62
freq_table(DIH_DATA, Relationship)
## # A tibble: 6 × 3
## Relationship n percent
## <chr> <int> <dbl>
## 1 Dealer/Buyer 2011 65.0
## 2 Caretaker/Family/Friend/Partner/Co-User 744 24.1
## 3 <NA> 248 8.02
## 4 Doctor/Patient 57 1.84
## 5 NA 25 0.808
## 6 Dealer/buyer 8 0.259
Fixing typos.
DIH_DATA <- DIH_DATA %>%
mutate(
Relationship = na_if(Relationship, "NA"),
Relationship = case_when(
Relationship %in% c("Dealer/buyer", "Dealer/Buyer") ~ "Dealer/Buyer",
TRUE ~ Relationship
)
)
freq_table(DIH_DATA, Relationship)
## # A tibble: 4 × 3
## Relationship n percent
## <chr> <int> <dbl>
## 1 Dealer/Buyer 2019 65.3
## 2 Caretaker/Family/Friend/Partner/Co-User 744 24.1
## 3 <NA> 273 8.83
## 4 Doctor/Patient 57 1.84
freq_table(DIH_DATA, Substance)
## # A tibble: 39 × 3
## Substance n percent
## <chr> <int> <dbl>
## 1 Heroin 907 29.3
## 2 <NA> 563 18.2
## 3 Fentanyl (Analog) 444 14.4
## 4 Fentanyl (Analog) and Heroin 397 12.8
## 5 Prescription Opioid 121 3.91
## 6 Mixed Drugs Not Listed (Comment in Cell) 104 3.36
## 7 Methadone 75 2.42
## 8 Methamphetamine 64 2.07
## 9 Other Drug (Comment in Cell) 62 2.00
## 10 Fentanyl Analog 47 1.52
## # ℹ 29 more rows
Cleaning data as follows…
library(stringr)
library(dplyr)
DIH_DATA <- DIH_DATA %>%
mutate(
Substance = tolower(Substance),
Substance = na_if(Substance, "na"),
Substance_clean = case_when(
str_detect(Substance, "heroin") & str_detect(Substance, "fentanyl") ~ "Fentanyl + Heroin",
str_detect(Substance, "fentanyl") ~ "Fentanyl",
str_detect(Substance, "heroin") ~ "Heroin",
str_detect(Substance, "meth") ~ "Methamphetamine",
str_detect(Substance, "cocaine") ~ "Cocaine",
str_detect(Substance, "opioid|oxycodone|suboxone|methadone") ~ "Other Opioid",
str_detect(Substance, "multiple|mixed|and") ~ "Polysubstance",
str_detect(Substance, "xanax|benzo") ~ "Other",
str_detect(Substance, "ecstasy") ~ "Other",
str_detect(Substance, "benadryl") ~ "Other",
str_detect(Substance, "bullskin") ~ NA_character_,
TRUE ~ "Other"
)
)
freq_table(DIH_DATA, Substance_clean)
## # A tibble: 9 × 3
## Substance_clean n percent
## <chr> <int> <dbl>
## 1 Heroin 908 29.4
## 2 Other 647 20.9
## 3 Fentanyl 645 20.9
## 4 Fentanyl + Heroin 417 13.5
## 5 Methamphetamine 152 4.91
## 6 Other Opioid 145 4.69
## 7 Polysubstance 104 3.36
## 8 Cocaine 74 2.39
## 9 <NA> 1 0.0323
bar_plot_cat(DIH_DATA, Substance_clean)