library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Individual-level juvenile cases (non-traffic)
cases_fy23 <- read_excel("juvenile_dataset.xlsx", sheet = "FY23")
cases_fy24 <- read_excel("juvenile_dataset.xlsx", sheet = "FY24")
# Context / summary info
juvenile_wide <- read_excel("juvenile_data_information.xlsx", sheet = "Juvenile_Wide")
section_totals <- read_excel("juvenile_section_totals.xlsx", sheet = "Sheet1")
cases <- bind_rows(
cases_fy23 %>% mutate(FiscalYear = "FY2023"),
cases_fy24 %>% mutate(FiscalYear = "FY2024")
)
cases <- cases %>%
mutate(
recid = if_else(PersonViolCtAll > 1, 1L, 0L),
# clean categorical variables for tables
gender = case_when(
Gender == "M" ~ "Male",
Gender == "F" ~ "Female",
Gender == "U" ~ "Unknown",
TRUE ~ "Missing"
),
race = case_when(
Race == "W" ~ "White",
Race == "H" ~ "Hispanic",
Race == "B" ~ "Black",
Race == "A" ~ "Asian",
Race == "X" ~ "Other",
Race == "U" ~ "Unknown",
TRUE ~ "Missing"
)
)
# Overall N, age, and recidivism rate
cases %>%
summarise(
n_cases = n(),
mean_age = mean(AgeAtTimeOfViolation, na.rm = TRUE),
sd_age = sd(AgeAtTimeOfViolation, na.rm = TRUE),
min_age = min(AgeAtTimeOfViolation, na.rm = TRUE),
max_age = max(AgeAtTimeOfViolation, na.rm = TRUE),
recid_rate = mean(recid, na.rm = TRUE)
)
## # A tibble: 1 × 6
## n_cases mean_age sd_age min_age max_age recid_rate
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 649 14.2 1.58 2 16 0.149
desc_by_year <- cases %>%
group_by(FiscalYear) %>%
summarise(
n_cases = n(),
mean_age = mean(AgeAtTimeOfViolation, na.rm = TRUE),
recid_rate = mean(recid, na.rm = TRUE)
)
desc_by_year
## # A tibble: 2 × 4
## FiscalYear n_cases mean_age recid_rate
## <chr> <int> <dbl> <dbl>
## 1 FY2023 418 14.0 0.136
## 2 FY2024 231 14.4 0.173
tab_gender <- cases %>%
group_by(gender) %>%
summarise(
n = n(),
recid_n = sum(recid, na.rm = TRUE),
recid_rate = recid_n / n
)
tab_gender
## # A tibble: 4 × 4
## gender n recid_n recid_rate
## <chr> <int> <int> <dbl>
## 1 Female 270 29 0.107
## 2 Male 333 62 0.186
## 3 Missing 23 2 0.0870
## 4 Unknown 23 4 0.174
tab_race <- cases %>%
group_by(race) %>%
summarise(
n = n(),
recid_n = sum(recid, na.rm = TRUE),
recid_rate = recid_n / n
)
tab_race
## # A tibble: 7 × 4
## race n recid_n recid_rate
## <chr> <int> <int> <dbl>
## 1 Asian 7 0 0
## 2 Black 68 15 0.221
## 3 Hispanic 240 38 0.158
## 4 Missing 32 4 0.125
## 5 Other 1 0 0
## 6 Unknown 26 4 0.154
## 7 White 275 36 0.131
teen_court_summary <- juvenile_wide %>%
transmute(
FiscalYear,
YouthServed,
TrafficCitations,
NonTrafficCitations,
TeenCourtReferred,
TeenCourtDismissed,
DismissalRate = TeenCourtDismissed / TeenCourtReferred,
JuvenileCaseClearanceRate
)
teen_court_summary
## # A tibble: 2 × 8
## FiscalYear YouthServed TrafficCitations NonTrafficCitations TeenCourtReferred
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2023 5410 1311 494 20
## 2 2024 10185 1047 298 63
## # ℹ 3 more variables: TeenCourtDismissed <dbl>, DismissalRate <dbl>,
## # JuvenileCaseClearanceRate <dbl>
model_data <- cases %>%
mutate(
gender_male = if_else(gender == "Male", 1, 0, missing = NA_integer_),
race_black = if_else(race == "Black", 1, 0, missing = 0),
race_hisp = if_else(race == "Hispanic", 1, 0, missing = 0),
fy2024 = if_else(FiscalYear == "FY2024", 1, 0)
) %>%
drop_na(recid, AgeAtTimeOfViolation, gender_male)
logit_mod <- glm(
recid ~ AgeAtTimeOfViolation + gender_male + race_black + race_hisp + fy2024,
data = model_data,
family = binomial(link = "logit")
)
summary(logit_mod)
##
## Call:
## glm(formula = recid ~ AgeAtTimeOfViolation + gender_male + race_black +
## race_hisp + fy2024, family = binomial(link = "logit"), data = model_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.32906 1.07835 -3.087 0.00202 **
## AgeAtTimeOfViolation 0.07078 0.07522 0.941 0.34674
## gender_male 0.59182 0.23020 2.571 0.01014 *
## race_black 0.64066 0.33925 1.888 0.05896 .
## race_hisp 0.17199 0.24317 0.707 0.47938
## fy2024 0.27121 0.22892 1.185 0.23613
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 547.46 on 648 degrees of freedom
## Residual deviance: 534.05 on 643 degrees of freedom
## AIC: 546.05
##
## Number of Fisher Scoring iterations: 4
# Odds ratios with 95% CIs
exp(cbind(OR = coef(logit_mod), confint(logit_mod)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.03582682 0.00397749 0.2723369
## AgeAtTimeOfViolation 1.07334539 0.93063426 1.2496187
## gender_male 1.80727839 1.15738105 2.8606593
## race_black 1.89773773 0.95276296 3.6307168
## race_hisp 1.18767186 0.73488196 1.9114545
## fy2024 1.31154623 0.83341007 2.0490308
numeric_data <- cases %>%
select(AgeAtTimeOfViolation, PersonViolCtAll) %>%
drop_na()
summary(numeric_data)
## AgeAtTimeOfViolation PersonViolCtAll
## Min. : 2.00 Min. :1.00
## 1st Qu.:13.00 1st Qu.:1.00
## Median :14.00 Median :1.00
## Mean :14.16 Mean :1.24
## 3rd Qu.:15.00 3rd Qu.:1.00
## Max. :16.00 Max. :7.00
cor(numeric_data, method = "pearson")
## AgeAtTimeOfViolation PersonViolCtAll
## AgeAtTimeOfViolation 1.00000000 0.06401434
## PersonViolCtAll 0.06401434 1.00000000
cor.test(
~ AgeAtTimeOfViolation + PersonViolCtAll,
data = numeric_data
)
##
## Pearson's product-moment correlation
##
## data: AgeAtTimeOfViolation and PersonViolCtAll
## t = 1.6316, df = 647, p-value = 0.1032
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01301102 0.14028447
## sample estimates:
## cor
## 0.06401434
library(ggplot2)
ggplot(cases, aes(x = AgeAtTimeOfViolation, y = PersonViolCtAll)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Correlation Between Age and Prior Juvenile Cases",
x = "Age at Time of Violation",
y = "Total Cases for Youth (PersonViolCtAll)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
numeric_vars <- cases %>%
select(AgeAtTimeOfViolation, PersonViolCtAll) %>%
drop_na()
corr_matrix <- round(cor(numeric_vars), 2)
melted_corr <- melt(corr_matrix)
ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = value), color = "black") +
scale_fill_gradient2(
low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1)
) +
labs(
title = "Correlation Matrix of Numeric Variables",
x = "",
y = ""
) +
theme_minimal()

ggplot(cases, aes(x = gender, fill = factor(recid))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Recidivism Rates by Gender",
x = "Gender",
y = "Percentage of Cases",
fill = "Recidivist"
) +
theme_minimal()

ggplot(cases, aes(x = race, fill = factor(recid))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(
title = "Recidivism Rates by Race/Ethnicity",
x = "Race/Ethnicity",
y = "Percentage of Cases",
fill = "Recidivist"
) +
theme_minimal()

cor.test(cases$recid, cases$AgeAtTimeOfViolation)
##
## Pearson's product-moment correlation
##
## data: cases$recid and cases$AgeAtTimeOfViolation
## t = 1.3008, df = 647, p-value = 0.1938
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02599198 0.12753144
## sample estimates:
## cor
## 0.05107145
library(lsr)
## Warning: package 'lsr' was built under R version 4.4.3
tbl <- table(cases$gender, cases$recid)
cramersV(tbl)
## Warning in stats::chisq.test(...): Chi-squared approximation may be incorrect
## [1] 0.1117468
chisq.test(table(cases$gender, cases$recid))
## Warning in chisq.test(table(cases$gender, cases$recid)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(cases$gender, cases$recid)
## X-squared = 8.1043, df = 3, p-value = 0.0439
model <- glm(
recid ~ AgeAtTimeOfViolation + gender + race + FiscalYear,
data = cases,
family = binomial
)
summary(model)
##
## Call:
## glm(formula = recid ~ AgeAtTimeOfViolation + gender + race +
## FiscalYear, family = binomial, data = cases)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.17450 542.28384 -0.032 0.97473
## AgeAtTimeOfViolation 0.07113 0.07637 0.931 0.35166
## genderMale 0.66985 0.24501 2.734 0.00626 **
## genderMissing -0.74145 1.10529 -0.671 0.50234
## genderUnknown 0.99867 1.10482 0.904 0.36604
## raceBlack 14.43643 542.28278 0.027 0.97876
## raceHispanic 13.94934 542.28272 0.026 0.97948
## raceMissing 14.41290 542.28331 0.027 0.97880
## raceOther 0.61258 1553.14280 0.000 0.99969
## raceUnknown 13.52728 542.28376 0.025 0.98010
## raceWhite 13.76580 542.28272 0.025 0.97975
## FiscalYearFY2024 0.26991 0.23253 1.161 0.24574
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 547.46 on 648 degrees of freedom
## Residual deviance: 529.50 on 637 degrees of freedom
## AIC: 553.5
##
## Number of Fisher Scoring iterations: 14
library(broom)
tidy(model, exponentiate = TRUE, conf.int = TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## # A tibble: 12 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.48e-8 542. -0.0317 0.975 NA 7.20e 12
## 2 AgeAtTimeOfViolation 1.07e+0 0.0764 0.931 0.352 9.29e- 1 1.25e 0
## 3 genderMale 1.95e+0 0.245 2.73 0.00626 1.22e+ 0 3.20e 0
## 4 genderMissing 4.76e-1 1.11 -0.671 0.502 4.79e- 2 4.69e 0
## 5 genderUnknown 2.71e+0 1.10 0.904 0.366 3.17e- 1 2.72e 1
## 6 raceBlack 1.86e+6 542. 0.0266 0.979 1.75e-15 NA
## 7 raceHispanic 1.14e+6 542. 0.0257 0.979 9.05e-16 NA
## 8 raceMissing 1.82e+6 542. 0.0266 0.979 1.72e- 2 1.10e 95
## 9 raceOther 1.85e+0 1553. 0.000394 1.00 1.79e- 6 9.73e 5
## 10 raceUnknown 7.50e+5 542. 0.0249 0.980 4.60e+64 5.08e101
## 11 raceWhite 9.52e+5 542. 0.0254 0.980 7.55e-16 NA
## 12 FiscalYearFY2024 1.31e+0 0.233 1.16 0.246 8.26e- 1 2.06e 0
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6 2023-06-27
hoslem.test(model$y, fitted(model))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: model$y, fitted(model)
## X-squared = 7.2643, df = 8, p-value = 0.5084
# install.packages("lsr") # <-- delete or comment this out
library(lsr)