library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Individual-level juvenile cases (non-traffic)
cases_fy23 <- read_excel("juvenile_dataset.xlsx", sheet = "FY23")
cases_fy24 <- read_excel("juvenile_dataset.xlsx", sheet = "FY24")

# Context / summary info
juvenile_wide   <- read_excel("juvenile_data_information.xlsx", sheet = "Juvenile_Wide")
section_totals  <- read_excel("juvenile_section_totals.xlsx", sheet = "Sheet1")
cases <- bind_rows(
  cases_fy23 %>% mutate(FiscalYear = "FY2023"),
  cases_fy24 %>% mutate(FiscalYear = "FY2024")
)

cases <- cases %>%
  mutate(
    recid = if_else(PersonViolCtAll > 1, 1L, 0L),
    # clean categorical variables for tables
    gender = case_when(
      Gender == "M" ~ "Male",
      Gender == "F" ~ "Female",
      Gender == "U" ~ "Unknown",
      TRUE          ~ "Missing"
    ),
    race = case_when(
      Race == "W" ~ "White",
      Race == "H" ~ "Hispanic",
      Race == "B" ~ "Black",
      Race == "A" ~ "Asian",
      Race == "X" ~ "Other",
      Race == "U" ~ "Unknown",
      TRUE        ~ "Missing"
    )
  )
# Overall N, age, and recidivism rate
cases %>%
  summarise(
    n_cases      = n(),
    mean_age     = mean(AgeAtTimeOfViolation, na.rm = TRUE),
    sd_age       = sd(AgeAtTimeOfViolation, na.rm = TRUE),
    min_age      = min(AgeAtTimeOfViolation, na.rm = TRUE),
    max_age      = max(AgeAtTimeOfViolation, na.rm = TRUE),
    recid_rate   = mean(recid, na.rm = TRUE)
  )
## # A tibble: 1 × 6
##   n_cases mean_age sd_age min_age max_age recid_rate
##     <int>    <dbl>  <dbl>   <dbl>   <dbl>      <dbl>
## 1     649     14.2   1.58       2      16      0.149
desc_by_year <- cases %>%
  group_by(FiscalYear) %>%
  summarise(
    n_cases    = n(),
    mean_age   = mean(AgeAtTimeOfViolation, na.rm = TRUE),
    recid_rate = mean(recid, na.rm = TRUE)
  )

desc_by_year
## # A tibble: 2 × 4
##   FiscalYear n_cases mean_age recid_rate
##   <chr>        <int>    <dbl>      <dbl>
## 1 FY2023         418     14.0      0.136
## 2 FY2024         231     14.4      0.173
tab_gender <- cases %>%
  group_by(gender) %>%
  summarise(
    n        = n(),
    recid_n  = sum(recid, na.rm = TRUE),
    recid_rate = recid_n / n
  )

tab_gender
## # A tibble: 4 × 4
##   gender      n recid_n recid_rate
##   <chr>   <int>   <int>      <dbl>
## 1 Female    270      29     0.107 
## 2 Male      333      62     0.186 
## 3 Missing    23       2     0.0870
## 4 Unknown    23       4     0.174
tab_race <- cases %>%
  group_by(race) %>%
  summarise(
    n        = n(),
    recid_n  = sum(recid, na.rm = TRUE),
    recid_rate = recid_n / n
  )

tab_race
## # A tibble: 7 × 4
##   race         n recid_n recid_rate
##   <chr>    <int>   <int>      <dbl>
## 1 Asian        7       0      0    
## 2 Black       68      15      0.221
## 3 Hispanic   240      38      0.158
## 4 Missing     32       4      0.125
## 5 Other        1       0      0    
## 6 Unknown     26       4      0.154
## 7 White      275      36      0.131
teen_court_summary <- juvenile_wide %>%
  transmute(
    FiscalYear,
    YouthServed,
    TrafficCitations,
    NonTrafficCitations,
    TeenCourtReferred,
    TeenCourtDismissed,
    DismissalRate = TeenCourtDismissed / TeenCourtReferred,
    JuvenileCaseClearanceRate
  )

teen_court_summary
## # A tibble: 2 × 8
##   FiscalYear YouthServed TrafficCitations NonTrafficCitations TeenCourtReferred
##        <dbl>       <dbl>            <dbl>               <dbl>             <dbl>
## 1       2023        5410             1311                 494                20
## 2       2024       10185             1047                 298                63
## # ℹ 3 more variables: TeenCourtDismissed <dbl>, DismissalRate <dbl>,
## #   JuvenileCaseClearanceRate <dbl>
model_data <- cases %>%
  mutate(
    gender_male = if_else(gender == "Male", 1, 0, missing = NA_integer_),
    race_black  = if_else(race == "Black", 1, 0, missing = 0),
    race_hisp   = if_else(race == "Hispanic", 1, 0, missing = 0),
    fy2024      = if_else(FiscalYear == "FY2024", 1, 0)
  ) %>%
  drop_na(recid, AgeAtTimeOfViolation, gender_male)

logit_mod <- glm(
  recid ~ AgeAtTimeOfViolation + gender_male + race_black + race_hisp + fy2024,
  data = model_data,
  family = binomial(link = "logit")
)

summary(logit_mod)
## 
## Call:
## glm(formula = recid ~ AgeAtTimeOfViolation + gender_male + race_black + 
##     race_hisp + fy2024, family = binomial(link = "logit"), data = model_data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)          -3.32906    1.07835  -3.087  0.00202 **
## AgeAtTimeOfViolation  0.07078    0.07522   0.941  0.34674   
## gender_male           0.59182    0.23020   2.571  0.01014 * 
## race_black            0.64066    0.33925   1.888  0.05896 . 
## race_hisp             0.17199    0.24317   0.707  0.47938   
## fy2024                0.27121    0.22892   1.185  0.23613   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 547.46  on 648  degrees of freedom
## Residual deviance: 534.05  on 643  degrees of freedom
## AIC: 546.05
## 
## Number of Fisher Scoring iterations: 4
# Odds ratios with 95% CIs
exp(cbind(OR = coef(logit_mod), confint(logit_mod)))
## Waiting for profiling to be done...
##                              OR      2.5 %    97.5 %
## (Intercept)          0.03582682 0.00397749 0.2723369
## AgeAtTimeOfViolation 1.07334539 0.93063426 1.2496187
## gender_male          1.80727839 1.15738105 2.8606593
## race_black           1.89773773 0.95276296 3.6307168
## race_hisp            1.18767186 0.73488196 1.9114545
## fy2024               1.31154623 0.83341007 2.0490308
numeric_data <- cases %>%
  select(AgeAtTimeOfViolation, PersonViolCtAll) %>%
  drop_na()
summary(numeric_data)
##  AgeAtTimeOfViolation PersonViolCtAll
##  Min.   : 2.00        Min.   :1.00   
##  1st Qu.:13.00        1st Qu.:1.00   
##  Median :14.00        Median :1.00   
##  Mean   :14.16        Mean   :1.24   
##  3rd Qu.:15.00        3rd Qu.:1.00   
##  Max.   :16.00        Max.   :7.00
cor(numeric_data, method = "pearson")
##                      AgeAtTimeOfViolation PersonViolCtAll
## AgeAtTimeOfViolation           1.00000000      0.06401434
## PersonViolCtAll                0.06401434      1.00000000
cor.test(
  ~ AgeAtTimeOfViolation + PersonViolCtAll,
  data = numeric_data
)
## 
##  Pearson's product-moment correlation
## 
## data:  AgeAtTimeOfViolation and PersonViolCtAll
## t = 1.6316, df = 647, p-value = 0.1032
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01301102  0.14028447
## sample estimates:
##        cor 
## 0.06401434
library(ggplot2)

ggplot(cases, aes(x = AgeAtTimeOfViolation, y = PersonViolCtAll)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Correlation Between Age and Prior Juvenile Cases",
    x = "Age at Time of Violation",
    y = "Total Cases for Youth (PersonViolCtAll)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
numeric_vars <- cases %>%
  select(AgeAtTimeOfViolation, PersonViolCtAll) %>%
  drop_na()

corr_matrix <- round(cor(numeric_vars), 2)

melted_corr <- melt(corr_matrix)

ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = value), color = "black") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limit = c(-1, 1)
  ) +
  labs(
    title = "Correlation Matrix of Numeric Variables",
    x = "",
    y = ""
  ) +
  theme_minimal()

ggplot(cases, aes(x = gender, fill = factor(recid))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Recidivism Rates by Gender",
    x = "Gender",
    y = "Percentage of Cases",
    fill = "Recidivist"
  ) +
  theme_minimal()

ggplot(cases, aes(x = race, fill = factor(recid))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Recidivism Rates by Race/Ethnicity",
    x = "Race/Ethnicity",
    y = "Percentage of Cases",
    fill = "Recidivist"
  ) +
  theme_minimal()

cor.test(cases$recid, cases$AgeAtTimeOfViolation)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$recid and cases$AgeAtTimeOfViolation
## t = 1.3008, df = 647, p-value = 0.1938
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02599198  0.12753144
## sample estimates:
##        cor 
## 0.05107145
library(lsr)
## Warning: package 'lsr' was built under R version 4.4.3
tbl <- table(cases$gender, cases$recid)
cramersV(tbl)
## Warning in stats::chisq.test(...): Chi-squared approximation may be incorrect
## [1] 0.1117468
chisq.test(table(cases$gender, cases$recid))
## Warning in chisq.test(table(cases$gender, cases$recid)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(cases$gender, cases$recid)
## X-squared = 8.1043, df = 3, p-value = 0.0439
model <- glm(
 recid ~ AgeAtTimeOfViolation + gender + race + FiscalYear,
 data = cases,
 family = binomial
)

summary(model)
## 
## Call:
## glm(formula = recid ~ AgeAtTimeOfViolation + gender + race + 
##     FiscalYear, family = binomial, data = cases)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)   
## (Intercept)           -17.17450  542.28384  -0.032  0.97473   
## AgeAtTimeOfViolation    0.07113    0.07637   0.931  0.35166   
## genderMale              0.66985    0.24501   2.734  0.00626 **
## genderMissing          -0.74145    1.10529  -0.671  0.50234   
## genderUnknown           0.99867    1.10482   0.904  0.36604   
## raceBlack              14.43643  542.28278   0.027  0.97876   
## raceHispanic           13.94934  542.28272   0.026  0.97948   
## raceMissing            14.41290  542.28331   0.027  0.97880   
## raceOther               0.61258 1553.14280   0.000  0.99969   
## raceUnknown            13.52728  542.28376   0.025  0.98010   
## raceWhite              13.76580  542.28272   0.025  0.97975   
## FiscalYearFY2024        0.26991    0.23253   1.161  0.24574   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 547.46  on 648  degrees of freedom
## Residual deviance: 529.50  on 637  degrees of freedom
## AIC: 553.5
## 
## Number of Fisher Scoring iterations: 14
library(broom)
tidy(model, exponentiate = TRUE, conf.int = TRUE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## # A tibble: 12 × 7
##    term                 estimate std.error statistic p.value  conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>   <dbl>     <dbl>     <dbl>
##  1 (Intercept)           3.48e-8  542.     -0.0317   0.975   NA         7.20e 12
##  2 AgeAtTimeOfViolation  1.07e+0    0.0764  0.931    0.352    9.29e- 1  1.25e  0
##  3 genderMale            1.95e+0    0.245   2.73     0.00626  1.22e+ 0  3.20e  0
##  4 genderMissing         4.76e-1    1.11   -0.671    0.502    4.79e- 2  4.69e  0
##  5 genderUnknown         2.71e+0    1.10    0.904    0.366    3.17e- 1  2.72e  1
##  6 raceBlack             1.86e+6  542.      0.0266   0.979    1.75e-15 NA       
##  7 raceHispanic          1.14e+6  542.      0.0257   0.979    9.05e-16 NA       
##  8 raceMissing           1.82e+6  542.      0.0266   0.979    1.72e- 2  1.10e 95
##  9 raceOther             1.85e+0 1553.      0.000394 1.00     1.79e- 6  9.73e  5
## 10 raceUnknown           7.50e+5  542.      0.0249   0.980    4.60e+64  5.08e101
## 11 raceWhite             9.52e+5  542.      0.0254   0.980    7.55e-16 NA       
## 12 FiscalYearFY2024      1.31e+0    0.233   1.16     0.246    8.26e- 1  2.06e  0
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6   2023-06-27
hoslem.test(model$y, fitted(model))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  model$y, fitted(model)
## X-squared = 7.2643, df = 8, p-value = 0.5084
# install.packages("lsr")  # <-- delete or comment this out
library(lsr)