library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
# Main individual-level dataset

juvenile_raw <- read_excel("juvenile_dataset.xlsx")

# Section totals (optional for context)

section_totals <- read_excel("juvenile_section_totals.xlsx")

# Info / metadata (optional for context)

info <- read_excel("juvenile_data_information.xlsx")

# Quick look at the raw case-level data

dplyr::glimpse(juvenile_raw)
## Rows: 649
## Columns: 23
## $ FiscalYear           <dbl> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
## $ ViolationDate        <dttm> 2023-01-04, 2023-01-05, 2023-01-05, 2023-01-09, …
## $ ViolTime             <dttm> 1899-12-31 15:02:00, 1899-12-31 10:31:00, 1899-1…
## $ Filed                <dttm> 2023-08-28, 2023-03-09, 2023-03-09, 2023-08-28, …
## $ StatusDate           <dttm> 2025-01-14, 2023-10-19, 2023-10-06, 2024-10-23, …
## $ ConvictionDate       <dttm> NA, 2023-10-19, NA, 2024-10-23, 2024-06-10, 2025…
## $ PleaDate             <dttm> NA, 2023-04-04, 2023-05-02, 2023-10-10, 2023-08-…
## $ Plea                 <chr> NA, "NC", "NC", "NC", "NC", "NC", NA, "NC", "NC",…
## $ DefendantZipcode1    <dbl> 78229, 78228, 78228, 78228, 78224, 78224, 78242, …
## $ OffenseCode          <chr> "2024A", "2017A", "2017A", "3006A", "6309A", "630…
## $ OffenseDesc          <chr> "DISORDERLY CONDUCT-FIGHTING", "FAIL TO INDENTIFY…
## $ ViolPaid             <dbl> 0, 0, 0, 0, 0, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 212…
## $ ViolFine             <dbl> 0, 100, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131,…
## $ ViolFees             <dbl> 81, 81, 0, 81, 81, 96, 0, 81, 0, 0, 150, 0, 0, 81…
## $ AmtDue               <dbl> 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 0, 0, 81, 0, …
## $ Status               <chr> "JR", "CL", "WD", "CL", "CL", "CL", "DP", "CL", "…
## $ StatusDescription    <chr> "AUTO RESET-JUVENILE APPEARANCE", "DOCKET CLOSED"…
## $ AgeAtTimeOfViolation <dbl> 14, 14, 14, 12, 15, 15, 16, 15, 13, 16, 16, 16, 1…
## $ Race                 <chr> "W", "H", "H", "W", "W", "W", "H", "H", "W", "H",…
## $ Gender               <chr> "M", "M", "M", "M", "M", "M", "F", "M", "M", "M",…
## $ CaseType             <chr> "NTRF", "NTRF", "NTRF", "NTRF", "NTRF", "NTRF", "…
## $ PersonViolCtAll      <dbl> 1, 1, 1, 1, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ PersonViolCtNTRF     <dbl> 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
juvenile <- juvenile_raw %>%
  mutate(
    FiscalYear       = as.factor(FiscalYear),
    ViolationDate    = as.Date(ViolationDate),
    Filed            = as.Date(Filed),
    StatusDate       = as.Date(StatusDate),
    ConvictionDate   = as.Date(ConvictionDate),
    PleaDate         = as.Date(PleaDate),

    age              = AgeAtTimeOfViolation,
    gender           = str_to_title(Gender),
    race             = str_to_title(Race),

    zipcode          = sprintf("%05d", DefendantZipcode1),

    total_violations = PersonViolCtAll,
    cited_violations = PersonViolCtNTRF,

    repeat_offender  = ifelse(total_violations > 1, 1, 0)
  ) %>%
  filter(CaseType == "NTRF")
research_data <- juvenile %>%
select(
FiscalYear,
age,
race,
gender,
zipcode,
total_violations,
cited_violations,
repeat_offender,

)

head(research_data)
## # A tibble: 6 × 8
##   FiscalYear   age race  gender zipcode total_violations cited_violations
##   <fct>      <dbl> <chr> <chr>  <chr>              <dbl>            <dbl>
## 1 2023          14 W     M      78229                  1                1
## 2 2023          14 H     M      78228                  1                1
## 3 2023          14 H     M      78228                  1                1
## 4 2023          12 W     M      78228                  1                1
## 5 2023          15 W     M      78224                  5                2
## 6 2023          15 W     M      78224                  5                2
## # ℹ 1 more variable: repeat_offender <dbl>
# Number of cases (rows)
n_cases <- nrow(research_data)

# Number of unique zip codes
n_zip <- dplyr::n_distinct(research_data$zipcode)

# Number of unique races and genders
n_race <- dplyr::n_distinct(research_data$race)
n_gender <- dplyr::n_distinct(research_data$gender)

cat("Total cases:", n_cases, "\n")
## Total cases: 649
cat("Unique zip codes:", n_zip, "\n")
## Unique zip codes: 77
cat("Race groups:", n_race, "\n")
## Race groups: 7
cat("Gender groups:", n_gender, "\n")
## Gender groups: 4
age_summary <- research_data %>%
  summarise(
    n      = sum(!is.na(age)),
    mean   = mean(age, na.rm = TRUE),
    sd     = sd(age, na.rm = TRUE),
    min    = min(age, na.rm = TRUE),
    q1     = quantile(age, 0.25, na.rm = TRUE),
    median = median(age, na.rm = TRUE),
    q3     = quantile(age, 0.75, na.rm = TRUE),
    max    = max(age, na.rm = TRUE)
  )

age_summary
## # A tibble: 1 × 8
##       n  mean    sd   min    q1 median    q3   max
##   <int> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1   649  14.2  1.58     2    13     14    15    16
gender_summary <- research_data %>%
  count(gender) %>%
  mutate(percent = round(100 * n / sum(n), 1))

gender_summary
## # A tibble: 4 × 3
##   gender     n percent
##   <chr>  <int>   <dbl>
## 1 F        270    41.6
## 2 M        333    51.3
## 3 U         23     3.5
## 4 <NA>      23     3.5
race_summary <- research_data %>%
  count(race) %>%
  mutate(percent = round(100 * n / sum(n), 1))

race_summary
## # A tibble: 7 × 3
##   race      n percent
##   <chr> <int>   <dbl>
## 1 A         7     1.1
## 2 B        68    10.5
## 3 H       240    37  
## 4 U        26     4  
## 5 W       275    42.4
## 6 X         1     0.2
## 7 <NA>     32     4.9
viol_summary <- research_data %>%
  summarise(
    mean_total   = mean(total_violations, na.rm = TRUE),
    sd_total     = sd(total_violations, na.rm = TRUE),
    min_total    = min(total_violations, na.rm = TRUE),
    max_total    = max(total_violations, na.rm = TRUE),

    mean_cited   = mean(cited_violations, na.rm = TRUE),
    sd_cited     = sd(cited_violations, na.rm = TRUE),
    min_cited    = min(cited_violations, na.rm = TRUE),
    max_cited    = max(cited_violations, na.rm = TRUE),

    repeat_rate  = mean(repeat_offender) * 100
  )

viol_summary
## # A tibble: 1 × 9
##   mean_total sd_total min_total max_total mean_cited sd_cited min_cited
##        <dbl>    <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1       1.24    0.692         1         7       1.14    0.440         1
## # ℹ 2 more variables: max_cited <dbl>, repeat_rate <dbl>
corr_vars <- research_data %>%
  select(age, total_violations, cited_violations, repeat_offender)

cor_matrix <- cor(corr_vars, use = "pairwise.complete.obs")
cor_matrix
##                           age total_violations cited_violations repeat_offender
## age               1.000000000       0.06401434     -0.008361656      0.05107145
## total_violations  0.064014343       1.00000000      0.691738067      0.82899804
## cited_violations -0.008361656       0.69173807      1.000000000      0.77682409
## repeat_offender   0.051071446       0.82899804      0.776824094      1.00000000
cor.test(research_data$age, research_data$total_violations)
## 
##  Pearson's product-moment correlation
## 
## data:  research_data$age and research_data$total_violations
## t = 1.6316, df = 647, p-value = 0.1032
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01301102  0.14028447
## sample estimates:
##        cor 
## 0.06401434
cor.test(research_data$age, research_data$cited_violations)
## 
##  Pearson's product-moment correlation
## 
## data:  research_data$age and research_data$cited_violations
## t = -0.2127, df = 647, p-value = 0.8316
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08526805  0.06864378
## sample estimates:
##          cor 
## -0.008361656
cor.test(research_data$total_violations, research_data$cited_violations)
## 
##  Pearson's product-moment correlation
## 
## data:  research_data$total_violations and research_data$cited_violations
## t = 24.365, df = 647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6493461 0.7298446
## sample estimates:
##       cor 
## 0.6917381
ggplot(research_data, aes(x = age)) +
  geom_histogram(bins = 10) +
  labs(title = "Age Distribution",
       x = "Age",
       y = "Case Count")

ggplot(research_data, aes(x = total_violations)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Total Violations per Person",
       x = "Total Violations",
       y = "Count")

ggplot(research_data, aes(x = gender, y = total_violations)) +
  geom_boxplot() +
  labs(title = "Total Violations by Gender",
       x = "Gender",
       y = "Total Violations")

ggplot(research_data, aes(x = race, y = total_violations)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Total Violations by Race",
       x = "Race",
       y = "Total Violations")

ggplot(research_data, aes(x = age, y = total_violations)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Age vs Total Violations",
       x = "Age",
       y = "Total Violations")
## `geom_smooth()` using formula = 'y ~ x'

model_data <- research_data %>%
  mutate(
    gender = factor(gender),
    race   = factor(race),
    repeat_offender = factor(repeat_offender)
  )

logit_model <- glm(
  repeat_offender ~ age + gender + race,
  data = model_data,
  family = binomial
)

summary(logit_model)
## 
## Call:
## glm(formula = repeat_offender ~ age + gender + race, family = binomial, 
##     data = model_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -17.31422  543.87728  -0.032  0.97460   
## age            0.08400    0.07745   1.085  0.27809   
## genderM        0.73197    0.24938   2.935  0.00333 **
## genderU        1.02225    1.15438   0.886  0.37586   
## raceB         14.44249  543.87619   0.027  0.97881   
## raceH         13.95809  543.87614   0.026  0.97953   
## raceU         13.53256  543.87728   0.025  0.98015   
## raceW         13.78665  543.87614   0.025  0.97978   
## raceX          0.57217 1553.69986   0.000  0.99971   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 523.18  on 616  degrees of freedom
## Residual deviance: 506.53  on 608  degrees of freedom
##   (32 observations deleted due to missingness)
## AIC: 524.53
## 
## Number of Fisher Scoring iterations: 14
lm_model <- lm(
  total_violations ~ age + gender + race,
  data = model_data
)

summary(lm_model)
## 
## Call:
## lm(formula = total_violations ~ age + gender + race, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5481 -0.3429 -0.1656 -0.0701  5.6100 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.48474    0.36579   1.325    0.186    
## age          0.02389    0.01778   1.344    0.179    
## genderM      0.24828    0.05766   4.306 1.94e-05 ***
## genderU      0.17977    0.28554   0.630    0.529    
## raceB        0.43278    0.27644   1.566    0.118    
## raceH        0.29859    0.26680   1.119    0.264    
## raceU        0.16994    0.37460   0.454    0.650    
## raceW        0.25151    0.26639   0.944    0.345    
## raceX        0.18075    0.74467   0.243    0.808    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6955 on 608 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.04059,    Adjusted R-squared:  0.02797 
## F-statistic: 3.216 on 8 and 608 DF,  p-value: 0.001372
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
corr_vars <- research_data %>%
  select(age, total_violations, cited_violations, repeat_offender)

cor_matrix <- cor(corr_vars, use = "pairwise.complete.obs")

# Convert to long format for ggplot
cor_melt <- melt(cor_matrix)

ggplot(cor_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), color = "white") +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0
  ) +
  labs(
    title = "Correlation Heatmap",
    x = "Variable",
    y = "Variable",
    fill = "Correlation"
  ) +
  theme_minimal()

heat_data <- research_data %>%
  count(race, repeat_offender)

ggplot(heat_data, aes(x = race, y = repeat_offender, fill = n)) +
  geom_tile(color = "white") +
  geom_text(aes(label = n), color = "white") +
  scale_fill_gradient(low = "lightblue", high = "red") +
  labs(
    title = "Heatmap of Repeat Offenders by Race",
    x = "Race",
    y = "Repeat Offender",
    fill = "Count"
  ) +
  theme_minimal()

shapiro.test(research_data$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  research_data$age
## W = 0.87602, p-value < 2.2e-16
shapiro.test(research_data$total_violations)
## 
##  Shapiro-Wilk normality test
## 
## data:  research_data$total_violations
## W = 0.39798, p-value < 2.2e-16
shapiro.test(research_data$cited_violations)
## 
##  Shapiro-Wilk normality test
## 
## data:  research_data$cited_violations
## W = 0.36295, p-value < 2.2e-16
# QQ plots for age, total_violations, cited_violations
par(mfrow = c(1, 3))

qqnorm(research_data$age, main = "QQ Plot: Age")
qqline(research_data$age)

qqnorm(research_data$total_violations, main = "QQ Plot: Total Violations")
qqline(research_data$total_violations)

qqnorm(research_data$cited_violations, main = "QQ Plot: Cited Violations")
qqline(research_data$cited_violations)

par(mfrow = c(1, 1))
gender_tf <- research_data %>%
  filter(gender %in% c("M", "F"))

table(gender_tf$gender)
## 
##   F   M 
## 270 333
t_test_gender <- t.test(total_violations ~ gender, data = gender_tf)
t_test_gender
## 
##  Welch Two Sample t-test
## 
## data:  total_violations by gender
## t = -4.4234, df = 454.82, p-value = 1.217e-05
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -0.3406104 -0.1310612
## sample estimates:
## mean in group F mean in group M 
##        1.118519        1.354354
wilcox_gender <- wilcox.test(total_violations ~ gender, data = gender_tf)
wilcox_gender
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  total_violations by gender
## W = 41020, p-value = 0.002949
## alternative hypothesis: true location shift is not equal to 0
# One-way ANOVA: total violations by race
anova_race <- aov(total_violations ~ race, data = research_data)
summary(anova_race)
##              Df Sum Sq Mean Sq F value Pr(>F)
## race          5   2.53  0.5059   1.017  0.407
## Residuals   611 304.02  0.4976               
## 32 observations deleted due to missingness
kruskal_race <- kruskal.test(total_violations ~ race, data = research_data)
kruskal_race
## 
##  Kruskal-Wallis rank sum test
## 
## data:  total_violations by race
## Kruskal-Wallis chi-squared = 5.2211, df = 5, p-value = 0.3895
# Select numeric variables
corr_vars <- research_data %>%
  select(age, total_violations, cited_violations, repeat_offender)

cor_matrix <- cor(corr_vars, use = "pairwise.complete.obs")
cor_matrix
##                           age total_violations cited_violations repeat_offender
## age               1.000000000       0.06401434     -0.008361656      0.05107145
## total_violations  0.064014343       1.00000000      0.691738067      0.82899804
## cited_violations -0.008361656       0.69173807      1.000000000      0.77682409
## repeat_offender   0.051071446       0.82899804      0.776824094      1.00000000
# Correlation significance tests
cor.test(research_data$age, research_data$total_violations)
## 
##  Pearson's product-moment correlation
## 
## data:  research_data$age and research_data$total_violations
## t = 1.6316, df = 647, p-value = 0.1032
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01301102  0.14028447
## sample estimates:
##        cor 
## 0.06401434
cor.test(research_data$age, research_data$cited_violations)
## 
##  Pearson's product-moment correlation
## 
## data:  research_data$age and research_data$cited_violations
## t = -0.2127, df = 647, p-value = 0.8316
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08526805  0.06864378
## sample estimates:
##          cor 
## -0.008361656
cor.test(research_data$total_violations, research_data$cited_violations)
## 
##  Pearson's product-moment correlation
## 
## data:  research_data$total_violations and research_data$cited_violations
## t = 24.365, df = 647, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6493461 0.7298446
## sample estimates:
##       cor 
## 0.6917381
model_data <- research_data %>%
  mutate(
    gender = factor(gender),
    race   = factor(race),
    repeat_offender = factor(repeat_offender)
  ) %>%
  filter(!is.na(repeat_offender))

logit_model <- glm(
  repeat_offender ~ age + gender + race,
  data   = model_data,
  family = binomial
)

summary(logit_model)
## 
## Call:
## glm(formula = repeat_offender ~ age + gender + race, family = binomial, 
##     data = model_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -17.31422  543.87728  -0.032  0.97460   
## age            0.08400    0.07745   1.085  0.27809   
## genderM        0.73197    0.24938   2.935  0.00333 **
## genderU        1.02225    1.15438   0.886  0.37586   
## raceB         14.44249  543.87619   0.027  0.97881   
## raceH         13.95809  543.87614   0.026  0.97953   
## raceU         13.53256  543.87728   0.025  0.98015   
## raceW         13.78665  543.87614   0.025  0.97978   
## raceX          0.57217 1553.69986   0.000  0.99971   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 523.18  on 616  degrees of freedom
## Residual deviance: 506.53  on 608  degrees of freedom
##   (32 observations deleted due to missingness)
## AIC: 524.53
## 
## Number of Fisher Scoring iterations: 14
lm_model <- lm(
  total_violations ~ age + gender + race,
  data = model_data
)

summary(lm_model)
## 
## Call:
## lm(formula = total_violations ~ age + gender + race, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5481 -0.3429 -0.1656 -0.0701  5.6100 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.48474    0.36579   1.325    0.186    
## age          0.02389    0.01778   1.344    0.179    
## genderM      0.24828    0.05766   4.306 1.94e-05 ***
## genderU      0.17977    0.28554   0.630    0.529    
## raceB        0.43278    0.27644   1.566    0.118    
## raceH        0.29859    0.26680   1.119    0.264    
## raceU        0.16994    0.37460   0.454    0.650    
## raceW        0.25151    0.26639   0.944    0.345    
## raceX        0.18075    0.74467   0.243    0.808    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6955 on 608 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.04059,    Adjusted R-squared:  0.02797 
## F-statistic: 3.216 on 8 and 608 DF,  p-value: 0.001372
# Load diversion outcomes
info_all <- readxl::read_excel("juvenile_data_information.xlsx", sheet = "Outcomes")

info_all
## # A tibble: 12 × 3
##    `Youth Diversion (Teen Court)` `FY 2023`           `FY 2024`          
##    <chr>                          <chr>               <chr>              
##  1 <NA>                           10/1/2022-9/30/2023 10/1/2023-9/30/2024
##  2 # Cases Referred               20                  63                 
##  3 # Dismissals                   16                  55                 
##  4 % Cases ending in dismissal    0.8                 0.87301587301587302
##  5 <NA>                           <NA>                <NA>               
##  6 <NA>                           <NA>                <NA>               
##  7 Juvenile Case Data             FY 2023             FY 2024            
##  8 <NA>                           10/1/2022-9/30/2023 10/1/2023-9/30/2024
##  9 Juvenile Filings               1311                1345               
## 10 Closed Cases                   1083                1056               
## 11 Number of Convictions          281                 194                
## 12 % Cases ending in conviction   0.25946445060018469 0.18371212121212119
diversion <- info_all[1:4, ]   # first 4 rows contain diversion data
names(diversion) <- c("Metric", "FY2023", "FY2024")

diversion
## # A tibble: 4 × 3
##   Metric                      FY2023              FY2024             
##   <chr>                       <chr>               <chr>              
## 1 <NA>                        10/1/2022-9/30/2023 10/1/2023-9/30/2024
## 2 # Cases Referred            20                  63                 
## 3 # Dismissals                16                  55                 
## 4 % Cases ending in dismissal 0.8                 0.87301587301587302
library(tidyr)
library(ggplot2)

diversion_long <- diversion %>%
  pivot_longer(cols = c(FY2023, FY2024), 
               names_to = "FiscalYear", 
               values_to = "Value")

ggplot(diversion_long[-1,], aes(x = Metric, y = as.numeric(Value), fill = FiscalYear)) +
  geom_col(position = "dodge") +
  labs(
    title = "Diversion Program Outcomes by Fiscal Year",
    x = "Metric",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

library(readxl)
library(dplyr)
library(tidyr)

section_totals <- read_excel("juvenile_section_totals.xlsx", sheet = "Sheet1")

diversion_cats <- c(
  "Peer Court",
  "Teen Court",
  "Mediations",
  "Mediation Participation",
  "Youth Enrichment Class",
  "Youth Enrichment Class (YEC)",
  "School Based Agreements",
  "UTSA Classes",
  "Parenting Classes"
)

diversion_totals <- section_totals %>%
  filter(Category %in% diversion_cats)

diversion_totals
## # A tibble: 23 × 4
##    FiscalYear Category                Metric                               Total
##    <chr>      <chr>                   <chr>                                <dbl>
##  1 FY2023     Peer Court              Referred                             33   
##  2 FY2023     Peer Court              Completed                             9   
##  3 FY2023     Mediations              Letters Sent                       2615   
##  4 FY2023     Mediations              Showed                             1201   
##  5 FY2023     Youth Enrichment Class  Number of Youth Served              332   
##  6 FY2023     Mediation Participation % of  Youth/Family attending medi…    0.46
##  7 FY2024     Mediations              Letters Sent                       5159   
##  8 FY2024     Mediations              Appeared/Resolved                  2265   
##  9 FY2024     Mediations              Percentage Appeared/Resolved          3.03
## 10 FY2024     School Based Agreements Letters Sent                       2099   
## # ℹ 13 more rows
diversion_wide <- diversion_totals %>%
  pivot_wider(
    id_cols = c(FiscalYear, Category),
    names_from = Metric,
    values_from = Total
  )

diversion_wide
## # A tibble: 10 × 21
##    FiscalYear Category                  Referred Completed `Letters Sent` Showed
##    <chr>      <chr>                        <dbl>     <dbl>          <dbl>  <dbl>
##  1 FY2023     Peer Court                      33         9             NA     NA
##  2 FY2023     Mediations                      NA        NA           2615   1201
##  3 FY2023     Youth Enrichment Class          NA        NA             NA     NA
##  4 FY2023     Mediation Participation         NA        NA             NA     NA
##  5 FY2024     Mediations                      NA        NA           5159     NA
##  6 FY2024     School Based Agreements         NA        NA           2099     NA
##  7 FY2024     Youth Enrichment Class (…       NA        NA             NA     NA
##  8 FY2024     Teen Court                      NA        NA             NA     NA
##  9 FY2024     UTSA Classes                    NA        NA             NA     NA
## 10 FY2024     Parenting Classes               NA        NA             NA     NA
## # ℹ 15 more variables: `Number of Youth Served` <dbl>,
## #   `% of  Youth/Family attending mediation` <dbl>, `Appeared/Resolved` <dbl>,
## #   `Percentage Appeared/Resolved` <dbl>, Attended <dbl>,
## #   `Percentage Appeared` <dbl>, Assigned <dbl>, `Attended/Completed` <dbl>,
## #   `Percentage Attended/Completed` <dbl>, `Cases Referred` <dbl>,
## #   `Cases Processed` <dbl>, `Cases Closed` <dbl>, `Appeared/Completed` <dbl>,
## #   `Percentage Appeared/Completed` <dbl>, Participants <dbl>
# Load section totals
section_totals <- readxl::read_excel("juvenile_section_totals.xlsx", sheet = "Sheet1")

# Load info file (contains diversion outcomes)
info_outcomes <- readxl::read_excel("juvenile_data_information.xlsx", sheet = "Outcomes")

# Clean outcomes into readable format
diversion_outcomes <- info_outcomes[1:4, ]
colnames(diversion_outcomes) <- c("Metric", "FY2023", "FY2024")

diversion_outcomes
## # A tibble: 4 × 3
##   Metric                      FY2023              FY2024             
##   <chr>                       <chr>               <chr>              
## 1 <NA>                        10/1/2022-9/30/2023 10/1/2023-9/30/2024
## 2 # Cases Referred            20                  63                 
## 3 # Dismissals                16                  55                 
## 4 % Cases ending in dismissal 0.8                 0.87301587301587302
# Identify diversion program categories
diversion_cats <- c(
  "Peer Court",
  "Teen Court",
  "Mediations",
  "Mediation Participation",
  "Youth Enrichment Class",
  "Youth Enrichment Class (YEC)",
  "School Based Agreements",
  "UTSA Classes",
  "Parenting Classes"
)

div_programs <- section_totals %>%
  filter(Category %in% diversion_cats)

# Reshape to wide format for easier year comparisons
div_programs_wide <- div_programs %>%
  tidyr::pivot_wider(
    id_cols = c(FiscalYear, Category),
    names_from = Metric,
    values_from = Total
  )

div_programs_wide
## # A tibble: 10 × 21
##    FiscalYear Category                  Referred Completed `Letters Sent` Showed
##    <chr>      <chr>                        <dbl>     <dbl>          <dbl>  <dbl>
##  1 FY2023     Peer Court                      33         9             NA     NA
##  2 FY2023     Mediations                      NA        NA           2615   1201
##  3 FY2023     Youth Enrichment Class          NA        NA             NA     NA
##  4 FY2023     Mediation Participation         NA        NA             NA     NA
##  5 FY2024     Mediations                      NA        NA           5159     NA
##  6 FY2024     School Based Agreements         NA        NA           2099     NA
##  7 FY2024     Youth Enrichment Class (…       NA        NA             NA     NA
##  8 FY2024     Teen Court                      NA        NA             NA     NA
##  9 FY2024     UTSA Classes                    NA        NA             NA     NA
## 10 FY2024     Parenting Classes               NA        NA             NA     NA
## # ℹ 15 more variables: `Number of Youth Served` <dbl>,
## #   `% of  Youth/Family attending mediation` <dbl>, `Appeared/Resolved` <dbl>,
## #   `Percentage Appeared/Resolved` <dbl>, Attended <dbl>,
## #   `Percentage Appeared` <dbl>, Assigned <dbl>, `Attended/Completed` <dbl>,
## #   `Percentage Attended/Completed` <dbl>, `Cases Referred` <dbl>,
## #   `Cases Processed` <dbl>, `Cases Closed` <dbl>, `Appeared/Completed` <dbl>,
## #   `Percentage Appeared/Completed` <dbl>, Participants <dbl>
recid_summary <- research_data %>%
  group_by(FiscalYear) %>%
  summarise(
    avg_violations = mean(total_violations, na.rm = TRUE),
    median_violations = median(total_violations, na.rm = TRUE),
    repeat_rate = mean(repeat_offender, na.rm = TRUE) * 100,
    n_cases = n()
  )

recid_summary
## # A tibble: 2 × 5
##   FiscalYear avg_violations median_violations repeat_rate n_cases
##   <fct>               <dbl>             <dbl>       <dbl>   <int>
## 1 2023                 1.23                 1        13.6     418
## 2 2024                 1.26                 1        17.3     231
# Extract key diversion totals: Teen Court, YEC, Mediations
key_div <- div_programs %>%
  filter(
    Category %in% c("Teen Court", "Youth Enrichment Class (YEC)", "Mediations"),
    Metric %in% c("Cases Referred", "Assigned", "Attended/Completed", "Appeared/Resolved")
  ) %>%
  select(FiscalYear, Category, Metric, Total)

key_div
## # A tibble: 4 × 4
##   FiscalYear Category                     Metric             Total
##   <chr>      <chr>                        <chr>              <dbl>
## 1 FY2024     Mediations                   Appeared/Resolved   2265
## 2 FY2024     Youth Enrichment Class (YEC) Assigned             794
## 3 FY2024     Youth Enrichment Class (YEC) Attended/Completed   378
## 4 FY2024     Teen Court                   Cases Referred        46
library(ggplot2)

# Bar chart: diversion referrals by year (example: Teen Court)
teen_court <- div_programs %>%
  filter(Category == "Teen Court", Metric == "Cases Referred")

ggplot(teen_court, aes(x = FiscalYear, y = Total, fill = FiscalYear)) +
  geom_col() +
  labs(
    title = "Teen Court Referrals by Year",
    x = "Fiscal Year",
    y = "Number of Referrals"
  ) +
  theme_minimal()

# Line chart: repeat offender rate by fiscal year
ggplot(recid_summary, aes(x = FiscalYear, y = repeat_rate, group = 1)) +
  geom_line(size = 1.4, color = "red") +
  geom_point(size = 4, color = "darkred") +
  labs(
    title = "Repeat Offender Rate by Fiscal Year",
    x = "Fiscal Year",
    y = "Repeat Offender Rate (%)"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

combined_summary <- recid_summary %>%
  left_join(
    diversion_outcomes %>% 
      filter(Metric == "% Cases ending in dismissal") %>%
      tidyr::pivot_longer(cols = c("FY2023", "FY2024"), 
                          names_to = "FiscalYear", 
                          values_to = "diversion_dismissal_rate"),
    by = "FiscalYear"
  )

combined_summary
## # A tibble: 2 × 7
##   FiscalYear avg_violations median_violations repeat_rate n_cases Metric
##   <chr>               <dbl>             <dbl>       <dbl>   <int> <chr> 
## 1 2023                 1.23                 1        13.6     418 <NA>  
## 2 2024                 1.26                 1        17.3     231 <NA>  
## # ℹ 1 more variable: diversion_dismissal_rate <chr>
library(dplyr)
library(tidyr)

# Reload section totals (your diversion program dataset)
section_totals <- readxl::read_excel("juvenile_section_totals.xlsx", sheet = "Sheet1")

# Use only the programs you are including
programs_to_use <- c(
  "Parenting Classes",
  "Teen Court",
  "UTSA Classes",
  "Youth Enrichment Class (YEC)"
)

program_data <- section_totals %>%
 filter(Category %in% programs_to_use)

# Use only assignment / completion type metrics
metrics_to_use <- c(
  "Assigned",
  "Attended/Completed",
  "Cases Closed"
)

program_summary <- program_data %>%
  filter(Metric %in% metrics_to_use) %>%
  select(FiscalYear, Category, Metric, Total) %>%
  pivot_wider(
    names_from = Metric,
    values_from = Total
  ) %>%
  mutate(
    completion_rate = (`Attended/Completed` / Assigned) * 100,
    closure_rate = (`Cases Closed` / Assigned) * 100
  )

program_summary
## # A tibble: 4 × 7
##   FiscalYear Category               Assigned `Attended/Completed` `Cases Closed`
##   <chr>      <chr>                     <dbl>                <dbl>          <dbl>
## 1 FY2024     Youth Enrichment Clas…      794                  378             NA
## 2 FY2024     Teen Court                   NA                   NA             30
## 3 FY2024     UTSA Classes                121                   NA             NA
## 4 FY2024     Parenting Classes           405                   NA             NA
## # ℹ 2 more variables: completion_rate <dbl>, closure_rate <dbl>
library(dplyr)
library(tidyr)

section_totals <- readxl::read_excel("juvenile_section_totals.xlsx", sheet = "Sheet1")

# Map program-specific metrics
program_summary <- section_totals %>%
  filter(Category %in% c("Youth Enrichment Class (YEC)",
                         "Parenting Classes",
                         "UTSA Classes",
                         "Teen Court")) %>%
  select(FiscalYear, Category, Metric, Total) %>%
  mutate(
    assigned = case_when(
      Category == "Youth Enrichment Class (YEC)" ~ ifelse(Metric == "Assigned", Total, NA),
      Category == "UTSA Classes" ~ ifelse(Metric == "Assigned", Total, NA),
      Category == "Parenting Classes" ~ ifelse(Metric == "Assigned", Total, NA),
      TRUE ~ NA
    ),
    completed = case_when(
      Category == "Youth Enrichment Class (YEC)" ~ ifelse(Metric == "Attended/Completed", Total, NA),
      Category == "UTSA Classes" ~ ifelse(Metric == "Appeared/Completed", Total, NA),
      Category == "Parenting Classes" ~ ifelse(Metric == "Participants", Total, NA),
      TRUE ~ NA
    ),
    processed = case_when(
      Category == "Teen Court" ~ ifelse(Metric == "Cases Processed", Total, NA),
      TRUE ~ NA
    ),
    closed = case_when(
      Category == "Teen Court" ~ ifelse(Metric == "Cases Closed", Total, NA),
      TRUE ~ NA
    )
  ) %>%
  group_by(FiscalYear, Category) %>%
  summarize(
    Assigned = sum(assigned, na.rm = TRUE),
    Completed = sum(completed, na.rm = TRUE),
    Processed = sum(processed, na.rm = TRUE),
    Closed = sum(closed, na.rm = TRUE),
    completion_rate = case_when(
      Category %in% c("Youth Enrichment Class (YEC)", "UTSA Classes", "Parenting Classes") ~
        (Completed / Assigned) * 100,
      Category == "Teen Court" ~ (Closed / Processed) * 100
    )
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'FiscalYear', 'Category'. You can override
## using the `.groups` argument.
program_summary
## # A tibble: 11 × 7
## # Groups:   FiscalYear, Category [4]
##    FiscalYear Category       Assigned Completed Processed Closed completion_rate
##    <chr>      <chr>             <dbl>     <dbl>     <dbl>  <dbl>           <dbl>
##  1 FY2024     Parenting Cla…      405       143         0      0            35.3
##  2 FY2024     Parenting Cla…      405       143         0      0            35.3
##  3 FY2024     Teen Court            0         0        54     30            55.6
##  4 FY2024     Teen Court            0         0        54     30            55.6
##  5 FY2024     Teen Court            0         0        54     30            55.6
##  6 FY2024     UTSA Classes        121        80         0      0            66.1
##  7 FY2024     UTSA Classes        121        80         0      0            66.1
##  8 FY2024     UTSA Classes        121        80         0      0            66.1
##  9 FY2024     Youth Enrichm…      794       378         0      0            47.6
## 10 FY2024     Youth Enrichm…      794       378         0      0            47.6
## 11 FY2024     Youth Enrichm…      794       378         0      0            47.6
library(ggplot2)

ggplot(program_summary,
       aes(x = Category, y = completion_rate, fill = FiscalYear)) +
  geom_col(position = "dodge") +
  labs(
    title = "Completion / Participation Rates by Diversion Program",
    y = "Completion Rate (%)",
    x = "Diversion Program"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

juveniles <- juvenile %>%
  mutate(
    paid_fine = ifelse(ViolPaid > 0, 1, 0),
    owes_money = ifelse(AmtDue > 0, 1, 0),
    financial_resolution = case_when(
      ViolPaid > 0 ~ "Paid Fine",
      AmtDue > 0 ~ "Still Owes",
      TRUE ~ "No Payment"
    )
  )
finance_summary <- juveniles %>%
  group_by(financial_resolution) %>%
  summarise(
    n = n(),
    repeat_rate = mean(repeat_offender, na.rm = TRUE) * 100,
    avg_total_viol = mean(PersonViolCtAll, na.rm = TRUE)
  )
finance_year <- juveniles %>%
  group_by(FiscalYear) %>%
  summarise(
    pct_paid = mean(paid_fine, na.rm = TRUE) * 100,
    pct_owes = mean(owes_money, na.rm = TRUE) * 100,
    repeat_rate = mean(repeat_offender, na.rm = TRUE) * 100
  )
diversion_year <- program_summary %>%
  group_by(FiscalYear) %>%
  summarise(total_assigned = sum(Assigned, na.rm = TRUE))
# Create financial behavior flags in the CLEANED juvenile dataset
juveniles <- juvenile %>%
  mutate(
    paid_fine = ifelse(ViolPaid > 0, 1, 0),
    owes_money = ifelse(AmtDue > 0, 1, 0),
    financial_resolution = case_when(
      ViolPaid > 0 ~ "Paid Fine",
      AmtDue > 0 ~ "Still Owes",
      TRUE ~ "No Payment"
    )
  )
finance_summary <- juveniles %>%
  group_by(financial_resolution) %>%
  summarise(
    n = n(),
    pct_of_cases = round(100 * n / nrow(juvenile), 1),
    avg_total_viol = round(mean(PersonViolCtAll, na.rm = TRUE), 2),
    repeat_rate = round(mean(repeat_offender, na.rm = TRUE) * 100, 1)
  )

finance_summary
## # A tibble: 3 × 5
##   financial_resolution     n pct_of_cases avg_total_viol repeat_rate
##   <chr>                <int>        <dbl>          <dbl>       <dbl>
## 1 No Payment             405         62.4           1.25        15.8
## 2 Paid Fine               59          9.1           1.24        13.6
## 3 Still Owes             185         28.5           1.22        13.5
diversion_year <- program_summary %>%
  group_by(FiscalYear) %>%
  summarise(
    total_assigned = sum(Assigned, na.rm = TRUE),
    total_completed = sum(Completed, na.rm = TRUE),
    overall_completion_rate = round(100 * total_completed / total_assigned, 1)
  )

diversion_year
## # A tibble: 1 × 4
##   FiscalYear total_assigned total_completed overall_completion_rate
##   <chr>               <dbl>           <dbl>                   <dbl>
## 1 FY2024               3555            1660                    46.7
finance_by_year <- juveniles %>%
  group_by(FiscalYear) %>%
  summarise(
    pct_paid = round(mean(paid_fine, na.rm = TRUE) * 100, 1),
    pct_owes = round(mean(owes_money, na.rm = TRUE) * 100, 1),
    repeat_rate = round(mean(repeat_offender, na.rm = TRUE) * 100, 1)
  )
diversion_year <- program_summary %>%
  group_by(FiscalYear) %>%
  summarise(
    total_assigned = sum(Assigned, na.rm = TRUE),
    total_completed = sum(Completed, na.rm = TRUE),
    overall_completion_rate = round(100 * total_completed / total_assigned, 1)
  )
finance_diversion_year <- finance_by_year %>%
  left_join(diversion_year, by = "FiscalYear")

finance_diversion_year
## # A tibble: 2 × 7
##   FiscalYear pct_paid pct_owes repeat_rate total_assigned total_completed
##   <chr>         <dbl>    <dbl>       <dbl>          <dbl>           <dbl>
## 1 2023            8.6     24.2        13.6             NA              NA
## 2 2024           10       36.4        17.3             NA              NA
## # ℹ 1 more variable: overall_completion_rate <dbl>
library(ggplot2)

ggplot(finance_summary,
       aes(x = financial_resolution, y = repeat_rate)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Repeat Offender Rate by Financial Resolution",
    x = "How the Case Was Resolved Financially",
    y = "Repeat Offender Rate (%)"
  ) +
  theme_minimal()

finance_year_long <- finance_by_year %>%
  tidyr::pivot_longer(
    cols = c(pct_paid, pct_owes),
    names_to = "type",
    values_to = "percent"
  ) %>%
  mutate(
    type = dplyr::recode(
      type,
      pct_paid = "Paid Fine",
      pct_owes = "Still Owes"
    )
  )

ggplot(finance_year_long,
       aes(x = FiscalYear, y = percent, fill = type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Financial Resolution Patterns by Fiscal Year",
    x = "Fiscal Year",
    y = "Percent of Cases",
    fill = "Resolution Type"
  ) +
  theme_minimal()

juvenile <- juvenile %>%
  mutate(
    paid_fine = ifelse(ViolPaid > 0, "Paid Fine", "Did Not Pay")
  )
status_payment_table <- table(juvenile$StatusDescription, juvenile$paid_fine)
status_payment_table
##                                 
##                                  Did Not Pay Paid Fine
##   AUTO RESET-JUVENILE APPEARANCE         118         0
##   DEFERRED DISPOSITION                     0         3
##   DEFERRED DISPOSITION P/P                 9         0
##   DISM PROS/PLEA BARGAIN                  27         0
##   DISMISS WAIVE FEE                      279         5
##   DISMISSED AFTER DEFERRED DISP.           1        40
##   DISMISSED YOUTH DIVERSION                0         1
##   DOCKET CLOSED                           81         9
##   JUDGMENT/CONVICTION                      2         0
##   JUVENILE APPEARANCE                     36         0
##   PAYMENT EXTENSION                        6         0
##   PLEA BARGAIN APPEARANCE                  2         0
##   RESET - MINOR APPEARANCE                 1         0
##   RESET - QOL APPEARANCE                   1         0
##   REVOKED PROB/DSC                        10         1
##   VOIDED DOCKET                            1         0
##   WAIVE JUVENILE JURISDICTION             16         0
chisq.test(status_payment_table)
## Warning in chisq.test(status_payment_table): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  status_payment_table
## X-squared = 468.75, df = 16, p-value < 2.2e-16
library(lsr)
## Warning: package 'lsr' was built under R version 4.4.3
cramersV(status_payment_table)
## Warning in stats::chisq.test(...): Chi-squared approximation may be incorrect
## [1] 0.849862
library(ggplot2)

status_payment_df <- as.data.frame(status_payment_table)
colnames(status_payment_df) <- c("StatusDescription", "Payment", "Count")

ggplot(status_payment_df,
       aes(x = Payment, y = StatusDescription, fill = Count)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Heatmap of Payment Behavior by Status Description",
    x = "Payment Status",
    y = "Case Status",
    fill = "Number of Cases"
  ) +
  theme_minimal()

viol_vs_payment <- juvenile %>%
  group_by(paid_fine) %>%
  summarise(
    n = n(),
    avg_total_viol = mean(PersonViolCtAll, na.rm = TRUE),
    median_total_viol = median(PersonViolCtAll, na.rm = TRUE),
    repeat_rate = mean(repeat_offender, na.rm = TRUE) * 100
  )

viol_vs_payment
## # A tibble: 2 × 5
##   paid_fine       n avg_total_viol median_total_viol repeat_rate
##   <chr>       <int>          <dbl>             <dbl>       <dbl>
## 1 Did Not Pay   590           1.24                 1        15.1
## 2 Paid Fine      59           1.24                 1        13.6
viol_vs_status <- juvenile %>%
  group_by(StatusDescription) %>%
  summarise(
    n = n(),
    avg_total_viol = mean(PersonViolCtAll, na.rm = TRUE),
    median_total_viol = median(PersonViolCtAll, na.rm = TRUE),
    repeat_rate = mean(repeat_offender, na.rm = TRUE) * 100
  ) %>%
  arrange(desc(avg_total_viol))

viol_vs_status
## # A tibble: 17 × 5
##    StatusDescription              n avg_total_viol median_total_viol repeat_rate
##    <chr>                      <int>          <dbl>             <dbl>       <dbl>
##  1 DEFERRED DISPOSITION P/P       9           2                    1       22.2 
##  2 PLEA BARGAIN APPEARANCE        2           2                    2      100   
##  3 RESET - QOL APPEARANCE         1           2                    2      100   
##  4 VOIDED DOCKET                  1           2                    2      100   
##  5 WAIVE JUVENILE JURISDICTI…    16           1.69                 1       31.2 
##  6 DISM PROS/PLEA BARGAIN        27           1.56                 1       40.7 
##  7 DEFERRED DISPOSITION           3           1.33                 1       33.3 
##  8 DOCKET CLOSED                 90           1.24                 1       12.2 
##  9 AUTO RESET-JUVENILE APPEA…   118           1.23                 1       15.3 
## 10 DISMISS WAIVE FEE            284           1.21                 1       14.1 
## 11 PAYMENT EXTENSION              6           1.17                 1       16.7 
## 12 DISMISSED AFTER DEFERRED …    41           1.10                 1        7.32
## 13 JUVENILE APPEARANCE           36           1.03                 1        2.78
## 14 DISMISSED YOUTH DIVERSION      1           1                    1        0   
## 15 JUDGMENT/CONVICTION            2           1                    1        0   
## 16 RESET - MINOR APPEARANCE       1           1                    1        0   
## 17 REVOKED PROB/DSC              11           1                    1        0
library(ggplot2)

ggplot(juvenile,
       aes(x = paid_fine, y = PersonViolCtAll)) +
  geom_boxplot(fill = "skyblue") +
  labs(
    title = "Total Violations by Payment Behavior",
    x = "Payment Behavior",
    y = "Total Violations"
  ) +
  theme_minimal()

juvenile <- juvenile %>%
  mutate(
    paid_fine_numeric = ifelse(paid_fine == "Paid Fine", 1, 0)
  )

cor.test(juvenile$PersonViolCtAll, juvenile$paid_fine_numeric)
## 
##  Pearson's product-moment correlation
## 
## data:  juvenile$PersonViolCtAll and juvenile$paid_fine_numeric
## t = -0.035837, df = 647, p-value = 0.9714
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07836165  0.07556057
## sample estimates:
##          cor 
## -0.001408884
viol_by_year <- juvenile %>%
  group_by(FiscalYear) %>%
  summarise(
    avg_total_viol = mean(PersonViolCtAll, na.rm = TRUE),
    repeat_rate = mean(repeat_offender, na.rm = TRUE) * 100
  )
viol_by_year
## # A tibble: 2 × 3
##   FiscalYear avg_total_viol repeat_rate
##   <fct>               <dbl>       <dbl>
## 1 2023                 1.23        13.6
## 2 2024                 1.26        17.3
year_cross_reference <- viol_by_year %>%
  left_join(diversion_year, by = "FiscalYear")

year_cross_reference
## # A tibble: 2 × 6
##   FiscalYear avg_total_viol repeat_rate total_assigned total_completed
##   <chr>               <dbl>       <dbl>          <dbl>           <dbl>
## 1 2023                 1.23        13.6             NA              NA
## 2 2024                 1.26        17.3             NA              NA
## # ℹ 1 more variable: overall_completion_rate <dbl>
year_cross_reference_clean <- year_cross_reference %>%
  filter(!is.na(total_assigned), !is.na(avg_total_viol))

ggplot(year_cross_reference_clean,
       aes(x = total_assigned, y = avg_total_viol)) +
  geom_point(size = 3, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Diversion Assignments vs Average Violations by Fiscal Year",
    x = "Total Diversion Assignments",
    y = "Average Total Violations"
  ) +
  theme_minimal()

year_cross_reference
## # A tibble: 2 × 6
##   FiscalYear avg_total_viol repeat_rate total_assigned total_completed
##   <chr>               <dbl>       <dbl>          <dbl>           <dbl>
## 1 2023                 1.23        13.6             NA              NA
## 2 2024                 1.26        17.3             NA              NA
## # ℹ 1 more variable: overall_completion_rate <dbl>
year_cross_reference
## # A tibble: 2 × 6
##   FiscalYear avg_total_viol repeat_rate total_assigned total_completed
##   <chr>               <dbl>       <dbl>          <dbl>           <dbl>
## 1 2023                 1.23        13.6             NA              NA
## 2 2024                 1.26        17.3             NA              NA
## # ℹ 1 more variable: overall_completion_rate <dbl>
str(year_cross_reference)
## tibble [2 × 6] (S3: tbl_df/tbl/data.frame)
##  $ FiscalYear             : chr [1:2] "2023" "2024"
##  $ avg_total_viol         : num [1:2] 1.23 1.26
##  $ repeat_rate            : num [1:2] 13.6 17.3
##  $ total_assigned         : num [1:2] NA NA
##  $ total_completed        : num [1:2] NA NA
##  $ overall_completion_rate: num [1:2] NA NA
ggplot(year_cross_reference,
       aes(x = total_assigned, y = avg_total_viol)) +
  geom_point(size = 3, na.rm = TRUE) +
  geom_smooth(method = "lm", se = FALSE, na.rm = TRUE) +
  labs(
    title = "Diversion Assignments vs Average Violations by Fiscal Year",
    x = "Total Diversion Assignments",
    y = "Average Total Violations"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

program_summary
## # A tibble: 11 × 7
## # Groups:   FiscalYear, Category [4]
##    FiscalYear Category       Assigned Completed Processed Closed completion_rate
##    <chr>      <chr>             <dbl>     <dbl>     <dbl>  <dbl>           <dbl>
##  1 FY2024     Parenting Cla…      405       143         0      0            35.3
##  2 FY2024     Parenting Cla…      405       143         0      0            35.3
##  3 FY2024     Teen Court            0         0        54     30            55.6
##  4 FY2024     Teen Court            0         0        54     30            55.6
##  5 FY2024     Teen Court            0         0        54     30            55.6
##  6 FY2024     UTSA Classes        121        80         0      0            66.1
##  7 FY2024     UTSA Classes        121        80         0      0            66.1
##  8 FY2024     UTSA Classes        121        80         0      0            66.1
##  9 FY2024     Youth Enrichm…      794       378         0      0            47.6
## 10 FY2024     Youth Enrichm…      794       378         0      0            47.6
## 11 FY2024     Youth Enrichm…      794       378         0      0            47.6
# In juvenile: turn 2023 -> "FY2023", 2024 -> "FY2024"
juvenile <- juvenile %>%
  mutate(FY = paste0("FY", FiscalYear))

# In program_summary: just copy FiscalYear into FY
program_summary <- program_summary %>%
  mutate(FY = FiscalYear)
# Summarise once per FY & category
program_summary_fy_cat <- program_summary %>%
  group_by(FY, Category) %>%
  summarise(
    Assigned  = max(Assigned,  na.rm = TRUE),
    Completed = max(Completed, na.rm = TRUE),
    Processed = max(Processed, na.rm = TRUE),
    Closed    = max(Closed,    na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    completion_rate = dplyr::case_when(
      Category == "Teen Court" ~ 100 * Closed / Processed,
      TRUE                     ~ 100 * Completed / Assigned
    )
  )

program_summary_fy_cat
## # A tibble: 4 × 7
##   FY     Category            Assigned Completed Processed Closed completion_rate
##   <chr>  <chr>                  <dbl>     <dbl>     <dbl>  <dbl>           <dbl>
## 1 FY2024 Parenting Classes        405       143         0      0            35.3
## 2 FY2024 Teen Court                 0         0        54     30            55.6
## 3 FY2024 UTSA Classes             121        80         0      0            66.1
## 4 FY2024 Youth Enrichment C…      794       378         0      0            47.6
diversion_year <- program_summary_fy_cat %>%
  group_by(FY) %>%
  summarise(
    total_assigned  = sum(Assigned,  na.rm = TRUE),
    total_completed = sum(Completed, na.rm = TRUE),
    overall_completion_rate = 100 * total_completed / total_assigned,
    .groups = "drop"
  )

diversion_year
## # A tibble: 1 × 4
##   FY     total_assigned total_completed overall_completion_rate
##   <chr>           <dbl>           <dbl>                   <dbl>
## 1 FY2024           1320             601                    45.5
viol_by_year <- juvenile %>%
  group_by(FY) %>%
  summarise(
    avg_total_viol = mean(PersonViolCtAll, na.rm = TRUE),
    repeat_rate    = mean(repeat_offender, na.rm = TRUE) * 100,
    .groups = "drop"
  )

viol_by_year
## # A tibble: 2 × 3
##   FY     avg_total_viol repeat_rate
##   <chr>           <dbl>       <dbl>
## 1 FY2023           1.23        13.6
## 2 FY2024           1.26        17.3
year_cross_reference <- viol_by_year %>%
  left_join(diversion_year, by = "FY")

year_cross_reference
## # A tibble: 2 × 6
##   FY     avg_total_viol repeat_rate total_assigned total_completed
##   <chr>           <dbl>       <dbl>          <dbl>           <dbl>
## 1 FY2023           1.23        13.6             NA              NA
## 2 FY2024           1.26        17.3           1320             601
## # ℹ 1 more variable: overall_completion_rate <dbl>
cor.test(juveniles$PersonViolCtAll, juveniles$paid_fine, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  juveniles$PersonViolCtAll and juveniles$paid_fine
## t = -0.035837, df = 647, p-value = 0.9714
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07836165  0.07556057
## sample estimates:
##          cor 
## -0.001408884
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
status_payment_table <- table(juvenile$StatusDescription, juvenile$paid_fine)
status_payment_table
##                                 
##                                  Did Not Pay Paid Fine
##   AUTO RESET-JUVENILE APPEARANCE         118         0
##   DEFERRED DISPOSITION                     0         3
##   DEFERRED DISPOSITION P/P                 9         0
##   DISM PROS/PLEA BARGAIN                  27         0
##   DISMISS WAIVE FEE                      279         5
##   DISMISSED AFTER DEFERRED DISP.           1        40
##   DISMISSED YOUTH DIVERSION                0         1
##   DOCKET CLOSED                           81         9
##   JUDGMENT/CONVICTION                      2         0
##   JUVENILE APPEARANCE                     36         0
##   PAYMENT EXTENSION                        6         0
##   PLEA BARGAIN APPEARANCE                  2         0
##   RESET - MINOR APPEARANCE                 1         0
##   RESET - QOL APPEARANCE                   1         0
##   REVOKED PROB/DSC                        10         1
##   VOIDED DOCKET                            1         0
##   WAIVE JUVENILE JURISDICTION             16         0
DescTools::CramerV(status_payment_table)
## [1] 0.849862
recid_model <- glm(
  repeat_offender ~ paid_fine + PersonViolCtAll + age + race + gender,
  data = juvenile,
  family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(recid_model)
## 
## Call:
## glm(formula = repeat_offender ~ paid_fine + PersonViolCtAll + 
##     age + race + gender, family = binomial, data = juvenile)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)        -7.707e+01  1.680e+05   0.000    1.000
## paid_finePaid Fine  5.054e-02  3.658e+04   0.000    1.000
## PersonViolCtAll     5.074e+01  2.154e+04   0.002    0.998
## age                -1.039e-02  6.921e+03   0.000    1.000
## raceB               3.264e-01  1.387e+05   0.000    1.000
## raceH               3.076e-01  1.360e+05   0.000    1.000
## raceU               4.478e-01  1.505e+05   0.000    1.000
## raceW               3.300e-01  1.358e+05   0.000    1.000
## raceX              -9.448e-02  3.811e+05   0.000    1.000
## genderM            -1.520e-01  2.166e+04   0.000    1.000
## genderU             1.226e-01  7.109e+04   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5.2318e+02  on 616  degrees of freedom
## Residual deviance: 6.9035e-09  on 606  degrees of freedom
##   (32 observations deleted due to missingness)
## AIC: 22
## 
## Number of Fisher Scoring iterations: 25
juveniles$diversion_binary <- ifelse(juveniles$StatusDescription %in% c("Teen Court Completed",
                                                                "Completed Counseling",
                                                                "YEC Completed",
                                                                "Mediation Successful",
                                                                "Dismissal (Completed Program)"),
                                 1, 0)

cases <- juveniles
cases$diversion_binary <- ifelse(cases$StatusDescription %in% c("Teen Court Completed",
                                                                "Completed Counseling",
                                                                "YEC Completed",
                                                                "Mediation Successful",
                                                                "Dismissal (Completed Program)"),
                                 1, 0)
cases$punitive_binary <- ifelse(cases$diversion_binary == 0, 1, 0)
# create a recidivism flag: 1 = more than one violation, 0 = one or fewer
cases <- cases |>
  dplyr::mutate(
    recid_binary = if_else(PersonViolCtAll > 1, 1L, 0L)
  )

# quick check
table(cases$recid_binary)
## 
##   0   1 
## 552  97
tab_rd <- table(cases$recid_binary, cases$diversion_binary)
tab_rd
##    
##       0
##   0 552
##   1  97
prop.table(tab_rd, 2)
##    
##             0
##   0 0.8505393
##   1 0.1494607
tab_rd <- table(cases$recid_binary, cases$diversion_binary)
tab_rd
##    
##       0
##   0 552
##   1  97
prop.table(tab_rd, 2)
##    
##             0
##   0 0.8505393
##   1 0.1494607
unique(cases$StatusDescription)
##  [1] "AUTO RESET-JUVENILE APPEARANCE" "DOCKET CLOSED"                 
##  [3] "DISMISS WAIVE FEE"              "DISM PROS/PLEA BARGAIN"        
##  [5] "REVOKED PROB/DSC"               "DISMISSED AFTER DEFERRED DISP."
##  [7] "JUVENILE APPEARANCE"            "WAIVE JUVENILE JURISDICTION"   
##  [9] "DEFERRED DISPOSITION P/P"       "DEFERRED DISPOSITION"          
## [11] "PLEA BARGAIN APPEARANCE"        "RESET - MINOR APPEARANCE"      
## [13] "PAYMENT EXTENSION"              "RESET - QOL APPEARANCE"        
## [15] "DISMISSED YOUTH DIVERSION"      "JUDGMENT/CONVICTION"           
## [17] "VOIDED DOCKET"
# create diversion flag based on StatusDescription text
table(cases$StatusDescription)  # (optional: just to inspect)
## 
## AUTO RESET-JUVENILE APPEARANCE           DEFERRED DISPOSITION 
##                            118                              3 
##       DEFERRED DISPOSITION P/P         DISM PROS/PLEA BARGAIN 
##                              9                             27 
##              DISMISS WAIVE FEE DISMISSED AFTER DEFERRED DISP. 
##                            284                             41 
##      DISMISSED YOUTH DIVERSION                  DOCKET CLOSED 
##                              1                             90 
##            JUDGMENT/CONVICTION            JUVENILE APPEARANCE 
##                              2                             36 
##              PAYMENT EXTENSION        PLEA BARGAIN APPEARANCE 
##                              6                              2 
##       RESET - MINOR APPEARANCE         RESET - QOL APPEARANCE 
##                              1                              1 
##               REVOKED PROB/DSC                  VOIDED DOCKET 
##                             11                              1 
##    WAIVE JUVENILE JURISDICTION 
##                             16
cases <- cases |>
  dplyr::mutate(
    diversion_binary = if_else(
      grepl("DIVERSION", StatusDescription, ignore.case = TRUE),
      1L, 0L
    )
  )

# check that we have both 0 and 1
table(cases$diversion_binary)
## 
##   0   1 
## 648   1
str(cases)
## tibble [649 × 36] (S3: tbl_df/tbl/data.frame)
##  $ FiscalYear          : Factor w/ 2 levels "2023","2024": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ViolationDate       : Date[1:649], format: "2023-01-04" "2023-01-05" ...
##  $ ViolTime            : POSIXct[1:649], format: "1899-12-31 15:02:00" "1899-12-31 10:31:00" ...
##  $ Filed               : Date[1:649], format: "2023-08-28" "2023-03-09" ...
##  $ StatusDate          : Date[1:649], format: "2025-01-14" "2023-10-19" ...
##  $ ConvictionDate      : Date[1:649], format: NA "2023-10-19" ...
##  $ PleaDate            : Date[1:649], format: NA "2023-04-04" ...
##  $ Plea                : chr [1:649] NA "NC" "NC" "NC" ...
##  $ DefendantZipcode1   : num [1:649] 78229 78228 78228 78228 78224 ...
##  $ OffenseCode         : chr [1:649] "2024A" "2017A" "2017A" "3006A" ...
##  $ OffenseDesc         : chr [1:649] "DISORDERLY CONDUCT-FIGHTING" "FAIL TO INDENTIFY (REFUSED TO REPORT NAME RESIDENCE ADDRESS OR D.O.B.)" "FAIL TO INDENTIFY (REFUSED TO REPORT NAME RESIDENCE ADDRESS OR D.O.B.)" "POSSESSING DRUG PARAPHERNALIA" ...
##  $ ViolPaid            : num [1:649] 0 0 0 0 0 96 0 0 0 0 ...
##  $ ViolFine            : num [1:649] 0 100 0 19 0 0 0 0 0 0 ...
##  $ ViolFees            : num [1:649] 81 81 0 81 81 96 0 81 0 0 ...
##  $ AmtDue              : num [1:649] 81 0 0 0 0 0 0 0 0 0 ...
##  $ Status              : chr [1:649] "JR" "CL" "WD" "CL" ...
##  $ StatusDescription   : chr [1:649] "AUTO RESET-JUVENILE APPEARANCE" "DOCKET CLOSED" "DISMISS WAIVE FEE" "DOCKET CLOSED" ...
##  $ AgeAtTimeOfViolation: num [1:649] 14 14 14 12 15 15 16 15 13 16 ...
##  $ Race                : chr [1:649] "W" "H" "H" "W" ...
##  $ Gender              : chr [1:649] "M" "M" "M" "M" ...
##  $ CaseType            : chr [1:649] "NTRF" "NTRF" "NTRF" "NTRF" ...
##  $ PersonViolCtAll     : num [1:649] 1 1 1 1 5 5 2 1 1 1 ...
##  $ PersonViolCtNTRF    : num [1:649] 1 1 1 1 2 2 1 1 1 1 ...
##  $ age                 : num [1:649] 14 14 14 12 15 15 16 15 13 16 ...
##  $ gender              : chr [1:649] "M" "M" "M" "M" ...
##  $ race                : chr [1:649] "W" "H" "H" "W" ...
##  $ zipcode             : chr [1:649] "78229" "78228" "78228" "78228" ...
##  $ total_violations    : num [1:649] 1 1 1 1 5 5 2 1 1 1 ...
##  $ cited_violations    : num [1:649] 1 1 1 1 2 2 1 1 1 1 ...
##  $ repeat_offender     : num [1:649] 0 0 0 0 1 1 1 0 0 0 ...
##  $ paid_fine           : num [1:649] 0 0 0 0 0 1 0 0 0 0 ...
##  $ owes_money          : num [1:649] 1 0 0 0 0 0 0 0 0 0 ...
##  $ financial_resolution: chr [1:649] "Still Owes" "No Payment" "No Payment" "No Payment" ...
##  $ diversion_binary    : int [1:649] 0 0 0 0 0 0 0 0 0 0 ...
##  $ punitive_binary     : num [1:649] 1 1 1 1 1 1 1 1 1 1 ...
##  $ recid_binary        : int [1:649] 0 0 0 0 1 1 1 0 0 0 ...
table(cases$StatusDescription)
## 
## AUTO RESET-JUVENILE APPEARANCE           DEFERRED DISPOSITION 
##                            118                              3 
##       DEFERRED DISPOSITION P/P         DISM PROS/PLEA BARGAIN 
##                              9                             27 
##              DISMISS WAIVE FEE DISMISSED AFTER DEFERRED DISP. 
##                            284                             41 
##      DISMISSED YOUTH DIVERSION                  DOCKET CLOSED 
##                              1                             90 
##            JUDGMENT/CONVICTION            JUVENILE APPEARANCE 
##                              2                             36 
##              PAYMENT EXTENSION        PLEA BARGAIN APPEARANCE 
##                              6                              2 
##       RESET - MINOR APPEARANCE         RESET - QOL APPEARANCE 
##                              1                              1 
##               REVOKED PROB/DSC                  VOIDED DOCKET 
##                             11                              1 
##    WAIVE JUVENILE JURISDICTION 
##                             16
names(cases)
##  [1] "FiscalYear"           "ViolationDate"        "ViolTime"            
##  [4] "Filed"                "StatusDate"           "ConvictionDate"      
##  [7] "PleaDate"             "Plea"                 "DefendantZipcode1"   
## [10] "OffenseCode"          "OffenseDesc"          "ViolPaid"            
## [13] "ViolFine"             "ViolFees"             "AmtDue"              
## [16] "Status"               "StatusDescription"    "AgeAtTimeOfViolation"
## [19] "Race"                 "Gender"               "CaseType"            
## [22] "PersonViolCtAll"      "PersonViolCtNTRF"     "age"                 
## [25] "gender"               "race"                 "zipcode"             
## [28] "total_violations"     "cited_violations"     "repeat_offender"     
## [31] "paid_fine"            "owes_money"           "financial_resolution"
## [34] "diversion_binary"     "punitive_binary"      "recid_binary"
names(research_data)
## [1] "FiscalYear"       "age"              "race"             "gender"          
## [5] "zipcode"          "total_violations" "cited_violations" "repeat_offender"
## Warning: package 'knitr' was built under R version 4.4.3
Table 1. Descriptive statistics for key numeric variables.
Variable N Mean Median Min Max SD
Age 649 14.16 14 2 16 1.58
Total violations 649 1.24 1 1 7 0.69
Cited violations 649 1.14 1 1 4 0.44
Repeat offender (0/1) 649 0.15 0 0 1 0.36
## 2️⃣ Correlation Matrix Table (with significance stars)

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.4.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:DescTools':
## 
##     %nin%, Label, Mean, Quantile
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(knitr)

vars_for_corr <- research_data %>%
  select(age, total_violations, cited_violations, repeat_offender)

corr_res <- rcorr(as.matrix(vars_for_corr))  # r and p

r_mat <- corr_res$r
p_mat <- corr_res$P

# build matrix with stars
stars <- ifelse(p_mat < 0.001, "***",
         ifelse(p_mat < 0.01,  "**",
         ifelse(p_mat < 0.05,  "*", "")))

r_with_stars <- paste0(round(r_mat, 2), stars)
dim(r_with_stars) <- dim(r_mat)
rownames(r_with_stars) <- rownames(r_mat)
colnames(r_with_stars) <- colnames(r_mat)

kable(r_with_stars,
      caption = "Table 2. Correlation matrix for key variables (Pearson r with significance).")
Table 2. Correlation matrix for key variables (Pearson r with significance).
age total_violations cited_violations repeat_offender
age 1NA 0.06 -0.01 0.05
total_violations 0.06 1NA 0.69*** 0.83***
cited_violations -0.01 0.69*** 1NA 0.78***
repeat_offender 0.05 0.83*** 0.78*** 1NA
## Warning: package 'broom' was built under R version 4.4.3
Table 3. Linear regression predicting total violations from age, gender, and race.
Variable Coefficient Std. Error t value p value
Intercept 0.485 0.366 1.325 0.186
Age 0.024 0.018 1.344 0.179
genderM 0.248 0.058 4.306 0.000
genderU 0.180 0.286 0.630 0.529
raceB 0.433 0.276 1.566 0.118
raceH 0.299 0.267 1.119 0.264
raceU 0.170 0.375 0.454 0.650
raceW 0.252 0.266 0.944 0.345
raceX 0.181 0.745 0.243 0.808
## \nModel fit: R-squared = 0.041, Adjusted R-squared = 0.028, F(8, 608) = 3.22, p = 0.00137.\n
# Descriptive statistics for numeric variables used in analysis
library(dplyr)
library(tidyr)

numeric_vars <- c("age", 
                  "total_violations", 
                  "cited_violations", 
                  "repeat_offender")

descriptives_table <- research_data |>
  summarise(across(
    .cols = all_of(numeric_vars),
    .fns  = list(
      n      = ~sum(!is.na(.)),
      mean   = ~mean(., na.rm = TRUE),
      sd     = ~sd(., na.rm = TRUE),
      min    = ~min(., na.rm = TRUE),
      median = ~median(., na.rm = TRUE),
      max    = ~max(., na.rm = TRUE)
    ),
    .names = "{.col}_{.fn}"
  )) |>
  pivot_longer(
    cols = everything(),
    names_to = c("variable", "stat"),
    names_sep = "_",
    values_to = "value"
  ) |>
  pivot_wider(
    names_from = stat,
    values_from = value
  )
## Warning: Expected 2 pieces. Additional pieces discarded in 18 rows [7, 8, 9, 10, 11, 12,
## 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24].
## Warning: Values from `value` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} |>
##   dplyr::summarise(n = dplyr::n(), .by = c(variable, stat)) |>
##   dplyr::filter(n > 1L)
descriptives_table
## # A tibble: 4 × 9
##   variable n         mean      sd     min    median max    violations offender 
##   <chr>    <list>    <list>    <list> <list> <list> <list> <list>     <list>   
## 1 age      <dbl [1]> <dbl [1]> <dbl>  <dbl>  <dbl>  <dbl>  <NULL>     <NULL>   
## 2 total    <NULL>    <NULL>    <NULL> <NULL> <NULL> <NULL> <dbl [6]>  <NULL>   
## 3 cited    <NULL>    <NULL>    <NULL> <NULL> <NULL> <NULL> <dbl [6]>  <NULL>   
## 4 repeat   <NULL>    <NULL>    <NULL> <NULL> <NULL> <NULL> <NULL>     <dbl [6]>
# install.packages("Hmisc") # run once in Console
library(Hmisc)

corr_vars <- research_data |>
  dplyr::select(age, total_violations, cited_violations, repeat_offender)

corr_results <- Hmisc::rcorr(as.matrix(corr_vars))  # r and p-value matrices

# Helper to create significance stars
star_fun <- function(p) {
  ifelse(p < 0.001, "***",
  ifelse(p < 0.01,  "**",
  ifelse(p < 0.05,  "*",
  ifelse(p < 0.10, ".", ""))))
}

r_mat <- corr_results$r
p_mat <- corr_results$P

star_mat <- matrix(
  star_fun(p_mat),
  nrow = nrow(p_mat),
  dimnames = dimnames(p_mat)
)

# Correlations with significance stars (e.g., 0.83***)
corr_with_stars <- matrix(
  paste0(round(r_mat, 2), star_mat),
  nrow = nrow(r_mat),
  dimnames = dimnames(r_mat)
)

corr_with_stars
##                  age     total_violations cited_violations repeat_offender
## age              "1NA"   "0.06"           "-0.01"          "0.05"         
## total_violations "0.06"  "1NA"            "0.69***"        "0.83***"      
## cited_violations "-0.01" "0.69***"        "1NA"            "0.78***"      
## repeat_offender  "0.05"  "0.83***"        "0.78***"        "1NA"
# install.packages("broom") # run once in Console
library(broom)
library(dplyr)

# Coefficient-level table
lm_coef_table <- broom::tidy(lm_model) |>
  mutate(
    sig = dplyr::case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.10  ~ ".",
      TRUE            ~ ""
    )
  ) |>
  dplyr::select(
    term,          # (Intercept), age, genderM, genderU, raceB, raceH, raceU, raceW, raceX
    estimate,
    std.error,
    statistic,
    p.value,
    sig
  )

lm_coef_table
## # A tibble: 9 × 6
##   term        estimate std.error statistic   p.value sig  
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl> <chr>
## 1 (Intercept)   0.485     0.366      1.33  0.186     ""   
## 2 age           0.0239    0.0178     1.34  0.179     ""   
## 3 genderM       0.248     0.0577     4.31  0.0000194 "***"
## 4 genderU       0.180     0.286      0.630 0.529     ""   
## 5 raceB         0.433     0.276      1.57  0.118     ""   
## 6 raceH         0.299     0.267      1.12  0.264     ""   
## 7 raceU         0.170     0.375      0.454 0.650     ""   
## 8 raceW         0.252     0.266      0.944 0.345     ""   
## 9 raceX         0.181     0.745      0.243 0.808     ""
lm_fit_stats <- broom::glance(lm_model) |>
  transmute(
    adj_r_squared = adj.r.squared,
    F_statistic   = statistic,
    df1           = df,          # numerator df
    df2           = df.residual, # denominator df
    model_p       = p.value
  )

lm_fit_stats
## # A tibble: 1 × 5
##   adj_r_squared F_statistic   df1   df2 model_p
##           <dbl>       <dbl> <dbl> <int>   <dbl>
## 1        0.0280        3.22     8   608 0.00137
summary(logit_model)
## 
## Call:
## glm(formula = repeat_offender ~ age + gender + race, family = binomial, 
##     data = model_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -17.31422  543.87728  -0.032  0.97460   
## age            0.08400    0.07745   1.085  0.27809   
## genderM        0.73197    0.24938   2.935  0.00333 **
## genderU        1.02225    1.15438   0.886  0.37586   
## raceB         14.44249  543.87619   0.027  0.97881   
## raceH         13.95809  543.87614   0.026  0.97953   
## raceU         13.53256  543.87728   0.025  0.98015   
## raceW         13.78665  543.87614   0.025  0.97978   
## raceX          0.57217 1553.69986   0.000  0.99971   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 523.18  on 616  degrees of freedom
## Residual deviance: 506.53  on 608  degrees of freedom
##   (32 observations deleted due to missingness)
## AIC: 524.53
## 
## Number of Fisher Scoring iterations: 14
library(pscl)
## Warning: package 'pscl' was built under R version 4.4.3
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
# Pseudo R-squared values (McFadden, etc.)
pR2(logit_model)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -253.26459246 -261.59079230   16.65239969    0.03182910    0.02662835 
##          r2CU 
##    0.04657703
# Likelihood Ratio Chi-square test
anova(logit_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: repeat_offender
## 
## Terms added sequentially (first to last)
## 
## 
##        Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                     616     523.18           
## age     1   1.6300       615     521.55  0.20171  
## gender  2   8.7861       613     512.77  0.01236 *
## race    5   6.2363       608     506.53  0.28390  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1