This is a project that I hope to present at the NYC OpenData conference next spring. I must admit that there are some datasets that I intended to use for this final assignment, but because of time constraints and roadblocks, I will focus on the two key datasets for now. My goal is to continue this project by finding a meaningful way to include the other datasets. I believe that this project has the potential to become something special.
# Download the DOP juvenile rearrest-rate data (at most 10,000 rows) and
# preview the first six rows.
rea <- nycOpenData::nyc_dop_juvenile_rearrest_rate(limit = 10000)
head(rea, n = 6L)
## # A tibble: 6 × 4
## borough month year rate
## <chr> <chr> <chr> <chr>
## 1 Citywide October 2025 4.7
## 2 Citywide September 2025 4.9
## 3 Citywide August 2025 4.1
## 4 Citywide July 2025 4.7
## 5 Citywide June 2025 4.7
## 6 Citywide May 2025 4.4
# Keep only the 2023-2025 observations.
# `year` arrives as a character column, so it is converted to integer before
# the range check; comparing the raw column against a number would silently
# coerce the number to text and compare strings instead of values.
rea_clean <- rea %>%
  filter(dplyr::between(as.integer(year), 2023L, 2025L))
head(rea_clean)
## # A tibble: 6 × 4
## borough month year rate
## <chr> <chr> <chr> <chr>
## 1 Citywide October 2025 4.7
## 2 Citywide September 2025 4.9
## 3 Citywide August 2025 4.1
## 4 Citywide July 2025 4.7
## 5 Citywide June 2025 4.7
## 6 Citywide May 2025 4.4
# Combine the month name and year into a single Date column (my() parses
# "month year" text; the result falls on the first day of each month), then
# drop the two source columns and inspect the resulting structure.
rea_clean <- rea_clean %>%
  mutate(month_year = my(paste(month, year))) %>%
  select(!c(month, year))
str(rea_clean)
## tibble [34 × 3] (S3: tbl_df/tbl/data.frame)
## $ borough : chr [1:34] "Citywide" "Citywide" "Citywide" "Citywide" ...
## $ rate : chr [1:34] "4.7" "4.9" "4.1" "4.7" ...
## $ month_year: Date[1:34], format: "2025-10-01" "2025-09-01" ...
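The `str()` output above shows the structure of the cleaned data; note that `rate` is still stored as character, so it is converted to numeric next.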
# Convert the rate column from character to numeric.
rea_clean <- rea_clean %>%
  mutate(rate = as.numeric(rate))

# Sum of the monthly rates within each calendar year.
# NOTE(review): the data preview ends at October 2025, so 2025 appears to cover
# fewer months than 2023 and 2024 -- yearly totals may not be directly
# comparable; confirm before interpreting.
sum_year <- rea_clean %>%
  group_by(year = year(month_year)) %>%
  summarise(total_rate = sum(rate, na.rm = TRUE))
sum_year
## # A tibble: 3 × 2
## year total_rate
## <dbl> <dbl>
## 1 2023 38.8
## 2 2024 43.3
## 3 2025 44.3
# Average monthly rearrest rate for each calendar year.
avg_rea_year <- rea_clean %>%
  group_by(year = year(month_year)) %>%
  summarise(mean_rate = mean(rate, na.rm = TRUE))
avg_rea_year
## # A tibble: 3 × 2
## year mean_rate
## <dbl> <dbl>
## 1 2023 3.23
## 2 2024 3.61
## 3 2025 4.43
Let’s take a closer look to see whether rearrest rates are higher in some months than in others.
# Average rearrest rate for each calendar month, pooled across years.
# label = TRUE yields an ordered factor of month abbreviations, so the rows
# come out in calendar order.
month_repeat <- rea_clean %>%
  group_by(month = month(month_year, label = TRUE)) %>%
  summarise(mean_rate = mean(rate, na.rm = TRUE))
month_repeat
## # A tibble: 12 × 2
## month mean_rate
## <ord> <dbl>
## 1 Jan 3.5
## 2 Feb 3.57
## 3 Mar 3.67
## 4 Apr 3.73
## 5 May 3.73
## 6 Jun 3.87
## 7 Jul 4.07
## 8 Aug 3.63
## 9 Sep 3.87
## 10 Oct 3.83
## 11 Nov 3.55
## 12 Dec 3.45
# Pearson correlation between the rate and the date; as.numeric() on a Date
# gives the number of days since 1970-01-01, so this measures a linear time
# trend.
with(rea_clean, cor(as.numeric(rate), as.numeric(month_year)))
## [1] 0.6893935
There is a fairly strong positive correlation (r = 0.69) between the rearrest rate and the date, meaning that rearrest rates have tended to rise over the 2023–2025 period.
# Numeric-only table (rate, calendar year, month number) for a correlation
# matrix; the Date column is dropped once year and month have been extracted.
rea_cor <- rea_clean %>%
  select(rate, month_year) %>%
  mutate(
    year = lubridate::year(month_year),
    month = lubridate::month(month_year),
    month_year = NULL
  )

# Pairwise Pearson correlations between rate, year and month.
cor_matrix <- cor(rea_cor)
cor_matrix
## rate year month
## rate 1.00000000 0.6859785 0.04834241
## year 0.68597846 1.0000000 -0.11679859
## month 0.04834241 -0.1167986 1.00000000
# Draw the correlation matrix as a colored grid, printing each coefficient in
# orange on top of its cell.
corrplot(cor_matrix, method = "color", addCoef.col = "orange")
# Monthly rearrest rate over time: green line with black points, plus a dashed
# red LOESS smooth to show the overall trend.
ggplot(rea_clean, aes(x = month_year, y = rate)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(color = "green", linewidth = 1) +
  geom_point(color = "black", size = 2) +
  geom_smooth(
    method = "loess",
    se = FALSE,
    color = "red",
    linetype = "dashed"
  ) +
  labs(
    title = "Rearrest Rates Over Time",
    x = "Year",
    y = "Rate"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
This figure shows monthly rearrest rates over time (green line with black points), with a dashed red LOESS curve showing the overall trend.
# Same monthly series, with an x-axis tick every three months labelled as
# "Mon YYYY" and rotated so the labels do not overlap.
ggplot(rea_clean, aes(x = month_year, y = rate)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(color = "lightblue", linewidth = 1) +
  geom_point(color = "orange", size = 2) +
  labs(
    title = "Rearrest Rates Over Time",
    subtitle = "Monthly Data",
    x = "Year",
    y = "Rate"
  ) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %Y") +
  theme_light() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
This figure shows the same monthly rearrest rates, with the x-axis labeled every three months.
# Download the DOP juvenile supervision caseload data (at most 10,000 rows)
# and preview the first rows.
juv <- nycOpenData::nyc_dop_Juvenile_cases(limit = 10000)
head(juv)
## # A tibble: 6 × 5
## borough supervision_caseload_type month year supervision_caseload…¹
## <chr> <chr> <chr> <chr> <chr>
## 1 Citywide Enhanced Supervision Program Octo… 2025 282
## 2 Citywide Juvenile Justice Initiative Octo… 2025 135
## 3 Citywide Pathways to Excellence Achievemen… Octo… 2025 0
## 4 Citywide IMPACT Octo… 2025 0
## 5 Citywide Every Child Has An Opportunity To… Octo… 2025 6
## 6 Citywide General Supervision Octo… 2025 648
## # ℹ abbreviated name: ¹supervision_caseload_count
# Keep only the 2023-2025 observations.
# `year` arrives as a character column, so it is converted to integer before
# the range check; comparing the raw column against a number would silently
# coerce the number to text and compare strings instead of values.
juv_clean <- juv %>%
  filter(dplyr::between(as.integer(year), 2023L, 2025L))
head(juv_clean)
## # A tibble: 6 × 5
## borough supervision_caseload_type month year supervision_caseload…¹
## <chr> <chr> <chr> <chr> <chr>
## 1 Citywide Enhanced Supervision Program Octo… 2025 282
## 2 Citywide Juvenile Justice Initiative Octo… 2025 135
## 3 Citywide Pathways to Excellence Achievemen… Octo… 2025 0
## 4 Citywide IMPACT Octo… 2025 0
## 5 Citywide Every Child Has An Opportunity To… Octo… 2025 6
## 6 Citywide General Supervision Octo… 2025 648
## # ℹ abbreviated name: ¹supervision_caseload_count
# Combine the month name and year into a single Date column (my() parses
# "month year" text; the result falls on the first day of each month).
juv_clean <- juv_clean %>%
  mutate(month_year = my(paste(month, year)))
head(juv_clean)
## # A tibble: 6 × 6
## borough supervision_caseload_…¹ month year supervision_caseload…² month_year
## <chr> <chr> <chr> <chr> <chr> <date>
## 1 Citywide Enhanced Supervision P… Octo… 2025 282 2025-10-01
## 2 Citywide Juvenile Justice Initi… Octo… 2025 135 2025-10-01
## 3 Citywide Pathways to Excellence… Octo… 2025 0 2025-10-01
## 4 Citywide IMPACT Octo… 2025 0 2025-10-01
## 5 Citywide Every Child Has An Opp… Octo… 2025 6 2025-10-01
## 6 Citywide General Supervision Octo… 2025 648 2025-10-01
## # ℹ abbreviated names: ¹supervision_caseload_type, ²supervision_caseload_count
# The separate month and year columns are redundant now that month_year
# exists, so remove them.
juv_clean <- juv_clean %>%
  select(!c(month, year))
Let’s see if we can find a correlation between supervision caseload counts and date, but before that, let’s make sure that the columns are in the right format.
# Inspect column types before analysis.
str(juv_clean)
## tibble [197 × 4] (S3: tbl_df/tbl/data.frame)
## $ borough : chr [1:197] "Citywide" "Citywide" "Citywide" "Citywide" ...
## $ supervision_caseload_type : chr [1:197] "Enhanced Supervision Program" "Juvenile Justice Initiative" "Pathways to Excellence Achievement and Knowledge" "IMPACT" ...
## $ supervision_caseload_count: chr [1:197] "282" "135" "0" "0" ...
## $ month_year : Date[1:197], format: "2025-10-01" "2025-10-01" ...
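The `str()` output above shows the structure of the cleaned data; note that `supervision_caseload_count` is still stored as character, so it is converted to numeric next.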
# Convert the caseload count from character to numeric.
juv_clean <- juv_clean %>%
  mutate(
    supervision_caseload_count = as.numeric(supervision_caseload_count)
  )

# Average caseload count for each program type across all months.
juv_clean %>%
  group_by(supervision_caseload_type) %>%
  summarise(
    mean_count = mean(supervision_caseload_count, na.rm = TRUE)
  )
## # A tibble: 7 × 2
## supervision_caseload_type mean_count
## <chr> <dbl>
## 1 Advocate Intervene Mentor 17.3
## 2 Enhanced Supervision Program 204.
## 3 Every Child Has An Opportunity To Excel And Succeed 8.27
## 4 General Supervision 531.
## 5 IMPACT 0
## 6 Juvenile Justice Initiative 91.8
## 7 Pathways to Excellence Achievement and Knowledge 0
# Pearson correlation between caseload count and date (days since
# 1970-01-01), computed over every row, i.e. pooled across all program types.
with(juv_clean, cor(supervision_caseload_count, as.numeric(month_year)))
## [1] 0.05815845
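There is a very weak correlation (r = 0.06) between supervision caseload count and date. Note that this correlation pools all program types together, and their average caseloads differ widely (from 0 to about 531), so it says little about the trend within any single program.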
# Horizontal bar chart of the average caseload per program, with bars ordered
# by their mean. The fill only distinguishes the bars, so the legend is hidden.
juv_clean %>%
  group_by(supervision_caseload_type) %>%
  summarise(
    mean_count = mean(supervision_caseload_count, na.rm = TRUE)
  ) %>%
  ggplot() +
  geom_col(
    aes(
      x = reorder(supervision_caseload_type, mean_count),
      y = mean_count,
      fill = supervision_caseload_type
    ),
    show.legend = FALSE
  ) +
  coord_flip() +
  labs(
    title = "Average Supervision Caseload Count by Program",
    x = "Program Type",
    y = "Average Caseload Count"
  ) +
  theme_minimal()
This figure shows the average supervision caseload count per program type.
Let’s try to bring the year into the graph.
# Total caseload count for every program type within each calendar year.
# NOTE(review): the data preview ends at October 2025, so 2025 appears to cover
# fewer months than 2023 and 2024 -- yearly totals may not be directly
# comparable; confirm before interpreting.
juv_year <- juv_clean %>%
  group_by(supervision_caseload_type, year = year(month_year)) %>%
  summarise(
    total_count = sum(supervision_caseload_count, na.rm = TRUE),
    .groups = "drop"
  )
# Heatmap of program type (rows) by year (columns); tile color encodes the
# yearly total, and white borders separate the tiles.
ggplot(juv_year) +
  geom_tile(
    aes(
      x = factor(year),
      y = supervision_caseload_type,
      fill = total_count
    ),
    color = "white"
  ) +
  scale_fill_viridis_c(option = "C") +
  labs(
    title = "Caseload Counts by Program Per Year",
    x = "Year",
    y = "Program Type",
    fill = "Total Count"
  ) +
  theme_minimal()
This heatmap shows the total supervision caseload count for each program type per year.