Load Libraries

## Load Libraries
library(readxl)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ranger)
## Warning: package 'ranger' was built under R version 4.2.3
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(ggplot2)

Load Data

data <- read_excel("C:/Users/15124/Downloads/prediction_data.xlsx")
## Warning: Expecting numeric in BL20694 / R20694C64: got 'NULL'
## Warning: Expecting logical in BS31248 / R31248C71: got 'NULL'
## Warning: Expecting numeric in BN101222 / R101222C66: got 'NULL'
## Warning: Expecting numeric in BP101539 / R101539C68: got 'NULL'
## Warning: Expecting logical in BS104425 / R104425C71: got 'NULL'
## Warning: Expecting logical in BR104824 / R104824C70: got 'NULL'
## Warning: Expecting numeric in BQ104956 / R104956C69: got 'NULL'
## Warning: Expecting numeric in BM106771 / R106771C65: got 'NULL'
## Warning: Expecting numeric in BQ107127 / R107127C69: got 'NULL'
## Warning: Expecting numeric in BP115280 / R115280C68: got 'NULL'
## Warning: Expecting logical in BS119900 / R119900C71: got 'NULL'
## Warning: Expecting logical in BS120685 / R120685C71: got 'NULL'
## Warning: Expecting numeric in BM122156 / R122156C65: got 'NULL'
## Warning: Expecting logical in BO123166 / R123166C67: got 'NULL'
## Warning: Expecting numeric in BN123211 / R123211C66: got 'NULL'
## Warning: Expecting numeric in BK123224 / R123224C63: got 'NULL'
## Warning: Expecting numeric in BM124484 / R124484C65: got 'NULL'
## Warning: Expecting numeric in BK131022 / R131022C63: got 'NULL'
## Warning: Expecting numeric in BL135418 / R135418C64: got 'NULL'
## Warning: Expecting numeric in BN135542 / R135542C66: got 'NULL'
## Warning: Expecting logical in BS137061 / R137061C71: got 'NULL'
## Warning: Expecting logical in BO137245 / R137245C67: got 'NULL'
## Warning: Expecting numeric in BQ137436 / R137436C69: got 'NULL'
## Warning: Expecting numeric in BK137589 / R137589C63: got 'NULL'
## Warning: Expecting logical in BO138918 / R138918C67: got 'NULL'
## Warning: Expecting numeric in BL138928 / R138928C64: got 'NULL'
## Warning: Expecting logical in BO140365 / R140365C67: got 'NULL'
## Warning: Expecting numeric in BP140371 / R140371C68: got 'NULL'
## Warning: Expecting logical in BS141315 / R141315C71: got 'NULL'
## Warning: Expecting numeric in BL142332 / R142332C64: got 'NULL'
## Warning: Expecting numeric in BK147241 / R147241C63: got 'NULL'
## Warning: Expecting logical in BO147339 / R147339C67: got 'NULL'
## Warning: Expecting numeric in BL147875 / R147875C64: got 'NULL'
## Warning: Expecting numeric in BN150048 / R150048C66: got 'NULL'
colnames(data)
##   [1] "internal_case_id"                 "gender"                          
##   [3] "race"                             "ethnicity"                       
##   [5] "age_at_crime"                     "age_at_arrest"                   
##   [7] "court_name"                       "court_ori"                       
##   [9] "county_name"                      "district"                        
##  [11] "region"                           "court_type"                      
##  [13] "judge_name"                       "offense_date"                    
##  [15] "arrest_date"                      "arrest_type"                     
##  [17] "top_arrest_law"                   "top_arrest_article_section"      
##  [19] "top_arrest_attempt_indicator"     "top_charge_at_arrest"            
##  [21] "top_charge_severity_at_arrest"    "top_charge_weight_at_arrest"     
##  [23] "top_charge_at_arrest_violent_fel" "case_type"                       
##  [25] "first_arraign_date"               "top_arraign_law"                 
##  [27] "top_arraign_article_section"      "top_arraign_attempt_indicator"   
##  [29] "top_charge_at_arraign"            "top_severity_at_arraign"         
##  [31] "top_charge_weight_at_arraign"     "top_charge_at_arraign_violent_fe"
##  [33] "arraignchargecategory"            "representation_type"             
##  [35] "app_count_arraign_to_dispo_relea" "app_count_arraign_to_dispo_detai"
##  [37] "app_count_arraign_to_dispo_total" "def_attended_sched_pretrials"    
##  [39] "remanded_to_jail_at_arraign"      "ror_at_arraign"                  
##  [41] "bail_set_and_posted_at_arraign"   "bail_set_and_not_posted_at_arrai"
##  [43] "nmr_at_arraign"                   "releasedecisionatarraign"        
##  [45] "representation_at_securing_order" "pretrial_supervision_at_arraign" 
##  [47] "contact_pretrial_service_agency"  "electronic_monitoring"           
##  [49] "travel_restrictions"              "passport_surrender"              
##  [51] "no_firearms_or_weapons"           "maintain_employment"             
##  [53] "maintain_housing"                 "maintain_school"                 
##  [55] "placement_in_mandatory_program"   "removal_to_hospital"             
##  [57] "obey_order_of_protection"         "obey_court_conditionsfamily_offe"
##  [59] "other_nmr"                        "order_of_protection"             
##  [61] "first_bail_set_cash"              "first_bail_set_credit"           
##  [63] "first_insurance_company_bail_bon" "first_secured_surety_bond"       
##  [65] "first_secured_app_bond"           "first_unsecured_surety_bond"     
##  [67] "first_unsecured_app_bond"         "first_partially_secured_surety_b"
##  [69] "partially_secured_surety_bond_pe" "first_partially_secured_app_bond"
##  [71] "partially_secured_app_bond_perc"  "bail_made_indicator"             
##  [73] "warrant_ordered_btw_arraign_and_" "dat_wo_ws_prior_to_arraign"      
##  [75] "first_bench_warrant_date"         "non_stayed_wo"                   
##  [77] "num_of_stayed_wo"                 "num_of_row"                      
##  [79] "docket_status"                    "disposition_type"                
##  [81] "disposition_detail"               "dismissal_reason"                
##  [83] "disposition_date"                 "most_severe_sentence"            
##  [85] "top_conviction_law"               "top_conviction_article_section"  
##  [87] "top_conviction_attempt_indicator" "top_charge_at_conviction"        
##  [89] "top_charge_severity_at_convictio" "top_charge_weight_at_conviction" 
##  [91] "top_charge_at_conviction_violent" "days_arraign_remand_first_releas"
##  [93] "known_days_in_custody"            "days_arraign_bail_set_to_first_p"
##  [95] "days_arraign_bail_set_to_first_r" "days_arraign_to_dispo"           
##  [97] "prior_vfo_cnt"                    "prior_nonvfo_cnt"                
##  [99] "prior_misd_cnt"                   "pend_vfo"                        
## [101] "pend_nonvfo"                      "pend_misd"                       
## [103] "supervision"                      "rearrest"                        
## [105] "rearrest_date"                    "rearrest_firearm"                
## [107] "rearrest_date_firearm"            "arr_cycle_id"                    
## [109] "ror"                              "arrest_year"                     
## [111] "arrest_month"                     "offense_year"                    
## [113] "offense_month"                    "first_arraign_year"              
## [115] "first_arraign_month"              "assault"                         
## [117] "burglary"                         "conspiracy"                      
## [119] "crim_contempt"                    "weapon_possesion"                
## [121] "dwi"                              "drug"                            
## [123] "endanger"                         "homicide_related"                
## [125] "larceny"                          "obstruction"                     
## [127] "other"                            "other_vtl"                       
## [129] "property"                         "robbery"                         
## [131] "unlicensed_operation"             "male"                            
## [133] "black"

Data Preparation

My Variables: rearrested: I created this binary variable to capture whether individuals were rearrested or not, with 1 indicating rearrest and 0 indicating no rearrest. This variable will likely serve as the dependent variable (outcome) for a predictive model or statistical analysis, as I am interested in understanding the factors that influence the likelihood of rearrest.

age_categories and age_cat_*: I created age categories to group individuals based on their age at the time of arrest. The use of age categories allows me to account for potential differences in rearrest rates based on age groups. These categorical variables were further converted into binary variables to facilitate regression analysis, where each binary variable indicates membership in a specific age group.

gender_female and gender_male: I created gender binary variables to capture the gender of individuals. Gender may be a relevant factor in understanding rearrest patterns, so I included these variables to control for any differences between male and female individuals.

race_white, race_black, and race_other: I created race binary variables to capture the race of individuals. Research has shown that race can play a role in criminal justice outcomes, so including these variables allows me to examine if there are any disparities in rearrest rates based on race.

top_charge_severity_binary: I created this binary variable to distinguish between individuals whose top charge severity at arrest was a misdemeanor versus a more serious offense. The severity of the charge could affect the likelihood of rearrest, so it is important to include this variable in the analysis.

prior_record: I created this binary variable to capture whether individuals have a prior record of offenses (VFO, non-VFO, or misdemeanor offenses). The presence of a prior criminal record is often considered an important risk factor for recidivism, so I included this variable to account for individuals’ criminal history.

missing_prior_*_cnt: I created these binary variables to indicate missing data for prior offenses. The presence of missing data can introduce bias in the analysis, so it’s important to account for it explicitly by including these variables.

prior_squared: I created this variable to represent the squared sum of prior offenses. The inclusion of a squared term allows me to explore potential non-linear relationships between the number of prior offenses and the likelihood of rearrest.

Overall, my choice of variables is based on the assumption that factors such as age, gender, race, charge severity, and prior criminal history may influence the likelihood of rearrest. By including these variables, I aim to build a comprehensive and informative analysis that accounts for the various factors that may contribute to rearrest patterns.

# Create binary variables for rearrested, age categories, gender, race, charge severity, and prior record
data$rearrested <- ifelse(is.na(data$rearrest) | data$rearrest == "No Arrest", 0, 1)

data$age_categories <- NA
data$age_categories <- ifelse(data$age_at_arrest < 18, "<18",
                              ifelse(data$age_at_arrest >= 18 & data$age_at_arrest <= 24, "18-24",
                                     ifelse(data$age_at_arrest >= 25 & data$age_at_arrest <= 34, "25-34",
                                            ifelse(data$age_at_arrest >= 35 & data$age_at_arrest <= 44, "35-44",
                                                   ifelse(data$age_at_arrest >= 45 & data$age_at_arrest <= 54, "45-54",
                                                          ifelse(data$age_at_arrest >= 55, "55+", NA))))))

data$age_cat_lt18 <- ifelse(data$age_categories == "<18", 1, 0)
data$age_cat_18_24 <- ifelse(data$age_categories == "18-24", 1, 0)
data$age_cat_25_34 <- ifelse(data$age_categories == "25-34", 1, 0)
data$age_cat_35_44 <- ifelse(data$age_categories == "35-44", 1, 0)
data$age_cat_45_54 <- ifelse(data$age_categories == "45-54", 1, 0)
data$age_cat_gt55 <- ifelse(data$age_categories == "55+", 1, 0)

data$gender_female <- ifelse(data$gender == "Female", 1, 0)
data$gender_male <- ifelse(data$gender == "Male", 1, 0)

data$race_white <- ifelse(data$race == "White", 1, 0)
data$race_black <- ifelse(data$race == "Black", 1, 0)
data$race_other <- ifelse(!data$race %in% c("White", "Black"), 1, 0)

data$top_charge_severity_binary <- ifelse(data$top_charge_severity_at_arrest == "Misdemeanor", 1, 0)

data$prior_record <- ifelse(data$prior_vfo_cnt > 0 | data$prior_nonvfo_cnt > 0 | data$prior_misd_cnt > 0, 1, 0)

data$missing_prior_vfo_cnt <- ifelse(is.na(data$prior_vfo_cnt), 1, 0)
data$missing_prior_nonvfo_cnt <- ifelse(is.na(data$prior_nonvfo_cnt), 1, 0)
data$missing_prior_misd_cnt <- ifelse(is.na(data$prior_misd_cnt), 1, 0)

data$prior_vfo_cnt[is.na(data$prior_vfo_cnt)] <- 0
data$prior_nonvfo_cnt[is.na(data$prior_nonvfo_cnt)] <- 0
data$prior_misd_cnt[is.na(data$prior_misd_cnt)] <- 0

data$prior_vfo_cnt <- as.numeric(data$prior_vfo_cnt)
## Warning: NAs introduced by coercion
data$prior_nonvfo_cnt <- as.numeric(data$prior_nonvfo_cnt)
## Warning: NAs introduced by coercion
data$prior_misd_cnt <- as.numeric(data$prior_misd_cnt)
## Warning: NAs introduced by coercion
data$prior_squared <- data$prior_vfo_cnt^2 + data$prior_nonvfo_cnt^2 + data$prior_misd_cnt^2

Model Fitting and Analysis - Rearrest

# Fit the OLS model for rearrest
ols_model <- lm(rearrested ~ age_cat_lt18 + age_cat_18_24 + age_cat_25_34 + age_cat_35_44 + age_cat_45_54 +
                  age_cat_gt55 + gender_female + gender_male + race_white + race_black + race_other +
                  top_charge_severity_binary + prior_record + missing_prior_vfo_cnt + missing_prior_nonvfo_cnt +
                  missing_prior_misd_cnt + prior_vfo_cnt + prior_nonvfo_cnt + prior_misd_cnt + prior_squared,
                data = data)

# Print the coefficients of the model
cat("Coefficients of the OLS model:\n")
## Coefficients of the OLS model:
print(coef(ols_model))
##                (Intercept)               age_cat_lt18 
##               0.0534933513               0.0718443399 
##              age_cat_18_24              age_cat_25_34 
##               0.1618114246               0.1185162614 
##              age_cat_35_44              age_cat_45_54 
##               0.0770269111               0.0432246393 
##               age_cat_gt55              gender_female 
##                         NA              -0.0478060605 
##                gender_male                 race_white 
##              -0.0103269416               0.0102369207 
##                 race_black                 race_other 
##                         NA                         NA 
## top_charge_severity_binary               prior_record 
##               0.0353577240               0.0520606388 
##      missing_prior_vfo_cnt   missing_prior_nonvfo_cnt 
##                         NA                         NA 
##     missing_prior_misd_cnt              prior_vfo_cnt 
##                         NA              -0.0121342246 
##           prior_nonvfo_cnt             prior_misd_cnt 
##              -0.0043250792               0.0269782711 
##              prior_squared 
##              -0.0001372239
# Display the model summary statistics
cat("\nModel summary statistics:\n")
## 
## Model summary statistics:
summary(ols_model)
## 
## Call:
## lm(formula = rearrested ~ age_cat_lt18 + age_cat_18_24 + age_cat_25_34 + 
##     age_cat_35_44 + age_cat_45_54 + age_cat_gt55 + gender_female + 
##     gender_male + race_white + race_black + race_other + top_charge_severity_binary + 
##     prior_record + missing_prior_vfo_cnt + missing_prior_nonvfo_cnt + 
##     missing_prior_misd_cnt + prior_vfo_cnt + prior_nonvfo_cnt + 
##     prior_misd_cnt + prior_squared, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55423 -0.24749 -0.19088 -0.07852  0.99431 
## 
## Coefficients: (6 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.0534934  0.0710603   0.753  0.45158    
## age_cat_lt18                0.0718443  0.0151471   4.743 2.11e-06 ***
## age_cat_18_24               0.1618114  0.0046811  34.567  < 2e-16 ***
## age_cat_25_34               0.1185163  0.0042465  27.909  < 2e-16 ***
## age_cat_35_44               0.0770269  0.0043480  17.715  < 2e-16 ***
## age_cat_45_54               0.0432246  0.0047863   9.031  < 2e-16 ***
## age_cat_gt55                       NA         NA      NA       NA    
## gender_female              -0.0478061  0.0709624  -0.674  0.50051    
## gender_male                -0.0103269  0.0709258  -0.146  0.88424    
## race_white                  0.0102369  0.0021763   4.704 2.56e-06 ***
## race_black                         NA         NA      NA       NA    
## race_other                         NA         NA      NA       NA    
## top_charge_severity_binary  0.0353577  0.0021696  16.297  < 2e-16 ***
## prior_record                0.0520606  0.0036369  14.314  < 2e-16 ***
## missing_prior_vfo_cnt              NA         NA      NA       NA    
## missing_prior_nonvfo_cnt           NA         NA      NA       NA    
## missing_prior_misd_cnt             NA         NA      NA       NA    
## prior_vfo_cnt              -0.0121342  0.0023485  -5.167 2.38e-07 ***
## prior_nonvfo_cnt           -0.0043251  0.0015041  -2.875  0.00403 ** 
## prior_misd_cnt              0.0269783  0.0017054  15.820  < 2e-16 ***
## prior_squared              -0.0001372  0.0001616  -0.849  0.39592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4135 on 154638 degrees of freedom
##   (1805 observations deleted due to missingness)
## Multiple R-squared:  0.04536,    Adjusted R-squared:  0.04527 
## F-statistic: 524.8 on 14 and 154638 DF,  p-value: < 2.2e-16
# Clean data and remove rows with missing values for the prediction of rearrest
data_clean <- na.omit(data[, c("rearrested", "age_cat_lt18", "age_cat_18_24", "age_cat_25_34", "age_cat_35_44", "age_cat_45_54",
                               "age_cat_gt55", "gender_female", "gender_male", "race_white", "race_black", "race_other",
                               "top_charge_severity_binary", "prior_record", "missing_prior_vfo_cnt", "missing_prior_nonvfo_cnt",
                               "missing_prior_misd_cnt", "prior_vfo_cnt", "prior_nonvfo_cnt", "prior_misd_cnt", "prior_squared")])

# Create a matrix of predictor variables using the cleaned data
predictors_matrix <- model.matrix(rearrested ~ . - 1, data = data_clean)

# Perform cross-validation for Lasso, Ridge, and Random Forest models
cv_lasso <- cv.glmnet(predictors_matrix, data_clean$rearrested, alpha = 1, nfolds = 5, family = "binomial")
optimal_lambda <- cv_lasso$lambda.min
lasso_model <- glmnet(predictors_matrix, data_clean$rearrested, alpha = 1, lambda = optimal_lambda, family = "binomial")

cv_ridge <- cv.glmnet(predictors_matrix, data_clean$rearrested, alpha = 0, nfolds = 5, family = "binomial")
optimal_lambda_ridge <- cv_ridge$lambda.min
ridge_model <- glmnet(predictors_matrix, data_clean$rearrested, alpha = 0, lambda = optimal_lambda_ridge, family = "binomial")

tuning_grid <- expand.grid(
  mtry = seq(2, floor(sqrt(ncol(predictors_matrix))), by = 2),
  splitrule = "gini",
  min.node.size = 1
)
cv_method <- trainControl(method = "cv", number = 10)
response_factor <- as.factor(data_clean$rearrested)
rf_model_cv <- train(
  x = predictors_matrix,
  y = response_factor,
  method = "ranger",
  trControl = cv_method,
  tuneGrid = tuning_grid,
  num.trees = 50,
  importance = "permutation"
)
rf_model <- ranger(
  formula = rearrested ~ .,
  data = data_clean,
  num.trees = 50,
  mtry = rf_model_cv$bestTune$mtry,
  importance = "permutation"
)

# Calculate model performance statistics
lasso_performance <- cv_lasso$cvm[cv_lasso$lambda == optimal_lambda]
ridge_performance <- cv_ridge$cvm[cv_ridge$lambda == optimal_lambda_ridge]
conf_matrix <- confusionMatrix(as.factor(ifelse(predict(rf_model, data = data_clean)$predictions > 0.5, 1, 0)), as.factor(data_clean$rearrested))
rf_performance <- conf_matrix$overall["Accuracy"]

# Print model performance statistics
cat("\nCross-validated mean deviance for the optimal Lasso model:", lasso_performance, "\n")
## 
## Cross-validated mean deviance for the optimal Lasso model: 1.042905
cat("Cross-validated mean deviance for the optimal Ridge model:", ridge_performance, "\n")
## Cross-validated mean deviance for the optimal Ridge model: 1.043197
cat("Cross-validated accuracy for the optimal random forest model:", rf_performance, "\n")
## Cross-validated accuracy for the optimal random forest model: 0.7690442

Model Fitting and Analysis - Release on Recognizance (ROR)

# Clean data and remove rows with missing values for the prediction of ROR
data_clean_ror <- na.omit(data[, c("ror", "age_cat_lt18", "age_cat_18_24", "age_cat_25_34", "age_cat_35_44", "age_cat_45_54",
                                   "age_cat_gt55", "gender_female", "gender_male", "race_white", "race_black", "race_other",
                                   "top_charge_severity_binary", "prior_record", "missing_prior_vfo_cnt", "missing_prior_nonvfo_cnt",
                                   "missing_prior_misd_cnt", "prior_vfo_cnt", "prior_nonvfo_cnt", "prior_misd_cnt", "prior_squared")])

# Fit the OLS model for ROR
ols_model_ror <- lm(ror ~ age_cat_lt18 + age_cat_18_24 + age_cat_25_34 + age_cat_35_44 + age_cat_45_54 +
                      age_cat_gt55 + gender_female + gender_male + race_white + race_black + race_other +
                      top_charge_severity_binary + prior_record + missing_prior_vfo_cnt + missing_prior_nonvfo_cnt +
                      missing_prior_misd_cnt + prior_vfo_cnt + prior_nonvfo_cnt + prior_misd_cnt + prior_squared,
                    data = data_clean_ror)

# Print the coefficients of the model
cat("Coefficients of the OLS model for predicting ROR:\n")
## Coefficients of the OLS model for predicting ROR:
print(coef(ols_model_ror))
##                (Intercept)               age_cat_lt18 
##                0.566647722               -0.088836940 
##              age_cat_18_24              age_cat_25_34 
##               -0.129604750               -0.103757280 
##              age_cat_35_44              age_cat_45_54 
##               -0.075643412               -0.036406946 
##               age_cat_gt55              gender_female 
##                         NA                0.081204732 
##                gender_male                 race_white 
##                0.002158698                0.011090404 
##                 race_black                 race_other 
##                         NA                         NA 
## top_charge_severity_binary               prior_record 
##                0.366102353               -0.058479755 
##      missing_prior_vfo_cnt   missing_prior_nonvfo_cnt 
##                         NA                         NA 
##     missing_prior_misd_cnt              prior_vfo_cnt 
##                         NA               -0.058249184 
##           prior_nonvfo_cnt             prior_misd_cnt 
##               -0.032823542               -0.036783383 
##              prior_squared 
##                0.001997881
# Display the model summary statistics
cat("\nModel summary statistics for predicting ROR:\n")
## 
## Model summary statistics for predicting ROR:
summary(ols_model_ror)
## 
## Call:
## lm(formula = ror ~ age_cat_lt18 + age_cat_18_24 + age_cat_25_34 + 
##     age_cat_35_44 + age_cat_45_54 + age_cat_gt55 + gender_female + 
##     gender_male + race_white + race_black + race_other + top_charge_severity_binary + 
##     prior_record + missing_prior_vfo_cnt + missing_prior_nonvfo_cnt + 
##     missing_prior_misd_cnt + prior_vfo_cnt + prior_nonvfo_cnt + 
##     prior_misd_cnt + prior_squared, data = data_clean_ror)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0251 -0.3534  0.1296  0.3033  0.9424 
## 
## Coefficients: (6 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.5666477  0.0737164   7.687 1.52e-14 ***
## age_cat_lt18               -0.0888369  0.0157133  -5.654 1.57e-08 ***
## age_cat_18_24              -0.1296048  0.0048560 -26.689  < 2e-16 ***
## age_cat_25_34              -0.1037573  0.0044053 -23.553  < 2e-16 ***
## age_cat_35_44              -0.0756434  0.0045106 -16.770  < 2e-16 ***
## age_cat_45_54              -0.0364069  0.0049652  -7.332 2.27e-13 ***
## age_cat_gt55                       NA         NA      NA       NA    
## gender_female               0.0812047  0.0736149   1.103    0.270    
## gender_male                 0.0021587  0.0735769   0.029    0.977    
## race_white                  0.0110904  0.0022577   4.912 9.01e-07 ***
## race_black                         NA         NA      NA       NA    
## race_other                         NA         NA      NA       NA    
## top_charge_severity_binary  0.3661024  0.0022507 162.660  < 2e-16 ***
## prior_record               -0.0584798  0.0037729 -15.500  < 2e-16 ***
## missing_prior_vfo_cnt              NA         NA      NA       NA    
## missing_prior_nonvfo_cnt           NA         NA      NA       NA    
## missing_prior_misd_cnt             NA         NA      NA       NA    
## prior_vfo_cnt              -0.0582492  0.0024362 -23.909  < 2e-16 ***
## prior_nonvfo_cnt           -0.0328235  0.0015604 -21.036  < 2e-16 ***
## prior_misd_cnt             -0.0367834  0.0017691 -20.792  < 2e-16 ***
## prior_squared               0.0019979  0.0001677  11.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4289 on 154638 degrees of freedom
## Multiple R-squared:  0.226,  Adjusted R-squared:  0.226 
## F-statistic:  3226 on 14 and 154638 DF,  p-value: < 2.2e-16
# Create a matrix of predictor variables using the "data_clean_ror" dataset
predictors_matrix_ror <- model.matrix(ror ~ . - 1, data = data_clean_ror)

# Perform cross-validation for Lasso regression (alpha = 1)
cv_lasso_ror <- cv.glmnet(predictors_matrix_ror, data_clean_ror$ror, alpha = 1, nfolds = 5, family = "binomial")

# Extract the optimal lambda value from the cross-validation object
optimal_lambda_ror <- cv_lasso_ror$lambda.min

# Fit the Lasso model using the optimal lambda value
lasso_model_ror <- glmnet(predictors_matrix_ror, data_clean_ror$ror, alpha = 1, lambda = optimal_lambda_ror, family = "binomial")

# Perform cross-validation for Ridge regression (alpha = 0)
cv_ridge_ror <- cv.glmnet(predictors_matrix_ror, data_clean_ror$ror, alpha = 0, nfolds = 5, family = "binomial")

# Extract the optimal lambda value from the cross-validation object
optimal_lambda_ridge_ror <- cv_ridge_ror$lambda.min

# Fit the Ridge model using the optimal lambda value
ridge_model_ror <- glmnet(predictors_matrix_ror, data_clean_ror$ror, alpha = 0, lambda = optimal_lambda_ridge_ror, family = "binomial")

# Define the tuning grid for the random forest model
tuning_grid_ror <- expand.grid(
  mtry = seq(2, floor(sqrt(ncol(predictors_matrix_ror))), by = 2),
  splitrule = "gini",
  min.node.size = 1
)

# Perform cross-validation for the random forest model and select the optimal hyperparameters
rf_model_cv_ror <- train(
  x = predictors_matrix_ror,
  y = as.factor(data_clean_ror$ror), # Use the converted response variable
  method = "ranger",
  trControl = cv_method,
  tuneGrid = tuning_grid_ror,
  num.trees = 50, # Number of trees
  importance = "permutation" # Request variable importance scores
)

# Fit the random forest model using the optimal hyperparameters
rf_model_ror <- ranger(
  formula = ror ~ .,
  data = data_clean_ror,
  num.trees = 50, # Fixed number of trees
  mtry = rf_model_cv_ror$bestTune$mtry, # Optimal value of mtry from cross-validation
  importance = "permutation"
)

# Calculate model performance statistics
lasso_performance_ror <- cv_lasso_ror$cvm[cv_lasso_ror$lambda == optimal_lambda_ror]
ridge_performance_ror <- cv_ridge_ror$cvm[cv_ridge_ror$lambda == optimal_lambda_ridge_ror]
conf_matrix_ror <- confusionMatrix(as.factor(ifelse(predict(rf_model_ror, data = data_clean_ror)$predictions > 0.5, 1, 0)), as.factor(data_clean_ror$ror))
rf_performance_ror <- conf_matrix_ror$overall["Accuracy"]

# Print the cross-validated mean deviance for the optimal Lasso model predicting ROR
cat("\nCross-validated mean deviance for the optimal Lasso model predicting ROR:", lasso_performance_ror, "\n")
## 
## Cross-validated mean deviance for the optimal Lasso model predicting ROR: 1.095738
# Print the cross-validated mean deviance for the optimal Ridge model predicting ROR
cat("\nCross-validated mean deviance for the optimal Ridge model predicting ROR:", ridge_performance_ror, "\n")
## 
## Cross-validated mean deviance for the optimal Ridge model predicting ROR: 1.098033
# Print the cross-validated accuracy for the optimal random forest model predicting ROR
cat("\nCross-validated accuracy for the optimal random forest model predicting ROR:", rf_performance_ror, "\n")
## 
## Cross-validated accuracy for the optimal random forest model predicting ROR: 0.7281398

OOSMSE Based on the provided out-of-sample mean squared error (OOSMSE) values for each model and outcome, the best model for predicting the “rearrested” outcome is the Lasso model, which has the lowest OOSMSE of 0.1716668. The best model for predicting the “ror” (release on recognizance) outcome is also the Lasso model, which has the lowest OOSMSE of 0.1834084.

In terms of which outcome can be predicted better, we can compare the OOSMSE values of the best-performing models for each outcome. The “rearrested” outcome has a lower OOSMSE value (0.1716668) compared to the “ror” outcome (0.1834084), indicating that the “rearrested” outcome can be predicted with slightly better accuracy. Lower values of OOSMSE indicate better prediction performance because it means the model’s predictions are closer to the observed values.

# Define a function to calculate OOSMSE for rearrest and ror outcomes
calculate_oosmse <- function(data_clean, outcome) {
  # Convert the response variable to binary numeric (0 or 1)
  data_clean[[outcome]] <- as.numeric(as.character(data_clean[[outcome]]))
  
  # Split the data into training (80%) and testing (20%) sets
  set.seed(123)
  train_index <- createDataPartition(data_clean[[outcome]], p = 0.8, list = FALSE)
  train_data <- data_clean[train_index, ]
  test_data <- data_clean[-train_index, ]
  
  # Create a matrix of predictor variables using the cleaned data
  predictors_matrix_train <- model.matrix(as.formula(paste(outcome, "~ . - 1")), data = train_data)
  predictors_matrix_test <- model.matrix(as.formula(paste(outcome, "~ . - 1")), data = test_data)
  
  # OLS Model
  ols_model_train <- lm(as.formula(paste(outcome, "~ .")), data = train_data)
  ols_predictions <- predict(ols_model_train, test_data)
  ols_mse <- mean((test_data[[outcome]] - ols_predictions)^2)
  
  # Lasso Model
  cv_lasso_train <- cv.glmnet(predictors_matrix_train, train_data[[outcome]], alpha = 1, nfolds = 10, family = "binomial")
  lasso_predictions <- predict(cv_lasso_train, newx = predictors_matrix_test, s = "lambda.min", type = "response")
  lasso_mse <- mean((test_data[[outcome]] - lasso_predictions)^2)
  
  # Ridge Model
  cv_ridge_train <- cv.glmnet(predictors_matrix_train, train_data[[outcome]], alpha = 0, nfolds = 10, family = "binomial")
  ridge_predictions <- predict(cv_ridge_train, newx = predictors_matrix_test, s = "lambda.min", type = "response")
  ridge_mse <- mean((test_data[[outcome]] - ridge_predictions)^2)
  
  # Random Forest Model
  rf_model_train <- ranger(as.formula(paste(outcome, "~ .")), data = train_data, num.trees = 500, importance = "permutation")
  rf_predictions_probs <- predict(rf_model_train, data = test_data)$predictions
  rf_predictions <- ifelse(rf_predictions_probs > 0.5, 1, 0)
  rf_mse <- mean((test_data[[outcome]] - rf_predictions)^2)
  
  # Compare OOSMSE for each model
  cat("OOSMSE for OLS model (", outcome, "): ", ols_mse, "\n", sep = "")
  cat("OOSMSE for Lasso model (", outcome, "): ", lasso_mse, "\n", sep = "")
  cat("OOSMSE for Ridge model (", outcome, "): ", ridge_mse, "\n", sep = "")
  cat("OOSMSE for Random Forest model (", outcome, "): ", rf_mse, "\n", sep = "")
}

# Calculate OOSMSE for rearrest outcome
calculate_oosmse(data_clean, "rearrested")
## Warning in predict.lm(ols_model_train, test_data): prediction from a
## rank-deficient fit may be misleading
## OOSMSE for OLS model (rearrested): 0.1719513
## OOSMSE for Lasso model (rearrested): 0.1716668
## OOSMSE for Ridge model (rearrested): 0.1717604
## OOSMSE for Random Forest model (rearrested): 0.2355642
# Calculate OOSMSE for ror outcome
calculate_oosmse(data_clean_ror, "ror")
## Warning in predict.lm(ols_model_train, test_data): prediction from a
## rank-deficient fit may be misleading
## OOSMSE for OLS model (ror): 0.1835986
## OOSMSE for Lasso model (ror): 0.1834084
## OOSMSE for Ridge model (ror): 0.1839191
## OOSMSE for Random Forest model (ror): 0.2786615

top 2 important features for each model

# Function to get top 2 important features for each model
get_top_features <- function(model, model_type, outcome) {
  if (model_type == "Random Forest") {
    # For Random Forest model, use the importance function from the ranger package
    imp <- importance(model)
    top_features <- head(sort(imp, decreasing = TRUE), n = 2)
    cat("Top 2 important features for", model_type, "model (", outcome, "):", names(top_features), "\n")
  } else if (model_type == "OLS") {
    # For OLS model, use the absolute values of the coefficients (excluding intercept)
    coefs <- coef(model)[-1]
    top_features <- head(sort(abs(coefs), decreasing = TRUE), n = 2)
    cat("Top 2 important features for", model_type, "model (", outcome, "):", names(top_features), "\n")
  } else if (model_type == "Lasso") {
    # For Lasso model, extract non-zero coefficients
    coefs <- as.data.frame(as.matrix(coef(model)))
    colnames(coefs) <- "coefficient"
    coefs$predictor <- rownames(coef(model))
    non_zero_coefs <- coefs[coefs$coefficient != 0, ]
    if (nrow(non_zero_coefs) > 0) {
      cat("Non-zero coefficients for", model_type, "model (", outcome, "):\n")
      print(non_zero_coefs)
      top_features <- head(non_zero_coefs[order(abs(non_zero_coefs$coefficient), decreasing = TRUE), ], n = 2)
      cat("Top 2 important features for", model_type, "model (", outcome, "):", top_features$predictor, "\n")
    } else {
      cat("No non-zero coefficients found for", model_type, "model (", outcome, ").\n")
    }
  }
}

# Get top 2 important features for OLS model (Rearrest outcome)
get_top_features(ols_model, "OLS", "rearrested")
## Top 2 important features for OLS model ( rearrested ): age_cat_18_24 age_cat_25_34
# Get top 2 important features for Lasso model (Rearrest outcome)
get_top_features(lasso_model, "Lasso", "rearrested")
## Non-zero coefficients for Lasso model ( rearrested ):
##                             coefficient                  predictor
## (Intercept)                -1.875238088                (Intercept)
## age_cat_18_24               0.501717528              age_cat_18_24
## age_cat_25_34               0.241873791              age_cat_25_34
## age_cat_45_54              -0.200450840              age_cat_45_54
## age_cat_gt55               -0.470660228               age_cat_gt55
## gender_female              -0.224601999              gender_female
## race_white                  0.041763924                 race_white
## race_black                 -0.006873122                 race_black
## top_charge_severity_binary  0.193343060 top_charge_severity_binary
## prior_record                0.346005266               prior_record
## prior_vfo_cnt              -0.048219126              prior_vfo_cnt
## prior_nonvfo_cnt           -0.006303743           prior_nonvfo_cnt
## prior_misd_cnt              0.125203204             prior_misd_cnt
## Top 2 important features for Lasso model ( rearrested ): (Intercept) age_cat_18_24
# Get top 2 important features for Ridge model (Rearrest outcome)
get_top_features(ridge_model, "Ridge", "rearrested")

# Get top 2 important features for Random Forest model (Rearrest outcome)
get_top_features(rf_model, "Random Forest", "rearrested")
## Top 2 important features for Random Forest model ( rearrested ): prior_misd_cnt prior_squared
# Get top 2 important features for OLS model (ROR outcome)
get_top_features(ols_model_ror, "OLS", "ror")
## Top 2 important features for OLS model ( ror ): top_charge_severity_binary age_cat_18_24
# Get top 2 important features for Lasso model (ROR outcome)
get_top_features(lasso_model_ror, "Lasso", "ror")
## Non-zero coefficients for Lasso model ( ror ):
##                             coefficient                  predictor
## (Intercept)                 0.051469684                (Intercept)
## age_cat_lt18               -0.058440029               age_cat_lt18
## age_cat_18_24              -0.289605934              age_cat_18_24
## age_cat_25_34              -0.149254096              age_cat_25_34
## age_cat_45_54               0.217165802              age_cat_45_54
## age_cat_gt55                0.431522822               age_cat_gt55
## gender_female               0.460839724              gender_female
## race_white                  0.055329401                 race_white
## race_black                 -0.004593999                 race_black
## top_charge_severity_binary  1.716730206 top_charge_severity_binary
## prior_record               -0.374570774               prior_record
## prior_vfo_cnt              -0.270688904              prior_vfo_cnt
## prior_nonvfo_cnt           -0.149698232           prior_nonvfo_cnt
## prior_misd_cnt             -0.156881472             prior_misd_cnt
## prior_squared               0.007256221              prior_squared
## Top 2 important features for Lasso model ( ror ): top_charge_severity_binary gender_female
# Get top 2 important features for Ridge model (ROR outcome)
get_top_features(ridge_model_ror, "Ridge", "ror")

# Get top 2 important features for Random Forest model (ROR outcome)
get_top_features(rf_model_ror, "Random Forest", "ror")
## Top 2 important features for Random Forest model ( ror ): top_charge_severity_binary prior_squared

Graphs

The code provided includes data preparation, feature engineering, and the implementation of multiple predictive models to predict two outcomes: rearrest (rearrested) and release on recognizance (ror). The code creates indicator variables for different features, such as age categories, gender, race, charge severity, and prior record counts. The predictive models used include Ordinary Least Squares (OLS) regression, Lasso regression, Ridge regression, and Random Forest.

Based on observation of the plots, the mean for Black individuals is slightly higher in predicted rearrest, and the mean for White individuals is higher in predicted release on recognizance (ROR). A potential interpretation of this observation could be that, on average, the models predict a higher likelihood of rearrest for Black individuals compared to White individuals. Conversely, the models predict a higher likelihood of release on recognizance for White individuals compared to Black individuals. However, it is important to carefully interpret these results and consider potential confounding factors that may impact the prediction.

The features used as predictors in the models include: Age categories (age_cat_lt18, age_cat_18_24, age_cat_25_34, age_cat_35_44, age_cat_45_54, age_cat_gt55) Gender indicators (gender_female, gender_male) Race indicators (race_white, race_black, race_other) Charge severity indicator (top_charge_severity_binary) - binary variable indicating whether the top charge severity at arrest is a misdemeanor (1) or not (0) Prior record indicators (prior_record, missing_prior_vfo_cnt, missing_prior_nonvfo_cnt, missing_prior_misd_cnt, prior_vfo_cnt, prior_nonvfo_cnt, prior_misd_cnt) - variables indicating whether the individual has a prior record, the count of prior violent felony offenses (VFO), prior_squared - squared term for the number of prior offenses (calculated as the sum of the squares of prior_vfo_cnt, prior_nonvfo_cnt, and prior_misd_cnt)

# Generate predictions for ror using the trained model (ols_model_ror)
data$predicted_ror <- predict(ols_model_ror, newdata = data)
## Warning in predict.lm(ols_model_ror, newdata = data): prediction from a
## rank-deficient fit may be misleading
# Generate predictions for rearrested using the trained model (ols_model)
data$predicted_rearrested <- predict(ols_model, newdata = data)
## Warning in predict.lm(ols_model, newdata = data): prediction from a
## rank-deficient fit may be misleading
# Remove rows with non-finite values in predicted columns (if any)
data <- data[is.finite(data$predicted_ror) & is.finite(data$predicted_rearrested), ]

# Plot the distribution of predicted ror by race
ggplot(data, aes(x = predicted_ror, fill = race)) +
  geom_density(alpha = 0.6, adjust = 5) +
  labs(title = "Distribution of Predicted ROR by Race",
       x = "Predicted ROR",
       y = "Density") +
  theme_minimal()

# Plot the distribution of predicted rearrested by race
ggplot(data, aes(x = predicted_rearrested, fill = race)) +
  geom_density(alpha = 0.6, adjust = 5) +
  labs(title = "Distribution of Predicted Rearrest by Race",
       x = "Predicted Rearrest",
       y = "Density") +
  theme_minimal()

Binsreg - Rearrest

This code uses the binsreg package to create binned scatter plot regressions that analyze the relationship between the predicted probability of rearrest (predicted_rearrested_numeric) and various predictor variables, including age at arrest (age_at_arrest_numeric), counts of prior violent felony offenses (prior_vfo_cnt_numeric), counts of prior non-violent felony offenses (prior_nonvfo_cnt_numeric), and counts of prior misdemeanor offenses (prior_misd_cnt_numeric). The binsreg function is used to generate these regressions, which group data into bins and plot the average outcome within each bin.

The code performs the following steps:

Load the binsreg package. Convert the predictor and outcome variables to numeric vectors. Perform binned scatter plot regression for the rearrest outcome using age_at_arrest_numeric as the independent variable. Perform binned scatter plot regression for the rearrest outcome using prior_vfo_cnt_numeric as the independent variable. Perform binned scatter plot regression for the rearrest outcome using prior_nonvfo_cnt_numeric as the independent variable. Perform binned scatter plot regression for the rearrest outcome using prior_misd_cnt_numeric as the independent variable. Each regression uses 20 bins (nbins = 20) to group the data. The results of these regressions provide insights into how the predicted probability of rearrest is associated with age at arrest, prior violent felony offenses, prior non-violent felony offenses, and prior misdemeanor offenses.

library(binsreg)
## Warning: package 'binsreg' was built under R version 4.2.3
# Convert predictor and outcome variables to numeric vectors
age_at_arrest_numeric <- as.numeric(data$age_at_arrest)
predicted_rearrested_numeric <- data$predicted_rearrested
prior_vfo_cnt_numeric <- as.numeric(data$prior_vfo_cnt)
prior_nonvfo_cnt_numeric <- as.numeric(data$prior_nonvfo_cnt)
prior_misd_cnt_numeric <- as.numeric(data$prior_misd_cnt)
predicted_rearrested_numeric <- data$predicted_rearrested

# Binned scatter plot regression for rearrest outcome
# Here, we use the predicted rearrested values as the dependent variable (y)
# and age_at_arrest as the independent variable (x)
binsreg_result_rearrest_age <- binsreg(predicted_rearrested_numeric, 
                                       age_at_arrest_numeric, 
                                       nbins = 20) # Number of bins for binning the data (you can adjust this value)

# Binned scatter plot regression for rearrest outcome with prior_vfo_cnt
binsreg_result_rearrest_prior_vfo_cnt <- binsreg(predicted_rearrested_numeric, 
                                                 prior_vfo_cnt_numeric, 
                                                 nbins = 20)
## Warning in binsreg(predicted_rearrested_numeric, prior_vfo_cnt_numeric, : Too
## small effective sample size for dots. # of mass points or clusters used.
## Warning in binsreg(predicted_rearrested_numeric, prior_vfo_cnt_numeric, :
## dots=c(0,0) used.

# Binned scatter plot regression for rearrest outcome with prior_nonvfo_cnt
binsreg_result_rearrest_prior_nonvfo_cnt <- binsreg(predicted_rearrested_numeric, 
                                                    prior_nonvfo_cnt_numeric, 
                                                    nbins = 20)
## Warning in binsreg(predicted_rearrested_numeric, prior_nonvfo_cnt_numeric, : Too
## small effective sample size for dots. # of mass points or clusters used.
## Warning in binsreg(predicted_rearrested_numeric, prior_nonvfo_cnt_numeric, :
## dots=c(0,0) used.

# Binned scatter plot regression for rearrest outcome with prior_misd_cnt
binsreg_result_rearrest_prior_misd_cnt <- binsreg(predicted_rearrested_numeric, 
                                                  prior_misd_cnt_numeric, 
                                                  nbins = 20)
## Warning in binsreg(predicted_rearrested_numeric, prior_misd_cnt_numeric, : Too
## small effective sample size for dots. # of mass points or clusters used.
## Warning in binsreg(predicted_rearrested_numeric, prior_misd_cnt_numeric, :
## dots=c(0,0) used.

Binsreg - ROR The code uses the binsreg package to create binned scatter plot regressions that analyze the relationship between the predicted probability of release on recognizance (predicted_ror_numeric) and various predictor variables, including age at arrest (age_at_arrest_numeric), counts of prior violent felony offenses (prior_vfo_cnt_numeric), counts of prior non-violent felony offenses (prior_nonvfo_cnt_numeric), and counts of prior misdemeanor offenses (prior_misd_cnt_numeric). The binsreg function is used to generate these regressions, which group data into bins and plot the average outcome within each bin.

The code performs the following steps:

Load the binsreg package. Convert the predictor and outcome variables to numeric vectors. Perform binned scatter plot regression for the release on recognizance (ROR) outcome using age_at_arrest_numeric as the independent variable. Perform binned

library(binsreg)

# Convert predictor and outcome variables to numeric vectors
age_at_arrest_numeric <- as.numeric(data$age_at_arrest)
predicted_ror_numeric <- data$predicted_ror
prior_vfo_cnt_numeric <- as.numeric(data$prior_vfo_cnt)
prior_nonvfo_cnt_numeric <- as.numeric(data$prior_nonvfo_cnt)
prior_misd_cnt_numeric <- as.numeric(data$prior_misd_cnt)
predicted_ror_numeric <- data$predicted_ror

# Binned scatter plot regression for ROR outcome
# Here, we use the predicted ROR values as the dependent variable (y)
# and age_at_arrest as the independent variable (x)
binsreg_result_ror_age <- binsreg(predicted_ror_numeric, 
                                  age_at_arrest_numeric, 
                                  nbins = 20) # Number of bins for binning the data (you can adjust this value)

# Binned scatter plot regression for ROR outcome with prior_vfo_cnt
binsreg_result_ror_prior_vfo_cnt <- binsreg(predicted_ror_numeric, 
                                            prior_vfo_cnt_numeric, 
                                            nbins = 20)
## Warning in binsreg(predicted_ror_numeric, prior_vfo_cnt_numeric, nbins = 20):
## Too small effective sample size for dots. # of mass points or clusters used.
## Warning in binsreg(predicted_ror_numeric, prior_vfo_cnt_numeric, nbins = 20):
## dots=c(0,0) used.

# Binned scatter plot regression for ROR outcome with prior_nonvfo_cnt
binsreg_result_ror_prior_nonvfo_cnt <- binsreg(predicted_ror_numeric, 
                                               prior_nonvfo_cnt_numeric, 
                                               nbins = 20)
## Warning in binsreg(predicted_ror_numeric, prior_nonvfo_cnt_numeric, nbins = 20):
## Too small effective sample size for dots. # of mass points or clusters used.
## Warning in binsreg(predicted_ror_numeric, prior_nonvfo_cnt_numeric, nbins = 20):
## dots=c(0,0) used.

# Binned scatter plot regression for ROR outcome with prior_misd_cnt
binsreg_result_ror_prior_misd_cnt <- binsreg(predicted_ror_numeric, 
                                             prior_misd_cnt_numeric, 
                                             nbins = 20)
## Warning in binsreg(predicted_ror_numeric, prior_misd_cnt_numeric, nbins = 20):
## Too small effective sample size for dots. # of mass points or clusters used.
## Warning in binsreg(predicted_ror_numeric, prior_misd_cnt_numeric, nbins = 20):
## dots=c(0,0) used.

Is there a pattern such that the models tend to do better or worse for low risk or high-risk individuals?

in this code, the performance of the Ordinary Least Squares (OLS) model for predicting the likelihood of rearrest is evaluated for two different groups of individuals: high-risk individuals and low-risk individuals. The classification of high-risk and low-risk is based on the presence or absence of a prior criminal record, respectively.

The function calculate_performance_metrics calculates two performance metrics for the specified model on a given dataset: accuracy and mean squared error (MSE). Accuracy measures the proportion of correctly classified instances, while MSE measures the average squared differences between the predicted and actual values.

The code first defines subsets of the data based on risk levels:

high_risk_data contains individuals with a prior criminal record (prior_record == 1). low_risk_data contains individuals without a prior criminal record (prior_record == 0). Next, the function calculate_performance_metrics is called for each subset of data to calculate the performance metrics for the OLS model for high-risk and low-risk individuals.

The results show that:

For high-risk individuals, the OLS model has an accuracy of approximately 71.19% and an MSE of approximately 0.1975. For low-risk individuals, the OLS model has an accuracy of approximately 82.93% and an MSE of approximately 0.1407. In summary, the OLS model performs better (higher accuracy and lower MSE) for low-risk individuals compared to high-risk individuals. The model is more accurate in predicting the rearrest outcome for individuals without a prior criminal record. It is important to interpret these results with caution and consider potential confounding factors that may affect the model’s performance.

# Define subsets of data based on risk levels
high_risk_data <- data_clean[data_clean$prior_record == 1, ]
low_risk_data <- data_clean[data_clean$prior_record == 0, ]

# Function to calculate accuracy and MSE for each model
calculate_performance_metrics <- function(model, data_subset, outcome) {
  actual_values <- data_subset[[outcome]]
  predicted_values <- predict(model, newdata = data_subset)
  accuracy <- mean((actual_values == ifelse(predicted_values > 0.5, 1, 0)))
  mse <- mean((actual_values - predicted_values)^2)
  return(list(accuracy = accuracy, mse = mse))
}

# Calculate performance metrics for high-risk and low-risk individuals for each model
ols_performance_high_risk <- calculate_performance_metrics(ols_model, high_risk_data, "rearrested")
## Warning in predict.lm(model, newdata = data_subset): prediction from a
## rank-deficient fit may be misleading
ols_performance_low_risk <- calculate_performance_metrics(ols_model, low_risk_data, "rearrested")
## Warning in predict.lm(model, newdata = data_subset): prediction from a
## rank-deficient fit may be misleading
# Print performance metrics
cat("OLS model - Accuracy (High Risk):", ols_performance_high_risk$accuracy, "MSE (High Risk):", ols_performance_high_risk$mse, "\n")
## OLS model - Accuracy (High Risk): 0.7118671 MSE (High Risk): 0.1974593
cat("OLS model - Accuracy (Low Risk):", ols_performance_low_risk$accuracy, "MSE (Low Risk):", ols_performance_low_risk$mse, "\n")
## OLS model - Accuracy (Low Risk): 0.8293314 MSE (Low Risk): 0.1407146