This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
###############################################################
# FINAL FULL SOLUTION — COMPLETE CODE FOR ALL QUESTIONS (Q1–Q2)
###############################################################
# =============================
# Load Required Packages
# =============================
pkgs <- c("caret", "ipred", "rpart.plot", "randomForest", "caTools")
new_pkgs <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if (length(new_pkgs) > 0) install.packages(new_pkgs)
library(caret)
library(ipred)
library(rpart.plot)
library(randomForest)
library(caTools)
# =============================
# Q1(a): Load and Prepare Data
# =============================
cat("\n--- Q1(a): Loading Data ---\n")
--- Q1(a): Loading Data ---
csv_path <- "C:\\Users\\Lenovo\\OneDrive\\Desktop\\fall 2025\\stat\\Assignment 4-Stat\\compas.csv"
if (!file.exists(csv_path)) {
stop("ERROR: compas.csv is NOT in working directory: ", getwd())
}
compas_data <- read.csv(csv_path)
cat("Loaded compas.csv with", nrow(compas_data), "rows.\n")
Loaded compas.csv with 6172 rows.
# Expected fields:
required_cols <- c("is_recid","race","sex","age",
"priors_count","c_charge_degree")
missing <- setdiff(required_cols, names(compas_data))
if (length(missing)>0) {
stop("Missing columns in compas.csv: ", paste(missing, collapse=", "))
}
# Filter two races only
filtered <- subset(compas_data,
race %in% c("African-American", "Caucasian"))
# Factor conversions
filtered$is_recid <- factor(filtered$is_recid)
levels(filtered$is_recid) <- c("No_Recid","Yes_Recid")
filtered$race <- factor(filtered$race)
filtered$sex <- factor(filtered$sex)
filtered$c_charge_degree <- factor(filtered$c_charge_degree)
# Save factor levels for Q1(d)
all_races <- levels(filtered$race)
all_sexes <- levels(filtered$sex)
all_charges <- levels(filtered$c_charge_degree)
# Split data
set.seed(123)
idx <- createDataPartition(filtered$is_recid, p=0.8, list=FALSE)
train <- filtered[idx, ]
test <- filtered[-idx, ]
cat("Training rows:", nrow(train), "Testing rows:", nrow(test), "\n")
Training rows: 4223 Testing rows: 1055
# =============================
# Q1(b): Bagging Model
# =============================
cat("\n--- Q1(b): Bagging Model ---\n")
--- Q1(b): Bagging Model ---
formula_all <- is_recid ~ race + sex + age + priors_count + c_charge_degree
ctrl_bag <- trainControl(method="none", classProbs=TRUE)
set.seed(123)
bag_model <- train(
form = formula_all,
data = train,
method = "treebag",
nbagg = 50,
trControl = ctrl_bag
)
print(bag_model)
Bagged CART
4223 samples
5 predictor
2 classes: 'No_Recid', 'Yes_Recid'
No pre-processing
Resampling: None
# =============================
# Q1(c): Plot 3 Bagged Trees - CORRECTED
# =============================
cat("\n--- Q1(c): Saving First 3 Trees ---\n")
--- Q1(c): Saving First 3 Trees ---
# Build ipred version for tree extraction
set.seed(123)
bag_ipred <- ipred::bagging(formula_all, data = train, nbagg = 50)
save_tree <- function(tree_obj, name) {
out <- file.path(getwd(), name)
tryCatch({
png(out, width = 1000, height = 800)
# Extract the rpart object from the bagging tree
rpart_obj <- tree_obj$btree
rpart.plot(rpart_obj, main = name, roundint = FALSE)
dev.off()
cat("Saved", name, "\n")
}, error = function(e) {
cat("Error saving", name, ":", e$message, "\n")
# Try to plot on screen as fallback
try({
rpart_obj <- tree_obj$btree
rpart.plot(rpart_obj, main = name)
}, silent = TRUE)
})
}
# Plot first 3 trees
for (i in 1:3) {
tree_name <- paste0("q1c_tree_", i, ".png")
save_tree(bag_ipred$mtrees[[i]], tree_name)
}
Saved q1c_tree_1.png
Saved q1c_tree_2.png
Saved q1c_tree_3.png
# =============================
# Q1(d): Predict New Individuals
# =============================
cat("\n--- Q1(d): New Arrestee Predictions ---\n")
--- Q1(d): New Arrestee Predictions ---
new_people <- data.frame(
race = factor(c("Caucasian","Caucasian","African-American"),
levels = all_races),
sex = factor(c("Male","Female","Male"), levels = all_sexes),
age = c(19,22,34),
priors_count = c(1,2,0),
c_charge_degree = factor(c("F","M","M"), levels = all_charges)
)
probs_new <- predict(bag_model, new_people, type="prob")
results_new <- data.frame(
Person=c("Archie","Betty","Chuck"),
Probability=probs_new$Yes_Recid
)
print(results_new)
# =============================
# Q1(e): Fairness Metrics
# =============================
cat("\n--- Q1(e): Fairness Metrics ---\n")
--- Q1(e): Fairness Metrics ---
probs_test <- predict(bag_model, test, type="prob")
pred_test <- ifelse(probs_test$Yes_Recid>0.5,"Yes_Recid","No_Recid")
df_test <- data.frame(
Actual=test$is_recid,
Predicted=pred_test,
Race=test$race
)
metric_fun <- function(df){
cm <- confusionMatrix(
factor(df$Predicted, levels=c("No_Recid","Yes_Recid")),
factor(df$Actual, levels=c("No_Recid","Yes_Recid")),
positive="Yes_Recid"
)
t <- cm$table
TN <- t["No_Recid","No_Recid"]
FP <- t["Yes_Recid","No_Recid"]
FN <- t["No_Recid","Yes_Recid"]
TP <- t["Yes_Recid","Yes_Recid"]
FPR <- FP/(FP+TN)
FNR <- FN/(FN+TP)
Error <- 1-cm$overall["Accuracy"]
c(Error=Error, FPR=FPR, FNR=FNR)
}
cat("\nCaucasian:\n")
Caucasian:
print(metric_fun(subset(df_test, Race=="Caucasian")))
Error.Accuracy FPR FNR
0.4063927 0.2840467 0.5801105
cat("\nAfrican-American:\n")
African-American:
print(metric_fun(subset(df_test, Race=="African-American")))
Error.Accuracy FPR FNR
0.3841167 0.4349442 0.3448276
# =============================
# Q2: Random Forest + LogitBoost
# =============================
cat("\n--- Q2: Model Tuning ---\n")
--- Q2: Model Tuning ---
cv5 <- trainControl(method="cv", number=5, classProbs=TRUE)
# -----------------------------
# Random Forest Tuning
# -----------------------------
cat("\n--- Q2: Random Forest ---\n")
--- Q2: Random Forest ---
rf_grid <- expand.grid(mtry = 1:5)
set.seed(123)
rf_model <- train(
formula_all,
data=train,
method="rf",
trControl=cv5,
tuneGrid=rf_grid,
ntree=500
)
print(rf_model)
Random Forest
4223 samples
5 predictor
2 classes: 'No_Recid', 'Yes_Recid'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 3378, 3379, 3378, 3378, 3379
Resampling results across tuning parameters:
mtry Accuracy Kappa
1 0.6649275 0.3298702
2 0.6810295 0.3621589
3 0.6637432 0.3275372
4 0.6438543 0.2876618
5 0.6367495 0.2733942
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Predictions
rf_probs <- predict(rf_model, test, type="prob")
rf_pred <- ifelse(rf_probs$Yes_Recid>0.5,"Yes_Recid","No_Recid")
rf_df <- data.frame(
Actual=test$is_recid,
Predicted=rf_pred,
Race=test$race
)
cat("\nRF Overall:\n")
RF Overall:
print(metric_fun(rf_df))
Error.Accuracy FPR FNR
0.3213270 0.2946768 0.3478261
cat("\nRF Caucasian:\n")
RF Caucasian:
print(metric_fun(subset(rf_df, Race=="Caucasian")))
Error.Accuracy FPR FNR
0.3424658 0.2140078 0.5248619
cat("\nRF African-American:\n")
RF African-American:
print(metric_fun(subset(rf_df, Race=="African-American")))
Error.Accuracy FPR FNR
0.3063209 0.3717472 0.2557471
# -----------------------------
# LogitBoost Tuning
# -----------------------------
cat("\n--- Q2: LogitBoost ---\n")
--- Q2: LogitBoost ---
grid_logit <- expand.grid(nIter=c(50,100,150,200,250,300))
set.seed(123)
logit_model <- train(
formula_all,
data=train,
method="LogitBoost",
trControl=cv5,
tuneGrid=grid_logit
)
print(logit_model)
Boosted Logistic Regression
4223 samples
5 predictor
2 classes: 'No_Recid', 'Yes_Recid'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 3378, 3379, 3378, 3378, 3379
Resampling results across tuning parameters:
nIter Accuracy Kappa
50 0.7638984 0.4223580
100 0.7583055 0.5060439
150 0.7638984 0.4223580
200 0.7583055 0.5060439
250 0.7638984 0.4223580
300 0.7583055 0.5060439
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was nIter = 50.
# Predictions
lb_probs <- predict(logit_model, test, type="prob")
lb_pred <- ifelse(lb_probs$Yes_Recid>0.5,"Yes_Recid","No_Recid")
lb_df <- data.frame(
Actual=test$is_recid,
Predicted=lb_pred,
Race=test$race
)
cat("\nLogitBoost Overall:\n")
LogitBoost Overall:
print(metric_fun(lb_df))
Error.Accuracy FPR FNR
0.40379147 0.08174905 0.72400756
cat("\nLogitBoost Caucasian:\n")
LogitBoost Caucasian:
print(metric_fun(subset(lb_df, Race=="Caucasian")))
Error.Accuracy FPR FNR
0.37442922 0.05447471 0.82872928
cat("\nLogitBoost African-American:\n")
LogitBoost African-American:
print(metric_fun(subset(lb_df, Race=="African-American")))
Error.Accuracy FPR FNR
0.4246353 0.1078067 0.6695402
cat("\n\n=============================\nEND OF FULL SCRIPT\n=============================\n")
=============================
END OF FULL SCRIPT
=============================
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.