suppressPackageStartupMessages({
library(tidyverse)
library(caret)
library(rpart)
library(rpart.plot)
})
set.seed(42)
auto_load_csv <- function(fname) {
  paths <- c(
    fname,
    file.path(getwd(), fname),
    file.path("~", "Downloads", fname),
    file.path("/mnt/data", fname)
  )
  for (p in paths) if (file.exists(p)) return(read.csv(p))
  return(NULL)
}
cat("Sanity check: running at", as.character(Sys.time()), "\n")
## Sanity check: running at 2025-10-23 11:40:04.492479
# ===== Problem 1 =====
cat("\n--- Problem 1: Breast Cancer ---\n")
##
## --- Problem 1: Breast Cancer ---
if (exists("BreastCancerData")) {
bc <- BreastCancerData
} else {
bc <- auto_load_csv("breast_cancer_updated.csv")
}
stopifnot(!is.null(bc))
bc <- bc %>% select(-any_of("IDNumber")) %>% mutate(Class = as.factor(Class)) %>% na.omit()
cat("Rows after cleaning:", nrow(bc), "\n")
## Rows after cleaning: 683
ctrl <- trainControl(method="cv", number=10)
fit <- train(Class~., data=bc, method="rpart", trControl=ctrl, tuneLength=10)
final_tree <- rpart(Class~., data=bc, method="class",
                    control=rpart.control(cp=fit$bestTune$cp))
rpart.plot(final_tree, type=3, extra=104, under=TRUE, faclen=0,
           main="Problem 1 – Breast Cancer Decision Tree")

knitr::kable(data.frame(Metric="P1 10-fold CV Accuracy",
                        Value=round(max(fit$results$Accuracy),3)),
             caption="Problem 1 – Cross-Validation Accuracy")
Table: Problem 1 – Cross-Validation Accuracy

|Metric                 | Value|
|:----------------------|-----:|
|P1 10-fold CV Accuracy | 0.956|
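# Supplementary check (not part of the original output): inspect the cp values
# caret tried so the chosen complexity is visibly not a knife-edge. Uses only
# objects created above (fit, final_tree).
print(fit$bestTune)
print(fit$results[, c("cp", "Accuracy", "Kappa")])
plotcp(final_tree)  # rpart's complexity-parameter plot for the refit tree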
# Convert each leaf of an rpart classification tree into an IF–THEN rule string.
paths_to_rules <- function(tree){
  fr <- tree$frame
  leaves <- row.names(fr)[fr$var == "<leaf>"]
  pths <- path.rpart(tree, leaves, print.it = FALSE)
  out <- character(length(pths))
  for (i in seq_along(pths)) {
    conds <- pths[[i]][-1]                          # drop the leading "root" element
    pred <- as.character(fr[leaves[i], "yval2"][, 1])  # fitted class stored in the frame
    if (is.null(pred) || length(pred) == 0)            # fall back to the factor label
      pred <- attr(tree, "ylevels")[fr[leaves[i], "yval"]]
    out[i] <- paste0("IF ", paste(conds, collapse = " AND "), " THEN Class = ", pred)
  }
  out
}
rules <- paths_to_rules(final_tree)
knitr::kable(data.frame(Rule=rules), col.names="IF–THEN Rules",
caption="Problem 1 – Tree Rules")
Table: Problem 1 – Tree Rules

|IF–THEN Rules |
|:-------------|
|IF UniformCellSize< 2.5 AND BareNuclei< 5.5 THEN Class = 1 |
|IF UniformCellSize< 2.5 AND BareNuclei>=5.5 THEN Class = 2 |
|IF UniformCellSize>=2.5 AND UniformCellShape< 2.5 AND BlandChromatin< 3.5 THEN Class = 1 |
|IF UniformCellSize>=2.5 AND UniformCellShape< 2.5 AND BlandChromatin>=3.5 THEN Class = 2 |
|IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize< 4.5 AND BareNuclei< 2.5 THEN Class = 1 |
|IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize< 4.5 AND BareNuclei>=2.5 THEN Class = 2 |
|IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize>=4.5 THEN Class = 2 |
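# Cross-check (illustrative): rpart.plot ships its own rule extractor, which
# should list the same seven leaf paths as the hand-rolled paths_to_rules()
# above, just formatted differently.
rpart.rules(final_tree)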
# ===== Problem 2 =====
cat("\n--- Problem 2: storms ---\n")
##
## --- Problem 2: storms ---
data("storms", package="dplyr")
storms_df <- storms %>%
  mutate(category = as.factor(category)) %>%
  select(-name, -status) %>%
  drop_na()
ctrl2 <- trainControl(method="cv", number=10)
fit2 <- train(category~., data=storms_df, method="rpart", trControl=ctrl2,
              tuneGrid=data.frame(cp=0),
              control=rpart.control(maxdepth=2, minsplit=5, minbucket=3, cp=0))
cv2_acc <- max(fit2$results$Accuracy)
split_idx <- createDataPartition(storms_df$category, p=0.8, list=FALSE)
tr <- storms_df[split_idx,]; te <- storms_df[-split_idx,]
model2 <- rpart(category~., data=tr, method="class",
control=rpart.control(maxdepth=2, minsplit=5, minbucket=3, cp=0))
cm_tr <- confusionMatrix(predict(model2,tr,type="class"), tr$category)
cm_te <- confusionMatrix(predict(model2,te,type="class"), te$category)
knitr::kable(data.frame(Metric="P2 10-fold CV Accuracy",Value=round(cv2_acc,3)),
caption="Problem 2 – Cross-Validation Accuracy")
Table: Problem 2 – Cross-Validation Accuracy

|Metric                 | Value|
|:----------------------|-----:|
|P2 10-fold CV Accuracy | 0.836|
knitr::kable(as.data.frame(cm_tr$table),caption="Problem 2 – Confusion Matrix (Train)")
Table: Problem 2 – Confusion Matrix (Train)

|Prediction |Reference | Freq|
|:----------|:---------|----:|
|1          |1         |  867|
|2          |1         |    0|
|3          |1         |    0|
|4          |1         |    0|
|5          |1         |    0|
|1          |2         |    0|
|2          |2         |  348|
|3          |2         |    0|
|4          |2         |    0|
|5          |2         |    0|
|1          |3         |    0|
|2          |3         |    0|
|3          |3         |    0|
|4          |3         |  233|
|5          |3         |    0|
|1          |4         |    0|
|2          |4         |    0|
|3          |4         |    0|
|4          |4         |  238|
|5          |4         |    0|
|1          |5         |    0|
|2          |5         |    0|
|3          |5         |    0|
|4          |5         |   52|
|5          |5         |    0|
knitr::kable(as.data.frame(cm_te$table),caption="Problem 2 – Confusion Matrix (Test)")
Table: Problem 2 – Confusion Matrix (Test)

|Prediction |Reference | Freq|
|:----------|:---------|----:|
|1          |1         |  216|
|2          |1         |    0|
|3          |1         |    0|
|4          |1         |    0|
|5          |1         |    0|
|1          |2         |    0|
|2          |2         |   86|
|3          |2         |    0|
|4          |2         |    0|
|5          |2         |    0|
|1          |3         |    0|
|2          |3         |    0|
|3          |3         |    0|
|4          |3         |   58|
|5          |3         |    0|
|1          |4         |    0|
|2          |4         |    0|
|3          |4         |    0|
|4          |4         |   59|
|5          |4         |    0|
|1          |5         |    0|
|2          |5         |    0|
|3          |5         |    0|
|4          |5         |   13|
|5          |5         |    0|
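# Reading the two matrices above: with maxdepth = 2 the tree only ever predicts
# categories 1, 2 and 4, so every category-3 and category-5 storm falls into the
# category-4 leaf. A per-class summary (supplementary, taken straight from
# caret's confusionMatrix object) makes that explicit.
knitr::kable(round(cm_te$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")], 3),
             caption="Problem 2 – Per-Class Test Metrics (supplementary)")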
# ===== Problem 3 =====
cat("\n--- Problem 3: storms extension ---\n")
##
## --- Problem 3: storms extension ---
set.seed(123)
split_idx3 <- createDataPartition(storms_df$category, p=0.8, list=FALSE)
tr3 <- storms_df[split_idx3,]; te3 <- storms_df[-split_idx3,]
cat("Train rows:", nrow(tr3), "Test rows:", nrow(te3), "\n")
## Train rows: 1738 Test rows: 432
# Candidate hyper-parameter grid (6 x 3 x 3 x 3 = 162 combinations); keep a
# random sample of 24 of them for the comparison below.
param_grid <- expand.grid(maxdepth=c(1,2,3,4,5,6),
                          minsplit=c(2,5,10),
                          minbucket=c(1,3,5),
                          cp=c(0,0.001,0.01),
                          KEEP.OUT.ATTRS=FALSE) %>% as_tibble()
if (nrow(param_grid)>=24) param_grid <- dplyr::sample_n(param_grid,24)
# Fit one tree per sampled setting and record its size and train/test accuracy.
results <- purrr::map_dfr(1:nrow(param_grid), function(i){
  pg <- param_grid[i,]
  fit <- rpart(category~., data=tr3, method="class",
               control=rpart.control(maxdepth=pg$maxdepth,
                                     minsplit=pg$minsplit,
                                     minbucket=pg$minbucket,
                                     cp=pg$cp))
  tibble(maxdepth=pg$maxdepth, minsplit=pg$minsplit, minbucket=pg$minbucket, cp=pg$cp,
         nodes=nrow(fit$frame),
         acc_train=mean(predict(fit,tr3,type="class")==tr3$category),
         acc_test=mean(predict(fit,te3,type="class")==te3$category))
})
knitr::kable(results, digits=3, caption="Problem 3 – Parameter Grid Results")
Table: Problem 3 – Parameter Grid Results

| maxdepth| minsplit| minbucket|    cp| nodes| acc_train| acc_test|
|--------:|--------:|---------:|-----:|-----:|---------:|--------:|
|        2|        5|         1| 0.000|     5|     0.836|    0.836|
|        1|        2|         1| 0.001|     3|     0.699|    0.699|
|        2|        2|         5| 0.001|     5|     0.836|    0.836|
|        1|        2|         1| 0.010|     3|     0.699|    0.699|
|        6|        5|         5| 0.000|     9|     1.000|    1.000|
|        4|        2|         3| 0.010|     9|     1.000|    1.000|
|        5|        2|         3| 0.001|     9|     1.000|    1.000|
|        6|        5|         5| 0.001|     9|     1.000|    1.000|
|        5|        5|         3| 0.001|     9|     1.000|    1.000|
|        1|       10|         3| 0.000|     3|     0.699|    0.699|
|        6|       10|         5| 0.001|     9|     1.000|    1.000|
|        3|       10|         3| 0.000|     7|     0.970|    0.970|
|        6|       10|         1| 0.001|     9|     1.000|    1.000|
|        4|       10|         5| 0.010|     9|     1.000|    1.000|
|        2|        2|         1| 0.001|     5|     0.836|    0.836|
|        4|        2|         5| 0.000|     9|     1.000|    1.000|
|        3|        5|         1| 0.001|     7|     0.970|    0.970|
|        2|        5|         5| 0.010|     5|     0.836|    0.836|
|        6|        2|         5| 0.010|     9|     1.000|    1.000|
|        4|        5|         1| 0.000|     9|     1.000|    1.000|
|        3|       10|         1| 0.000|     7|     0.970|    0.970|
|        6|        2|         5| 0.000|     9|     1.000|    1.000|
|        2|       10|         5| 0.001|     5|     0.836|    0.836|
|        5|        2|         5| 0.010|     9|     1.000|    1.000|
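# Supplementary aggregate: the sampled settings collapse into a few tree sizes,
# so averaging by node count shows the pattern the plot below draws: accuracy
# rises with tree size and the train/test curves track each other closely.
results %>%
  group_by(nodes) %>%
  summarise(mean_train = mean(acc_train), mean_test = mean(acc_test), n_settings = n()) %>%
  knitr::kable(digits = 3, caption = "Problem 3 – Accuracy by Tree Size (supplementary)")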
plt <- results %>%
  tidyr::pivot_longer(cols=c(acc_train,acc_test), names_to="set", values_to="acc") %>%
  ggplot(aes(x=nodes, y=acc, colour=set, group=set))+
  geom_point()+geom_line()+
  labs(title="Accuracy vs Tree Size", x="# of nodes", y="Accuracy",
       colour="Accuracy on", caption="Train vs Test")
print(plt)

res2 <- results %>% mutate(gen_gap=acc_train-acc_test,
score=acc_test-0.15*pmax(gen_gap,0))
best <- dplyr::slice_max(res2, order_by=score, n=1)
knitr::kable(best, digits=3, caption="Problem 3 – Chosen Parameters")
Table: Problem 3 – Chosen Parameters

| maxdepth| minsplit| minbucket|    cp| nodes| acc_train| acc_test| gen_gap| score|
|--------:|--------:|---------:|-----:|-----:|---------:|--------:|-------:|-----:|
|        6|        5|         5| 0.000|     9|         1|        1|       0|     1|
|        4|        2|         3| 0.010|     9|         1|        1|       0|     1|
|        5|        2|         3| 0.001|     9|         1|        1|       0|     1|
|        6|        5|         5| 0.001|     9|         1|        1|       0|     1|
|        5|        5|         3| 0.001|     9|         1|        1|       0|     1|
|        6|       10|         5| 0.001|     9|         1|        1|       0|     1|
|        6|       10|         1| 0.001|     9|         1|        1|       0|     1|
|        4|       10|         5| 0.010|     9|         1|        1|       0|     1|
|        4|        2|         5| 0.000|     9|         1|        1|       0|     1|
|        6|        2|         5| 0.010|     9|         1|        1|       0|     1|
|        4|        5|         1| 0.000|     9|         1|        1|       0|     1|
|        6|        2|         5| 0.000|     9|         1|        1|       0|     1|
|        5|        2|         5| 0.010|     9|         1|        1|       0|     1|
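# Note: several settings tie on the score above, so slice_max() keeps all of
# them and only the first row is used below. A sketch of one way to force a
# single deterministic winner (preferring the smallest tree among ties):
best_single <- res2 %>% dplyr::arrange(dplyr::desc(score), nodes) %>% dplyr::slice(1)
print(best_single)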
md <- as.integer(best$maxdepth[1])
ms <- as.integer(best$minsplit[1])
mb <- as.integer(best$minbucket[1])
cpv <- as.numeric(best$cp[1])
best_fit <- rpart(category~., data=tr3, method="class",
control=rpart.control(maxdepth=md,minsplit=ms,minbucket=mb,cp=cpv))
cm_best_tr <- confusionMatrix(predict(best_fit,tr3,type="class"),tr3$category)
cm_best_te <- confusionMatrix(predict(best_fit,te3,type="class"),te3$category)
knitr::kable(as.data.frame(cm_best_tr$table),caption="Problem 3 – Confusion Matrix (Train)")
Table: Problem 3 – Confusion Matrix (Train)

|Prediction |Reference | Freq|
|:----------|:---------|----:|
|1          |1         |  867|
|2          |1         |    0|
|3          |1         |    0|
|4          |1         |    0|
|5          |1         |    0|
|1          |2         |    0|
|2          |2         |  348|
|3          |2         |    0|
|4          |2         |    0|
|5          |2         |    0|
|1          |3         |    0|
|2          |3         |    0|
|3          |3         |  233|
|4          |3         |    0|
|5          |3         |    0|
|1          |4         |    0|
|2          |4         |    0|
|3          |4         |    0|
|4          |4         |  238|
|5          |4         |    0|
|1          |5         |    0|
|2          |5         |    0|
|3          |5         |    0|
|4          |5         |    0|
|5          |5         |   52|
knitr::kable(as.data.frame(cm_best_te$table),caption="Problem 3 – Confusion Matrix (Test)")
Table: Problem 3 – Confusion Matrix (Test)

|Prediction |Reference | Freq|
|:----------|:---------|----:|
|1          |1         |  216|
|2          |1         |    0|
|3          |1         |    0|
|4          |1         |    0|
|5          |1         |    0|
|1          |2         |    0|
|2          |2         |   86|
|3          |2         |    0|
|4          |2         |    0|
|5          |2         |    0|
|1          |3         |    0|
|2          |3         |    0|
|3          |3         |   58|
|4          |3         |    0|
|5          |3         |    0|
|1          |4         |    0|
|2          |4         |    0|
|3          |4         |    0|
|4          |4         |   59|
|5          |4         |    0|
|1          |5         |    0|
|2          |5         |    0|
|3          |5         |    0|
|4          |5         |    0|
|5          |5         |   13|
cv <- train(category~., data=storms_df, method="rpart",
            trControl=trainControl(method="cv", number=10),
            tuneGrid=data.frame(cp=cpv),
            control=rpart.control(maxdepth=md, minsplit=ms, minbucket=mb, cp=cpv))
knitr::kable(data.frame(Metric="P3 10-fold CV Accuracy (Chosen Model)",
Value=round(max(cv$results$Accuracy),3)),
caption="Problem 3 – Final Cross-Validation Accuracy")
Table: Problem 3 – Final Cross-Validation Accuracy

|Metric                                | Value|
|:-------------------------------------|-----:|
|P3 10-fold CV Accuracy (Chosen Model) |     1|
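# The perfect accuracy is expected rather than alarming here: in the storms
# data the Saffir-Simpson category is itself derived from wind speed, so a tree
# that can split on wind separates the classes exactly. A quick range check per
# category (supplementary) illustrates why the problem is this easy.
storms_df %>%
  group_by(category) %>%
  summarise(min_wind = min(wind), max_wind = max(wind), n = n())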
# ===== Problem 4 =====
cat("\n--- Problem 4: Variable Importance ---\n")
##
## --- Problem 4: Variable Importance ---
bank <- auto_load_csv("Bank_Modified.csv")
stopifnot(!is.null(bank))
bank <- bank %>% select(-any_of("ID")) %>% mutate(approval=as.factor(approval)) %>% drop_na()
model_full <- rpart(approval~., data=bank, method="class",
control=rpart.control(minsplit=10,maxdepth=20,cp=0.001))
vi <- sort(model_full$variable.importance, decreasing=TRUE)
knitr::kable(data.frame(Variable=names(vi),Importance=as.numeric(vi)),
caption="Problem 4 – Variable Importance")
Table: Problem 4 – Variable Importance

|Variable     | Importance|
|:------------|----------:|
|X            | 215.114367|
|bool1        | 179.282437|
|ages         |  98.349246|
|cont4        |  85.217471|
|bool2        |  85.083592|
|cont3        |  71.832313|
|cont2        |  10.700405|
|credit.score |   7.414835|
|cont1        |   6.726451|
|cont5        |   5.602469|
|cont6        |   4.189158|
vars6 <- names(vi)[1:min(6,length(vi))]
form6 <- as.formula(paste("approval ~", paste(vars6, collapse=" + ")))
base_cv <- train(approval~., data=bank, method="rpart",
trControl=trainControl(method="cv",number=10),
tuneGrid=data.frame(cp=0.001))
top6_cv <- train(form6, data=bank, method="rpart",
trControl=trainControl(method="cv",number=10),
tuneGrid=data.frame(cp=0.001))
knitr::kable(data.frame(
Metric=c("P4 CV Accuracy – All Vars","P4 CV Accuracy – Top 6"),
Value=round(c(max(base_cv$results$Accuracy), max(top6_cv$results$Accuracy)),3)),
caption="Problem 4 – Accuracy Comparison")
Table: Problem 4 – Accuracy Comparison

|Metric                    | Value|
|:-------------------------|-----:|
|P4 CV Accuracy – All Vars | 0.933|
|P4 CV Accuracy – Top 6    | 0.957|
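# Supplementary comparison: both models were tuned with 10-fold CV under caret,
# so their fold-level accuracies can be inspected side by side with resamples().
# The comparison is only indicative because the two calls drew different folds;
# a shared trainControl index would be needed for a strictly paired test.
res_cmp <- caret::resamples(list(AllVars = base_cv, Top6 = top6_cv))
summary(res_cmp)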
par(mfrow=c(1,2))
rpart.plot(model_full, type=3, extra=104, under=TRUE, faclen=0, main="Full Variables")
model_top6 <- rpart(form6, data=bank, method="class",
control=rpart.control(minsplit=10,maxdepth=20,cp=0.001))
rpart.plot(model_top6, type=3, extra=104, under=TRUE, faclen=0, main="Top 6 Variables")

par(mfrow=c(1,1))