suppressPackageStartupMessages({
  library(tidyverse)
  library(caret)
  library(rpart)
  library(rpart.plot)
})

set.seed(42)

auto_load_csv <- function(fname) {
  paths <- c(
    fname,
    file.path(getwd(), fname),
    file.path("~","Downloads",fname),
    file.path("/mnt/data",fname)
  )
  for (p in paths) if (file.exists(p)) return(read.csv(p))
  return(NULL)
}

cat("Sanity check: running at", as.character(Sys.time()), "\n")
## Sanity check: running at 2025-10-23 11:40:04.492479
# ===== Problem 1 =====
cat("\n--- Problem 1: Breast Cancer ---\n")
## 
## --- Problem 1: Breast Cancer ---
if (exists("BreastCancerData")) {
  bc <- BreastCancerData
} else {
  bc <- auto_load_csv("breast_cancer_updated.csv")
}
stopifnot(!is.null(bc))

bc <- bc %>% select(-any_of("IDNumber")) %>% mutate(Class = as.factor(Class)) %>% na.omit()
cat("Rows after cleaning:", nrow(bc), "\n")
## Rows after cleaning: 683
ctrl <- trainControl(method="cv", number=10)
fit  <- train(Class~., data=bc, method="rpart", trControl=ctrl, tuneLength=10)
final_tree <- rpart(Class~., data=bc, method="class",
                    control=rpart.control(cp=fit$bestTune$cp))

rpart.plot(final_tree, type=3, extra=104, under=TRUE, faclen=0,
           main="Problem 1 – Breast Cancer Decision Tree")

knitr::kable(data.frame(Metric="P1 10-fold CV Accuracy",
                        Value=round(max(fit$results$Accuracy),3)),
             caption="Problem 1 – Cross-Validation Accuracy")
Problem 1 – Cross-Validation Accuracy
Metric Value
P1 10-fold CV Accuracy 0.956
paths_to_rules <- function(tree){
  fr <- tree$frame
  leaves <- row.names(fr)[fr$var=="<leaf>"]
  pths <- path.rpart(tree, leaves, print.it=FALSE)
  out <- character(length(pths))
  for (i in seq_along(pths)) {
    conds <- pths[[i]][-1]
    pred <- as.character(fr[leaves[i],"yval2"][,1])
    if (is.null(pred) || length(pred)==0)
      pred <- attr(tree,"ylevels")[ fr[leaves[i],"yval"] ]
    out[i] <- paste0("IF ", paste(conds,collapse=" AND "), " THEN Class = ", pred)
  }
  out
}
rules <- paths_to_rules(final_tree)
knitr::kable(data.frame(Rule=rules), col.names="IF–THEN Rules",
             caption="Problem 1 – Tree Rules")
Problem 1 – Tree Rules
IF–THEN Rules
IF UniformCellSize< 2.5 AND BareNuclei< 5.5 THEN Class = 1
IF UniformCellSize< 2.5 AND BareNuclei>=5.5 THEN Class = 2
IF UniformCellSize>=2.5 AND UniformCellShape< 2.5 AND BlandChromatin< 3.5 THEN Class = 1
IF UniformCellSize>=2.5 AND UniformCellShape< 2.5 AND BlandChromatin>=3.5 THEN Class = 2
IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize< 4.5 AND BareNuclei< 2.5 THEN Class = 1
IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize< 4.5 AND BareNuclei>=2.5 THEN Class = 2
IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize>=4.5 THEN Class = 2
# ===== Problem 2 =====
cat("\n--- Problem 2: storms ---\n")
## 
## --- Problem 2: storms ---
data("storms", package="dplyr")
storms_df <- storms %>%
  mutate(category=as.factor(category)) %>%
  select(-name,-status) %>%
  drop_na()

ctrl2 <- trainControl(method="cv", number=10)
fit2  <- train(category~., data=storms_df, method="rpart", trControl=ctrl2,
               tuneGrid=data.frame(cp=0),
               control=rpart.control(maxdepth=2, minsplit=5, minbucket=3, cp=0))
cv2_acc <- max(fit2$results$Accuracy)

split_idx <- createDataPartition(storms_df$category, p=0.8, list=FALSE)
tr <- storms_df[split_idx,]; te <- storms_df[-split_idx,]

model2 <- rpart(category~., data=tr, method="class",
                control=rpart.control(maxdepth=2, minsplit=5, minbucket=3, cp=0))

cm_tr <- confusionMatrix(predict(model2,tr,type="class"), tr$category)
cm_te <- confusionMatrix(predict(model2,te,type="class"), te$category)

knitr::kable(data.frame(Metric="P2 10-fold CV Accuracy",Value=round(cv2_acc,3)),
             caption="Problem 2 – Cross-Validation Accuracy")
Problem 2 – Cross-Validation Accuracy
Metric Value
P2 10-fold CV Accuracy 0.836
knitr::kable(as.data.frame(cm_tr$table),caption="Problem 2 – Confusion Matrix (Train)")
Problem 2 – Confusion Matrix (Train)
Prediction Reference Freq
1 1 867
2 1 0
3 1 0
4 1 0
5 1 0
1 2 0
2 2 348
3 2 0
4 2 0
5 2 0
1 3 0
2 3 0
3 3 0
4 3 233
5 3 0
1 4 0
2 4 0
3 4 0
4 4 238
5 4 0
1 5 0
2 5 0
3 5 0
4 5 52
5 5 0
knitr::kable(as.data.frame(cm_te$table),caption="Problem 2 – Confusion Matrix (Test)")
Problem 2 – Confusion Matrix (Test)
Prediction Reference Freq
1 1 216
2 1 0
3 1 0
4 1 0
5 1 0
1 2 0
2 2 86
3 2 0
4 2 0
5 2 0
1 3 0
2 3 0
3 3 0
4 3 58
5 3 0
1 4 0
2 4 0
3 4 0
4 4 59
5 4 0
1 5 0
2 5 0
3 5 0
4 5 13
5 5 0
# ===== Problem 3 =====
cat("\n--- Problem 3: storms extension ---\n")
## 
## --- Problem 3: storms extension ---
set.seed(123)
split_idx3 <- createDataPartition(storms_df$category, p=0.8, list=FALSE)
tr3 <- storms_df[split_idx3,]; te3 <- storms_df[-split_idx3,]
cat("Train rows:", nrow(tr3), "Test rows:", nrow(te3), "\n")
## Train rows: 1738 Test rows: 432
param_grid <- expand.grid(maxdepth=c(1,2,3,4,5,6),
                          minsplit=c(2,5,10),
                          minbucket=c(1,3,5),
                          cp=c(0,0.001,0.01),
                          KEEP.OUT.ATTRS=FALSE) %>% as_tibble()
if (nrow(param_grid)>=24) param_grid <- dplyr::sample_n(param_grid,24)

results <- purrr::map_dfr(1:nrow(param_grid), function(i){
  pg <- param_grid[i,]
  fit <- rpart(category~., data=tr3, method="class",
               control=rpart.control(maxdepth=pg$maxdepth,
                                     minsplit=pg$minsplit,
                                     minbucket=pg$minbucket,
                                     cp=pg$cp))
  tibble(maxdepth=pg$maxdepth, minsplit=pg$minsplit, minbucket=pg$minbucket, cp=pg$cp,
         nodes=nrow(fit$frame),
         acc_train=mean(predict(fit,tr3,type="class")==tr3$category),
         acc_test=mean(predict(fit,te3,type="class")==te3$category))
})

knitr::kable(results, digits=3, caption="Problem 3 – Parameter Grid Results")
Problem 3 – Parameter Grid Results
maxdepth minsplit minbucket cp nodes acc_train acc_test
2 5 1 0.000 5 0.836 0.836
1 2 1 0.001 3 0.699 0.699
2 2 5 0.001 5 0.836 0.836
1 2 1 0.010 3 0.699 0.699
6 5 5 0.000 9 1.000 1.000
4 2 3 0.010 9 1.000 1.000
5 2 3 0.001 9 1.000 1.000
6 5 5 0.001 9 1.000 1.000
5 5 3 0.001 9 1.000 1.000
1 10 3 0.000 3 0.699 0.699
6 10 5 0.001 9 1.000 1.000
3 10 3 0.000 7 0.970 0.970
6 10 1 0.001 9 1.000 1.000
4 10 5 0.010 9 1.000 1.000
2 2 1 0.001 5 0.836 0.836
4 2 5 0.000 9 1.000 1.000
3 5 1 0.001 7 0.970 0.970
2 5 5 0.010 5 0.836 0.836
6 2 5 0.010 9 1.000 1.000
4 5 1 0.000 9 1.000 1.000
3 10 1 0.000 7 0.970 0.970
6 2 5 0.000 9 1.000 1.000
2 10 5 0.001 5 0.836 0.836
5 2 5 0.010 9 1.000 1.000
plt <- results %>%
  tidyr::pivot_longer(cols=c(acc_train,acc_test), names_to="set", values_to="acc") %>%
  ggplot(aes(x=nodes,y=acc,group=set))+
  geom_point()+geom_line()+
  labs(title="Accuracy vs Tree Size",x="# of nodes",y="Accuracy",caption="Train vs Test")
print(plt)

res2 <- results %>% mutate(gen_gap=acc_train-acc_test,
                           score=acc_test-0.15*pmax(gen_gap,0))
best <- dplyr::slice_max(res2, order_by=score, n=1)
knitr::kable(best, digits=3, caption="Problem 3 – Chosen Parameters")
Problem 3 – Chosen Parameters
maxdepth minsplit minbucket cp nodes acc_train acc_test gen_gap score
6 5 5 0.000 9 1 1 0 1
4 2 3 0.010 9 1 1 0 1
5 2 3 0.001 9 1 1 0 1
6 5 5 0.001 9 1 1 0 1
5 5 3 0.001 9 1 1 0 1
6 10 5 0.001 9 1 1 0 1
6 10 1 0.001 9 1 1 0 1
4 10 5 0.010 9 1 1 0 1
4 2 5 0.000 9 1 1 0 1
6 2 5 0.010 9 1 1 0 1
4 5 1 0.000 9 1 1 0 1
6 2 5 0.000 9 1 1 0 1
5 2 5 0.010 9 1 1 0 1
md <- as.integer(best$maxdepth[1])
ms <- as.integer(best$minsplit[1])
mb <- as.integer(best$minbucket[1])
cpv <- as.numeric(best$cp[1])

best_fit <- rpart(category~., data=tr3, method="class",
                  control=rpart.control(maxdepth=md,minsplit=ms,minbucket=mb,cp=cpv))
cm_best_tr <- confusionMatrix(predict(best_fit,tr3,type="class"),tr3$category)
cm_best_te <- confusionMatrix(predict(best_fit,te3,type="class"),te3$category)

knitr::kable(as.data.frame(cm_best_tr$table),caption="Problem 3 – Confusion Matrix (Train)")
Problem 3 – Confusion Matrix (Train)
Prediction Reference Freq
1 1 867
2 1 0
3 1 0
4 1 0
5 1 0
1 2 0
2 2 348
3 2 0
4 2 0
5 2 0
1 3 0
2 3 0
3 3 233
4 3 0
5 3 0
1 4 0
2 4 0
3 4 0
4 4 238
5 4 0
1 5 0
2 5 0
3 5 0
4 5 0
5 5 52
knitr::kable(as.data.frame(cm_best_te$table),caption="Problem 3 – Confusion Matrix (Test)")
Problem 3 – Confusion Matrix (Test)
Prediction Reference Freq
1 1 216
2 1 0
3 1 0
4 1 0
5 1 0
1 2 0
2 2 86
3 2 0
4 2 0
5 2 0
1 3 0
2 3 0
3 3 58
4 3 0
5 3 0
1 4 0
2 4 0
3 4 0
4 4 59
5 4 0
1 5 0
2 5 0
3 5 0
4 5 0
5 5 13
cv <- train(category~., data=storms_df, method="rpart",
            trControl=trainControl(method="cv",number=10),
            tuneGrid=data.frame(cp=cpv),
            control=rpart.control(maxdepth=md,minsplit=ms,minbucket=mb,cp=cpv))
knitr::kable(data.frame(Metric="P3 10-fold CV Accuracy (Chosen Model)",
                        Value=round(max(cv$results$Accuracy),3)),
             caption="Problem 3 – Final Cross-Validation Accuracy")
Problem 3 – Final Cross-Validation Accuracy
Metric Value
P3 10-fold CV Accuracy (Chosen Model) 1
# ===== Problem 4 =====
cat("\n--- Problem 4: Variable Importance ---\n")
## 
## --- Problem 4: Variable Importance ---
bank <- auto_load_csv("Bank_Modified.csv")
stopifnot(!is.null(bank))

bank <- bank %>% select(-any_of("ID")) %>% mutate(approval=as.factor(approval)) %>% drop_na()

model_full <- rpart(approval~., data=bank, method="class",
                    control=rpart.control(minsplit=10,maxdepth=20,cp=0.001))
vi <- sort(model_full$variable.importance, decreasing=TRUE)
knitr::kable(data.frame(Variable=names(vi),Importance=as.numeric(vi)),
             caption="Problem 4 – Variable Importance")
Problem 4 – Variable Importance
Variable Importance
X 215.114367
bool1 179.282437
ages 98.349246
cont4 85.217471
bool2 85.083592
cont3 71.832313
cont2 10.700405
credit.score 7.414835
cont1 6.726451
cont5 5.602469
cont6 4.189158
vars6 <- names(vi)[1:min(6,length(vi))]
form6 <- as.formula(paste("approval ~", paste(vars6, collapse=" + ")))

base_cv <- train(approval~., data=bank, method="rpart",
                 trControl=trainControl(method="cv",number=10),
                 tuneGrid=data.frame(cp=0.001))
top6_cv <- train(form6, data=bank, method="rpart",
                 trControl=trainControl(method="cv",number=10),
                 tuneGrid=data.frame(cp=0.001))

knitr::kable(data.frame(
  Metric=c("P4 CV Accuracy – All Vars","P4 CV Accuracy – Top 6"),
  Value=round(c(max(base_cv$results$Accuracy), max(top6_cv$results$Accuracy)),3)),
  caption="Problem 4 – Accuracy Comparison")
Problem 4 – Accuracy Comparison
Metric Value
P4 CV Accuracy – All Vars 0.933
P4 CV Accuracy – Top 6 0.957
par(mfrow=c(1,2))
rpart.plot(model_full, type=3, extra=104, under=TRUE, faclen=0, main="Full Variables")
model_top6 <- rpart(form6, data=bank, method="class",
                    control=rpart.control(minsplit=10,maxdepth=20,cp=0.001))
rpart.plot(model_top6, type=3, extra=104, under=TRUE, faclen=0, main="Top 6 Variables")

par(mfrow=c(1,1))