Decision Trees Homework

suppressPackageStartupMessages({
  library(tidyverse)
  library(caret)
  library(rpart)
  library(rpart.plot)
})

set.seed(42)

auto_load_csv <- function(fname) {
  paths <- c(
    fname,
    file.path(getwd(), fname),
    file.path("~","Downloads",fname),
    file.path("/mnt/data",fname)
  )
  for (p in paths) if (file.exists(p)) return(read.csv(p))
  return(NULL)
}

cat("Sanity check: running at", as.character(Sys.time()), "\n")

## Sanity check: running at 2025-10-23 11:40:04.492479

# ===== Problem 1 =====
cat("\n--- Problem 1: Breast Cancer ---\n")

## 
## --- Problem 1: Breast Cancer ---

if (exists("BreastCancerData")) {
  bc <- BreastCancerData
} else {
  bc <- auto_load_csv("breast_cancer_updated.csv")
}
stopifnot(!is.null(bc))

bc <- bc %>% select(-any_of("IDNumber")) %>% mutate(Class = as.factor(Class)) %>% na.omit()
cat("Rows after cleaning:", nrow(bc), "\n")

## Rows after cleaning: 683

ctrl <- trainControl(method="cv", number=10)
fit  <- train(Class~., data=bc, method="rpart", trControl=ctrl, tuneLength=10)
final_tree <- rpart(Class~., data=bc, method="class",
                    control=rpart.control(cp=fit$bestTune$cp))

rpart.plot(final_tree, type=3, extra=104, under=TRUE, faclen=0,
           main="Problem 1 – Breast Cancer Decision Tree")

knitr::kable(data.frame(Metric="P1 10-fold CV Accuracy",
                        Value=round(max(fit$results$Accuracy),3)),
             caption="Problem 1 – Cross-Validation Accuracy")

Problem 1 – Cross-Validation Accuracy
Metric	Value
P1 10-fold CV Accuracy	0.956

paths_to_rules <- function(tree){
  fr <- tree$frame
  leaves <- row.names(fr)[fr$var=="<leaf>"]
  pths <- path.rpart(tree, leaves, print.it=FALSE)
  out <- character(length(pths))
  for (i in seq_along(pths)) {
    conds <- pths[[i]][-1]
    pred <- as.character(fr[leaves[i],"yval2"][,1])
    if (is.null(pred) || length(pred)==0)
      pred <- attr(tree,"ylevels")[ fr[leaves[i],"yval"] ]
    out[i] <- paste0("IF ", paste(conds,collapse=" AND "), " THEN Class = ", pred)
  }
  out
}
rules <- paths_to_rules(final_tree)
knitr::kable(data.frame(Rule=rules), col.names="IF–THEN Rules",
             caption="Problem 1 – Tree Rules")

Problem 1 – Tree Rules
IF–THEN Rules
IF UniformCellSize< 2.5 AND BareNuclei< 5.5 THEN Class = 1
IF UniformCellSize< 2.5 AND BareNuclei>=5.5 THEN Class = 2
IF UniformCellSize>=2.5 AND UniformCellShape< 2.5 AND BlandChromatin< 3.5 THEN Class = 1
IF UniformCellSize>=2.5 AND UniformCellShape< 2.5 AND BlandChromatin>=3.5 THEN Class = 2
IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize< 4.5 AND BareNuclei< 2.5 THEN Class = 1
IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize< 4.5 AND BareNuclei>=2.5 THEN Class = 2
IF UniformCellSize>=2.5 AND UniformCellShape>=2.5 AND UniformCellSize>=4.5 THEN Class = 2

# ===== Problem 2 =====
cat("\n--- Problem 2: storms ---\n")

## 
## --- Problem 2: storms ---

data("storms", package="dplyr")
storms_df <- storms %>%
  mutate(category=as.factor(category)) %>%
  select(-name,-status) %>%
  drop_na()

ctrl2 <- trainControl(method="cv", number=10)
fit2  <- train(category~., data=storms_df, method="rpart", trControl=ctrl2,
               tuneGrid=data.frame(cp=0),
               control=rpart.control(maxdepth=2, minsplit=5, minbucket=3, cp=0))
cv2_acc <- max(fit2$results$Accuracy)

split_idx <- createDataPartition(storms_df$category, p=0.8, list=FALSE)
tr <- storms_df[split_idx,]; te <- storms_df[-split_idx,]

model2 <- rpart(category~., data=tr, method="class",
                control=rpart.control(maxdepth=2, minsplit=5, minbucket=3, cp=0))

cm_tr <- confusionMatrix(predict(model2,tr,type="class"), tr$category)
cm_te <- confusionMatrix(predict(model2,te,type="class"), te$category)

knitr::kable(data.frame(Metric="P2 10-fold CV Accuracy",Value=round(cv2_acc,3)),
             caption="Problem 2 – Cross-Validation Accuracy")

Problem 2 – Cross-Validation Accuracy
Metric	Value
P2 10-fold CV Accuracy	0.836

knitr::kable(as.data.frame(cm_tr$table),caption="Problem 2 – Confusion Matrix (Train)")

Problem 2 – Confusion Matrix (Train)
Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	233
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	52
5	5	0

knitr::kable(as.data.frame(cm_te$table),caption="Problem 2 – Confusion Matrix (Test)")

Problem 2 – Confusion Matrix (Test)
Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	58
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	13
5	5	0

# ===== Problem 3 =====
cat("\n--- Problem 3: storms extension ---\n")

## 
## --- Problem 3: storms extension ---

set.seed(123)
split_idx3 <- createDataPartition(storms_df$category, p=0.8, list=FALSE)
tr3 <- storms_df[split_idx3,]; te3 <- storms_df[-split_idx3,]
cat("Train rows:", nrow(tr3), "Test rows:", nrow(te3), "\n")

## Train rows: 1738 Test rows: 432

param_grid <- expand.grid(maxdepth=c(1,2,3,4,5,6),
                          minsplit=c(2,5,10),
                          minbucket=c(1,3,5),
                          cp=c(0,0.001,0.01),
                          KEEP.OUT.ATTRS=FALSE) %>% as_tibble()
if (nrow(param_grid)>=24) param_grid <- dplyr::sample_n(param_grid,24)

results <- purrr::map_dfr(1:nrow(param_grid), function(i){
  pg <- param_grid[i,]
  fit <- rpart(category~., data=tr3, method="class",
               control=rpart.control(maxdepth=pg$maxdepth,
                                     minsplit=pg$minsplit,
                                     minbucket=pg$minbucket,
                                     cp=pg$cp))
  tibble(maxdepth=pg$maxdepth, minsplit=pg$minsplit, minbucket=pg$minbucket, cp=pg$cp,
         nodes=nrow(fit$frame),
         acc_train=mean(predict(fit,tr3,type="class")==tr3$category),
         acc_test=mean(predict(fit,te3,type="class")==te3$category))
})

knitr::kable(results, digits=3, caption="Problem 3 – Parameter Grid Results")

Problem 3 – Parameter Grid Results
maxdepth	minsplit	minbucket	cp	nodes	acc_train	acc_test
2	5	1	0.000	5	0.836	0.836
1	2	1	0.001	3	0.699	0.699
2	2	5	0.001	5	0.836	0.836
1	2	1	0.010	3	0.699	0.699
6	5	5	0.000	9	1.000	1.000
4	2	3	0.010	9	1.000	1.000
5	2	3	0.001	9	1.000	1.000
6	5	5	0.001	9	1.000	1.000
5	5	3	0.001	9	1.000	1.000
1	10	3	0.000	3	0.699	0.699
6	10	5	0.001	9	1.000	1.000
3	10	3	0.000	7	0.970	0.970
6	10	1	0.001	9	1.000	1.000
4	10	5	0.010	9	1.000	1.000
2	2	1	0.001	5	0.836	0.836
4	2	5	0.000	9	1.000	1.000
3	5	1	0.001	7	0.970	0.970
2	5	5	0.010	5	0.836	0.836
6	2	5	0.010	9	1.000	1.000
4	5	1	0.000	9	1.000	1.000
3	10	1	0.000	7	0.970	0.970
6	2	5	0.000	9	1.000	1.000
2	10	5	0.001	5	0.836	0.836
5	2	5	0.010	9	1.000	1.000

plt <- results %>%
  tidyr::pivot_longer(cols=c(acc_train,acc_test), names_to="set", values_to="acc") %>%
  ggplot(aes(x=nodes,y=acc,group=set))+
  geom_point()+geom_line()+
  labs(title="Accuracy vs Tree Size",x="# of nodes",y="Accuracy",caption="Train vs Test")
print(plt)

res2 <- results %>% mutate(gen_gap=acc_train-acc_test,
                           score=acc_test-0.15*pmax(gen_gap,0))
best <- dplyr::slice_max(res2, order_by=score, n=1)
knitr::kable(best, digits=3, caption="Problem 3 – Chosen Parameters")

Problem 3 – Chosen Parameters
maxdepth	minsplit	minbucket	cp	nodes	acc_train	acc_test	score
6	5	5	0.000	9	1	1	1
4	2	3	0.010	9	1	1	1
5	2	3	0.001	9	1	1	1
6	5	5	0.001	9	1	1	1
5	5	3	0.001	9	1	1	1
6	10	5	0.001	9	1	1	1
6	10	1	0.001	9	1	1	1
4	10	5	0.010	9	1	1	1
4	2	5	0.000	9	1	1	1
6	2	5	0.010	9	1	1	1
4	5	1	0.000	9	1	1	1
6	2	5	0.000	9	1	1	1
5	2	5	0.010	9	1	1	1

md <- as.integer(best$maxdepth[1])
ms <- as.integer(best$minsplit[1])
mb <- as.integer(best$minbucket[1])
cpv <- as.numeric(best$cp[1])

best_fit <- rpart(category~., data=tr3, method="class",
                  control=rpart.control(maxdepth=md,minsplit=ms,minbucket=mb,cp=cpv))
cm_best_tr <- confusionMatrix(predict(best_fit,tr3,type="class"),tr3$category)
cm_best_te <- confusionMatrix(predict(best_fit,te3,type="class"),te3$category)

knitr::kable(as.data.frame(cm_best_tr$table),caption="Problem 3 – Confusion Matrix (Train)")

Problem 3 – Confusion Matrix (Train)
Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	233
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	52

knitr::kable(as.data.frame(cm_best_te$table),caption="Problem 3 – Confusion Matrix (Test)")

Problem 3 – Confusion Matrix (Test)
Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	58
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	13

cv <- train(category~., data=storms_df, method="rpart",
            trControl=trainControl(method="cv",number=10),
            tuneGrid=data.frame(cp=cpv),
            control=rpart.control(maxdepth=md,minsplit=ms,minbucket=mb,cp=cpv))
knitr::kable(data.frame(Metric="P3 10-fold CV Accuracy (Chosen Model)",
                        Value=round(max(cv$results$Accuracy),3)),
             caption="Problem 3 – Final Cross-Validation Accuracy")

Problem 3 – Final Cross-Validation Accuracy
Metric	Value
P3 10-fold CV Accuracy (Chosen Model)	1

# ===== Problem 4 =====
cat("\n--- Problem 4: Variable Importance ---\n")

## 
## --- Problem 4: Variable Importance ---

bank <- auto_load_csv("Bank_Modified.csv")
stopifnot(!is.null(bank))

bank <- bank %>% select(-any_of("ID")) %>% mutate(approval=as.factor(approval)) %>% drop_na()

model_full <- rpart(approval~., data=bank, method="class",
                    control=rpart.control(minsplit=10,maxdepth=20,cp=0.001))
vi <- sort(model_full$variable.importance, decreasing=TRUE)
knitr::kable(data.frame(Variable=names(vi),Importance=as.numeric(vi)),
             caption="Problem 4 – Variable Importance")

Problem 4 – Variable Importance
Variable	Importance
X	215.114367
bool1	179.282437
ages	98.349246
cont4	85.217471
bool2	85.083592
cont3	71.832313
cont2	10.700405
credit.score	7.414835
cont1	6.726451
cont5	5.602469
cont6	4.189158

vars6 <- names(vi)[1:min(6,length(vi))]
form6 <- as.formula(paste("approval ~", paste(vars6, collapse=" + ")))

base_cv <- train(approval~., data=bank, method="rpart",
                 trControl=trainControl(method="cv",number=10),
                 tuneGrid=data.frame(cp=0.001))
top6_cv <- train(form6, data=bank, method="rpart",
                 trControl=trainControl(method="cv",number=10),
                 tuneGrid=data.frame(cp=0.001))

knitr::kable(data.frame(
  Metric=c("P4 CV Accuracy – All Vars","P4 CV Accuracy – Top 6"),
  Value=round(c(max(base_cv$results$Accuracy), max(top6_cv$results$Accuracy)),3)),
  caption="Problem 4 – Accuracy Comparison")

Problem 4 – Accuracy Comparison
Metric	Value
P4 CV Accuracy – All Vars	0.933
P4 CV Accuracy – Top 6	0.957

par(mfrow=c(1,2))
rpart.plot(model_full, type=3, extra=104, under=TRUE, faclen=0, main="Full Variables")
model_top6 <- rpart(form6, data=bank, method="class",
                    control=rpart.control(minsplit=10,maxdepth=20,cp=0.001))
rpart.plot(model_top6, type=3, extra=104, under=TRUE, faclen=0, main="Top 6 Variables")

par(mfrow=c(1,1))

Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	233
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	52
5	5	0

Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	58
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	13
5	5	0

Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	233
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	52

Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	58
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	13

Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	233
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	52
5	5	0

Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	58
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	13
5	5	0

Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	233
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	52

Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	58
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	13

Decision Trees Homework

Oliver Sikora

Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	233
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	52
5	5	0

Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	0
4	3	58
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	13
5	5	0

Prediction	Reference	Freq
1	1	867
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	348
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	233
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	238
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	52

Prediction	Reference	Freq
1	1	216
2	1	0
3	1	0
4	1	0
5	1	0
1	2	0
2	2	86
3	2	0
4	2	0
5	2	0
1	3	0
2	3	0
3	3	58
4	3	0
5	3	0
1	4	0
2	4	0
3	4	0
4	4	59
5	4	0
1	5	0
2	5	0
3	5	0
4	5	0
5	5	13