Assignment 2: Experimentation & Model Training

# ---- Packages ----
req <- c("tidyverse","data.table","caret","pROC",
         "rpart","rpart.plot","randomForest","adabag","MLmetrics", "stringr", "doParallel","ranger")
to_install <- req[!(req %in% installed.packages()[,1])]
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(req, library, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'data.table'
## 
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## 
## Loading required package: lattice
## 
## 
## Attaching package: 'caret'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## 
## Type 'citation("pROC")' for a citation.
## 
## 
## Attaching package: 'pROC'
## 
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## 
## 
## randomForest 4.7-1.2
## 
## Type rfNews() to see new features/changes/bug fixes.
## 
## 
## Attaching package: 'randomForest'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## 
## Loading required package: foreach
## 
## 
## Attaching package: 'foreach'
## 
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## 
## Loading required package: doParallel
## 
## Loading required package: iterators
## 
## Loading required package: parallel
## 
## 
## Attaching package: 'MLmetrics'
## 
## 
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## 
## 
## The following object is masked from 'package:base':
## 
##     Recall
## 
## 
## 
## Attaching package: 'ranger'
## 
## 
## The following object is masked from 'package:randomForest':
## 
##     importance

library(tidyverse)
library(stringr)
library(caret)
library(pROC)
library(MLmetrics)
library(rpart)
library(rpart.plot)
library(randomForest)
library(adabag)
library(doParallel)
library(ranger)

#  Outputs dir 
out_dir <- file.path(getwd(), "outputs")
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

#  Candidate data roots 
win_dirs <- c(
  "C:/Users/snaqwe/OneDrive/Desktop/Dat 622/bank+marketing",
  "C:/Users/abc/OneDrive/Desktop/Dat 622/bank+marketing"
)
env_dir  <- Sys.getenv("BANK_DATA_DIR", unset = NA_character_)
proj_dir <- file.path(getwd(), "data")
roots    <- unique(na.omit(c(win_dirs, env_dir, proj_dir, getwd())))

#  Helper: list CSVs 
list_csvs <- function(root) {
  if (!dir.exists(root)) return(character())
  list.files(root, pattern = "\\.csv$", full.names = TRUE, recursive = TRUE)
}

#  Prefer UCI bank filenames (make scorer return integer!) 
pref_order <- c("bank-additional-full.csv","bank-full.csv","bank-additional.csv","bank.csv")
score_name <- function(x) {
  b <- basename(x)
  hits <- which(sapply(pref_order, function(p) grepl(p, b, ignore.case = TRUE)))
  if (length(hits)) as.integer(min(hits)) else 999L
}

#  Base CSV reader with delimiter auto-detect (; vs ,) 
read_any_csv_base <- function(path) {
  stopifnot(is.character(path), length(path) == 1)
  if (!file.exists(path)) stop("File not found: ", path)
  first <- readLines(path, n = 1, warn = FALSE)
  semi  <- stringr::str_count(first, ";")
  comma <- stringr::str_count(first, ",")
  if (semi > comma) read.csv2(path, stringsAsFactors = FALSE) else read.csv(path, stringsAsFactors = FALSE)
}

#  Collect, order, read (FIXED vapply integer type) 
csvs <- unique(unlist(lapply(roots, list_csvs)))
if (!length(csvs)) stop("No CSV files found under:\n", paste(roots, collapse = "\n"))
scores <- vapply(csvs, score_name, FUN.VALUE = integer(1))
csvs_ordered <- csvs[order(scores, na.last = TRUE)]

eda_list <- lapply(csvs_ordered, function(f) read_any_csv_base(f) %>% tibble::as_tibble())
names(eda_list) <- basename(csvs_ordered)  # short, clean names

primary_path <- csvs_ordered[1]
message("Using primary dataset: ", primary_path)

## Using primary dataset: C:/Users/snaqw/OneDrive/Desktop/Data 622/bank+marketing/bank-additional/bank-additional/bank-additional-full.csv

dat <- eda_list[[1]]  # index first item directly to avoid name mismatches

#  EDA summary across all CSVs 
eda_summary <- tibble(
  file = names(eda_list),
  rows = vapply(eda_list, nrow, 1L),
  cols = vapply(eda_list, ncol, 1L),
  has_y = vapply(eda_list, function(d) "y" %in% names(d), TRUE)
)
print(eda_summary)

## # A tibble: 5 × 4
##   file                        rows  cols has_y
##   <chr>                      <int> <int> <lgl>
## 1 bank-additional-full.csv   41188    21 TRUE 
## 2 bank-full.csv              45211    17 TRUE 
## 3 bank-additional.csv         4119    21 TRUE 
## 4 bank.csv                    4521    17 TRUE 
## 5 rf_variable_importance.csv    34     2 FALSE

#  Class balance plots for any file with 'y' 
for (nm in names(eda_list)) {
  d <- eda_list[[nm]]
  if ("y" %in% names(d)) {
    d$y <- as.factor(d$y)
    p <- d %>% count(y) %>%
      ggplot(aes(y, n)) + geom_col() +
      labs(title = paste("Class Balance:", nm), x = "y", y = "Count")
    print(p)
  }
}

# ---- Target handling & quick correlation (primary dataset) ----
if (!("y" %in% names(dat))) names(dat)[ncol(dat)] <- "y"
dat$y <- as.factor(dat$y)
if ("yes" %in% levels(dat$y)) dat$y <- relevel(dat$y, ref = "yes") else
  dat$y <- relevel(dat$y, ref = levels(dat$y)[which.min(table(dat$y))])

print(head(dat))

## # A tibble: 6 × 21
##     age job    marital education default housing loan  contact month day_of_week
##   <int> <chr>  <chr>   <chr>     <chr>   <chr>   <chr> <chr>   <chr> <chr>      
## 1    56 house… married basic.4y  no      no      no    teleph… may   mon        
## 2    57 servi… married high.sch… unknown no      no    teleph… may   mon        
## 3    37 servi… married high.sch… no      yes     no    teleph… may   mon        
## 4    40 admin. married basic.6y  no      no      no    teleph… may   mon        
## 5    56 servi… married high.sch… no      no      yes   teleph… may   mon        
## 6    45 servi… married basic.9y  unknown no      no    teleph… may   mon        
## # ℹ 11 more variables: duration <int>, campaign <int>, pdays <int>,
## #   previous <int>, poutcome <chr>, emp.var.rate <dbl>, cons.price.idx <dbl>,
## #   cons.conf.idx <dbl>, euribor3m <dbl>, nr.employed <dbl>, y <fct>

str(dat)

## tibble [41,188 × 21] (S3: tbl_df/tbl/data.frame)
##  $ age           : int [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr [1:41188] "housemaid" "services" "services" "admin." ...
##  $ marital       : chr [1:41188] "married" "married" "married" "married" ...
##  $ education     : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr [1:41188] "no" "unknown" "no" "no" ...
##  $ housing       : chr [1:41188] "no" "no" "yes" "no" ...
##  $ loan          : chr [1:41188] "no" "no" "no" "no" ...
##  $ contact       : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:41188] "may" "may" "may" "may" ...
##  $ day_of_week   : chr [1:41188] "mon" "mon" "mon" "mon" ...
##  $ duration      : int [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num [1:41188] 94 94 94 94 94 ...
##  $ cons.conf.idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num [1:41188] 5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

p_bal <- dat %>% count(y) %>%
  ggplot(aes(y, n)) + geom_col() +
  labs(title="Class Balance (Primary Dataset)", x="y", y="Count")
ggsave(file.path(out_dir, "class_balance_primary.png"), p_bal, width = 6, height = 4, dpi = 150)
print(p_bal)

nums <- dat %>% dplyr::select(where(is.numeric))
if (ncol(nums) >= 2) {
  corr <- cor(nums, use = "pairwise.complete.obs")
  corr_df <- as.data.frame(as.table(corr))
  p_corr <- ggplot(corr_df, aes(Var1, Var2, fill = Freq)) +
    geom_tile() + scale_fill_viridis_c() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Numeric Correlation Heatmap", x = NULL, y = NULL, fill = "corr")
  ggsave(file.path(out_dir, "correlations.png"), p_corr, width = 7, height = 6, dpi = 150)
  print(p_corr)
} else {
  message("Not enough numeric features for a correlation heatmap.")
}

# ---- Train/Test split & one-hot encoding ----
set.seed(622)  # ensure reproducible split
idx <- caret::createDataPartition(dat$y, p = 0.8, list = FALSE)
train <- dat[idx, ]
test  <- dat[-idx, ]

dummies <- caret::dummyVars(y ~ ., data = train, fullRank = TRUE)
X_train <- data.frame(predict(dummies, newdata = train))

## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'y' is not a factor

X_test  <- data.frame(predict(dummies, newdata = test))

## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'y' is not a factor

y_train <- train$y
y_test  <- test$y
pos_lab <- levels(y_test)[1]

message("Train rows: ", nrow(X_train), " | Test rows: ", nrow(X_test), " | Encoded features: ", ncol(X_train))

## Train rows: 32951 | Test rows: 8237 | Encoded features: 53

# ---- Metrics & CV helpers ----
metrics_from <- function(prob_pos, pred_class, truth, positive = levels(truth)[1]) {
  cm  <- caret::confusionMatrix(pred_class, truth, positive = positive)
  auc <- tryCatch(as.numeric(pROC::roc(truth, prob_pos, levels = rev(levels(truth)))$auc),
                  error = function(e) NA_real_)
  f1  <- tryCatch(MLmetrics::F1_Score(pred_class, truth, positive = positive),
                  error = function(e) NA_real_)
  tibble(
    Accuracy  = as.numeric(cm$overall["Accuracy"]),
    Precision = as.numeric(cm$byClass["Precision"]),
    Recall    = as.numeric(cm$byClass["Recall"]),
    F1        = f1,
    AUC       = auc
  )
}

ctrl <- caret::trainControl(
  method = "cv", number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = "final"
)

roc_curves <- list()
roc_df <- function(truth, prob, label){
  r <- pROC::roc(truth, prob, levels = rev(levels(truth)))
  tibble(tpr = rev(r$sensitivities), fpr = rev(1 - r$specificities), model = label, auc = as.numeric(r$auc))
}
results <- tibble()

# ---- Align train/test matrices & remove NZV (prevents 'defaultyes' type errors) ----
colnames(X_train) <- make.names(colnames(X_train))
colnames(X_test)  <- make.names(colnames(X_test))
missing_in_test <- setdiff(colnames(X_train), colnames(X_test))
if (length(missing_in_test)) for (m in missing_in_test) X_test[[m]] <- 0
X_test <- X_test[, colnames(X_train), drop = FALSE]
nzv <- caret::nearZeroVar(X_train)
if (length(nzv)) {
  keep <- setdiff(seq_len(ncol(X_train)), nzv)
  X_train <- X_train[, keep, drop = FALSE]
  X_test  <- X_test[,  keep, drop = FALSE]
}

# --- DT-1: tune cp (pruning) ---
grid_dt1 <- expand.grid(cp = seq(0, 0.05, by = 0.005))
dt1 <- caret::train(
  x = X_train, y = y_train,
  method = "rpart",
  trControl = ctrl, metric = "ROC",
  tuneGrid = grid_dt1
)
dt1_prob <- predict(dt1, newdata = X_test, type = "prob")[, pos_lab]
dt1_pred <- predict(dt1, newdata = X_test)
res_dt1  <- metrics_from(dt1_prob, dt1_pred, y_test, pos_lab)

## Setting direction: controls < cases

# --- DT-2: tune maxdepth ---
grid_dt2 <- expand.grid(maxdepth = 1:15)
dt2 <- caret::train(
  x = X_train, y = y_train,
  method = "rpart2",
  trControl = ctrl, metric = "ROC",
  tuneGrid = grid_dt2
)
dt2_prob <- predict(dt2, newdata = X_test, type = "prob")[, pos_lab]
dt2_pred <- predict(dt2, newdata = X_test)
res_dt2  <- metrics_from(dt2_prob, dt2_pred, y_test, pos_lab)

## Setting direction: controls < cases

# ---- Random Forest (2 experiments) — direct ranger, no caret CV (stable & fast) ----
if (!requireNamespace("ranger", quietly = TRUE)) install.packages("ranger")
library(ranger)

y_train <- droplevels(y_train)
y_test  <- droplevels(y_test)
stopifnot(is.factor(y_train), nlevels(y_train) == 2)
pos_lab <- levels(y_test)[1]

tbl <- table(y_train)
cw  <- as.numeric(sum(tbl) / (length(tbl) * tbl)); names(cw) <- names(tbl)

num_threads <- max(1, parallel::detectCores() - 1)
p <- ncol(X_train)
mtry1 <- max(1, round(sqrt(p)))
mtry2 <- max(1, round(p/4))

# RF-1
set.seed(6231)
rf1 <- ranger::ranger(
  dependent.variable.name = "y",
  data  = cbind(X_train, y = y_train),
  probability    = TRUE,
  num.trees      = 200,
  mtry           = mtry1,
  min.node.size  = 5,
  max.depth      = 20,
  sample.fraction= 0.8,
  importance     = "impurity",
  class.weights  = cw,
  num.threads    = num_threads,
  seed           = 6231
)
rf1_pred_obj <- predict(rf1, data = X_test, type = "response")
rf1_prob     <- rf1_pred_obj$predictions[, pos_lab]
rf1_pred     <- factor(colnames(rf1_pred_obj$predictions)[max.col(rf1_pred_obj$predictions)], levels = levels(y_test))
res_rf1      <- metrics_from(rf1_prob, rf1_pred, y_test, pos_lab)

## Setting direction: controls < cases

# RF-2
set.seed(6232)
rf2 <- ranger::ranger(
  dependent.variable.name = "y",
  data  = cbind(X_train, y = y_train),
  probability    = TRUE,
  num.trees      = 400,
  mtry           = mtry2,
  min.node.size  = 5,
  max.depth      = 25,
  sample.fraction= 0.8,
  importance     = "impurity",
  class.weights  = cw,
  num.threads    = num_threads,
  seed           = 6232
)
rf2_pred_obj <- predict(rf2, data = X_test, type = "response")
rf2_prob     <- rf2_pred_obj$predictions[, pos_lab]
rf2_pred     <- factor(colnames(rf2_pred_obj$predictions)[max.col(rf2_pred_obj$predictions)], levels = levels(y_test))
res_rf2      <- metrics_from(rf2_prob, rf2_pred, y_test, pos_lab)

## Setting direction: controls < cases

# ---- Aggregate results so far ----
results <- dplyr::bind_rows(
  results,
  tibble::tibble(ExperimentID="EXP-DT-1", Algorithm="Decision Tree (rpart)",  Params=paste0("cp=", dt1$bestTune$cp), res_dt1),
  tibble::tibble(ExperimentID="EXP-DT-2", Algorithm="Decision Tree (rpart2)", Params=paste0("maxdepth=", dt2$bestTune$maxdepth), res_dt2),
  tibble::tibble(ExperimentID="EXP-RF-1", Algorithm="Random Forest (ranger direct)",
                 Params=paste0("num.trees=200, mtry=", mtry1, ", min.node.size=5, max.depth=20"), res_rf1),
  tibble::tibble(ExperimentID="EXP-RF-2", Algorithm="Random Forest (ranger direct)",
                 Params=paste0("num.trees=400, mtry=", mtry2, ", min.node.size=5, max.depth=25"), res_rf2)
)

# ---- Variable importance from the better RF ----
best_rf_obj <- if (res_rf1$AUC >= res_rf2$AUC) rf1 else rf2
vi_vec <- ranger::importance(best_rf_obj)
vi_df  <- tibble::tibble(Feature = names(vi_vec), Importance = as.numeric(vi_vec)) %>%
  dplyr::arrange(dplyr::desc(Importance))
write.csv(vi_df, file.path(out_dir, "rf_variable_importance.csv"), row.names = FALSE)
print(utils::head(vi_df, 12))

## # A tibble: 12 × 2
##    Feature                    Importance
##    <chr>                           <dbl>
##  1 duration                        4003.
##  2 euribor3m                       1363.
##  3 age                              998.
##  4 nr.employed                      916.
##  5 cons.conf.idx                    419.
##  6 campaign                         414.
##  7 emp.var.rate                     363.
##  8 cons.price.idx                   313.
##  9 previous                         186.
## 10 housingyes                       170.
## 11 educationuniversity.degree       141.
## 12 contacttelephone                 134.

# ---- ROC traces list (build later if you plot) ----
roc_curves[["EXP-DT-1"]] <- roc_df(y_test, dt1_prob, "DT-1")

## Setting direction: controls < cases

roc_curves[["EXP-DT-2"]] <- roc_df(y_test, dt2_prob, "DT-2")

## Setting direction: controls < cases

roc_curves[["EXP-RF-1"]] <- roc_df(y_test, rf1_prob, "RF-1")

## Setting direction: controls < cases

roc_curves[["EXP-RF-2"]] <- roc_df(y_test, rf2_prob, "RF-2")

## Setting direction: controls < cases

# (Add your AdaBoost experiments next and then finalize results/ROC plots as before.)

Assignment 2: Experimentation & Model Training

Said Naqwe

2025-10-26