# ---- Packages ----
req <- c("tidyverse","data.table","caret","pROC",
"rpart","rpart.plot","randomForest","adabag","MLmetrics", "stringr", "doParallel","ranger")
to_install <- req[!(req %in% installed.packages()[,1])]
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(req, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'data.table'
##
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
##
## The following object is masked from 'package:purrr':
##
## transpose
##
##
## Loading required package: lattice
##
##
## Attaching package: 'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
## Type 'citation("pROC")' for a citation.
##
##
## Attaching package: 'pROC'
##
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##
##
## randomForest 4.7-1.2
##
## Type rfNews() to see new features/changes/bug fixes.
##
##
## Attaching package: 'randomForest'
##
##
## The following object is masked from 'package:dplyr':
##
## combine
##
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
##
## Loading required package: foreach
##
##
## Attaching package: 'foreach'
##
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
##
## Loading required package: doParallel
##
## Loading required package: iterators
##
## Loading required package: parallel
##
##
## Attaching package: 'MLmetrics'
##
##
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
##
##
## The following object is masked from 'package:base':
##
## Recall
##
##
##
## Attaching package: 'ranger'
##
##
## The following object is masked from 'package:randomForest':
##
## importance
library(tidyverse)
library(stringr)
library(caret)
library(pROC)
library(MLmetrics)
library(rpart)
library(rpart.plot)
library(randomForest)
library(adabag)
library(doParallel)
library(ranger)
# Outputs dir
out_dir <- file.path(getwd(), "outputs")
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
# Candidate data roots
win_dirs <- c(
"C:/Users/snaqwe/OneDrive/Desktop/Dat 622/bank+marketing",
"C:/Users/abc/OneDrive/Desktop/Dat 622/bank+marketing"
)
env_dir <- Sys.getenv("BANK_DATA_DIR", unset = NA_character_)
proj_dir <- file.path(getwd(), "data")
roots <- unique(na.omit(c(win_dirs, env_dir, proj_dir, getwd())))
# Helper: list CSVs
list_csvs <- function(root) {
if (!dir.exists(root)) return(character())
list.files(root, pattern = "\\.csv$", full.names = TRUE, recursive = TRUE)
}
# Prefer UCI bank filenames (make scorer return integer!)
pref_order <- c("bank-additional-full.csv","bank-full.csv","bank-additional.csv","bank.csv")
score_name <- function(x) {
b <- basename(x)
hits <- which(sapply(pref_order, function(p) grepl(p, b, ignore.case = TRUE)))
if (length(hits)) as.integer(min(hits)) else 999L
}
# Base CSV reader with delimiter auto-detect (; vs ,)
read_any_csv_base <- function(path) {
stopifnot(is.character(path), length(path) == 1)
if (!file.exists(path)) stop("File not found: ", path)
first <- readLines(path, n = 1, warn = FALSE)
semi <- stringr::str_count(first, ";")
comma <- stringr::str_count(first, ",")
if (semi > comma) read.csv2(path, stringsAsFactors = FALSE) else read.csv(path, stringsAsFactors = FALSE)
}
# Collect, order, read (FIXED vapply integer type)
csvs <- unique(unlist(lapply(roots, list_csvs)))
if (!length(csvs)) stop("No CSV files found under:\n", paste(roots, collapse = "\n"))
scores <- vapply(csvs, score_name, FUN.VALUE = integer(1))
csvs_ordered <- csvs[order(scores, na.last = TRUE)]
eda_list <- lapply(csvs_ordered, function(f) read_any_csv_base(f) %>% tibble::as_tibble())
names(eda_list) <- basename(csvs_ordered) # short, clean names
primary_path <- csvs_ordered[1]
message("Using primary dataset: ", primary_path)
## Using primary dataset: C:/Users/snaqw/OneDrive/Desktop/Data 622/bank+marketing/bank-additional/bank-additional/bank-additional-full.csv
dat <- eda_list[[1]] # index first item directly to avoid name mismatches
# EDA summary across all CSVs
eda_summary <- tibble(
file = names(eda_list),
rows = vapply(eda_list, nrow, 1L),
cols = vapply(eda_list, ncol, 1L),
has_y = vapply(eda_list, function(d) "y" %in% names(d), TRUE)
)
print(eda_summary)
## # A tibble: 5 × 4
## file rows cols has_y
## <chr> <int> <int> <lgl>
## 1 bank-additional-full.csv 41188 21 TRUE
## 2 bank-full.csv 45211 17 TRUE
## 3 bank-additional.csv 4119 21 TRUE
## 4 bank.csv 4521 17 TRUE
## 5 rf_variable_importance.csv 34 2 FALSE
# Class balance plots for any file with 'y'
for (nm in names(eda_list)) {
d <- eda_list[[nm]]
if ("y" %in% names(d)) {
d$y <- as.factor(d$y)
p <- d %>% count(y) %>%
ggplot(aes(y, n)) + geom_col() +
labs(title = paste("Class Balance:", nm), x = "y", y = "Count")
print(p)
}
}




# ---- Target handling & quick correlation (primary dataset) ----
if (!("y" %in% names(dat))) names(dat)[ncol(dat)] <- "y"
dat$y <- as.factor(dat$y)
if ("yes" %in% levels(dat$y)) dat$y <- relevel(dat$y, ref = "yes") else
dat$y <- relevel(dat$y, ref = levels(dat$y)[which.min(table(dat$y))])
print(head(dat))
## # A tibble: 6 × 21
## age job marital education default housing loan contact month day_of_week
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 56 house… married basic.4y no no no teleph… may mon
## 2 57 servi… married high.sch… unknown no no teleph… may mon
## 3 37 servi… married high.sch… no yes no teleph… may mon
## 4 40 admin. married basic.6y no no no teleph… may mon
## 5 56 servi… married high.sch… no no yes teleph… may mon
## 6 45 servi… married basic.9y unknown no no teleph… may mon
## # ℹ 11 more variables: duration <int>, campaign <int>, pdays <int>,
## # previous <int>, poutcome <chr>, emp.var.rate <dbl>, cons.price.idx <dbl>,
## # cons.conf.idx <dbl>, euribor3m <dbl>, nr.employed <dbl>, y <fct>
str(dat)
## tibble [41,188 × 21] (S3: tbl_df/tbl/data.frame)
## $ age : int [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr [1:41188] "housemaid" "services" "services" "admin." ...
## $ marital : chr [1:41188] "married" "married" "married" "married" ...
## $ education : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr [1:41188] "no" "unknown" "no" "no" ...
## $ housing : chr [1:41188] "no" "no" "yes" "no" ...
## $ loan : chr [1:41188] "no" "no" "no" "no" ...
## $ contact : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr [1:41188] "may" "may" "may" "may" ...
## $ day_of_week : chr [1:41188] "mon" "mon" "mon" "mon" ...
## $ duration : int [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num [1:41188] 94 94 94 94 94 ...
## $ cons.conf.idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num [1:41188] 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
p_bal <- dat %>% count(y) %>%
ggplot(aes(y, n)) + geom_col() +
labs(title="Class Balance (Primary Dataset)", x="y", y="Count")
ggsave(file.path(out_dir, "class_balance_primary.png"), p_bal, width = 6, height = 4, dpi = 150)
print(p_bal)

nums <- dat %>% dplyr::select(where(is.numeric))
if (ncol(nums) >= 2) {
corr <- cor(nums, use = "pairwise.complete.obs")
corr_df <- as.data.frame(as.table(corr))
p_corr <- ggplot(corr_df, aes(Var1, Var2, fill = Freq)) +
geom_tile() + scale_fill_viridis_c() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Numeric Correlation Heatmap", x = NULL, y = NULL, fill = "corr")
ggsave(file.path(out_dir, "correlations.png"), p_corr, width = 7, height = 6, dpi = 150)
print(p_corr)
} else {
message("Not enough numeric features for a correlation heatmap.")
}

# ---- Train/Test split & one-hot encoding ----
set.seed(622) # ensure reproducible split
idx <- caret::createDataPartition(dat$y, p = 0.8, list = FALSE)
train <- dat[idx, ]
test <- dat[-idx, ]
dummies <- caret::dummyVars(y ~ ., data = train, fullRank = TRUE)
X_train <- data.frame(predict(dummies, newdata = train))
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'y' is not a factor
X_test <- data.frame(predict(dummies, newdata = test))
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'y' is not a factor
y_train <- train$y
y_test <- test$y
pos_lab <- levels(y_test)[1]
message("Train rows: ", nrow(X_train), " | Test rows: ", nrow(X_test), " | Encoded features: ", ncol(X_train))
## Train rows: 32951 | Test rows: 8237 | Encoded features: 53
# ---- Metrics & CV helpers ----
metrics_from <- function(prob_pos, pred_class, truth, positive = levels(truth)[1]) {
cm <- caret::confusionMatrix(pred_class, truth, positive = positive)
auc <- tryCatch(as.numeric(pROC::roc(truth, prob_pos, levels = rev(levels(truth)))$auc),
error = function(e) NA_real_)
f1 <- tryCatch(MLmetrics::F1_Score(pred_class, truth, positive = positive),
error = function(e) NA_real_)
tibble(
Accuracy = as.numeric(cm$overall["Accuracy"]),
Precision = as.numeric(cm$byClass["Precision"]),
Recall = as.numeric(cm$byClass["Recall"]),
F1 = f1,
AUC = auc
)
}
ctrl <- caret::trainControl(
method = "cv", number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
roc_curves <- list()
roc_df <- function(truth, prob, label){
r <- pROC::roc(truth, prob, levels = rev(levels(truth)))
tibble(tpr = rev(r$sensitivities), fpr = rev(1 - r$specificities), model = label, auc = as.numeric(r$auc))
}
results <- tibble()
# ---- Align train/test matrices & remove NZV (prevents 'defaultyes' type errors) ----
colnames(X_train) <- make.names(colnames(X_train))
colnames(X_test) <- make.names(colnames(X_test))
missing_in_test <- setdiff(colnames(X_train), colnames(X_test))
if (length(missing_in_test)) for (m in missing_in_test) X_test[[m]] <- 0
X_test <- X_test[, colnames(X_train), drop = FALSE]
nzv <- caret::nearZeroVar(X_train)
if (length(nzv)) {
keep <- setdiff(seq_len(ncol(X_train)), nzv)
X_train <- X_train[, keep, drop = FALSE]
X_test <- X_test[, keep, drop = FALSE]
}
# --- DT-1: tune cp (pruning) ---
grid_dt1 <- expand.grid(cp = seq(0, 0.05, by = 0.005))
dt1 <- caret::train(
x = X_train, y = y_train,
method = "rpart",
trControl = ctrl, metric = "ROC",
tuneGrid = grid_dt1
)
dt1_prob <- predict(dt1, newdata = X_test, type = "prob")[, pos_lab]
dt1_pred <- predict(dt1, newdata = X_test)
res_dt1 <- metrics_from(dt1_prob, dt1_pred, y_test, pos_lab)
## Setting direction: controls < cases
# --- DT-2: tune maxdepth ---
grid_dt2 <- expand.grid(maxdepth = 1:15)
dt2 <- caret::train(
x = X_train, y = y_train,
method = "rpart2",
trControl = ctrl, metric = "ROC",
tuneGrid = grid_dt2
)
dt2_prob <- predict(dt2, newdata = X_test, type = "prob")[, pos_lab]
dt2_pred <- predict(dt2, newdata = X_test)
res_dt2 <- metrics_from(dt2_prob, dt2_pred, y_test, pos_lab)
## Setting direction: controls < cases
# ---- Random Forest (2 experiments) — direct ranger, no caret CV (stable & fast) ----
if (!requireNamespace("ranger", quietly = TRUE)) install.packages("ranger")
library(ranger)
y_train <- droplevels(y_train)
y_test <- droplevels(y_test)
stopifnot(is.factor(y_train), nlevels(y_train) == 2)
pos_lab <- levels(y_test)[1]
tbl <- table(y_train)
cw <- as.numeric(sum(tbl) / (length(tbl) * tbl)); names(cw) <- names(tbl)
num_threads <- max(1, parallel::detectCores() - 1)
p <- ncol(X_train)
mtry1 <- max(1, round(sqrt(p)))
mtry2 <- max(1, round(p/4))
# RF-1
set.seed(6231)
rf1 <- ranger::ranger(
dependent.variable.name = "y",
data = cbind(X_train, y = y_train),
probability = TRUE,
num.trees = 200,
mtry = mtry1,
min.node.size = 5,
max.depth = 20,
sample.fraction= 0.8,
importance = "impurity",
class.weights = cw,
num.threads = num_threads,
seed = 6231
)
rf1_pred_obj <- predict(rf1, data = X_test, type = "response")
rf1_prob <- rf1_pred_obj$predictions[, pos_lab]
rf1_pred <- factor(colnames(rf1_pred_obj$predictions)[max.col(rf1_pred_obj$predictions)], levels = levels(y_test))
res_rf1 <- metrics_from(rf1_prob, rf1_pred, y_test, pos_lab)
## Setting direction: controls < cases
# RF-2
set.seed(6232)
rf2 <- ranger::ranger(
dependent.variable.name = "y",
data = cbind(X_train, y = y_train),
probability = TRUE,
num.trees = 400,
mtry = mtry2,
min.node.size = 5,
max.depth = 25,
sample.fraction= 0.8,
importance = "impurity",
class.weights = cw,
num.threads = num_threads,
seed = 6232
)
rf2_pred_obj <- predict(rf2, data = X_test, type = "response")
rf2_prob <- rf2_pred_obj$predictions[, pos_lab]
rf2_pred <- factor(colnames(rf2_pred_obj$predictions)[max.col(rf2_pred_obj$predictions)], levels = levels(y_test))
res_rf2 <- metrics_from(rf2_prob, rf2_pred, y_test, pos_lab)
## Setting direction: controls < cases
# ---- Aggregate results so far ----
results <- dplyr::bind_rows(
results,
tibble::tibble(ExperimentID="EXP-DT-1", Algorithm="Decision Tree (rpart)", Params=paste0("cp=", dt1$bestTune$cp), res_dt1),
tibble::tibble(ExperimentID="EXP-DT-2", Algorithm="Decision Tree (rpart2)", Params=paste0("maxdepth=", dt2$bestTune$maxdepth), res_dt2),
tibble::tibble(ExperimentID="EXP-RF-1", Algorithm="Random Forest (ranger direct)",
Params=paste0("num.trees=200, mtry=", mtry1, ", min.node.size=5, max.depth=20"), res_rf1),
tibble::tibble(ExperimentID="EXP-RF-2", Algorithm="Random Forest (ranger direct)",
Params=paste0("num.trees=400, mtry=", mtry2, ", min.node.size=5, max.depth=25"), res_rf2)
)
# ---- Variable importance from the better RF ----
best_rf_obj <- if (res_rf1$AUC >= res_rf2$AUC) rf1 else rf2
vi_vec <- ranger::importance(best_rf_obj)
vi_df <- tibble::tibble(Feature = names(vi_vec), Importance = as.numeric(vi_vec)) %>%
dplyr::arrange(dplyr::desc(Importance))
write.csv(vi_df, file.path(out_dir, "rf_variable_importance.csv"), row.names = FALSE)
print(utils::head(vi_df, 12))
## # A tibble: 12 × 2
## Feature Importance
## <chr> <dbl>
## 1 duration 4003.
## 2 euribor3m 1363.
## 3 age 998.
## 4 nr.employed 916.
## 5 cons.conf.idx 419.
## 6 campaign 414.
## 7 emp.var.rate 363.
## 8 cons.price.idx 313.
## 9 previous 186.
## 10 housingyes 170.
## 11 educationuniversity.degree 141.
## 12 contacttelephone 134.
# ---- ROC traces list (build later if you plot) ----
roc_curves[["EXP-DT-1"]] <- roc_df(y_test, dt1_prob, "DT-1")
## Setting direction: controls < cases
roc_curves[["EXP-DT-2"]] <- roc_df(y_test, dt2_prob, "DT-2")
## Setting direction: controls < cases
roc_curves[["EXP-RF-1"]] <- roc_df(y_test, rf1_prob, "RF-1")
## Setting direction: controls < cases
roc_curves[["EXP-RF-2"]] <- roc_df(y_test, rf2_prob, "RF-2")
## Setting direction: controls < cases
# (Add your AdaBoost experiments next and then finalize results/ROC plots as before.)