Assignment 3

Daniel DeBonis

Importing and Preprocessing Data from Previous Assignments

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.5.2
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(themis)
## Warning: package 'themis' was built under R version 4.5.2
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
library(PRROC)
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
set.seed(24601)
data <- read.csv('https://raw.githubusercontent.com/ddebonis47/classwork/refs/heads/main/bank-full.csv', sep = ';', stringsAsFactors = TRUE)
# Preprocessing carried over from the previous assignment
median_prev <- median(data$previous, na.rm = TRUE)
# Replace the single extreme outlier (previous == 275) with the median
data$previous <- ifelse(data$previous == 275, median_prev, data$previous)
# pdays codes "never previously contacted" as -1; recode any NAs to match
data$pdays <- as.numeric(data$pdays)
data$pdays[is.na(data$pdays)] <- -1
# Treat "unknown" as missing in the original categorical predictors
data <- data |>
   mutate(across(c(poutcome, job, education, contact), ~ na_if(.x, "unknown")))
data$y <- factor(data$y, levels = c("no", "yes"))
### Changed the coding of the date variables based on feedback
data <- data |>
  mutate(
    month_factor = factor(month, levels = tolower(month.abb), ordered = TRUE),
    
    # Assign quarters based on month
    quarter = case_when(
      month_factor %in% c("jan","feb","mar") ~ "Q1",
      month_factor %in% c("apr","may","jun") ~ "Q2",
      month_factor %in% c("jul","aug","sep") ~ "Q3",
      month_factor %in% c("oct","nov","dec") ~ "Q4"
    ),
    
    # Assign seasons
    season = case_when(
      month_factor %in% c("dec","jan","feb") ~ "Winter",
      month_factor %in% c("mar","apr","may") ~ "Spring",
      month_factor %in% c("jun","jul","aug") ~ "Summer",
      month_factor %in% c("sep","oct","nov") ~ "Fall"
    ),
    
    # Bucket day into early/mid/late month
    day_group = case_when(
      day <= 10 ~ "Early",
      day <= 20 ~ "Mid",
      TRUE ~ "Late"
    )
  )
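
A quick sanity check (not run here) confirms that every month mapped to a quarter and a season; an NA column in either table would flag a month label outside tolower(month.abb).

# Cross-tabulate the derived fields; useNA surfaces any mapping gaps
table(data$month_factor, data$quarter, useNA = "ifany")
table(data$month_factor, data$season, useNA = "ifany")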
### Ensure duration is removed (it is only known after the call ends, so it would leak the outcome)
data <- data |>
  select(-duration)
# Engineered contact-intensity features
data$total_contacts <- data$campaign + data$previous
data$contact_ratio <- data$campaign / pmax(data$previous, 1)  # pmax guards against division by zero
data$contacts_per_campaign <- with(data,
                                   ifelse((campaign + previous) > 0,
                                          campaign / (campaign + previous),
                                          NA))
data$contacts_per_campaign[is.na(data$contacts_per_campaign)] <- 0  # define the 0/0 case as 0
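
As a quick check on these engineered contact features (not run here), a summary makes unexpected ranges or division artifacts easy to spot:

summary(data[, c("total_contacts", "contact_ratio", "contacts_per_campaign")])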
# Columns where "unknown" and NA should collapse into a single factor level
cat_vars <- c("poutcome","job","education","contact","quarter","season","day_group")

for(col in cat_vars){
  data[[col]] <- as.character(data[[col]])
  data[[col]][is.na(data[[col]])] <- "unknown"  # recode NAs as "unknown"
  data[[col]] <- factor(data[[col]])            # convert back to factor
}
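
A sketch of a post-recoding check (not run here): after the loop, none of these columns should contain NA.

# Count remaining NAs per recoded column; all should be zero
sapply(data[cat_vars], function(x) sum(is.na(x)))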
# Subset to 5,000 rows to keep SVM training time manageable
data_subset <- data[sample(1:nrow(data), 5000), ]

# Train/test split (80/20, stratified on y)
trainIndex <- createDataPartition(data_subset$y, p = 0.8, list = FALSE)
trainData <- data_subset[trainIndex, ]
testData  <- data_subset[-trainIndex, ]
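
createDataPartition() samples within each level of y, so the split should preserve the class balance; a quick check (not run here):

# Positive-class proportion should match across the partitions
prop.table(table(trainData$y))
prop.table(table(testData$y))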

# Evaluation function: accuracy, ROC AUC, PR AUC, precision, recall, and F1 on the test set
evaluate_model <- function(model, testData, positive_class = "yes") {
  
  # Predicted probabilities
  probs <- predict(model, newdata = testData, type = "prob")
  
  # Predicted labels
  preds <- predict(model, newdata = testData)
  
  # ROC AUC
  roc_obj <- roc(response = testData$y, predictor = probs[, positive_class])
  roc_auc <- auc(roc_obj)
  
  # PR AUC: PRROC expects the positive-class scores of the true positives
  # (scores.class0) and of the true negatives (scores.class1)
  fg <- probs[testData$y == positive_class, positive_class]
  bg <- probs[testData$y != positive_class, positive_class]
  pr_obj <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  pr_auc <- pr_obj$auc.integral
  
  # Confusion Matrix metrics
  cm <- confusionMatrix(preds, testData$y, positive = positive_class)
  precision <- cm$byClass["Precision"]
  recall    <- cm$byClass["Recall"]
  f1        <- cm$byClass["F1"]
  
  # Accuracy
  accuracy <- mean(preds == testData$y)
  
  data.frame(
    Model = model$method,
    Accuracy = round(accuracy, 4),
    ROC_AUC = round(roc_auc, 4),
    PR_AUC = round(pr_auc, 4),
    Precision = round(precision, 4),
    Recall = round(recall, 4),
    F1 = round(f1, 4)
  )
}
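
One note on reading PR_AUC below: the baseline of a precision-recall curve is the positive-class prevalence, not 0.5, so that reference point is worth computing (not run here):

# A no-skill classifier has PR AUC roughly equal to the positive-class rate
mean(testData$y == "yes")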

# Function to train all three kernels (linear-like, polynomial, RBF) with a shared CV setup
run_svm_models <- function(trainData, testData, use_smote = FALSE) {
  
  ctrl <- trainControl(
    method = "repeatedcv",
    number = 5,
    repeats = 2,
    classProbs = TRUE,
    summaryFunction = twoClassSummary,
    savePredictions = TRUE,
    sampling = if(use_smote) "smote" else NULL
  )
  
  # Linear-like SVM (an RBF kernel with a very small sigma behaves approximately linearly)
  svm_linear_like <- train(
    y ~ ., data = trainData,
    method = "svmRadial",
    metric = "ROC",
    preProcess = c("center","scale"),
    trControl = ctrl,
    tuneGrid = expand.grid(
      sigma = 0.0001,
      C = 2^(-1:2)
    )
  )
  
  # Polynomial SVM
  svm_poly <- train(
    y ~ ., data = trainData,
    method = "svmPoly",
    metric = "ROC",
    preProcess = c("center","scale"),
    trControl = ctrl,
    tuneGrid = expand.grid(
      degree = c(2,3),
      scale = c(0.001,0.01,0.1),
      C = 2^(-1:2)
    )
  )
  
  # RBF SVM
  svm_rbf <- train(
    y ~ ., data = trainData,
    method = "svmRadial",
    metric = "ROC",
    preProcess = c("center","scale"),
    trControl = ctrl,
    tuneLength = 5
  )
  
  # Evaluate all models
  models <- list(LinearLike = svm_linear_like, Poly = svm_poly, RBF = svm_rbf)
  results_metrics <- do.call(rbind, lapply(models, evaluate_model, testData))
  
  # Print SMOTE info
  if(use_smote){
    cat("\n--- SVM Results with SMOTE ---\n")
  } else {
    cat("\n--- SVM Results without SMOTE ---\n")
  }
  print(results_metrics)
  
  # Resample comparison
  resamp_results <- resamples(models)
  # lattice plots must be print()ed explicitly inside a function to render
  print(bwplot(resamp_results, metric = "ROC",
               main = if(use_smote) "ROC Comparison with SMOTE" else "ROC Comparison without SMOTE"))
  
  return(list(models = models, metrics = results_metrics))
}

# ---- Run both scenarios ----
res_no_smote <- run_svm_models(trainData, testData, use_smote = FALSE)
## line search fails -0.2733741 1.554954 2.291108e-05 -5.195745e-06 -1.038076e-08 3.984548e-09 -2.585372e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -0.2755675 1.516934 1.942097e-05 -4.672667e-06 -9.122875e-09 3.108634e-09 -1.917007e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -0.2796036 1.528292 1.605331e-05 -3.55773e-06 -5.924931e-09 4.446595e-09 -1.109345e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## 
## --- SVM Results without SMOTE ---
##                Model Accuracy ROC_AUC PR_AUC Precision Recall     F1
## LinearLike svmRadial   0.8929  0.7600 0.3105    0.6383 0.2500 0.3593
## Poly         svmPoly   0.8949  0.7541 0.3093    0.6829 0.2333 0.3478
## RBF        svmRadial   0.8949  0.7428 0.3200    0.6744 0.2417 0.3558
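The "line search fails" messages come from kernlab's optimizer failing to converge on a few fold/hyperparameter combinations; caret drops those resamples, which is what triggers the "missing values in resampled performance measures" warning. To see which tuning rows were affected, the per-candidate resample summary can be inspected (a quick check against the fitted objects above, not run here):

# ROC/Sens/Spec by tuning parameter; NaN rows mark failed resamples
res_no_smote$models$LinearLike$results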
res_smote    <- run_svm_models(trainData, testData, use_smote = TRUE)
## line search fails -1.698917 -0.2102793 1.346651e-05 6.599915e-06 -4.14791e-08 -1.255909e-08 -6.414677e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## line search fails -0.9429723 0.5497187 1.5085e-05 2.836673e-07 -9.4511e-09 4.415262e-09 -1.413173e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -1.551299 -0.2941999 1.151151e-05 5.9291e-06 -2.646198e-08 -1.591249e-08 -3.989642e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -1.663898 -0.2526609 1.154468e-05 5.371295e-06 -2.766122e-08 -1.427312e-08 -3.96005e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## 
## --- SVM Results with SMOTE ---
##                Model Accuracy ROC_AUC PR_AUC Precision Recall     F1
## LinearLike svmRadial   0.7998  0.7713 0.3956    0.3148 0.5667 0.4048
## Poly         svmPoly   0.8158  0.7642 0.3969    0.3333 0.5333 0.4103
## RBF        svmRadial   0.8779  0.7664 0.3259    0.4909 0.4500 0.4696
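Comparing the two tables, SMOTE trades precision for recall: precision falls for every kernel while recall roughly doubles, and F1 improves across the board. A quick check of the F1 gain per kernel (not run here), using the metric tables returned by run_svm_models():

# Positive values favor the SMOTE run
res_smote$metrics$F1 - res_no_smote$metrics$F1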
# Define hyperparameter columns
hyper_cols <- c("sigma", "C", "degree", "scale")

get_hyper <- function(model, hyper_cols){
  hp <- model$bestTune
  # Ensure only one row (best overall)
  if(nrow(hp) > 1) hp <- hp[1, ]
  # Add missing columns as NA
  for(col in hyper_cols){
    if(!col %in% names(hp)) hp[[col]] <- NA
  }
  # Order columns
  hp <- hp[hyper_cols]
  return(hp)
}

# Summaries for both cases
summary_no <- do.call(rbind, lapply(names(res_no_smote$models), function(name) {
  metric_row <- evaluate_model(res_no_smote$models[[name]], testData)
  hp <- get_hyper(res_no_smote$models[[name]], hyper_cols)
  cbind(metric_row, hp, SMOTE = "No")
}))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
summary_yes <- do.call(rbind, lapply(names(res_smote$models), function(name) {
  metric_row <- evaluate_model(res_smote$models[[name]], testData)
  hp <- get_hyper(res_smote$models[[name]], hyper_cols)
  cbind(metric_row, hp, SMOTE = "Yes")
}))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Combine into one table
summary_table <- rbind(summary_no, summary_yes)

# Reorder columns for readability
summary_table <- summary_table[, c("Model", "SMOTE", hyper_cols,
                                   "Accuracy", "ROC_AUC", "PR_AUC", 
                                   "Precision", "Recall", "F1")]

print(summary_table)
##                 Model SMOTE      sigma    C degree scale Accuracy ROC_AUC
## Precision   svmRadial    No 0.00010000 0.50     NA    NA   0.8929  0.7600
## Precision1    svmPoly    No         NA 1.00      2 0.001   0.8949  0.7541
## Precision2  svmRadial    No 0.01164917 0.25     NA    NA   0.8949  0.7428
## Precision3  svmRadial   Yes 0.00010000 4.00     NA    NA   0.7998  0.7713
## Precision11   svmPoly   Yes         NA 2.00      3 0.001   0.8158  0.7642
## Precision21 svmRadial   Yes 0.01121745 0.25     NA    NA   0.8779  0.7664
##             PR_AUC Precision Recall     F1
## Precision   0.3105    0.6383 0.2500 0.3593
## Precision1  0.3093    0.6829 0.2333 0.3478
## Precision2  0.3200    0.6744 0.2417 0.3558
## Precision3  0.3956    0.3148 0.5667 0.4048
## Precision11 0.3969    0.3333 0.5333 0.4103
## Precision21 0.3259    0.4909 0.4500 0.4696
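
The row names (Precision, Precision1, ...) are an artifact of evaluate_model(): the metric columns are built from the named vector cm$byClass, data.frame() adopts those names as row names, and rbind() then deduplicates them. A cosmetic cleanup (not run here):

# Drop the inherited row names for a cleaner printout
rownames(summary_table) <- NULL
print(summary_table)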