# Load Data

# Tidy up the interactive session only.
# The workspace itself is deliberately left alone: wiping globals with
# rm(list = ls()) is destructive when this file is sourced from an existing
# session, and every object used below is (re)assigned by this script anyway.
if (interactive()) {
  cat("\014")          # Form feed: clears the console (e.g. in RStudio)
  graphics.off()       # Close any open graphics devices
}


## INSTALL & LOAD PACKAGES

# Packages this script attaches: caret, e1071 and pROC.
packages <- c("caret", "e1071", "pROC")

# Attach each package; when it cannot be attached, install it (with its
# dependencies) and attach it again. library() raises an error if the
# package is still unavailable after the install attempt.
for (pkg in packages) {
  if (require(pkg, character.only = TRUE)) {
    next
  }
  install.packages(pkg, dependencies = TRUE)
  library(pkg, character.only = TRUE)
}
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
## Loading required package: e1071
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## LOAD DATASET

# Read the full bank-marketing table. Text columns are kept as character
# vectors (stringsAsFactors = FALSE); only the target is converted later.
bank_data <- read.csv(
  "bank+marketing/bank-additional/bank-additional/bank-additional-full.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Fail fast when the file did not parse into the expected columns. Without
# this check a wrong delimiter yields a one-column data frame and the script
# only breaks later, at `bank_data$y`, with a far less helpful message.
# NOTE(review): some distributions of this file are ';'-delimited -- confirm
# the local copy really is comma-separated.
if (!"y" %in% names(bank_data)) {
  stop(
    "Column 'y' not found after reading the CSV; check the file's delimiter.",
    call. = FALSE
  )
}

print(dim(bank_data))
## [1] 41188    21
print(names(bank_data))
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "y"
## TARGET VARIABLE

# Turn the character outcome column into a factor so that the partitioning,
# model fitting and confusion-matrix steps below treat it as a class label.
bank_data[["y"]] <- as.factor(bank_data[["y"]])


## TRAIN / TEST SPLIT

# Fix the RNG state so the same partition is drawn on every run.
set.seed(123)

# Row indices for the training portion (p = 0.7), partitioned on the outcome
# column; list = FALSE returns the indices as a matrix rather than a list.
train_index <- caret::createDataPartition(
  y = bank_data$y,
  p = 0.7,
  list = FALSE
)

# Rows selected by the partition form the training set, the rest the test set.
test_data  <- bank_data[-train_index, ]
train_data <- bank_data[train_index, ]


## SCALE NUMERIC VARIABLES USING CARET

# Names of the numeric columns. vapply() always yields a logical vector,
# whereas sapply() can return a different type for empty input. The target
# `y` is a factor at this point, so it is never selected.
numeric_vars <- names(train_data)[vapply(train_data, is.numeric, logical(1))]

# Estimate centering/scaling parameters from the training rows only, so the
# test rows do not influence the transformation. List-style indexing
# (`df[cols]`) is used throughout because `df[, cols]` collapses to a bare
# vector when exactly one column is selected.
scaling_params <- caret::preProcess(
  train_data[numeric_vars],
  method = c("center", "scale")
)

# Work on copies so the unscaled train/test frames stay available.
train_scaled <- train_data
test_scaled  <- test_data

# Apply the training-set parameters to both partitions.
train_scaled[numeric_vars] <- predict(scaling_params, train_data[numeric_vars])
test_scaled[numeric_vars]  <- predict(scaling_params, test_data[numeric_vars])


## TRAIN SVM MODEL

# Fit a radial-kernel SVM on the scaled training rows, using every other
# column as a predictor of `y`. probability = TRUE is required so that
# class probabilities can be requested from predict() for the ROC curve.
# NOTE(review): `y ~ .` includes every column, `duration` among them --
# confirm that is intended for the use case.
# NOTE(review): e1071::svm() has its own `scale` argument; confirm whether the
# already-standardized numeric columns end up being scaled a second time.
svm_model <- e1071::svm(
  y ~ .,
  data        = train_scaled,
  kernel      = "radial",
  gamma       = 0.01,
  cost        = 1,
  probability = TRUE
)


## PREDICTIONS + CONFUSION MATRIX

# Hard class predictions for the held-out rows.
svm_pred <- predict(svm_model, newdata = test_scaled)

# Cross-tabulate predictions against the true labels; "yes" is treated as the
# positive class for sensitivity, specificity and the related statistics.
cm <- caret::confusionMatrix(
  data      = svm_pred,
  reference = test_scaled$y,
  positive  = "yes"
)

print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  10776   965
##        yes   188   427
##                                           
##                Accuracy : 0.9067          
##                  95% CI : (0.9014, 0.9118)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 1.636e-12       
##                                           
##                   Kappa : 0.3829          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.30675         
##             Specificity : 0.98285         
##          Pos Pred Value : 0.69431         
##          Neg Pred Value : 0.91781         
##              Prevalence : 0.11266         
##          Detection Rate : 0.03456         
##    Detection Prevalence : 0.04977         
##       Balanced Accuracy : 0.64480         
##                                           
##        'Positive' Class : yes             
## 
## ROC CURVE + AUC

# Predicted probability of the "yes" class for each test row. The
# probabilities are attached to the prediction as the "probabilities"
# attribute (one column per class) when probability = TRUE is passed.
svm_prob <- attr(
  predict(svm_model, newdata = test_scaled, probability = TRUE),
  "probabilities"
)[, "yes"]

# Pin the control/case levels and the comparison direction instead of letting
# pROC auto-detect them. These are the same choices auto-detection made
# (control = no, case = yes, controls < cases), so the AUC is unchanged, but
# the curve can no longer flip silently if the data or the model change.
roc_obj <- pROC::roc(
  test_scaled$y,
  svm_prob,
  levels = c("no", "yes"),
  direction = "<"
)
print(paste("AUC:", pROC::auc(roc_obj)))
## [1] "AUC: 0.940276851723719"
plot(
  roc_obj,
  col = "blue",
  lwd = 2,
  main = "SVM ROC Curve - Bank Marketing Dataset"
)