# Load Data
## SESSION SETUP
# The global workspace is intentionally left alone: clearing it would
# silently delete the objects of whoever sources this script.
if (interactive()) {
  # "\014" is a form feed; only send it to a live console, not to log files.
  cat("\014") # Clear console
}
graphics.off() # Close plots
## INSTALL & LOAD PACKAGES
packages <- c("caret", "e1071", "pROC")
for (pkg in packages) {
  # requireNamespace() only tests availability (it does not attach), so the
  # install step runs solely for packages that are actually missing.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  # library() raises an error if the package still cannot be loaded, so a
  # failed installation cannot go unnoticed.
  library(pkg, character.only = TRUE)
}
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
## Loading required package: e1071
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## LOAD DATASET
# The path is resolved relative to the current working directory.
data_path <-
  "bank+marketing/bank-additional/bank-additional/bank-additional-full.csv"
# Read the comma-separated file; text columns are kept as character vectors.
bank_data <- read.csv(
  file = data_path,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Report the number of rows and columns that were read.
print(dim(bank_data))
## [1] 41188 21
# List the column names of the loaded data frame.
print(colnames(bank_data))
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
## TARGET VARIABLE + CATEGORICAL PREDICTORS
# Encode the response as a factor so it is treated as a class label.
bank_data$y <- factor(bank_data$y)
# Convert the remaining character columns to factors BEFORE the data is
# split. Every subset then carries the full set of levels, so the dummy
# columns built for training and for prediction always line up, even when a
# rare category happens to be absent from one partition.
char_cols <- names(bank_data)[vapply(bank_data, is.character, logical(1))]
for (col in char_cols) {
  bank_data[[col]] <- factor(bank_data[[col]])
}
## TRAIN / TEST SPLIT
# Seed the RNG so the 70/30 partition on y is reproducible.
set.seed(123)
train_index <- caret::createDataPartition(
  y = bank_data$y,
  p = 0.7,
  list = FALSE
)
# Rows selected by the partition train the model; all others are held out.
test_data <- bank_data[-train_index, ]
train_data <- bank_data[train_index, ]
## SCALE NUMERIC VARIABLES USING CARET
# vapply() always yields a logical vector, whatever the shape of the input.
is_num <- vapply(train_data, is.numeric, logical(1))
numeric_vars <- names(train_data)[is_num]
# Centering/scaling parameters are estimated from the training rows only and
# then applied unchanged to both partitions. drop = FALSE keeps the selection
# a data frame even if there is just one numeric column.
scaling_params <- caret::preProcess(
  train_data[, numeric_vars, drop = FALSE],
  method = c("center", "scale")
)
train_scaled <- train_data
test_scaled <- test_data
train_scaled[, numeric_vars] <- predict(
  scaling_params,
  train_data[, numeric_vars, drop = FALSE]
)
test_scaled[, numeric_vars] <- predict(
  scaling_params,
  test_data[, numeric_vars, drop = FALSE]
)
## TRAIN SVM MODEL
# Fit y against every other column of the scaled training set using an RBF
# kernel with fixed hyperparameters. probability = TRUE additionally fits the
# probability model that the ROC analysis relies on.
svm_model <- e1071::svm(
  y ~ .,
  data = train_scaled,
  probability = TRUE,
  kernel = "radial",
  gamma = 0.01,
  cost = 1
)
## PREDICTIONS + CONFUSION MATRIX
# Predicted class labels for the held-out rows.
svm_pred <- predict(svm_model, newdata = test_scaled)
# Cross-tabulate predictions against the true labels, treating "yes" as the
# positive class.
cm <- caret::confusionMatrix(
  data = svm_pred,
  reference = test_scaled$y,
  positive = "yes"
)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 10776 965
## yes 188 427
##
## Accuracy : 0.9067
## 95% CI : (0.9014, 0.9118)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1.636e-12
##
## Kappa : 0.3829
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.30675
## Specificity : 0.98285
## Pos Pred Value : 0.69431
## Neg Pred Value : 0.91781
## Prevalence : 0.11266
## Detection Rate : 0.03456
## Detection Prevalence : 0.04977
## Balanced Accuracy : 0.64480
##
## 'Positive' Class : yes
##
## ROC CURVE + AUC
# Ask the model for class probabilities and keep the column for "yes".
svm_pred_prob <- predict(svm_model, newdata = test_scaled, probability = TRUE)
svm_prob <- attr(svm_pred_prob, "probabilities")[, "yes"]
# State the control/case levels and the comparison direction explicitly
# instead of letting pROC guess them: automatic direction detection can flip
# silently and bias the AUC upwards. These values are the ones pROC
# auto-selected for this data (control = no, case = yes, controls < cases).
roc_obj <- pROC::roc(
  test_scaled$y,
  svm_prob,
  levels = c("no", "yes"),
  direction = "<"
)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Report the area under the ROC curve.
auc_value <- pROC::auc(roc_obj)
print(paste("AUC:", auc_value))
## [1] "AUC: 0.940276851723719"
# Draw the ROC curve for the test-set predictions.
plot(
  roc_obj,
  main = "SVM ROC Curve - Bank Marketing Dataset",
  lwd = 2,
  col = "blue"
)