library(tidyverse)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)
Assignment 3
Data Import
# reading in the data
data <- read.csv("preprocessed_bank.csv")
print(head(data))
  age          job marital education default balance housing loan duration
1  55   management married  tertiary      no    2000     yes   no      260
2  40   technician  single secondary      no       0     yes   no      150
3  30 entrepreneur married secondary      no       0     yes  yes       70
4  45  blue-collar married   unknown      no    1000     yes   no       90
5  30      unknown  single   unknown      no       0      no   no      190
6  35   management married  tertiary      no       0     yes   no      130
  campaign pdays previous  y days_since
1        1    -1        0 no        240
2        1    -1        0 no        240
3        1    -1        0 no        240
4        1    -1        0 no        240
5        1    -1        0 no        240
6        1    -1        0 no        240
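Before the recoding below, it can be worth confirming which columns read.csv() left as character strings (with R >= 4.0, strings are not auto-converted to factors). A minimal one-liner:

# checking the class of each column before the numeric recoding
sapply(data, class)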
Extra Preprocessing
# transforming the data into numeric values for SVM
ppdata <- data |>
  mutate(
    marital = case_when(
      marital == "single" ~ 0,
      marital == "married" ~ 1,
      marital == "divorced" ~ 2,
      .default = -9999
    ),
    education = case_when(
      education == "primary" ~ 1,
      education == "secondary" ~ 2,
      education == "tertiary" ~ 3,
      education == "unknown" ~ -1,
      .default = -9999
    ),
    default = case_when(
      default == "yes" ~ 1,
      default == "no" ~ 0
    ),
    housing = case_when(
      housing == "yes" ~ 1,
      housing == "no" ~ 0
    ),
    loan = case_when(
      loan == "yes" ~ 1,
      loan == "no" ~ 0
    ),
    y = as.factor(y)
  ) |>
  dplyr::select(-job)
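A quick sanity check that the recoding covered every level helps catch typos in the case_when branches. A minimal sketch, using the -9999 fall-through sentinel from the code above:

# counting rows that fell through to the -9999 sentinel (should both be 0)
ppdata |>
  summarise(
    marital_misses = sum(marital == -9999),
    education_misses = sum(education == -9999)
  )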
Training Data
# setting the seed for the assignment
set.seed(314)

# creating a random sample subset of indices
samp <- sample(nrow(data), round(nrow(data) * 0.8), replace = FALSE)

# setting the training and testing data from the sample indices
data_train <- ppdata[samp, ]
data_test <- ppdata[-samp, ]
# displaying the count for each unique value in y
data_train |>
  dplyr::select(y) |>
  group_by(y) |>
  summarize(count = n()) |>
  print()
# A tibble: 2 × 2
  y     count
  <fct> <int>
1 no    31899
2 yes    4270
# printing the proportion of the counts of the unique values in y
print("Proportion of no to yes:")
[1] "Proportion of no to yes:"
print(nrow(data_train) / sum(data_train$y == "yes") - 1)
[1] 7.470492
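The same imbalance can be expressed as class proportions rather than a ratio; a minimal sketch using base R's table() on the training labels (roughly 88% no to 12% yes, matching the counts above):

# proportion of each class in the training labels
prop.table(table(data_train$y))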
Training SVM
# creating a list of proportions to try
proportion <- c(1:7)

# creating a df to store the results in
df <- data.frame(
  proportion = integer(),
  Accuracy = double(),
  Precision = double(),
  Recall = double(),
  F1 = double())

# looping through the different no:yes proportions and storing choice metrics in df
for (i in proportion) {
  # undersampling the majority "no" class at i times the number of "yes" rows
  temp_samp <- sample(
    nrow(data_train[data_train$y == "no", ]),
    nrow(data_train[data_train$y == "yes", ]) * i,
    replace = FALSE
  )
  data_train2 <- data_train[data_train$y == "no", ][temp_samp, ] |>
    rbind(data_train[data_train$y == "yes", ])

  # creation of decision tree model
  dt_model <- rpart(
    y ~ .,
    method = "class",
    data = data_train2,
    # setting cp for the run
    control = rpart.control(cp = 0.008)
  )

  # predicting based on model and getting metrics
  pred <- predict(dt_model, data_test, type = "class")
  cM <- confusionMatrix(pred, as.factor(data_test$y), mode = "everything")

  # temp storage of choice metrics
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  f1 <- cM$byClass[[7]]

  # storing metrics for the run
  df[nrow(df) + 1, ] <- c(i, acc, prec, recall, f1)
}

# printing metrics for all runs
print(df |> arrange(desc(Accuracy)))
  proportion  Accuracy Precision    Recall        F1
1          7 0.8913957 0.9034957 0.9825502 0.9413661
2          6 0.8889626 0.9311955 0.9446591 0.9378790
3          5 0.8874143 0.9326745 0.9410445 0.9368408
4          4 0.8865295 0.9334655 0.9390502 0.9362495
5          3 0.8836541 0.9441825 0.9234700 0.9337114
6          2 0.8516921 0.9607005 0.8683784 0.9122095
7          1 0.7593453 0.9739018 0.7488471 0.8466742
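For reference, caret also ships a downSample() helper that produces the 1:1 case of this undersampling scheme directly. A minimal sketch, assuming data_train as built above:

# caret's built-in majority-class undersampling (equivalent to proportion i = 1)
balanced <- downSample(
  x = data_train[, names(data_train) != "y"],
  y = data_train$y,
  yname = "y"
)
table(balanced$y)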
proportion <- c(1:7)

df <- data.frame(
  proportion = integer(),
  Accuracy = double(),
  Precision = double(),
  Recall = double(),
  F1 = double())

# repeating the proportion sweep, this time with a linear-kernel SVM
for (i in proportion) {
  temp_samp <- sample(
    nrow(data_train[data_train$y == "no", ]),
    nrow(data_train[data_train$y == "yes", ]) * i,
    replace = FALSE
  )
  data_train2 <- data_train[data_train$y == "no", ][temp_samp, ] |>
    rbind(data_train[data_train$y == "yes", ])

  model <- svm(
    y ~ .,
    data = data_train2,
    type = 'C-classification',
    kernel = 'linear'
  )
  pred <- predict(model, data_test)
  cM <- confusionMatrix(pred, as.factor(data_test$y), mode = "everything")

  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  f1 <- cM$byClass[[7]]

  df[nrow(df) + 1, ] <- c(i, acc, prec, recall, f1)
}

df |>
  arrange(desc(Accuracy)) |>
  print()
  proportion  Accuracy Precision    Recall        F1
1          5 0.8942712 0.9081668 0.9799327 0.9426859
2          4 0.8923911 0.9148035 0.9689642 0.9411053
3          6 0.8902898 0.8957559 0.9917737 0.9413226
4          7 0.8873037 0.8873037 1.0000000 0.9402871
5          3 0.8853130 0.9259756 0.9464041 0.9360784
6          2 0.8645211 0.9418877 0.9030288 0.9220490
7          1 0.8067905 0.9653025 0.8114172 0.8816957
# building the final training set at the 5:1 no:yes proportion,
# which ranked first for the linear SVM above
samp3 <- sample(
  nrow(data_train[data_train$y == "no", ]),
  nrow(data_train[data_train$y == "yes", ]) * 5,
  replace = FALSE
)
data_train3 <- data_train[data_train$y == "no", ][samp3, ] |>
  rbind(data_train[data_train$y == "yes", ])
# tuning an SVM over gamma and cost for each kernel, keeping the best model
rad_model <- best.tune(
  svm,
  y ~ .,
  data = data_train3,
  ranges = list(
    gamma = c(0.05, 0.1, 0.2),
    cost = c(1, 2, 3, 4)
  ),
  kernel = 'radial',
  tunecontrol = tune.control(sampling = "fix"),
  na.action = na.omit
)

lin_model <- best.tune(
  svm,
  y ~ .,
  data = data_train3,
  ranges = list(
    gamma = c(0.05, 0.1, 0.2),
    cost = c(1, 2, 3, 4)
  ),
  kernel = 'linear',
  tunecontrol = tune.control(sampling = "fix"),
  na.action = na.omit
)

poly_model <- best.tune(
  svm,
  y ~ .,
  data = data_train3,
  ranges = list(
    gamma = c(0.05, 0.1, 0.2),
    cost = c(1, 2, 3, 4)
  ),
  kernel = 'polynomial',
  tunecontrol = tune.control(sampling = "fix"),
  na.action = na.omit
)

sig_model <- best.tune(
  svm,
  y ~ .,
  data = data_train3,
  ranges = list(
    gamma = c(0.05, 0.1, 0.2),
    cost = c(1, 2, 3, 4)
  ),
  kernel = 'sigmoid',
  tunecontrol = tune.control(sampling = "fix"),
  na.action = na.omit
)
# df to hold one row of metrics per model
results <- data.frame(
  Kernel = character(),
  Gamma = double(),
  Cost = double(),
  Accuracy = double(),
  Precision = double(),
  Recall = double(),
  F1 = double()
)

# helper that predicts on the test set and appends a row of metrics to df
metrics <- function(df, kernel, model) {
  if (kernel == "decision tree") {
    pred <- predict(model, data_test, type = "class")
  } else {
    pred <- predict(model, data_test)
  }
  cM <- confusionMatrix(pred, data_test$y, mode = "everything")

  gamma <- model$gamma
  cost <- model$cost
  # byClass positions 5-7 are Precision, Recall, F1 when mode = "everything"
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  f1 <- cM$byClass[[7]]

  # non-SVM models have no gamma/cost to report
  if (is.null(gamma)) {
    gamma <- NA
  }
  if (is.null(cost)) {
    cost <- NA
  }

  # note: c() coerces the whole row to character, so the metric columns are stored as strings
  df[nrow(df) + 1, ] <- c(kernel, gamma, cost, acc, prec, recall, f1)

  return(df)
}
rad_model
Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05,
0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "radial", tunecontrol = tune.control(sampling = "fix"),
na.action = na.omit)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 1
Number of Support Vectors: 8257
lin_model
Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05,
0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "linear", tunecontrol = tune.control(sampling = "fix"),
na.action = na.omit)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 1
Number of Support Vectors: 8193
poly_model
Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05,
0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "polynomial", tunecontrol = tune.control(sampling = "fix"),
na.action = na.omit)
Parameters:
SVM-Type: C-classification
SVM-Kernel: polynomial
cost: 4
degree: 3
coef.0: 0
Number of Support Vectors: 7590
sig_model
Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05,
0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "sigmoid", tunecontrol = tune.control(sampling = "fix"),
na.action = na.omit)
Parameters:
SVM-Type: C-classification
SVM-Kernel: sigmoid
cost: 2
coef.0: 0
Number of Support Vectors: 5714
<- metrics(results, "SVM radial", rad_model)
results <- metrics(results, "SVM linear", lin_model)
results <- metrics(results, "SVM polynomial", poly_model)
results <- metrics(results, "SVM sigmoid", sig_model) results
|>
results arrange(desc(Accuracy)) |>
print()
          Kernel Gamma Cost          Accuracy         Precision            Recall                F1
1     SVM radial   0.2    1 0.896151293961513 0.925414364640884 0.960363953633304 0.942565294513426
2 SVM polynomial   0.2    4 0.896040698960407 0.920855614973262 0.965848186463916   0.9428154276676
3     SVM linear  0.05    1 0.895045343950453 0.907300783049286  0.98205160164527 0.943197462141617
4    SVM sigmoid  0.05    2 0.814753373147534 0.912636505460218 0.874984419793095 0.893413935730194
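best.tune() returns only the winning model, so the chosen gamma/cost have to be read back off the fitted object. When the full grid of tuning results is wanted, the underlying tune() call exposes it; a minimal sketch for the radial kernel, assuming data_train3 as above:

# tune() keeps the whole performance grid as well as the best parameters
rad_tune <- tune(
  svm,
  y ~ .,
  data = data_train3,
  ranges = list(gamma = c(0.05, 0.1, 0.2), cost = c(1, 2, 3, 4)),
  kernel = 'radial',
  tunecontrol = tune.control(sampling = "fix")
)
rad_tune$best.parameters  # the gamma/cost pair that won
rad_tune$performances     # error for every grid combination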
Results Comparison
library(rpart)
library(randomForest)
library(xgboost)
# decision tree at the cp value used in the proportion sweep
dt_model <- rpart(
  y ~ .,
  method = "class",
  data = data_train3,
  control = rpart.control(cp = 0.008)
)

# random forest via caret with a fixed mtry (no resampling)
rf_model <- train(
  y ~ .,
  data = data_train3,
  metric = "Accuracy",
  method = "rf",
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(.mtry = 8),
  na.action = na.omit
)

# gradient-boosted trees via caret/xgboost with fixed hyperparameters
xgb_model <- train(
  y ~ .,
  data = data_train3,
  metric = "Accuracy",
  method = "xgbTree",
  na.action = na.omit,
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(
    # setting parameters for the run
    nrounds = 100,
    max_depth = 8,
    eta = 0.1,
    gamma = 0.01,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 1
  )
)
<- metrics(results, "random forest", rf_model)
results <- metrics(results, "decision tree", dt_model)
results <- metrics(results, "gradient boost", xgb_model)
results
|>
results arrange(desc(Accuracy)) |>
print()
          Kernel Gamma Cost          Accuracy         Precision            Recall                F1
1 gradient boost  <NA> <NA> 0.901570449015704 0.949124795365823 0.939424155552786 0.944249561513405
2     SVM radial   0.2    1 0.896151293961513 0.925414364640884 0.960363953633304 0.942565294513426
3 SVM polynomial   0.2    4 0.896040698960407 0.920855614973262 0.965848186463916   0.9428154276676
4     SVM linear  0.05    1 0.895045343950453 0.907300783049286  0.98205160164527 0.943197462141617
5  random forest  <NA> <NA> 0.894934748949347 0.948282418557485  0.93244422285928 0.940296631473102
6  decision tree  <NA> <NA> 0.884538818845388 0.936569498311022 0.933067431135486 0.934815184815185
7    SVM sigmoid  0.05    2 0.814753373147534 0.912636505460218 0.874984419793095 0.893413935730194
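Because the metrics() helper stores each row through c(), the metric columns above are character vectors; converting them back to numeric makes further comparison (rounding, plotting, picking a best row) straightforward. A minimal sketch using dplyr:

# converting the stored string metrics back to numbers for downstream use
results_num <- results |>
  mutate(across(c(Gamma, Cost, Accuracy, Precision, Recall, F1), as.numeric))

# e.g. the model with the best F1 on the test set
results_num |> slice_max(F1, n = 1)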