assignment3

Author

Cameron Gray

library(tidyverse)
library(rpart)
library(rpart.plot)
library(e1071)
library(caret)

Data Import

# reading in the data
data <- read.csv("preprocessed_bank.csv")
print(head(data))
  age          job marital education default balance housing loan duration
1  55   management married  tertiary      no    2000     yes   no      260
2  40   technician  single secondary      no       0     yes   no      150
3  30 entrepreneur married secondary      no       0     yes  yes       70
4  45  blue-collar married   unknown      no    1000     yes   no       90
5  30      unknown  single   unknown      no       0      no   no      190
6  35   management married  tertiary      no       0     yes   no      130
  campaign pdays previous  y days_since
1        1    -1        0 no        240
2        1    -1        0 no        240
3        1    -1        0 no        240
4        1    -1        0 no        240
5        1    -1        0 no        240
6        1    -1        0 no        240

Extra Preprocessing

# transforming the data into numeric values for SVM
ppdata <- data |>
  mutate(
    marital = case_when(
      marital == "single" ~ 0,
      marital == "married" ~ 1,
      marital == "divorced" ~ 2,
      .default = -9999
    ),
    education = case_when(
      education == "primary" ~ 1,
      education == "secondary" ~ 2,
      education == "tertiary" ~ 3,
      education == "unknown" ~ -1,
      .default = -9999
    ),
    default = case_when(
      default == "yes" ~ 1,
      default == "no" ~ 0
    ),
    housing = case_when(
      housing == "yes" ~ 1,
      housing == "no" ~ 0
    ),
    loan = case_when(
      loan == "yes" ~ 1,
      loan == "no" ~ 0
    ),
    y = as.factor(y)
  ) |>
  dplyr::select(-job)

Training Data

# setting the seed for the assignment
set.seed(314)

# creating a random sample subset of indices
samp <- sample(nrow(data), round(nrow(data)*0.8), replace = FALSE)

# setting the training and testing data from the sample indices
data_train <- ppdata[samp,]
data_test <- ppdata[-samp,]

# displaying the count for each unique value in y
data_train |>
  dplyr::select(y) |>
  group_by(y) |>
  summarize(count=n()) |>
  print()
# A tibble: 2 × 2
  y     count
  <fct> <int>
1 no    31899
2 yes    4270
# printing the proportion of the counts of the unique values in y
print("Proportion of no to yes:")
[1] "Proportion of no to yes:"
print(nrow(data_train) / sum(data_train$y == "yes") - 1)
[1] 7.470492

Training SVM

#creating a list of proportions to try
proportion = c(1:7)

#creating a df to store the results in
df <- data.frame(
  proportion = integer(),
  Accuracy = double(),
  Precision = double(),
  Recall = double(),
  F1 = double())

#looping through the different cp values and storing choice metrics in df
for (i in proportion) {
  temp_samp <- sample(
    nrow(data_train[data_train$y == "no",]), 
    nrow(data_train[data_train$y == "yes",]) * i,
    replace = FALSE
    )
  
  data_train2 <- data_train[data_train$y == "no",][temp_samp,] |>
    rbind(data_train[data_train$y == "yes",])
  
  #creation of decision tree model
  dt_model <- rpart(
    y~., 
    method="class", 
    data = data_train2, 
    #setting cp for the current loop
    control = rpart.control(cp=0.008)
  )
  
  #predicting based on model and getting metrics
  pred <- predict(dt_model, data_test, type="class")
  cM <- confusionMatrix(pred, as.factor(data_test$y), mode = "everything")
  
  #temp storage of choice metrics
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  f1 <- cM$byClass[[7]]
  
  #storing metrics for the run
  df[nrow(df) + 1,] = c(i,acc, prec, recall, f1)
}

#printing metrics for all runs
print(df |> arrange(desc(Accuracy)))
  proportion  Accuracy Precision    Recall        F1
1          7 0.8913957 0.9034957 0.9825502 0.9413661
2          6 0.8889626 0.9311955 0.9446591 0.9378790
3          5 0.8874143 0.9326745 0.9410445 0.9368408
4          4 0.8865295 0.9334655 0.9390502 0.9362495
5          3 0.8836541 0.9441825 0.9234700 0.9337114
6          2 0.8516921 0.9607005 0.8683784 0.9122095
7          1 0.7593453 0.9739018 0.7488471 0.8466742
proportion = c(1:7)

df <- data.frame(
  proportion = integer(),
  Accuracy = double(),
  Precision = double(),
  Recall = double(),
  F1 = double())

for (i in proportion) {
  temp_samp <- sample(
    nrow(data_train[data_train$y == "no",]), 
    nrow(data_train[data_train$y == "yes",]) * i,
    replace = FALSE
    )
  
  data_train2 <- data_train[data_train$y == "no",][temp_samp,] |>
    rbind(data_train[data_train$y == "yes",])
  
  model <- svm(
    y~., 
    data = data_train2, 
    type = 'C-classification',
    kernel = 'linear'
  )
  pred <- predict(model, data_test)
  cM <- confusionMatrix(pred, as.factor(data_test$y), mode = "everything")
  
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  f1 <- cM$byClass[[7]]
  
  df[nrow(df) + 1,] = c(i,acc, prec, recall, f1)
}
df |>
  arrange(desc(Accuracy)) |>
  print()
  proportion  Accuracy Precision    Recall        F1
1          5 0.8942712 0.9081668 0.9799327 0.9426859
2          4 0.8923911 0.9148035 0.9689642 0.9411053
3          6 0.8902898 0.8957559 0.9917737 0.9413226
4          7 0.8873037 0.8873037 1.0000000 0.9402871
5          3 0.8853130 0.9259756 0.9464041 0.9360784
6          2 0.8645211 0.9418877 0.9030288 0.9220490
7          1 0.8067905 0.9653025 0.8114172 0.8816957
samp3 <- sample(
  nrow(data_train[data_train$y == "no",]), 
  nrow(data_train[data_train$y == "yes",]) * 5,
  replace = FALSE
  )

data_train3 <- data_train[data_train$y == "no",][samp3,] |>
    rbind(data_train[data_train$y == "yes",])
rad_model <- best.tune(
  svm,
  y~.,
  data=data_train3,
  ranges = list(
    gamma = c(0.05,0.1,0.2), 
    cost = c(1,2,3,4)
  ),
  kernel = 'radial', 
  tunecontrol = tune.control(sampling = "fix"),
  na.action=na.omit
  )
lin_model <- best.tune(
  svm,
  y~.,
  data=data_train3,
  ranges = list(
    gamma = c(0.05,0.1,0.2), 
    cost = c(1,2,3,4)
  ),
  kernel = 'linear', 
  tunecontrol = tune.control(sampling = "fix"),
  na.action=na.omit
)
poly_model <- best.tune(
  svm,
  y~.,
  data=data_train3,
  ranges = list(
    gamma = c(0.05,0.1,0.2), 
    cost = c(1,2,3,4)
  ),
  kernel = 'polynomial', 
  tunecontrol = tune.control(sampling = "fix"),
  na.action=na.omit
)
sig_model <- best.tune(
  svm,
  y~.,
  data=data_train3,
  ranges = list(
    gamma = c(0.05,0.1,0.2), 
    cost = c(1,2,3,4)
  ),
  kernel = 'sigmoid', 
  tunecontrol = tune.control(sampling = "fix"),
  na.action=na.omit
)
results <- data.frame(
  Kernel = character(),
  Gamma = double(),
  Cost = double(),
  Accuracy = double(),
  Precision = double(),
  Recall = double(),
  F1 = double()
  )

metrics <- function(df, kernel, model){
  if (kernel == "decision tree") {
    pred <- predict(model, data_test, type="class")
  } else{
    pred <- predict(model, data_test)
  }
  cM<- confusionMatrix(pred, data_test$y, mode = "everything")
  
  gamma <- model$gamma
  cost <- model$cost
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  f1 <- cM$byClass[[7]]
  
  if (is.null(gamma)) {
    gamma <- NA
  }
  if (is.null(cost)) {
    cost <- NA
  }
  
  df[nrow(df) + 1,] = c(kernel, gamma, cost, acc, prec, recall, f1)
  
  return (df)
}
rad_model

Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05, 
    0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "radial", tunecontrol = tune.control(sampling = "fix"), 
    na.action = na.omit)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  8257
lin_model

Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05, 
    0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "linear", tunecontrol = tune.control(sampling = "fix"), 
    na.action = na.omit)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  linear 
       cost:  1 

Number of Support Vectors:  8193
poly_model

Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05, 
    0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "polynomial", tunecontrol = tune.control(sampling = "fix"), 
    na.action = na.omit)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  polynomial 
       cost:  4 
     degree:  3 
     coef.0:  0 

Number of Support Vectors:  7590
sig_model

Call:
best.tune(svm, y ~ ., data = data_train3, ranges = list(gamma = c(0.05, 
    0.1, 0.2), cost = c(1, 2, 3, 4)), kernel = "sigmoid", tunecontrol = tune.control(sampling = "fix"), 
    na.action = na.omit)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  sigmoid 
       cost:  2 
     coef.0:  0 

Number of Support Vectors:  5714
results <- metrics(results, "SVM radial", rad_model)
results <- metrics(results, "SVM linear", lin_model)
results <- metrics(results, "SVM polynomial", poly_model)
results <- metrics(results, "SVM sigmoid", sig_model)
results |>
  arrange(desc(Accuracy)) |>
  print()
          Kernel Gamma Cost          Accuracy         Precision
1     SVM radial   0.2    1 0.896151293961513 0.925414364640884
2 SVM polynomial   0.2    4 0.896040698960407 0.920855614973262
3     SVM linear  0.05    1 0.895045343950453 0.907300783049286
4    SVM sigmoid  0.05    2 0.814753373147534 0.912636505460218
             Recall                F1
1 0.960363953633304 0.942565294513426
2 0.965848186463916   0.9428154276676
3  0.98205160164527 0.943197462141617
4 0.874984419793095 0.893413935730194

Results Comparison

library(rpart)
library(randomForest)
library(xgboost)
dt_model <- rpart(
  y~.,
  method="class",
  data = data_train3,
  #setting cp for the current loop
  control = rpart.control(cp=0.008)
)

rf_model <- train(
  y~., 
  data = data_train3, 
  metric = "Accuracy", 
  method = "rf",
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(.mtry = 8),
  na.action=na.omit
)

xgb_model <- train(
  y~.,
  data = data_train3,
  metric = "Accuracy",
  method = "xgbTree",
  na.action=na.omit,
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(
    #setting parameters for run
    nrounds = 100,
    max_depth = 8,
    eta = 0.1,
    gamma = 0.01,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 1
  )
)

results <- metrics(results, "random forest", rf_model)
results <- metrics(results, "decision tree", dt_model)
results <- metrics(results, "gradient boost", xgb_model)

results |>
  arrange(desc(Accuracy)) |>
  print()
          Kernel Gamma Cost          Accuracy         Precision
1 gradient boost  <NA> <NA> 0.901570449015704 0.949124795365823
2     SVM radial   0.2    1 0.896151293961513 0.925414364640884
3 SVM polynomial   0.2    4 0.896040698960407 0.920855614973262
4     SVM linear  0.05    1 0.895045343950453 0.907300783049286
5  random forest  <NA> <NA> 0.894934748949347 0.948282418557485
6  decision tree  <NA> <NA> 0.884538818845388 0.936569498311022
7    SVM sigmoid  0.05    2 0.814753373147534 0.912636505460218
             Recall                F1
1 0.939424155552786 0.944249561513405
2 0.960363953633304 0.942565294513426
3 0.965848186463916   0.9428154276676
4  0.98205160164527 0.943197462141617
5  0.93244422285928 0.940296631473102
6 0.933067431135486 0.934815184815185
7 0.874984419793095 0.893413935730194