1. Loading the needed packages

library(tidyverse)   # data wrangling and plotting
library(tidymodels)  # rsample, recipes, and friends
library(themis)      # recipe steps for class imbalance
library(FFTrees)     # fast-and-frugal decision trees
library(janitor)     # clean_names()

2. Loading the dataset

# col_types = "dfddf": index (double), employed (factor),
# bank_balance (double), annual_salary (double), defaulted (factor)
bank_default <- read_csv("~/Desktop/R/csv/bank_default.csv", col_types = "dfddf") %>%
  clean_names() %>%
  mutate(
    defaulted = fct_recode(defaulted, "TRUE" = "1", "FALSE" = "0"),  # recode 1/0 levels
    employed  = fct_recode(employed, "Yes" = "1", "No" = "0")
  ) %>%
  select(-index)  # drop the row index column

3. EDA

Checking for missing values
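The counts below come from summing the NAs in each column; one way to produce them:

bank_default %>%
  summarise(across(everything(), ~ sum(is.na(.))))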

## # A tibble: 1 × 4
##   employed bank_balance annual_salary defaulted
##      <int>        <int>         <int>     <int>
## 1        0            0             0         0

Is the target variable balanced?
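The original chunk is not reproduced here; a quick tabular check of the target levels:

bank_default %>%
  count(defaulted) %>%
  mutate(prop = n / sum(n))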

4. Splitting the data

set.seed(2022)  # for reproducible sampling
split <- initial_split(bank_default, strata = defaulted)  # default 3/4 train, stratified on the target
train <- training(split)
test  <- testing(split)
val   <- slice_sample(bank_default, n = 10)  # 10 random rows to stand in for "new" data later

5. Building a recipe to balance the target variable
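The 504-row tibble below equals twice the 252 positive cases reported for the training set further down, which is consistent with downsampling the majority class to a 1:1 ratio. A sketch using themis::step_downsample (the names default_recipe and balanced_train are illustrative):

default_recipe <- recipe(defaulted ~ ., data = train) %>%
  step_downsample(defaulted)  # downsample the majority class to match the minority

balanced_train <- default_recipe %>%
  prep() %>%
  bake(new_data = NULL)  # extract the balanced training set

balanced_train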

## # A tibble: 504 × 4
##    employed bank_balance annual_salary defaulted
##    <fct>           <dbl>         <dbl> <fct>    
##  1 Yes             6257.       383782. FALSE    
##  2 No             12844.       153081. FALSE    
##  3 Yes            10819.       685205. FALSE    
##  4 Yes              591.       365414. FALSE    
##  5 Yes             8579.       372394. FALSE    
##  6 Yes             9948.       668362. FALSE    
##  7 Yes            13711.       504643. FALSE    
##  8 No             13836.       156433. FALSE    
##  9 Yes             7448.       414589. FALSE    
## 10 No             22207.       244908. FALSE    
## # … with 494 more rows

Is the target variable balanced now?
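Counting the classes in the downsampled data (using the balanced_train sketch above) should now give 252 cases per level:

balanced_train %>%
  count(defaulted)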

6. Modelling
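The summary below is the default print output of FFTrees(). A plausible call is sketched here; note that the reported training N of 7,500 matches the unbalanced train split, and that FFTrees expects a logical criterion, so the factor target is converted first (the name bank_fft is illustrative):

bank_fft <- FFTrees(
  formula = defaulted ~ .,
  data = train %>%
    mutate(defaulted = as.logical(as.character(defaulted))),
  data.test = test %>%
    mutate(defaulted = as.logical(as.character(defaulted))),
  decision.labels = c("No Default", "Default")  # labels for negative/positive decisions
)
bank_fft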

## FFTrees 
## - Trees: 4 fast-and-frugal trees predicting defaulted
## - Outcome costs: [hi = 0, mi = 1, fa = 1, cr = 0]
## 
## FFT #1: Definition
## [1] If bank_balance > 14863.08, decide Default.
## [2] If employed != {No}, decide No Default.
## [3] If annual_salary <= 385443.48, decide Default, otherwise, decide No Default.
## 
## FFT #1: Training Accuracy
## Training data: N = 7,500, Pos (+) = 252 (3%) 
## 
## |          | True +   | True -   |   Totals:
## |----------|----------|----------|
## | Decide + | hi   237 | fa 2,884 |     3,121
## | Decide - | mi    15 | cr 4,364 |     4,379
## |----------|----------|----------|
##   Totals:         252      7,248   N = 7,500
## 
## acc  = 61.3%   ppv  = 7.6%   npv  = 99.7%
## bacc = 77.1%   sens = 94.0%   spec = 60.2%
## 
## FFT #1: Training Speed, Frugality, and Cost
## mcu = 2,  pci = 0.33,  E(cost) = 0.387

7. Plotting the results
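The plot itself is not reproduced here; assuming the bank_fft object sketched above, the standard call draws the best tree together with its performance statistics:

plot(bank_fft, data = "train")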

8. Predicting on new data
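The logical vector below holds the class predictions for the 10-row val set created during the split:

predict(bank_fft, newdata = val)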

##  [1]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE

9. Ranking the predictor accuracies
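The resulting plot is not reproduced here; FFTrees can rank the individual cues by their training accuracy in ROC space with:

plot(bank_fft, what = "cues")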

10. Comparing predictive performance across algorithms
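By default (do.comp = TRUE), FFTrees also fits logistic regression (lr), CART, random forest (rf), and support vector machine (svm) baselines on the same data. In recent package versions the test-set comparison printed below is stored in the fitted object; accessing it this way is an assumption about the object's internals:

bank_fft$competition$test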

algorithm     n  hi  fa  mi    cr  sens  spec   far   ppv   npv   acc  bacc  wacc  cost  cost_decisions  cost_cues
fftrees    2500  76 931   5  1488 0.938 0.615 0.385 0.075 0.997 0.626 0.777 0.777 0.374           0.374          0
lr         2500  22   8  59  2411 0.272 0.997 0.003 0.733 0.976 0.973 0.634 0.634 0.027           0.027         NA
cart       2500  22   7  59  2412 0.272 0.997 0.003 0.759 0.976 0.974 0.634 0.634 0.026           0.026         NA
rf         2500  21   7  60  2412 0.259 0.997 0.003 0.750 0.976 0.973 0.628 0.628 0.027           0.027         NA
svm        2500  15   5  66  2414 0.185 0.998 0.002 0.750 0.973 0.972 0.592 0.592 0.028           0.028         NA