1. Loading the needed packages
library(tidyverse)   # data wrangling and plotting
library(tidymodels)  # data splitting and recipes
library(themis)      # recipe steps for class imbalance
library(FFTrees)     # fast-and-frugal decision trees
library(janitor)     # cleaning column names
2. Loading the dataset
# Read the raw data, standardise the column names, and recode the factor levels
bank_default <- read_csv("~/Desktop/R/csv/bank_default.csv", col_types = "dfddf") %>%
  clean_names() %>%
  mutate(
    defaulted = fct_recode(defaulted, "TRUE" = "1", "FALSE" = "0"),
    employed = fct_recode(employed, "Yes" = "1", "No" = "0")
  ) %>%
  select(-index)
3. EDA
Checking for missing values
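A minimal dplyr sketch that would produce a per-column count of missing values:

# Count missing values in each column
bank_default %>%
  summarise(across(everything(), ~ sum(is.na(.x))))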
## # A tibble: 1 × 4
## employed bank_balance annual_salary defaulted
## <int> <int> <int> <int>
## 1 0 0 0 0
Is the target variable balanced?
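One way to inspect the balance (a sketch, not necessarily the original code) is a tally plus a quick bar chart:

# Tally the target variable; proportions make the imbalance obvious
bank_default %>%
  count(defaulted) %>%
  mutate(prop = n / sum(n))

# Bar chart of the two classes
ggplot(bank_default, aes(x = defaulted, fill = defaulted)) +
  geom_bar()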

4. Splitting the data
set.seed(2022)
split <- initial_split(bank_default, strata = defaulted)
train <- training(split)
test <- testing(split)

# A small random sample kept aside to illustrate predictions on "new" data in step 8
val <- slice_sample(bank_default, n = 10)
5. Building a recipe to balance the target variable
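Given that the training set holds 252 defaults and the balanced output below has 504 rows, down-sampling is the likely approach; a sketch with themis::step_downsample, where balance_recipe and train_balanced are assumed names:

# Down-sample the majority class so both levels of defaulted are equally represented
balance_recipe <- recipe(defaulted ~ ., data = train) %>%
  step_downsample(defaulted, seed = 2022)

# prep() estimates the step on the training data; juice() returns the balanced rows
train_balanced <- balance_recipe %>%
  prep() %>%
  juice()

train_balanced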
## # A tibble: 504 × 4
## employed bank_balance annual_salary defaulted
## <fct> <dbl> <dbl> <fct>
## 1 Yes 6257. 383782. FALSE
## 2 No 12844. 153081. FALSE
## 3 Yes 10819. 685205. FALSE
## 4 Yes 591. 365414. FALSE
## 5 Yes 8579. 372394. FALSE
## 6 Yes 9948. 668362. FALSE
## 7 Yes 13711. 504643. FALSE
## 8 No 13836. 156433. FALSE
## 9 Yes 7448. 414589. FALSE
## 10 No 22207. 244908. FALSE
## # … with 494 more rows
Is the target variable balanced now?
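A sketch of the same tally on the balanced data (assuming it is stored in train_balanced):

# Both levels of defaulted should now have 252 rows each
train_balanced %>%
  count(defaulted)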

6. Modelling
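A sketch of the model fit, assuming the tree was grown on the full training set (the printed N = 7,500 matches train) and that the TRUE/FALSE factor is first converted to logical, which FFTrees expects; train_fft, test_fft and fft_model are assumed names:

# FFTrees wants a logical criterion, so convert the TRUE/FALSE factor levels
train_fft <- train %>% mutate(defaulted = defaulted == "TRUE")
test_fft  <- test  %>% mutate(defaulted = defaulted == "TRUE")

set.seed(2022)
fft_model <- FFTrees(
  formula = defaulted ~ .,
  data = train_fft,
  data.test = test_fft,
  decision.labels = c("No Default", "Default")  # labels seen in the printed trees
)

fft_model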
## FFTrees
## - Trees: 4 fast-and-frugal trees predicting defaulted
## - Outcome costs: [hi = 0, mi = 1, fa = 1, cr = 0]
##
## FFT #1: Definition
## [1] If bank_balance > 14863.08, decide Default.
## [2] If employed != {No}, decide No Default.
## [3] If annual_salary <= 385443.48, decide Default, otherwise, decide No Default.
##
## FFT #1: Training Accuracy
## Training data: N = 7,500, Pos (+) = 252 (3%)
##
## | | True + | True - | Totals:
## |----------|----------|----------|
## | Decide + | hi 237 | fa 2,884 | 3,121
## | Decide - | mi 15 | cr 4,364 | 4,379
## |----------|----------|----------|
## Totals: 252 7,248 N = 7,500
##
## acc = 61.3% ppv = 7.6% npv = 99.7%
## bacc = 77.1% sens = 94.0% spec = 60.2%
##
## FFT #1: Training Speed, Frugality, and Cost
## mcu = 2, pci = 0.33, E(cost) = 0.387
7. Plotting the results
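A sketch of the plotting call (assuming the fitted object is named fft_model):

# Visualise the best tree and its training performance
plot(fft_model, data = "train")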

8. Predicting on new data
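A sketch of the prediction call on the 10-row val sample drawn in step 4 (fft_model is an assumed name):

# Predict class membership for the 10 "new" observations
predict(fft_model, newdata = val)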
## [1] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
9. Ranking the predictor accuracies
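A sketch of the cue-ranking plot (assuming fft_model):

# Plot the marginal accuracy of each individual predictor (cue)
plot(fft_model, what = "cues")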
