library(rsample)

# untuk persiapan data
library(dplyr)

# untuk keperluan machine learning
library(partykit)
library(randomForest)
library(caret)

💉Introduction Study Case: Subscribes Deposito Prediction

❓ Business Question: S Data tersebut terkait dengan campaign pemasaran langsung dari lembaga perbankan Portugis.campaign didasarkan pada panggilan telepon. Seringkali, diperlukan lebih dari satu kontak ke klien yang sama, untuk mengakses apakah produk (deposito berjangka bank) akan berlangganan (yes) atau tidak (no). - Kelas positif: yes. - Kelas negatif: no.

1.Data Preparation

# read data
bank <- read.csv("bank.csv", 
                 sep = ";",
                 stringsAsFactors = T) 

# 5 observasi pertama
head(bank)
bank <- bank %>% rename(Subscribes = y)
head(bank)

🗒️Deskripsi Data

knitr::include_graphics("data.jpg")

2. Data Wrangling

Cek tipe data dan missing values.

# cek struktur data
glimpse(bank)
#> Rows: 4,521
#> Columns: 17
#> $ age        <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31,…
#> $ job        <fct> unemployed, services, management, management, blue-collar, …
#> $ marital    <fct> married, married, single, married, married, single, married…
#> $ education  <fct> primary, secondary, tertiary, tertiary, secondary, tertiary…
#> $ default    <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
#> $ balance    <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 2…
#> $ housing    <fct> no, yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, n…
#> $ loan       <fct> no, yes, no, yes, no, no, no, no, no, yes, no, no, no, no, …
#> $ contact    <fct> cellular, cellular, cellular, unknown, unknown, cellular, c…
#> $ day        <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, 29…
#> $ month      <fct> oct, may, apr, jun, may, feb, may, may, may, apr, may, apr,…
#> $ duration   <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 3…
#> $ campaign   <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1,…
#> $ pdays      <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, -1…
#> $ previous   <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1,…
#> $ poutcome   <fct> unknown, failure, failure, unknown, unknown, failure, other…
#> $ Subscribes <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, yes, no…

Tipe data sudah sesuai

# cek missing values
anyNA(bank)
#> [1] FALSE

Tidak ada missing value di data kita

3. Exploratory Data Analysis

# cek summary data
summary(bank)
#>       age                 job          marital         education    default   
#>  Min.   :19.00   management :969   divorced: 528   primary  : 678   no :4445  
#>  1st Qu.:33.00   blue-collar:946   married :2797   secondary:2306   yes:  76  
#>  Median :39.00   technician :768   single  :1196   tertiary :1350             
#>  Mean   :41.17   admin.     :478                   unknown  : 187             
#>  3rd Qu.:49.00   services   :417                                              
#>  Max.   :87.00   retired    :230                                              
#>                  (Other)    :713                                              
#>     balance      housing     loan           contact          day       
#>  Min.   :-3313   no :1962   no :3830   cellular :2896   Min.   : 1.00  
#>  1st Qu.:   69   yes:2559   yes: 691   telephone: 301   1st Qu.: 9.00  
#>  Median :  444                         unknown  :1324   Median :16.00  
#>  Mean   : 1423                                          Mean   :15.92  
#>  3rd Qu.: 1480                                          3rd Qu.:21.00  
#>  Max.   :71188                                          Max.   :31.00  
#>                                                                        
#>      month         duration       campaign          pdays       
#>  may    :1398   Min.   :   4   Min.   : 1.000   Min.   : -1.00  
#>  jul    : 706   1st Qu.: 104   1st Qu.: 1.000   1st Qu.: -1.00  
#>  aug    : 633   Median : 185   Median : 2.000   Median : -1.00  
#>  jun    : 531   Mean   : 264   Mean   : 2.794   Mean   : 39.77  
#>  nov    : 389   3rd Qu.: 329   3rd Qu.: 3.000   3rd Qu.: -1.00  
#>  apr    : 293   Max.   :3025   Max.   :50.000   Max.   :871.00  
#>  (Other): 571                                                   
#>     previous          poutcome    Subscribes
#>  Min.   : 0.0000   failure: 490   no :4000  
#>  1st Qu.: 0.0000   other  : 197   yes: 521  
#>  Median : 0.0000   success: 129             
#>  Mean   : 0.5426   unknown:3705             
#>  3rd Qu.: 0.0000                            
#>  Max.   :25.0000                            
#> 

💡 Insight:

Kasus: - Nasabah berhasil bayar / tidak - Prediktor: - Penghasilan - History pembayaran - Umur - Kewarganegaraan: WNI (semua nasabah WNI) -> tidak berguna bagi pemodelan

4. Feature Selection

Saat membuat model machine learning, kita perlu memilih prediktor-prediktor yang sesuai agar model dapat memberikan prediksi yang baik. Proses ini disebut sebagai feature selection.

🧪 Pendekatan Feature Selection

  • Berdasarkan intuisi bisnis: memasukkan fitur/variabel/atribut yang menurut pandangan bisnis berpengaruh kuat dalam melakukan prediksi.
  • Berdasarkan perhitungan matematika: pemilihan fitur/variabel/atribut yang akan digunakan untuk membuat machine learning didasarkan pada perhitungan matematis.

Dari data bank sebelumnya, mari kita gunakan fungsi nearZeroVar() dari package caret untuk mengeliminasi kolom-kolom yang hampir tidak memiliki variasi nilai.

# kolom-kolom yang hampir tidak memiliki variasi nilai
n0_var <- nearZeroVar(bank)

n0_var
#> [1]  5 14
summary(bank[,n0_var])
#>  default        pdays       
#>  no :4445   Min.   : -1.00  
#>  yes:  76   1st Qu.: -1.00  
#>             Median : -1.00  
#>             Mean   : 39.77  
#>             3rd Qu.: -1.00  
#>             Max.   :871.00
# drop kolom-kolom hasil nearZeroVar()
bank_clean <- bank[,-n0_var]

head(bank_clean)

5. Cross Validation

Lakukan cross validation untuk membagi data menjadi data training (bank_train) dan data testing (bank_test) dengan proporsi 70%:30%.

RNGkind(sample.kind = "Rounding")
set.seed(100)

splitter <- initial_split(data = bank_clean, prop = 0.7)

bank_train <- training(splitter)
bank_test <- testing(splitter)

Cek proporsi kelas di data training.

bank_train$Subscribes %>% table() %>% prop.table()
#> .
#>        no       yes 
#> 0.8827434 0.1172566

88:11

Belum balance

Case : Subscribes Deposito bank atau tidak

6. Handling Imbalanced Data

Apabila diperhatikan, proporsi kelas pada bank_train imbalance. Kondisi imbalance ini pada beberapa kasus dapat menyebabkan model menjadi bias: lebih banyak belajar dari kelas yang mayoritas (no). Untuk mengatasi hal ini, kita dapat melakukan sampling pada data. Terdapat 2 pendekatan sampling: upsampling dan undersampling.

  • Upsampling/Oversampling
    • Menambahkan observasi kelas minoritas hingga seimbang dengan kelas mayoritas dengan menduplikasi data pada kelas minoritas secara acak.
    • Digunakan ketika jumlah data minoritas sedikit.
    • Kekurangan: data duplikat tidak menambah informasi baru.
    • Contoh:
      • Before | 70 : 30
      • After | 70 : 70
    • 🗒️ Note: Apabila data terlalu sedikit dan tidak ingin melakukan upsampling, kita bisa mengumpulkan lebih banyak data untuk kelas minoritasnya.
  • Downsampling/Undersampling:
    • Mengurangi observasi kelas mayoritas hingga seimbang dengan kelas minoritas dengan membuang data dari kelas mayoritas secara acak.
    • Digunakan ketika jumlah data minoritas cukup banyak.
    • Kekurangan: mengurangi informasi dari data yang dimiliki
    • Contoh:
      • Before | 70 : 30
      • After | 30 : 30

❗️ Note: teknik sampling hanya boleh dilakukan pada data training. Data testing diperlakukan sebagai data baru untuk model.

💡 Fungsi untuk Upsampling dan Downsampling

  • upSample() : fungsi untuk upsampling.
  • downSample(): fungsi untuk downsampling.
  • Parameter:
    • x : predictor.
    • y : target.
    • yname : nama kolom target.

Karena perbedaan banyak data antara kelas neg dan pos cukup jauh, kita akan melakukan upsampling.

# upsampling
RNGkind(sample.kind = "Rounding")
set.seed(100)

bank_train_up <- upSample(x = bank_train %>% dplyr::select(-Subscribes),
                          y = bank_train$Subscribes,
                          yname = "SubscribesDeposito")


bank_train_up

Cek proporsi kelas target setelah melakukan upsampling.

bank_train_up$Subscribes %>% table() %>% prop.table()
#> .
#>  no yes 
#> 0.5 0.5

Proporsi kelas target telah seimbang

7.Model Fitting

Untuk membuat model decision tree, dapat digunakan fungsi ctree() dari library partykit.

🧪 Formula: ctree(formula, data)

  • formula = y ~ x
    • y: variabel target.
    • x: variabel prediktor.
  • data: dataframe yang berisikan variabel target dan prediktor.

Lakukan training menggunakan diab_train_up

# membuat model decision tree dengan semua prediktor
bank_tree <- ctree(formula = SubscribesDeposito ~ .,
                   data = bank_train_up)

bank_tree
#> 
#> Model formula:
#> SubscribesDeposito ~ age + job + marital + education + balance + 
#>     housing + loan + contact + day + month + duration + campaign + 
#>     previous + poutcome
#> 
#> Fitted party:
#> [1] root
#> |   [2] duration <= 214
#> |   |   [3] month in apr, dec, jan, mar, oct, sep
#> |   |   |   [4] month in apr, jan
#> |   |   |   |   [5] housing in no
#> |   |   |   |   |   [6] job in admin., management, retired, services, student, technician: yes (n = 97, err = 33.0%)
#> |   |   |   |   |   [7] job in blue-collar, entrepreneur, housemaid, self-employed, unemployed, unknown: no (n = 21, err = 0.0%)
#> |   |   |   |   [8] housing in yes
#> |   |   |   |   |   [9] marital in divorced: yes (n = 16, err = 18.8%)
#> |   |   |   |   |   [10] marital in married, single
#> |   |   |   |   |   |   [11] education in primary, secondary, tertiary: no (n = 93, err = 0.0%)
#> |   |   |   |   |   |   [12] education in unknown: yes (n = 10, err = 10.0%)
#> |   |   |   [13] month in dec, mar, oct, sep
#> |   |   |   |   [14] duration <= 78: no (n = 18, err = 0.0%)
#> |   |   |   |   [15] duration > 78
#> |   |   |   |   |   [16] campaign <= 3
#> |   |   |   |   |   |   [17] education in primary, secondary
#> |   |   |   |   |   |   |   [18] campaign <= 1
#> |   |   |   |   |   |   |   |   [19] balance <= 865
#> |   |   |   |   |   |   |   |   |   [20] marital in divorced, married: yes (n = 15, err = 26.7%)
#> |   |   |   |   |   |   |   |   |   [21] marital in single
#> |   |   |   |   |   |   |   |   |   |   [22] month in mar: yes (n = 12, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [23] month in oct, sep: yes (n = 10, err = 10.0%)
#> |   |   |   |   |   |   |   |   [24] balance > 865
#> |   |   |   |   |   |   |   |   |   [25] balance <= 4608: yes (n = 57, err = 1.8%)
#> |   |   |   |   |   |   |   |   |   [26] balance > 4608: yes (n = 13, err = 15.4%)
#> |   |   |   |   |   |   |   [27] campaign > 1: yes (n = 19, err = 26.3%)
#> |   |   |   |   |   |   [28] education in tertiary, unknown
#> |   |   |   |   |   |   |   [29] month in dec, oct, sep
#> |   |   |   |   |   |   |   |   [30] housing in no: no (n = 12, err = 0.0%)
#> |   |   |   |   |   |   |   |   [31] housing in yes: yes (n = 9, err = 44.4%)
#> |   |   |   |   |   |   |   [32] month in mar: yes (n = 26, err = 7.7%)
#> |   |   |   |   |   [33] campaign > 3: no (n = 7, err = 0.0%)
#> |   |   [34] month in aug, feb, jul, jun, may, nov
#> |   |   |   [35] poutcome in failure, other, unknown
#> |   |   |   |   [36] job in admin., blue-collar, entrepreneur, housemaid, management, retired, self-employed, services, technician, unemployed
#> |   |   |   |   |   [37] month in aug, jul, jun, may, nov
#> |   |   |   |   |   |   [38] poutcome in failure, other
#> |   |   |   |   |   |   |   [39] month in aug, jun, nov
#> |   |   |   |   |   |   |   |   [40] day <= 11
#> |   |   |   |   |   |   |   |   |   [41] marital in divorced, single: no (n = 9, err = 44.4%)
#> |   |   |   |   |   |   |   |   |   [42] marital in married: yes (n = 16, err = 12.5%)
#> |   |   |   |   |   |   |   |   [43] day > 11: no (n = 42, err = 0.0%)
#> |   |   |   |   |   |   |   [44] month in jul, may
#> |   |   |   |   |   |   |   |   [45] contact in cellular: no (n = 107, err = 0.0%)
#> |   |   |   |   |   |   |   |   [46] contact in telephone, unknown: no (n = 11, err = 36.4%)
#> |   |   |   |   |   |   [47] poutcome in unknown
#> |   |   |   |   |   |   |   [48] age <= 28
#> |   |   |   |   |   |   |   |   [49] housing in no
#> |   |   |   |   |   |   |   |   |   [50] month in aug, jul, may, nov
#> |   |   |   |   |   |   |   |   |   |   [51] day <= 5: yes (n = 7, err = 14.3%)
#> |   |   |   |   |   |   |   |   |   |   [52] day > 5: no (n = 21, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   [53] month in jun: yes (n = 14, err = 14.3%)
#> |   |   |   |   |   |   |   |   [54] housing in yes: no (n = 48, err = 0.0%)
#> |   |   |   |   |   |   |   [55] age > 28
#> |   |   |   |   |   |   |   |   [56] education in primary, secondary, tertiary
#> |   |   |   |   |   |   |   |   |   [57] job in admin., blue-collar, entrepreneur, housemaid, management, retired, self-employed, technician, unemployed
#> |   |   |   |   |   |   |   |   |   |   [58] month in aug: no (n = 254, err = 2.0%)
#> |   |   |   |   |   |   |   |   |   |   [59] month in jul, jun, may, nov: no (n = 815, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   [60] job in services
#> |   |   |   |   |   |   |   |   |   |   [61] marital in divorced, married: no (n = 91, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [62] marital in single
#> |   |   |   |   |   |   |   |   |   |   |   [63] day <= 20: no (n = 17, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   |   [64] day > 20: yes (n = 9, err = 11.1%)
#> |   |   |   |   |   |   |   |   [65] education in unknown
#> |   |   |   |   |   |   |   |   |   [66] marital in divorced: yes (n = 13, err = 38.5%)
#> |   |   |   |   |   |   |   |   |   [67] marital in married, single: no (n = 42, err = 0.0%)
#> |   |   |   |   |   [68] month in feb
#> |   |   |   |   |   |   [69] day <= 9: no (n = 79, err = 5.1%)
#> |   |   |   |   |   |   [70] day > 9: yes (n = 31, err = 19.4%)
#> |   |   |   |   [71] job in student, unknown
#> |   |   |   |   |   [72] month in aug, feb, jun, may
#> |   |   |   |   |   |   [73] contact in cellular, unknown: no (n = 21, err = 0.0%)
#> |   |   |   |   |   |   [74] contact in telephone: yes (n = 8, err = 25.0%)
#> |   |   |   |   |   [75] month in jul, nov
#> |   |   |   |   |   |   [76] education in primary, secondary
#> |   |   |   |   |   |   |   [77] poutcome in failure, unknown: yes (n = 22, err = 9.1%)
#> |   |   |   |   |   |   |   [78] poutcome in other: yes (n = 8, err = 0.0%)
#> |   |   |   |   |   |   [79] education in tertiary, unknown: yes (n = 13, err = 30.8%)
#> |   |   |   [80] poutcome in success
#> |   |   |   |   [81] education in primary, secondary, unknown: no (n = 12, err = 25.0%)
#> |   |   |   |   [82] education in tertiary: yes (n = 48, err = 0.0%)
#> |   [83] duration > 214
#> |   |   [84] month in apr, aug, dec, feb, jan, jun, mar, oct, sep
#> |   |   |   [85] poutcome in failure, unknown
#> |   |   |   |   [86] duration <= 457
#> |   |   |   |   |   [87] contact in cellular, telephone
#> |   |   |   |   |   |   [88] month in apr, aug, feb, jan
#> |   |   |   |   |   |   |   [89] loan in no
#> |   |   |   |   |   |   |   |   [90] campaign <= 1
#> |   |   |   |   |   |   |   |   |   [91] job in admin., entrepreneur, management, retired, student, technician, unemployed: yes (n = 144, err = 19.4%)
#> |   |   |   |   |   |   |   |   |   [92] job in blue-collar, services
#> |   |   |   |   |   |   |   |   |   |   [93] poutcome in failure: yes (n = 16, err = 31.2%)
#> |   |   |   |   |   |   |   |   |   |   [94] poutcome in unknown: no (n = 14, err = 0.0%)
#> |   |   |   |   |   |   |   |   [95] campaign > 1
#> |   |   |   |   |   |   |   |   |   [96] duration <= 347
#> |   |   |   |   |   |   |   |   |   |   [97] month in apr
#> |   |   |   |   |   |   |   |   |   |   |   [98] age <= 32: yes (n = 30, err = 10.0%)
#> |   |   |   |   |   |   |   |   |   |   |   [99] age > 32: no (n = 8, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [100] month in aug, feb, jan
#> |   |   |   |   |   |   |   |   |   |   |   [101] contact in cellular: no (n = 73, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   |   [102] contact in telephone: yes (n = 13, err = 15.4%)
#> |   |   |   |   |   |   |   |   |   [103] duration > 347
#> |   |   |   |   |   |   |   |   |   |   [104] job in admin., entrepreneur, management, retired, self-employed, technician
#> |   |   |   |   |   |   |   |   |   |   |   [105] month in apr, jan
#> |   |   |   |   |   |   |   |   |   |   |   |   [106] job in admin., retired: yes (n = 19, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   |   |   [107] job in self-employed, technician: yes (n = 8, err = 12.5%)
#> |   |   |   |   |   |   |   |   |   |   |   [108] month in aug, feb
#> |   |   |   |   |   |   |   |   |   |   |   |   [109] balance <= 538
#> |   |   |   |   |   |   |   |   |   |   |   |   |   [110] marital in divorced: yes (n = 21, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   |   |   |   [111] marital in married, single
#> |   |   |   |   |   |   |   |   |   |   |   |   |   |   [112] job in admin., retired, technician: no (n = 9, err = 44.4%)
#> |   |   |   |   |   |   |   |   |   |   |   |   |   |   [113] job in management, self-employed: yes (n = 18, err = 5.6%)
#> |   |   |   |   |   |   |   |   |   |   |   |   [114] balance > 538: no (n = 18, err = 38.9%)
#> |   |   |   |   |   |   |   |   |   |   [115] job in blue-collar, services, student, unemployed, unknown: no (n = 9, err = 0.0%)
#> |   |   |   |   |   |   |   [116] loan in yes: no (n = 25, err = 0.0%)
#> |   |   |   |   |   |   [117] month in dec, jun, mar, oct, sep
#> |   |   |   |   |   |   |   [118] balance <= 5958: yes (n = 236, err = 5.5%)
#> |   |   |   |   |   |   |   [119] balance > 5958: yes (n = 13, err = 46.2%)
#> |   |   |   |   |   [120] contact in unknown
#> |   |   |   |   |   |   [121] age <= 28: yes (n = 10, err = 30.0%)
#> |   |   |   |   |   |   [122] age > 28: no (n = 68, err = 0.0%)
#> |   |   |   |   [123] duration > 457
#> |   |   |   |   |   [124] loan in no
#> |   |   |   |   |   |   [125] job in admin., blue-collar, housemaid, services, unemployed, unknown: yes (n = 226, err = 19.0%)
#> |   |   |   |   |   |   [126] job in entrepreneur, management, retired, self-employed, student, technician: yes (n = 461, err = 6.7%)
#> |   |   |   |   |   [127] loan in yes
#> |   |   |   |   |   |   [128] duration <= 757: no (n = 9, err = 0.0%)
#> |   |   |   |   |   |   [129] duration > 757: yes (n = 13, err = 7.7%)
#> |   |   |   [130] poutcome in other, success
#> |   |   |   |   [131] poutcome in other
#> |   |   |   |   |   [132] month in apr, feb, jan, sep
#> |   |   |   |   |   |   [133] day <= 20
#> |   |   |   |   |   |   |   [134] duration <= 491: no (n = 10, err = 0.0%)
#> |   |   |   |   |   |   |   [135] duration > 491: yes (n = 15, err = 6.7%)
#> |   |   |   |   |   |   [136] day > 20
#> |   |   |   |   |   |   |   [137] duration <= 268: yes (n = 14, err = 0.0%)
#> |   |   |   |   |   |   |   [138] duration > 268: yes (n = 12, err = 16.7%)
#> |   |   |   |   |   [139] month in aug, dec, jun, oct
#> |   |   |   |   |   |   [140] job in admin., housemaid, retired, technician: yes (n = 49, err = 0.0%)
#> |   |   |   |   |   |   [141] job in management, services: yes (n = 24, err = 8.3%)
#> |   |   |   |   [142] poutcome in success
#> |   |   |   |   |   [143] duration <= 847: yes (n = 296, err = 1.7%)
#> |   |   |   |   |   [144] duration > 847: yes (n = 8, err = 25.0%)
#> |   |   [145] month in jul, may, nov
#> |   |   |   [146] duration <= 435
#> |   |   |   |   [147] job in admin., entrepreneur, housemaid, management, retired, services, student, technician, unemployed
#> |   |   |   |   |   [148] month in jul, nov
#> |   |   |   |   |   |   [149] poutcome in failure, other, success
#> |   |   |   |   |   |   |   [150] job in admin., housemaid, technician: no (n = 14, err = 28.6%)
#> |   |   |   |   |   |   |   [151] job in management, retired, services, unemployed
#> |   |   |   |   |   |   |   |   [152] campaign <= 3
#> |   |   |   |   |   |   |   |   |   [153] job in management, retired: yes (n = 56, err = 1.8%)
#> |   |   |   |   |   |   |   |   |   [154] job in services, unemployed: yes (n = 12, err = 16.7%)
#> |   |   |   |   |   |   |   |   [155] campaign > 3: yes (n = 7, err = 42.9%)
#> |   |   |   |   |   |   [156] poutcome in unknown
#> |   |   |   |   |   |   |   [157] job in admin., housemaid, retired
#> |   |   |   |   |   |   |   |   [158] month in jul
#> |   |   |   |   |   |   |   |   |   [159] contact in cellular: no (n = 22, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   [160] contact in telephone: yes (n = 25, err = 12.0%)
#> |   |   |   |   |   |   |   |   [161] month in nov: yes (n = 31, err = 3.2%)
#> |   |   |   |   |   |   |   [162] job in entrepreneur, management, services, technician, unemployed
#> |   |   |   |   |   |   |   |   [163] marital in divorced, married
#> |   |   |   |   |   |   |   |   |   [164] education in primary: no (n = 8, err = 50.0%)
#> |   |   |   |   |   |   |   |   |   [165] education in secondary, tertiary, unknown
#> |   |   |   |   |   |   |   |   |   |   [166] contact in cellular, unknown: no (n = 54, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [167] contact in telephone: no (n = 12, err = 33.3%)
#> |   |   |   |   |   |   |   |   [168] marital in single
#> |   |   |   |   |   |   |   |   |   [169] education in secondary
#> |   |   |   |   |   |   |   |   |   |   [170] balance <= 318: no (n = 12, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [171] balance > 318
#> |   |   |   |   |   |   |   |   |   |   |   [172] job in entrepreneur, services: yes (n = 11, err = 9.1%)
#> |   |   |   |   |   |   |   |   |   |   |   [173] job in management, technician: yes (n = 27, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   [174] education in tertiary: no (n = 7, err = 0.0%)
#> |   |   |   |   |   [175] month in may
#> |   |   |   |   |   |   [176] job in admin., management, services, technician, unemployed
#> |   |   |   |   |   |   |   [177] contact in cellular
#> |   |   |   |   |   |   |   |   [178] age <= 27: yes (n = 16, err = 6.2%)
#> |   |   |   |   |   |   |   |   [179] age > 27
#> |   |   |   |   |   |   |   |   |   [180] day <= 4: yes (n = 15, err = 6.7%)
#> |   |   |   |   |   |   |   |   |   [181] day > 4: no (n = 46, err = 0.0%)
#> |   |   |   |   |   |   |   [182] contact in telephone, unknown: no (n = 98, err = 0.0%)
#> |   |   |   |   |   |   [183] job in entrepreneur, retired, student
#> |   |   |   |   |   |   |   [184] duration <= 352
#> |   |   |   |   |   |   |   |   [185] day <= 15: no (n = 14, err = 0.0%)
#> |   |   |   |   |   |   |   |   [186] day > 15: no (n = 8, err = 50.0%)
#> |   |   |   |   |   |   |   [187] duration > 352
#> |   |   |   |   |   |   |   |   [188] education in primary: yes (n = 23, err = 0.0%)
#> |   |   |   |   |   |   |   |   [189] education in secondary, tertiary, unknown: yes (n = 8, err = 25.0%)
#> |   |   |   |   [190] job in blue-collar, self-employed, unknown
#> |   |   |   |   |   [191] poutcome in failure, other, unknown
#> |   |   |   |   |   |   [192] marital in divorced, married: no (n = 120, err = 0.0%)
#> |   |   |   |   |   |   [193] marital in single: no (n = 35, err = 14.3%)
#> |   |   |   |   |   [194] poutcome in success: yes (n = 7, err = 14.3%)
#> |   |   |   [195] duration > 435
#> |   |   |   |   [196] job in admin., blue-collar, housemaid, management, retired, self-employed, services, student, technician
#> |   |   |   |   |   [197] duration <= 638
#> |   |   |   |   |   |   [198] job in admin., blue-collar, housemaid, management, services, technician
#> |   |   |   |   |   |   |   [199] previous <= 3
#> |   |   |   |   |   |   |   |   [200] education in primary, tertiary
#> |   |   |   |   |   |   |   |   |   [201] job in admin., management, services, technician
#> |   |   |   |   |   |   |   |   |   |   [202] marital in divorced, married
#> |   |   |   |   |   |   |   |   |   |   |   [203] month in jul, may: yes (n = 59, err = 10.2%)
#> |   |   |   |   |   |   |   |   |   |   |   [204] month in nov: yes (n = 20, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [205] marital in single: no (n = 18, err = 44.4%)
#> |   |   |   |   |   |   |   |   |   [206] job in blue-collar, housemaid
#> |   |   |   |   |   |   |   |   |   |   [207] duration <= 574: no (n = 14, err = 0.0%)
#> |   |   |   |   |   |   |   |   |   |   [208] duration > 574: yes (n = 12, err = 8.3%)
#> |   |   |   |   |   |   |   |   [209] education in secondary, unknown
#> |   |   |   |   |   |   |   |   |   [210] marital in divorced, married
#> |   |   |   |   |   |   |   |   |   |   [211] poutcome in failure, success: yes (n = 15, err = 40.0%)
#> |   |   |   |   |   |   |   |   |   |   [212] poutcome in other, unknown: no (n = 38, err = 15.8%)
#> |   |   |   |   |   |   |   |   |   [213] marital in single: yes (n = 55, err = 23.6%)
#> |   |   |   |   |   |   |   [214] previous > 3
#> |   |   |   |   |   |   |   |   [215] job in admin., technician: yes (n = 28, err = 0.0%)
#> |   |   |   |   |   |   |   |   [216] job in management, services: yes (n = 16, err = 18.8%)
#> |   |   |   |   |   |   [217] job in retired, self-employed, student: no (n = 13, err = 0.0%)
#> |   |   |   |   |   [218] duration > 638
#> |   |   |   |   |   |   [219] marital in divorced, single
#> |   |   |   |   |   |   |   [220] education in primary, unknown
#> |   |   |   |   |   |   |   |   [221] day <= 15: yes (n = 32, err = 3.1%)
#> |   |   |   |   |   |   |   |   [222] day > 15: yes (n = 12, err = 33.3%)
#> |   |   |   |   |   |   |   [223] education in secondary, tertiary: yes (n = 221, err = 6.3%)
#> |   |   |   |   |   |   [224] marital in married
#> |   |   |   |   |   |   |   [225] poutcome in failure, unknown: yes (n = 208, err = 24.5%)
#> |   |   |   |   |   |   |   [226] poutcome in other, success: yes (n = 28, err = 0.0%)
#> |   |   |   |   [227] job in entrepreneur, unemployed: no (n = 27, err = 29.6%)
#> 
#> Number of inner nodes:    113
#> Number of terminal nodes: 114

Decision tree menghasilkan struktur pohon untuk melakukan prediksi. Hal ini menjadikan decision tree sebagai model robust yang cukup interpretable.

# visualisasi decision tree
plot(bank_tree, type = "simple")

8. Model Prediction and Evaluation

Dari model yang sudah dibuat sebelumnya, mari kita lakukan prediksi pada data testing menggunakan fungsi predict().

🧪 Formula: predict(object, newdata, type)

  • object: model decision tree.
  • newdata: data baru.
  • type: bentuk prediksi.
    • type = "prob" mengeluarkan peluang untuk masing-masing kelas.
    • type = "response" mengeluarkan label kelas.
# prediksi label kelas data testing
pred_test <- predict(object = bank_tree, 
                     newdata = bank_test,
                     type = "response")
pred_test[0:5]
#>   1   2   3   4   5 
#> yes  no  no  no  no 
#> Levels: no yes

Selanjutnya, kita bisa mengevaluasi hasil prediksi model dengan confusionMatrix().

Fungsi: confusionMatrix(data, reference)

  • data: Data prediksi
  • reference: Data aktual
  • positive: Kelas positif
# confusion matrix untuk prediksi label kelas data testing 
confusionMatrix(data = pred_test,
                reference = bank_test$Subscribes,
                positive = "yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  no yes
#>        no  974  35
#>        yes 233 115
#>                                              
#>                Accuracy : 0.8025             
#>                  95% CI : (0.7803, 0.8234)   
#>     No Information Rate : 0.8895             
#>     P-Value [Acc > NIR] : 1                  
#>                                              
#>                   Kappa : 0.3635             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.76667            
#>             Specificity : 0.80696            
#>          Pos Pred Value : 0.33046            
#>          Neg Pred Value : 0.96531            
#>              Prevalence : 0.11054            
#>          Detection Rate : 0.08475            
#>    Detection Prevalence : 0.25645            
#>       Balanced Accuracy : 0.78681            
#>                                              
#>        'Positive' Class : yes                
#> 

❓ Q: Metrics apakah yang cocok untuk digunakan pada kasus ini?

Kelas: - Yes (positive) - No (negative)

  • False Negative (recall): Diprediksi tidak subscribes padahal sebenarnya dia subscribes -> Tidak tercatat dipencapaian

  • False Positive (precision): Diprediksi subscribes padahal sebenarnya tidak subscribes

Minimalisir: Recall

9. Considerations on Decision Tree: Pruning and Tree-size

Kekurangan dari decision tree adalah kecenderungannya untuk overfitting. Overfitting adalah kondisi di mana data cenderung menghafal pola data training.

  • Training: Data training -> model menghafal pola
  • Prediction: Data testing -> performa model tidak terlalu baik

Hal ini terjadi karena decision tree melakukan percabangan data hingga amat detail (bahkan hingga dalam leaf node hanya terdapat 1 observasi). Pada keadaan ini, decision tree justru menghafal pola pada data training dan membuat aturan yang terlalu kompleks. Bukan mempelajari pola tersebut. Alhasil, model memiliki kemampuan generalisasi pola data yang rendah dan kurang bisa memprediksi data baru.

Indikasi overfitting: performa tinggi pada data training dan rendah pada data testing. Perbedaan performanya cukup signifikan.

Perhatikan model di bawah ini yang terlalu kompleks.

bank_tree_complex <- ctree(formula = SubscribesDeposito ~ ., 
                            data = bank_train_up,
                            control = ctree_control(mincriterion = 0.05, 
                                                    minsplit = 5,
                                                    minbucket = 5))
plot(bank_tree_complex, type='simple')

Selanjutnya, kita akan melakukan evaluasi model pada data training dan data testing.

# prediksi kelas di data training
pred_complex_train <- predict(bank_tree_complex, 
                           bank_train_up, 
                           type = "response")

# confusion matrix data train
confusionMatrix(pred_complex_train, 
                bank_train_up$SubscribesDeposito, 
                positive = "yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   no  yes
#>        no  2548   19
#>        yes  245 2774
#>                                                
#>                Accuracy : 0.9527               
#>                  95% CI : (0.9468, 0.9582)     
#>     No Information Rate : 0.5                  
#>     P-Value [Acc > NIR] : < 0.00000000000000022
#>                                                
#>                   Kappa : 0.9055               
#>                                                
#>  Mcnemar's Test P-Value : < 0.00000000000000022
#>                                                
#>             Sensitivity : 0.9932               
#>             Specificity : 0.9123               
#>          Pos Pred Value : 0.9188               
#>          Neg Pred Value : 0.9926               
#>              Prevalence : 0.5000               
#>          Detection Rate : 0.4966               
#>    Detection Prevalence : 0.5405               
#>       Balanced Accuracy : 0.9527               
#>                                                
#>        'Positive' Class : yes                  
#> 
# prediksi kelas di data testing 
pred_complex_test <- predict(bank_tree_complex, 
                     bank_test, 
                     type = "response")

# confusion matrix data testing
confusionMatrix(pred_complex_test, 
                bank_test$Subscribes, 
                positive = "yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  no yes
#>        no  998  45
#>        yes 209 105
#>                                              
#>                Accuracy : 0.8128             
#>                  95% CI : (0.791, 0.8332)    
#>     No Information Rate : 0.8895             
#>     P-Value [Acc > NIR] : 1                  
#>                                              
#>                   Kappa : 0.3563             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.70000            
#>             Specificity : 0.82684            
#>          Pos Pred Value : 0.33439            
#>          Neg Pred Value : 0.95686            
#>              Prevalence : 0.11054            
#>          Detection Rate : 0.07738            
#>    Detection Prevalence : 0.23139            
#>       Balanced Accuracy : 0.76342            
#>                                              
#>        'Positive' Class : yes                
#> 
  • Recall data training: 91.23%
  • Recall data testing: 33.43%

Indikasi overfitting -> perbedaan performa di data train & testing cukup jauh. - Tidak aturan khusus berapa perbedaan train-test hingga dikatakan overfitting - Jika perbedaannya lebih besar dari 10%, dikatakan overfitting.

Untuk mengatasinya, decision tree perlu tahu kapan ia berhenti membuat cabang. Dengan demikian, pohon yang dihasilkan tidak terlalu kompleks. Pemotongan/pencegahan cabang pohon disebut pruning. Pruning terbagi menjadi 2 cara:

  • Pre-pruning: pencegahan pembuatan cabang (memasang parameter sebelum model dihasilkan).
  • Post-pruning: pemangkasan cabang (memotong cabang setelah model terbentuk).

Mari kita lakukan tuning terhadap model decision tree dengan mengubah beberapa nilai berikut:

  • Parameter mincriterion
    • Parameter ini menunjukkan nilai (1 - p-value) yang harus dilewati agar proses percabangan dapat terjadi.
    • Contoh, saat mincriterion = 0.95, nilai p-value harus < 0.05 agar sebuah node dapat membuat cabang (berkaitan dengan signifikansi prediktor).
    • Semakin besar parameter ini, semakin besar tingkat signifikansi sebuah prediktor agar dapat dilakukan percabangan.
    • Default: mincriterion = 0.95.
  • Parameter minsplit
    • Parameter ini mengatur banyak observasi minimal pada node agar percabangan dapat terjadi.
    • Default: minsplit = 20.
  • Parameter minbucket
    • Parameter ini mengatur banyak observasi minimal pada node setelah terjadi percabangan.
    • Default: minbucket = 7.

💡️ Tips: semakin besar ketiga parameter di atas, semakin sederhana decision tree yang dihasilkan (semakin ketat kriteria dalam melakukan percabangan).

# tuning model decision tree
tree_bank_tuned <- ctree(formula = SubscribesDeposito ~ .,
                         data = bank_train_up,
                         control = ctree_control(mincriterion = 0.95,
                                                 minsplit = 30,
                                                 minbucket = 10))
# visualisasi decision tree hasil tuning
plot(tree_bank_tuned , type = "simple")

Mari kita coba evaluasi model hasil tuning sebelumnya pada data training dan data testing.

# prediksi kelas di data training
# Pak Supratno
pred_tuned_train <- predict(tree_bank_tuned, 
                           bank_train_up, 
                           type = "response")


# confusion matrix data training
confusionMatrix(pred_tuned_train,
                bank_train_up$Subscribes,
                positive = "yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   no  yes
#>        no  2411  124
#>        yes  382 2669
#>                                                
#>                Accuracy : 0.9094               
#>                  95% CI : (0.9016, 0.9168)     
#>     No Information Rate : 0.5                  
#>     P-Value [Acc > NIR] : < 0.00000000000000022
#>                                                
#>                   Kappa : 0.8188               
#>                                                
#>  Mcnemar's Test P-Value : < 0.00000000000000022
#>                                                
#>             Sensitivity : 0.9556               
#>             Specificity : 0.8632               
#>          Pos Pred Value : 0.8748               
#>          Neg Pred Value : 0.9511               
#>              Prevalence : 0.5000               
#>          Detection Rate : 0.4778               
#>    Detection Prevalence : 0.5462               
#>       Balanced Accuracy : 0.9094               
#>                                                
#>        'Positive' Class : yes                  
#> 
# prediksi kelas di data testing
# Pak Supratno
pred_tuned_test <- predict(tree_bank_tuned, 
                           bank_test, 
                           type = "response")


# confusion matrix data testing
confusionMatrix(pred_tuned_test,
                bank_test$Subscribes,
                positive = "yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  no yes
#>        no  958  32
#>        yes 249 118
#>                                              
#>                Accuracy : 0.7929             
#>                  95% CI : (0.7704, 0.8142)   
#>     No Information Rate : 0.8895             
#>     P-Value [Acc > NIR] : 1                  
#>                                              
#>                   Kappa : 0.3553             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.78667            
#>             Specificity : 0.79370            
#>          Pos Pred Value : 0.32153            
#>          Neg Pred Value : 0.96768            
#>              Prevalence : 0.11054            
#>          Detection Rate : 0.08696            
#>    Detection Prevalence : 0.27045            
#>       Balanced Accuracy : 0.79019            
#>                                              
#>        'Positive' Class : yes                
#> 

💡10. Insight:

  • Recall train: 87%
  • Recall test: 31.97%

Apakah model overfit? Masih, karena perbedaannya lebih dari 10% ToDo: - Coba untuk lebih menyederhanakan model kita. - Telah puas dengan recall 80%, bisa menggunakan modelnya.