R Markdown

Pada kali ini, saya ingin memprediksi apakah client sudah berlangganan deposito berjangka. disini kita ambil dataset dari https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

Disini kita menggunakan model keras RNN

library(dplyr)

library(keras)

library(caret)
library(recipes)
library(ROSE)
library(rsample)

library(tensorflow)
reticulate::use_python(python = "C:/Users/LENOVO/anaconda3/envs/r-tensorflow-gpu/python.exe",required=TRUE)

Pertama-tama kita akan membaca file yang berbentuk csv dan juga menggunakan seperator “;”

data <- read.csv("dataset/bank-full.csv", sep = ";")

glimpse(data)
## Rows: 45,211
## Columns: 17
## $ age       <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, ~
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "~
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie~
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", ~
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",~
## $ balance   <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71~
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"~
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"~
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn~
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ~
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may~
## $ duration  <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,~
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ~
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn~
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~

Di data tersebut, ternyata tidak ada data yang NA

anyNA(data)
## [1] FALSE

Disini kita menyesuaikan tipe columnya yang belum sesuai

df <- data %>% 
  mutate(job = as.factor(job),
         marital = as.factor(job),
         education = as.factor(education),
         default = as.factor(default),
         housing = as.factor(housing),
         loan = as.factor(loan),
         contact = as.factor(contact),
         month = as.factor(month),
         poutcome = as.factor(poutcome))

head(df)
##   age          job      marital education default balance housing loan contact
## 1  58   management   management  tertiary      no    2143     yes   no unknown
## 2  44   technician   technician secondary      no      29     yes   no unknown
## 3  33 entrepreneur entrepreneur secondary      no       2     yes  yes unknown
## 4  47  blue-collar  blue-collar   unknown      no    1506     yes   no unknown
## 5  33      unknown      unknown   unknown      no       1      no   no unknown
## 6  35   management   management  tertiary      no     231     yes   no unknown
##   day month duration campaign pdays previous poutcome  y
## 1   5   may      261        1    -1        0  unknown no
## 2   5   may      151        1    -1        0  unknown no
## 3   5   may       76        1    -1        0  unknown no
## 4   5   may       92        1    -1        0  unknown no
## 5   5   may      198        1    -1        0  unknown no
## 6   5   may      139        1    -1        0  unknown no

Setelah tipe data sudah sesuai, kita akan membuat obj untuk scaling dan membuat dummy dengan menggunakan function recipe

rec_obj <- recipe(y ~ ., data = df) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  prep(df)

setelah kita membuat object , kita akan mentransform datanya sesuai dengan transform dari obj recipe sebelumnya

df_transform <- bake(rec_obj, df)

head(df_transform)
## # A tibble: 6 x 52
##      age balance   day duration campaign  pdays previous y     job_blue.collar
##    <dbl>   <dbl> <dbl>    <dbl>    <dbl>  <dbl>    <dbl> <fct>           <dbl>
## 1  1.61   0.256  -1.30   0.0110   -0.569 -0.411   -0.252 no                  0
## 2  0.289 -0.438  -1.30  -0.416    -0.569 -0.411   -0.252 no                  0
## 3 -0.747 -0.447  -1.30  -0.707    -0.569 -0.411   -0.252 no                  0
## 4  0.571  0.0472 -1.30  -0.645    -0.569 -0.411   -0.252 no                  1
## 5 -0.747 -0.447  -1.30  -0.234    -0.569 -0.411   -0.252 no                  0
## 6 -0.559 -0.372  -1.30  -0.463    -0.569 -0.411   -0.252 no                  0
## # ... with 43 more variables: job_entrepreneur <dbl>, job_housemaid <dbl>,
## #   job_management <dbl>, job_retired <dbl>, job_self.employed <dbl>,
## #   job_services <dbl>, job_student <dbl>, job_technician <dbl>,
## #   job_unemployed <dbl>, job_unknown <dbl>, marital_blue.collar <dbl>,
## #   marital_entrepreneur <dbl>, marital_housemaid <dbl>,
## #   marital_management <dbl>, marital_retired <dbl>,
## #   marital_self.employed <dbl>, marital_services <dbl>, marital_student <dbl>,
## #   marital_technician <dbl>, marital_unemployed <dbl>, marital_unknown <dbl>,
## #   education_secondary <dbl>, education_tertiary <dbl>,
## #   education_unknown <dbl>, default_yes <dbl>, housing_yes <dbl>,
## #   loan_yes <dbl>, contact_telephone <dbl>, contact_unknown <dbl>,
## #   month_aug <dbl>, month_dec <dbl>, month_feb <dbl>, month_jan <dbl>,
## #   month_jul <dbl>, month_jun <dbl>, month_mar <dbl>, month_may <dbl>,
## #   month_nov <dbl>, month_oct <dbl>, month_sep <dbl>, poutcome_other <dbl>,
## #   poutcome_success <dbl>, poutcome_unknown <dbl>

Setelah itu, kita akan melakukan spliting data menjadi training dan testing sebesar 0.8

set.seed(2021)

split <- initial_split(data = df_transform, prop = 0.8)

train <- training(split)
test <- testing(split)

Disini kita cek apakah data train kita targetnya sudah balanced

table(as.factor(train$y)) %>% prop.table()
## 
##        no       yes 
## 0.8822716 0.1177284

Setelah kita cek sebelumnya, ternyata data target dari train itu belum balanced, maka dari itu kita akan lakukan balancing terhadapt data target menggunakan ovun.sample dan menggunakan method upsample dan downsample (both)

train_balanced <- ovun.sample(y ~ ., data = train, method = "both", N = nrow(train))

table(as.factor(train_balanced$data$y)) %>% prop.table()
## 
##        no       yes 
## 0.5015483 0.4984517

setelah kita rasa data sudah balanced, maka kita akan membuat funtion untuk merubah target menjadi angka dan merubah kembali

encode_label <- function(x){
  ifelse(x == "no", 1, 0)
}

decode_label <- function(x){
  ifelse(x == 1, "no", "yes")
}

setelah itu, kita akan memecah predictor dan target, dan merubahnya sesuai kebutuhan

train_x <- train_balanced$data %>% select(-y) %>% as.matrix()
test_x <- test %>% select(-y) %>% as.matrix()

train_x <- array_reshape(train_x, dim(train_x))
test_x <- array_reshape(test_x, dim(test_x))

train_y <- sapply(train_balanced$data$y, encode_label)
test_y <- sapply(test$y, encode_label)

train_y <- to_categorical(train_y)
test_y <- to_categorical(test_y)

n_input <- ncol(train_x)
n_output <- ncol(train_y)

Sekarang kita membuat arsitektur NN nya dengan menggunakan hidel_layer 3x, dan output_layer 1x. untuk seluruh hiden_layer kita menggunakan units 64 dan activation tanh

untuk optimizer kira menggunakan optimizer adam dengan learning rate default yaitu 0.0001

model_base <- keras_model_sequential(name = "model_base") %>% 
  layer_dense(units = 64,
              input_shape = n_input,
              activation = "tanh",
              name = "layer1") %>% 
  layer_dense(units = 64,
              activation = "tanh",
              name = "layer2") %>% 
  layer_dense(units = 64,
              activation = "tanh",
              name = "layer3") %>% 
  layer_dense(units = n_output,
              activation = "sigmoid",
              name = "output")

model_base %>% 
  compile(loss = "categorical_crossentropy",
          metrics = "accuracy",
          optimizer = optimizer_adam(learning_rate = 0.0001))

Untuk training model, kita memakai epoch 15 dan batch size sebesar 64

set.seed(2021)

history_base <- model_base %>% 
  fit(x = train_x,
      y= train_y,
      epoch = 10,
      batch_size = 64,
      validation_data = list(test_x, test_y))

plot(history_base)
## `geom_smooth()` using formula 'y ~ x'

Setelah model belajar, maka kita dapat melakukan prediksi menggunakan model tersebut.

pred_test <- predict(model_base, test_x) %>% k_argmax() %>% as.array()

dari hasi prediksi, maka kita harus merubah hasil tersebut menjadi category seperti di awal.

pred_test <- sapply(pred_test, decode_label)

setelah dirubah maka kita akan menjalankan confusionMatrix untuk melihat accuracy, recall, dan precision

confusionMatrix(as.factor(pred_test), as.factor(test$y), positive = "no")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6531  139
##        yes 1481  892
##                                           
##                Accuracy : 0.8209          
##                  95% CI : (0.8128, 0.8287)
##     No Information Rate : 0.886           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4341          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8152          
##             Specificity : 0.8652          
##          Pos Pred Value : 0.9792          
##          Neg Pred Value : 0.3759          
##              Prevalence : 0.8860          
##          Detection Rate : 0.7222          
##    Detection Prevalence : 0.7376          
##       Balanced Accuracy : 0.8402          
##                                           
##        'Positive' Class : no              
## 

Dari hasil yang kita dapatkan, model tersebut menghasilkan accuracy yang cukup bagus di atas 80% Dan model ini juga masih dapat di improve kedepannya.