R Markdown

Case yang kita ambil disini ialah seorang manager ingin mengetahui apakah customer nya akan pindah ke tempat lain, atau tetap stay. Disini saya menggunakan data dari link https://www.kaggle.com/sakshigoyal7/credit-card-customers

library(dplyr)

library(keras)

library(caret)
library(recipes)
library(ROSE)
library(rsample)

library(tensorflow)
reticulate::use_python(python = "C:/Users/LENOVO/anaconda3/envs/r-tensorflow-gpu/python.exe",required=TRUE)

Pertama-tama kita akan membaca file yang berbentuk csv

data <- read.csv("dataset/BankChurners.csv")

glimpse(data)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM                                                                                                                          <int> ~
## $ Attrition_Flag                                                                                                                     <chr> ~
## $ Customer_Age                                                                                                                       <int> ~
## $ Gender                                                                                                                             <chr> ~
## $ Dependent_count                                                                                                                    <int> ~
## $ Education_Level                                                                                                                    <chr> ~
## $ Marital_Status                                                                                                                     <chr> ~
## $ Income_Category                                                                                                                    <chr> ~
## $ Card_Category                                                                                                                      <chr> ~
## $ Months_on_book                                                                                                                     <int> ~
## $ Total_Relationship_Count                                                                                                           <int> ~
## $ Months_Inactive_12_mon                                                                                                             <int> ~
## $ Contacts_Count_12_mon                                                                                                              <int> ~
## $ Credit_Limit                                                                                                                       <dbl> ~
## $ Total_Revolving_Bal                                                                                                                <int> ~
## $ Avg_Open_To_Buy                                                                                                                    <dbl> ~
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> ~
## $ Total_Trans_Amt                                                                                                                    <int> ~
## $ Total_Trans_Ct                                                                                                                     <int> ~
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> ~
## $ Avg_Utilization_Ratio                                                                                                              <dbl> ~
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> ~
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> ~

Disini kita membuang column yang tidak kita butuhkan

df <- data %>% 
  select(-c(Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
           Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2,
           CLIENTNUM))
glimpse(df)
## Rows: 10,127
## Columns: 20
## $ Attrition_Flag           <chr> "Existing Customer", "Existing Customer", "Ex~
## $ Customer_Age             <int> 45, 49, 51, 40, 40, 44, 51, 32, 37, 48, 42, 6~
## $ Gender                   <chr> "M", "F", "M", "F", "M", "M", "M", "M", "M", ~
## $ Dependent_count          <int> 3, 5, 3, 4, 3, 2, 4, 0, 3, 2, 5, 1, 1, 3, 2, ~
## $ Education_Level          <chr> "High School", "Graduate", "Graduate", "High ~
## $ Marital_Status           <chr> "Married", "Single", "Married", "Unknown", "M~
## $ Income_Category          <chr> "$60K - $80K", "Less than $40K", "$80K - $120~
## $ Card_Category            <chr> "Blue", "Blue", "Blue", "Blue", "Blue", "Blue~
## $ Months_on_book           <int> 39, 44, 36, 34, 21, 36, 46, 27, 36, 36, 31, 5~
## $ Total_Relationship_Count <int> 5, 6, 4, 3, 5, 3, 6, 2, 5, 6, 5, 6, 3, 5, 5, ~
## $ Months_Inactive_12_mon   <int> 1, 1, 1, 4, 1, 1, 1, 2, 2, 3, 3, 2, 6, 1, 2, ~
## $ Contacts_Count_12_mon    <int> 3, 2, 0, 1, 0, 2, 3, 2, 0, 3, 2, 3, 0, 3, 2, ~
## $ Credit_Limit             <dbl> 12691.0, 8256.0, 3418.0, 3313.0, 4716.0, 4010~
## $ Total_Revolving_Bal      <int> 777, 864, 0, 2517, 0, 1247, 2264, 1396, 2517,~
## $ Avg_Open_To_Buy          <dbl> 11914.0, 7392.0, 3418.0, 796.0, 4716.0, 2763.~
## $ Total_Amt_Chng_Q4_Q1     <dbl> 1.335, 1.541, 2.594, 1.405, 2.175, 1.376, 1.9~
## $ Total_Trans_Amt          <int> 1144, 1291, 1887, 1171, 816, 1088, 1330, 1538~
## $ Total_Trans_Ct           <int> 42, 33, 20, 20, 28, 24, 31, 36, 24, 32, 42, 2~
## $ Total_Ct_Chng_Q4_Q1      <dbl> 1.625, 3.714, 2.333, 2.333, 2.500, 0.846, 0.7~
## $ Avg_Utilization_Ratio    <dbl> 0.061, 0.105, 0.000, 0.760, 0.000, 0.311, 0.0~

Di data tersebut, ternyata tidak ada data yang NA

anyNA(df)
## [1] FALSE

Disini kita menyesuaikan tipe columnya yang belum sesuai

df_clean <- df %>% 
  mutate(Gender = as.factor(Gender),
         Education_Level = as.factor(Education_Level),
         Marital_Status = as.factor(Marital_Status),
         Income_Category = as.factor(Income_Category),
         Card_Category = as.factor(Card_Category),
         Attrition_Flag = as.factor(Attrition_Flag)
         )
str(df_clean)
## 'data.frame':    10127 obs. of  20 variables:
##  $ Attrition_Flag          : Factor w/ 2 levels "Attrited Customer",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Customer_Age            : int  45 49 51 40 40 44 51 32 37 48 ...
##  $ Gender                  : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 2 2 2 2 ...
##  $ Dependent_count         : int  3 5 3 4 3 2 4 0 3 2 ...
##  $ Education_Level         : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
##  $ Marital_Status          : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
##  $ Income_Category         : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
##  $ Card_Category           : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
##  $ Months_on_book          : int  39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count: int  5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon  : int  1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon   : int  3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit            : num  12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal     : int  777 864 0 2517 0 1247 2264 1396 2517 1677 ...
##  $ Avg_Open_To_Buy         : num  11914 7392 3418 796 4716 ...
##  $ Total_Amt_Chng_Q4_Q1    : num  1.33 1.54 2.59 1.4 2.17 ...
##  $ Total_Trans_Amt         : int  1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
##  $ Total_Trans_Ct          : int  42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1     : num  1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio   : num  0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...

Setelah tipe data sudah sesuai, kita akan membuat obj untuk scaling dan membuat dummy dengan menggunakan function recipe

rec_obj <- recipe(Attrition_Flag ~ ., data = df_clean) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  prep(data = df_clean)

rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         19
## 
## Training data contained 10127 data points and no missing data.
## 
## Operations:
## 
## Centering for Customer_Age, Dependent_count, ... [trained]
## Scaling for Customer_Age, Dependent_count, ... [trained]
## Dummy variables from Gender, Education_Level, Marital_Status, ... [trained]

setelah kita membuat object , kita akan mentransform datanya sesuai dengan transform dari obj recipe sebelumnya

df_transform <- bake(rec_obj, df_clean)

head(df_transform)
## # A tibble: 6 x 33
##   Customer_Age Dependent_count Months_on_book Total_Relationsh~ Months_Inactive~
##          <dbl>           <dbl>          <dbl>             <dbl>            <dbl>
## 1       -0.165           0.503        0.385               0.764            -1.33
## 2        0.334           2.04         1.01                1.41             -1.33
## 3        0.583           0.503        0.00896             0.121            -1.33
## 4       -0.789           1.27        -0.241              -0.523             1.64
## 5       -0.789           0.503       -1.87                0.764            -1.33
## 6       -0.290          -0.267        0.00896            -0.523            -1.33
## # ... with 28 more variables: Contacts_Count_12_mon <dbl>, Credit_Limit <dbl>,
## #   Total_Revolving_Bal <dbl>, Avg_Open_To_Buy <dbl>,
## #   Total_Amt_Chng_Q4_Q1 <dbl>, Total_Trans_Amt <dbl>, Total_Trans_Ct <dbl>,
## #   Total_Ct_Chng_Q4_Q1 <dbl>, Avg_Utilization_Ratio <dbl>,
## #   Attrition_Flag <fct>, Gender_M <dbl>, Education_Level_Doctorate <dbl>,
## #   Education_Level_Graduate <dbl>, Education_Level_High.School <dbl>,
## #   Education_Level_Post.Graduate <dbl>, Education_Level_Uneducated <dbl>,
## #   Education_Level_Unknown <dbl>, Marital_Status_Married <dbl>,
## #   Marital_Status_Single <dbl>, Marital_Status_Unknown <dbl>,
## #   Income_Category_X.40K....60K <dbl>, Income_Category_X.60K....80K <dbl>,
## #   Income_Category_X.80K....120K <dbl>, Income_Category_Less.than..40K <dbl>,
## #   Income_Category_Unknown <dbl>, Card_Category_Gold <dbl>,
## #   Card_Category_Platinum <dbl>, Card_Category_Silver <dbl>

Setelah itu, kita akan melakukan spliting data menjadi training dan testing sebesar 0.8

set.seed(2021)
split <- initial_split(df_transform, 0.8)

train <- training(split)
val <- testing(split)

dim(train)
## [1] 8101   33
dim(val)
## [1] 2026   33

Disini kita cek apakah data train kita targetnya sudah balanced

table(as.factor(train$Attrition_Flag)) %>% prop.table()
## 
## Attrited Customer Existing Customer 
##          0.159363          0.840637

Setelah kita cek sebelumnya, ternyata data target dari train itu belum balanced, maka dari itu kita akan lakukan balancing terhadapt data target menggunakan ovun.sample dan menggunakan method upsample dan downsample (both)

train_balanced <- ovun.sample(Attrition_Flag ~ ., data = train, method = "both", N = nrow(train))

table(train_balanced$data$Attrition_Flag) %>% prop.table()
## 
## Existing Customer Attrited Customer 
##         0.5009258         0.4990742

setelah kita rasa data sudah balanced, maka kita akan membuat funtion untuk merubah target menjadi angka

decode_target <- function(x) {
  return(ifelse(x == "Existing Customer", 0, 1))
}

setelah itu, kita akan memecah predictor dan target, dan merubahnya sesuai kebutuhan

train_x <- train_balanced$data %>% select(-Attrition_Flag) %>% as.matrix()
val_x <- val %>% select(-Attrition_Flag) %>% as.matrix()

train_x <- array_reshape(train_x, dim(train_x))
val_x <- array_reshape(val_x, dim(val_x))

train_y <- to_categorical(decode_target(train_balanced$data$Attrition_Flag))
val_y <- to_categorical(decode_target(val$Attrition_Flag))

head(train_y)
##      [,1] [,2]
## [1,]    1    0
## [2,]    1    0
## [3,]    1    0
## [4,]    1    0
## [5,]    1    0
## [6,]    1    0

Sekarang kita membuat arsitektur NN nya dengan menggunakan hidel_layer 2x, dan output_layer 1x. untuk seluruh hiden_layer kita menggunakan units 64 dan activation tanh

untuk optimizer kira menggunakan optimizer adam dengan learning rate default yaitu 0.001

n_input <- dim(train_x)[2]
n_output <- dim(train_y)[2]

model_base <- keras_model_sequential(name = "model_base") %>% 
  layer_dense(units = 64,
              input_shape = n_input,
              activation = "tanh",
              name = "layer_1") %>% 
  layer_dense(units = 64,
              activation = "tanh",
              name = "layer_2") %>% 
  layer_dense(units = n_output,
              activation = "sigmoid",
              name = "output")

model_base %>% 
  compile(loss = "binary_crossentropy",
          metrics = "accuracy",
          optimizer = optimizer_adam(learning_rate = 0.001))

model_base
## Model
## Model: "model_base"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## layer_1 (Dense)                     (None, 64)                      2112        
## ________________________________________________________________________________
## layer_2 (Dense)                     (None, 64)                      4160        
## ________________________________________________________________________________
## output (Dense)                      (None, 2)                       130         
## ================================================================================
## Total params: 6,402
## Trainable params: 6,402
## Non-trainable params: 0
## ________________________________________________________________________________

Untuk training model, kita memakai epoch 10 dan batch size sebesar 64

set.seed(2021)

history <- model_base %>% 
  fit(x = train_x,
      y = train_y,
      epoch = 10,
      batch_size = 64,
      validation_data = list(val_x, val_y))

plot(history)
## `geom_smooth()` using formula 'y ~ x'

Setelah model belajar, maka kita dapat melakukan prediksi menggunakan model tersebut.

pred_val <- predict(model_base, val_x) %>% k_argmax() %>% as.array()

Dari hasi prediksi, maka kita harus merubah hasil tersebut menjadi category seperti di awal. dan juga n menjalankan confusionMatrix untuk melihat accuracy, recall, dan precision

confusionMatrix(as.factor(pred_val), as.factor(decode_target(val$Attrition_Flag)), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1493   29
##          1  197  307
##                                           
##                Accuracy : 0.8885          
##                  95% CI : (0.8739, 0.9018)
##     No Information Rate : 0.8342          
##     P-Value [Acc > NIR] : 3.208e-12       
##                                           
##                   Kappa : 0.6641          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9137          
##             Specificity : 0.8834          
##          Pos Pred Value : 0.6091          
##          Neg Pred Value : 0.9809          
##              Prevalence : 0.1658          
##          Detection Rate : 0.1515          
##    Detection Prevalence : 0.2488          
##       Balanced Accuracy : 0.8986          
##                                           
##        'Positive' Class : 1               
## 

Dari hasil yang kita dapatkan, model tersebut menghasilkan accuracy yang cukup bagus di atas 85% Dan model ini juga masih dapat di improve kedepannya.