Case yang kita ambil disini ialah seorang manager ingin mengetahui apakah customer nya akan pindah ke tempat lain, atau tetap stay. Disini saya menggunakan data dari link https://www.kaggle.com/sakshigoyal7/credit-card-customers
library(dplyr)
library(keras)
library(caret)
library(recipes)
library(ROSE)
library(rsample)
library(tensorflow)
reticulate::use_python(python = "C:/Users/LENOVO/anaconda3/envs/r-tensorflow-gpu/python.exe",required=TRUE)
Pertama-tama kita akan membaca file yang berbentuk csv
data <- read.csv("dataset/BankChurners.csv")
glimpse(data)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM <int> ~
## $ Attrition_Flag <chr> ~
## $ Customer_Age <int> ~
## $ Gender <chr> ~
## $ Dependent_count <int> ~
## $ Education_Level <chr> ~
## $ Marital_Status <chr> ~
## $ Income_Category <chr> ~
## $ Card_Category <chr> ~
## $ Months_on_book <int> ~
## $ Total_Relationship_Count <int> ~
## $ Months_Inactive_12_mon <int> ~
## $ Contacts_Count_12_mon <int> ~
## $ Credit_Limit <dbl> ~
## $ Total_Revolving_Bal <int> ~
## $ Avg_Open_To_Buy <dbl> ~
## $ Total_Amt_Chng_Q4_Q1 <dbl> ~
## $ Total_Trans_Amt <int> ~
## $ Total_Trans_Ct <int> ~
## $ Total_Ct_Chng_Q4_Q1 <dbl> ~
## $ Avg_Utilization_Ratio <dbl> ~
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> ~
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> ~
Disini kita membuang column yang tidak kita butuhkan
df <- data %>%
select(-c(Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2,
CLIENTNUM))
glimpse(df)
## Rows: 10,127
## Columns: 20
## $ Attrition_Flag <chr> "Existing Customer", "Existing Customer", "Ex~
## $ Customer_Age <int> 45, 49, 51, 40, 40, 44, 51, 32, 37, 48, 42, 6~
## $ Gender <chr> "M", "F", "M", "F", "M", "M", "M", "M", "M", ~
## $ Dependent_count <int> 3, 5, 3, 4, 3, 2, 4, 0, 3, 2, 5, 1, 1, 3, 2, ~
## $ Education_Level <chr> "High School", "Graduate", "Graduate", "High ~
## $ Marital_Status <chr> "Married", "Single", "Married", "Unknown", "M~
## $ Income_Category <chr> "$60K - $80K", "Less than $40K", "$80K - $120~
## $ Card_Category <chr> "Blue", "Blue", "Blue", "Blue", "Blue", "Blue~
## $ Months_on_book <int> 39, 44, 36, 34, 21, 36, 46, 27, 36, 36, 31, 5~
## $ Total_Relationship_Count <int> 5, 6, 4, 3, 5, 3, 6, 2, 5, 6, 5, 6, 3, 5, 5, ~
## $ Months_Inactive_12_mon <int> 1, 1, 1, 4, 1, 1, 1, 2, 2, 3, 3, 2, 6, 1, 2, ~
## $ Contacts_Count_12_mon <int> 3, 2, 0, 1, 0, 2, 3, 2, 0, 3, 2, 3, 0, 3, 2, ~
## $ Credit_Limit <dbl> 12691.0, 8256.0, 3418.0, 3313.0, 4716.0, 4010~
## $ Total_Revolving_Bal <int> 777, 864, 0, 2517, 0, 1247, 2264, 1396, 2517,~
## $ Avg_Open_To_Buy <dbl> 11914.0, 7392.0, 3418.0, 796.0, 4716.0, 2763.~
## $ Total_Amt_Chng_Q4_Q1 <dbl> 1.335, 1.541, 2.594, 1.405, 2.175, 1.376, 1.9~
## $ Total_Trans_Amt <int> 1144, 1291, 1887, 1171, 816, 1088, 1330, 1538~
## $ Total_Trans_Ct <int> 42, 33, 20, 20, 28, 24, 31, 36, 24, 32, 42, 2~
## $ Total_Ct_Chng_Q4_Q1 <dbl> 1.625, 3.714, 2.333, 2.333, 2.500, 0.846, 0.7~
## $ Avg_Utilization_Ratio <dbl> 0.061, 0.105, 0.000, 0.760, 0.000, 0.311, 0.0~
Di data tersebut, ternyata tidak ada data yang NA
anyNA(df)
## [1] FALSE
Disini kita menyesuaikan tipe columnya yang belum sesuai
df_clean <- df %>%
mutate(Gender = as.factor(Gender),
Education_Level = as.factor(Education_Level),
Marital_Status = as.factor(Marital_Status),
Income_Category = as.factor(Income_Category),
Card_Category = as.factor(Card_Category),
Attrition_Flag = as.factor(Attrition_Flag)
)
str(df_clean)
## 'data.frame': 10127 obs. of 20 variables:
## $ Attrition_Flag : Factor w/ 2 levels "Attrited Customer",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 2 2 2 2 ...
## $ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
## $ Marital_Status : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
## $ Income_Category : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
## $ Card_Category : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
## $ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 ...
## $ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 ...
## $ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
## $ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
Setelah tipe data sudah sesuai, kita akan membuat obj untuk scaling dan membuat dummy dengan menggunakan function recipe
rec_obj <- recipe(Attrition_Flag ~ ., data = df_clean) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep(data = df_clean)
rec_obj
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 19
##
## Training data contained 10127 data points and no missing data.
##
## Operations:
##
## Centering for Customer_Age, Dependent_count, ... [trained]
## Scaling for Customer_Age, Dependent_count, ... [trained]
## Dummy variables from Gender, Education_Level, Marital_Status, ... [trained]
setelah kita membuat object , kita akan mentransform datanya sesuai dengan transform dari obj recipe sebelumnya
df_transform <- bake(rec_obj, df_clean)
head(df_transform)
## # A tibble: 6 x 33
## Customer_Age Dependent_count Months_on_book Total_Relationsh~ Months_Inactive~
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.165 0.503 0.385 0.764 -1.33
## 2 0.334 2.04 1.01 1.41 -1.33
## 3 0.583 0.503 0.00896 0.121 -1.33
## 4 -0.789 1.27 -0.241 -0.523 1.64
## 5 -0.789 0.503 -1.87 0.764 -1.33
## 6 -0.290 -0.267 0.00896 -0.523 -1.33
## # ... with 28 more variables: Contacts_Count_12_mon <dbl>, Credit_Limit <dbl>,
## # Total_Revolving_Bal <dbl>, Avg_Open_To_Buy <dbl>,
## # Total_Amt_Chng_Q4_Q1 <dbl>, Total_Trans_Amt <dbl>, Total_Trans_Ct <dbl>,
## # Total_Ct_Chng_Q4_Q1 <dbl>, Avg_Utilization_Ratio <dbl>,
## # Attrition_Flag <fct>, Gender_M <dbl>, Education_Level_Doctorate <dbl>,
## # Education_Level_Graduate <dbl>, Education_Level_High.School <dbl>,
## # Education_Level_Post.Graduate <dbl>, Education_Level_Uneducated <dbl>,
## # Education_Level_Unknown <dbl>, Marital_Status_Married <dbl>,
## # Marital_Status_Single <dbl>, Marital_Status_Unknown <dbl>,
## # Income_Category_X.40K....60K <dbl>, Income_Category_X.60K....80K <dbl>,
## # Income_Category_X.80K....120K <dbl>, Income_Category_Less.than..40K <dbl>,
## # Income_Category_Unknown <dbl>, Card_Category_Gold <dbl>,
## # Card_Category_Platinum <dbl>, Card_Category_Silver <dbl>
Setelah itu, kita akan melakukan spliting data menjadi training dan testing sebesar 0.8
set.seed(2021)
split <- initial_split(df_transform, 0.8)
train <- training(split)
val <- testing(split)
dim(train)
## [1] 8101 33
dim(val)
## [1] 2026 33
Disini kita cek apakah data train kita targetnya sudah balanced
table(as.factor(train$Attrition_Flag)) %>% prop.table()
##
## Attrited Customer Existing Customer
## 0.159363 0.840637
Setelah kita cek sebelumnya, ternyata data target dari train itu belum balanced, maka dari itu kita akan lakukan balancing terhadapt data target menggunakan ovun.sample dan menggunakan method upsample dan downsample (both)
train_balanced <- ovun.sample(Attrition_Flag ~ ., data = train, method = "both", N = nrow(train))
table(train_balanced$data$Attrition_Flag) %>% prop.table()
##
## Existing Customer Attrited Customer
## 0.5009258 0.4990742
setelah kita rasa data sudah balanced, maka kita akan membuat funtion untuk merubah target menjadi angka
decode_target <- function(x) {
return(ifelse(x == "Existing Customer", 0, 1))
}
setelah itu, kita akan memecah predictor dan target, dan merubahnya sesuai kebutuhan
train_x <- train_balanced$data %>% select(-Attrition_Flag) %>% as.matrix()
val_x <- val %>% select(-Attrition_Flag) %>% as.matrix()
train_x <- array_reshape(train_x, dim(train_x))
val_x <- array_reshape(val_x, dim(val_x))
train_y <- to_categorical(decode_target(train_balanced$data$Attrition_Flag))
val_y <- to_categorical(decode_target(val$Attrition_Flag))
head(train_y)
## [,1] [,2]
## [1,] 1 0
## [2,] 1 0
## [3,] 1 0
## [4,] 1 0
## [5,] 1 0
## [6,] 1 0
Sekarang kita membuat arsitektur NN nya dengan menggunakan hidel_layer 2x, dan output_layer 1x. untuk seluruh hiden_layer kita menggunakan units 64 dan activation tanh
untuk optimizer kira menggunakan optimizer adam dengan learning rate default yaitu 0.001
n_input <- dim(train_x)[2]
n_output <- dim(train_y)[2]
model_base <- keras_model_sequential(name = "model_base") %>%
layer_dense(units = 64,
input_shape = n_input,
activation = "tanh",
name = "layer_1") %>%
layer_dense(units = 64,
activation = "tanh",
name = "layer_2") %>%
layer_dense(units = n_output,
activation = "sigmoid",
name = "output")
model_base %>%
compile(loss = "binary_crossentropy",
metrics = "accuracy",
optimizer = optimizer_adam(learning_rate = 0.001))
model_base
## Model
## Model: "model_base"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## layer_1 (Dense) (None, 64) 2112
## ________________________________________________________________________________
## layer_2 (Dense) (None, 64) 4160
## ________________________________________________________________________________
## output (Dense) (None, 2) 130
## ================================================================================
## Total params: 6,402
## Trainable params: 6,402
## Non-trainable params: 0
## ________________________________________________________________________________
Untuk training model, kita memakai epoch 10 dan batch size sebesar 64
set.seed(2021)
history <- model_base %>%
fit(x = train_x,
y = train_y,
epoch = 10,
batch_size = 64,
validation_data = list(val_x, val_y))
plot(history)
## `geom_smooth()` using formula 'y ~ x'
Setelah model belajar, maka kita dapat melakukan prediksi menggunakan model tersebut.
pred_val <- predict(model_base, val_x) %>% k_argmax() %>% as.array()
Dari hasi prediksi, maka kita harus merubah hasil tersebut menjadi category seperti di awal. dan juga n menjalankan confusionMatrix untuk melihat accuracy, recall, dan precision
confusionMatrix(as.factor(pred_val), as.factor(decode_target(val$Attrition_Flag)), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1493 29
## 1 197 307
##
## Accuracy : 0.8885
## 95% CI : (0.8739, 0.9018)
## No Information Rate : 0.8342
## P-Value [Acc > NIR] : 3.208e-12
##
## Kappa : 0.6641
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9137
## Specificity : 0.8834
## Pos Pred Value : 0.6091
## Neg Pred Value : 0.9809
## Prevalence : 0.1658
## Detection Rate : 0.1515
## Detection Prevalence : 0.2488
## Balanced Accuracy : 0.8986
##
## 'Positive' Class : 1
##
Dari hasil yang kita dapatkan, model tersebut menghasilkan accuracy yang cukup bagus di atas 85% Dan model ini juga masih dapat di improve kedepannya.