Pada kali ini, saya ingin memprediksi apakah client sudah berlangganan deposito berjangka. disini kita ambil dataset dari https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
Disini kita menggunakan model keras RNN
library(dplyr)
library(keras)
library(caret)
library(recipes)
library(ROSE)
library(rsample)
library(tensorflow)
reticulate::use_python(python = "C:/Users/LENOVO/anaconda3/envs/r-tensorflow-gpu/python.exe",required=TRUE)
Pertama-tama kita akan membaca file yang berbentuk csv dan juga menggunakan seperator “;”
data <- read.csv("dataset/bank-full.csv", sep = ";")
glimpse(data)
## Rows: 45,211
## Columns: 17
## $ age <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, ~
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "~
## $ marital <chr> "married", "single", "married", "married", "single", "marrie~
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", ~
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",~
## $ balance <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71~
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"~
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"~
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn~
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ~
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may~
## $ duration <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,~
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ~
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn~
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
Di data tersebut, ternyata tidak ada data yang NA
anyNA(data)
## [1] FALSE
Disini kita menyesuaikan tipe columnya yang belum sesuai
df <- data %>%
mutate(job = as.factor(job),
marital = as.factor(job),
education = as.factor(education),
default = as.factor(default),
housing = as.factor(housing),
loan = as.factor(loan),
contact = as.factor(contact),
month = as.factor(month),
poutcome = as.factor(poutcome))
head(df)
## age job marital education default balance housing loan contact
## 1 58 management management tertiary no 2143 yes no unknown
## 2 44 technician technician secondary no 29 yes no unknown
## 3 33 entrepreneur entrepreneur secondary no 2 yes yes unknown
## 4 47 blue-collar blue-collar unknown no 1506 yes no unknown
## 5 33 unknown unknown unknown no 1 no no unknown
## 6 35 management management tertiary no 231 yes no unknown
## day month duration campaign pdays previous poutcome y
## 1 5 may 261 1 -1 0 unknown no
## 2 5 may 151 1 -1 0 unknown no
## 3 5 may 76 1 -1 0 unknown no
## 4 5 may 92 1 -1 0 unknown no
## 5 5 may 198 1 -1 0 unknown no
## 6 5 may 139 1 -1 0 unknown no
Setelah tipe data sudah sesuai, kita akan membuat obj untuk scaling dan membuat dummy dengan menggunakan function recipe
rec_obj <- recipe(y ~ ., data = df) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep(df)
setelah kita membuat object , kita akan mentransform datanya sesuai dengan transform dari obj recipe sebelumnya
df_transform <- bake(rec_obj, df)
head(df_transform)
## # A tibble: 6 x 52
## age balance day duration campaign pdays previous y job_blue.collar
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 1.61 0.256 -1.30 0.0110 -0.569 -0.411 -0.252 no 0
## 2 0.289 -0.438 -1.30 -0.416 -0.569 -0.411 -0.252 no 0
## 3 -0.747 -0.447 -1.30 -0.707 -0.569 -0.411 -0.252 no 0
## 4 0.571 0.0472 -1.30 -0.645 -0.569 -0.411 -0.252 no 1
## 5 -0.747 -0.447 -1.30 -0.234 -0.569 -0.411 -0.252 no 0
## 6 -0.559 -0.372 -1.30 -0.463 -0.569 -0.411 -0.252 no 0
## # ... with 43 more variables: job_entrepreneur <dbl>, job_housemaid <dbl>,
## # job_management <dbl>, job_retired <dbl>, job_self.employed <dbl>,
## # job_services <dbl>, job_student <dbl>, job_technician <dbl>,
## # job_unemployed <dbl>, job_unknown <dbl>, marital_blue.collar <dbl>,
## # marital_entrepreneur <dbl>, marital_housemaid <dbl>,
## # marital_management <dbl>, marital_retired <dbl>,
## # marital_self.employed <dbl>, marital_services <dbl>, marital_student <dbl>,
## # marital_technician <dbl>, marital_unemployed <dbl>, marital_unknown <dbl>,
## # education_secondary <dbl>, education_tertiary <dbl>,
## # education_unknown <dbl>, default_yes <dbl>, housing_yes <dbl>,
## # loan_yes <dbl>, contact_telephone <dbl>, contact_unknown <dbl>,
## # month_aug <dbl>, month_dec <dbl>, month_feb <dbl>, month_jan <dbl>,
## # month_jul <dbl>, month_jun <dbl>, month_mar <dbl>, month_may <dbl>,
## # month_nov <dbl>, month_oct <dbl>, month_sep <dbl>, poutcome_other <dbl>,
## # poutcome_success <dbl>, poutcome_unknown <dbl>
Setelah itu, kita akan melakukan spliting data menjadi training dan testing sebesar 0.8
set.seed(2021)
split <- initial_split(data = df_transform, prop = 0.8)
train <- training(split)
test <- testing(split)
Disini kita cek apakah data train kita targetnya sudah balanced
table(as.factor(train$y)) %>% prop.table()
##
## no yes
## 0.8822716 0.1177284
Setelah kita cek sebelumnya, ternyata data target dari train itu belum balanced, maka dari itu kita akan lakukan balancing terhadapt data target menggunakan ovun.sample dan menggunakan method upsample dan downsample (both)
train_balanced <- ovun.sample(y ~ ., data = train, method = "both", N = nrow(train))
table(as.factor(train_balanced$data$y)) %>% prop.table()
##
## no yes
## 0.5015483 0.4984517
setelah kita rasa data sudah balanced, maka kita akan membuat funtion untuk merubah target menjadi angka dan merubah kembali
encode_label <- function(x){
ifelse(x == "no", 1, 0)
}
decode_label <- function(x){
ifelse(x == 1, "no", "yes")
}
setelah itu, kita akan memecah predictor dan target, dan merubahnya sesuai kebutuhan
train_x <- train_balanced$data %>% select(-y) %>% as.matrix()
test_x <- test %>% select(-y) %>% as.matrix()
train_x <- array_reshape(train_x, dim(train_x))
test_x <- array_reshape(test_x, dim(test_x))
train_y <- sapply(train_balanced$data$y, encode_label)
test_y <- sapply(test$y, encode_label)
train_y <- to_categorical(train_y)
test_y <- to_categorical(test_y)
n_input <- ncol(train_x)
n_output <- ncol(train_y)
Sekarang kita membuat arsitektur NN nya dengan menggunakan hidel_layer 3x, dan output_layer 1x. untuk seluruh hiden_layer kita menggunakan units 64 dan activation tanh
untuk optimizer kira menggunakan optimizer adam dengan learning rate default yaitu 0.0001
model_base <- keras_model_sequential(name = "model_base") %>%
layer_dense(units = 64,
input_shape = n_input,
activation = "tanh",
name = "layer1") %>%
layer_dense(units = 64,
activation = "tanh",
name = "layer2") %>%
layer_dense(units = 64,
activation = "tanh",
name = "layer3") %>%
layer_dense(units = n_output,
activation = "sigmoid",
name = "output")
model_base %>%
compile(loss = "categorical_crossentropy",
metrics = "accuracy",
optimizer = optimizer_adam(learning_rate = 0.0001))
Untuk training model, kita memakai epoch 15 dan batch size sebesar 64
set.seed(2021)
history_base <- model_base %>%
fit(x = train_x,
y= train_y,
epoch = 10,
batch_size = 64,
validation_data = list(test_x, test_y))
plot(history_base)
## `geom_smooth()` using formula 'y ~ x'
Setelah model belajar, maka kita dapat melakukan prediksi menggunakan model tersebut.
pred_test <- predict(model_base, test_x) %>% k_argmax() %>% as.array()
dari hasi prediksi, maka kita harus merubah hasil tersebut menjadi category seperti di awal.
pred_test <- sapply(pred_test, decode_label)
setelah dirubah maka kita akan menjalankan confusionMatrix untuk melihat accuracy, recall, dan precision
confusionMatrix(as.factor(pred_test), as.factor(test$y), positive = "no")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6531 139
## yes 1481 892
##
## Accuracy : 0.8209
## 95% CI : (0.8128, 0.8287)
## No Information Rate : 0.886
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4341
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8152
## Specificity : 0.8652
## Pos Pred Value : 0.9792
## Neg Pred Value : 0.3759
## Prevalence : 0.8860
## Detection Rate : 0.7222
## Detection Prevalence : 0.7376
## Balanced Accuracy : 0.8402
##
## 'Positive' Class : no
##
Dari hasil yang kita dapatkan, model tersebut menghasilkan accuracy yang cukup bagus di atas 80% Dan model ini juga masih dapat di improve kedepannya.