Kami mendapat amanah dari sebuah klien kami perusahaan yang melayani telekomunikasi untuk menganalisa dari proses rekrutasi dan seleksi yang mereka telah lakukan, dan disusun algoritma proses seleksi mereka dan validasi atasnya Mereka memiliki data hasil assessment sebagai berikut
REKOMENDASI_DIVISI | REKOMENDASI_PIC_FA | OVERSEAS | SURAT_PERINGATAN | PSIKOTEST | TEST_ONLINE | TEST_PRAKTEK | KEBUGARAN | REKOMENDASI_ATASAN | KINERJA | JENIS_KELAMIN | LEVEL_PENDIDIKAN | HASIL |
---|---|---|---|---|---|---|---|---|---|---|---|---|
NO | NO | NO | TIDAK ADA SP | CUKUP | 57 | 60 | Sangat Kurang | DIREKOMENDASIKAN | C3 | Laki-laki | SMK | TIDAK LULUS |
NO | NO | NO | TIDAK ADA SP | CUKUP | 67 | 59 | Sangat Kurang | DIREKOMENDASIKAN | C3 | Laki-laki | SMK | TIDAK LULUS |
NO | NO | NO | TIDAK ADA SP | BAIK | 90 | 82 | Sangat Kurang | DIREKOMENDASIKAN | C3 | Laki-laki | SMK | TIDAK LULUS |
NO | NO | NO | TIDAK ADA SP | CUKUP | 73 | 79 | Sangat Kurang | DIREKOMENDASIKAN | C4 | Laki-laki | SMK | TIDAK LULUS |
NO | NO | NO | TIDAK ADA SP | SANGAT BAIK | 67 | 85 | Sedang | DIREKOMENDASIKAN | C2 | Laki-laki | S1 | TIDAK LULUS |
NO | NO | NO | TIDAK ADA SP | BAIK | 83 | 74 | Sedang | DIREKOMENDASIKAN | C3 | Laki-laki | SMK | LULUS |
Adapun nama-nama Variabel sebagai prediktor adalah
colnames(full)
## [1] "REKOMENDASI_DIVISI" "REKOMENDASI_PIC_FA" "OVERSEAS"
## [4] "SURAT_PERINGATAN" "PSIKOTEST" "TEST_ONLINE"
## [7] "TEST_PRAKTEK" "KEBUGARAN" "REKOMENDASI_ATASAN"
## [10] "KINERJA" "JENIS_KELAMIN" "LEVEL_PENDIDIKAN"
## [13] "HASIL"
Jumlah Kasus dan Variabel
dim(full)
## [1] 553 13
Target Prediksi
summary(full$HASIL)
## LULUS TIDAK LULUS
## 273 280
N=Jumlah yang lulus dan tidak lulus relatif sama, 273 dan 280 responden sehingga dapat dikatakan sampelnya balance.
digunakan splitting 80% untuk data training dan 20% untuk data test (validasi)
## REKOMENDASI_DIVISI REKOMENDASI_PIC_FA OVERSEAS SURAT_PERINGATAN
## NO :497 NO :536 NO :538 ADA SP : 2
## YES: 56 YES: 17 YES: 15 TIDAK ADA SP:551
##
##
##
##
##
## PSIKOTEST TEST_ONLINE TEST_PRAKTEK
## BAIK : 81 Min. :13.0 Min. :19.00
## CUKUP :311 1st Qu.:60.0 1st Qu.:73.00
## KURANG :102 Median :70.0 Median :81.00
## SANGAT BAIK : 28 Mean :67.9 Mean :77.59
## SANGAT KURANG: 25 3rd Qu.:77.0 3rd Qu.:87.00
## TIDAK HADIR : 6 Max. :97.0 Max. :98.00
##
## KEBUGARAN REKOMENDASI_ATASAN KINERJA
## Baik : 40 DIREKOMENDASIKAN:533 C1\n: 36
## Baik Sekali : 8 Not Rec : 20 C2 :111
## Belum Test : 9 C3 :329
## Kurang :190 C4\n: 64
## Sangat Kurang :158 C5\n: 13
## Sedang :145
## baik sekali dan terlatih: 3
## JENIS_KELAMIN LEVEL_PENDIDIKAN HASIL
## Laki-laki:547 Diploma: 51 LULUS :273
## Perempuan: 6 S1 : 68 TIDAK LULUS:280
## S2 : 2
## SMK :432
##
##
##
## [1] 553 13
## LULUS TIDAK LULUS
## 273 280
## dummies-1.5.6 provided by Decision Patterns
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: broom
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(keras)
library(lime)
##
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
##
## explain
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following objects are masked from 'package:data.table':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
## Loading required package: tidyverse
## -- Attaching packages -------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v readr 1.1.1
## v tibble 1.4.1 v purrr 0.2.4
## v tidyr 0.7.2 v stringr 1.2.0
## v ggplot2 2.2.1 v forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------- tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x dplyr::between() masks data.table::between()
## x lubridate::date() masks base::date()
## x lime::explain() masks dplyr::explain()
## x dplyr::filter() masks stats::filter()
## x xts::first() masks dplyr::first(), data.table::first()
## x lubridate::hour() masks data.table::hour()
## x lubridate::intersect() masks base::intersect()
## x lubridate::isoweek() masks data.table::isoweek()
## x dplyr::lag() masks stats::lag()
## x xts::last() masks dplyr::last(), data.table::last()
## x lubridate::mday() masks data.table::mday()
## x lubridate::minute() masks data.table::minute()
## x lubridate::month() masks data.table::month()
## x lubridate::quarter() masks data.table::quarter()
## x lubridate::second() masks data.table::second()
## x lubridate::setdiff() masks base::setdiff()
## x purrr::transpose() masks data.table::transpose()
## x lubridate::union() masks base::union()
## x lubridate::wday() masks data.table::wday()
## x lubridate::week() masks data.table::week()
## x lubridate::yday() masks data.table::yday()
## x lubridate::year() masks data.table::year()
##
## Attaching package: 'tidyquant'
## The following object is masked from 'package:tibble':
##
## as_tibble
## The following object is masked from 'package:dplyr':
##
## as_tibble
library(rsample)
##
## Attaching package: 'rsample'
## The following object is masked from 'package:tidyr':
##
## fill
library(recipes)
library(yardstick)
##
## Attaching package: 'yardstick'
## The following object is masked from 'package:readr':
##
## spec
library(corrr)
## create your model,and add layers
model_keras <- keras_model_sequential()
model_keras %>%
# First hidden layer
layer_dense(
units = 16,
kernel_initializer = "uniform",
activation = "relu",
input_shape = ncol(X_train)) %>%
# Dropout to prevent overfitting
layer_dropout(rate = 0.1) %>%
# Second hidden layer
layer_dense(
units = 16,
kernel_initializer = "uniform",
activation = "relu") %>%
# Dropout to prevent overfitting
layer_dropout(rate = 0.1) %>%
# Output layer
layer_dense(
units = 1,
kernel_initializer = "uniform",
activation = "sigmoid") %>%
# Compile ANN
compile(
optimizer = 'adam',
loss = 'binary_crossentropy',
metrics = 'accuracy'
)
# Fit the keras model to the training data
history <- fit(
object = model_keras,
x = as.matrix(X_train),
y = Y_train_vec,
batch_size = 50,
epochs = 35,
validation_split = 0.30,
verbose = 0
)
plot(history)
library(ggplot2)
library('WVPlots')
d<-data.frame(history$metrics)
knitr::kable(head(d))
val_loss | val_acc | loss | acc |
---|---|---|---|
0.6969798 | 0.4360902 | 0.6915137 | 0.5210356 |
0.7005508 | 0.4360902 | 0.6887956 | 0.5242718 |
0.7030222 | 0.4360902 | 0.6878172 | 0.5242718 |
0.7042253 | 0.4360902 | 0.6859085 | 0.5242718 |
0.7017176 | 0.4360902 | 0.6882849 | 0.5242718 |
0.6979152 | 0.4360902 | 0.6858975 | 0.5242718 |
plt <- plot_Keras_fit_trajectory(d,title = "model performance by epoch, dataset, and measure")
suppressWarnings(print(plt)) # too few points for loess
library(tibble)
# Predicted Class
yhat_keras_class_vec <- predict_classes(object = model_keras, x = as.matrix(X_test)) %>%
as.vector()
# Predicted Class Probability
yhat_keras_prob_vec <- predict_proba(object = model_keras, x = as.matrix(X_test)) %>%
as.vector()
# Format test data and predictions for yardstick metrics
estimates_keras_tbl <- tibble(
truth = as.factor(Y_test_vec),
estimate = as.factor(yhat_keras_class_vec),
class_prob = yhat_keras_prob_vec
)
#Confusion Matrix
ConfMatriks<-estimates_keras_tbl %>% conf_mat(truth, estimate)
print(ConfMatriks)
## Truth
## Prediction 0 1
## 0 53 13
## 1 0 45
#Akurasi
Akurasi<-estimates_keras_tbl %>% metrics(truth, estimate)
print(Akurasi)
## # A tibble: 1 x 1
## accuracy
## <dbl>
## 1 0.883
#AUC -ROC
ROC<-estimates_keras_tbl %>% roc_auc(truth, class_prob)
print(ROC)
## [1] 0.9489265
# F1-Statistic
estimates_keras_tbl %>% f_meas(truth, estimate, beta = 1)
## [1] 0.8907563
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:PerformanceAnalytics':
##
## textplot
## The following object is masked from 'package:stats':
##
## lowess
pred <- prediction(yhat_keras_prob_vec,Y_test_vec)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# I know, the following code is bizarre. Just go with it.
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
roc.data <- data.frame(fpr=unlist(perf@x.values),
tpr=unlist(perf@y.values),
model="XGB")
ggplot(roc.data, aes(x=fpr, ymin=0, ymax=tpr)) +
geom_ribbon(alpha=0.2) +
geom_line(aes(y=tpr),colour="red") +
geom_abline(intercept = 0, slope = 1, colour="blue") +
ggtitle(paste0("ROC Curve w/ AUC=", auc))
library(corrr)
data$HASIL<-as.numeric(data$HASIL)
corrr_analysis <- data %>%
mutate(HASIL = data$HASIL ) %>%
correlate() %>%
focus(HASIL) %>%
rename(feature = rowname) %>%
arrange(abs(HASIL)) %>%
mutate(feature = as_factor(feature))
corrr_analysis
## # A tibble: 36 x 2
## feature HASIL
## <fctr> <dbl>
## 1 LEVEL_PENDIDIKANS2 -0.000763
## 2 JENIS_KELAMINLaki-laki 0.00133
## 3 JENIS_KELAMINPerempuan -0.00133
## 4 KEBUGARANBaik Sekali -0.00153
## 5 PSIKOTESTSANGAT BAIK 0.0136
## 6 KEBUGARANbaik sekali dan terlatih 0.0237
## 7 "KINERJAC4\n" 0.0293
## 8 REKOMENDASI_PIC_FANO 0.0337
## 9 REKOMENDASI_PIC_FAYES -0.0337
## 10 LEVEL_PENDIDIKANS1 -0.0378
## # ... with 26 more rows
# Correlation visualization
corrr_analysis %>%
ggplot(aes(x = HASIL, y = fct_reorder(feature, desc(HASIL)))) +
geom_point() +
# Positive Correlations - Contribute to churn
geom_segment(aes(xend = 0, yend = feature),
color = palette_light()[[2]],
data = corrr_analysis %>% filter(HASIL > 0)) +
geom_point(color = palette_light()[[2]],
data = corrr_analysis %>% filter(HASIL > 0)) +
# Negative Correlations - Prevent churn
geom_segment(aes(xend = 0, yend = feature),
color = palette_light()[[1]],
data = corrr_analysis %>% filter(HASIL < 0)) +
geom_point(color = palette_light()[[1]],
data = corrr_analysis %>% filter(HASIL < 0)) +
# Vertical lines
geom_vline(xintercept = 0, color = palette_light()[[5]], size = 1, linetype = 2) +
geom_vline(xintercept = -0.25, color = palette_light()[[5]], size = 1, linetype = 2) +
geom_vline(xintercept = 0.25, color = palette_light()[[5]], size = 1, linetype = 2) +
# Aesthetics
theme_tq() +
labs(title = "Churn Correlation Analysis",
subtitle = "Positive Correlations (contribute to HASIL), Negative Correlations (prevent HASIL)",
y = "Feature Importance")