library(unbalanced)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
## x purrr::when()       masks foreach::when()
library(readxl)
library(rpart)
library(rpart.plot)
library(mlr3verse)
## Loading required package: mlr3
## 
## Attaching package: 'mlr3'
## The following objects are masked from 'package:mlr':
## 
##     benchmark, resample
library(mlr3extralearners)
## 
## Attaching package: 'mlr3extralearners'
## The following objects are masked from 'package:mlr3':
## 
##     lrn, lrns
library(ggpubr)
library(RWeka)
dataTelp <- read_excel("Tugas_STA581.xlsx")

dataTelp$Y <- as.factor(dataTelp$Y)
dataTelp <- dataTelp %>% select(-ID)
levels(dataTelp$Y) <- c('0', '1') # relabel the factor levels
summary(dataTelp$Y)
##     0     1 
## 40664  9336

Oversampling

# plotting number of samples in each class - original dataset
options(scipen=10000) # remove scientific notation when viewing plot

ggplot(data = dataTelp, aes(fill = Y)) +
  geom_bar(aes(x = Y))+
  ggtitle("Number of samples in each class", subtitle = "Original dataset")+
  xlab("")+
  ylab("Samples")+
  scale_y_continuous(expand = c(0,0))+
  scale_x_discrete(expand = c(0,0))+
  theme(legend.position = "none", 
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())

As the plot shows, the number of observations in response classes 0 and 1 differs substantially.
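
The imbalance can also be quantified directly (a minimal sketch using base R; the ratio follows from the class counts in the summary above):

# Class proportions and the majority-to-minority ratio
prop.table(table(dataTelp$Y))
unname(table(dataTelp$Y)[1] / table(dataTelp$Y)[2]) # roughly 4.4 majority cases per minority case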

predictor_variables <- dataTelp[,-12] # Select everything except response
response_variable <- dataTelp$Y   # Only select response variable
# run oversampled function on data
oversampled_data <- ubBalance(predictor_variables,
                              response_variable,
                              type = 'ubOver',       # random oversampling
                              k = 0,                 # k = 0: replicate the minority class until a 50:50 balance
                              verbose = TRUE)
## Proportion of positives after ubOver : 50 % of 81328 observations
oversampled_combined <- cbind(oversampled_data$X,    # combine output
                              oversampled_data$Y)
names(oversampled_combined)[names(oversampled_combined) == "oversampled_data$Y"] <- "Class" # change name to class
levels(oversampled_combined$Class) <- c('0', '1')
summary(oversampled_combined$Class)
##     0     1 
## 40664 40664

The number of observations in class 1 has increased until it matches class 0.
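
The same balancing could be sketched with dplyr alone, by resampling the minority class with replacement up to the majority size. This is illustrative only (ubBalance remains the implementation used here), and the *_alt names are introduced just for this sketch:

set.seed(123)
minority_alt <- dataTelp %>% filter(Y == "1")
majority_alt <- dataTelp %>% filter(Y == "0")
oversampled_alt <- bind_rows(majority_alt,
                             slice_sample(minority_alt, n = nrow(majority_alt), replace = TRUE))
table(oversampled_alt$Y) # both classes at 40664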

sum(summary(oversampled_combined$Class))
## [1] 81328
# plot number of cases in oversampled dataset
ggplot(data = oversampled_combined, aes(fill = Class))+
  geom_bar(aes(x = Class))+
  ggtitle("Number of samples in each class after oversampling", subtitle="Total samples: 81328 ")+
  xlab("")+
  ylab("Samples")+
  scale_y_continuous(expand = c(0,0))+
  scale_x_discrete(expand = c(0,0))+
  theme(legend.position = "none", 
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())

The two classes are now balanced.

oversampled_combined$Class <- as.factor(oversampled_combined$Class)
taskTelp5 = TaskClassif$new(id="telp",
                            backend = oversampled_combined,
                            target = "Class",
                            positive ="1")
learner_logreg   <- lrn("classif.log_reg", predict_type = "prob")
learner_knn      <- lrn("classif.kknn", predict_type = "prob", k = 10, kernel = "rectangular")
learner_tree     <- lrn("classif.rpart", cp = 0.001, predict_type = "prob")
learner_bagging  <- lrn("classif.ranger", id = "bagging clf", mtry = 11,
                        predict_type = "prob", importance = "impurity") # mtry = 11 uses all 11 predictors per split, i.e. bagging
learner_rf       <- lrn("classif.ranger", predict_type = "prob", importance = "impurity")
learner_adaboost <- lrn("classif.AdaBoostM1", predict_type = "prob")
set.seed(123)
resampleTelp5 = rsmp("holdout", ratio = 0.8)
resampleTelp5$instantiate(task = taskTelp5)
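
Once instantiated, the 80/20 split is fixed, so every learner below is evaluated on the same train and test rows. The split sizes can be inspected like this (a sketch):

length(resampleTelp5$train_set(1)) # about 80% of the 81328 rows
length(resampleTelp5$test_set(1))  # the 16266 held-out test rows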

Logistic Regression

learner_logreg$train(task = taskTelp5)
summary(learner_logreg$model)
## 
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data, 
##     model = FALSE)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9839  -1.0094  -0.0838   1.0437   2.6942  
## 
## Coefficients:
##                  Estimate    Std. Error z value            Pr(>|z|)    
## (Intercept) -0.4891173353  0.0481076495 -10.167 <0.0000000000000002 ***
## X1           0.0007175279  0.0000571410  12.557 <0.0000000000000002 ***
## X10         -0.1334146199  0.0033335543 -40.022 <0.0000000000000002 ***
## X11         -0.0680193556  0.0060078377 -11.322 <0.0000000000000002 ***
## X2           0.0000188083  0.0000004283  43.913 <0.0000000000000002 ***
## X3          -0.0003906647  0.0002285885  -1.709              0.0874 .  
## X4           0.9741517804  0.0586603536  16.607 <0.0000000000000002 ***
## X5          -0.4255059357  0.0489185794  -8.698 <0.0000000000000002 ***
## X6           0.0000198160  0.0000012310  16.098 <0.0000000000000002 ***
## X7           0.0299224072  0.0346901878   0.863              0.3884    
## X8           0.0000056551  0.0000040070   1.411              0.1582    
## X9           0.0000237748  0.0000003998  59.469 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 112745  on 81327  degrees of freedom
## Residual deviance:  98357  on 81316  degrees of freedom
## AIC: 98381
## 
## Number of Fisher Scoring iterations: 4
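
Because the fitted model is a plain stats::glm object, the coefficients above (on the log-odds scale) can also be read on the odds scale (a sketch):

exp(coef(learner_logreg$model)) # odds ratio per one-unit increase in each predictor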
train_test_telp_log = resample(task = taskTelp5,
                               learner = learner_logreg,
                               resampling = resampleTelp5,
                               store_models = TRUE)
## INFO  [01:36:05.893] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_log$prediction())
head(prediksi_test)
##    row_ids truth response     prob.1    prob.0
## 1:       4     1        1 0.52138513 0.4786149
## 2:       9     1        1 0.52138513 0.4786149
## 3:      11     0        0 0.37244614 0.6275539
## 4:      13     1        0 0.06888776 0.9311122
## 5:      14     1        0 0.06888776 0.9311122
## 6:      19     0        0 0.42544388 0.5745561
train_test_telp_log$prediction()$confusion
##         truth
## response    1    0
##        1 5429 2336
##        0 2641 5860
accreglog <- train_test_telp_log$aggregate(list(msr("classif.acc"),
                                                msr("classif.specificity"),
                                                msr("classif.sensitivity")
))
accreglog
##         classif.acc classif.specificity classif.sensitivity 
##           0.6940243           0.7149829           0.6727385
autoplot(train_test_telp_log, type = "roc")
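
As a sanity check, the three aggregate measures can be reproduced by hand from the confusion matrix above, with class 1 as the positive class (a minimal sketch):

cm <- train_test_telp_log$prediction()$confusion
TP <- cm["1", "1"]; FP <- cm["1", "0"] # predicted 1: true positives, false positives
FN <- cm["0", "1"]; TN <- cm["0", "0"] # predicted 0: false negatives, true negatives
(TP + TN) / sum(cm) # accuracy,    ~0.694
TN / (TN + FP)      # specificity, ~0.715
TP / (TP + FN)      # sensitivity, ~0.673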

KNN

set.seed(123)
train_test_telp_knn = resample(task = taskTelp5,
                               learner = learner_knn,
                               resampling = resampleTelp5,
                               store_models = TRUE)
## INFO  [01:36:07.435] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_knn$prediction())
head(prediksi_test)
##    row_ids truth response prob.1 prob.0
## 1:       4     1        1    0.7    0.3
## 2:       9     1        1    0.7    0.3
## 3:      11     0        0    0.0    1.0
## 4:      13     1        0    0.0    1.0
## 5:      14     1        0    0.0    1.0
## 6:      19     0        0    0.4    0.6
train_test_telp_knn$prediction()$confusion
##         truth
## response    1    0
##        1 6397 2767
##        0 1673 5429
accknn <- train_test_telp_knn$aggregate(list(msr("classif.acc"),
                                             msr("classif.specificity"),
                                             msr("classif.sensitivity")))
accknn
##         classif.acc classif.specificity classif.sensitivity 
##           0.7270380           0.6623963           0.7926890
autoplot(train_test_telp_knn, type = "roc")
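
The choice k = 10 with a rectangular kernel was fixed in advance. Because the holdout split is already instantiated, a few alternative neighbourhood sizes can be compared on exactly the same split (a sketch; the k values and the k_try, learner_k, rr_k names are illustrative):

for (k_try in c(5, 10, 20)) {
  learner_k <- lrn("classif.kknn", predict_type = "prob", k = k_try, kernel = "rectangular")
  rr_k <- resample(task = taskTelp5, learner = learner_k, resampling = resampleTelp5)
  cat("k =", k_try, "accuracy =", rr_k$aggregate(msr("classif.acc")), "\n")
}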

Classification Tree

learner_tree$train(task = taskTelp5)
rpart.plot(learner_tree$model, roundint = FALSE, type = 5, tweak = 2)
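
The complexity parameter cp = 0.001 was set manually. Since the learner stores a plain rpart fit, rpart's internal cross-validated error per cp value can be listed, and the tree could be pruned at a larger cp if the error curve supports it (a sketch; the cp value in the comment is an example to be read off the table):

printcp(learner_tree$model) # xerror: cross-validated error at each cp value
# pruned_tree <- prune(learner_tree$model, cp = 0.01) # example cp, read off the table above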

set.seed(123)
train_test_telp_tree = resample(task = taskTelp5,
                                learner = learner_tree,
                                resampling = resampleTelp5,
                                store_models = TRUE
)
## INFO  [01:36:25.391] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_tree$prediction())
head(prediksi_test)
##    row_ids truth response    prob.1    prob.0
## 1:       4     1        0 0.3418675 0.6581325
## 2:       9     1        0 0.3418675 0.6581325
## 3:      11     0        0 0.2568124 0.7431876
## 4:      13     1        0 0.2568124 0.7431876
## 5:      14     1        0 0.2568124 0.7431876
## 6:      19     0        0 0.2568124 0.7431876
train_test_telp_tree$prediction()$confusion
##         truth
## response    1    0
##        1 5772 2616
##        0 2298 5580
acctree<- train_test_telp_tree$aggregate(list(msr("classif.acc"),
                                              msr("classif.specificity"),
                                              msr("classif.sensitivity")
))
acctree
##         classif.acc classif.specificity classif.sensitivity 
##           0.6978975           0.6808199           0.7152416
autoplot(train_test_telp_tree, type = "roc")

Bagging

set.seed(123)
train_test_bagging = resample(task = taskTelp5,
                              learner = learner_bagging,
                              resampling = resampleTelp5,
                              store_models = TRUE
)
## INFO  [01:36:27.020] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1) 
## Growing trees.. Progress: 9%. Estimated remaining time: 4 minutes, 58 seconds.
## Growing trees.. Progress: 19%. Estimated remaining time: 4 minutes, 17 seconds.
## Growing trees.. Progress: 29%. Estimated remaining time: 3 minutes, 52 seconds.
## Growing trees.. Progress: 40%. Estimated remaining time: 3 minutes, 5 seconds.
## Growing trees.. Progress: 51%. Estimated remaining time: 2 minutes, 27 seconds.
## Growing trees.. Progress: 62%. Estimated remaining time: 1 minute, 52 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 1 minute, 19 seconds.
## Growing trees.. Progress: 84%. Estimated remaining time: 46 seconds.
## Growing trees.. Progress: 96%. Estimated remaining time: 11 seconds.
train_test_bagging$prediction()$confusion
##         truth
## response    1    0
##        1 7856  864
##        0  214 7332
accbagging <- train_test_bagging$aggregate(list(msr("classif.acc"),
                                                msr("classif.specificity"),
                                                msr("classif.sensitivity")))
accbagging
##         classif.acc classif.specificity classif.sensitivity 
##           0.9337268           0.8945827           0.9734820
autoplot(train_test_bagging, type = "roc")
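
Because store_models = TRUE, the fitted ranger object is retained, and its out-of-bag error gives a second, internal estimate of the misclassification rate (a sketch):

train_test_bagging$learners[[1]]$model$prediction.error # ranger's OOB misclassification error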

Random Forest

learner_rf$train(task = taskTelp5)
## Growing trees.. Progress: 21%. Estimated remaining time: 1 minute, 55 seconds.
## Growing trees.. Progress: 48%. Estimated remaining time: 1 minute, 6 seconds.
## Growing trees.. Progress: 76%. Estimated remaining time: 30 seconds.
learner_rf$model$variable.importance
##       X1      X10      X11       X2       X3       X4       X5       X6 
## 2922.236 3656.146 2875.107 4533.043 2652.929 3301.327 3104.815 3445.650 
##       X7       X8       X9 
## 2879.199 3335.405 4629.780
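
The importance vector is easier to read as a sorted bar chart (a sketch using the ggplot2 already loaded; imp is an illustrative name):

imp <- learner_rf$model$variable.importance
ggplot(tibble(variable = names(imp), importance = imp),
       aes(x = reorder(variable, importance), y = importance)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Impurity importance")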
set.seed(123)
train_test_telp_forest = resample(task = taskTelp5,
                                  learner = learner_rf,
                                  resampling = resampleTelp5,
                                  store_models = TRUE
)
## INFO  [01:44:19.917] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1) 
## Growing trees.. Progress: 30%. Estimated remaining time: 1 minute, 10 seconds.
## Growing trees.. Progress: 64%. Estimated remaining time: 35 seconds.
## Growing trees.. Progress: 99%. Estimated remaining time: 0 seconds.
prediksi_test = as.data.table(train_test_telp_forest$prediction())
head(prediksi_test)
##    row_ids truth response    prob.1    prob.0
## 1:       4     1        1 0.8793238 0.1206762
## 2:       9     1        1 0.8793238 0.1206762
## 3:      11     0        0 0.1559683 0.8440317
## 4:      13     1        0 0.1316413 0.8683587
## 5:      14     1        0 0.1316413 0.8683587
## 6:      19     0        0 0.2312825 0.7687175
train_test_telp_forest$prediction()$confusion
##         truth
## response    1    0
##        1 7814  770
##        0  256 7426
accforest <- train_test_telp_forest$aggregate(list(msr("classif.acc"),
                                                   msr("classif.specificity"),
                                                   msr("classif.sensitivity")
))
accforest
##         classif.acc classif.specificity classif.sensitivity 
##           0.9369236           0.9060517           0.9682776
autoplot(train_test_telp_forest$prediction(), type = "roc")

AdaBoost

train_test_telp_adaboost = resample(task = taskTelp5,
                                    learner = learner_adaboost,
                                    resampling = resampleTelp5,
                                    store_models = TRUE)
## INFO  [01:46:51.609] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_adaboost$prediction())
head(prediksi_test)
##    row_ids truth response    prob.1    prob.0
## 1:       4     1        1 0.5588525 0.4411475
## 2:       9     1        1 0.5588525 0.4411475
## 3:      11     0        0 0.2319949 0.7680051
## 4:      13     1        0 0.2939067 0.7060933
## 5:      14     1        0 0.2939067 0.7060933
## 6:      19     0        0 0.2599529 0.7400471
train_test_telp_adaboost$prediction()$confusion
##         truth
## response    1    0
##        1 5074 2427
##        0 2996 5769
accadaboost <- train_test_telp_adaboost$aggregate(list(msr("classif.acc"),
                                                       msr("classif.specificity"),
                                                       msr("classif.sensitivity")
))
accadaboost
##         classif.acc classif.specificity classif.sensitivity 
##           0.6666052           0.7038799           0.6287485
autoplot(train_test_telp_adaboost, type = "roc")

Model Comparison

accall <- list(regresi.logistik  = accreglog,
               knn               = accknn,
               pohon.klasifikasi = acctree,
               random.forest     = accforest,
               bagging           = accbagging,
               adaboost          = accadaboost)
accall
## $regresi.logistik
##         classif.acc classif.specificity classif.sensitivity 
##           0.6940243           0.7149829           0.6727385 
## 
## $knn
##         classif.acc classif.specificity classif.sensitivity 
##           0.7270380           0.6623963           0.7926890 
## 
## $pohon.klasifikasi
##         classif.acc classif.specificity classif.sensitivity 
##           0.6978975           0.6808199           0.7152416 
## 
## $random.forest
##         classif.acc classif.specificity classif.sensitivity 
##           0.9369236           0.9060517           0.9682776 
## 
## $bagging
##         classif.acc classif.specificity classif.sensitivity 
##           0.9337268           0.8945827           0.9734820 
## 
## $adaboost
##         classif.acc classif.specificity classif.sensitivity 
##           0.6666052           0.7038799           0.6287485

Although accuracy is not especially high for several models, specificity and sensitivity are fairly balanced, so prediction on the minority class has improved.
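
For a side-by-side view, the list above can be collapsed into a single table (a sketch):

bind_rows(lapply(accall, as.list), .id = "model") # one row per model, one column per measure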

set.seed(123)
learner_telp <- list(learner_logreg,
                     learner_tree,
                     learner_rf,
                     learner_knn,
                     learner_bagging,
                     learner_adaboost)
set.seed(123)
design.telp <- benchmark_grid(tasks = taskTelp5,
                              learners = learner_telp,
                              resamplings = resampleTelp5 
)
set.seed(123)
bmr = benchmark(design.telp, store_models = TRUE)
## INFO  [01:46:59.693] [mlr3] Running benchmark with 6 resampling iterations 
## INFO  [01:46:59.731] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1) 
## Growing trees.. Progress: 23%. Estimated remaining time: 1 minute, 43 seconds.
## Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 13 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 34 seconds.
## INFO  [01:50:00.714] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1) 
## INFO  [01:50:04.878] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1) 
## INFO  [01:50:05.949] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1) 
## INFO  [01:50:28.110] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1) 
## Growing trees.. Progress: 11%. Estimated remaining time: 4 minutes, 5 seconds.
## Growing trees.. Progress: 21%. Estimated remaining time: 3 minutes, 50 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 3 minutes, 17 seconds.
## Growing trees.. Progress: 41%. Estimated remaining time: 2 minutes, 59 seconds.
## Growing trees.. Progress: 48%. Estimated remaining time: 2 minutes, 50 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 2 minutes, 30 seconds.
## Growing trees.. Progress: 65%. Estimated remaining time: 1 minute, 58 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 1 minute, 32 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 57 seconds.
## Growing trees.. Progress: 92%. Estimated remaining time: 27 seconds.
## INFO  [01:57:01.749] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1) 
## INFO  [01:57:13.540] [mlr3] Finished benchmark
bmr$aggregate(list(msr("classif.auc")))
##    nr      resample_result task_id         learner_id resampling_id iters
## 1:  1 <ResampleResult[22]>    telp    classif.log_reg       holdout     1
## 2:  2 <ResampleResult[22]>    telp      classif.rpart       holdout     1
## 3:  3 <ResampleResult[22]>    telp     classif.ranger       holdout     1
## 4:  4 <ResampleResult[22]>    telp       classif.kknn       holdout     1
## 5:  5 <ResampleResult[22]>    telp        bagging clf       holdout     1
## 6:  6 <ResampleResult[22]>    telp classif.AdaBoostM1       holdout     1
##    classif.auc
## 1:   0.7502483
## 2:   0.7393851
## 3:   0.9870566
## 4:   0.8074962
## 5:   0.9869867
## 6:   0.7321031

With oversampling, the highest AUC values are obtained by the random forest and bagging models.
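
The ROC curves behind these AUC values can be overlaid on a single plot directly from the benchmark result (a sketch; autoplot for a BenchmarkResult comes from mlr3viz, which mlr3verse loads):

autoplot(bmr, type = "roc") # one ROC curve per learner on the shared holdout test set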