library(unbalanced)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
## x purrr::when()       masks foreach::when()
library(readxl)
library(rpart)
library(rpart.plot)
library(mlr3verse)
## Loading required package: mlr3
## 
## Attaching package: 'mlr3'
## The following objects are masked from 'package:mlr':
## 
##     benchmark, resample
library(mlr3extralearners)
## 
## Attaching package: 'mlr3extralearners'
## The following objects are masked from 'package:mlr3':
## 
##     lrn, lrns
library(ggpubr)
library(RWeka)
dataTelp <- read_excel("Tugas_STA581.xlsx")

dataTelp$Y <- as.factor(dataTelp$Y)
dataTelp <- dataTelp %>% select(-ID)
levels(dataTelp$Y)<- c('0', '1') # names of factors
summary(dataTelp$Y)
##     0     1 
## 40664  9336

Undersampling

# plotting number of samples in each class - original dataset
options(scipen=10000) # remove scientific notation when viewing plot

ggplot(data = dataTelp, aes(fill = Y)) +
  geom_bar(aes(x = Y))+
  ggtitle("Number of samples in each class", subtitle = "Original dataset")+
  xlab("")+
  ylab("Samples")+
  scale_y_continuous(expand = c(0,0))+
  scale_x_discrete(expand = c(0,0))+
  theme(legend.position = "none", 
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())

Dapat dilihat pada plot, jumlah data pada peubah respon kelas 0 dan 1 signifikan perbedaannya.

predictor_variables <- dataTelp[,-12] # Select everything except response
response_variable <- dataTelp$Y   # Only select response variable
# Run undersampled function
undersampled_data <- ubBalance(predictor_variables, 
                               response_variable, 
                               type='ubUnder',         # Option for undersampling
                               verbose = TRUE)
## Proportion of positives after ubUnder : 50 % of 18672 observations
undersampled_combined <- cbind(undersampled_data$X,    # combine output
                               undersampled_data$Y)
names(undersampled_combined)[names(undersampled_combined) == "undersampled_data$Y"] <- "Class" # change name to class
levels(undersampled_combined$Class) <- c('0', '1')
summary(undersampled_combined$Class)
##    0    1 
## 9336 9336

Data pada kelas 0 akan dipilih kembali agar jumlahnya sama dengan data kelas 1.

sum(summary(undersampled_combined$Class))
## [1] 18672
# plot number of cases in undersampled dataset
ggplot(data = undersampled_combined, aes(fill = Class))+
  geom_bar(aes(x = Class))+
  ggtitle("Number of samples in each class after undersampling", 
          subtitle="Total samples: 18.672" )+
  xlab("")+
  ylab("Samples")+
  scale_y_continuous(expand = c(0,0))+
  scale_x_discrete(expand = c(0,0))+
  theme(legend.position = "none", 
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank())

Kedua kelas menjadi balanced.

undersampled_combined$Class <- as.factor(undersampled_combined$Class)
taskTelp2 = TaskClassif$new(id="telp",
                           backend = undersampled_combined,
                           target = "Class",
                           positive ="1")
learner_logreg <- lrn("classif.log_reg", predict_type="prob")
learner_knn <-  lrn("classif.kknn",predict_type="prob",k=10,kernel="rectangular")
learner_tree <-  lrn("classif.rpart", cp = 0.001, predict_type="prob")
learner_bagging  <-  lrn(id="bagging clf", "classif.ranger", mtry=11, predict_type="prob", importance="impurity")
learner_rf <-  lrn("classif.ranger",predict_type="prob",importance="impurity")
learner_adaboost <- lrn("classif.AdaBoostM1",predict_type="prob")
set.seed(123)
resampleTelp2 =rsmp("holdout", ratio = 0.8)
resampleTelp2$instantiate(task=taskTelp2)

#Regresi Logistik

learner_logreg$train(task = taskTelp2)
summary(learner_logreg$model)
## 
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data, 
##     model = FALSE)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6373  -1.0055  -0.1093   1.0378   2.7469  
## 
## Coefficients:
##                  Estimate    Std. Error z value             Pr(>|z|)    
## (Intercept) -0.3679576198  0.1028175258  -3.579             0.000345 ***
## X1           0.0007169010  0.0001239563   5.783  0.00000000731637318 ***
## X10         -0.1476707843  0.0070067506 -21.076 < 0.0000000000000002 ***
## X11         -0.0498576359  0.0126242227  -3.949  0.00007835949729243 ***
## X2           0.0000188650  0.0000009018  20.920 < 0.0000000000000002 ***
## X3           0.0001396877  0.0004836270   0.289             0.772709    
## X4           0.8271152983  0.1252242517   6.605  0.00000000003973221 ***
## X5          -0.4345964403  0.1037806180  -4.188  0.00002818632937296 ***
## X6           0.0000202774  0.0000025955   7.812  0.00000000000000561 ***
## X7           0.0367199726  0.0731367043   0.502             0.615616    
## X8           0.0000002060  0.0000087392   0.024             0.981198    
## X9           0.0000243731  0.0000008476  28.754 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25885  on 18671  degrees of freedom
## Residual deviance: 22441  on 18660  degrees of freedom
## AIC: 22465
## 
## Number of Fisher Scoring iterations: 4
train_test_telp_log = resample(task = taskTelp2,
                               learner = learner_logreg,
                               resampling = resampleTelp2,
                               store_models = TRUE)
## INFO  [02:07:51.365] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_log$prediction())
head(prediksi_test)
##    row_ids truth response    prob.1    prob.0
## 1:       8     1        0 0.3942380 0.6057620
## 2:      12     0        0 0.2971260 0.7028740
## 3:      23     1        1 0.6846174 0.3153826
## 4:      24     0        0 0.1964586 0.8035414
## 5:      34     1        1 0.5770908 0.4229092
## 6:      35     1        1 0.7672081 0.2327919
train_test_telp_log$prediction()$confusion
##         truth
## response    1    0
##        1 1248  546
##        0  595 1345
accreglog <- train_test_telp_log$aggregate(list(msr("classif.acc"),
                                                msr("classif.specificity"),
                                                msr("classif.sensitivity")
))
accreglog
##         classif.acc classif.specificity classif.sensitivity 
##           0.6944296           0.7112639           0.6771568
autoplot(train_test_telp_log, type = "roc")

KNN

set.seed(123)
train_test_telp_knn = resample(task = taskTelp2,
                               learner = learner_knn,
                               resampling = resampleTelp2,
                               store_models = TRUE)
## INFO  [02:07:52.676] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_knn$prediction())
head(prediksi_test)
##    row_ids truth response prob.1 prob.0
## 1:       8     1        0    0.3    0.7
## 2:      12     0        0    0.1    0.9
## 3:      23     1        1    0.6    0.4
## 4:      24     0        0    0.0    1.0
## 5:      34     1        0    0.3    0.7
## 6:      35     1        1    0.8    0.2
train_test_telp_knn$prediction()$confusion
##         truth
## response    1    0
##        1 1196  614
##        0  647 1277
accknn <- train_test_telp_knn$aggregate(list(msr("classif.acc"),
                                             msr("classif.specificity"),
                                             msr("classif.sensitivity")))
accknn
##         classif.acc classif.specificity classif.sensitivity 
##           0.6622924           0.6753041           0.6489419
autoplot(train_test_telp_knn, type = "roc")

pohon klasifikasi

set.seed(123)
train_test_telp_tree = resample(task = taskTelp2,
                                learner = learner_tree,
                                resampling = resampleTelp2,
                                store_models = TRUE
)
## INFO  [02:07:55.376] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_tree$prediction())
head(prediksi_test)
##    row_ids truth response    prob.1    prob.0
## 1:       8     1        0 0.2489588 0.7510412
## 2:      12     0        0 0.2489588 0.7510412
## 3:      23     1        1 0.6971014 0.3028986
## 4:      24     0        0 0.2489588 0.7510412
## 5:      34     1        1 0.6994819 0.3005181
## 6:      35     1        1 0.8182294 0.1817706
train_test_telp_tree$prediction()$confusion
##         truth
## response    1    0
##        1 1344  661
##        0  499 1230
acctree<- train_test_telp_tree$aggregate(list(msr("classif.acc"),
                                              msr("classif.specificity"),
                                              msr("classif.sensitivity")
))
acctree
##         classif.acc classif.specificity classif.sensitivity 
##           0.6893412           0.6504495           0.7292458
autoplot(train_test_telp_tree, type = "roc")

#bagging

set.seed(123)
train_test_bagging = resample(task = taskTelp2,
                              learner = learner_bagging,
                              resampling = resampleTelp2,
                              store_models = TRUE
)
## INFO  [02:07:56.345] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1) 
## Growing trees.. Progress: 41%. Estimated remaining time: 43 seconds.
train_test_bagging$prediction()$confusion
##         truth
## response    1    0
##        1 1321  603
##        0  522 1288
accbagging <- train_test_bagging$aggregate(list(msr("classif.acc"),
                                                msr("classif.specificity"),
                                                msr("classif.sensitivity")))
accbagging
##         classif.acc classif.specificity classif.sensitivity 
##           0.6987145           0.6811211           0.7167661
autoplot(train_test_bagging, type = "roc")

#random forest

set.seed(123)
train_test_telp_forest = resample(task = taskTelp2,
                                  learner = learner_rf,
                                  resampling = resampleTelp2,
                                  store_models = TRUE
)
## INFO  [02:09:05.550] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_forest$prediction())
head(prediksi_test)
##    row_ids truth response     prob.1     prob.0
## 1:       8     1        0 0.27723730 0.72276270
## 2:      12     0        0 0.30683413 0.69316587
## 3:      23     1        1 0.76673413 0.23326587
## 4:      24     0        0 0.08648968 0.91351032
## 5:      34     1        1 0.56936270 0.43063730
## 6:      35     1        1 0.91164683 0.08835317
train_test_telp_forest$prediction()$confusion
##         truth
## response    1    0
##        1 1318  577
##        0  525 1314
accforest <- train_test_telp_forest$aggregate(list(msr("classif.acc"),
                                                   msr("classif.specificity"),
                                                   msr("classif.sensitivity")
))
accforest
##         classif.acc classif.specificity classif.sensitivity 
##           0.7048741           0.6948704           0.7151384
autoplot(train_test_telp_forest$prediction(), type = "roc")

AdaBoost

train_test_telp_adaboost = resample(task = taskTelp2,
                                    learner = learner_adaboost,
                                    resampling = resampleTelp2,
                                    store_models = TRUE)
## INFO  [02:09:54.268] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_adaboost$prediction())
head(prediksi_test)
##    row_ids truth response    prob.1    prob.0
## 1:       8     1        0 0.2796991 0.7203009
## 2:      12     0        0 0.2150757 0.7849243
## 3:      23     1        1 0.7138106 0.2861894
## 4:      24     0        0 0.1685538 0.8314462
## 5:      34     1        0 0.4044013 0.5955987
## 6:      35     1        1 0.7765917 0.2234083
train_test_telp_adaboost$prediction()$confusion
##         truth
## response    1    0
##        1 1255  609
##        0  588 1282
accadaboost <- train_test_telp_adaboost$aggregate(list(msr("classif.acc"),
                                                     msr("classif.specificity"),
                                                     msr("classif.sensitivity")
))
accadaboost
##         classif.acc classif.specificity classif.sensitivity 
##           0.6794322           0.6779482           0.6809550
autoplot(train_test_telp_adaboost, type = "roc")

komparasi model

accall<- list(regresi.logistik= accreglog,knn= accknn,pohon.klasifikasi=acctree,random.forest=accforest,bagging=accbagging,adaboost=accadaboost)
accall
## $regresi.logistik
##         classif.acc classif.specificity classif.sensitivity 
##           0.6944296           0.7112639           0.6771568 
## 
## $knn
##         classif.acc classif.specificity classif.sensitivity 
##           0.6622924           0.6753041           0.6489419 
## 
## $pohon.klasifikasi
##         classif.acc classif.specificity classif.sensitivity 
##           0.6893412           0.6504495           0.7292458 
## 
## $random.forest
##         classif.acc classif.specificity classif.sensitivity 
##           0.7048741           0.6948704           0.7151384 
## 
## $bagging
##         classif.acc classif.specificity classif.sensitivity 
##           0.6987145           0.6811211           0.7167661 
## 
## $adaboost
##         classif.acc classif.specificity classif.sensitivity 
##           0.6794322           0.6779482           0.6809550
set.seed(123)
learner_telp <- list(learner_logreg,
                     learner_tree,
                     learner_rf,
                     learner_knn,
                     learner_bagging,
                     learner_adaboost)
set.seed(123)
design.telp <- benchmark_grid(tasks = taskTelp2,
                              learners = learner_telp,
                              resamplings = resampleTelp2 
)
set.seed(123)
bmr = benchmark(design.telp, store_models = TRUE)
## INFO  [02:10:08.466] [mlr3] Running benchmark with 6 resampling iterations 
## INFO  [02:10:08.497] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1) 
## INFO  [02:10:37.716] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1) 
## INFO  [02:10:39.212] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1) 
## INFO  [02:10:39.684] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1) 
## INFO  [02:10:43.158] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1) 
## Growing trees.. Progress: 39%. Estimated remaining time: 47 seconds.
## Growing trees.. Progress: 78%. Estimated remaining time: 17 seconds.
## INFO  [02:12:11.968] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1) 
## INFO  [02:12:17.306] [mlr3] Finished benchmark
bmr$aggregate(list(msr("classif.auc")))
##    nr      resample_result task_id         learner_id resampling_id iters
## 1:  1 <ResampleResult[22]>    telp    classif.log_reg       holdout     1
## 2:  2 <ResampleResult[22]>    telp      classif.rpart       holdout     1
## 3:  3 <ResampleResult[22]>    telp     classif.ranger       holdout     1
## 4:  4 <ResampleResult[22]>    telp       classif.kknn       holdout     1
## 5:  5 <ResampleResult[22]>    telp        bagging clf       holdout     1
## 6:  6 <ResampleResult[22]>    telp classif.AdaBoostM1       holdout     1
##    classif.auc
## 1:   0.7590164
## 2:   0.7355401
## 3:   0.7722200
## 4:   0.7242590
## 5:   0.7643279
## 6:   0.7427994

Pada oversampling, AUC tertinggi didapat ketika menggunakan model random forest.