library(unbalanced)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::when() masks foreach::when()
library(readxl)
library(rpart)
library(rpart.plot)
library(mlr3verse)
## Loading required package: mlr3
##
## Attaching package: 'mlr3'
## The following objects are masked from 'package:mlr':
##
## benchmark, resample
library(mlr3extralearners)
##
## Attaching package: 'mlr3extralearners'
## The following objects are masked from 'package:mlr3':
##
## lrn, lrns
library(ggpubr)
library(RWeka)
dataTelp <- read_excel("Tugas_STA581.xlsx")
dataTelp$Y <- as.factor(dataTelp$Y)
dataTelp <- dataTelp %>% select(-ID)
levels(dataTelp$Y)<- c('0', '1') # names of factors
summary(dataTelp$Y)
## 0 1
## 40664 9336
# plotting number of samples in each class - original dataset
options(scipen=10000) # remove scientific notation when viewing plot
ggplot(data = dataTelp, aes(fill = Y)) +
geom_bar(aes(x = Y))+
ggtitle("Number of samples in each class", subtitle = "Original dataset")+
xlab("")+
ylab("Samples")+
scale_y_continuous(expand = c(0,0))+
scale_x_discrete(expand = c(0,0))+
theme(legend.position = "none",
legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
Dapat dilihat pada plot, jumlah data pada peubah respon kelas 0 dan 1 signifikan perbedaannya.
predictor_variables <- dataTelp[,-12] # Select everything except response
response_variable <- dataTelp$Y # Only select response variable
# Run undersampled function
undersampled_data <- ubBalance(predictor_variables,
response_variable,
type='ubUnder', # Option for undersampling
verbose = TRUE)
## Proportion of positives after ubUnder : 50 % of 18672 observations
undersampled_combined <- cbind(undersampled_data$X, # combine output
undersampled_data$Y)
names(undersampled_combined)[names(undersampled_combined) == "undersampled_data$Y"] <- "Class" # change name to class
levels(undersampled_combined$Class) <- c('0', '1')
summary(undersampled_combined$Class)
## 0 1
## 9336 9336
Data pada kelas 0 akan dipilih kembali agar jumlahnya sama dengan data kelas 1.
sum(summary(undersampled_combined$Class))
## [1] 18672
# plot number of cases in undersampled dataset
ggplot(data = undersampled_combined, aes(fill = Class))+
geom_bar(aes(x = Class))+
ggtitle("Number of samples in each class after undersampling",
subtitle="Total samples: 18.672" )+
xlab("")+
ylab("Samples")+
scale_y_continuous(expand = c(0,0))+
scale_x_discrete(expand = c(0,0))+
theme(legend.position = "none",
legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
Kedua kelas menjadi balanced.
undersampled_combined$Class <- as.factor(undersampled_combined$Class)
taskTelp2 = TaskClassif$new(id="telp",
backend = undersampled_combined,
target = "Class",
positive ="1")
learner_logreg <- lrn("classif.log_reg", predict_type="prob")
learner_knn <- lrn("classif.kknn",predict_type="prob",k=10,kernel="rectangular")
learner_tree <- lrn("classif.rpart", cp = 0.001, predict_type="prob")
learner_bagging <- lrn(id="bagging clf", "classif.ranger", mtry=11, predict_type="prob", importance="impurity")
learner_rf <- lrn("classif.ranger",predict_type="prob",importance="impurity")
learner_adaboost <- lrn("classif.AdaBoostM1",predict_type="prob")
set.seed(123)
resampleTelp2 =rsmp("holdout", ratio = 0.8)
resampleTelp2$instantiate(task=taskTelp2)
#Regresi Logistik
learner_logreg$train(task = taskTelp2)
summary(learner_logreg$model)
##
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data,
## model = FALSE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6373 -1.0055 -0.1093 1.0378 2.7469
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3679576198 0.1028175258 -3.579 0.000345 ***
## X1 0.0007169010 0.0001239563 5.783 0.00000000731637318 ***
## X10 -0.1476707843 0.0070067506 -21.076 < 0.0000000000000002 ***
## X11 -0.0498576359 0.0126242227 -3.949 0.00007835949729243 ***
## X2 0.0000188650 0.0000009018 20.920 < 0.0000000000000002 ***
## X3 0.0001396877 0.0004836270 0.289 0.772709
## X4 0.8271152983 0.1252242517 6.605 0.00000000003973221 ***
## X5 -0.4345964403 0.1037806180 -4.188 0.00002818632937296 ***
## X6 0.0000202774 0.0000025955 7.812 0.00000000000000561 ***
## X7 0.0367199726 0.0731367043 0.502 0.615616
## X8 0.0000002060 0.0000087392 0.024 0.981198
## X9 0.0000243731 0.0000008476 28.754 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25885 on 18671 degrees of freedom
## Residual deviance: 22441 on 18660 degrees of freedom
## AIC: 22465
##
## Number of Fisher Scoring iterations: 4
train_test_telp_log = resample(task = taskTelp2,
learner = learner_logreg,
resampling = resampleTelp2,
store_models = TRUE)
## INFO [02:07:51.365] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_log$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 8 1 0 0.3942380 0.6057620
## 2: 12 0 0 0.2971260 0.7028740
## 3: 23 1 1 0.6846174 0.3153826
## 4: 24 0 0 0.1964586 0.8035414
## 5: 34 1 1 0.5770908 0.4229092
## 6: 35 1 1 0.7672081 0.2327919
train_test_telp_log$prediction()$confusion
## truth
## response 1 0
## 1 1248 546
## 0 595 1345
accreglog <- train_test_telp_log$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accreglog
## classif.acc classif.specificity classif.sensitivity
## 0.6944296 0.7112639 0.6771568
autoplot(train_test_telp_log, type = "roc")
set.seed(123)
train_test_telp_knn = resample(task = taskTelp2,
learner = learner_knn,
resampling = resampleTelp2,
store_models = TRUE)
## INFO [02:07:52.676] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_knn$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 8 1 0 0.3 0.7
## 2: 12 0 0 0.1 0.9
## 3: 23 1 1 0.6 0.4
## 4: 24 0 0 0.0 1.0
## 5: 34 1 0 0.3 0.7
## 6: 35 1 1 0.8 0.2
train_test_telp_knn$prediction()$confusion
## truth
## response 1 0
## 1 1196 614
## 0 647 1277
accknn <- train_test_telp_knn$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")))
accknn
## classif.acc classif.specificity classif.sensitivity
## 0.6622924 0.6753041 0.6489419
autoplot(train_test_telp_knn, type = "roc")
set.seed(123)
train_test_telp_tree = resample(task = taskTelp2,
learner = learner_tree,
resampling = resampleTelp2,
store_models = TRUE
)
## INFO [02:07:55.376] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_tree$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 8 1 0 0.2489588 0.7510412
## 2: 12 0 0 0.2489588 0.7510412
## 3: 23 1 1 0.6971014 0.3028986
## 4: 24 0 0 0.2489588 0.7510412
## 5: 34 1 1 0.6994819 0.3005181
## 6: 35 1 1 0.8182294 0.1817706
train_test_telp_tree$prediction()$confusion
## truth
## response 1 0
## 1 1344 661
## 0 499 1230
acctree<- train_test_telp_tree$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
acctree
## classif.acc classif.specificity classif.sensitivity
## 0.6893412 0.6504495 0.7292458
autoplot(train_test_telp_tree, type = "roc")
#bagging
set.seed(123)
train_test_bagging = resample(task = taskTelp2,
learner = learner_bagging,
resampling = resampleTelp2,
store_models = TRUE
)
## INFO [02:07:56.345] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1)
## Growing trees.. Progress: 41%. Estimated remaining time: 43 seconds.
train_test_bagging$prediction()$confusion
## truth
## response 1 0
## 1 1321 603
## 0 522 1288
accbagging <- train_test_bagging$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")))
accbagging
## classif.acc classif.specificity classif.sensitivity
## 0.6987145 0.6811211 0.7167661
autoplot(train_test_bagging, type = "roc")
#random forest
set.seed(123)
train_test_telp_forest = resample(task = taskTelp2,
learner = learner_rf,
resampling = resampleTelp2,
store_models = TRUE
)
## INFO [02:09:05.550] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_forest$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 8 1 0 0.27723730 0.72276270
## 2: 12 0 0 0.30683413 0.69316587
## 3: 23 1 1 0.76673413 0.23326587
## 4: 24 0 0 0.08648968 0.91351032
## 5: 34 1 1 0.56936270 0.43063730
## 6: 35 1 1 0.91164683 0.08835317
train_test_telp_forest$prediction()$confusion
## truth
## response 1 0
## 1 1318 577
## 0 525 1314
accforest <- train_test_telp_forest$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accforest
## classif.acc classif.specificity classif.sensitivity
## 0.7048741 0.6948704 0.7151384
autoplot(train_test_telp_forest$prediction(), type = "roc")
train_test_telp_adaboost = resample(task = taskTelp2,
learner = learner_adaboost,
resampling = resampleTelp2,
store_models = TRUE)
## INFO [02:09:54.268] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_adaboost$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 8 1 0 0.2796991 0.7203009
## 2: 12 0 0 0.2150757 0.7849243
## 3: 23 1 1 0.7138106 0.2861894
## 4: 24 0 0 0.1685538 0.8314462
## 5: 34 1 0 0.4044013 0.5955987
## 6: 35 1 1 0.7765917 0.2234083
train_test_telp_adaboost$prediction()$confusion
## truth
## response 1 0
## 1 1255 609
## 0 588 1282
accadaboost <- train_test_telp_adaboost$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accadaboost
## classif.acc classif.specificity classif.sensitivity
## 0.6794322 0.6779482 0.6809550
autoplot(train_test_telp_adaboost, type = "roc")
accall<- list(regresi.logistik= accreglog,knn= accknn,pohon.klasifikasi=acctree,random.forest=accforest,bagging=accbagging,adaboost=accadaboost)
accall
## $regresi.logistik
## classif.acc classif.specificity classif.sensitivity
## 0.6944296 0.7112639 0.6771568
##
## $knn
## classif.acc classif.specificity classif.sensitivity
## 0.6622924 0.6753041 0.6489419
##
## $pohon.klasifikasi
## classif.acc classif.specificity classif.sensitivity
## 0.6893412 0.6504495 0.7292458
##
## $random.forest
## classif.acc classif.specificity classif.sensitivity
## 0.7048741 0.6948704 0.7151384
##
## $bagging
## classif.acc classif.specificity classif.sensitivity
## 0.6987145 0.6811211 0.7167661
##
## $adaboost
## classif.acc classif.specificity classif.sensitivity
## 0.6794322 0.6779482 0.6809550
set.seed(123)
learner_telp <- list(learner_logreg,
learner_tree,
learner_rf,
learner_knn,
learner_bagging,
learner_adaboost)
set.seed(123)
design.telp <- benchmark_grid(tasks = taskTelp2,
learners = learner_telp,
resamplings = resampleTelp2
)
set.seed(123)
bmr = benchmark(design.telp, store_models = TRUE)
## INFO [02:10:08.466] [mlr3] Running benchmark with 6 resampling iterations
## INFO [02:10:08.497] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1)
## INFO [02:10:37.716] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1)
## INFO [02:10:39.212] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1)
## INFO [02:10:39.684] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1)
## INFO [02:10:43.158] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1)
## Growing trees.. Progress: 39%. Estimated remaining time: 47 seconds.
## Growing trees.. Progress: 78%. Estimated remaining time: 17 seconds.
## INFO [02:12:11.968] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1)
## INFO [02:12:17.306] [mlr3] Finished benchmark
bmr$aggregate(list(msr("classif.auc")))
## nr resample_result task_id learner_id resampling_id iters
## 1: 1 <ResampleResult[22]> telp classif.log_reg holdout 1
## 2: 2 <ResampleResult[22]> telp classif.rpart holdout 1
## 3: 3 <ResampleResult[22]> telp classif.ranger holdout 1
## 4: 4 <ResampleResult[22]> telp classif.kknn holdout 1
## 5: 5 <ResampleResult[22]> telp bagging clf holdout 1
## 6: 6 <ResampleResult[22]> telp classif.AdaBoostM1 holdout 1
## classif.auc
## 1: 0.7590164
## 2: 0.7355401
## 3: 0.7722200
## 4: 0.7242590
## 5: 0.7643279
## 6: 0.7427994
Pada oversampling, AUC tertinggi didapat ketika menggunakan model random forest.