library(unbalanced)
## Loading required package: mlr
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::when() masks foreach::when()
library(readxl)
library(rpart)
library(rpart.plot)
library(mlr3verse)
## Loading required package: mlr3
##
## Attaching package: 'mlr3'
## The following objects are masked from 'package:mlr':
##
## benchmark, resample
library(mlr3extralearners)
##
## Attaching package: 'mlr3extralearners'
## The following objects are masked from 'package:mlr3':
##
## lrn, lrns
library(ggpubr)
library(RWeka)
dataTelp <- read_excel("Tugas_STA581.xlsx")
dataTelp$Y <- as.factor(dataTelp$Y)
dataTelp <- dataTelp %>% select(-ID)
levels(dataTelp$Y)<- c('0', '1') # names of factors
summary(dataTelp$Y)
## 0 1
## 40664 9336
# plotting number of samples in each class - original dataset
options(scipen=10000) # remove scientific notation when viewing plot
ggplot(data = dataTelp, aes(fill = Y)) +
geom_bar(aes(x = Y))+
ggtitle("Number of samples in each class", subtitle = "Original dataset")+
xlab("")+
ylab("Samples")+
scale_y_continuous(expand = c(0,0))+
scale_x_discrete(expand = c(0,0))+
theme(legend.position = "none",
legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
Dapat dilihat pada plot, jumlah data pada peubah respon kelas 0 dan 1 signifikan perbedaannya.
predictor_variables <- dataTelp[,-12] # Select everything except response
response_variable <- dataTelp$Y # Only select response variable
# run oversampled function on data
oversampled_data <- ubBalance(predictor_variables,
response_variable,
type='ubOver', # Option for oversampling
k = 0, # Value of 0 creates 50:50 split
verbose = TRUE)
## Proportion of positives after ubOver : 50 % of 81328 observations
oversampled_combined <- cbind(oversampled_data$X, # combine output
oversampled_data$Y)
names(oversampled_combined)[names(oversampled_combined) == "oversampled_data$Y"] <- "Class" # change name to class
levels(oversampled_combined$Class) <- c('0', '1')
summary(oversampled_combined$Class)
## 0 1
## 40664 40664
Jumlah data pada kelas 1 bertambah hingga jumlahnya sama dengan kelas 0.
sum(summary(oversampled_combined$Class))
## [1] 81328
# plot number of cases in oversampled dataset
ggplot(data = oversampled_combined, aes(fill = Class))+
geom_bar(aes(x = Class))+
ggtitle("Number of samples in each class after oversampling", subtitle="Total samples: 81328 ")+
xlab("")+
ylab("Samples")+
scale_y_continuous(expand = c(0,0))+
scale_x_discrete(expand = c(0,0))+
theme(legend.position = "none",
legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
Kedua kelas menjadi balanced.
oversampled_combined$Class <- as.factor(oversampled_combined$Class)
taskTelp5 = TaskClassif$new(id="telp",
backend = oversampled_combined,
target = "Class",
positive ="1")
learner_logreg <- lrn("classif.log_reg", predict_type="prob")
learner_knn <- lrn("classif.kknn",predict_type="prob",k=10,kernel="rectangular")
learner_tree <- lrn("classif.rpart", cp = 0.001, predict_type="prob")
learner_bagging <- lrn(id="bagging clf", "classif.ranger", mtry=11, predict_type="prob", importance="impurity")
learner_rf <- lrn("classif.ranger",predict_type="prob",importance="impurity")
learner_adaboost <- lrn("classif.AdaBoostM1",predict_type="prob")
set.seed(123)
resampleTelp5 =rsmp("holdout", ratio = 0.8)
resampleTelp5$instantiate(task=taskTelp5)
#Regresi Logistik
learner_logreg$train(task = taskTelp5)
summary(learner_logreg$model)
##
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data,
## model = FALSE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9839 -1.0094 -0.0838 1.0437 2.6942
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4891173353 0.0481076495 -10.167 <0.0000000000000002 ***
## X1 0.0007175279 0.0000571410 12.557 <0.0000000000000002 ***
## X10 -0.1334146199 0.0033335543 -40.022 <0.0000000000000002 ***
## X11 -0.0680193556 0.0060078377 -11.322 <0.0000000000000002 ***
## X2 0.0000188083 0.0000004283 43.913 <0.0000000000000002 ***
## X3 -0.0003906647 0.0002285885 -1.709 0.0874 .
## X4 0.9741517804 0.0586603536 16.607 <0.0000000000000002 ***
## X5 -0.4255059357 0.0489185794 -8.698 <0.0000000000000002 ***
## X6 0.0000198160 0.0000012310 16.098 <0.0000000000000002 ***
## X7 0.0299224072 0.0346901878 0.863 0.3884
## X8 0.0000056551 0.0000040070 1.411 0.1582
## X9 0.0000237748 0.0000003998 59.469 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 112745 on 81327 degrees of freedom
## Residual deviance: 98357 on 81316 degrees of freedom
## AIC: 98381
##
## Number of Fisher Scoring iterations: 4
train_test_telp_log = resample(task = taskTelp5,
learner = learner_logreg,
resampling = resampleTelp5,
store_models = TRUE)
## INFO [01:36:05.893] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_log$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 4 1 1 0.52138513 0.4786149
## 2: 9 1 1 0.52138513 0.4786149
## 3: 11 0 0 0.37244614 0.6275539
## 4: 13 1 0 0.06888776 0.9311122
## 5: 14 1 0 0.06888776 0.9311122
## 6: 19 0 0 0.42544388 0.5745561
train_test_telp_log$prediction()$confusion
## truth
## response 1 0
## 1 5429 2336
## 0 2641 5860
accreglog <- train_test_telp_log$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accreglog
## classif.acc classif.specificity classif.sensitivity
## 0.6940243 0.7149829 0.6727385
autoplot(train_test_telp_log, type = "roc")
#KNN
set.seed(123)
train_test_telp_knn = resample(task = taskTelp5,
learner = learner_knn,
resampling = resampleTelp5,
store_models = TRUE)
## INFO [01:36:07.435] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_knn$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 4 1 1 0.7 0.3
## 2: 9 1 1 0.7 0.3
## 3: 11 0 0 0.0 1.0
## 4: 13 1 0 0.0 1.0
## 5: 14 1 0 0.0 1.0
## 6: 19 0 0 0.4 0.6
train_test_telp_knn$prediction()$confusion
## truth
## response 1 0
## 1 6397 2767
## 0 1673 5429
accknn <- train_test_telp_knn$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")))
accknn
## classif.acc classif.specificity classif.sensitivity
## 0.7270380 0.6623963 0.7926890
autoplot(train_test_telp_knn, type = "roc")
learner_tree$train(task = taskTelp5)
rpart.plot(learner_tree$model,roundint = F,type = 5,tweak = 2)
set.seed(123)
train_test_telp_tree = resample(task = taskTelp5,
learner = learner_tree,
resampling = resampleTelp5,
store_models = TRUE
)
## INFO [01:36:25.391] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_tree$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 4 1 0 0.3418675 0.6581325
## 2: 9 1 0 0.3418675 0.6581325
## 3: 11 0 0 0.2568124 0.7431876
## 4: 13 1 0 0.2568124 0.7431876
## 5: 14 1 0 0.2568124 0.7431876
## 6: 19 0 0 0.2568124 0.7431876
train_test_telp_tree$prediction()$confusion
## truth
## response 1 0
## 1 5772 2616
## 0 2298 5580
acctree<- train_test_telp_tree$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
acctree
## classif.acc classif.specificity classif.sensitivity
## 0.6978975 0.6808199 0.7152416
autoplot(train_test_telp_tree, type = "roc")
set.seed(123)
train_test_bagging = resample(task = taskTelp5,
learner = learner_bagging,
resampling = resampleTelp5,
store_models = TRUE
)
## INFO [01:36:27.020] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1)
## Growing trees.. Progress: 9%. Estimated remaining time: 4 minutes, 58 seconds.
## Growing trees.. Progress: 19%. Estimated remaining time: 4 minutes, 17 seconds.
## Growing trees.. Progress: 29%. Estimated remaining time: 3 minutes, 52 seconds.
## Growing trees.. Progress: 40%. Estimated remaining time: 3 minutes, 5 seconds.
## Growing trees.. Progress: 51%. Estimated remaining time: 2 minutes, 27 seconds.
## Growing trees.. Progress: 62%. Estimated remaining time: 1 minute, 52 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 1 minute, 19 seconds.
## Growing trees.. Progress: 84%. Estimated remaining time: 46 seconds.
## Growing trees.. Progress: 96%. Estimated remaining time: 11 seconds.
train_test_bagging$prediction()$confusion
## truth
## response 1 0
## 1 7856 864
## 0 214 7332
accbagging <- train_test_bagging$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")))
accbagging
## classif.acc classif.specificity classif.sensitivity
## 0.9337268 0.8945827 0.9734820
autoplot(train_test_bagging, type = "roc")
#rf
learner_rf$train(task=taskTelp5)
## Growing trees.. Progress: 21%. Estimated remaining time: 1 minute, 55 seconds.
## Growing trees.. Progress: 48%. Estimated remaining time: 1 minute, 6 seconds.
## Growing trees.. Progress: 76%. Estimated remaining time: 30 seconds.
learner_rf$model$variable.importance
## X1 X10 X11 X2 X3 X4 X5 X6
## 2922.236 3656.146 2875.107 4533.043 2652.929 3301.327 3104.815 3445.650
## X7 X8 X9
## 2879.199 3335.405 4629.780
set.seed(123)
train_test_telp_forest = resample(task = taskTelp5,
learner = learner_rf,
resampling = resampleTelp5,
store_models = TRUE
)
## INFO [01:44:19.917] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1)
## Growing trees.. Progress: 30%. Estimated remaining time: 1 minute, 10 seconds.
## Growing trees.. Progress: 64%. Estimated remaining time: 35 seconds.
## Growing trees.. Progress: 99%. Estimated remaining time: 0 seconds.
prediksi_test = as.data.table(train_test_telp_forest$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 4 1 1 0.8793238 0.1206762
## 2: 9 1 1 0.8793238 0.1206762
## 3: 11 0 0 0.1559683 0.8440317
## 4: 13 1 0 0.1316413 0.8683587
## 5: 14 1 0 0.1316413 0.8683587
## 6: 19 0 0 0.2312825 0.7687175
train_test_telp_forest$prediction()$confusion
## truth
## response 1 0
## 1 7814 770
## 0 256 7426
accforest <- train_test_telp_forest$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accforest
## classif.acc classif.specificity classif.sensitivity
## 0.9369236 0.9060517 0.9682776
autoplot(train_test_telp_forest$prediction(), type = "roc")
train_test_telp_adaboost = resample(task = taskTelp5,
learner = learner_adaboost,
resampling = resampleTelp5,
store_models = TRUE)
## INFO [01:46:51.609] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1)
prediksi_test = as.data.table(train_test_telp_adaboost$prediction())
head(prediksi_test)
## row_ids truth response prob.1 prob.0
## 1: 4 1 1 0.5588525 0.4411475
## 2: 9 1 1 0.5588525 0.4411475
## 3: 11 0 0 0.2319949 0.7680051
## 4: 13 1 0 0.2939067 0.7060933
## 5: 14 1 0 0.2939067 0.7060933
## 6: 19 0 0 0.2599529 0.7400471
train_test_telp_adaboost$prediction()$confusion
## truth
## response 1 0
## 1 5074 2427
## 0 2996 5769
accadaboost <- train_test_telp_adaboost$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accadaboost
## classif.acc classif.specificity classif.sensitivity
## 0.6666052 0.7038799 0.6287485
autoplot(train_test_telp_adaboost, type = "roc")
accall<- list(regresi.logistik= accreglog,knn= accknn,pohon.klasifikasi=acctree,random.forest=accforest,bagging=accbagging,adaboost=accadaboost)
accall
## $regresi.logistik
## classif.acc classif.specificity classif.sensitivity
## 0.6940243 0.7149829 0.6727385
##
## $knn
## classif.acc classif.specificity classif.sensitivity
## 0.7270380 0.6623963 0.7926890
##
## $pohon.klasifikasi
## classif.acc classif.specificity classif.sensitivity
## 0.6978975 0.6808199 0.7152416
##
## $random.forest
## classif.acc classif.specificity classif.sensitivity
## 0.9369236 0.9060517 0.9682776
##
## $bagging
## classif.acc classif.specificity classif.sensitivity
## 0.9337268 0.8945827 0.9734820
##
## $adaboost
## classif.acc classif.specificity classif.sensitivity
## 0.6666052 0.7038799 0.6287485
Walaupun akurasi tidak terlalu tinggi, tetapi specificity dan sensitivity terlihat seimbang sehingga prediksi pada kelas minoritas membaik.
set.seed(123)
learner_telp <- list(learner_logreg,
learner_tree,
learner_rf,
learner_knn,
learner_bagging,
learner_adaboost)
set.seed(123)
design.telp <- benchmark_grid(tasks = taskTelp5,
learners = learner_telp,
resamplings = resampleTelp5
)
set.seed(123)
bmr = benchmark(design.telp, store_models = TRUE)
## INFO [01:46:59.693] [mlr3] Running benchmark with 6 resampling iterations
## INFO [01:46:59.731] [mlr3] Applying learner 'classif.ranger' on task 'telp' (iter 1/1)
## Growing trees.. Progress: 23%. Estimated remaining time: 1 minute, 43 seconds.
## Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 13 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 34 seconds.
## INFO [01:50:00.714] [mlr3] Applying learner 'classif.AdaBoostM1' on task 'telp' (iter 1/1)
## INFO [01:50:04.878] [mlr3] Applying learner 'classif.rpart' on task 'telp' (iter 1/1)
## INFO [01:50:05.949] [mlr3] Applying learner 'classif.kknn' on task 'telp' (iter 1/1)
## INFO [01:50:28.110] [mlr3] Applying learner 'bagging clf' on task 'telp' (iter 1/1)
## Growing trees.. Progress: 11%. Estimated remaining time: 4 minutes, 5 seconds.
## Growing trees.. Progress: 21%. Estimated remaining time: 3 minutes, 50 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 3 minutes, 17 seconds.
## Growing trees.. Progress: 41%. Estimated remaining time: 2 minutes, 59 seconds.
## Growing trees.. Progress: 48%. Estimated remaining time: 2 minutes, 50 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 2 minutes, 30 seconds.
## Growing trees.. Progress: 65%. Estimated remaining time: 1 minute, 58 seconds.
## Growing trees.. Progress: 73%. Estimated remaining time: 1 minute, 32 seconds.
## Growing trees.. Progress: 83%. Estimated remaining time: 57 seconds.
## Growing trees.. Progress: 92%. Estimated remaining time: 27 seconds.
## INFO [01:57:01.749] [mlr3] Applying learner 'classif.log_reg' on task 'telp' (iter 1/1)
## INFO [01:57:13.540] [mlr3] Finished benchmark
bmr$aggregate(list(msr("classif.auc")))
## nr resample_result task_id learner_id resampling_id iters
## 1: 1 <ResampleResult[22]> telp classif.log_reg holdout 1
## 2: 2 <ResampleResult[22]> telp classif.rpart holdout 1
## 3: 3 <ResampleResult[22]> telp classif.ranger holdout 1
## 4: 4 <ResampleResult[22]> telp classif.kknn holdout 1
## 5: 5 <ResampleResult[22]> telp bagging clf holdout 1
## 6: 6 <ResampleResult[22]> telp classif.AdaBoostM1 holdout 1
## classif.auc
## 1: 0.7502483
## 2: 0.7393851
## 3: 0.9870566
## 4: 0.8074962
## 5: 0.9869867
## 6: 0.7321031
Pada oversampling, AUC tertinggi didapat ketika menggunakan model bagging serta random forest.