Data
data <- readxl::read_excel('Tugas_STA581.xlsx')
data <- data[-1]  # drop the first column (assumed to be a row identifier, not a predictor)
set.seed(1)
# 80/20 random train/test split
sample <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE, prob=c(0.8,0.2))
train <- data[sample, ]
test <- data[!sample, ]
lattice::barchart(as.factor(train$Y), col='maroon')  # class distribution before oversampling

# balance the training classes with SMOTE (column 12 is the response Y)
smote_train <- smotefamily::SMOTE(train[,-12], train$Y)
newtrain <- smote_train$data  # SMOTE stores the response in a column named 'class'
lattice::barchart(newtrain$class, col = 'navy')  # class distribution after oversampling
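A quick count of the two classes before and after oversampling confirms the balancing (a minimal check, not part of the original run):
table(train$Y)        # imbalanced counts in the raw training set
table(newtrain$class) # roughly balanced counts after SMOTE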

Binary Logistic Regression
log_model <- glm(as.factor(class)~., data = newtrain, family = "binomial")
car::vif(log_model)
## X1 X2 X3 X4 X5 X6 X7 X8
## 1.710112 1.894378 2.237184 1.226638 1.690421 1.361770 1.374746 1.422844
## X9 X10 X11
## 1.572714 3.458002 2.819418
All VIF values are well below 5, so multicollinearity is not a serious concern and all eleven predictors are kept.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
train_ctrl <- trainControl(method = 'cv', number = 10)  # 10-fold cross-validation
set.seed(2022)
log_train <- newtrain
log_model <- train(as.factor(class)~., data = log_train, method = 'glm',
                   family='binomial', trControl=train_ctrl, na.action=na.omit)
print(log_model$results)
## parameter Accuracy Kappa AccuracySD KappaSD
## 1 none 0.6867082 0.370383 0.006477365 0.01306511
print(log_model$resample)
## Accuracy Kappa Resample
## 1 0.6829464 0.3630067 Fold01
## 2 0.6861489 0.3691356 Fold02
## 3 0.6849183 0.3663729 Fold03
## 4 0.6866293 0.3702622 Fold04
## 5 0.6786229 0.3540686 Fold05
## 6 0.6969260 0.3909726 Fold06
## 7 0.6789432 0.3545212 Fold07
## 8 0.6835869 0.3646011 Fold08
## 9 0.6962370 0.3896515 Fold09
## 10 0.6921230 0.3812373 Fold10
plot(x=1:10, y=log_model$resample$Accuracy, xlab='Fold', ylab='Accuracy', type='b', xaxt='n')
points(x=which.max(log_model$resample$Accuracy), y=max(log_model$resample$Accuracy), col='red', pch=20)
axis(1, at = seq(1, 10, by=1))  # integer fold labels (default x-axis suppressed above)

log_test <- test
log_test$Y <- as.factor(log_test$Y)
log_predict <- predict(log_model, newdata = test, type = 'raw')
# note: confusionMatrix(data, reference) expects predictions as the first argument;
# with the order reversed as here, Sensitivity/Pos Pred Value (and Specificity/
# Neg Pred Value) are reported with their roles interchanged
log_conf <- confusionMatrix(log_test$Y, log_predict, positive = "1", mode = 'everything')
log_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5933 2198
## 1 672 1184
##
## Accuracy : 0.7126
## 95% CI : (0.7036, 0.7215)
## No Information Rate : 0.6614
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2791
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3501
## Specificity : 0.8983
## Pos Pred Value : 0.6379
## Neg Pred Value : 0.7297
## Precision : 0.6379
## Recall : 0.3501
## F1 : 0.4521
## Prevalence : 0.3386
## Detection Rate : 0.1186
## Detection Prevalence : 0.1858
## Balanced Accuracy : 0.6242
##
## 'Positive' Class : 1
##
acc_log <- log_conf$overall['Accuracy']
Classification Tree
library(rpart)
library(rpart.plot)
clas_test <- test
clas_train <- newtrain
clas_tree <- rpart(as.factor(class)~., data = clas_train, method = 'class',
                   control = rpart.control(cp = 0, minsplit = 5000))
rpart.plot(clas_tree, extra = 'auto', box.palette="RdBu", shadow.col="gray", nn=TRUE)

# predict() for a class-method rpart returns a matrix of class probabilities;
# column 2 holds P(class = 1)
clas_prob <- predict(clas_tree, clas_test)
clas_predict <- ifelse(clas_prob[,2] > 0.5, 1, 0)
clas_conf <- confusionMatrix(as.factor(clas_predict), as.factor(clas_test$Y))
clas_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5842 826
## 1 2289 1030
##
## Accuracy : 0.6881
## 95% CI : (0.6789, 0.6972)
## No Information Rate : 0.8142
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2097
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7185
## Specificity : 0.5550
## Pos Pred Value : 0.8761
## Neg Pred Value : 0.3103
## Prevalence : 0.8142
## Detection Rate : 0.5850
## Detection Prevalence : 0.6677
## Balanced Accuracy : 0.6367
##
## 'Positive' Class : 0
##
acc_clas <- clas_conf$overall['Accuracy']
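As an aside, rpart can also return hard class labels directly, which makes the manual thresholding above unnecessary (a minimal equivalent using the same fitted tree):
clas_class <- predict(clas_tree, clas_test, type = 'class')  # factor of "0"/"1"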
Random Forest
rf_train <- newtrain
rf_test <- test
set.seed(2022)
rf_model <- randomForest::randomForest(as.factor(class)~., data = rf_train,
                                       ntree = 500)
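Variable importance is available from the fitted forest and can help interpret which predictors drive the votes (a quick optional check, not part of the original output):
randomForest::varImpPlot(rf_model)  # mean decrease in Gini per predictor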
rf_pred <- predict(rf_model, rf_test, type="prob")  # class probabilities
rf_predict <- ifelse(rf_pred[,2] > 0.5, 1, 0)       # column 2 = P(class = 1)
rf_conf <- confusionMatrix(as.factor(rf_predict), as.factor(rf_test$Y), positive = "1", mode='everything')
rf_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7159 1096
## 1 972 760
##
## Accuracy : 0.7929
## 95% CI : (0.7848, 0.8008)
## No Information Rate : 0.8142
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.2976
##
## Mcnemar's Test P-Value : 0.006835
##
## Sensitivity : 0.4095
## Specificity : 0.8805
## Pos Pred Value : 0.4388
## Neg Pred Value : 0.8672
## Precision : 0.4388
## Recall : 0.4095
## F1 : 0.4236
## Prevalence : 0.1858
## Detection Rate : 0.0761
## Detection Prevalence : 0.1734
## Balanced Accuracy : 0.6450
##
## 'Positive' Class : 1
##
acc_rf <- rf_conf$overall["Accuracy"]
XGBoost
xgb_train <- newtrain
xgb_test <- test
library(xgboost)
# column 12 is the response ('class' in the SMOTE-balanced train set, 'Y' in the test set)
xgboost_train = xgb.DMatrix(data=as.matrix(xgb_train[,-12]), label=as.numeric(as.matrix(xgb_train[,12])))
xgboost_test = xgb.DMatrix(data=as.matrix(xgb_test[,-12]), label=as.numeric(as.matrix(xgb_test[,12])))
xgb_hyper <- expand.grid(
  max.depth = c(6, 7, 8, 9, 10),  # maximum depth of each tree
  optimal_trees = 0,              # placeholder, to be filled by cross-validation
  min_RMSE = 0,                   # placeholder, to be filled by cross-validation
  nrounds = c(200, 250))
nrow(xgb_hyper)
## [1] 10
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# the grid below is printed before any tuning, so the placeholder columns are still zero
xgb_hyper %>%
  arrange(min_RMSE) %>%
  head(10)
## max.depth optimal_trees min_RMSE nrounds
## 1 6 0 0 200
## 2 7 0 0 200
## 3 8 0 0 200
## 4 9 0 0 200
## 5 10 0 0 200
## 6 6 0 0 250
## 7 7 0 0 250
## 8 8 0 0 250
## 9 9 0 0 250
## 10 10 0 0 250
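The placeholder columns would be filled by cross-validating each grid row, for instance with xgboost::xgb.cv (a sketch of the omitted tuning loop, assuming the xgboost_train matrix built above; not executed here):
for (i in seq_len(nrow(xgb_hyper))) {
  cv <- xgb.cv(data = xgboost_train, nfold = 5, verbose = 0,
               max.depth = xgb_hyper$max.depth[i],
               nrounds = xgb_hyper$nrounds[i])
  # record the best iteration and its held-out RMSE for this grid row
  xgb_hyper$optimal_trees[i] <- which.min(cv$evaluation_log$test_rmse_mean)
  xgb_hyper$min_RMSE[i] <- min(cv$evaluation_log$test_rmse_mean)
}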
# note: no objective is set, so xgboost defaults to regression (reg:squarederror)
# on the 0/1 labels, which is why the log below reports train-rmse; for a
# classification fit one would typically pass objective = "binary:logistic"
xgb_final <- xgboost(data = xgboost_train,
                     max.depth = 6,
                     nrounds = 200)
## [1] train-rmse:0.466160
## [2] train-rmse:0.440813
## [3] train-rmse:0.425039
## ... (train-rmse decreases steadily over iterations 4-197) ...
## [198] train-rmse:0.244813
## [199] train-rmse:0.244791
## [200] train-rmse:0.244601
xgb_pred <- predict(xgb_final, xgboost_test)  # raw regression scores, not calibrated probabilities
xgb_predict <- ifelse(xgb_pred > 0.5, 1, 0)
xgb_conf <- confusionMatrix(as.factor(xgb_test$Y), as.factor(xgb_predict), positive = "1", mode='everything')
xgb_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7699 432
## 1 1407 449
##
## Accuracy : 0.8159
## 95% CI : (0.8081, 0.8234)
## No Information Rate : 0.9118
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2368
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.50965
## Specificity : 0.84549
## Pos Pred Value : 0.24192
## Neg Pred Value : 0.94687
## Precision : 0.24192
## Recall : 0.50965
## F1 : 0.32810
## Prevalence : 0.08821
## Detection Rate : 0.04496
## Detection Prevalence : 0.18584
## Balanced Accuracy : 0.67757
##
## 'Positive' Class : 1
##
acc_xgb <- xgb_conf$overall['Accuracy']
KNN
knn_test <- test
knn_train <- newtrain
# min-max scaling to [0, 1]
normalize <- function(x){
  return((x - min(x)) / (max(x) - min(x)))
}
norm_train <- as.data.frame(lapply(knn_train[,-12], normalize))
# note: the test set is scaled with its own min/max here; strictly, it should be
# scaled with the training min/max to avoid using test-set information
norm_test <- as.data.frame(lapply(knn_test[,-12], normalize))
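A leakage-free variant scales each test column with the corresponding training column's range (a minimal sketch using base R's Map; norm_test_alt is a hypothetical name, not used below):
norm_test_alt <- as.data.frame(Map(function(x, tr) (x - min(tr)) / (max(tr) - min(tr)),
                                   knn_test[,-12], knn_train[,-12]))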
# keep the response columns under their original names ('class' and 'Y') so the
# formula below and norm_test2$Y both work
norm_train2 <- cbind(norm_train, class = knn_train$class)
norm_test2 <- cbind(norm_test, Y = knn_test$Y)
knn_trctrl <- trainControl(method = "cv", number = 10)
knn_final <- train(as.factor(class)~.,
                   data = norm_train2,
                   method = "knn",
                   trControl = knn_trctrl,
                   tuneGrid = data.frame(k=1))  # k fixed at 1 (single nearest neighbour)
knn_final$results
## k Accuracy Kappa AccuracySD KappaSD
## 1 1 0.7514452 0.5049802 0.006359864 0.01265753
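Fixing k = 1 tends to overfit; letting caret search a small grid of odd k values is a one-line change (a sketch reusing the objects above; knn_tuned is a hypothetical name, not run here):
knn_tuned <- train(as.factor(class)~., data = norm_train2, method = "knn",
                   trControl = knn_trctrl, tuneGrid = data.frame(k = seq(1, 15, by = 2)))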
knn_pred <- predict(knn_final, norm_test2)
knn_conf <- confusionMatrix(as.factor(norm_test2$Y), knn_pred, mode = 'everything', positive = "1")
knn_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5381 2750
## 1 878 978
##
## Accuracy : 0.6367
## 95% CI : (0.6272, 0.6462)
## No Information Rate : 0.6267
## P-Value [Acc > NIR] : 0.01963
##
## Kappa : 0.1359
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.26234
## Specificity : 0.85972
## Pos Pred Value : 0.52694
## Neg Pred Value : 0.66179
## Precision : 0.52694
## Recall : 0.26234
## F1 : 0.35029
## Prevalence : 0.37329
## Detection Rate : 0.09793
## Detection Prevalence : 0.18584
## Balanced Accuracy : 0.56103
##
## 'Positive' Class : 1
##
acc_knn <- knn_conf$overall['Accuracy']
Ensemble Voting
# collect the five models' 0/1 votes; the factor predictions must go through
# as.character first (as.integer on a factor returns level codes 1/2, not 0/1)
pred <- data.frame(RF = rf_predict,
                   LOG = as.integer(as.character(log_predict)),
                   XGB = xgb_predict,
                   KNN = as.integer(as.character(knn_pred)),
                   CLS = clas_predict)
# simple majority vote: predict 1 when more than half of the models vote 1
pred$majority <- ifelse((pred$RF+pred$XGB+pred$LOG+pred$KNN+pred$CLS)/5 > 0.5, 1, 0)
ens_conf <- confusionMatrix(as.factor(test$Y), as.factor(pred$majority), mode = 'everything', positive = "1")
ens_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3678 4453
## 1 317 1539
##
## Accuracy : 0.5224
## 95% CI : (0.5125, 0.5322)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1514
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.2568
## Specificity : 0.9207
## Pos Pred Value : 0.8292
## Neg Pred Value : 0.4523
## Precision : 0.8292
## Recall : 0.2568
## F1 : 0.3922
## Prevalence : 0.6000
## Detection Rate : 0.1541
## Detection Prevalence : 0.1858
## Balanced Accuracy : 0.5887
##
## 'Positive' Class : 1
##
# accuracy-weighted vote: each model's 0/1 vote is weighted by its test-set accuracy
pred$rss <- ifelse((acc_rf*pred$RF + acc_xgb*pred$XGB + acc_log*pred$LOG +
                    acc_knn*pred$KNN + acc_clas*pred$CLS) /
                   (acc_rf + acc_xgb + acc_log + acc_knn + acc_clas) > 0.5, 1, 0)
ens_rss_conf <- confusionMatrix(as.factor(test$Y), as.factor(pred$rss), mode = 'everything', positive = "1")
ens_rss_conf
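With the per-model accuracies already stored, a small comparison table is straightforward (a convenience sketch using the acc_* objects above; the model labels are ours):
acc_summary <- data.frame(model = c("Logistic", "Tree", "RF", "XGBoost", "KNN"),
                          accuracy = c(acc_log, acc_clas, acc_rf, acc_xgb, acc_knn))
acc_summary[order(-acc_summary$accuracy), ]  # best model first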