Ensemble Regression Algorithms

Data

data <- readxl::read_excel('Tugas_STA581.xlsx')
data <- data[-1]

set.seed(1)
sample <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE, prob=c(0.8,0.2))
train  <- data[sample, ]
test   <- data[!sample, ]

lattice::barchart(as.factor(train$Y), col='maroon')

smote_train <- smotefamily::SMOTE(train[,-12], train$Y)
newtrain <- smote_train$data
lattice::barchart(newtrain$class, col = 'navy')

Regresi Logistik Biner

log_model <- glm(as.factor(class)~., data = newtrain, family = "binomial")
car::vif(log_model)

##       X1       X2       X3       X4       X5       X6       X7       X8 
## 1.710112 1.894378 2.237184 1.226638 1.690421 1.361770 1.374746 1.422844 
##       X9      X10      X11 
## 1.572714 3.458002 2.819418

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

train_ctrl <- trainControl(method = 'cv', number = 10)

set.seed(2022)
log_train <- newtrain
log_model <- train(as.factor(class)~., data = log_train, method = 'glm', family='binomial', trControl=train_ctrl, na.action=na.omit)
print(log_model$results)

##   parameter  Accuracy    Kappa  AccuracySD    KappaSD
## 1      none 0.6867082 0.370383 0.006477365 0.01306511

print(log_model$resample)

##     Accuracy     Kappa Resample
## 1  0.6829464 0.3630067   Fold01
## 2  0.6861489 0.3691356   Fold02
## 3  0.6849183 0.3663729   Fold03
## 4  0.6866293 0.3702622   Fold04
## 5  0.6786229 0.3540686   Fold05
## 6  0.6969260 0.3909726   Fold06
## 7  0.6789432 0.3545212   Fold07
## 8  0.6835869 0.3646011   Fold08
## 9  0.6962370 0.3896515   Fold09
## 10 0.6921230 0.3812373   Fold10

plot(x=1:10, y=log_model$resample$Accuracy, xlab='Fold', ylab='Akurasi', type='b'); points(x=which.max(log_model$resample$Accuracy),  y=max(log_model$resample$Accuracy), col='red', pch=20); axis(1, at = seq(1,10, by=1))

log_test <- test
log_test$Y <- as.factor(log_test$Y)
log_predict <- predict(log_model, newdata = test, type = 'raw')
log_conf <- confusionMatrix(log_test$Y, predict(log_model, newdata = test, type = 'raw'), positive = "1", mode = 'everything')
log_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5933 2198
##          1  672 1184
##                                           
##                Accuracy : 0.7126          
##                  95% CI : (0.7036, 0.7215)
##     No Information Rate : 0.6614          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2791          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3501          
##             Specificity : 0.8983          
##          Pos Pred Value : 0.6379          
##          Neg Pred Value : 0.7297          
##               Precision : 0.6379          
##                  Recall : 0.3501          
##                      F1 : 0.4521          
##              Prevalence : 0.3386          
##          Detection Rate : 0.1186          
##    Detection Prevalence : 0.1858          
##       Balanced Accuracy : 0.6242          
##                                           
##        'Positive' Class : 1               
##

acc_log <- log_conf$overall['Accuracy']

Classification Tree

library(rpart)
library(rpart.plot)

clas_test <- test
clas_train <- newtrain

clas_tree <- rpart(data = clas_train, as.factor(class)~., control = rpart.control(cp = 0, minsplit = 5000),
                   method = 'class')
rpart.plot(clas_tree, extra = 'auto', box.palette="RdBu", shadow.col="gray", nn=TRUE)

clas_pred <- ifelse(predict(clas_tree, clas_test) >= 0.5, 1, 0)
clas_predict <- ifelse(clas_pred[,2] > 0.5, 1, 0)
clas_conf <- confusionMatrix(as.factor(clas_predict), as.factor(clas_test$Y))
clas_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5842  826
##          1 2289 1030
##                                           
##                Accuracy : 0.6881          
##                  95% CI : (0.6789, 0.6972)
##     No Information Rate : 0.8142          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2097          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7185          
##             Specificity : 0.5550          
##          Pos Pred Value : 0.8761          
##          Neg Pred Value : 0.3103          
##              Prevalence : 0.8142          
##          Detection Rate : 0.5850          
##    Detection Prevalence : 0.6677          
##       Balanced Accuracy : 0.6367          
##                                           
##        'Positive' Class : 0               
##

acc_clas <- clas_conf$overall['Accuracy']

RF

rf_train <- newtrain
rf_test <- test

set.seed(2022)
rf_model <- randomForest::randomForest(data=rf_train,
               as.factor(class)~.,
               ntree=500)
rf_pred <- predict(rf_model, rf_test, type="prob")
rf_predict <- ifelse(rf_pred[,2] > 0.5, 1, 0)
rf_conf <- confusionMatrix(as.factor(rf_predict), as.factor(rf_test$Y), positive = "1", mode='everything')
rf_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7159 1096
##          1  972  760
##                                           
##                Accuracy : 0.7929          
##                  95% CI : (0.7848, 0.8008)
##     No Information Rate : 0.8142          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.2976          
##                                           
##  Mcnemar's Test P-Value : 0.006835        
##                                           
##             Sensitivity : 0.4095          
##             Specificity : 0.8805          
##          Pos Pred Value : 0.4388          
##          Neg Pred Value : 0.8672          
##               Precision : 0.4388          
##                  Recall : 0.4095          
##                      F1 : 0.4236          
##              Prevalence : 0.1858          
##          Detection Rate : 0.0761          
##    Detection Prevalence : 0.1734          
##       Balanced Accuracy : 0.6450          
##                                           
##        'Positive' Class : 1               
##

acc_rf <- rf_conf$overall["Accuracy"]

XGBoost

xgb_train <- newtrain
xgb_test <- test

library(xgboost)
xgboost_train = xgb.DMatrix(data=as.matrix(xgb_train[,-12]), label=as.matrix(xgb_train[,12]))
xgboost_test = xgb.DMatrix(data=as.matrix(xgb_test[,-12]), label=as.matrix(xgb_test[,12]))

xgb_hyper <- expand.grid(
  max.depth = c(6, 7, 8, 9, 10), #Maximum depth of each tree
  optimal_trees = 0, 
  min_RMSE = 0,
  nrounds = c(200, 250))
nrow(xgb_hyper)

## [1] 10

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:xgboost':
## 
##     slice

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

xgb_hyper %>% 
  arrange(min_RMSE) %>%
  head(10)

##    max.depth optimal_trees min_RMSE nrounds
## 1          6             0        0     200
## 2          7             0        0     200
## 3          8             0        0     200
## 4          9             0        0     200
## 5         10             0        0     200
## 6          6             0        0     250
## 7          7             0        0     250
## 8          8             0        0     250
## 9          9             0        0     250
## 10        10             0        0     250

xgb_final <- xgboost(data = xgboost_train,                       
                 max.depth=6,                          
                 nrounds=200)

## [1]  train-rmse:0.466160 
## [2]  train-rmse:0.440813 
## [3]  train-rmse:0.425039 
## [4]  train-rmse:0.413789 
## [5]  train-rmse:0.402083 
## [6]  train-rmse:0.395837 
## [7]  train-rmse:0.382477 
## [8]  train-rmse:0.377698 
## [9]  train-rmse:0.369384 
## [10] train-rmse:0.365333 
## [11] train-rmse:0.363650 
## [12] train-rmse:0.360094 
## [13] train-rmse:0.354475 
## [14] train-rmse:0.350728 
## [15] train-rmse:0.344385 
## [16] train-rmse:0.341427 
## [17] train-rmse:0.339730 
## [18] train-rmse:0.338106 
## [19] train-rmse:0.336538 
## [20] train-rmse:0.332836 
## [21] train-rmse:0.328622 
## [22] train-rmse:0.325335 
## [23] train-rmse:0.322229 
## [24] train-rmse:0.320695 
## [25] train-rmse:0.319498 
## [26] train-rmse:0.317487 
## [27] train-rmse:0.315853 
## [28] train-rmse:0.315318 
## [29] train-rmse:0.312775 
## [30] train-rmse:0.311798 
## [31] train-rmse:0.310764 
## [32] train-rmse:0.310045 
## [33] train-rmse:0.309286 
## [34] train-rmse:0.309023 
## [35] train-rmse:0.308300 
## [36] train-rmse:0.307423 
## [37] train-rmse:0.306973 
## [38] train-rmse:0.305383 
## [39] train-rmse:0.304719 
## [40] train-rmse:0.303154 
## [41] train-rmse:0.302488 
## [42] train-rmse:0.301435 
## [43] train-rmse:0.300164 
## [44] train-rmse:0.299420 
## [45] train-rmse:0.298787 
## [46] train-rmse:0.298413 
## [47] train-rmse:0.297967 
## [48] train-rmse:0.297068 
## [49] train-rmse:0.296781 
## [50] train-rmse:0.296402 
## [51] train-rmse:0.295970 
## [52] train-rmse:0.295473 
## [53] train-rmse:0.294651 
## [54] train-rmse:0.294079 
## [55] train-rmse:0.293274 
## [56] train-rmse:0.292558 
## [57] train-rmse:0.292172 
## [58] train-rmse:0.291129 
## [59] train-rmse:0.290374 
## [60] train-rmse:0.290125 
## [61] train-rmse:0.289933 
## [62] train-rmse:0.289436 
## [63] train-rmse:0.289133 
## [64] train-rmse:0.288289 
## [65] train-rmse:0.287834 
## [66] train-rmse:0.287214 
## [67] train-rmse:0.286949 
## [68] train-rmse:0.286491 
## [69] train-rmse:0.286090 
## [70] train-rmse:0.285628 
## [71] train-rmse:0.285060 
## [72] train-rmse:0.284157 
## [73] train-rmse:0.283654 
## [74] train-rmse:0.283012 
## [75] train-rmse:0.282509 
## [76] train-rmse:0.281895 
## [77] train-rmse:0.281277 
## [78] train-rmse:0.280721 
## [79] train-rmse:0.280464 
## [80] train-rmse:0.280233 
## [81] train-rmse:0.279883 
## [82] train-rmse:0.279576 
## [83] train-rmse:0.279197 
## [84] train-rmse:0.278785 
## [85] train-rmse:0.278203 
## [86] train-rmse:0.277640 
## [87] train-rmse:0.277341 
## [88] train-rmse:0.276971 
## [89] train-rmse:0.276748 
## [90] train-rmse:0.276504 
## [91] train-rmse:0.275976 
## [92] train-rmse:0.275373 
## [93] train-rmse:0.274921 
## [94] train-rmse:0.274531 
## [95] train-rmse:0.274039 
## [96] train-rmse:0.273647 
## [97] train-rmse:0.273299 
## [98] train-rmse:0.272970 
## [99] train-rmse:0.272590 
## [100]    train-rmse:0.272361 
## [101]    train-rmse:0.272043 
## [102]    train-rmse:0.271697 
## [103]    train-rmse:0.271273 
## [104]    train-rmse:0.271123 
## [105]    train-rmse:0.270688 
## [106]    train-rmse:0.270328 
## [107]    train-rmse:0.270027 
## [108]    train-rmse:0.269741 
## [109]    train-rmse:0.269257 
## [110]    train-rmse:0.269015 
## [111]    train-rmse:0.268655 
## [112]    train-rmse:0.268312 
## [113]    train-rmse:0.268000 
## [114]    train-rmse:0.267382 
## [115]    train-rmse:0.267185 
## [116]    train-rmse:0.266418 
## [117]    train-rmse:0.266247 
## [118]    train-rmse:0.265929 
## [119]    train-rmse:0.265614 
## [120]    train-rmse:0.265362 
## [121]    train-rmse:0.265031 
## [122]    train-rmse:0.264554 
## [123]    train-rmse:0.264344 
## [124]    train-rmse:0.264062 
## [125]    train-rmse:0.263746 
## [126]    train-rmse:0.263492 
## [127]    train-rmse:0.262892 
## [128]    train-rmse:0.262617 
## [129]    train-rmse:0.262180 
## [130]    train-rmse:0.262010 
## [131]    train-rmse:0.261822 
## [132]    train-rmse:0.261571 
## [133]    train-rmse:0.261451 
## [134]    train-rmse:0.261228 
## [135]    train-rmse:0.261099 
## [136]    train-rmse:0.260702 
## [137]    train-rmse:0.260458 
## [138]    train-rmse:0.260160 
## [139]    train-rmse:0.259917 
## [140]    train-rmse:0.259597 
## [141]    train-rmse:0.259439 
## [142]    train-rmse:0.259070 
## [143]    train-rmse:0.258797 
## [144]    train-rmse:0.258288 
## [145]    train-rmse:0.257952 
## [146]    train-rmse:0.257655 
## [147]    train-rmse:0.257280 
## [148]    train-rmse:0.257079 
## [149]    train-rmse:0.256849 
## [150]    train-rmse:0.256768 
## [151]    train-rmse:0.256462 
## [152]    train-rmse:0.256087 
## [153]    train-rmse:0.255889 
## [154]    train-rmse:0.255789 
## [155]    train-rmse:0.255427 
## [156]    train-rmse:0.255207 
## [157]    train-rmse:0.254927 
## [158]    train-rmse:0.254651 
## [159]    train-rmse:0.254298 
## [160]    train-rmse:0.254123 
## [161]    train-rmse:0.254009 
## [162]    train-rmse:0.253751 
## [163]    train-rmse:0.253430 
## [164]    train-rmse:0.253193 
## [165]    train-rmse:0.252652 
## [166]    train-rmse:0.252388 
## [167]    train-rmse:0.252220 
## [168]    train-rmse:0.252067 
## [169]    train-rmse:0.251832 
## [170]    train-rmse:0.251731 
## [171]    train-rmse:0.251462 
## [172]    train-rmse:0.251089 
## [173]    train-rmse:0.250722 
## [174]    train-rmse:0.250539 
## [175]    train-rmse:0.250394 
## [176]    train-rmse:0.250051 
## [177]    train-rmse:0.249866 
## [178]    train-rmse:0.249800 
## [179]    train-rmse:0.249684 
## [180]    train-rmse:0.249471 
## [181]    train-rmse:0.249441 
## [182]    train-rmse:0.249152 
## [183]    train-rmse:0.249019 
## [184]    train-rmse:0.248796 
## [185]    train-rmse:0.248464 
## [186]    train-rmse:0.248157 
## [187]    train-rmse:0.247921 
## [188]    train-rmse:0.247583 
## [189]    train-rmse:0.247113 
## [190]    train-rmse:0.246983 
## [191]    train-rmse:0.246580 
## [192]    train-rmse:0.246403 
## [193]    train-rmse:0.246056 
## [194]    train-rmse:0.245817 
## [195]    train-rmse:0.245573 
## [196]    train-rmse:0.245259 
## [197]    train-rmse:0.244933 
## [198]    train-rmse:0.244813 
## [199]    train-rmse:0.244791 
## [200]    train-rmse:0.244601

xgb_pred <- predict(xgb_final, xgboost_test)
xgb_predict <- ifelse(xgb_pred > 0.5, 1, 0)
xgb_conf <- confusionMatrix(as.factor(xgb_test$Y), as.factor(xgb_predict), positive = "1", mode='everything')

xgb_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7699  432
##          1 1407  449
##                                           
##                Accuracy : 0.8159          
##                  95% CI : (0.8081, 0.8234)
##     No Information Rate : 0.9118          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2368          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.50965         
##             Specificity : 0.84549         
##          Pos Pred Value : 0.24192         
##          Neg Pred Value : 0.94687         
##               Precision : 0.24192         
##                  Recall : 0.50965         
##                      F1 : 0.32810         
##              Prevalence : 0.08821         
##          Detection Rate : 0.04496         
##    Detection Prevalence : 0.18584         
##       Balanced Accuracy : 0.67757         
##                                           
##        'Positive' Class : 1               
##

acc_xgb <- xgb_conf$overall['Accuracy']

KNN

knn_test <- test
knn_train <- newtrain

normalize <- function(x){
  return((x - min(x)) / (max(x) - min(x)))
}

norm_train <- as.data.frame(lapply(knn_train[,-12], normalize))
norm_test <- as.data.frame(lapply(knn_test[,-12], normalize))
y_train <- knn_train[,12]
y_test <- knn_test[,12]
norm_train2 <- cbind(norm_train, y_train)
norm_test2 <- cbind(norm_test, y_test)

knn_trctrl <- trainControl(method = "cv", number = 10)
knn_final <- train(as.factor(class)~., 
                   data = norm_train2,
                   method = "knn",
                   trControl = knn_trctrl,
                   tuneGrid = data.frame(k=1))
knn_final$results

##   k  Accuracy     Kappa  AccuracySD    KappaSD
## 1 1 0.7514452 0.5049802 0.006359864 0.01265753

knn_pred <- predict(knn_final, norm_test2)
knn_conf <- confusionMatrix(as.factor(norm_test2$Y), knn_pred, mode = 'everything', positive = "1")
knn_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5381 2750
##          1  878  978
##                                           
##                Accuracy : 0.6367          
##                  95% CI : (0.6272, 0.6462)
##     No Information Rate : 0.6267          
##     P-Value [Acc > NIR] : 0.01963         
##                                           
##                   Kappa : 0.1359          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.26234         
##             Specificity : 0.85972         
##          Pos Pred Value : 0.52694         
##          Neg Pred Value : 0.66179         
##               Precision : 0.52694         
##                  Recall : 0.26234         
##                      F1 : 0.35029         
##              Prevalence : 0.37329         
##          Detection Rate : 0.09793         
##    Detection Prevalence : 0.18584         
##       Balanced Accuracy : 0.56103         
##                                           
##        'Positive' Class : 1               
##

acc_knn <- knn_conf$overall['Accuracy']

Ensemble Mean

pred <- data.frame(RF = rf_predict,
                   LOG = as.integer(log_predict),
                   XGB = xgb_predict,
                   KNN = as.integer(knn_pred),
                   CLS = clas_pred[,2])
pred$majority <- ifelse((pred$RF+pred$XGB+pred$LOG+pred$KNN+pred$CLS)/5 > 0.5, 1, 0)
ens_conf <- confusionMatrix(as.factor(test$Y), as.factor(pred$majority), mode = 'everything', positive = "1")
ens_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3678 4453
##          1  317 1539
##                                           
##                Accuracy : 0.5224          
##                  95% CI : (0.5125, 0.5322)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1514          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.2568          
##             Specificity : 0.9207          
##          Pos Pred Value : 0.8292          
##          Neg Pred Value : 0.4523          
##               Precision : 0.8292          
##                  Recall : 0.2568          
##                      F1 : 0.3922          
##              Prevalence : 0.6000          
##          Detection Rate : 0.1541          
##    Detection Prevalence : 0.1858          
##       Balanced Accuracy : 0.5887          
##                                           
##        'Positive' Class : 1               
##

pred$rss <- ifelse((acc_rf*pred['RF']+acc_xgb*pred['XGB']+acc_log*pred['LOG']+acc_knn*pred['KNN']+acc_clas['CLS'])/
                     (acc_rf+acc_xgb+acc_log+acc_knn+acc_clas)> 0.5, 1, 0)
ens_conf <- confusionMatrix(as.factor(test$Y), as.factor(pred$majority), mode = 'everything', positive = "1")
ens_conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3678 4453
##          1  317 1539
##                                           
##                Accuracy : 0.5224          
##                  95% CI : (0.5125, 0.5322)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1514          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.2568          
##             Specificity : 0.9207          
##          Pos Pred Value : 0.8292          
##          Neg Pred Value : 0.4523          
##               Precision : 0.8292          
##                  Recall : 0.2568          
##                      F1 : 0.3922          
##              Prevalence : 0.6000          
##          Detection Rate : 0.1541          
##    Detection Prevalence : 0.1858          
##       Balanced Accuracy : 0.5887          
##                                           
##        'Positive' Class : 1               
##

Ensemble Regression Algorithms

2023-02-16

Data

Regresi Logistik Biner

Classification Tree

RF

XGBoost

KNN

Ensemble Mean