LOAD THE NECESSARY LIBRARIES

# For manipulating the datasets
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(readxl)

# For plotting correlation matrix
library(ggcorrplot)
## Loading required package: ggplot2
# Machine Learning library
library(caret)
## Loading required package: lattice
library(catboost)

# For Multi-core processing support
library(parallel)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators

START GENERAL TIME

# Reference timestamp taken before any work starts.
# NOTE(review): presumably subtracted from proc.time() at the end of the
# script to report the total run time - that code is not visible here.
# NOTE(review): the name `t` is also the name of base::t() (transpose); calls
# to t() still resolve to the function, but a more descriptive name would be
# clearer.
t <- proc.time()

OPEN THE CLUSTER

# Start two PSOCK worker processes and register them as the foreach backend.
cl <- parallel::makePSOCKcluster(2)
doParallel::registerDoParallel(cl = cl)

GET THE DATA

Load the datasets

# Numerical dataset (Excel workbook)
dataset_num <- readxl::read_excel(path = "rice.xlsx")

# Categorical dataset (CSV file)
dataset_cat <- utils::read.csv(file = "mushrooms.csv")

# Mixed dataset (Excel workbook)
dataset_mix <- readxl::read_excel(path = "bank.xlsx")

CLEAN, PREPARE & MANIPULATE THE DATA

# Drop VEIL.TYPE (it only has one value), STALK.ROOT (it has missing values)
# and a set of other important attributes.
# NOTE(review): presumably the informative attributes are removed on purpose
# to make the classification task harder - confirm with the author.
dataset_cat <- select(
  dataset_cat,
  -VEIL.TYPE, -STALK.ROOT, -ODOR, -SPORE.PRINT.COLOR, -GILL.COLOR,
  -GILL.SIZE, -HABITAT, -POPULATION, -STALK.SURFACE.ABOVE.RING, -CAP.COLOR,
  -RING.TYPE, -STALK.SURFACE.BELOW.RING
)

Convert all character columns to factors

# Numerical dataset: only the outcome column is converted.
dataset_num[["CLASS"]] <- as.factor(dataset_num[["CLASS"]])

# Categorical and mixed datasets: every character column becomes a factor.
dataset_cat <- dataset_cat %>% mutate_if(is.character, as.factor)
dataset_mix <- dataset_mix %>% mutate_if(is.character, as.factor)

DEFINE FUNCTIONS

#CATBOOST
# Tune and fit a CatBoost classifier through caret.
#
# Uses 5-fold cross-validation repeated 2 times. No tuning grid is supplied,
# so caret's default grid for the CatBoost method is used.
#
# Arguments:
#   data_train  data frame or tibble with the predictors and the outcome
#               column CLASS.
#   seed        optional single number. When given, set.seed(seed) is called
#               right before train(), which makes the resampling folds
#               reproducible. Use the same seed for every model that will be
#               compared with resamples(), so they share identical folds.
#               NULL (the default) leaves the RNG state untouched.
#
# Returns the fitted caret `train` object.
train_cb_model <- function(data_train, seed = NULL) {
  fitControl <- trainControl(
    method = "repeatedcv",
    number = 5,
    repeats = 2,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )

  # Work on a plain data.frame: the CatBoost wrapper sets row names, which is
  # deprecated on tibbles and triggers a warning.
  data_train <- as.data.frame(data_train)

  # drop = FALSE keeps a data frame even when only one predictor is left.
  predictors <- data_train[, names(data_train) != "CLASS", drop = FALSE]

  if (!is.null(seed)) {
    set.seed(seed)
  }

  train(
    x = predictors,
    y = data_train$CLASS,
    method = catboost.caret,
    trControl = fitControl
  )
}
#RANDOM FOREST
# Tune and fit a random forest classifier through caret.
#
# Uses 5-fold cross-validation repeated 2 times. No tuning grid is supplied,
# so caret's default grid for method "rf" is used. The model is specified as
# CLASS ~ ., i.e. every other column of data_train is a predictor.
#
# Arguments:
#   data_train  data frame or tibble with the predictors and the outcome
#               column CLASS.
#   seed        optional single number. When given, set.seed(seed) is called
#               right before train(), which makes the resampling folds
#               reproducible. Use the same seed for every model that will be
#               compared with resamples(), so they share identical folds.
#               NULL (the default) leaves the RNG state untouched.
#
# Returns the fitted caret `train` object.
train_rf_model <- function(data_train, seed = NULL) {
  fitControl <- trainControl(
    method = "repeatedcv",
    number = 5,
    repeats = 2,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )

  if (!is.null(seed)) {
    set.seed(seed)
  }

  train(
    CLASS ~ .,
    data = data_train,
    method = "rf",
    trControl = fitControl
  )
}
#Predictions
# Score a fitted model on a hold-out set.
#
# Arguments:
#   model      fitted model object understood by predict().
#   data_test  data frame with the predictors and the true labels in CLASS.
#
# Returns the result of confusionMatrix(predicted, reference), where the
# reference labels are data_test$CLASS coerced to a factor.
predict_results <- function(model, data_test) {
  predicted <- predict(model, data_test)
  reference <- as.factor(data_test$CLASS)
  confusionMatrix(predicted, reference)
}

EXECUTE CATBOOST AND RANDOMFOREST IN EACH DATASET

NUMERICAL DATASET

dataset_num

Split into train and test sets

# createDataPartition(list = FALSE) returns a one-column matrix. dataset_num
# is a tibble, and tibble >= 3.0.0 deprecates a matrix as row index, so the
# indices are flattened to a plain vector before subsetting.
trainIndex <- as.vector(
  createDataPartition(dataset_num$CLASS, p = 0.80, list = FALSE)
)
data_train_num <- dataset_num[trainIndex, ]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
data_test_num <-  dataset_num[-trainIndex,]

Train catboost model

#Start time
t1 <- proc.time()

# Fix the RNG state right before training so the cross-validation folds are
# reproducible. Models that are compared with resamples() must be trained on
# identical folds, so the same seed should precede every train call on this
# dataset.
set.seed(123)
catboost_model_num <- train_cb_model(data_train_num)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.0498, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## Warning: Setting row names on a tibble is deprecated.
## 0:   learn: 0.6568135    total: 141ms    remaining: 13.9s
## 1:   learn: 0.6236078    total: 144ms    remaining: 7.05s
## 2:   learn: 0.5936251    total: 147ms    remaining: 4.75s
## 3:   learn: 0.5662970    total: 150ms    remaining: 3.6s
## 4:   learn: 0.5405178    total: 153ms    remaining: 2.9s
## 5:   learn: 0.5171789    total: 156ms    remaining: 2.45s
## 6:   learn: 0.4956609    total: 160ms    remaining: 2.12s
## 7:   learn: 0.4761940    total: 163ms    remaining: 1.87s
## 8:   learn: 0.4576017    total: 166ms    remaining: 1.68s
## 9:   learn: 0.4406181    total: 169ms    remaining: 1.52s
## 10:  learn: 0.4247206    total: 172ms    remaining: 1.39s
## 11:  learn: 0.4099156    total: 175ms    remaining: 1.28s
## 12:  learn: 0.3961075    total: 178ms    remaining: 1.19s
## 13:  learn: 0.3832902    total: 181ms    remaining: 1.11s
## 14:  learn: 0.3716732    total: 185ms    remaining: 1.05s
## 15:  learn: 0.3606363    total: 188ms    remaining: 987ms
## 16:  learn: 0.3501943    total: 191ms    remaining: 934ms
## 17:  learn: 0.3404634    total: 194ms    remaining: 885ms
## 18:  learn: 0.3314146    total: 197ms    remaining: 842ms
## 19:  learn: 0.3229360    total: 200ms    remaining: 801ms
## 20:  learn: 0.3150070    total: 203ms    remaining: 764ms
## 21:  learn: 0.3072563    total: 206ms    remaining: 731ms
## 22:  learn: 0.3000933    total: 209ms    remaining: 701ms
## 23:  learn: 0.2932783    total: 212ms    remaining: 672ms
## 24:  learn: 0.2869615    total: 215ms    remaining: 646ms
## 25:  learn: 0.2812050    total: 219ms    remaining: 624ms
## 26:  learn: 0.2755738    total: 222ms    remaining: 602ms
## 27:  learn: 0.2704136    total: 226ms    remaining: 582ms
## 28:  learn: 0.2655610    total: 229ms    remaining: 561ms
## 29:  learn: 0.2608680    total: 232ms    remaining: 541ms
## 30:  learn: 0.2563472    total: 235ms    remaining: 523ms
## 31:  learn: 0.2523727    total: 238ms    remaining: 506ms
## 32:  learn: 0.2485799    total: 242ms    remaining: 490ms
## 33:  learn: 0.2447511    total: 245ms    remaining: 475ms
## 34:  learn: 0.2414468    total: 248ms    remaining: 460ms
## 35:  learn: 0.2383678    total: 251ms    remaining: 446ms
## 36:  learn: 0.2353650    total: 254ms    remaining: 432ms
## 37:  learn: 0.2324733    total: 257ms    remaining: 420ms
## 38:  learn: 0.2293391    total: 260ms    remaining: 407ms
## 39:  learn: 0.2267338    total: 264ms    remaining: 395ms
## 40:  learn: 0.2240232    total: 267ms    remaining: 384ms
## 41:  learn: 0.2215597    total: 270ms    remaining: 372ms
## 42:  learn: 0.2189280    total: 273ms    remaining: 362ms
## 43:  learn: 0.2167430    total: 277ms    remaining: 352ms
## 44:  learn: 0.2145314    total: 280ms    remaining: 342ms
## 45:  learn: 0.2125653    total: 283ms    remaining: 332ms
## 46:  learn: 0.2106722    total: 286ms    remaining: 323ms
## 47:  learn: 0.2087211    total: 291ms    remaining: 315ms
## 48:  learn: 0.2073332    total: 294ms    remaining: 306ms
## 49:  learn: 0.2057595    total: 297ms    remaining: 297ms
## 50:  learn: 0.2040379    total: 300ms    remaining: 289ms
## 51:  learn: 0.2023699    total: 304ms    remaining: 280ms
## 52:  learn: 0.2010369    total: 307ms    remaining: 272ms
## 53:  learn: 0.1996971    total: 310ms    remaining: 264ms
## 54:  learn: 0.1984580    total: 313ms    remaining: 256ms
## 55:  learn: 0.1974165    total: 316ms    remaining: 249ms
## 56:  learn: 0.1960377    total: 320ms    remaining: 241ms
## 57:  learn: 0.1947740    total: 323ms    remaining: 234ms
## 58:  learn: 0.1938288    total: 326ms    remaining: 227ms
## 59:  learn: 0.1926876    total: 329ms    remaining: 219ms
## 60:  learn: 0.1916285    total: 332ms    remaining: 212ms
## 61:  learn: 0.1905130    total: 335ms    remaining: 205ms
## 62:  learn: 0.1897046    total: 338ms    remaining: 199ms
## 63:  learn: 0.1891968    total: 341ms    remaining: 192ms
## 64:  learn: 0.1882166    total: 344ms    remaining: 185ms
## 65:  learn: 0.1873404    total: 347ms    remaining: 179ms
## 66:  learn: 0.1865113    total: 350ms    remaining: 172ms
## 67:  learn: 0.1857105    total: 353ms    remaining: 166ms
## 68:  learn: 0.1847751    total: 356ms    remaining: 160ms
## 69:  learn: 0.1838037    total: 359ms    remaining: 154ms
## 70:  learn: 0.1831553    total: 363ms    remaining: 148ms
## 71:  learn: 0.1824054    total: 367ms    remaining: 143ms
## 72:  learn: 0.1814054    total: 370ms    remaining: 137ms
## 73:  learn: 0.1805700    total: 374ms    remaining: 131ms
## 74:  learn: 0.1797765    total: 377ms    remaining: 126ms
## 75:  learn: 0.1789485    total: 380ms    remaining: 120ms
## 76:  learn: 0.1784491    total: 383ms    remaining: 114ms
## 77:  learn: 0.1779122    total: 386ms    remaining: 109ms
## 78:  learn: 0.1770016    total: 390ms    remaining: 104ms
## 79:  learn: 0.1764126    total: 393ms    remaining: 98.2ms
## 80:  learn: 0.1758560    total: 396ms    remaining: 93ms
## 81:  learn: 0.1750834    total: 399ms    remaining: 87.7ms
## 82:  learn: 0.1745138    total: 403ms    remaining: 82.5ms
## 83:  learn: 0.1741124    total: 406ms    remaining: 77.4ms
## 84:  learn: 0.1736901    total: 409ms    remaining: 72.2ms
## 85:  learn: 0.1732599    total: 412ms    remaining: 67.1ms
## 86:  learn: 0.1728302    total: 415ms    remaining: 62.1ms
## 87:  learn: 0.1724763    total: 419ms    remaining: 57.1ms
## 88:  learn: 0.1719654    total: 422ms    remaining: 52.1ms
## 89:  learn: 0.1714901    total: 425ms    remaining: 47.2ms
## 90:  learn: 0.1708885    total: 428ms    remaining: 42.3ms
## 91:  learn: 0.1704591    total: 431ms    remaining: 37.4ms
## 92:  learn: 0.1700912    total: 435ms    remaining: 32.7ms
## 93:  learn: 0.1689023    total: 439ms    remaining: 28ms
## 94:  learn: 0.1681292    total: 442ms    remaining: 23.3ms
## 95:  learn: 0.1675693    total: 446ms    remaining: 18.6ms
## 96:  learn: 0.1671149    total: 449ms    remaining: 13.9ms
## 97:  learn: 0.1667215    total: 453ms    remaining: 9.24ms
## 98:  learn: 0.1661118    total: 456ms    remaining: 4.61ms
## 99:  learn: 0.1654008    total: 459ms    remaining: 0us
catboost_model_num
## Catboost 
## 
## 3048 samples
##    7 predictor
##    2 classes: 'Cammeo', 'Osmancik' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2438, 2439, 2438, 2438, 2439, 2439, ... 
## Resampling results across tuning parameters:
## 
##   depth  learning_rate  Accuracy   Kappa    
##   2      0.04978707     0.9247075  0.8458708
##   2      0.13533528     0.9222469  0.8410028
##   2      0.36787944     0.9197895  0.8358922
##   2      1.00000000     0.9114235  0.8188794
##   4      0.04978707     0.9250359  0.8464785
##   4      0.13533528     0.9215917  0.8395079
##   4      0.36787944     0.9142133  0.8244731
##   4      1.00000000     0.8955135  0.7865344
##   6      0.04978707     0.9270050  0.8505720
##   6      0.13533528     0.9202794  0.8366790
##   6      0.36787944     0.9086344  0.8131553
##   6      1.00000000     0.9005979  0.7970720
## 
## Tuning parameter 'iterations' was held constant at a value of 100
## 
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
##  parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
##  0.04978707, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and
##  border_count = 255.
#Stop time
proc.time()-t1
##    user  system elapsed 
##    1.80    0.12   45.92

Make predictions

catboost_pred_num <- predict_results(catboost_model_num,data_test_num)
catboost_pred_num
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cammeo Osmancik
##   Cammeo      301       23
##   Osmancik     25      413
##                                           
##                Accuracy : 0.937           
##                  95% CI : (0.9173, 0.9532)
##     No Information Rate : 0.5722          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8712          
##                                           
##  Mcnemar's Test P-Value : 0.8852          
##                                           
##             Sensitivity : 0.9233          
##             Specificity : 0.9472          
##          Pos Pred Value : 0.9290          
##          Neg Pred Value : 0.9429          
##              Prevalence : 0.4278          
##          Detection Rate : 0.3950          
##    Detection Prevalence : 0.4252          
##       Balanced Accuracy : 0.9353          
##                                           
##        'Positive' Class : Cammeo          
## 

Train random forest model

#Start time
t1 <- proc.time()

# Fix the RNG state right before training so the cross-validation folds are
# reproducible. Models that are compared with resamples() must be trained on
# identical folds, so the same seed should precede every train call on this
# dataset.
set.seed(123)
rf_model_num <- train_rf_model(data_train_num)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
rf_model_num
## Random Forest 
## 
## 3048 samples
##    7 predictor
##    2 classes: 'Cammeo', 'Osmancik' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2438, 2439, 2438, 2439, 2438, 2438, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9227349  0.8417241
##   4     0.9212590  0.8387185
##   7     0.9212571  0.8386529
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#Stop time
proc.time()-t1
##    user  system elapsed 
##    1.86    0.07   18.80

Make predictions

rf_pred_num <- predict_results(rf_model_num,data_test_num)
rf_pred_num
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cammeo Osmancik
##   Cammeo      298       25
##   Osmancik     28      411
##                                         
##                Accuracy : 0.9304        
##                  95% CI : (0.91, 0.9475)
##     No Information Rate : 0.5722        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.8578        
##                                         
##  Mcnemar's Test P-Value : 0.7835        
##                                         
##             Sensitivity : 0.9141        
##             Specificity : 0.9427        
##          Pos Pred Value : 0.9226        
##          Neg Pred Value : 0.9362        
##              Prevalence : 0.4278        
##          Detection Rate : 0.3911        
##    Detection Prevalence : 0.4239        
##       Balanced Accuracy : 0.9284        
##                                         
##        'Positive' Class : Cammeo        
## 

Compare models

resamps_num <- resamples(list(cb_num=catboost_model_num,rf_num=rf_model_num))
resamps_num
## 
## Call:
## resamples.default(x = list(cb_num = catboost_model_num, rf_num = rf_model_num))
## 
## Models: cb_num, rf_num 
## Number of resamples: 10 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
summary(resamps_num)
## 
## Call:
## summary.resamples(object = resamps_num)
## 
## Models: cb_num, rf_num 
## Number of resamples: 10 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_num 0.9098361 0.9212146 0.9269913 0.9270050 0.9306529 0.9490969    0
## rf_num 0.9113300 0.9184096 0.9228876 0.9227349 0.9270492 0.9327869    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_num 0.8143147 0.8389740 0.8505871 0.8505720 0.8579431 0.8960230    0
## rf_num 0.8166828 0.8330627 0.8420719 0.8417241 0.8510985 0.8626503    0
bwplot(resamps_num)

dotplot(resamps_num)

difValues_num <- diff(resamps_num)
difValues_num
## 
## Call:
## diff.resamples(x = resamps_num)
## 
## Models: cb_num, rf_num 
## Metrics: Accuracy, Kappa 
## Number of differences: 1 
## p-value adjustment: bonferroni
summary(difValues_num)
## 
## Call:
## summary.diff.resamples(object = difValues_num)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##        cb_num rf_num 
## cb_num        0.00427
## rf_num 0.2778        
## 
## Kappa 
##        cb_num rf_num  
## cb_num        0.008848
## rf_num 0.2822

CATEGORICAL DATASET

dataset_cat

Split into train and test sets

# Stratified 80/20 split on the outcome; rows selected by trainIndex form the
# training set, all remaining rows form the test set.
trainIndex <- createDataPartition(
  y = dataset_cat$CLASS,
  p = 0.80,
  list = FALSE
)
data_train_cat <- dataset_cat[trainIndex, ]
data_test_cat <- dataset_cat[-trainIndex, ]

Train catboost model

#Start time
t1 <- proc.time()

# Fix the RNG state right before training so the cross-validation folds are
# reproducible. Models that are compared with resamples() must be trained on
# identical folds, so the same seed should precede every train call on this
# dataset.
set.seed(123)
catboost_model_cat <- train_cb_model(data_train_cat)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## 0:   learn: 0.6122370    total: 6.31ms   remaining: 625ms
## 1:   learn: 0.5507690    total: 12.4ms   remaining: 607ms
## 2:   learn: 0.5029202    total: 20.4ms   remaining: 661ms
## 3:   learn: 0.4666108    total: 23.4ms   remaining: 561ms
## 4:   learn: 0.4360618    total: 26.9ms   remaining: 510ms
## 5:   learn: 0.4129783    total: 30.2ms   remaining: 474ms
## 6:   learn: 0.3942320    total: 33.9ms   remaining: 451ms
## 7:   learn: 0.3678110    total: 38ms remaining: 437ms
## 8:   learn: 0.3363034    total: 42ms remaining: 424ms
## 9:   learn: 0.3122952    total: 46.2ms   remaining: 416ms
## 10:  learn: 0.2932804    total: 50.5ms   remaining: 409ms
## 11:  learn: 0.2809059    total: 55.1ms   remaining: 404ms
## 12:  learn: 0.2695079    total: 59.5ms   remaining: 398ms
## 13:  learn: 0.2563822    total: 63.9ms   remaining: 393ms
## 14:  learn: 0.2480827    total: 68.9ms   remaining: 390ms
## 15:  learn: 0.2348083    total: 74.4ms   remaining: 390ms
## 16:  learn: 0.2142045    total: 79.7ms   remaining: 389ms
## 17:  learn: 0.2014268    total: 83.9ms   remaining: 382ms
## 18:  learn: 0.1926262    total: 88.5ms   remaining: 377ms
## 19:  learn: 0.1808947    total: 92.7ms   remaining: 371ms
## 20:  learn: 0.1702964    total: 97.4ms   remaining: 367ms
## 21:  learn: 0.1634833    total: 101ms    remaining: 360ms
## 22:  learn: 0.1574146    total: 106ms    remaining: 355ms
## 23:  learn: 0.1533933    total: 110ms    remaining: 350ms
## 24:  learn: 0.1494566    total: 114ms    remaining: 343ms
## 25:  learn: 0.1467275    total: 118ms    remaining: 337ms
## 26:  learn: 0.1434511    total: 122ms    remaining: 331ms
## 27:  learn: 0.1365177    total: 127ms    remaining: 326ms
## 28:  learn: 0.1350156    total: 131ms    remaining: 320ms
## 29:  learn: 0.1322677    total: 135ms    remaining: 316ms
## 30:  learn: 0.1296128    total: 139ms    remaining: 310ms
## 31:  learn: 0.1251714    total: 145ms    remaining: 309ms
## 32:  learn: 0.1225821    total: 149ms    remaining: 303ms
## 33:  learn: 0.1211077    total: 153ms    remaining: 297ms
## 34:  learn: 0.1191154    total: 158ms    remaining: 293ms
## 35:  learn: 0.1160553    total: 163ms    remaining: 290ms
## 36:  learn: 0.1123520    total: 168ms    remaining: 286ms
## 37:  learn: 0.1094478    total: 172ms    remaining: 281ms
## 38:  learn: 0.1081211    total: 178ms    remaining: 279ms
## 39:  learn: 0.1070587    total: 183ms    remaining: 274ms
## 40:  learn: 0.1030064    total: 187ms    remaining: 269ms
## 41:  learn: 0.1010134    total: 192ms    remaining: 265ms
## 42:  learn: 0.0987311    total: 197ms    remaining: 262ms
## 43:  learn: 0.0983617    total: 202ms    remaining: 257ms
## 44:  learn: 0.0978947    total: 206ms    remaining: 252ms
## 45:  learn: 0.0967254    total: 212ms    remaining: 249ms
## 46:  learn: 0.0954702    total: 219ms    remaining: 247ms
## 47:  learn: 0.0938459    total: 224ms    remaining: 242ms
## 48:  learn: 0.0935742    total: 232ms    remaining: 241ms
## 49:  learn: 0.0921233    total: 236ms    remaining: 236ms
## 50:  learn: 0.0898602    total: 240ms    remaining: 231ms
## 51:  learn: 0.0894541    total: 246ms    remaining: 227ms
## 52:  learn: 0.0888838    total: 250ms    remaining: 222ms
## 53:  learn: 0.0873815    total: 254ms    remaining: 217ms
## 54:  learn: 0.0867422    total: 262ms    remaining: 214ms
## 55:  learn: 0.0862856    total: 266ms    remaining: 209ms
## 56:  learn: 0.0859953    total: 270ms    remaining: 204ms
## 57:  learn: 0.0857699    total: 278ms    remaining: 201ms
## 58:  learn: 0.0856337    total: 286ms    remaining: 199ms
## 59:  learn: 0.0845285    total: 294ms    remaining: 196ms
## 60:  learn: 0.0840471    total: 299ms    remaining: 191ms
## 61:  learn: 0.0823477    total: 303ms    remaining: 186ms
## 62:  learn: 0.0802434    total: 311ms    remaining: 183ms
## 63:  learn: 0.0795666    total: 316ms    remaining: 178ms
## 64:  learn: 0.0792756    total: 320ms    remaining: 172ms
## 65:  learn: 0.0785699    total: 328ms    remaining: 169ms
## 66:  learn: 0.0783523    total: 332ms    remaining: 164ms
## 67:  learn: 0.0774731    total: 337ms    remaining: 159ms
## 68:  learn: 0.0771809    total: 345ms    remaining: 155ms
## 69:  learn: 0.0762274    total: 349ms    remaining: 150ms
## 70:  learn: 0.0751240    total: 353ms    remaining: 144ms
## 71:  learn: 0.0749683    total: 361ms    remaining: 141ms
## 72:  learn: 0.0747531    total: 367ms    remaining: 136ms
## 73:  learn: 0.0746343    total: 375ms    remaining: 132ms
## 74:  learn: 0.0745693    total: 379ms    remaining: 126ms
## 75:  learn: 0.0743716    total: 385ms    remaining: 121ms
## 76:  learn: 0.0732670    total: 392ms    remaining: 117ms
## 77:  learn: 0.0732560    total: 395ms    remaining: 111ms
## 78:  learn: 0.0731248    total: 398ms    remaining: 106ms
## 79:  learn: 0.0726237    total: 402ms    remaining: 101ms
## 80:  learn: 0.0720270    total: 411ms    remaining: 96.3ms
## 81:  learn: 0.0719702    total: 416ms    remaining: 91.3ms
## 82:  learn: 0.0716857    total: 422ms    remaining: 86.4ms
## 83:  learn: 0.0715268    total: 426ms    remaining: 81.2ms
## 84:  learn: 0.0712087    total: 430ms    remaining: 76ms
## 85:  learn: 0.0709254    total: 434ms    remaining: 70.7ms
## 86:  learn: 0.0706008    total: 441ms    remaining: 66ms
## 87:  learn: 0.0704501    total: 446ms    remaining: 60.9ms
## 88:  learn: 0.0703976    total: 451ms    remaining: 55.7ms
## 89:  learn: 0.0699715    total: 458ms    remaining: 50.9ms
## 90:  learn: 0.0695446    total: 462ms    remaining: 45.7ms
## 91:  learn: 0.0692119    total: 466ms    remaining: 40.5ms
## 92:  learn: 0.0689237    total: 474ms    remaining: 35.6ms
## 93:  learn: 0.0688865    total: 477ms    remaining: 30.5ms
## 94:  learn: 0.0681742    total: 481ms    remaining: 25.3ms
## 95:  learn: 0.0680095    total: 489ms    remaining: 20.4ms
## 96:  learn: 0.0679535    total: 493ms    remaining: 15.2ms
## 97:  learn: 0.0678766    total: 498ms    remaining: 10.2ms
## 98:  learn: 0.0676545    total: 503ms    remaining: 5.08ms
## 99:  learn: 0.0675518    total: 508ms    remaining: 0us
catboost_model_cat
## Catboost 
## 
## 6500 samples
##   10 predictor
##    2 classes: 'e', 'p' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 5199, 5201, 5200, 5201, 5199, 5201, ... 
## Resampling results across tuning parameters:
## 
##   depth  learning_rate  Accuracy   Kappa    
##   2      0.04978707     0.8938478  0.7863302
##   2      0.13533528     0.9303078  0.8603756
##   2      0.36787944     0.9397695  0.8793362
##   2      1.00000000     0.9564634  0.9127782
##   4      0.04978707     0.9394627  0.8785638
##   4      0.13533528     0.9608461  0.9215197
##   4      0.36787944     0.9673072  0.9344566
##   4      1.00000000     0.9477065  0.8947271
##   6      0.04978707     0.9656155  0.9310161
##   6      0.13533528     0.9674611  0.9347500
##   6      0.36787944     0.9666924  0.9332041
##   6      1.00000000     0.9663844  0.9325722
## 
## Tuning parameter 'iterations' was held constant at a value of 100
## 
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
##  parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
##  0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
##  = 255.
#Stop time
proc.time()-t1
##    user  system elapsed 
##    2.89    0.28   52.45

Make predictions

catboost_pred_cat <- predict_results(catboost_model_cat,data_test_cat)
catboost_pred_cat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   e   p
##          e 838  46
##          p   3 737
##                                           
##                Accuracy : 0.9698          
##                  95% CI : (0.9603, 0.9776)
##     No Information Rate : 0.5179          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9395          
##                                           
##  Mcnemar's Test P-Value : 1.973e-09       
##                                           
##             Sensitivity : 0.9964          
##             Specificity : 0.9413          
##          Pos Pred Value : 0.9480          
##          Neg Pred Value : 0.9959          
##              Prevalence : 0.5179          
##          Detection Rate : 0.5160          
##    Detection Prevalence : 0.5443          
##       Balanced Accuracy : 0.9688          
##                                           
##        'Positive' Class : e               
## 

Train random forest model

#Start time
t1 <- proc.time()

# Fix the RNG state right before training so the cross-validation folds are
# reproducible. Models that are compared with resamples() must be trained on
# identical folds, so the same seed should precede every train call on this
# dataset.
set.seed(123)
rf_model_cat <- train_rf_model(data_train_cat)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 17 on full training set
rf_model_cat
## Random Forest 
## 
## 6500 samples
##   10 predictor
##    2 classes: 'e', 'p' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 5201, 5200, 5199, 5200, 5200, 5200, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8475368  0.6923302
##   17    0.9682315  0.9363089
##   33    0.9678468  0.9355343
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.
#Stop time
proc.time()-t1
##    user  system elapsed 
##    6.71    0.06   91.89

Make predictions

rf_pred_cat <- predict_results(rf_model_cat,data_test_cat)
rf_pred_cat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   e   p
##          e 831  42
##          p  10 741
##                                          
##                Accuracy : 0.968          
##                  95% CI : (0.9582, 0.976)
##     No Information Rate : 0.5179         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9358         
##                                          
##  Mcnemar's Test P-Value : 1.716e-05      
##                                          
##             Sensitivity : 0.9881         
##             Specificity : 0.9464         
##          Pos Pred Value : 0.9519         
##          Neg Pred Value : 0.9867         
##              Prevalence : 0.5179         
##          Detection Rate : 0.5117         
##    Detection Prevalence : 0.5376         
##       Balanced Accuracy : 0.9672         
##                                          
##        'Positive' Class : e              
## 

Compare models

resamps_cat <- resamples(list(cb_cat=catboost_model_cat,rf_cat=rf_model_cat))
resamps_cat
## 
## Call:
## resamples.default(x = list(cb_cat = catboost_model_cat, rf_cat = rf_model_cat))
## 
## Models: cb_cat, rf_cat 
## Number of resamples: 10 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
summary(resamps_cat)
## 
## Call:
## summary.resamples(object = resamps_cat)
## 
## Models: cb_cat, rf_cat 
## Number of resamples: 10 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_cat 0.9615089 0.9653913 0.9669358 0.9674611 0.9700106 0.9745958    0
## rf_cat 0.9638462 0.9661604 0.9676935 0.9682315 0.9711538 0.9730562    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_cat 0.9227876 0.9305715 0.9337030 0.9347500 0.9398612 0.9490711    0
## rf_cat 0.9274657 0.9321458 0.9352378 0.9363089 0.9421932 0.9459724    0
bwplot(resamps_cat)

dotplot(resamps_cat)

difValues_cat <- diff(resamps_cat)
difValues_cat
## 
## Call:
## diff.resamples(x = resamps_cat)
## 
## Models: cb_cat, rf_cat 
## Metrics: Accuracy, Kappa 
## Number of differences: 1 
## p-value adjustment: bonferroni
summary(difValues_cat)
## 
## Call:
## summary.diff.resamples(object = difValues_cat)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##        cb_cat rf_cat    
## cb_cat        -0.0007704
## rf_cat 0.4833           
## 
## Kappa 
##        cb_cat rf_cat   
## cb_cat        -0.001559
## rf_cat 0.4802

MIXED DATASET

dataset_mix

Split into train and test sets

# createDataPartition(list = FALSE) returns a one-column matrix. dataset_mix
# is a tibble, and tibble >= 3.0.0 deprecates a matrix as row index, so the
# indices are flattened to a plain vector before subsetting.
trainIndex <- as.vector(
  createDataPartition(dataset_mix$CLASS, p = 0.80, list = FALSE)
)
data_train_mix <- dataset_mix[trainIndex, ]
data_test_mix <- dataset_mix[-trainIndex, ]

Train catboost model

# Start the timer for the CatBoost training step
t1 <- proc.time()

# Tune and fit CatBoost on the training split via train_cb_model()
# (repeated cross-validation: 5 folds, 2 repeats)
catboost_model_mix <- train_cb_model(data_train_mix)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 2, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## Warning: Setting row names on a tibble is deprecated.
## 0:   learn: 0.6121339    total: 5.64ms   remaining: 558ms
## 1:   learn: 0.5502777    total: 9.34ms   remaining: 458ms
## 2:   learn: 0.5021942    total: 12.9ms   remaining: 419ms
## 3:   learn: 0.4643175    total: 15.8ms   remaining: 378ms
## 4:   learn: 0.4356381    total: 17.5ms   remaining: 332ms
## 5:   learn: 0.4093016    total: 18.9ms   remaining: 296ms
## 6:   learn: 0.3837429    total: 22.9ms   remaining: 304ms
## 7:   learn: 0.3621549    total: 24.4ms   remaining: 281ms
## 8:   learn: 0.3450963    total: 25.9ms   remaining: 262ms
## 9:   learn: 0.3333567    total: 27.7ms   remaining: 249ms
## 10:  learn: 0.3242311    total: 29.2ms   remaining: 236ms
## 11:  learn: 0.3168739    total: 30.7ms   remaining: 225ms
## 12:  learn: 0.3081181    total: 32.2ms   remaining: 216ms
## 13:  learn: 0.2996895    total: 33.6ms   remaining: 207ms
## 14:  learn: 0.2928020    total: 35.2ms   remaining: 199ms
## 15:  learn: 0.2873438    total: 36.6ms   remaining: 192ms
## 16:  learn: 0.2820336    total: 38.3ms   remaining: 187ms
## 17:  learn: 0.2787684    total: 39.7ms   remaining: 181ms
## 18:  learn: 0.2722486    total: 41.3ms   remaining: 176ms
## 19:  learn: 0.2683284    total: 42.9ms   remaining: 172ms
## 20:  learn: 0.2644261    total: 45.1ms   remaining: 170ms
## 21:  learn: 0.2606796    total: 46.6ms   remaining: 165ms
## 22:  learn: 0.2574881    total: 48.1ms   remaining: 161ms
## 23:  learn: 0.2556970    total: 49.6ms   remaining: 157ms
## 24:  learn: 0.2541877    total: 51.1ms   remaining: 153ms
## 25:  learn: 0.2518247    total: 52.6ms   remaining: 150ms
## 26:  learn: 0.2503556    total: 54.2ms   remaining: 146ms
## 27:  learn: 0.2489015    total: 55.8ms   remaining: 143ms
## 28:  learn: 0.2477297    total: 57.2ms   remaining: 140ms
## 29:  learn: 0.2458457    total: 58.6ms   remaining: 137ms
## 30:  learn: 0.2448694    total: 61.4ms   remaining: 137ms
## 31:  learn: 0.2436032    total: 63.1ms   remaining: 134ms
## 32:  learn: 0.2426952    total: 64.8ms   remaining: 131ms
## 33:  learn: 0.2415039    total: 66.3ms   remaining: 129ms
## 34:  learn: 0.2406009    total: 67.7ms   remaining: 126ms
## 35:  learn: 0.2394611    total: 69.1ms   remaining: 123ms
## 36:  learn: 0.2384822    total: 70.7ms   remaining: 120ms
## 37:  learn: 0.2371913    total: 72.2ms   remaining: 118ms
## 38:  learn: 0.2362425    total: 73.7ms   remaining: 115ms
## 39:  learn: 0.2354962    total: 75.2ms   remaining: 113ms
## 40:  learn: 0.2346559    total: 76.6ms   remaining: 110ms
## 41:  learn: 0.2339751    total: 78.1ms   remaining: 108ms
## 42:  learn: 0.2331142    total: 79.6ms   remaining: 106ms
## 43:  learn: 0.2328724    total: 81.1ms   remaining: 103ms
## 44:  learn: 0.2322204    total: 84ms remaining: 103ms
## 45:  learn: 0.2316547    total: 86ms remaining: 101ms
## 46:  learn: 0.2314605    total: 87.6ms   remaining: 98.8ms
## 47:  learn: 0.2312707    total: 89.1ms   remaining: 96.5ms
## 48:  learn: 0.2305370    total: 91.1ms   remaining: 94.8ms
## 49:  learn: 0.2300010    total: 92.6ms   remaining: 92.6ms
## 50:  learn: 0.2296235    total: 94.3ms   remaining: 90.6ms
## 51:  learn: 0.2291560    total: 95.9ms   remaining: 88.5ms
## 52:  learn: 0.2290524    total: 97.3ms   remaining: 86.3ms
## 53:  learn: 0.2286910    total: 98.9ms   remaining: 84.2ms
## 54:  learn: 0.2285615    total: 100ms    remaining: 82.1ms
## 55:  learn: 0.2279909    total: 102ms    remaining: 80ms
## 56:  learn: 0.2274678    total: 103ms    remaining: 77.9ms
## 57:  learn: 0.2273241    total: 105ms    remaining: 75.9ms
## 58:  learn: 0.2265899    total: 107ms    remaining: 74ms
## 59:  learn: 0.2261008    total: 108ms    remaining: 72.1ms
## 60:  learn: 0.2252369    total: 110ms    remaining: 70.1ms
## 61:  learn: 0.2250421    total: 111ms    remaining: 68.1ms
## 62:  learn: 0.2246596    total: 112ms    remaining: 66.1ms
## 63:  learn: 0.2241355    total: 114ms    remaining: 64ms
## 64:  learn: 0.2241081    total: 115ms    remaining: 62ms
## 65:  learn: 0.2238018    total: 117ms    remaining: 60.1ms
## 66:  learn: 0.2237307    total: 118ms    remaining: 58.1ms
## 67:  learn: 0.2233752    total: 119ms    remaining: 56.2ms
## 68:  learn: 0.2229184    total: 121ms    remaining: 54.3ms
## 69:  learn: 0.2227866    total: 122ms    remaining: 52.4ms
## 70:  learn: 0.2224133    total: 124ms    remaining: 50.6ms
## 71:  learn: 0.2218968    total: 125ms    remaining: 48.7ms
## 72:  learn: 0.2216509    total: 127ms    remaining: 46.9ms
## 73:  learn: 0.2213231    total: 128ms    remaining: 45.1ms
## 74:  learn: 0.2212330    total: 130ms    remaining: 43.2ms
## 75:  learn: 0.2208867    total: 131ms    remaining: 41.4ms
## 76:  learn: 0.2208864    total: 132ms    remaining: 39.6ms
## 77:  learn: 0.2205792    total: 134ms    remaining: 37.8ms
## 78:  learn: 0.2201641    total: 135ms    remaining: 36ms
## 79:  learn: 0.2198007    total: 137ms    remaining: 34.2ms
## 80:  learn: 0.2194934    total: 138ms    remaining: 32.4ms
## 81:  learn: 0.2189740    total: 139ms    remaining: 30.6ms
## 82:  learn: 0.2177490    total: 141ms    remaining: 28.9ms
## 83:  learn: 0.2176918    total: 142ms    remaining: 27.1ms
## 84:  learn: 0.2171174    total: 144ms    remaining: 25.4ms
## 85:  learn: 0.2167369    total: 145ms    remaining: 23.7ms
## 86:  learn: 0.2163940    total: 147ms    remaining: 22ms
## 87:  learn: 0.2162784    total: 148ms    remaining: 20.2ms
## 88:  learn: 0.2162118    total: 150ms    remaining: 18.5ms
## 89:  learn: 0.2159134    total: 151ms    remaining: 16.8ms
## 90:  learn: 0.2159058    total: 153ms    remaining: 15.1ms
## 91:  learn: 0.2157729    total: 154ms    remaining: 13.4ms
## 92:  learn: 0.2156463    total: 156ms    remaining: 11.7ms
## 93:  learn: 0.2151911    total: 157ms    remaining: 10ms
## 94:  learn: 0.2148647    total: 159ms    remaining: 8.35ms
## 95:  learn: 0.2146870    total: 160ms    remaining: 6.68ms
## 96:  learn: 0.2143327    total: 162ms    remaining: 5.01ms
## 97:  learn: 0.2139554    total: 163ms    remaining: 3.33ms
## 98:  learn: 0.2135875    total: 165ms    remaining: 1.66ms
## 99:  learn: 0.2134650    total: 166ms    remaining: 0us
# Show the cross-validated tuning results and the selected hyperparameters
catboost_model_mix
## Catboost 
## 
## 3617 samples
##   16 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2893, 2894, 2894, 2894, 2893, 2894, ... 
## Resampling results across tuning parameters:
## 
##   depth  learning_rate  Accuracy   Kappa    
##   2      0.04978707     0.8986767  0.3245610
##   2      0.13533528     0.9022705  0.3992917
##   2      0.36787944     0.9001954  0.4205602
##   2      1.00000000     0.8902419  0.3984283
##   4      0.04978707     0.9003351  0.3498960
##   4      0.13533528     0.9015782  0.4180483
##   4      0.36787944     0.8961876  0.4063029
##   4      1.00000000     0.8847128  0.3817551
##   6      0.04978707     0.9017182  0.3801920
##   6      0.13533528     0.8986742  0.4123413
##   6      0.36787944     0.8952194  0.4182872
##   6      1.00000000     0.8833335  0.3966184
## 
## Tuning parameter 'iterations' was held constant at a value of 100
## 
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
##  parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 2, learning_rate =
##  0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
##  = 255.
# Time elapsed since t1 (training plus printing the fitted model)
proc.time() - t1
##    user  system elapsed 
##    1.53    0.14   49.08

Make predictions

# Evaluate the CatBoost model on the held-out test split; the result prints
# as a confusion matrix with its summary statistics
catboost_pred_mix <- predict_results(catboost_model_mix, data_test_mix)
catboost_pred_mix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  767  72
##        yes  33  32
##                                          
##                Accuracy : 0.8838         
##                  95% CI : (0.8611, 0.904)
##     No Information Rate : 0.885          
##     P-Value [Acc > NIR] : 0.5672514      
##                                          
##                   Kappa : 0.3184         
##                                          
##  Mcnemar's Test P-Value : 0.0002086      
##                                          
##             Sensitivity : 0.9587         
##             Specificity : 0.3077         
##          Pos Pred Value : 0.9142         
##          Neg Pred Value : 0.4923         
##              Prevalence : 0.8850         
##          Detection Rate : 0.8485         
##    Detection Prevalence : 0.9281         
##       Balanced Accuracy : 0.6332         
##                                          
##        'Positive' Class : no             
## 

Train random forest model

# Start the timer for the random forest training step
t1 <- proc.time()

# Tune and fit the random forest on the same training split
# via train_rf_model()
rf_model_mix <- train_rf_model(data_train_mix)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 22 on full training set
# Show the cross-validated tuning results and the selected mtry
rf_model_mix
## Random Forest 
## 
## 3617 samples
##   16 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2893, 2894, 2894, 2894, 2893, 2894, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##    2    0.8884431  0.06988726
##   22    0.9028136  0.44577851
##   42    0.9008780  0.44446045
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 22.
# Time elapsed since t1 (training plus printing the fitted model)
proc.time() - t1
##    user  system elapsed 
##    7.48    0.03  108.19

Make predictions

# Evaluate the random forest on the held-out test split; the result prints
# as a confusion matrix with its summary statistics
rf_pred_mix <- predict_results(rf_model_mix, data_test_mix)
rf_pred_mix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  764  68
##        yes  36  36
##                                          
##                Accuracy : 0.885          
##                  95% CI : (0.8623, 0.905)
##     No Information Rate : 0.885          
##     P-Value [Acc > NIR] : 0.526096       
##                                          
##                   Kappa : 0.3477         
##                                          
##  Mcnemar's Test P-Value : 0.002367       
##                                          
##             Sensitivity : 0.9550         
##             Specificity : 0.3462         
##          Pos Pred Value : 0.9183         
##          Neg Pred Value : 0.5000         
##              Prevalence : 0.8850         
##          Detection Rate : 0.8451         
##    Detection Prevalence : 0.9204         
##       Balanced Accuracy : 0.6506         
##                                          
##        'Positive' Class : no             
## 

Compare models

# Gather the cross-validation resamples of both models trained on the
# mixed dataset so they can be compared on the same metrics
resamps_mix <- resamples(
  list(cb_mix = catboost_model_mix, rf_mix = rf_model_mix)
)
resamps_mix
## 
## Call:
## resamples.default(x = list(cb_mix = catboost_model_mix, rf_mix = rf_model_mix))
## 
## Models: cb_mix, rf_mix 
## Number of resamples: 10 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
# Per-model distribution (min, quartiles, mean, max) of each metric
# across the resamples
summary(resamps_mix)
## 
## Call:
## summary.resamples(object = resamps_mix)
## 
## Models: cb_mix, rf_mix 
## Number of resamples: 10 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_mix 0.8853591 0.8981005 0.9045643 0.9022705 0.9081167 0.9128631    0
## rf_mix 0.8852006 0.8924620 0.9046274 0.9028136 0.9143350 0.9171271    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_mix 0.3279478 0.3451060 0.4171445 0.3992917 0.4352372 0.4895194    0
## rf_mix 0.3499540 0.3697414 0.4483677 0.4457785 0.5213937 0.5387360    0
# Box-and-whisker plot of the resampled metrics for each model
bwplot(resamps_mix)

# Dot plot of the same resampled metrics
dotplot(resamps_mix)

# Differences between the two models on each metric.
# NOTE(review): diff() on resamples compares the models resample by resample;
# confirm both models were trained on identical folds (shared seed or a
# common trainControl index) so the pairing is meaningful.
difValues_mix <- diff(resamps_mix)
difValues_mix
## 
## Call:
## diff.resamples(x = resamps_mix)
## 
## Models: cb_mix, rf_mix 
## Metrics: Accuracy, Kappa 
## Number of differences: 1 
## p-value adjustment: bonferroni
# Estimated differences (upper diagonal) and p-values for
# H0: difference = 0 (lower diagonal)
summary(difValues_mix)
## 
## Call:
## summary.diff.resamples(object = difValues_mix)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##        cb_mix rf_mix    
## cb_mix        -0.0005431
## rf_mix 0.9147           
## 
## Kappa 
##        cb_mix rf_mix  
## cb_mix        -0.04649
## rf_mix 0.1781

CLOSE THE CLUSTER

# Shut down the worker processes started by makePSOCKcluster()
stopCluster(cl)
# Re-register the sequential backend so that any later foreach/caret call in
# this session does not try to use the connections of the stopped cluster
registerDoSEQ()

STOP GENERAL TIME

# Total run time since the general timer `t` was started at the top of
# the script
proc.time() - t
##    user  system elapsed 
##   24.98    1.20  370.08