LOAD THE NECESSARY LIBRARIES

# For manipulating the datasets
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(readxl)

# For plotting correlation matrix
library(ggcorrplot)

## Loading required package: ggplot2

# Machine Learning library
library(caret)

## Loading required package: lattice

library(catboost)

# For Multi-core processing support
library(parallel)
library(doParallel)

## Loading required package: foreach

## Loading required package: iterators

START GENERAL TIME

# Record the start time of the whole script so the total run time can be
# computed later as a difference against a second proc.time() call.
# NOTE(review): `t` shares its name with base::t() (matrix transpose); calls
# to t() still resolve to the function, but a more descriptive name would be
# clearer -- rename only together with every later use.
t <- proc.time()

OPEN THE CLUSTER

# Start a socket cluster with 2 worker processes and register it as the
# foreach backend; the caret training calls below request parallel execution
# through `allowParallel`.
# NOTE(review): no stopCluster(cl) is visible in this part of the file --
# confirm the cluster is shut down at the end of the script.
cl <- makePSOCKcluster(2)
registerDoParallel(cl)

GET THE DATA

Load the datasets

# Numerical dataset: rice.xlsx, read with readxl.
# File paths are relative to the current working directory.
dataset_num <- read_excel("rice.xlsx")

# Categorical dataset: mushrooms.csv, read with base R's read.csv().
dataset_cat <- read.csv("mushrooms.csv")

# Mixed (numerical + categorical) dataset: bank.xlsx, read with readxl.
dataset_mix <- read_excel("bank.xlsx")

CLEAN, PREPARE AND MANIPULATE THE DATA

# Drop VEIL.TYPE (it only has one value), STALK.ROOT (it has missing values)
# and ten further attributes.
# NOTE(review): the original comment describes the extra columns as "other
# important attributes"; presumably they are removed on purpose so that the
# classification task is less trivial -- confirm with the author.
dataset_cat <- dataset_cat %>% select(-VEIL.TYPE,-STALK.ROOT,-ODOR,-SPORE.PRINT.COLOR,-GILL.COLOR,-GILL.SIZE,-HABITAT,-POPULATION,-STALK.SURFACE.ABOVE.RING,-CAP.COLOR,-RING.TYPE,-STALK.SURFACE.BELOW.RING)

Convert all character columns to factors

# Encode the outcome column of the numerical dataset as a factor.
dataset_num$CLASS <- as.factor(dataset_num$CLASS)

# For the other two datasets, turn every character column into a factor.
dataset_cat <- dataset_cat %>%
  mutate_if(is.character, as.factor)

dataset_mix <- dataset_mix %>%
  mutate_if(is.character, as.factor)

DEFINE FUNCTIONS

# CATBOOST
#' Train a CatBoost classifier through caret with repeated cross-validation.
#'
#' Uses 5-fold cross-validation repeated 2 times and caret's default tuning
#' grid for the `catboost.caret` model definition.
#'
#' @param data_train Data frame (or tibble) holding the predictors plus the
#'   outcome in a column named `CLASS`.
#' @return The fitted object returned by `caret::train()`.
train_cb_model <- function(data_train) {
  if (!"CLASS" %in% names(data_train)) {
    stop("`data_train` must contain a `CLASS` column", call. = FALSE)
  }

  # The x/y interface of train() emitted "Setting row names on a tibble is
  # deprecated" for tibble input; a plain data.frame avoids that warning.
  data_train <- as.data.frame(data_train)

  fit_control <- trainControl(
    method = "repeatedcv",
    repeats = 2,
    number = 5,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )

  # drop = FALSE keeps a data frame even when only one predictor is left.
  predictors <- data_train[, names(data_train) != "CLASS", drop = FALSE]

  train(
    x = predictors,
    y = data_train$CLASS,
    method = catboost.caret,
    trControl = fit_control
  )
}
# RANDOM FOREST
#' Train a random forest classifier through caret with repeated
#' cross-validation.
#'
#' Uses 5-fold cross-validation repeated 2 times and models `CLASS` as a
#' function of every other column via the formula interface.
#'
#' @param data_train Data frame (or tibble) holding the predictors plus the
#'   outcome in a column named `CLASS`.
#' @return The fitted object returned by `caret::train()`.
train_rf_model <- function(data_train) {
  if (!"CLASS" %in% names(data_train)) {
    stop("`data_train` must contain a `CLASS` column", call. = FALSE)
  }

  fit_control <- trainControl(
    method = "repeatedcv",
    repeats = 2,
    number = 5,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )

  train(
    CLASS ~ .,
    data = data_train,
    method = "rf",
    trControl = fit_control
  )
}
# Predictions
#' Predict on a held-out set and summarise the result as a confusion matrix.
#'
#' @param model A fitted model accepted by `predict()`.
#' @param data_test Data frame (or tibble) with the predictors and the true
#'   labels in a column named `CLASS`.
#' @return The object returned by `caret::confusionMatrix()`, comparing the
#'   predicted classes with `data_test$CLASS`.
predict_results <- function(model, data_test) {
  if (!"CLASS" %in% names(data_test)) {
    stop("`data_test` must contain a `CLASS` column", call. = FALSE)
  }

  predictions <- predict(model, data_test)
  confusionMatrix(
    data = predictions,
    reference = as.factor(data_test$CLASS)
  )
}

RUN CATBOOST AND RANDOM FOREST ON EACH DATASET

NUMERICAL DATASET

dataset_num

Split into training and test sets

# Fix the RNG seed so the stratified 80/20 split can be reproduced.
set.seed(123)

# createDataPartition(list = FALSE) returns a one-column matrix; flatten it to
# a plain vector because tibbles (>= 3.0.0) warn when the row index is a
# matrix.
trainIndex <- as.vector(
  createDataPartition(dataset_num$CLASS, p = 0.80, list = FALSE)
)
data_train_num <- dataset_num[trainIndex, ]
data_test_num <- dataset_num[-trainIndex, ]

Train catboost model

#Start time
t1 <- proc.time()

catboost_model_num <- train_cb_model(data_train_num)

## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.0498, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set

## Warning: Setting row names on a tibble is deprecated.

## 0:   learn: 0.6568135    total: 141ms    remaining: 13.9s
## 1:   learn: 0.6236078    total: 144ms    remaining: 7.05s
## 2:   learn: 0.5936251    total: 147ms    remaining: 4.75s
## 3:   learn: 0.5662970    total: 150ms    remaining: 3.6s
## 4:   learn: 0.5405178    total: 153ms    remaining: 2.9s
## 5:   learn: 0.5171789    total: 156ms    remaining: 2.45s
## 6:   learn: 0.4956609    total: 160ms    remaining: 2.12s
## 7:   learn: 0.4761940    total: 163ms    remaining: 1.87s
## 8:   learn: 0.4576017    total: 166ms    remaining: 1.68s
## 9:   learn: 0.4406181    total: 169ms    remaining: 1.52s
## 10:  learn: 0.4247206    total: 172ms    remaining: 1.39s
## 11:  learn: 0.4099156    total: 175ms    remaining: 1.28s
## 12:  learn: 0.3961075    total: 178ms    remaining: 1.19s
## 13:  learn: 0.3832902    total: 181ms    remaining: 1.11s
## 14:  learn: 0.3716732    total: 185ms    remaining: 1.05s
## 15:  learn: 0.3606363    total: 188ms    remaining: 987ms
## 16:  learn: 0.3501943    total: 191ms    remaining: 934ms
## 17:  learn: 0.3404634    total: 194ms    remaining: 885ms
## 18:  learn: 0.3314146    total: 197ms    remaining: 842ms
## 19:  learn: 0.3229360    total: 200ms    remaining: 801ms
## 20:  learn: 0.3150070    total: 203ms    remaining: 764ms
## 21:  learn: 0.3072563    total: 206ms    remaining: 731ms
## 22:  learn: 0.3000933    total: 209ms    remaining: 701ms
## 23:  learn: 0.2932783    total: 212ms    remaining: 672ms
## 24:  learn: 0.2869615    total: 215ms    remaining: 646ms
## 25:  learn: 0.2812050    total: 219ms    remaining: 624ms
## 26:  learn: 0.2755738    total: 222ms    remaining: 602ms
## 27:  learn: 0.2704136    total: 226ms    remaining: 582ms
## 28:  learn: 0.2655610    total: 229ms    remaining: 561ms
## 29:  learn: 0.2608680    total: 232ms    remaining: 541ms
## 30:  learn: 0.2563472    total: 235ms    remaining: 523ms
## 31:  learn: 0.2523727    total: 238ms    remaining: 506ms
## 32:  learn: 0.2485799    total: 242ms    remaining: 490ms
## 33:  learn: 0.2447511    total: 245ms    remaining: 475ms
## 34:  learn: 0.2414468    total: 248ms    remaining: 460ms
## 35:  learn: 0.2383678    total: 251ms    remaining: 446ms
## 36:  learn: 0.2353650    total: 254ms    remaining: 432ms
## 37:  learn: 0.2324733    total: 257ms    remaining: 420ms
## 38:  learn: 0.2293391    total: 260ms    remaining: 407ms
## 39:  learn: 0.2267338    total: 264ms    remaining: 395ms
## 40:  learn: 0.2240232    total: 267ms    remaining: 384ms
## 41:  learn: 0.2215597    total: 270ms    remaining: 372ms
## 42:  learn: 0.2189280    total: 273ms    remaining: 362ms
## 43:  learn: 0.2167430    total: 277ms    remaining: 352ms
## 44:  learn: 0.2145314    total: 280ms    remaining: 342ms
## 45:  learn: 0.2125653    total: 283ms    remaining: 332ms
## 46:  learn: 0.2106722    total: 286ms    remaining: 323ms
## 47:  learn: 0.2087211    total: 291ms    remaining: 315ms
## 48:  learn: 0.2073332    total: 294ms    remaining: 306ms
## 49:  learn: 0.2057595    total: 297ms    remaining: 297ms
## 50:  learn: 0.2040379    total: 300ms    remaining: 289ms
## 51:  learn: 0.2023699    total: 304ms    remaining: 280ms
## 52:  learn: 0.2010369    total: 307ms    remaining: 272ms
## 53:  learn: 0.1996971    total: 310ms    remaining: 264ms
## 54:  learn: 0.1984580    total: 313ms    remaining: 256ms
## 55:  learn: 0.1974165    total: 316ms    remaining: 249ms
## 56:  learn: 0.1960377    total: 320ms    remaining: 241ms
## 57:  learn: 0.1947740    total: 323ms    remaining: 234ms
## 58:  learn: 0.1938288    total: 326ms    remaining: 227ms
## 59:  learn: 0.1926876    total: 329ms    remaining: 219ms
## 60:  learn: 0.1916285    total: 332ms    remaining: 212ms
## 61:  learn: 0.1905130    total: 335ms    remaining: 205ms
## 62:  learn: 0.1897046    total: 338ms    remaining: 199ms
## 63:  learn: 0.1891968    total: 341ms    remaining: 192ms
## 64:  learn: 0.1882166    total: 344ms    remaining: 185ms
## 65:  learn: 0.1873404    total: 347ms    remaining: 179ms
## 66:  learn: 0.1865113    total: 350ms    remaining: 172ms
## 67:  learn: 0.1857105    total: 353ms    remaining: 166ms
## 68:  learn: 0.1847751    total: 356ms    remaining: 160ms
## 69:  learn: 0.1838037    total: 359ms    remaining: 154ms
## 70:  learn: 0.1831553    total: 363ms    remaining: 148ms
## 71:  learn: 0.1824054    total: 367ms    remaining: 143ms
## 72:  learn: 0.1814054    total: 370ms    remaining: 137ms
## 73:  learn: 0.1805700    total: 374ms    remaining: 131ms
## 74:  learn: 0.1797765    total: 377ms    remaining: 126ms
## 75:  learn: 0.1789485    total: 380ms    remaining: 120ms
## 76:  learn: 0.1784491    total: 383ms    remaining: 114ms
## 77:  learn: 0.1779122    total: 386ms    remaining: 109ms
## 78:  learn: 0.1770016    total: 390ms    remaining: 104ms
## 79:  learn: 0.1764126    total: 393ms    remaining: 98.2ms
## 80:  learn: 0.1758560    total: 396ms    remaining: 93ms
## 81:  learn: 0.1750834    total: 399ms    remaining: 87.7ms
## 82:  learn: 0.1745138    total: 403ms    remaining: 82.5ms
## 83:  learn: 0.1741124    total: 406ms    remaining: 77.4ms
## 84:  learn: 0.1736901    total: 409ms    remaining: 72.2ms
## 85:  learn: 0.1732599    total: 412ms    remaining: 67.1ms
## 86:  learn: 0.1728302    total: 415ms    remaining: 62.1ms
## 87:  learn: 0.1724763    total: 419ms    remaining: 57.1ms
## 88:  learn: 0.1719654    total: 422ms    remaining: 52.1ms
## 89:  learn: 0.1714901    total: 425ms    remaining: 47.2ms
## 90:  learn: 0.1708885    total: 428ms    remaining: 42.3ms
## 91:  learn: 0.1704591    total: 431ms    remaining: 37.4ms
## 92:  learn: 0.1700912    total: 435ms    remaining: 32.7ms
## 93:  learn: 0.1689023    total: 439ms    remaining: 28ms
## 94:  learn: 0.1681292    total: 442ms    remaining: 23.3ms
## 95:  learn: 0.1675693    total: 446ms    remaining: 18.6ms
## 96:  learn: 0.1671149    total: 449ms    remaining: 13.9ms
## 97:  learn: 0.1667215    total: 453ms    remaining: 9.24ms
## 98:  learn: 0.1661118    total: 456ms    remaining: 4.61ms
## 99:  learn: 0.1654008    total: 459ms    remaining: 0us

catboost_model_num

## Catboost 
## 
## 3048 samples
##    7 predictor
##    2 classes: 'Cammeo', 'Osmancik' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2438, 2439, 2438, 2438, 2439, 2439, ... 
## Resampling results across tuning parameters:
## 
##   depth  learning_rate  Accuracy   Kappa    
##   2      0.04978707     0.9247075  0.8458708
##   2      0.13533528     0.9222469  0.8410028
##   2      0.36787944     0.9197895  0.8358922
##   2      1.00000000     0.9114235  0.8188794
##   4      0.04978707     0.9250359  0.8464785
##   4      0.13533528     0.9215917  0.8395079
##   4      0.36787944     0.9142133  0.8244731
##   4      1.00000000     0.8955135  0.7865344
##   6      0.04978707     0.9270050  0.8505720
##   6      0.13533528     0.9202794  0.8366790
##   6      0.36787944     0.9086344  0.8131553
##   6      1.00000000     0.9005979  0.7970720
## 
## Tuning parameter 'iterations' was held constant at a value of 100
## 
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
##  parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
##  0.04978707, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and
##  border_count = 255.

#Stop time
proc.time()-t1

##    user  system elapsed 
##    1.80    0.12   45.92

Make predictions

catboost_pred_num <- predict_results(catboost_model_num,data_test_num)
catboost_pred_num

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cammeo Osmancik
##   Cammeo      301       23
##   Osmancik     25      413
##                                           
##                Accuracy : 0.937           
##                  95% CI : (0.9173, 0.9532)
##     No Information Rate : 0.5722          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8712          
##                                           
##  Mcnemar's Test P-Value : 0.8852          
##                                           
##             Sensitivity : 0.9233          
##             Specificity : 0.9472          
##          Pos Pred Value : 0.9290          
##          Neg Pred Value : 0.9429          
##              Prevalence : 0.4278          
##          Detection Rate : 0.3950          
##    Detection Prevalence : 0.4252          
##       Balanced Accuracy : 0.9353          
##                                           
##        'Positive' Class : Cammeo          
##

Train random forest model

#Start time
t1 <- proc.time()

rf_model_num <- train_rf_model(data_train_num)

## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set

rf_model_num

## Random Forest 
## 
## 3048 samples
##    7 predictor
##    2 classes: 'Cammeo', 'Osmancik' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2438, 2439, 2438, 2439, 2438, 2438, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9227349  0.8417241
##   4     0.9212590  0.8387185
##   7     0.9212571  0.8386529
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

#Stop time
proc.time()-t1

##    user  system elapsed 
##    1.86    0.07   18.80

Make predictions

rf_pred_num <- predict_results(rf_model_num,data_test_num)
rf_pred_num

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Cammeo Osmancik
##   Cammeo      298       25
##   Osmancik     28      411
##                                         
##                Accuracy : 0.9304        
##                  95% CI : (0.91, 0.9475)
##     No Information Rate : 0.5722        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.8578        
##                                         
##  Mcnemar's Test P-Value : 0.7835        
##                                         
##             Sensitivity : 0.9141        
##             Specificity : 0.9427        
##          Pos Pred Value : 0.9226        
##          Neg Pred Value : 0.9362        
##              Prevalence : 0.4278        
##          Detection Rate : 0.3911        
##    Detection Prevalence : 0.4239        
##       Balanced Accuracy : 0.9284        
##                                         
##        'Positive' Class : Cammeo        
##

Compare models

resamps_num <- resamples(list(cb_num=catboost_model_num,rf_num=rf_model_num))
resamps_num

## 
## Call:
## resamples.default(x = list(cb_num = catboost_model_num, rf_num = rf_model_num))
## 
## Models: cb_num, rf_num 
## Number of resamples: 10 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit

summary(resamps_num)

## 
## Call:
## summary.resamples(object = resamps_num)
## 
## Models: cb_num, rf_num 
## Number of resamples: 10 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_num 0.9098361 0.9212146 0.9269913 0.9270050 0.9306529 0.9490969    0
## rf_num 0.9113300 0.9184096 0.9228876 0.9227349 0.9270492 0.9327869    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_num 0.8143147 0.8389740 0.8505871 0.8505720 0.8579431 0.8960230    0
## rf_num 0.8166828 0.8330627 0.8420719 0.8417241 0.8510985 0.8626503    0

bwplot(resamps_num)

dotplot(resamps_num)

difValues_num <- diff(resamps_num)
difValues_num

## 
## Call:
## diff.resamples(x = resamps_num)
## 
## Models: cb_num, rf_num 
## Metrics: Accuracy, Kappa 
## Number of differences: 1 
## p-value adjustment: bonferroni

summary(difValues_num)

## 
## Call:
## summary.diff.resamples(object = difValues_num)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##        cb_num rf_num 
## cb_num        0.00427
## rf_num 0.2778        
## 
## Kappa 
##        cb_num rf_num  
## cb_num        0.008848
## rf_num 0.2822

CATEGORICAL DATASET

dataset_cat

Split into training and test sets

# Fix the RNG seed so the stratified 80/20 split can be reproduced.
set.seed(123)

# createDataPartition(list = FALSE) returns a one-column matrix; a plain
# vector is used as row index instead.
trainIndex <- as.vector(
  createDataPartition(dataset_cat$CLASS, p = 0.80, list = FALSE)
)
data_train_cat <- dataset_cat[trainIndex, ]
data_test_cat <- dataset_cat[-trainIndex, ]

Train catboost model

#Start time
t1 <- proc.time()

catboost_model_cat <- train_cb_model(data_train_cat)

## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## 0:   learn: 0.6122370    total: 6.31ms   remaining: 625ms
## 1:   learn: 0.5507690    total: 12.4ms   remaining: 607ms
## 2:   learn: 0.5029202    total: 20.4ms   remaining: 661ms
## 3:   learn: 0.4666108    total: 23.4ms   remaining: 561ms
## 4:   learn: 0.4360618    total: 26.9ms   remaining: 510ms
## 5:   learn: 0.4129783    total: 30.2ms   remaining: 474ms
## 6:   learn: 0.3942320    total: 33.9ms   remaining: 451ms
## 7:   learn: 0.3678110    total: 38ms remaining: 437ms
## 8:   learn: 0.3363034    total: 42ms remaining: 424ms
## 9:   learn: 0.3122952    total: 46.2ms   remaining: 416ms
## 10:  learn: 0.2932804    total: 50.5ms   remaining: 409ms
## 11:  learn: 0.2809059    total: 55.1ms   remaining: 404ms
## 12:  learn: 0.2695079    total: 59.5ms   remaining: 398ms
## 13:  learn: 0.2563822    total: 63.9ms   remaining: 393ms
## 14:  learn: 0.2480827    total: 68.9ms   remaining: 390ms
## 15:  learn: 0.2348083    total: 74.4ms   remaining: 390ms
## 16:  learn: 0.2142045    total: 79.7ms   remaining: 389ms
## 17:  learn: 0.2014268    total: 83.9ms   remaining: 382ms
## 18:  learn: 0.1926262    total: 88.5ms   remaining: 377ms
## 19:  learn: 0.1808947    total: 92.7ms   remaining: 371ms
## 20:  learn: 0.1702964    total: 97.4ms   remaining: 367ms
## 21:  learn: 0.1634833    total: 101ms    remaining: 360ms
## 22:  learn: 0.1574146    total: 106ms    remaining: 355ms
## 23:  learn: 0.1533933    total: 110ms    remaining: 350ms
## 24:  learn: 0.1494566    total: 114ms    remaining: 343ms
## 25:  learn: 0.1467275    total: 118ms    remaining: 337ms
## 26:  learn: 0.1434511    total: 122ms    remaining: 331ms
## 27:  learn: 0.1365177    total: 127ms    remaining: 326ms
## 28:  learn: 0.1350156    total: 131ms    remaining: 320ms
## 29:  learn: 0.1322677    total: 135ms    remaining: 316ms
## 30:  learn: 0.1296128    total: 139ms    remaining: 310ms
## 31:  learn: 0.1251714    total: 145ms    remaining: 309ms
## 32:  learn: 0.1225821    total: 149ms    remaining: 303ms
## 33:  learn: 0.1211077    total: 153ms    remaining: 297ms
## 34:  learn: 0.1191154    total: 158ms    remaining: 293ms
## 35:  learn: 0.1160553    total: 163ms    remaining: 290ms
## 36:  learn: 0.1123520    total: 168ms    remaining: 286ms
## 37:  learn: 0.1094478    total: 172ms    remaining: 281ms
## 38:  learn: 0.1081211    total: 178ms    remaining: 279ms
## 39:  learn: 0.1070587    total: 183ms    remaining: 274ms
## 40:  learn: 0.1030064    total: 187ms    remaining: 269ms
## 41:  learn: 0.1010134    total: 192ms    remaining: 265ms
## 42:  learn: 0.0987311    total: 197ms    remaining: 262ms
## 43:  learn: 0.0983617    total: 202ms    remaining: 257ms
## 44:  learn: 0.0978947    total: 206ms    remaining: 252ms
## 45:  learn: 0.0967254    total: 212ms    remaining: 249ms
## 46:  learn: 0.0954702    total: 219ms    remaining: 247ms
## 47:  learn: 0.0938459    total: 224ms    remaining: 242ms
## 48:  learn: 0.0935742    total: 232ms    remaining: 241ms
## 49:  learn: 0.0921233    total: 236ms    remaining: 236ms
## 50:  learn: 0.0898602    total: 240ms    remaining: 231ms
## 51:  learn: 0.0894541    total: 246ms    remaining: 227ms
## 52:  learn: 0.0888838    total: 250ms    remaining: 222ms
## 53:  learn: 0.0873815    total: 254ms    remaining: 217ms
## 54:  learn: 0.0867422    total: 262ms    remaining: 214ms
## 55:  learn: 0.0862856    total: 266ms    remaining: 209ms
## 56:  learn: 0.0859953    total: 270ms    remaining: 204ms
## 57:  learn: 0.0857699    total: 278ms    remaining: 201ms
## 58:  learn: 0.0856337    total: 286ms    remaining: 199ms
## 59:  learn: 0.0845285    total: 294ms    remaining: 196ms
## 60:  learn: 0.0840471    total: 299ms    remaining: 191ms
## 61:  learn: 0.0823477    total: 303ms    remaining: 186ms
## 62:  learn: 0.0802434    total: 311ms    remaining: 183ms
## 63:  learn: 0.0795666    total: 316ms    remaining: 178ms
## 64:  learn: 0.0792756    total: 320ms    remaining: 172ms
## 65:  learn: 0.0785699    total: 328ms    remaining: 169ms
## 66:  learn: 0.0783523    total: 332ms    remaining: 164ms
## 67:  learn: 0.0774731    total: 337ms    remaining: 159ms
## 68:  learn: 0.0771809    total: 345ms    remaining: 155ms
## 69:  learn: 0.0762274    total: 349ms    remaining: 150ms
## 70:  learn: 0.0751240    total: 353ms    remaining: 144ms
## 71:  learn: 0.0749683    total: 361ms    remaining: 141ms
## 72:  learn: 0.0747531    total: 367ms    remaining: 136ms
## 73:  learn: 0.0746343    total: 375ms    remaining: 132ms
## 74:  learn: 0.0745693    total: 379ms    remaining: 126ms
## 75:  learn: 0.0743716    total: 385ms    remaining: 121ms
## 76:  learn: 0.0732670    total: 392ms    remaining: 117ms
## 77:  learn: 0.0732560    total: 395ms    remaining: 111ms
## 78:  learn: 0.0731248    total: 398ms    remaining: 106ms
## 79:  learn: 0.0726237    total: 402ms    remaining: 101ms
## 80:  learn: 0.0720270    total: 411ms    remaining: 96.3ms
## 81:  learn: 0.0719702    total: 416ms    remaining: 91.3ms
## 82:  learn: 0.0716857    total: 422ms    remaining: 86.4ms
## 83:  learn: 0.0715268    total: 426ms    remaining: 81.2ms
## 84:  learn: 0.0712087    total: 430ms    remaining: 76ms
## 85:  learn: 0.0709254    total: 434ms    remaining: 70.7ms
## 86:  learn: 0.0706008    total: 441ms    remaining: 66ms
## 87:  learn: 0.0704501    total: 446ms    remaining: 60.9ms
## 88:  learn: 0.0703976    total: 451ms    remaining: 55.7ms
## 89:  learn: 0.0699715    total: 458ms    remaining: 50.9ms
## 90:  learn: 0.0695446    total: 462ms    remaining: 45.7ms
## 91:  learn: 0.0692119    total: 466ms    remaining: 40.5ms
## 92:  learn: 0.0689237    total: 474ms    remaining: 35.6ms
## 93:  learn: 0.0688865    total: 477ms    remaining: 30.5ms
## 94:  learn: 0.0681742    total: 481ms    remaining: 25.3ms
## 95:  learn: 0.0680095    total: 489ms    remaining: 20.4ms
## 96:  learn: 0.0679535    total: 493ms    remaining: 15.2ms
## 97:  learn: 0.0678766    total: 498ms    remaining: 10.2ms
## 98:  learn: 0.0676545    total: 503ms    remaining: 5.08ms
## 99:  learn: 0.0675518    total: 508ms    remaining: 0us

catboost_model_cat

## Catboost 
## 
## 6500 samples
##   10 predictor
##    2 classes: 'e', 'p' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 5199, 5201, 5200, 5201, 5199, 5201, ... 
## Resampling results across tuning parameters:
## 
##   depth  learning_rate  Accuracy   Kappa    
##   2      0.04978707     0.8938478  0.7863302
##   2      0.13533528     0.9303078  0.8603756
##   2      0.36787944     0.9397695  0.8793362
##   2      1.00000000     0.9564634  0.9127782
##   4      0.04978707     0.9394627  0.8785638
##   4      0.13533528     0.9608461  0.9215197
##   4      0.36787944     0.9673072  0.9344566
##   4      1.00000000     0.9477065  0.8947271
##   6      0.04978707     0.9656155  0.9310161
##   6      0.13533528     0.9674611  0.9347500
##   6      0.36787944     0.9666924  0.9332041
##   6      1.00000000     0.9663844  0.9325722
## 
## Tuning parameter 'iterations' was held constant at a value of 100
## 
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
##  parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
##  0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
##  = 255.

#Stop time
proc.time()-t1

##    user  system elapsed 
##    2.89    0.28   52.45

Make predictions

catboost_pred_cat <- predict_results(catboost_model_cat,data_test_cat)
catboost_pred_cat

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   e   p
##          e 838  46
##          p   3 737
##                                           
##                Accuracy : 0.9698          
##                  95% CI : (0.9603, 0.9776)
##     No Information Rate : 0.5179          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9395          
##                                           
##  Mcnemar's Test P-Value : 1.973e-09       
##                                           
##             Sensitivity : 0.9964          
##             Specificity : 0.9413          
##          Pos Pred Value : 0.9480          
##          Neg Pred Value : 0.9959          
##              Prevalence : 0.5179          
##          Detection Rate : 0.5160          
##    Detection Prevalence : 0.5443          
##       Balanced Accuracy : 0.9688          
##                                           
##        'Positive' Class : e               
##

Train random forest model

#Start time
t1 <- proc.time()

rf_model_cat <- train_rf_model(data_train_cat)

## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 17 on full training set

rf_model_cat

## Random Forest 
## 
## 6500 samples
##   10 predictor
##    2 classes: 'e', 'p' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 5201, 5200, 5199, 5200, 5200, 5200, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8475368  0.6923302
##   17    0.9682315  0.9363089
##   33    0.9678468  0.9355343
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.

#Stop time
proc.time()-t1

##    user  system elapsed 
##    6.71    0.06   91.89

Make predictions

rf_pred_cat <- predict_results(rf_model_cat,data_test_cat)
rf_pred_cat

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   e   p
##          e 831  42
##          p  10 741
##                                          
##                Accuracy : 0.968          
##                  95% CI : (0.9582, 0.976)
##     No Information Rate : 0.5179         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9358         
##                                          
##  Mcnemar's Test P-Value : 1.716e-05      
##                                          
##             Sensitivity : 0.9881         
##             Specificity : 0.9464         
##          Pos Pred Value : 0.9519         
##          Neg Pred Value : 0.9867         
##              Prevalence : 0.5179         
##          Detection Rate : 0.5117         
##    Detection Prevalence : 0.5376         
##       Balanced Accuracy : 0.9672         
##                                          
##        'Positive' Class : e              
##

Compare models

resamps_cat <- resamples(list(cb_cat=catboost_model_cat,rf_cat=rf_model_cat))
resamps_cat

## 
## Call:
## resamples.default(x = list(cb_cat = catboost_model_cat, rf_cat = rf_model_cat))
## 
## Models: cb_cat, rf_cat 
## Number of resamples: 10 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit

summary(resamps_cat)

## 
## Call:
## summary.resamples(object = resamps_cat)
## 
## Models: cb_cat, rf_cat 
## Number of resamples: 10 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_cat 0.9615089 0.9653913 0.9669358 0.9674611 0.9700106 0.9745958    0
## rf_cat 0.9638462 0.9661604 0.9676935 0.9682315 0.9711538 0.9730562    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_cat 0.9227876 0.9305715 0.9337030 0.9347500 0.9398612 0.9490711    0
## rf_cat 0.9274657 0.9321458 0.9352378 0.9363089 0.9421932 0.9459724    0

bwplot(resamps_cat)

dotplot(resamps_cat)

difValues_cat <- diff(resamps_cat)
difValues_cat

## 
## Call:
## diff.resamples(x = resamps_cat)
## 
## Models: cb_cat, rf_cat 
## Metrics: Accuracy, Kappa 
## Number of differences: 1 
## p-value adjustment: bonferroni

summary(difValues_cat)

## 
## Call:
## summary.diff.resamples(object = difValues_cat)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##        cb_cat rf_cat    
## cb_cat        -0.0007704
## rf_cat 0.4833           
## 
## Kappa 
##        cb_cat rf_cat   
## cb_cat        -0.001559
## rf_cat 0.4802

MIXED DATASET

dataset_mix

Split into training and test sets

# Fix the RNG seed so the stratified 80/20 split can be reproduced.
set.seed(123)

# createDataPartition(list = FALSE) returns a one-column matrix; flatten it to
# a plain vector because tibbles (>= 3.0.0) warn when the row index is a
# matrix.
trainIndex <- as.vector(
  createDataPartition(dataset_mix$CLASS, p = 0.80, list = FALSE)
)
data_train_mix <- dataset_mix[trainIndex, ]
data_test_mix <- dataset_mix[-trainIndex, ]

Train catboost model

# Start the timer for the CatBoost run on the mixed dataset
t1 <- proc.time()

# Tune and fit CatBoost through the train_cb_model() helper
# (repeated cross-validation: 5 folds, 2 repeats).
# NOTE(review): the "Setting row names on a tibble is deprecated" warning in
# the output presumably comes from data_train_mix being a tibble; passing
# as.data.frame(data_train_mix) may avoid it - confirm.
catboost_model_mix <- train_cb_model(data_train_mix)

## Aggregating results
## Selecting tuning parameters
## Fitting depth = 2, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set

## Warning: Setting row names on a tibble is deprecated.

## 0:   learn: 0.6121339    total: 5.64ms   remaining: 558ms
## 1:   learn: 0.5502777    total: 9.34ms   remaining: 458ms
## 2:   learn: 0.5021942    total: 12.9ms   remaining: 419ms
## 3:   learn: 0.4643175    total: 15.8ms   remaining: 378ms
## 4:   learn: 0.4356381    total: 17.5ms   remaining: 332ms
## 5:   learn: 0.4093016    total: 18.9ms   remaining: 296ms
## 6:   learn: 0.3837429    total: 22.9ms   remaining: 304ms
## 7:   learn: 0.3621549    total: 24.4ms   remaining: 281ms
## 8:   learn: 0.3450963    total: 25.9ms   remaining: 262ms
## 9:   learn: 0.3333567    total: 27.7ms   remaining: 249ms
## 10:  learn: 0.3242311    total: 29.2ms   remaining: 236ms
## 11:  learn: 0.3168739    total: 30.7ms   remaining: 225ms
## 12:  learn: 0.3081181    total: 32.2ms   remaining: 216ms
## 13:  learn: 0.2996895    total: 33.6ms   remaining: 207ms
## 14:  learn: 0.2928020    total: 35.2ms   remaining: 199ms
## 15:  learn: 0.2873438    total: 36.6ms   remaining: 192ms
## 16:  learn: 0.2820336    total: 38.3ms   remaining: 187ms
## 17:  learn: 0.2787684    total: 39.7ms   remaining: 181ms
## 18:  learn: 0.2722486    total: 41.3ms   remaining: 176ms
## 19:  learn: 0.2683284    total: 42.9ms   remaining: 172ms
## 20:  learn: 0.2644261    total: 45.1ms   remaining: 170ms
## 21:  learn: 0.2606796    total: 46.6ms   remaining: 165ms
## 22:  learn: 0.2574881    total: 48.1ms   remaining: 161ms
## 23:  learn: 0.2556970    total: 49.6ms   remaining: 157ms
## 24:  learn: 0.2541877    total: 51.1ms   remaining: 153ms
## 25:  learn: 0.2518247    total: 52.6ms   remaining: 150ms
## 26:  learn: 0.2503556    total: 54.2ms   remaining: 146ms
## 27:  learn: 0.2489015    total: 55.8ms   remaining: 143ms
## 28:  learn: 0.2477297    total: 57.2ms   remaining: 140ms
## 29:  learn: 0.2458457    total: 58.6ms   remaining: 137ms
## 30:  learn: 0.2448694    total: 61.4ms   remaining: 137ms
## 31:  learn: 0.2436032    total: 63.1ms   remaining: 134ms
## 32:  learn: 0.2426952    total: 64.8ms   remaining: 131ms
## 33:  learn: 0.2415039    total: 66.3ms   remaining: 129ms
## 34:  learn: 0.2406009    total: 67.7ms   remaining: 126ms
## 35:  learn: 0.2394611    total: 69.1ms   remaining: 123ms
## 36:  learn: 0.2384822    total: 70.7ms   remaining: 120ms
## 37:  learn: 0.2371913    total: 72.2ms   remaining: 118ms
## 38:  learn: 0.2362425    total: 73.7ms   remaining: 115ms
## 39:  learn: 0.2354962    total: 75.2ms   remaining: 113ms
## 40:  learn: 0.2346559    total: 76.6ms   remaining: 110ms
## 41:  learn: 0.2339751    total: 78.1ms   remaining: 108ms
## 42:  learn: 0.2331142    total: 79.6ms   remaining: 106ms
## 43:  learn: 0.2328724    total: 81.1ms   remaining: 103ms
## 44:  learn: 0.2322204    total: 84ms remaining: 103ms
## 45:  learn: 0.2316547    total: 86ms remaining: 101ms
## 46:  learn: 0.2314605    total: 87.6ms   remaining: 98.8ms
## 47:  learn: 0.2312707    total: 89.1ms   remaining: 96.5ms
## 48:  learn: 0.2305370    total: 91.1ms   remaining: 94.8ms
## 49:  learn: 0.2300010    total: 92.6ms   remaining: 92.6ms
## 50:  learn: 0.2296235    total: 94.3ms   remaining: 90.6ms
## 51:  learn: 0.2291560    total: 95.9ms   remaining: 88.5ms
## 52:  learn: 0.2290524    total: 97.3ms   remaining: 86.3ms
## 53:  learn: 0.2286910    total: 98.9ms   remaining: 84.2ms
## 54:  learn: 0.2285615    total: 100ms    remaining: 82.1ms
## 55:  learn: 0.2279909    total: 102ms    remaining: 80ms
## 56:  learn: 0.2274678    total: 103ms    remaining: 77.9ms
## 57:  learn: 0.2273241    total: 105ms    remaining: 75.9ms
## 58:  learn: 0.2265899    total: 107ms    remaining: 74ms
## 59:  learn: 0.2261008    total: 108ms    remaining: 72.1ms
## 60:  learn: 0.2252369    total: 110ms    remaining: 70.1ms
## 61:  learn: 0.2250421    total: 111ms    remaining: 68.1ms
## 62:  learn: 0.2246596    total: 112ms    remaining: 66.1ms
## 63:  learn: 0.2241355    total: 114ms    remaining: 64ms
## 64:  learn: 0.2241081    total: 115ms    remaining: 62ms
## 65:  learn: 0.2238018    total: 117ms    remaining: 60.1ms
## 66:  learn: 0.2237307    total: 118ms    remaining: 58.1ms
## 67:  learn: 0.2233752    total: 119ms    remaining: 56.2ms
## 68:  learn: 0.2229184    total: 121ms    remaining: 54.3ms
## 69:  learn: 0.2227866    total: 122ms    remaining: 52.4ms
## 70:  learn: 0.2224133    total: 124ms    remaining: 50.6ms
## 71:  learn: 0.2218968    total: 125ms    remaining: 48.7ms
## 72:  learn: 0.2216509    total: 127ms    remaining: 46.9ms
## 73:  learn: 0.2213231    total: 128ms    remaining: 45.1ms
## 74:  learn: 0.2212330    total: 130ms    remaining: 43.2ms
## 75:  learn: 0.2208867    total: 131ms    remaining: 41.4ms
## 76:  learn: 0.2208864    total: 132ms    remaining: 39.6ms
## 77:  learn: 0.2205792    total: 134ms    remaining: 37.8ms
## 78:  learn: 0.2201641    total: 135ms    remaining: 36ms
## 79:  learn: 0.2198007    total: 137ms    remaining: 34.2ms
## 80:  learn: 0.2194934    total: 138ms    remaining: 32.4ms
## 81:  learn: 0.2189740    total: 139ms    remaining: 30.6ms
## 82:  learn: 0.2177490    total: 141ms    remaining: 28.9ms
## 83:  learn: 0.2176918    total: 142ms    remaining: 27.1ms
## 84:  learn: 0.2171174    total: 144ms    remaining: 25.4ms
## 85:  learn: 0.2167369    total: 145ms    remaining: 23.7ms
## 86:  learn: 0.2163940    total: 147ms    remaining: 22ms
## 87:  learn: 0.2162784    total: 148ms    remaining: 20.2ms
## 88:  learn: 0.2162118    total: 150ms    remaining: 18.5ms
## 89:  learn: 0.2159134    total: 151ms    remaining: 16.8ms
## 90:  learn: 0.2159058    total: 153ms    remaining: 15.1ms
## 91:  learn: 0.2157729    total: 154ms    remaining: 13.4ms
## 92:  learn: 0.2156463    total: 156ms    remaining: 11.7ms
## 93:  learn: 0.2151911    total: 157ms    remaining: 10ms
## 94:  learn: 0.2148647    total: 159ms    remaining: 8.35ms
## 95:  learn: 0.2146870    total: 160ms    remaining: 6.68ms
## 96:  learn: 0.2143327    total: 162ms    remaining: 5.01ms
## 97:  learn: 0.2139554    total: 163ms    remaining: 3.33ms
## 98:  learn: 0.2135875    total: 165ms    remaining: 1.66ms
## 99:  learn: 0.2134650    total: 166ms    remaining: 0us

# Print the resampling results per tuning combination and the final values
# selected for the CatBoost model
catboost_model_mix

## Catboost 
## 
## 3617 samples
##   16 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2893, 2894, 2894, 2894, 2893, 2894, ... 
## Resampling results across tuning parameters:
## 
##   depth  learning_rate  Accuracy   Kappa    
##   2      0.04978707     0.8986767  0.3245610
##   2      0.13533528     0.9022705  0.3992917
##   2      0.36787944     0.9001954  0.4205602
##   2      1.00000000     0.8902419  0.3984283
##   4      0.04978707     0.9003351  0.3498960
##   4      0.13533528     0.9015782  0.4180483
##   4      0.36787944     0.8961876  0.4063029
##   4      1.00000000     0.8847128  0.3817551
##   6      0.04978707     0.9017182  0.3801920
##   6      0.13533528     0.8986742  0.4123413
##   6      0.36787944     0.8952194  0.4182872
##   6      1.00000000     0.8833335  0.3966184
## 
## Tuning parameter 'iterations' was held constant at a value of 100
## 
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
##  parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 2, learning_rate =
##  0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
##  = 255.

# Time elapsed since t1 was set (CatBoost training on the mixed dataset)
proc.time() - t1

##    user  system elapsed 
##    1.53    0.14   49.08

Make predictions

# Evaluate the CatBoost model on the held-out test rows with the
# predict_results() helper and print the resulting confusion matrix.
# NOTE(review): the output reports 'no' as the positive class, so Sensitivity
# refers to the 'no' class - confirm this is the intended reference level.
catboost_pred_mix <- predict_results(
  catboost_model_mix,
  data_test_mix
)
catboost_pred_mix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  767  72
##        yes  33  32
##                                          
##                Accuracy : 0.8838         
##                  95% CI : (0.8611, 0.904)
##     No Information Rate : 0.885          
##     P-Value [Acc > NIR] : 0.5672514      
##                                          
##                   Kappa : 0.3184         
##                                          
##  Mcnemar's Test P-Value : 0.0002086      
##                                          
##             Sensitivity : 0.9587         
##             Specificity : 0.3077         
##          Pos Pred Value : 0.9142         
##          Neg Pred Value : 0.4923         
##              Prevalence : 0.8850         
##          Detection Rate : 0.8485         
##    Detection Prevalence : 0.9281         
##       Balanced Accuracy : 0.6332         
##                                          
##        'Positive' Class : no             
##

Train random forest model

# Restart the timer for the random forest run on the mixed dataset
t1 <- proc.time()

# Tune and fit the random forest through the train_rf_model() helper
rf_model_mix <- train_rf_model(data_train_mix)

## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 22 on full training set

# Print the resampling results per mtry value and the value selected for the
# final random forest
rf_model_mix

## Random Forest 
## 
## 3617 samples
##   16 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 2893, 2894, 2894, 2894, 2893, 2894, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##    2    0.8884431  0.06988726
##   22    0.9028136  0.44577851
##   42    0.9008780  0.44446045
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 22.

# Time elapsed since t1 was reset (random forest training on the mixed dataset)
proc.time() - t1

##    user  system elapsed 
##    7.48    0.03  108.19

Make predictions

# Evaluate the random forest on the held-out test rows with the
# predict_results() helper and print the resulting confusion matrix.
rf_pred_mix <- predict_results(
  rf_model_mix,
  data_test_mix
)
rf_pred_mix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  764  68
##        yes  36  36
##                                          
##                Accuracy : 0.885          
##                  95% CI : (0.8623, 0.905)
##     No Information Rate : 0.885          
##     P-Value [Acc > NIR] : 0.526096       
##                                          
##                   Kappa : 0.3477         
##                                          
##  Mcnemar's Test P-Value : 0.002367       
##                                          
##             Sensitivity : 0.9550         
##             Specificity : 0.3462         
##          Pos Pred Value : 0.9183         
##          Neg Pred Value : 0.5000         
##              Prevalence : 0.8850         
##          Detection Rate : 0.8451         
##    Detection Prevalence : 0.9204         
##       Balanced Accuracy : 0.6506         
##                                          
##        'Positive' Class : no             
##

Compare models

# Gather the cross-validation resamples of both models trained on the mixed
# dataset into one object, then print its overview.
resamps_mix <- resamples(
  list(
    cb_mix = catboost_model_mix,
    rf_mix = rf_model_mix
  )
)
resamps_mix

## 
## Call:
## resamples.default(x = list(cb_mix = catboost_model_mix, rf_mix = rf_model_mix))
## 
## Models: cb_mix, rf_mix 
## Number of resamples: 10 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit

# Per-model distribution (min, quartiles, mean, max, NA count) of Accuracy and
# Kappa over the resamples
summary(resamps_mix)

## 
## Call:
## summary.resamples(object = resamps_mix)
## 
## Models: cb_mix, rf_mix 
## Number of resamples: 10 
## 
## Accuracy 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_mix 0.8853591 0.8981005 0.9045643 0.9022705 0.9081167 0.9128631    0
## rf_mix 0.8852006 0.8924620 0.9046274 0.9028136 0.9143350 0.9171271    0
## 
## Kappa 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## cb_mix 0.3279478 0.3451060 0.4171445 0.3992917 0.4352372 0.4895194    0
## rf_mix 0.3499540 0.3697414 0.4483677 0.4457785 0.5213937 0.5387360    0

# Box-and-whisker plot of the resampled metrics of both models
bwplot(resamps_mix)

# Dot plot of the same resampled metrics
dotplot(resamps_mix)

# Differences between the two models' resampled metrics; the printed overview
# reports one comparison with Bonferroni p-value adjustment
difValues_mix <- diff(resamps_mix)
difValues_mix

## 
## Call:
## diff.resamples(x = resamps_mix)
## 
## Models: cb_mix, rf_mix 
## Metrics: Accuracy, Kappa 
## Number of differences: 1 
## p-value adjustment: bonferroni

# Estimated metric differences (upper diagonal) and p-values for
# H0: difference = 0 (lower diagonal)
summary(difValues_mix)

## 
## Call:
## summary.diff.resamples(object = difValues_mix)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##        cb_mix rf_mix    
## cb_mix        -0.0005431
## rf_mix 0.9147           
## 
## Kappa 
##        cb_mix rf_mix  
## cb_mix        -0.04649
## rf_mix 0.1781

CLOSE THE CLUSTER

# Shut down the worker processes created with makePSOCKcluster()
stopCluster(cl)
# Switch foreach back to the sequential backend so that later %dopar% / caret
# calls do not try to use the now-closed cluster connections
registerDoSEQ()

STOP GENERAL TIME

# Total time elapsed since the general timer `t` was started
proc.time() - t

##    user  system elapsed 
##   24.98    1.20  370.08

R Notebook

LOAD THE NECESSARY LIBRARIES

START GENERAL TIME

OPEN THE CLUSTER

GET THE DATA

Load the datasets

CLEAN, PREPARE & MANIPULATE THE DATA

All character columns to factor

DEFINE FUNCTIONS

EXECUTE CATBOOST AND RANDOMFOREST IN EACH DATASET

NUMERICAL DATASET

Split in train and test

Train catboost model

Make predictions

Train random forest model

Make predictions

Compare models

CATEGORICAL DATASET

Split in train and test

Train catboost model

Make predictions

Train random forest model

Make predictions

Compare models

MIX DATASET

Split in train and test

Train catboost model

Make predictions

Train random forest model

Make predictions

Compare models

CLOSE THE CLUSTER

STOP GENERAL TIME