This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# For manipulating the datasets
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(readxl)
# For plotting correlation matrix
library(ggcorrplot)
## Loading required package: ggplot2
# Machine Learning library
library(caret)
## Loading required package: lattice
library(catboost)
# For Multi-core processing support
library(parallel)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
# Reference point used at the end of the notebook to report total run time.
t <- proc.time()
# Start a two-worker PSOCK cluster and register it as the parallel backend,
# so train() calls made with allowParallel enabled can use the workers.
cl <- parallel::makePSOCKcluster(2)
doParallel::registerDoParallel(cl)
# Read the three benchmark datasets from the working directory.
# Numerical dataset (Excel workbook)
dataset_num <- readxl::read_excel(path = "rice.xlsx")
# Categorical dataset (CSV file)
dataset_cat <- utils::read.csv(file = "mushrooms.csv")
# Mixed dataset (Excel workbook)
dataset_mix <- readxl::read_excel(path = "bank.xlsx")
# Count the rows per VEIL.TYPE value.
summarise(group_by(dataset_cat, VEIL.TYPE), total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# Drop VEIL.TYPE because it only takes a single value.
dataset_cat <- select(dataset_cat, -VEIL.TYPE)
# Count the rows per STALK.ROOT value.
summarise(group_by(dataset_cat, STALK.ROOT), total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# Drop STALK.ROOT because it has missing values.
dataset_cat <- select(dataset_cat, -STALK.ROOT)
# Store the response of the numerical dataset as a factor so the models are
# trained as classifiers.
dataset_num$CLASS <- as.factor(dataset_num$CLASS)
# Convert every character column to a factor; all other columns are left
# untouched. across() replaces the superseded mutate_if().
dataset_cat <- mutate(dataset_cat, across(where(is.character), as.factor))
dataset_mix <- mutate(dataset_mix, across(where(is.character), as.factor))
#CATBOOST
# Tune and fit a CatBoost classifier through caret.
#
# data_train: data frame or tibble containing the predictor columns plus the
#   response column named CLASS.
# Returns the object produced by caret's train().
train_cb_model <- function(data_train) {
  stopifnot("CLASS" %in% names(data_train))
  # Passing a tibble as `x` raises "Setting row names on a tibble is
  # deprecated", so work on a plain data.frame.
  data_train <- as.data.frame(data_train)
  # 5-fold cross-validation repeated twice; resamples and held-out predictions
  # are kept only for the selected tuning combination.
  fit_control <- trainControl(
    method = "repeatedcv",
    repeats = 2,
    number = 5,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )
  is_predictor <- !(names(data_train) %in% c("CLASS"))
  train(
    # drop = FALSE keeps `x` a data frame even when only one predictor exists.
    x = data_train[, is_predictor, drop = FALSE],
    y = data_train[["CLASS"]],
    method = catboost.caret,
    trControl = fit_control
  )
}
#RANDOM FOREST
# Tune and fit a random forest classifier through caret.
#
# data_train: data frame or tibble containing the predictor columns plus the
#   response column named CLASS; every other column is used as a predictor.
# Returns the object produced by caret's train().
train_rf_model <- function(data_train) {
  # 5-fold cross-validation repeated twice; resamples and held-out predictions
  # are kept only for the selected tuning combination.
  fit_control <- trainControl(
    method = "repeatedcv",
    repeats = 2,
    number = 5,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )
  train(
    CLASS ~ .,
    data = data_train,
    method = "rf",
    trControl = fit_control
  )
}
#Predictions
# Evaluate a fitted model on held-out data.
#
# model: fitted model object accepted by predict().
# data_test: data containing the predictors and the true labels in CLASS.
# Returns the confusionMatrix() result comparing predicted and true labels.
predict_results <- function(model, data_test) {
  predicted_class <- predict(model, data_test)
  observed_class <- as.factor(data_test$CLASS)
  confusionMatrix(predicted_class, observed_class)
}
#EXECUTE CATBOOST AND RANDOMFOREST IN EACH DATASET
##NUMERICAL DATASET
dataset_num
###Split in train and test
# Fix the RNG state so the 80/20 split can be reproduced between runs.
set.seed(123)
# createDataPartition(list = FALSE) returns a one-column matrix, and tibbles
# no longer accept a matrix as row index (deprecated in tibble 3.0.0), so the
# index is flattened to a plain vector first.
trainIndex <- as.vector(
  createDataPartition(dataset_num$CLASS, p = 0.80, list = FALSE)
)
data_train_num <- dataset_num[trainIndex, ]
data_test_num <- dataset_num[-trainIndex, ]
###Train catboost model
# Start the timer for the CatBoost run on the numerical dataset.
t1 <- proc.time()
# Tune and fit CatBoost on the numerical training split.
catboost_model_num <- train_cb_model(data_train_num)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.0498, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## Warning: Setting row names on a tibble is deprecated.
## 0: learn: 0.6562256 total: 144ms remaining: 14.3s
## 1: learn: 0.6224505 total: 147ms remaining: 7.22s
## 2: learn: 0.5912482 total: 150ms remaining: 4.86s
## 3: learn: 0.5637288 total: 153ms remaining: 3.68s
## 4: learn: 0.5374063 total: 156ms remaining: 2.97s
## 5: learn: 0.5138677 total: 159ms remaining: 2.5s
## 6: learn: 0.4918847 total: 162ms remaining: 2.16s
## 7: learn: 0.4719303 total: 165ms remaining: 1.9s
## 8: learn: 0.4530863 total: 168ms remaining: 1.7s
## 9: learn: 0.4357108 total: 171ms remaining: 1.54s
## 10: learn: 0.4193844 total: 175ms remaining: 1.42s
## 11: learn: 0.4041877 total: 179ms remaining: 1.31s
## 12: learn: 0.3908262 total: 182ms remaining: 1.22s
## 13: learn: 0.3782138 total: 185ms remaining: 1.14s
## 14: learn: 0.3663505 total: 188ms remaining: 1.06s
## 15: learn: 0.3547899 total: 191ms remaining: 1s
## 16: learn: 0.3439707 total: 194ms remaining: 948ms
## 17: learn: 0.3339882 total: 197ms remaining: 898ms
## 18: learn: 0.3247707 total: 203ms remaining: 864ms
## 19: learn: 0.3158970 total: 206ms remaining: 823ms
## 20: learn: 0.3076138 total: 209ms remaining: 786ms
## 21: learn: 0.2997156 total: 212ms remaining: 752ms
## 22: learn: 0.2923779 total: 215ms remaining: 720ms
## 23: learn: 0.2858883 total: 218ms remaining: 691ms
## 24: learn: 0.2794182 total: 221ms remaining: 663ms
## 25: learn: 0.2735185 total: 224ms remaining: 638ms
## 26: learn: 0.2678042 total: 227ms remaining: 614ms
## 27: learn: 0.2627088 total: 230ms remaining: 592ms
## 28: learn: 0.2576216 total: 234ms remaining: 574ms
## 29: learn: 0.2526883 total: 237ms remaining: 553ms
## 30: learn: 0.2478236 total: 240ms remaining: 534ms
## 31: learn: 0.2435743 total: 243ms remaining: 516ms
## 32: learn: 0.2399787 total: 247ms remaining: 501ms
## 33: learn: 0.2361181 total: 251ms remaining: 487ms
## 34: learn: 0.2326336 total: 254ms remaining: 471ms
## 35: learn: 0.2292718 total: 257ms remaining: 458ms
## 36: learn: 0.2261385 total: 260ms remaining: 443ms
## 37: learn: 0.2230610 total: 263ms remaining: 430ms
## 38: learn: 0.2199877 total: 266ms remaining: 416ms
## 39: learn: 0.2173585 total: 269ms remaining: 404ms
## 40: learn: 0.2147473 total: 273ms remaining: 392ms
## 41: learn: 0.2124027 total: 275ms remaining: 380ms
## 42: learn: 0.2099111 total: 279ms remaining: 369ms
## 43: learn: 0.2078079 total: 282ms remaining: 359ms
## 44: learn: 0.2057913 total: 285ms remaining: 348ms
## 45: learn: 0.2036273 total: 288ms remaining: 338ms
## 46: learn: 0.2016668 total: 291ms remaining: 328ms
## 47: learn: 0.1997976 total: 294ms remaining: 319ms
## 48: learn: 0.1983207 total: 297ms remaining: 309ms
## 49: learn: 0.1968982 total: 300ms remaining: 300ms
## 50: learn: 0.1950934 total: 303ms remaining: 291ms
## 51: learn: 0.1932238 total: 306ms remaining: 282ms
## 52: learn: 0.1919146 total: 309ms remaining: 274ms
## 53: learn: 0.1904395 total: 312ms remaining: 266ms
## 54: learn: 0.1887808 total: 315ms remaining: 257ms
## 55: learn: 0.1875906 total: 318ms remaining: 250ms
## 56: learn: 0.1861783 total: 322ms remaining: 243ms
## 57: learn: 0.1848646 total: 325ms remaining: 236ms
## 58: learn: 0.1838002 total: 329ms remaining: 229ms
## 59: learn: 0.1827546 total: 332ms remaining: 221ms
## 60: learn: 0.1815981 total: 335ms remaining: 214ms
## 61: learn: 0.1808303 total: 338ms remaining: 207ms
## 62: learn: 0.1799687 total: 341ms remaining: 200ms
## 63: learn: 0.1793507 total: 344ms remaining: 194ms
## 64: learn: 0.1782863 total: 347ms remaining: 187ms
## 65: learn: 0.1771588 total: 350ms remaining: 181ms
## 66: learn: 0.1763324 total: 354ms remaining: 174ms
## 67: learn: 0.1757985 total: 357ms remaining: 168ms
## 68: learn: 0.1748335 total: 360ms remaining: 162ms
## 69: learn: 0.1739295 total: 363ms remaining: 156ms
## 70: learn: 0.1731076 total: 366ms remaining: 149ms
## 71: learn: 0.1722549 total: 369ms remaining: 143ms
## 72: learn: 0.1716651 total: 372ms remaining: 138ms
## 73: learn: 0.1709407 total: 375ms remaining: 132ms
## 74: learn: 0.1699720 total: 378ms remaining: 126ms
## 75: learn: 0.1689495 total: 381ms remaining: 120ms
## 76: learn: 0.1684102 total: 384ms remaining: 115ms
## 77: learn: 0.1678780 total: 387ms remaining: 109ms
## 78: learn: 0.1669825 total: 391ms remaining: 104ms
## 79: learn: 0.1663556 total: 394ms remaining: 98.6ms
## 80: learn: 0.1657322 total: 397ms remaining: 93.2ms
## 81: learn: 0.1650136 total: 401ms remaining: 88ms
## 82: learn: 0.1643945 total: 404ms remaining: 82.8ms
## 83: learn: 0.1638710 total: 407ms remaining: 77.6ms
## 84: learn: 0.1633685 total: 410ms remaining: 72.4ms
## 85: learn: 0.1627107 total: 413ms remaining: 67.3ms
## 86: learn: 0.1622749 total: 416ms remaining: 62.2ms
## 87: learn: 0.1618455 total: 420ms remaining: 57.2ms
## 88: learn: 0.1613752 total: 423ms remaining: 52.2ms
## 89: learn: 0.1608166 total: 426ms remaining: 47.3ms
## 90: learn: 0.1602857 total: 429ms remaining: 42.4ms
## 91: learn: 0.1598929 total: 431ms remaining: 37.5ms
## 92: learn: 0.1594210 total: 434ms remaining: 32.7ms
## 93: learn: 0.1587583 total: 438ms remaining: 27.9ms
## 94: learn: 0.1581888 total: 441ms remaining: 23.2ms
## 95: learn: 0.1577363 total: 444ms remaining: 18.5ms
## 96: learn: 0.1572438 total: 447ms remaining: 13.8ms
## 97: learn: 0.1568425 total: 451ms remaining: 9.2ms
## 98: learn: 0.1564707 total: 454ms remaining: 4.58ms
## 99: learn: 0.1559826 total: 457ms remaining: 0us
# Print the fitted model: resampling results per tuning combination and the
# values selected for the final model.
catboost_model_num
## Catboost
##
## 3048 samples
## 7 predictor
## 2 classes: 'Cammeo', 'Osmancik'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2438, 2438, 2438, 2440, 2438, 2438, ...
## Resampling results across tuning parameters:
##
## depth learning_rate Accuracy Kappa
## 2 0.04978707 0.9286427 0.8540650
## 2 0.13533528 0.9286427 0.8540676
## 2 0.36787944 0.9232291 0.8428829
## 2 1.00000000 0.9169969 0.8302178
## 4 0.04978707 0.9291340 0.8549903
## 4 0.13533528 0.9261821 0.8490582
## 4 0.36787944 0.9140380 0.8242190
## 4 1.00000000 0.9001019 0.7962228
## 6 0.04978707 0.9299547 0.8567736
## 6 0.13533528 0.9247023 0.8460398
## 6 0.36787944 0.9138767 0.8239142
## 6 1.00000000 0.8813212 0.7609330
##
## Tuning parameter 'iterations' was held constant at a value of 100
##
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
## parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
## 0.04978707, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and
## border_count = 255.
# Time elapsed since t1; the interval covers the training call and the
# printing of the model between the two timestamps.
proc.time()-t1
## user system elapsed
## 1.70 0.10 51.92
###Make predictions
# Confusion matrix of the CatBoost model on the numerical test split.
catboost_pred_num <- predict_results(catboost_model_num,data_test_num)
catboost_pred_num
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cammeo Osmancik
## Cammeo 297 30
## Osmancik 29 406
##
## Accuracy : 0.9226
## 95% CI : (0.9013, 0.9405)
## No Information Rate : 0.5722
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8419
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9110
## Specificity : 0.9312
## Pos Pred Value : 0.9083
## Neg Pred Value : 0.9333
## Prevalence : 0.4278
## Detection Rate : 0.3898
## Detection Prevalence : 0.4291
## Balanced Accuracy : 0.9211
##
## 'Positive' Class : Cammeo
##
###Train random forest model
# Restart the timer for the random forest run on the numerical dataset.
t1 <- proc.time()
# Tune and fit the random forest on the numerical training split.
rf_model_num <- train_rf_model(data_train_num)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 4 on full training set
# Print the fitted model: resampling results per mtry value and the value
# selected for the final model.
rf_model_num
## Random Forest
##
## 3048 samples
## 7 predictor
## 2 classes: 'Cammeo', 'Osmancik'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2438, 2438, 2438, 2440, 2438, 2439, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9237155 0.8440430
## 4 0.9238789 0.8443793
## 7 0.9232251 0.8429397
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Time elapsed since t1; the interval covers the training call and the
# printing of the model between the two timestamps.
proc.time()-t1
## user system elapsed
## 1.95 0.02 18.38
###Make predictions
# Confusion matrix of the random forest on the numerical test split.
rf_pred_num <- predict_results(rf_model_num,data_test_num)
rf_pred_num
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cammeo Osmancik
## Cammeo 299 37
## Osmancik 27 399
##
## Accuracy : 0.916
## 95% CI : (0.894, 0.9347)
## No Information Rate : 0.5722
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8291
##
## Mcnemar's Test P-Value : 0.2606
##
## Sensitivity : 0.9172
## Specificity : 0.9151
## Pos Pred Value : 0.8899
## Neg Pred Value : 0.9366
## Prevalence : 0.4278
## Detection Rate : 0.3924
## Detection Prevalence : 0.4409
## Balanced Accuracy : 0.9162
##
## 'Positive' Class : Cammeo
##
##CATEGORICAL DATASET
dataset_cat
###Split in train and test
# Fix the RNG state so the 80/20 split can be reproduced between runs.
set.seed(123)
# createDataPartition(list = FALSE) returns a one-column matrix; flatten it
# to a plain vector so row indexing does not depend on matrix handling.
trainIndex <- as.vector(
  createDataPartition(dataset_cat$CLASS, p = 0.80, list = FALSE)
)
data_train_cat <- dataset_cat[trainIndex, ]
data_test_cat <- dataset_cat[-trainIndex, ]
###Train catboost model
# Restart the timer for the CatBoost run on the categorical dataset.
t1 <- proc.time()
# Tune and fit CatBoost on the categorical training split.
catboost_model_cat <- train_cb_model(data_train_cat)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.0498, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## 0: learn: 0.4922908 total: 11.8ms remaining: 1.17s
## 1: learn: 0.3893184 total: 23.5ms remaining: 1.15s
## 2: learn: 0.3157816 total: 33.1ms remaining: 1.07s
## 3: learn: 0.2526762 total: 41.6ms remaining: 999ms
## 4: learn: 0.1973376 total: 48.2ms remaining: 915ms
## 5: learn: 0.1582169 total: 56.3ms remaining: 882ms
## 6: learn: 0.1326078 total: 61.9ms remaining: 823ms
## 7: learn: 0.1215008 total: 66.1ms remaining: 760ms
## 8: learn: 0.0947080 total: 74.8ms remaining: 756ms
## 9: learn: 0.0696482 total: 83.6ms remaining: 753ms
## 10: learn: 0.0512432 total: 92.3ms remaining: 747ms
## 11: learn: 0.0404933 total: 100ms remaining: 737ms
## 12: learn: 0.0280777 total: 109ms remaining: 730ms
## 13: learn: 0.0181843 total: 117ms remaining: 720ms
## 14: learn: 0.0147062 total: 125ms remaining: 711ms
## 15: learn: 0.0104951 total: 134ms remaining: 705ms
## 16: learn: 0.0080495 total: 144ms remaining: 702ms
## 17: learn: 0.0068975 total: 152ms remaining: 693ms
## 18: learn: 0.0060270 total: 161ms remaining: 684ms
## 19: learn: 0.0042476 total: 169ms remaining: 676ms
## 20: learn: 0.0038424 total: 177ms remaining: 667ms
## 21: learn: 0.0034744 total: 186ms remaining: 659ms
## 22: learn: 0.0031707 total: 194ms remaining: 649ms
## 23: learn: 0.0027084 total: 202ms remaining: 640ms
## 24: learn: 0.0024464 total: 211ms remaining: 633ms
## 25: learn: 0.0023048 total: 220ms remaining: 626ms
## 26: learn: 0.0020885 total: 228ms remaining: 616ms
## 27: learn: 0.0019479 total: 236ms remaining: 607ms
## 28: learn: 0.0018037 total: 244ms remaining: 599ms
## 29: learn: 0.0016826 total: 253ms remaining: 589ms
## 30: learn: 0.0014034 total: 261ms remaining: 581ms
## 31: learn: 0.0012063 total: 269ms remaining: 572ms
## 32: learn: 0.0009502 total: 277ms remaining: 563ms
## 33: learn: 0.0007953 total: 286ms remaining: 556ms
## 34: learn: 0.0007671 total: 295ms remaining: 547ms
## 35: learn: 0.0006287 total: 303ms remaining: 538ms
## 36: learn: 0.0005470 total: 311ms remaining: 530ms
## 37: learn: 0.0004976 total: 320ms remaining: 522ms
## 38: learn: 0.0004641 total: 328ms remaining: 513ms
## 39: learn: 0.0004142 total: 336ms remaining: 504ms
## 40: learn: 0.0003724 total: 344ms remaining: 495ms
## 41: learn: 0.0003139 total: 352ms remaining: 486ms
## 42: learn: 0.0002955 total: 360ms remaining: 478ms
## 43: learn: 0.0002592 total: 369ms remaining: 469ms
## 44: learn: 0.0002383 total: 380ms remaining: 464ms
## 45: learn: 0.0002233 total: 390ms remaining: 458ms
## 46: learn: 0.0002171 total: 398ms remaining: 448ms
## 47: learn: 0.0001861 total: 405ms remaining: 439ms
## 48: learn: 0.0001828 total: 413ms remaining: 430ms
## 49: learn: 0.0001786 total: 422ms remaining: 422ms
## 50: learn: 0.0001693 total: 430ms remaining: 413ms
## 51: learn: 0.0001633 total: 439ms remaining: 405ms
## 52: learn: 0.0001485 total: 448ms remaining: 397ms
## 53: learn: 0.0001389 total: 456ms remaining: 389ms
## 54: learn: 0.0001218 total: 465ms remaining: 381ms
## 55: learn: 0.0001189 total: 474ms remaining: 372ms
## 56: learn: 0.0001144 total: 482ms remaining: 363ms
## 57: learn: 0.0001100 total: 490ms remaining: 355ms
## 58: learn: 0.0001030 total: 497ms remaining: 346ms
## 59: learn: 0.0000914 total: 506ms remaining: 338ms
## 60: learn: 0.0000826 total: 515ms remaining: 329ms
## 61: learn: 0.0000794 total: 523ms remaining: 321ms
## 62: learn: 0.0000771 total: 531ms remaining: 312ms
## 63: learn: 0.0000695 total: 539ms remaining: 303ms
## 64: learn: 0.0000695 total: 546ms remaining: 294ms
## 65: learn: 0.0000673 total: 554ms remaining: 285ms
## 66: learn: 0.0000673 total: 562ms remaining: 277ms
## 67: learn: 0.0000638 total: 570ms remaining: 268ms
## 68: learn: 0.0000638 total: 578ms remaining: 260ms
## 69: learn: 0.0000638 total: 586ms remaining: 251ms
## 70: learn: 0.0000638 total: 593ms remaining: 242ms
## 71: learn: 0.0000597 total: 602ms remaining: 234ms
## 72: learn: 0.0000597 total: 609ms remaining: 225ms
## 73: learn: 0.0000542 total: 618ms remaining: 217ms
## 74: learn: 0.0000538 total: 626ms remaining: 209ms
## 75: learn: 0.0000538 total: 633ms remaining: 200ms
## 76: learn: 0.0000473 total: 641ms remaining: 192ms
## 77: learn: 0.0000459 total: 649ms remaining: 183ms
## 78: learn: 0.0000452 total: 658ms remaining: 175ms
## 79: learn: 0.0000452 total: 666ms remaining: 167ms
## 80: learn: 0.0000438 total: 674ms remaining: 158ms
## 81: learn: 0.0000429 total: 682ms remaining: 150ms
## 82: learn: 0.0000412 total: 689ms remaining: 141ms
## 83: learn: 0.0000412 total: 697ms remaining: 133ms
## 84: learn: 0.0000394 total: 705ms remaining: 124ms
## 85: learn: 0.0000394 total: 712ms remaining: 116ms
## 86: learn: 0.0000394 total: 720ms remaining: 108ms
## 87: learn: 0.0000377 total: 728ms remaining: 99.3ms
## 88: learn: 0.0000377 total: 736ms remaining: 91ms
## 89: learn: 0.0000364 total: 744ms remaining: 82.7ms
## 90: learn: 0.0000364 total: 752ms remaining: 74.4ms
## 91: learn: 0.0000364 total: 760ms remaining: 66.1ms
## 92: learn: 0.0000364 total: 768ms remaining: 57.8ms
## 93: learn: 0.0000364 total: 776ms remaining: 49.5ms
## 94: learn: 0.0000364 total: 783ms remaining: 41.2ms
## 95: learn: 0.0000324 total: 791ms remaining: 33ms
## 96: learn: 0.0000316 total: 800ms remaining: 24.7ms
## 97: learn: 0.0000316 total: 808ms remaining: 16.5ms
## 98: learn: 0.0000316 total: 816ms remaining: 8.24ms
## 99: learn: 0.0000310 total: 824ms remaining: 0us
# Print the fitted model: resampling results per tuning combination and the
# values selected for the final model.
catboost_model_cat
## Catboost
##
## 6500 samples
## 20 predictor
## 2 classes: 'e', 'p'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 5199, 5201, 5200, 5200, 5200, 5200, ...
## Resampling results across tuning parameters:
##
## depth learning_rate Accuracy Kappa
## 2 0.04978707 0.9996922 0.9993836
## 2 0.13533528 0.9996922 0.9993836
## 2 0.36787944 0.9997691 0.9995376
## 2 1.00000000 0.9998461 0.9996918
## 4 0.04978707 0.9998460 0.9996916
## 4 0.13533528 0.9999230 0.9998458
## 4 0.36787944 0.9999230 0.9998458
## 4 1.00000000 0.9629231 0.9275693
## 6 0.04978707 1.0000000 1.0000000
## 6 0.13533528 1.0000000 1.0000000
## 6 0.36787944 1.0000000 1.0000000
## 6 1.00000000 0.9697460 0.9399195
##
## Tuning parameter 'iterations' was held constant at a value of 100
##
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
## parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
## 0.04978707, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and
## border_count = 255.
# Time elapsed since t1; the interval covers the training call and the
# printing of the model between the two timestamps.
proc.time()-t1
## user system elapsed
## 4.05 0.40 87.73
###Make predictions
# Confusion matrix of the CatBoost model on the categorical test split.
catboost_pred_cat <- predict_results(catboost_model_cat,data_test_cat)
catboost_pred_cat
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 841 0
## p 0 783
##
## Accuracy : 1
## 95% CI : (0.9977, 1)
## No Information Rate : 0.5179
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5179
## Detection Rate : 0.5179
## Detection Prevalence : 0.5179
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : e
##
###Train random forest model
# Restart the timer for the random forest run on the categorical dataset.
t1 <- proc.time()
# Tune and fit the random forest on the categorical training split.
rf_model_cat <- train_rf_model(data_train_cat)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 46 on full training set
# Print the fitted model: resampling results per mtry value and the value
# selected for the final model.
rf_model_cat
## Random Forest
##
## 6500 samples
## 20 predictor
## 2 classes: 'e', 'p'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 5199, 5200, 5201, 5200, 5200, 5200, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9541554 0.907873
## 46 1.0000000 1.000000
## 91 1.0000000 1.000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 46.
# Time elapsed since t1; the interval covers the training call and the
# printing of the model between the two timestamps.
proc.time()-t1
## user system elapsed
## 15.77 0.08 271.44
###Make predictions
# Confusion matrix of the random forest on the categorical test split.
rf_pred_cat <- predict_results(rf_model_cat,data_test_cat)
rf_pred_cat
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 841 0
## p 0 783
##
## Accuracy : 1
## 95% CI : (0.9977, 1)
## No Information Rate : 0.5179
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5179
## Detection Rate : 0.5179
## Detection Prevalence : 0.5179
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : e
##
##MIX DATASET
dataset_mix
###Split in train and test
# Fix the RNG state so the 80/20 split can be reproduced between runs.
set.seed(123)
# createDataPartition(list = FALSE) returns a one-column matrix, and tibbles
# no longer accept a matrix as row index (deprecated in tibble 3.0.0), so the
# index is flattened to a plain vector first.
trainIndex <- as.vector(
  createDataPartition(dataset_mix$CLASS, p = 0.80, list = FALSE)
)
data_train_mix <- dataset_mix[trainIndex, ]
data_test_mix <- dataset_mix[-trainIndex, ]
###Train catboost model
# Restart the timer for the CatBoost run on the mixed dataset.
t1 <- proc.time()
# Tune and fit CatBoost on the mixed training split.
catboost_model_mix <- train_cb_model(data_train_mix)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## Warning: Setting row names on a tibble is deprecated.
## 0: learn: 0.6118936 total: 8.69ms remaining: 861ms
## 1: learn: 0.5499721 total: 12.3ms remaining: 602ms
## 2: learn: 0.4980993 total: 16.1ms remaining: 521ms
## 3: learn: 0.4575281 total: 19.8ms remaining: 475ms
## 4: learn: 0.4259942 total: 23.7ms remaining: 451ms
## 5: learn: 0.4002412 total: 31.6ms remaining: 495ms
## 6: learn: 0.3782158 total: 35.1ms remaining: 467ms
## 7: learn: 0.3599250 total: 39.5ms remaining: 455ms
## 8: learn: 0.3422690 total: 46.1ms remaining: 466ms
## 9: learn: 0.3257866 total: 50.1ms remaining: 451ms
## 10: learn: 0.3127245 total: 53.8ms remaining: 435ms
## 11: learn: 0.3022354 total: 57.7ms remaining: 423ms
## 12: learn: 0.2940524 total: 61.2ms remaining: 410ms
## 13: learn: 0.2858524 total: 64.7ms remaining: 397ms
## 14: learn: 0.2790922 total: 68.7ms remaining: 389ms
## 15: learn: 0.2718715 total: 73ms remaining: 383ms
## 16: learn: 0.2654991 total: 76.5ms remaining: 373ms
## 17: learn: 0.2593208 total: 80.5ms remaining: 367ms
## 18: learn: 0.2545782 total: 84.5ms remaining: 360ms
## 19: learn: 0.2533133 total: 86.5ms remaining: 346ms
## 20: learn: 0.2489868 total: 90.3ms remaining: 340ms
## 21: learn: 0.2426215 total: 94.1ms remaining: 334ms
## 22: learn: 0.2381686 total: 97.8ms remaining: 327ms
## 23: learn: 0.2333104 total: 102ms remaining: 321ms
## 24: learn: 0.2288080 total: 105ms remaining: 316ms
## 25: learn: 0.2267367 total: 110ms remaining: 312ms
## 26: learn: 0.2237066 total: 114ms remaining: 307ms
## 27: learn: 0.2211292 total: 118ms remaining: 303ms
## 28: learn: 0.2188874 total: 122ms remaining: 299ms
## 29: learn: 0.2172067 total: 126ms remaining: 294ms
## 30: learn: 0.2142067 total: 130ms remaining: 288ms
## 31: learn: 0.2120427 total: 134ms remaining: 284ms
## 32: learn: 0.2099689 total: 138ms remaining: 280ms
## 33: learn: 0.2085930 total: 142ms remaining: 275ms
## 34: learn: 0.2071417 total: 146ms remaining: 270ms
## 35: learn: 0.2046751 total: 149ms remaining: 265ms
## 36: learn: 0.2025437 total: 153ms remaining: 260ms
## 37: learn: 0.2023686 total: 154ms remaining: 252ms
## 38: learn: 0.2000917 total: 158ms remaining: 247ms
## 39: learn: 0.1993683 total: 162ms remaining: 243ms
## 40: learn: 0.1981843 total: 166ms remaining: 239ms
## 41: learn: 0.1960681 total: 170ms remaining: 235ms
## 42: learn: 0.1936508 total: 174ms remaining: 230ms
## 43: learn: 0.1934037 total: 176ms remaining: 224ms
## 44: learn: 0.1927199 total: 180ms remaining: 220ms
## 45: learn: 0.1918260 total: 185ms remaining: 217ms
## 46: learn: 0.1909057 total: 188ms remaining: 212ms
## 47: learn: 0.1905664 total: 193ms remaining: 209ms
## 48: learn: 0.1891510 total: 196ms remaining: 204ms
## 49: learn: 0.1882901 total: 200ms remaining: 200ms
## 50: learn: 0.1872299 total: 204ms remaining: 196ms
## 51: learn: 0.1853002 total: 208ms remaining: 192ms
## 52: learn: 0.1843454 total: 211ms remaining: 187ms
## 53: learn: 0.1832187 total: 215ms remaining: 183ms
## 54: learn: 0.1822790 total: 219ms remaining: 179ms
## 55: learn: 0.1813353 total: 223ms remaining: 175ms
## 56: learn: 0.1799829 total: 227ms remaining: 171ms
## 57: learn: 0.1785457 total: 231ms remaining: 167ms
## 58: learn: 0.1773562 total: 235ms remaining: 164ms
## 59: learn: 0.1755507 total: 239ms remaining: 160ms
## 60: learn: 0.1741962 total: 243ms remaining: 155ms
## 61: learn: 0.1736540 total: 247ms remaining: 151ms
## 62: learn: 0.1713553 total: 251ms remaining: 147ms
## 63: learn: 0.1698708 total: 255ms remaining: 144ms
## 64: learn: 0.1686851 total: 259ms remaining: 140ms
## 65: learn: 0.1674396 total: 263ms remaining: 136ms
## 66: learn: 0.1669483 total: 267ms remaining: 132ms
## 67: learn: 0.1661536 total: 271ms remaining: 128ms
## 68: learn: 0.1652897 total: 275ms remaining: 124ms
## 69: learn: 0.1648333 total: 279ms remaining: 120ms
## 70: learn: 0.1635274 total: 283ms remaining: 115ms
## 71: learn: 0.1619404 total: 286ms remaining: 111ms
## 72: learn: 0.1606925 total: 290ms remaining: 107ms
## 73: learn: 0.1595061 total: 294ms remaining: 103ms
## 74: learn: 0.1589048 total: 298ms remaining: 99.4ms
## 75: learn: 0.1579914 total: 301ms remaining: 95.2ms
## 76: learn: 0.1569769 total: 305ms remaining: 91.2ms
## 77: learn: 0.1565810 total: 309ms remaining: 87.2ms
## 78: learn: 0.1562802 total: 313ms remaining: 83.2ms
## 79: learn: 0.1550379 total: 317ms remaining: 79.2ms
## 80: learn: 0.1540212 total: 321ms remaining: 75.2ms
## 81: learn: 0.1530705 total: 324ms remaining: 71.1ms
## 82: learn: 0.1521065 total: 328ms remaining: 67.2ms
## 83: learn: 0.1500865 total: 333ms remaining: 63.4ms
## 84: learn: 0.1489228 total: 337ms remaining: 59.4ms
## 85: learn: 0.1482246 total: 341ms remaining: 55.5ms
## 86: learn: 0.1472080 total: 345ms remaining: 51.5ms
## 87: learn: 0.1459424 total: 349ms remaining: 47.5ms
## 88: learn: 0.1455194 total: 352ms remaining: 43.6ms
## 89: learn: 0.1442697 total: 356ms remaining: 39.6ms
## 90: learn: 0.1437398 total: 360ms remaining: 35.6ms
## 91: learn: 0.1421481 total: 364ms remaining: 31.6ms
## 92: learn: 0.1415796 total: 367ms remaining: 27.7ms
## 93: learn: 0.1404977 total: 371ms remaining: 23.7ms
## 94: learn: 0.1387489 total: 375ms remaining: 19.7ms
## 95: learn: 0.1378152 total: 379ms remaining: 15.8ms
## 96: learn: 0.1371539 total: 383ms remaining: 11.8ms
## 97: learn: 0.1365447 total: 386ms remaining: 7.89ms
## 98: learn: 0.1356621 total: 390ms remaining: 3.94ms
## 99: learn: 0.1350601 total: 394ms remaining: 0us
# Print the fitted model: resampling results per tuning combination and the
# values selected for the final model.
catboost_model_mix
## Catboost
##
## 3617 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2894, 2894, 2893, 2893, 2894, 2893, ...
## Resampling results across tuning parameters:
##
## depth learning_rate Accuracy Kappa
## 2 0.04978707 0.8959089 0.2769385
## 2 0.13533528 0.8985328 0.3625356
## 2 0.36787944 0.8972890 0.3930061
## 2 1.00000000 0.8943857 0.4039987
## 4 0.04978707 0.8972882 0.3168869
## 4 0.13533528 0.9004696 0.4002256
## 4 0.36787944 0.8971503 0.4191668
## 4 1.00000000 0.8885785 0.4073606
## 6 0.04978707 0.8972888 0.3331885
## 6 0.13533528 0.9014351 0.4130435
## 6 0.36787944 0.8935549 0.3878588
## 6 1.00000000 0.8801548 0.3736351
##
## Tuning parameter 'iterations' was held constant at a value of 100
##
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
## parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
## 0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
## = 255.
# Time elapsed since t1; the interval covers the training call and the
# printing of the model between the two timestamps.
proc.time()-t1
## user system elapsed
## 2.20 0.14 51.02
###Make predictions
# Confusion matrix of the CatBoost model on the mixed test split.
catboost_pred_mix <- predict_results(catboost_model_mix,data_test_mix)
catboost_pred_mix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 774 64
## yes 26 40
##
## Accuracy : 0.9004
## 95% CI : (0.879, 0.9192)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.07758
##
## Kappa : 0.4187
##
## Mcnemar's Test P-Value : 9.614e-05
##
## Sensitivity : 0.9675
## Specificity : 0.3846
## Pos Pred Value : 0.9236
## Neg Pred Value : 0.6061
## Prevalence : 0.8850
## Detection Rate : 0.8562
## Detection Prevalence : 0.9270
## Balanced Accuracy : 0.6761
##
## 'Positive' Class : no
##
###Train random forest model
# Restart the timer for the random forest run on the mixed dataset.
t1 <- proc.time()
# Tune and fit the random forest on the mixed training split.
rf_model_mix <- train_rf_model(data_train_mix)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 22 on full training set
# Print the fitted model: resampling results per mtry value and the value
# selected for the final model.
rf_model_mix
## Random Forest
##
## 3617 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2894, 2894, 2894, 2893, 2893, 2893, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8848506 0.01635529
## 22 0.8959093 0.39753611
## 42 0.8953564 0.41016563
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 22.
# Time elapsed since t1; the interval covers the training call and the
# printing of the model between the two timestamps.
proc.time()-t1
## user system elapsed
## 8.17 0.03 119.60
###Make predictions
# Confusion matrix of the random forest on the mixed test split.
rf_pred_mix <- predict_results(rf_model_mix,data_test_mix)
rf_pred_mix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 775 61
## yes 25 43
##
## Accuracy : 0.9049
## 95% CI : (0.8838, 0.9232)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.0315601
##
## Kappa : 0.45
##
## Mcnemar's Test P-Value : 0.0001606
##
## Sensitivity : 0.9688
## Specificity : 0.4135
## Pos Pred Value : 0.9270
## Neg Pred Value : 0.6324
## Prevalence : 0.8850
## Detection Rate : 0.8573
## Detection Prevalence : 0.9248
## Balanced Accuracy : 0.6911
##
## 'Positive' Class : no
##
# Shut down the worker cluster created with makePSOCKcluster().
stopCluster(cl)
# Total time elapsed since the timer `t` was started before the cluster setup.
proc.time()-t
## user system elapsed
## 35.62 1.14 602.95