# For manipulating the datasets
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(readxl)
# For plotting correlation matrix
library(ggcorrplot)
## Loading required package: ggplot2
# Machine Learning library
library(caret)
## Loading required package: lattice
library(catboost)
# For Multi-core processing support
library(parallel)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
# Reference point for the total run time reported at the end of the script.
t <- proc.time()

# Two-worker PSOCK cluster registered as the foreach backend; the training
# helpers below request parallel resampling via allowParallel.
cl <- makePSOCKcluster(2)
registerDoParallel(cl)

# Numerical dataset
dataset_num <- read_excel("rice.xlsx")
# Categorical dataset
dataset_cat <- read.csv("mushrooms.csv")
# Mixed (numerical + categorical) dataset
dataset_mix <- read_excel("bank.xlsx")

# Drop VEIL.TYPE (it only has one value), STALK.ROOT (it has missing values)
# and a further set of attributes that the original note calls "important".
# NOTE(review): presumably these were removed so the task is not trivially
# solved by a few dominant predictors -- confirm with the author.
dataset_cat <- select(
  dataset_cat,
  -VEIL.TYPE, -STALK.ROOT, -ODOR, -SPORE.PRINT.COLOR, -GILL.COLOR,
  -GILL.SIZE, -HABITAT, -POPULATION, -STALK.SURFACE.ABOVE.RING,
  -CAP.COLOR, -RING.TYPE, -STALK.SURFACE.BELOW.RING
)

# The outcome of the numerical dataset is converted explicitly; in the other
# two datasets every character column (outcome included) becomes a factor.
dataset_num$CLASS <- as.factor(dataset_num$CLASS)
dataset_cat <- dataset_cat %>% mutate_if(is.character, as.factor)
dataset_mix <- dataset_mix %>% mutate_if(is.character, as.factor)
#CATBOOST
# Tune and fit a CatBoost classifier through caret.
#
# data_train: data frame (or tibble) with the predictors and the outcome in a
#             column named CLASS.
# Returns the object produced by caret::train(); resampling is 5-fold
# cross-validation repeated twice, and only the resamples/predictions of the
# final tuning combination are kept.
train_cb_model <- function(data_train) {
  stopifnot("CLASS" %in% names(data_train))

  # Tibbles passed through the x/y interface trigger "Setting row names on a
  # tibble is deprecated" during the final fit; a plain data.frame does not.
  data_train <- as.data.frame(data_train)

  fitControl <- trainControl(
    method = "repeatedcv",
    repeats = 2,
    number = 5,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )

  # drop = FALSE keeps a data.frame even when only one predictor remains.
  predictor_cols <- setdiff(names(data_train), "CLASS")
  train(
    x = data_train[, predictor_cols, drop = FALSE],
    y = data_train$CLASS,
    method = catboost.caret,
    trControl = fitControl
  )
}
#RANDOM FOREST
# Tune and fit a random forest classifier through caret.
#
# data_train: data frame (or tibble) with the predictors and the outcome in a
#             column named CLASS; every other column is used as a predictor.
# Returns the object produced by caret::train(); resampling is 5-fold
# cross-validation repeated twice, and only the resamples/predictions of the
# final tuning combination are kept.
train_rf_model <- function(data_train) {
  stopifnot("CLASS" %in% names(data_train))

  fitControl <- trainControl(
    method = "repeatedcv",
    repeats = 2,
    number = 5,
    returnResamp = "final",
    savePredictions = "final",
    verboseIter = TRUE,
    allowParallel = TRUE
  )

  train(
    CLASS ~ .,
    data = data_train,
    method = "rf",
    trControl = fitControl
  )
}
#Predictions
# Score a fitted model on a test set and summarise it as a confusion matrix.
#
# model:     fitted model accepted by predict() (here: caret train objects).
# data_test: data frame with the predictors and the true labels in CLASS.
# positive:  optional name of the class to report as the "positive" one;
#            NULL (default) keeps confusionMatrix()'s own choice, i.e. the
#            behaviour of the original two-argument call.
# Returns the object produced by caret::confusionMatrix().
predict_results <- function(model, data_test, positive = NULL) {
  stopifnot("CLASS" %in% names(data_test))
  predictions <- predict(model, data_test)
  confusionMatrix(
    predictions,
    as.factor(data_test$CLASS),
    positive = positive
  )
}
dataset_num
# Fixed seed so the 80/20 split is reproducible.
set.seed(123)
# createDataPartition(list = FALSE) returns a one-column matrix, and tibbles
# (dataset_num comes from read_excel) warn when row-indexed by a matrix.
# Keeping only that column yields a plain integer vector of row numbers.
trainIndex <- createDataPartition(dataset_num$CLASS, p = 0.80, list = FALSE)[, 1]
data_train_num <- dataset_num[trainIndex, ]
data_test_num <- dataset_num[-trainIndex, ]
# Seed again right before training: caret draws the repeated-CV folds from the
# current RNG state, so the folds become reproducible, and any other train()
# call on data_train_num preceded by the same seed gets identical folds. The
# resamples()/diff() comparison further down presupposes such common folds.
set.seed(123)
#Start time
t1 <- proc.time()
catboost_model_num <- train_cb_model(data_train_num)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.0498, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## Warning: Setting row names on a tibble is deprecated.
## 0: learn: 0.6568135 total: 141ms remaining: 13.9s
## 1: learn: 0.6236078 total: 144ms remaining: 7.05s
## 2: learn: 0.5936251 total: 147ms remaining: 4.75s
## 3: learn: 0.5662970 total: 150ms remaining: 3.6s
## 4: learn: 0.5405178 total: 153ms remaining: 2.9s
## 5: learn: 0.5171789 total: 156ms remaining: 2.45s
## 6: learn: 0.4956609 total: 160ms remaining: 2.12s
## 7: learn: 0.4761940 total: 163ms remaining: 1.87s
## 8: learn: 0.4576017 total: 166ms remaining: 1.68s
## 9: learn: 0.4406181 total: 169ms remaining: 1.52s
## 10: learn: 0.4247206 total: 172ms remaining: 1.39s
## 11: learn: 0.4099156 total: 175ms remaining: 1.28s
## 12: learn: 0.3961075 total: 178ms remaining: 1.19s
## 13: learn: 0.3832902 total: 181ms remaining: 1.11s
## 14: learn: 0.3716732 total: 185ms remaining: 1.05s
## 15: learn: 0.3606363 total: 188ms remaining: 987ms
## 16: learn: 0.3501943 total: 191ms remaining: 934ms
## 17: learn: 0.3404634 total: 194ms remaining: 885ms
## 18: learn: 0.3314146 total: 197ms remaining: 842ms
## 19: learn: 0.3229360 total: 200ms remaining: 801ms
## 20: learn: 0.3150070 total: 203ms remaining: 764ms
## 21: learn: 0.3072563 total: 206ms remaining: 731ms
## 22: learn: 0.3000933 total: 209ms remaining: 701ms
## 23: learn: 0.2932783 total: 212ms remaining: 672ms
## 24: learn: 0.2869615 total: 215ms remaining: 646ms
## 25: learn: 0.2812050 total: 219ms remaining: 624ms
## 26: learn: 0.2755738 total: 222ms remaining: 602ms
## 27: learn: 0.2704136 total: 226ms remaining: 582ms
## 28: learn: 0.2655610 total: 229ms remaining: 561ms
## 29: learn: 0.2608680 total: 232ms remaining: 541ms
## 30: learn: 0.2563472 total: 235ms remaining: 523ms
## 31: learn: 0.2523727 total: 238ms remaining: 506ms
## 32: learn: 0.2485799 total: 242ms remaining: 490ms
## 33: learn: 0.2447511 total: 245ms remaining: 475ms
## 34: learn: 0.2414468 total: 248ms remaining: 460ms
## 35: learn: 0.2383678 total: 251ms remaining: 446ms
## 36: learn: 0.2353650 total: 254ms remaining: 432ms
## 37: learn: 0.2324733 total: 257ms remaining: 420ms
## 38: learn: 0.2293391 total: 260ms remaining: 407ms
## 39: learn: 0.2267338 total: 264ms remaining: 395ms
## 40: learn: 0.2240232 total: 267ms remaining: 384ms
## 41: learn: 0.2215597 total: 270ms remaining: 372ms
## 42: learn: 0.2189280 total: 273ms remaining: 362ms
## 43: learn: 0.2167430 total: 277ms remaining: 352ms
## 44: learn: 0.2145314 total: 280ms remaining: 342ms
## 45: learn: 0.2125653 total: 283ms remaining: 332ms
## 46: learn: 0.2106722 total: 286ms remaining: 323ms
## 47: learn: 0.2087211 total: 291ms remaining: 315ms
## 48: learn: 0.2073332 total: 294ms remaining: 306ms
## 49: learn: 0.2057595 total: 297ms remaining: 297ms
## 50: learn: 0.2040379 total: 300ms remaining: 289ms
## 51: learn: 0.2023699 total: 304ms remaining: 280ms
## 52: learn: 0.2010369 total: 307ms remaining: 272ms
## 53: learn: 0.1996971 total: 310ms remaining: 264ms
## 54: learn: 0.1984580 total: 313ms remaining: 256ms
## 55: learn: 0.1974165 total: 316ms remaining: 249ms
## 56: learn: 0.1960377 total: 320ms remaining: 241ms
## 57: learn: 0.1947740 total: 323ms remaining: 234ms
## 58: learn: 0.1938288 total: 326ms remaining: 227ms
## 59: learn: 0.1926876 total: 329ms remaining: 219ms
## 60: learn: 0.1916285 total: 332ms remaining: 212ms
## 61: learn: 0.1905130 total: 335ms remaining: 205ms
## 62: learn: 0.1897046 total: 338ms remaining: 199ms
## 63: learn: 0.1891968 total: 341ms remaining: 192ms
## 64: learn: 0.1882166 total: 344ms remaining: 185ms
## 65: learn: 0.1873404 total: 347ms remaining: 179ms
## 66: learn: 0.1865113 total: 350ms remaining: 172ms
## 67: learn: 0.1857105 total: 353ms remaining: 166ms
## 68: learn: 0.1847751 total: 356ms remaining: 160ms
## 69: learn: 0.1838037 total: 359ms remaining: 154ms
## 70: learn: 0.1831553 total: 363ms remaining: 148ms
## 71: learn: 0.1824054 total: 367ms remaining: 143ms
## 72: learn: 0.1814054 total: 370ms remaining: 137ms
## 73: learn: 0.1805700 total: 374ms remaining: 131ms
## 74: learn: 0.1797765 total: 377ms remaining: 126ms
## 75: learn: 0.1789485 total: 380ms remaining: 120ms
## 76: learn: 0.1784491 total: 383ms remaining: 114ms
## 77: learn: 0.1779122 total: 386ms remaining: 109ms
## 78: learn: 0.1770016 total: 390ms remaining: 104ms
## 79: learn: 0.1764126 total: 393ms remaining: 98.2ms
## 80: learn: 0.1758560 total: 396ms remaining: 93ms
## 81: learn: 0.1750834 total: 399ms remaining: 87.7ms
## 82: learn: 0.1745138 total: 403ms remaining: 82.5ms
## 83: learn: 0.1741124 total: 406ms remaining: 77.4ms
## 84: learn: 0.1736901 total: 409ms remaining: 72.2ms
## 85: learn: 0.1732599 total: 412ms remaining: 67.1ms
## 86: learn: 0.1728302 total: 415ms remaining: 62.1ms
## 87: learn: 0.1724763 total: 419ms remaining: 57.1ms
## 88: learn: 0.1719654 total: 422ms remaining: 52.1ms
## 89: learn: 0.1714901 total: 425ms remaining: 47.2ms
## 90: learn: 0.1708885 total: 428ms remaining: 42.3ms
## 91: learn: 0.1704591 total: 431ms remaining: 37.4ms
## 92: learn: 0.1700912 total: 435ms remaining: 32.7ms
## 93: learn: 0.1689023 total: 439ms remaining: 28ms
## 94: learn: 0.1681292 total: 442ms remaining: 23.3ms
## 95: learn: 0.1675693 total: 446ms remaining: 18.6ms
## 96: learn: 0.1671149 total: 449ms remaining: 13.9ms
## 97: learn: 0.1667215 total: 453ms remaining: 9.24ms
## 98: learn: 0.1661118 total: 456ms remaining: 4.61ms
## 99: learn: 0.1654008 total: 459ms remaining: 0us
# Print the caret summary of the CatBoost fit on the numerical dataset:
# resampled Accuracy/Kappa per tuning combination and the values selected.
catboost_model_num
## Catboost
##
## 3048 samples
## 7 predictor
## 2 classes: 'Cammeo', 'Osmancik'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2438, 2439, 2438, 2438, 2439, 2439, ...
## Resampling results across tuning parameters:
##
## depth learning_rate Accuracy Kappa
## 2 0.04978707 0.9247075 0.8458708
## 2 0.13533528 0.9222469 0.8410028
## 2 0.36787944 0.9197895 0.8358922
## 2 1.00000000 0.9114235 0.8188794
## 4 0.04978707 0.9250359 0.8464785
## 4 0.13533528 0.9215917 0.8395079
## 4 0.36787944 0.9142133 0.8244731
## 4 1.00000000 0.8955135 0.7865344
## 6 0.04978707 0.9270050 0.8505720
## 6 0.13533528 0.9202794 0.8366790
## 6 0.36787944 0.9086344 0.8131553
## 6 1.00000000 0.9005979 0.7970720
##
## Tuning parameter 'iterations' was held constant at a value of 100
##
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
## parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
## 0.04978707, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and
## border_count = 255.
# Time elapsed since t1, i.e. CatBoost tuning plus the final fit
proc.time() - t1
## user system elapsed
## 1.80 0.12 45.92
# Confusion matrix of the tuned CatBoost model on the held-out 20 %
catboost_pred_num <- predict_results(
  model = catboost_model_num,
  data_test = data_test_num
)
catboost_pred_num
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cammeo Osmancik
## Cammeo 301 23
## Osmancik 25 413
##
## Accuracy : 0.937
## 95% CI : (0.9173, 0.9532)
## No Information Rate : 0.5722
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8712
##
## Mcnemar's Test P-Value : 0.8852
##
## Sensitivity : 0.9233
## Specificity : 0.9472
## Pos Pred Value : 0.9290
## Neg Pred Value : 0.9429
## Prevalence : 0.4278
## Detection Rate : 0.3950
## Detection Prevalence : 0.4252
## Balanced Accuracy : 0.9353
##
## 'Positive' Class : Cammeo
##
# Seed right before training so the repeated-CV folds are reproducible.
# resamples()/diff() further down compare the models fold by fold, which
# presupposes that they were resampled on identical folds; caret's
# documentation recommends the same set.seed() value before each train() call
# for that purpose.
set.seed(123)
#Start time
t1 <- proc.time()
rf_model_num <- train_rf_model(data_train_num)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
rf_model_num
## Random Forest
##
## 3048 samples
## 7 predictor
## 2 classes: 'Cammeo', 'Osmancik'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2438, 2439, 2438, 2439, 2438, 2438, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9227349 0.8417241
## 4 0.9212590 0.8387185
## 7 0.9212571 0.8386529
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Time elapsed since t1, i.e. random forest tuning plus the final fit
proc.time() - t1
## user system elapsed
## 1.86 0.07 18.80
# Confusion matrix of the tuned random forest on the held-out 20 %
rf_pred_num <- predict_results(
  model = rf_model_num,
  data_test = data_test_num
)
rf_pred_num
## Confusion Matrix and Statistics
##
## Reference
## Prediction Cammeo Osmancik
## Cammeo 298 25
## Osmancik 28 411
##
## Accuracy : 0.9304
## 95% CI : (0.91, 0.9475)
## No Information Rate : 0.5722
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8578
##
## Mcnemar's Test P-Value : 0.7835
##
## Sensitivity : 0.9141
## Specificity : 0.9427
## Pos Pred Value : 0.9226
## Neg Pred Value : 0.9362
## Prevalence : 0.4278
## Detection Rate : 0.3911
## Detection Prevalence : 0.4239
## Balanced Accuracy : 0.9284
##
## 'Positive' Class : Cammeo
##
# Collect the per-resample Accuracy/Kappa of both models fitted on the
# numerical dataset, then print the collection and its summary statistics.
resamps_num <- resamples(
  list(
    cb_num = catboost_model_num,
    rf_num = rf_model_num
  )
)
resamps_num
##
## Call:
## resamples.default(x = list(cb_num = catboost_model_num, rf_num = rf_model_num))
##
## Models: cb_num, rf_num
## Number of resamples: 10
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
summary(resamps_num)
##
## Call:
## summary.resamples(object = resamps_num)
##
## Models: cb_num, rf_num
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## cb_num 0.9098361 0.9212146 0.9269913 0.9270050 0.9306529 0.9490969 0
## rf_num 0.9113300 0.9184096 0.9228876 0.9227349 0.9270492 0.9327869 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## cb_num 0.8143147 0.8389740 0.8505871 0.8505720 0.8579431 0.8960230 0
## rf_num 0.8166828 0.8330627 0.8420719 0.8417241 0.8510985 0.8626503 0
# Box-and-whisker and dot plots of the resampled metrics of both models.
bwplot(resamps_num)
dotplot(resamps_num)
# Differences between the two models' resampling results; the summary reports
# the estimated difference and a Bonferroni-adjusted p-value per metric.
difValues_num <- diff(resamps_num)
difValues_num
##
## Call:
## diff.resamples(x = resamps_num)
##
## Models: cb_num, rf_num
## Metrics: Accuracy, Kappa
## Number of differences: 1
## p-value adjustment: bonferroni
summary(difValues_num)
##
## Call:
## summary.diff.resamples(object = difValues_num)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## cb_num rf_num
## cb_num 0.00427
## rf_num 0.2778
##
## Kappa
## cb_num rf_num
## cb_num 0.008848
## rf_num 0.2822
dataset_cat
# Fixed seed so the 80/20 split is reproducible.
set.seed(123)
# createDataPartition(list = FALSE) returns a one-column matrix; keep only
# that column so the row index is a plain integer vector, which both
# data.frame and tibble row subsetting accept.
trainIndex <- createDataPartition(dataset_cat$CLASS, p = 0.80, list = FALSE)[, 1]
data_train_cat <- dataset_cat[trainIndex, ]
data_test_cat <- dataset_cat[-trainIndex, ]
# Seed again right before training: caret draws the repeated-CV folds from the
# current RNG state, so the folds become reproducible, and any other train()
# call on data_train_cat preceded by the same seed gets identical folds. The
# resamples()/diff() comparison further down presupposes such common folds.
set.seed(123)
#Start time
t1 <- proc.time()
catboost_model_cat <- train_cb_model(data_train_cat)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 6, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## 0: learn: 0.6122370 total: 6.31ms remaining: 625ms
## 1: learn: 0.5507690 total: 12.4ms remaining: 607ms
## 2: learn: 0.5029202 total: 20.4ms remaining: 661ms
## 3: learn: 0.4666108 total: 23.4ms remaining: 561ms
## 4: learn: 0.4360618 total: 26.9ms remaining: 510ms
## 5: learn: 0.4129783 total: 30.2ms remaining: 474ms
## 6: learn: 0.3942320 total: 33.9ms remaining: 451ms
## 7: learn: 0.3678110 total: 38ms remaining: 437ms
## 8: learn: 0.3363034 total: 42ms remaining: 424ms
## 9: learn: 0.3122952 total: 46.2ms remaining: 416ms
## 10: learn: 0.2932804 total: 50.5ms remaining: 409ms
## 11: learn: 0.2809059 total: 55.1ms remaining: 404ms
## 12: learn: 0.2695079 total: 59.5ms remaining: 398ms
## 13: learn: 0.2563822 total: 63.9ms remaining: 393ms
## 14: learn: 0.2480827 total: 68.9ms remaining: 390ms
## 15: learn: 0.2348083 total: 74.4ms remaining: 390ms
## 16: learn: 0.2142045 total: 79.7ms remaining: 389ms
## 17: learn: 0.2014268 total: 83.9ms remaining: 382ms
## 18: learn: 0.1926262 total: 88.5ms remaining: 377ms
## 19: learn: 0.1808947 total: 92.7ms remaining: 371ms
## 20: learn: 0.1702964 total: 97.4ms remaining: 367ms
## 21: learn: 0.1634833 total: 101ms remaining: 360ms
## 22: learn: 0.1574146 total: 106ms remaining: 355ms
## 23: learn: 0.1533933 total: 110ms remaining: 350ms
## 24: learn: 0.1494566 total: 114ms remaining: 343ms
## 25: learn: 0.1467275 total: 118ms remaining: 337ms
## 26: learn: 0.1434511 total: 122ms remaining: 331ms
## 27: learn: 0.1365177 total: 127ms remaining: 326ms
## 28: learn: 0.1350156 total: 131ms remaining: 320ms
## 29: learn: 0.1322677 total: 135ms remaining: 316ms
## 30: learn: 0.1296128 total: 139ms remaining: 310ms
## 31: learn: 0.1251714 total: 145ms remaining: 309ms
## 32: learn: 0.1225821 total: 149ms remaining: 303ms
## 33: learn: 0.1211077 total: 153ms remaining: 297ms
## 34: learn: 0.1191154 total: 158ms remaining: 293ms
## 35: learn: 0.1160553 total: 163ms remaining: 290ms
## 36: learn: 0.1123520 total: 168ms remaining: 286ms
## 37: learn: 0.1094478 total: 172ms remaining: 281ms
## 38: learn: 0.1081211 total: 178ms remaining: 279ms
## 39: learn: 0.1070587 total: 183ms remaining: 274ms
## 40: learn: 0.1030064 total: 187ms remaining: 269ms
## 41: learn: 0.1010134 total: 192ms remaining: 265ms
## 42: learn: 0.0987311 total: 197ms remaining: 262ms
## 43: learn: 0.0983617 total: 202ms remaining: 257ms
## 44: learn: 0.0978947 total: 206ms remaining: 252ms
## 45: learn: 0.0967254 total: 212ms remaining: 249ms
## 46: learn: 0.0954702 total: 219ms remaining: 247ms
## 47: learn: 0.0938459 total: 224ms remaining: 242ms
## 48: learn: 0.0935742 total: 232ms remaining: 241ms
## 49: learn: 0.0921233 total: 236ms remaining: 236ms
## 50: learn: 0.0898602 total: 240ms remaining: 231ms
## 51: learn: 0.0894541 total: 246ms remaining: 227ms
## 52: learn: 0.0888838 total: 250ms remaining: 222ms
## 53: learn: 0.0873815 total: 254ms remaining: 217ms
## 54: learn: 0.0867422 total: 262ms remaining: 214ms
## 55: learn: 0.0862856 total: 266ms remaining: 209ms
## 56: learn: 0.0859953 total: 270ms remaining: 204ms
## 57: learn: 0.0857699 total: 278ms remaining: 201ms
## 58: learn: 0.0856337 total: 286ms remaining: 199ms
## 59: learn: 0.0845285 total: 294ms remaining: 196ms
## 60: learn: 0.0840471 total: 299ms remaining: 191ms
## 61: learn: 0.0823477 total: 303ms remaining: 186ms
## 62: learn: 0.0802434 total: 311ms remaining: 183ms
## 63: learn: 0.0795666 total: 316ms remaining: 178ms
## 64: learn: 0.0792756 total: 320ms remaining: 172ms
## 65: learn: 0.0785699 total: 328ms remaining: 169ms
## 66: learn: 0.0783523 total: 332ms remaining: 164ms
## 67: learn: 0.0774731 total: 337ms remaining: 159ms
## 68: learn: 0.0771809 total: 345ms remaining: 155ms
## 69: learn: 0.0762274 total: 349ms remaining: 150ms
## 70: learn: 0.0751240 total: 353ms remaining: 144ms
## 71: learn: 0.0749683 total: 361ms remaining: 141ms
## 72: learn: 0.0747531 total: 367ms remaining: 136ms
## 73: learn: 0.0746343 total: 375ms remaining: 132ms
## 74: learn: 0.0745693 total: 379ms remaining: 126ms
## 75: learn: 0.0743716 total: 385ms remaining: 121ms
## 76: learn: 0.0732670 total: 392ms remaining: 117ms
## 77: learn: 0.0732560 total: 395ms remaining: 111ms
## 78: learn: 0.0731248 total: 398ms remaining: 106ms
## 79: learn: 0.0726237 total: 402ms remaining: 101ms
## 80: learn: 0.0720270 total: 411ms remaining: 96.3ms
## 81: learn: 0.0719702 total: 416ms remaining: 91.3ms
## 82: learn: 0.0716857 total: 422ms remaining: 86.4ms
## 83: learn: 0.0715268 total: 426ms remaining: 81.2ms
## 84: learn: 0.0712087 total: 430ms remaining: 76ms
## 85: learn: 0.0709254 total: 434ms remaining: 70.7ms
## 86: learn: 0.0706008 total: 441ms remaining: 66ms
## 87: learn: 0.0704501 total: 446ms remaining: 60.9ms
## 88: learn: 0.0703976 total: 451ms remaining: 55.7ms
## 89: learn: 0.0699715 total: 458ms remaining: 50.9ms
## 90: learn: 0.0695446 total: 462ms remaining: 45.7ms
## 91: learn: 0.0692119 total: 466ms remaining: 40.5ms
## 92: learn: 0.0689237 total: 474ms remaining: 35.6ms
## 93: learn: 0.0688865 total: 477ms remaining: 30.5ms
## 94: learn: 0.0681742 total: 481ms remaining: 25.3ms
## 95: learn: 0.0680095 total: 489ms remaining: 20.4ms
## 96: learn: 0.0679535 total: 493ms remaining: 15.2ms
## 97: learn: 0.0678766 total: 498ms remaining: 10.2ms
## 98: learn: 0.0676545 total: 503ms remaining: 5.08ms
## 99: learn: 0.0675518 total: 508ms remaining: 0us
# Print the caret summary of the CatBoost fit on the categorical dataset:
# resampled Accuracy/Kappa per tuning combination and the values selected.
catboost_model_cat
## Catboost
##
## 6500 samples
## 10 predictor
## 2 classes: 'e', 'p'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 5199, 5201, 5200, 5201, 5199, 5201, ...
## Resampling results across tuning parameters:
##
## depth learning_rate Accuracy Kappa
## 2 0.04978707 0.8938478 0.7863302
## 2 0.13533528 0.9303078 0.8603756
## 2 0.36787944 0.9397695 0.8793362
## 2 1.00000000 0.9564634 0.9127782
## 4 0.04978707 0.9394627 0.8785638
## 4 0.13533528 0.9608461 0.9215197
## 4 0.36787944 0.9673072 0.9344566
## 4 1.00000000 0.9477065 0.8947271
## 6 0.04978707 0.9656155 0.9310161
## 6 0.13533528 0.9674611 0.9347500
## 6 0.36787944 0.9666924 0.9332041
## 6 1.00000000 0.9663844 0.9325722
##
## Tuning parameter 'iterations' was held constant at a value of 100
##
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
## parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
## 0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
## = 255.
# Time elapsed since t1, i.e. CatBoost tuning plus the final fit
proc.time() - t1
## user system elapsed
## 2.89 0.28 52.45
# Confusion matrix of the tuned CatBoost model on the held-out 20 %
catboost_pred_cat <- predict_results(
  model = catboost_model_cat,
  data_test = data_test_cat
)
catboost_pred_cat
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 838 46
## p 3 737
##
## Accuracy : 0.9698
## 95% CI : (0.9603, 0.9776)
## No Information Rate : 0.5179
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9395
##
## Mcnemar's Test P-Value : 1.973e-09
##
## Sensitivity : 0.9964
## Specificity : 0.9413
## Pos Pred Value : 0.9480
## Neg Pred Value : 0.9959
## Prevalence : 0.5179
## Detection Rate : 0.5160
## Detection Prevalence : 0.5443
## Balanced Accuracy : 0.9688
##
## 'Positive' Class : e
##
# Seed right before training so the repeated-CV folds are reproducible.
# resamples()/diff() further down compare the models fold by fold, which
# presupposes that they were resampled on identical folds; caret's
# documentation recommends the same set.seed() value before each train() call
# for that purpose.
set.seed(123)
#Start time
t1 <- proc.time()
rf_model_cat <- train_rf_model(data_train_cat)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 17 on full training set
rf_model_cat
## Random Forest
##
## 6500 samples
## 10 predictor
## 2 classes: 'e', 'p'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 5201, 5200, 5199, 5200, 5200, 5200, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8475368 0.6923302
## 17 0.9682315 0.9363089
## 33 0.9678468 0.9355343
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.
# Time elapsed since t1, i.e. random forest tuning plus the final fit
proc.time() - t1
## user system elapsed
## 6.71 0.06 91.89
# Confusion matrix of the tuned random forest on the held-out 20 %
rf_pred_cat <- predict_results(
  model = rf_model_cat,
  data_test = data_test_cat
)
rf_pred_cat
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 831 42
## p 10 741
##
## Accuracy : 0.968
## 95% CI : (0.9582, 0.976)
## No Information Rate : 0.5179
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9358
##
## Mcnemar's Test P-Value : 1.716e-05
##
## Sensitivity : 0.9881
## Specificity : 0.9464
## Pos Pred Value : 0.9519
## Neg Pred Value : 0.9867
## Prevalence : 0.5179
## Detection Rate : 0.5117
## Detection Prevalence : 0.5376
## Balanced Accuracy : 0.9672
##
## 'Positive' Class : e
##
# Collect the per-resample Accuracy/Kappa of both models fitted on the
# categorical dataset, then print the collection and its summary statistics.
resamps_cat <- resamples(
  list(
    cb_cat = catboost_model_cat,
    rf_cat = rf_model_cat
  )
)
resamps_cat
##
## Call:
## resamples.default(x = list(cb_cat = catboost_model_cat, rf_cat = rf_model_cat))
##
## Models: cb_cat, rf_cat
## Number of resamples: 10
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
summary(resamps_cat)
##
## Call:
## summary.resamples(object = resamps_cat)
##
## Models: cb_cat, rf_cat
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## cb_cat 0.9615089 0.9653913 0.9669358 0.9674611 0.9700106 0.9745958 0
## rf_cat 0.9638462 0.9661604 0.9676935 0.9682315 0.9711538 0.9730562 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## cb_cat 0.9227876 0.9305715 0.9337030 0.9347500 0.9398612 0.9490711 0
## rf_cat 0.9274657 0.9321458 0.9352378 0.9363089 0.9421932 0.9459724 0
# Box-and-whisker and dot plots of the resampled metrics of both models.
bwplot(resamps_cat)
dotplot(resamps_cat)
# Differences between the two models' resampling results; the summary reports
# the estimated difference and a Bonferroni-adjusted p-value per metric.
difValues_cat <- diff(resamps_cat)
difValues_cat
##
## Call:
## diff.resamples(x = resamps_cat)
##
## Models: cb_cat, rf_cat
## Metrics: Accuracy, Kappa
## Number of differences: 1
## p-value adjustment: bonferroni
summary(difValues_cat)
##
## Call:
## summary.diff.resamples(object = difValues_cat)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## cb_cat rf_cat
## cb_cat -0.0007704
## rf_cat 0.4833
##
## Kappa
## cb_cat rf_cat
## cb_cat -0.001559
## rf_cat 0.4802
##MIX DATASET
dataset_mix
# Fixed seed so the 80/20 split is reproducible.
set.seed(123)
# createDataPartition(list = FALSE) returns a one-column matrix, and tibbles
# (dataset_mix comes from read_excel) warn when row-indexed by a matrix.
# Keeping only that column yields a plain integer vector of row numbers.
trainIndex <- createDataPartition(dataset_mix$CLASS, p = 0.80, list = FALSE)[, 1]
data_train_mix <- dataset_mix[trainIndex, ]
data_test_mix <- dataset_mix[-trainIndex, ]
# Seed again right before training: caret draws the repeated-CV folds from the
# current RNG state, so the folds become reproducible, and any other train()
# call on data_train_mix preceded by the same seed gets identical folds. The
# resamples()/diff() comparison further down presupposes such common folds.
set.seed(123)
#Start time
t1 <- proc.time()
catboost_model_mix <- train_cb_model(data_train_mix)
## Aggregating results
## Selecting tuning parameters
## Fitting depth = 2, learning_rate = 0.135, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9, border_count = 255 on full training set
## Warning: Setting row names on a tibble is deprecated.
## 0: learn: 0.6121339 total: 5.64ms remaining: 558ms
## 1: learn: 0.5502777 total: 9.34ms remaining: 458ms
## 2: learn: 0.5021942 total: 12.9ms remaining: 419ms
## 3: learn: 0.4643175 total: 15.8ms remaining: 378ms
## 4: learn: 0.4356381 total: 17.5ms remaining: 332ms
## 5: learn: 0.4093016 total: 18.9ms remaining: 296ms
## 6: learn: 0.3837429 total: 22.9ms remaining: 304ms
## 7: learn: 0.3621549 total: 24.4ms remaining: 281ms
## 8: learn: 0.3450963 total: 25.9ms remaining: 262ms
## 9: learn: 0.3333567 total: 27.7ms remaining: 249ms
## 10: learn: 0.3242311 total: 29.2ms remaining: 236ms
## 11: learn: 0.3168739 total: 30.7ms remaining: 225ms
## 12: learn: 0.3081181 total: 32.2ms remaining: 216ms
## 13: learn: 0.2996895 total: 33.6ms remaining: 207ms
## 14: learn: 0.2928020 total: 35.2ms remaining: 199ms
## 15: learn: 0.2873438 total: 36.6ms remaining: 192ms
## 16: learn: 0.2820336 total: 38.3ms remaining: 187ms
## 17: learn: 0.2787684 total: 39.7ms remaining: 181ms
## 18: learn: 0.2722486 total: 41.3ms remaining: 176ms
## 19: learn: 0.2683284 total: 42.9ms remaining: 172ms
## 20: learn: 0.2644261 total: 45.1ms remaining: 170ms
## 21: learn: 0.2606796 total: 46.6ms remaining: 165ms
## 22: learn: 0.2574881 total: 48.1ms remaining: 161ms
## 23: learn: 0.2556970 total: 49.6ms remaining: 157ms
## 24: learn: 0.2541877 total: 51.1ms remaining: 153ms
## 25: learn: 0.2518247 total: 52.6ms remaining: 150ms
## 26: learn: 0.2503556 total: 54.2ms remaining: 146ms
## 27: learn: 0.2489015 total: 55.8ms remaining: 143ms
## 28: learn: 0.2477297 total: 57.2ms remaining: 140ms
## 29: learn: 0.2458457 total: 58.6ms remaining: 137ms
## 30: learn: 0.2448694 total: 61.4ms remaining: 137ms
## 31: learn: 0.2436032 total: 63.1ms remaining: 134ms
## 32: learn: 0.2426952 total: 64.8ms remaining: 131ms
## 33: learn: 0.2415039 total: 66.3ms remaining: 129ms
## 34: learn: 0.2406009 total: 67.7ms remaining: 126ms
## 35: learn: 0.2394611 total: 69.1ms remaining: 123ms
## 36: learn: 0.2384822 total: 70.7ms remaining: 120ms
## 37: learn: 0.2371913 total: 72.2ms remaining: 118ms
## 38: learn: 0.2362425 total: 73.7ms remaining: 115ms
## 39: learn: 0.2354962 total: 75.2ms remaining: 113ms
## 40: learn: 0.2346559 total: 76.6ms remaining: 110ms
## 41: learn: 0.2339751 total: 78.1ms remaining: 108ms
## 42: learn: 0.2331142 total: 79.6ms remaining: 106ms
## 43: learn: 0.2328724 total: 81.1ms remaining: 103ms
## 44: learn: 0.2322204 total: 84ms remaining: 103ms
## 45: learn: 0.2316547 total: 86ms remaining: 101ms
## 46: learn: 0.2314605 total: 87.6ms remaining: 98.8ms
## 47: learn: 0.2312707 total: 89.1ms remaining: 96.5ms
## 48: learn: 0.2305370 total: 91.1ms remaining: 94.8ms
## 49: learn: 0.2300010 total: 92.6ms remaining: 92.6ms
## 50: learn: 0.2296235 total: 94.3ms remaining: 90.6ms
## 51: learn: 0.2291560 total: 95.9ms remaining: 88.5ms
## 52: learn: 0.2290524 total: 97.3ms remaining: 86.3ms
## 53: learn: 0.2286910 total: 98.9ms remaining: 84.2ms
## 54: learn: 0.2285615 total: 100ms remaining: 82.1ms
## 55: learn: 0.2279909 total: 102ms remaining: 80ms
## 56: learn: 0.2274678 total: 103ms remaining: 77.9ms
## 57: learn: 0.2273241 total: 105ms remaining: 75.9ms
## 58: learn: 0.2265899 total: 107ms remaining: 74ms
## 59: learn: 0.2261008 total: 108ms remaining: 72.1ms
## 60: learn: 0.2252369 total: 110ms remaining: 70.1ms
## 61: learn: 0.2250421 total: 111ms remaining: 68.1ms
## 62: learn: 0.2246596 total: 112ms remaining: 66.1ms
## 63: learn: 0.2241355 total: 114ms remaining: 64ms
## 64: learn: 0.2241081 total: 115ms remaining: 62ms
## 65: learn: 0.2238018 total: 117ms remaining: 60.1ms
## 66: learn: 0.2237307 total: 118ms remaining: 58.1ms
## 67: learn: 0.2233752 total: 119ms remaining: 56.2ms
## 68: learn: 0.2229184 total: 121ms remaining: 54.3ms
## 69: learn: 0.2227866 total: 122ms remaining: 52.4ms
## 70: learn: 0.2224133 total: 124ms remaining: 50.6ms
## 71: learn: 0.2218968 total: 125ms remaining: 48.7ms
## 72: learn: 0.2216509 total: 127ms remaining: 46.9ms
## 73: learn: 0.2213231 total: 128ms remaining: 45.1ms
## 74: learn: 0.2212330 total: 130ms remaining: 43.2ms
## 75: learn: 0.2208867 total: 131ms remaining: 41.4ms
## 76: learn: 0.2208864 total: 132ms remaining: 39.6ms
## 77: learn: 0.2205792 total: 134ms remaining: 37.8ms
## 78: learn: 0.2201641 total: 135ms remaining: 36ms
## 79: learn: 0.2198007 total: 137ms remaining: 34.2ms
## 80: learn: 0.2194934 total: 138ms remaining: 32.4ms
## 81: learn: 0.2189740 total: 139ms remaining: 30.6ms
## 82: learn: 0.2177490 total: 141ms remaining: 28.9ms
## 83: learn: 0.2176918 total: 142ms remaining: 27.1ms
## 84: learn: 0.2171174 total: 144ms remaining: 25.4ms
## 85: learn: 0.2167369 total: 145ms remaining: 23.7ms
## 86: learn: 0.2163940 total: 147ms remaining: 22ms
## 87: learn: 0.2162784 total: 148ms remaining: 20.2ms
## 88: learn: 0.2162118 total: 150ms remaining: 18.5ms
## 89: learn: 0.2159134 total: 151ms remaining: 16.8ms
## 90: learn: 0.2159058 total: 153ms remaining: 15.1ms
## 91: learn: 0.2157729 total: 154ms remaining: 13.4ms
## 92: learn: 0.2156463 total: 156ms remaining: 11.7ms
## 93: learn: 0.2151911 total: 157ms remaining: 10ms
## 94: learn: 0.2148647 total: 159ms remaining: 8.35ms
## 95: learn: 0.2146870 total: 160ms remaining: 6.68ms
## 96: learn: 0.2143327 total: 162ms remaining: 5.01ms
## 97: learn: 0.2139554 total: 163ms remaining: 3.33ms
## 98: learn: 0.2135875 total: 165ms remaining: 1.66ms
## 99: learn: 0.2134650 total: 166ms remaining: 0us
# Print the caret summary of the CatBoost fit on the mixed dataset:
# resampled Accuracy/Kappa per tuning combination and the values selected.
catboost_model_mix
## Catboost
##
## 3617 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2893, 2894, 2894, 2894, 2893, 2894, ...
## Resampling results across tuning parameters:
##
## depth learning_rate Accuracy Kappa
## 2 0.04978707 0.8986767 0.3245610
## 2 0.13533528 0.9022705 0.3992917
## 2 0.36787944 0.9001954 0.4205602
## 2 1.00000000 0.8902419 0.3984283
## 4 0.04978707 0.9003351 0.3498960
## 4 0.13533528 0.9015782 0.4180483
## 4 0.36787944 0.8961876 0.4063029
## 4 1.00000000 0.8847128 0.3817551
## 6 0.04978707 0.9017182 0.3801920
## 6 0.13533528 0.8986742 0.4123413
## 6 0.36787944 0.8952194 0.4182872
## 6 1.00000000 0.8833335 0.3966184
##
## Tuning parameter 'iterations' was held constant at a value of 100
##
## Tuning parameter 'rsm' was held constant at a value of 0.9
## Tuning
## parameter 'border_count' was held constant at a value of 255
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 2, learning_rate =
## 0.1353353, iterations = 100, l2_leaf_reg = 1e-06, rsm = 0.9 and border_count
## = 255.
# Time elapsed since t1, i.e. CatBoost tuning plus the final fit
proc.time() - t1
## user system elapsed
## 1.53 0.14 49.08
# Confusion matrix of the tuned CatBoost model on the held-out 20 %
catboost_pred_mix <- predict_results(
  model = catboost_model_mix,
  data_test = data_test_mix
)
catboost_pred_mix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 767 72
## yes 33 32
##
## Accuracy : 0.8838
## 95% CI : (0.8611, 0.904)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.5672514
##
## Kappa : 0.3184
##
## Mcnemar's Test P-Value : 0.0002086
##
## Sensitivity : 0.9587
## Specificity : 0.3077
## Pos Pred Value : 0.9142
## Neg Pred Value : 0.4923
## Prevalence : 0.8850
## Detection Rate : 0.8485
## Detection Prevalence : 0.9281
## Balanced Accuracy : 0.6332
##
## 'Positive' Class : no
##
# Seed right before training so the repeated-CV folds are reproducible.
# resamples()/diff() further down compare the models fold by fold, which
# presupposes that they were resampled on identical folds; caret's
# documentation recommends the same set.seed() value before each train() call
# for that purpose.
set.seed(123)
#Start time
t1 <- proc.time()
rf_model_mix <- train_rf_model(data_train_mix)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 22 on full training set
rf_model_mix
## Random Forest
##
## 3617 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 2893, 2894, 2894, 2894, 2893, 2894, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8884431 0.06988726
## 22 0.9028136 0.44577851
## 42 0.9008780 0.44446045
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 22.
# Time elapsed since t1, i.e. random forest tuning plus the final fit
proc.time() - t1
## user system elapsed
## 7.48 0.03 108.19
# Confusion matrix of the tuned random forest on the held-out 20 %
rf_pred_mix <- predict_results(
  model = rf_model_mix,
  data_test = data_test_mix
)
rf_pred_mix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 764 68
## yes 36 36
##
## Accuracy : 0.885
## 95% CI : (0.8623, 0.905)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.526096
##
## Kappa : 0.3477
##
## Mcnemar's Test P-Value : 0.002367
##
## Sensitivity : 0.9550
## Specificity : 0.3462
## Pos Pred Value : 0.9183
## Neg Pred Value : 0.5000
## Prevalence : 0.8850
## Detection Rate : 0.8451
## Detection Prevalence : 0.9204
## Balanced Accuracy : 0.6506
##
## 'Positive' Class : no
##
# Collect the per-resample Accuracy/Kappa of both models fitted on the
# mixed dataset, then print the collection and its summary statistics.
resamps_mix <- resamples(
  list(
    cb_mix = catboost_model_mix,
    rf_mix = rf_model_mix
  )
)
resamps_mix
##
## Call:
## resamples.default(x = list(cb_mix = catboost_model_mix, rf_mix = rf_model_mix))
##
## Models: cb_mix, rf_mix
## Number of resamples: 10
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
summary(resamps_mix)
##
## Call:
## summary.resamples(object = resamps_mix)
##
## Models: cb_mix, rf_mix
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## cb_mix 0.8853591 0.8981005 0.9045643 0.9022705 0.9081167 0.9128631 0
## rf_mix 0.8852006 0.8924620 0.9046274 0.9028136 0.9143350 0.9171271 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## cb_mix 0.3279478 0.3451060 0.4171445 0.3992917 0.4352372 0.4895194 0
## rf_mix 0.3499540 0.3697414 0.4483677 0.4457785 0.5213937 0.5387360 0
# Box-and-whisker and dot plots of the resampled metrics of both models.
bwplot(resamps_mix)
dotplot(resamps_mix)
# Differences between the two models' resampling results; the summary reports
# the estimated difference and a Bonferroni-adjusted p-value per metric.
difValues_mix <- diff(resamps_mix)
difValues_mix
##
## Call:
## diff.resamples(x = resamps_mix)
##
## Models: cb_mix, rf_mix
## Metrics: Accuracy, Kappa
## Number of differences: 1
## p-value adjustment: bonferroni
summary(difValues_mix)
##
## Call:
## summary.diff.resamples(object = difValues_mix)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## cb_mix rf_mix
## cb_mix -0.0005431
## rf_mix 0.9147
##
## Kappa
## cb_mix rf_mix
## cb_mix -0.04649
## rf_mix 0.1781
# Shut the workers down, then switch foreach back to its sequential backend so
# that later foreach/caret calls in the same session do not try to use the
# closed cluster.
stopCluster(cl)
registerDoSEQ()
# Total run time since the timer started at the top of the script
proc.time() - t
## user system elapsed
## 24.98 1.20 370.08