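This material is for practising machine learning with the mlr package. If you want to learn more about it, please see the book "Machine Learning with R, the tidyverse, and mlr": https://www.manning.com/books/machine-learning-with-r-the-tidyverse-and-mlr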

In this tutorial, a Random Forest classification model is built to predict the three wine classes in the wine dataset from the HDclassif package. Let’s get started.

Load required library and dataset

# Predicting wine classes

library(HDclassif) # Provides the wine dataset loaded below

library(tibble)

library(tidyverse)

library(ggplot2)

library(mlr) # Machine learning with r package

data(wine, package = "HDclassif")

wine_tibble<-as_tibble(wine)

head(wine_tibble) # Look at first few rows 
NA

Data cleaning

Rename the wine_tibble data

# Give the 14 columns short, readable names: the class label first,
# followed by the 13 predictors.
wine_tibble <- setNames(
  wine_tibble,
  c(
    "Class", "Alco", "Malic", "Ash", "Alk", "Mag", "Phe",
    "Flav", "Non_flav", "Proan", "Col", "Hue", "OD", "Prol"
  )
)

head(wine_tibble)
NA

This dataset has 14 columns and 178 rows (cases).

Convert Class variable to factor
# Treat the class code as a categorical variable rather than a number
wine_tibble[["Class"]] <- as.factor(wine_tibble[["Class"]])
Rename levels of class as wine A, B and C
# Map the class codes to wine types in a single vectorised step:
# "1" -> "A", "2" -> "B", anything else -> "C".
# Assigning the whole column at once replaces the row-by-row loop and avoids
# element-wise writes into a column that does not exist yet, which is what
# makes tibble warn about an "Unknown or uninitialised column".
wine_tibble$Types <- ifelse(
  wine_tibble$Class == "1", "A",
  ifelse(wine_tibble$Class == "2", "B", "C")
)
Unknown or uninitialised column: 'Types'.
# Store the wine type as a factor so it can serve as a classification target
wine_tibble[["Types"]] <- as.factor(wine_tibble[["Types"]])

# Drop the original class code; Types now carries the same information
wine_tibble <- select(wine_tibble, -Class)

tail(wine_tibble)
NA
Visualizing the data by boxplot
# Reshape from wide to long: one row per (wine, predictor) pair, so that
# every predictor can be drawn in its own facet.
# pivot_longer() replaces the superseded gather().
Visual <- wine_tibble %>%
  pivot_longer(cols = -Types, names_to = "Predictors", values_to = "Values")

# One boxplot panel per predictor, split and coloured by wine type.
# Free y scales are needed because the predictors are on different scales.
ggplot(Visual) +
  facet_wrap(~Predictors, scales = "free_y") +
  geom_boxplot(aes(x = Types, y = Values, fill = Types)) +
  theme_bw()

Building Random Forest

Create Task, Learner
# Build the classification task and a random forest learner, then fit a
# first model using the learner as created (no hyperparameters set yet).
df <- data.frame(wine_tibble)

RF_task <- makeClassifTask(data = df, target = "Types")

library(randomForest)

RF_learner <- makeLearner(cl = "classif.randomForest")

train_RF <- train(learner = RF_learner, task = RF_task)
Find hyperparameters
# Search space for tuning: an integer range for each random forest
# hyperparameter that will be tuned.
paraspace <- makeParamSet(
  makeIntegerParam("ntree", lower = 200, upper = 500),
  makeIntegerParam("mtry", lower = 3, upper = 8),
  makeIntegerParam("nodesize", lower = 1, upper = 3),
  makeIntegerParam("maxnodes", lower = 5, upper = 15)
)
Warning messages:
1: Unknown or uninitialised column: 'Class'. 
2: Unknown or uninitialised column: 'Class'. 
3: Unknown or uninitialised column: 'Class'. 
4: Unknown or uninitialised column: 'Class'. 
# Random search over the parameter space, limited to 100 iterations
randsearch <- makeTuneControlRandom(maxit = 100)

# Stratified 10-fold cross-validation repeated 5 times.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Kfold <- makeResampleDesc(
  method = "RepCV", folds = 10, reps = 5, stratify = TRUE
)

library(parallel)
library(parallelMap)

# Start one socket worker per detected core
parallelStartSocket(cpus = detectCores())
Parallelization was not stopped, doing it now.Stopped parallelization. All cleaned up.
Starting parallelization in mode=socket with cpus=8.
# Tune the random forest: evaluate the candidates from the random search
# with the repeated cross-validation described by Kfold.
Turning_para <- tuneParams(
  learner = RF_learner,
  task = RF_task,
  resampling = Kfold,
  par.set = paraspace,
  control = randsearch
)
[Tune] Started tuning learner classif.randomForest for parameter set:

With control class: TuneControlRandom
Imputation value: 1
Exporting objects to slaves for mode socket: .mlr.slave.options
Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 100.
[Tune] Result: ntree=489; mtry=3; nodesize=2; maxnodes=11 : mmce.test.mean=0.0169281
# Stop the parallel workers now that tuning has finished
parallelStop()
Stopped parallelization. All cleaned up.
# Print the tuning result (selected hyperparameters and their mean mmce)
Turning_para
Tune result:
Op. pars: ntree=489; mtry=3; nodesize=2; maxnodes=11
mmce.test.mean=0.0169281

Set the hyperparameters selected by the tuning process

# Copy the tuned hyperparameter values onto the learner
RF_Set <- setHyperPars(learner = RF_learner, par.vals = Turning_para[["x"]])
There were 18 warnings (use warnings() to see them)
# Refit on the full task with the tuned learner (replaces the earlier model)
train_RF <- train(learner = RF_Set, task = RF_task)
Plotting the random forest
# Pull the underlying fitted model object out of the mlr wrapper
get_RF <- getLearnerModel(model = train_RF)
Warning messages:
1: Unknown or uninitialised column: 'Class'. 
2: Unknown or uninitialised column: 'Class'. 
# Plot the model's error-rate curves
plot(get_RF)

# The columns of err.rate name the curves drawn above
Class_wine <- colnames(get_RF$err.rate)

# Label each curve. seq_along() replaces 1:length(), which would yield
# c(1, 0) instead of an empty sequence if there were no columns.
legend("topright", Class_wine,
       col = seq_along(Class_wine),
       lty = seq_along(Class_wine))

Cross-validating the model

# Resampling description for the cross-validation of the tuned model:
# stratified 10-fold CV repeated 5 times.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
outer <- makeResampleDesc("RepCV", reps = 5, stratify = TRUE, folds = 10)
Warning messages:
1: Unknown or uninitialised column: 'Class'. 
2: Unknown or uninitialised column: 'Class'. 
# Wrap the learner together with its tuning procedure so that tuning is
# repeated inside every resampling iteration.
# NOTE(review): the wrapper's inner resampling reuses the `outer`
# description, so each outer fold runs a full repeated-CV random search.
# Presumably intended; confirm, since a lighter inner description would
# cut the runtime considerably.
forestWrapper <- makeTuneWrapper(
  learner = "classif.randomForest",
  resampling = outer,
  par.set = paraspace,
  control = randsearch
)

# Start one socket worker per detected core
parallelStartSocket(cpus = detectCores())
Starting parallelization in mode=socket with cpus=8.
# Cross-validate the whole tune-then-fit procedure on the wine task
cvWithTuning <- resample(
  learner = forestWrapper,
  task = RF_task,
  resampling = outer
)
Exporting objects to slaves for mode socket: .mlr.slave.options
Resampling: repeated cross-validation
Measures:             mmce      
Mapping in parallel: mode = socket; level = mlr.resample; cpus = 8; elements = 50.


Aggregated Result: mmce.test.mean=0.0190334
# Stop the parallel workers now that resampling has finished
parallelStop()
Stopped parallelization. All cleaned up.
# Print the resampling result (aggregated mmce and runtime)
cvWithTuning
Resample Result
Task: df
Learner: classif.randomForest.tuned
Aggr perf: mmce.test.mean=0.0190334
Runtime: 1927.93
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpUaGlzIG1hdGVyaWFsIGlzIHRvIHByYWN0aXNlIGBtYWNoaW5lIGxlYXJuaW5nIHdpdGggbWxyYCBwYWNrYWdlLiBJZiB5b3Ugd2FudCB0byBsZWFybiBtb3JlIGFib3V0IGl0LCBwbGVhc2Ugc2VlIHRoaXMgW2xpbmtdKGh0dHBzOi8vd3d3Lm1hbm5pbmcuY29tL2Jvb2tzL21hY2hpbmUtbGVhcm5pbmctd2l0aC1yLXRoZS10aWR5dmVyc2UtYW5kLW1scikNCg0KSW4gdGhpcyB0dXRvcmlhbCwgYCBSYW5kb20gRm9yZXN0YCBjbGFzc2lmaWNhdGlvbiBtb2RlbHMgd2lsbCBiZSBidWlsdCB0byBwcmVkaWN0IHRocmVlIGxldmVscyBvZiBjbGFzc2VzIHVzaW5nIGEgZGF0YXNldCBjYWxsZWQgYHdpbmVgIGZyb20gYEhEY2xhc3NpZmAgcGFja2FnZS4gTGV0J3MgZ2V0IHN0YXJ0ZWQuDQoNCiMgTG9hZCByZXF1aXJlZCBsaWJyYXJ5IGFuZCBkYXRhc2V0DQoNCmBgYHtyfQ0KIyBQcmVkaWN0aW5nIGRpYWJldGVzIA0KDQpsaWJyYXJ5KEhEY2xhc3NpZikgIyBEaWFiZXRlcyBkYXRhIGZyb20gdGhpcyBwYWNrYWdlDQoNCmxpYnJhcnkodGliYmxlKQ0KDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCg0KbGlicmFyeShnZ3Bsb3QyKQ0KDQpsaWJyYXJ5KG1scikgIyBNYWNoaW5lIGxlYXJuaW5nIHdpdGggciBwYWNrYWdlDQoNCmRhdGEod2luZSwgcGFja2FnZSA9ICJIRGNsYXNzaWYiKQ0KDQp3aW5lX3RpYmJsZTwtYXNfdGliYmxlKHdpbmUpDQoNCmhlYWQod2luZV90aWJibGUpICMgTG9vayBhdCBmaXJzdCBmZXcgcm93cyANCg0KYGBgDQoNCiMgRGF0YSBjbGVhbmluZyANCg0KIyMjIyMgUmVuYW1lIHRoZSB3aW5lX3RpYmJsZSBkYXRhDQoNCmBgYHtyfQ0KDQpuYW1lcyh3aW5lX3RpYmJsZSkgPC0gYygiQ2xhc3MiLCAiQWxjbyIsICJNYWxpYyIsICJBc2giLCAiQWxrIiwgIk1hZyIsDQogICAgICAgICAgICAgICAgICAgICJQaGUiLCAiRmxhdiIsICJOb25fZmxhdiIsICJQcm9hbiIsICJDb2wiLCAiSHVlIiwNCiAgICAgICAgICAgICAgICAgICAgIk9EIiwgIlByb2wiKQ0KDQpoZWFkKHdpbmVfdGliYmxlKQ0KDQpgYGANCg0KVGhpcyBkYXRhc2V0IGhhcyAxNCBjb2x1bW5zIGFuZCAxNzggcm93cyAoY2FzZXMpLiAgDQoNCiMjIyMjIENvbnZlcnQgYENsYXNzIHZhcmlhYmxlYCB0byBmYWN0b3IgDQoNCmBgYHtyfQ0Kd2luZV90aWJibGUkQ2xhc3M8LWFzLmZhY3Rvcih3aW5lX3RpYmJsZSRDbGFzcykNCg0KYGBgDQoNCiMjIyMjIFJlbmFtZSBsZXZlbHMgb2YgY2xhc3MgYXMgd2luZSBBLCBCIGFuZCBDDQoNCmBgYHtyfQ0KIyBVc2luZyBmb3IgbG9vcCB0byBhc3NpZ24gbGV2ZWxzIEEsIEIgYW5kIEMNCg0KZm9yIChpIGluIDE6bnJvdyh3aW5lX3RpYmJsZSkpew0KICBpZiAod2luZV90aWJibGUkQ2xhc3NbaV09PSIxIil7DQogICAgd2luZV90aWJibGUkVHlwZXNbaV08LSJBIg0KICB9IGVsc2UgaWYod2luZV90aWJibGUkQ2xhc3NbaV09PSIyIil7DQogICAg
d2luZV90aWJibGUkVHlwZXNbaV08LSJCIg0KICB9ZWxzZXsNCiAgICB3aW5lX3RpYmJsZSRUeXBlc1tpXTwtIkMiDQogIH0NCn0NCiMgQ29udmVydCBhcyBmYWN0b3IgbGV2ZWxzDQp3aW5lX3RpYmJsZSRUeXBlczwtYXMuZmFjdG9yKHdpbmVfdGliYmxlJFR5cGVzKQ0KDQp3aW5lX3RpYmJsZTwtIHdpbmVfdGliYmxlICU+JSBzZWxlY3QoLUNsYXNzKSAjIFJlbW92ZSBDbGFzcyB2YXJpYWJsZSANCg0KdGFpbCh3aW5lX3RpYmJsZSkNCg0KYGBgDQoNCiMjIyMjIFZpc3VhbGl6aW5nIHRoZSBkYXRhIGJ5IGJveHBsb3QNCg0KYGBge3J9DQojIENvbnZlcnQgbG9uZyBkYXRhIHRvIHdpZGUgZGF0YQ0KDQpWaXN1YWw8LXdpbmVfdGliYmxlICU+JSBnYXRoZXIoa2V5PSJQcmVkaWN0b3JzIiwgdmFsdWUgPSAiVmFsdWVzIiwtVHlwZXMpDQoNCmdncGxvdChWaXN1YWwpICsgZmFjZXRfd3JhcCh+UHJlZGljdG9ycywgc2NhbGVzPSJmcmVlX3kiKSsgZ2VvbV9ib3hwbG90KGFlcyh4PVR5cGVzLHk9VmFsdWVzLCBmaWxsPVR5cGVzKSkgKyB0aGVtZV9idygpDQoNCmBgYA0KDQoNCiMgQnVpbGRpbmcgUmFuZG9tIEZvcmVzdCANCg0KIyMjIyMgQ3JlYXRlIFRhc2ssIExlYXJuZXINCg0KYGBge3J9DQojIENyZWF0ZSBhIHRhc2ssIGxlYXJuZXIgDQpkZjwtIGRhdGEuZnJhbWUod2luZV90aWJibGUpDQoNClJGX3Rhc2s8LW1ha2VDbGFzc2lmVGFzayhkYXRhPWRmLCB0YXJnZXQgPSAiVHlwZXMiKQ0KDQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkNCg0KUkZfbGVhcm5lcjwtbWFrZUxlYXJuZXIoImNsYXNzaWYucmFuZG9tRm9yZXN0IikNCg0KDQpgYGANCg0KIyMjIyMgRmluZCBoeXBlcnBhcmFtZXRlcnMgDQoNCmBgYHtyfQ0KcGFyYXNwYWNlPC1tYWtlUGFyYW1TZXQobWFrZUludGVnZXJQYXJhbSgibnRyZWUiLCBsb3dlciA9IDIwMCwgdXBwZXIgPSA1MDApLCBtYWtlSW50ZWdlclBhcmFtKCJtdHJ5IiwgbG93ZXIgPSAzLCB1cHBlciA9IDgpLA0KICAgICAgICAgICAgICAgICAgICAgICAgbWFrZUludGVnZXJQYXJhbSgibm9kZXNpemUiLCBsb3dlciA9IDEsdXBwZXIgPSAzKSwgbWFrZUludGVnZXJQYXJhbSgibWF4bm9kZXMiLCBsb3dlciA9IDUsIHVwcGVyID0gMTUpKQ0KDQpyYW5kc2VhcmNoPC1tYWtlVHVuZUNvbnRyb2xSYW5kb20obWF4aXQ9MTAwKQ0KDQpLZm9sZDwtbWFrZVJlc2FtcGxlRGVzYyhtZXRob2QgPSAiUmVwQ1YiLCBmb2xkcz0xMCwgcmVwcz01LCBzdHJhdGlmeSA9IFQpDQoNCmxpYnJhcnkocGFyYWxsZWwpDQpsaWJyYXJ5KHBhcmFsbGVsTWFwKQ0KDQpwYXJhbGxlbFN0YXJ0U29ja2V0KGNwdXMgPSBkZXRlY3RDb3JlcygpKQ0KDQpUdXJuaW5nX3BhcmE8LXR1bmVQYXJhbXMoUkZfbGVhcm5lcixSRl90YXNrLHJlc2FtcGxpbmcgPSBLZm9sZCwgcGFyLnNldCA9IHBhcmFzcGFjZSwgY29udHJvbCA9IHJhbmRzZWFyY2gpDQoNCnBhcmFsbGVsU3RvcCgpDQoNClR1cm5pbmdfcGFyYQ0KDQpgYGANCg0KIyBTZXQgaHlwZXJwYXJhbWV0ZXJzIGFz
IGZpbHRlcmVkIHRocm91Z2ggdHVybmluZyBwYXBhcm1ldGVyIHByb2Nlc3MNCg0KYGBge3J9DQpSRl9TZXQ8LXNldEh5cGVyUGFycyhSRl9sZWFybmVyLCBwYXIudmFscyA9IFR1cm5pbmdfcGFyYSR4KQ0KDQp0cmFpbl9SRjwtdHJhaW4oUkZfU2V0LCBSRl90YXNrKQ0KDQpgYGANCg0KIyMjIyMgUGxvdHRpbmcgdGhlIHJhbmRvbSBmb3Jlc3QgDQoNCmBgYHtyfQ0KZ2V0X1JGIDwtIGdldExlYXJuZXJNb2RlbCh0cmFpbl9SRikNCg0KcGxvdChnZXRfUkYpDQoNCkNsYXNzX3dpbmUgPC0gY29sbmFtZXMoZ2V0X1JGJGVyci5yYXRlKQ0KDQpsZWdlbmQoInRvcHJpZ2h0IiwgQ2xhc3Nfd2luZSwNCiAgICAgICBjb2wgPSAxOmxlbmd0aChDbGFzc193aW5lKSwNCiAgICAgICBsdHkgPSAxOmxlbmd0aChDbGFzc193aW5lKSkNCg0KYGBgDQoNCiMgQ3Jvc3MtdmFsaWRhdGluZyB0aGUgbW9kZWwNCg0KYGBge3J9DQpvdXRlciA8LSBtYWtlUmVzYW1wbGVEZXNjKCJSZXBDViIsIHJlcHMgPSA1LCBzdHJhdGlmeSA9IFQsIGZvbGRzPTEwKQ0KDQpmb3Jlc3RXcmFwcGVyIDwtIG1ha2VUdW5lV3JhcHBlcigiY2xhc3NpZi5yYW5kb21Gb3Jlc3QiLCByZXNhbXBsaW5nID0gb3V0ZXIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBwYXIuc2V0ID0gcGFyYXNwYWNlLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY29udHJvbCA9IHJhbmRzZWFyY2gpDQoNCnBhcmFsbGVsU3RhcnRTb2NrZXQoY3B1cyA9IGRldGVjdENvcmVzKCkpDQoNCmN2V2l0aFR1bmluZyA8LSByZXNhbXBsZShmb3Jlc3RXcmFwcGVyLCBSRl90YXNrLCByZXNhbXBsaW5nID0gb3V0ZXIpDQoNCnBhcmFsbGVsU3RvcCgpDQoNCmN2V2l0aFR1bmluZw0KDQpgYGANCg0KDQoNCg==