This material is for practising machine learning with the mlr package. If you want to learn more about it, please see this link
In this tutorial, Random Forest classification models will be built to predict three classes using the wine dataset from the HDclassif package. Let’s get started.
# Predicting wine classes
library(HDclassif) # The wine data come from this package
library(tibble)
library(tidyverse)
library(ggplot2)
library(mlr) # Machine learning with R
data(wine, package = "HDclassif")
wine_tibble<-as_tibble(wine)
head(wine_tibble) # Look at first few rows
names(wine_tibble) <- c("Class", "Alco", "Malic", "Ash", "Alk", "Mag",
"Phe", "Flav", "Non_flav", "Proan", "Col", "Hue",
"OD", "Prol")
head(wine_tibble)
This dataset has 14 columns and 178 rows (cases).
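A quick check with dim() confirms this:
dim(wine_tibble) # Returns: 178 14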
# Convert the Class variable to a factor
wine_tibble$Class <- as.factor(wine_tibble$Class)
# Use a for loop to assign labels A, B and C
for (i in 1:nrow(wine_tibble)){
if (wine_tibble$Class[i]=="1"){
wine_tibble$Types[i]<-"A"
} else if(wine_tibble$Class[i]=="2"){
wine_tibble$Types[i]<-"B"
}else{
wine_tibble$Types[i]<-"C"
}
}
Unknown or uninitialised column: 'Types'.
This is only a warning: tibble flags the first assignment to a brand-new column, but Types is created correctly.
# Convert as factor levels
wine_tibble$Types<-as.factor(wine_tibble$Types)
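As an aside, the loop and the two conversion steps can be collapsed into a single vectorised factor() call; the following is an equivalent sketch, not part of the original pipeline:
# Equivalent one-liner: map class codes 1/2/3 to labels A/B/C in one step
wine_tibble$Types <- factor(wine_tibble$Class, levels = c("1", "2", "3"),
                            labels = c("A", "B", "C"))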
wine_tibble<- wine_tibble %>% select(-Class) # Remove Class variable
tail(wine_tibble)
# Convert wide data to long format for faceted plotting
Visual <- wine_tibble %>% gather(key = "Predictors", value = "Values", -Types)
ggplot(Visual) +
  facet_wrap(~ Predictors, scales = "free_y") +
  geom_boxplot(aes(x = Types, y = Values, fill = Types)) +
  theme_bw()
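Note that gather() has since been superseded in tidyr; with tidyr 1.0.0 or later the same reshaping can be written with pivot_longer(), as in this sketch:
# Same wide-to-long reshaping with the newer tidyr API
Visual <- wine_tibble %>%
  pivot_longer(cols = -Types, names_to = "Predictors", values_to = "Values")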
# Create a task and a learner
df <- data.frame(wine_tibble) # Use a plain data.frame for the mlr task
RF_task <- makeClassifTask(data = df, target = "Types")
library(randomForest)
RF_learner<-makeLearner("classif.randomForest")
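Before tuning, it is worth inspecting which hyperparameters the learner exposes; mlr's getParamSet() lists them along with their types and default values:
getParamSet(RF_learner) # Show the tunable hyperparameters of classif.randomForest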
There are some hyperparameters that need to be optimised: ntree (number of trees), mtry (number of variables tried at each split), nodesize (minimum size of terminal nodes) and maxnodes (maximum number of terminal nodes).
paraspace <- makeParamSet(
  makeIntegerParam("ntree", lower = 200, upper = 500),
  makeIntegerParam("mtry", lower = 3, upper = 8),
  makeIntegerParam("nodesize", lower = 1, upper = 3),
  makeIntegerParam("maxnodes", lower = 5, upper = 15)
)
randsearch <- makeTuneControlRandom(maxit = 100) # Evaluate 100 random draws from the parameter space
Kfold <- makeResampleDesc(method = "RepCV", folds = 10, reps = 5, stratify = TRUE) # 10-fold CV, repeated 5 times
library(parallel)
library(parallelMap)
package ‘parallelMap’ was built under R version 3.6.3
parallelStartSocket(cpus = detectCores())
Starting parallelization in mode=socket with cpus=8.
Tuning_para <- tuneParams(RF_learner, RF_task, resampling = Kfold, par.set = paraspace, control = randsearch)
[Tune] Started tuning learner classif.randomForest for parameter set:
With control class: TuneControlRandom
Imputation value: 1
Exporting objects to slaves for mode socket: .mlr.slave.options
Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 100.
[Tune] Result: ntree=255; mtry=3; nodesize=2; maxnodes=14 : mmce.test.mean=0.0168778
parallelStop()
Stopped parallelization. All cleaned up.
Tuning_para
Tune result:
Op. pars: ntree=255; mtry=3; nodesize=2; maxnodes=14
mmce.test.mean=0.0168778
RF_Set <- setHyperPars(RF_learner, par.vals = Tuning_para$x) # Apply the tuned hyperparameters
train_RF <- train(RF_Set, RF_task) # Train the tuned model on the full dataset
get_RF <- getLearnerModel(train_RF) # Extract the underlying randomForest object
plot(get_RF) # Error rates (OOB and per class) versus number of trees
Class_wine <- colnames(get_RF$err.rate) # "OOB" plus the three class labels
legend("topright", Class_wine,
       col = 1:length(Class_wine),
       lty = 1:length(Class_wine))
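Tuning and evaluating on the same data would give an optimistic performance estimate, so the entire tuning procedure is wrapped inside a learner and cross-validated again in an outer loop (nested cross-validation).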
outer <- makeResampleDesc("RepCV", reps = 2, stratify = TRUE, folds = 2)
forestWrapper <- makeTuneWrapper("classif.randomForest", resampling = outer,
                                 par.set = paraspace,
                                 control = randsearch)
parallelStartSocket(cpus = detectCores())
Starting parallelization in mode=socket with cpus=8.
cvWithTuning <- resample(forestWrapper, RF_task, resampling = outer) # Nested CV: tuning runs inside each outer fold
Exporting objects to slaves for mode socket: .mlr.slave.options
Resampling: repeated cross-validation
Measures: mmce
Mapping in parallel: mode = socket; level = mlr.resample; cpus = 8; elements = 4.
Aggregated Result: mmce.test.mean=0.0169823
parallelStop()
Stopped parallelization. All cleaned up.
cvWithTuning
Resample Result
Task: df
Learner: classif.randomForest.tuned
Aggr perf: mmce.test.mean=0.0169823
Runtime: 19.3315
calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
Relative confusion matrix (normalized by row/column):
predicted
true A B C -err.-
A 0.98/0.98 0.02/0.01 0.00/0.00 0.02
B 0.01/0.02 0.97/0.99 0.01/0.02 0.03
C 0.00/0.00 0.00/0.00 1.00/0.98 0.00
-err.- 0.02 0.01 0.02 0.02
Absolute confusion matrix:
predicted
true A B C -err.-
A 116 2 0 2
B 2 138 2 4
C 0 0 96 0
-err.- 2 2 2 6
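Finally, the trained model can score new observations with predict(). The following is a minimal sketch that reuses a few training rows purely for illustration, since this tutorial has no separate test set:
# Illustrative prediction: new_obs here is just the first five training rows
new_obs <- df[1:5, ]
pred <- predict(train_RF, newdata = new_obs)
pred$data # Compare the truth and response columns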