This material is for practising machine learning with the mlr
package. If you want to learn more about it, please see this link.
In this tutorial, Random Forest
classification models will be built to predict three classes of wine using a dataset called wine
from the HDclassif
package. Let’s get started.
# Predicting wine classes
library(HDclassif) # Provides the wine data set used below
library(tibble)
library(tidyverse)
library(ggplot2)
library(mlr) # Machine learning in R
# Load the wine data set that ships with HDclassif into the workspace
data(wine, package = "HDclassif")
# Work with the data as a tibble
wine_tibble<-as_tibble(wine)
head(wine_tibble) # Look at first few rows
NA
# Replace the original column headers with short labels, listed in column
# order: the class label first, followed by the thirteen measurements.
wine_tibble <- setNames(
  wine_tibble,
  c(
    "Class", "Alco", "Malic", "Ash", "Alk", "Mag", "Phe",
    "Flav", "Non_flav", "Proan", "Col", "Hue", "OD", "Prol"
  )
)
# Preview the first rows under the new names
head(wine_tibble)
NA
This dataset has 14 columns and 178 rows (cases).
# Convert the Class variable to a factor
wine_tibble$Class <- as.factor(wine_tibble$Class)
# Recode the class labels as A, B and C in one vectorised step: "1" becomes
# "A", "2" becomes "B" and every other value becomes "C". Building the whole
# column at once avoids assigning element by element into a column that does
# not exist yet, which is what made tibble emit the
# "Unknown or uninitialised column: 'Types'" warning.
wine_tibble$Types <- ifelse(
  wine_tibble$Class == "1", "A",
  ifelse(wine_tibble$Class == "2", "B", "C")
)
# Convert as factor levels
wine_tibble$Types <- as.factor(wine_tibble$Types)
wine_tibble <- wine_tibble %>% select(-Class) # Remove Class variable
tail(wine_tibble)
NA
# Convert wide data to long data: every column except Types is stacked, with
# the column name stored in `Predictors` and its measurement in `Values`.
# pivot_longer() replaces the superseded gather().
Visual <- wine_tibble %>%
  pivot_longer(cols = -Types, names_to = "Predictors", values_to = "Values")
# Box plots of the values by wine type, one panel per predictor, each panel
# with its own y-axis scale
ggplot(Visual) +
  facet_wrap(~Predictors, scales = "free_y") +
  geom_boxplot(aes(x = Types, y = Values, fill = Types)) +
  theme_bw()
# Create a task, learner and train the model
# Convert the tibble to a plain data frame before handing it to mlr
df <- data.frame(wine_tibble)
# Classification task predicting the Types column
RF_task <- makeClassifTask(data = df, target = "Types")
library(randomForest)
RF_learner <- makeLearner(cl = "classif.randomForest")
# First fit, with no hyperparameters set explicitly
train_RF <- train(learner = RF_learner, task = RF_task)
# Search space for tuning: an integer range for each of four hyperparameters
paraspace <- makeParamSet(
  makeIntegerParam(id = "ntree", lower = 200, upper = 500),
  makeIntegerParam(id = "mtry", lower = 3, upper = 8),
  makeIntegerParam(id = "nodesize", lower = 1, upper = 3),
  makeIntegerParam(id = "maxnodes", lower = 5, upper = 15)
)
Warning messages:
1: Unknown or uninitialised column: 'Class'.
2: Unknown or uninitialised column: 'Class'.
3: Unknown or uninitialised column: 'Class'.
4: Unknown or uninitialised column: 'Class'.
# Random search limited to 100 iterations
randsearch <- makeTuneControlRandom(maxit = 100)
# Repeated cross-validation: 10 folds, 5 repetitions, stratified.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
Kfold <- makeResampleDesc(
  method = "RepCV", folds = 10, reps = 5, stratify = TRUE
)
library(parallel)
library(parallelMap)
# Start socket-based parallelization with one worker per detected core
parallelStartSocket(cpus = detectCores())
Parallelization was not stopped, doing it now.Stopped parallelization. All cleaned up.
Starting parallelization in mode=socket with cpus=8.
# Tune the random forest learner over `paraspace`, scoring each candidate
# with the `Kfold` resampling and drawing candidates via `randsearch`
Turning_para <- tuneParams(
  learner = RF_learner,
  task = RF_task,
  resampling = Kfold,
  par.set = paraspace,
  control = randsearch
)
[Tune] Started tuning learner classif.randomForest for parameter set:
With control class: TuneControlRandom
Imputation value: 1
Exporting objects to slaves for mode socket: .mlr.slave.options
Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 100.
[Tune] Result: ntree=489; mtry=3; nodesize=2; maxnodes=11 : mmce.test.mean=0.0169281
# Shut the parallel workers down now that tuning has finished
parallelMap::parallelStop()
Stopped parallelization. All cleaned up.
# Show the tuning result
print(Turning_para)
Tune result:
Op. pars: ntree=489; mtry=3; nodesize=2; maxnodes=11
mmce.test.mean=0.0169281
# Copy of the learner with the tuned hyperparameter values (stored in the
# `x` element of the tuning result) applied
RF_Set <- setHyperPars(learner = RF_learner, par.vals = Turning_para[["x"]])
There were 18 warnings (use warnings() to see them)
# Refit on the full task with the tuned learner; this overwrites the earlier
# `train_RF` fit
train_RF <- train(learner = RF_Set, task = RF_task)
# Pull the underlying fitted model out of the mlr wrapper object
get_RF <- getLearnerModel(model = train_RF)
Warning messages:
1: Unknown or uninitialised column: 'Class'.
2: Unknown or uninitialised column: 'Class'.
# Plot the fitted random forest model
plot(get_RF)
# Legend labels come from the column names of the model's err.rate matrix
Class_wine <- colnames(get_RF$err.rate)
# seq_along() numbers the entries 1..n and, unlike 1:length(x), yields an
# empty sequence rather than c(1, 0) when there are no entries
legend("topright", Class_wine,
  col = seq_along(Class_wine),
  lty = seq_along(Class_wine)
)
# Resampling description for the cross-validation of the tuned learner:
# repeated CV with 10 folds and 5 repetitions, stratified. TRUE is spelled
# out because the shorthand T can be reassigned.
outer <- makeResampleDesc("RepCV", reps = 5, stratify = TRUE, folds = 10)
Warning messages:
1: Unknown or uninitialised column: 'Class'.
2: Unknown or uninitialised column: 'Class'.
# Wrap the random forest learner so that hyperparameter tuning (random search
# over `paraspace`) is carried out as part of fitting the learner.
# NOTE(review): `outer` is passed here as the wrapper's own resampling and is
# also the resampling given to resample() on this wrapper, so the same
# repeated-CV description serves both levels. Confirm this is intended.
forestWrapper <- makeTuneWrapper(
  learner = "classif.randomForest",
  resampling = outer,
  par.set = paraspace,
  control = randsearch
)
# Start socket-based parallelization with one worker per detected core
parallelStartSocket(cpus = detectCores())
Starting parallelization in mode=socket with cpus=8.
# Cross-validate the whole tune-then-fit procedure on the wine task
cvWithTuning <- resample(
  learner = forestWrapper,
  task = RF_task,
  resampling = outer
)
Exporting objects to slaves for mode socket: .mlr.slave.options
Resampling: repeated cross-validation
Measures: mmce
Mapping in parallel: mode = socket; level = mlr.resample; cpus = 8; elements = 50.
Aggregated Result: mmce.test.mean=0.0190334
# Shut the parallel workers down now that resampling has finished
parallelMap::parallelStop()
Stopped parallelization. All cleaned up.
# Show the resampling result
print(cvWithTuning)
Resample Result
Task: df
Learner: classif.randomForest.tuned
Aggr perf: mmce.test.mean=0.0190334
Runtime: 1927.93