This material is for practising machine learning with the mlr
package. If you want to learn more about mlr, please see this link.
In this tutorial, discriminant analysis classification models (linear and, later, quadratic)
will be built to predict the three classes in a dataset called wine
from the HDclassif
package. Let’s get started.
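If any of these packages are missing, they can be installed first (a setup sketch; dplyr and ggplot2 are included here because they are used for the plot further down):
install.packages(c("HDclassif", "tibble", "mlr", "dplyr", "ggplot2")) # Skip if already installed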
# Predicting wine classes
library(HDclassif) # Wine data comes from this package
library(tibble)
library(mlr)       # Machine learning with R
library(dplyr)     # Used later for mutate() and the pipe
library(ggplot2)   # Used later for the discriminant plot
data(wine, package = "HDclassif")
wine_tibble<-as_tibble(wine)
head(wine_tibble) # Look at first few rows
# Rename the columns of wine_tibble
names(wine_tibble) <- c("Class", "Alco", "Malic", "Ash", "Alk", "Mag",
"Phe", "Flav", "Non_flav", "Proan", "Col", "Hue",
"OD", "Prol")
head(wine_tibble)
This dataset has 14 columns and 178 rows (cases). Now it is time to prepare the data and build a model.
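Before creating the task, a quick sanity check of the dimensions and class balance is useful. Note that mlr expects a factor target for classification, so the Class column is converted here in case it is still stored as an integer (a short sketch using the objects above):
dim(wine_tibble)                                   # 178 rows, 14 columns
table(wine_tibble$Class)                           # 59, 71 and 48 cases in classes 1, 2 and 3
wine_tibble$Class <- as.factor(wine_tibble$Class)  # makeClassifTask() needs a factor target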
# Create a task, learner and train the model
LDA_task<-makeClassifTask(data=wine_tibble, target = "Class")
Provided data is not a pure data.frame but from class tbl_df, hence it will be converted.
LDA_learner<-makeLearner("classif.lda")
train_LDA<-train(LDA_learner,LDA_task)
get_learner1<-getLearnerModel(train_LDA)
get_learner1
Call:
lda(f, data = getTaskData(.task, .subset))
Prior probabilities of groups:
1 2 3
0.3314607 0.3988764 0.2696629
Group means:
Alco Malic Ash Alk Mag Phe Flav Non_flav Proan Col Hue OD
1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729 0.290000 1.899322 5.528305 1.0620339 3.157797
2 12.27873 1.932676 2.244789 20.23803 94.5493 2.258873 2.0808451 0.363662 1.630282 3.086620 1.0562817 2.785352
3 13.15375 3.333750 2.437083 21.41667 99.3125 1.678750 0.7814583 0.447500 1.153542 7.396250 0.6827083 1.683542
Prol
1 1115.7119
2 519.5070
3 629.8958
Coefficients of linear discriminants:
LD1 LD2
Alco -0.403399781 0.8717930699
Malic 0.165254596 0.3053797325
Ash -0.369075256 2.3458497486
Alk 0.154797889 -0.1463807654
Mag -0.002163496 -0.0004627565
Phe 0.618052068 -0.0322128171
Flav -1.661191235 -0.4919980543
Non_flav -1.495818440 -1.6309537953
Proan 0.134092628 -0.3070875776
Col 0.355055710 0.2532306865
Hue -0.818036073 -1.5156344987
OD -1.157559376 0.0511839665
Prol -0.002691206 0.0028529846
Proportion of trace:
LD1 LD2
0.6875 0.3125
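The proportion of trace reported above can be reproduced from the fitted MASS::lda object itself, which stores the singular values of the discriminant functions (a small sketch using get_learner1 from above):
get_learner1$svd^2 / sum(get_learner1$svd^2) # Between-group variance explained by LD1 and LD2: ~0.69 and ~0.31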
pred_LDA<-predict(get_learner1) # Predict with the underlying MASS::lda model
head(pred_LDA$x)                # Discriminant function scores (LD1 and LD2)
LD1 LD2
1 -4.700244 1.9791383
2 -4.301958 1.1704129
3 -3.420720 1.4291014
4 -4.205754 4.0028715
5 -1.509982 0.4512239
6 -4.518689 3.2131376
wine_tibble %>%
  mutate(LD1 = pred_LDA$x[, 1], LD2 = pred_LDA$x[, 2]) %>%
  ggplot(aes(x = LD1, y = LD2, col = Class)) +
  geom_point(size = 2) + stat_ellipse() + theme_bw()
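As an optional alternative to the manual ggplot above, mlr provides a convenience function that plots predictions over a grid of two chosen features (a sketch; picking Alco and Flav here is just an illustrative choice):
plotLearnerPrediction(LDA_learner, LDA_task, features = c("Alco", "Flav")) # Decision regions for two features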
Kfold<-makeResampleDesc(method = "RepCV", folds = 10, reps = 5, stratify = TRUE) # Stratified 10-fold CV repeated 5 times (50 iterations)
LDA_kfold<-resample(LDA_learner, LDA_task, resampling = Kfold, measures = list(mmce, acc))
Resampling: repeated cross-validation
Measures: mmce acc
[Resample] iter 1: 0.0000000 1.0000000
[Resample] iter 2: 0.0000000 1.0000000
[Resample] iter 3: 0.0000000 1.0000000
[Resample] iter 4: 0.0000000 1.0000000
[Resample] iter 5: 0.0555556 0.9444444
[Resample] iter 6: 0.0000000 1.0000000
[Resample] iter 7: 0.0000000 1.0000000
[Resample] iter 8: 0.0555556 0.9444444
[Resample] iter 9: 0.0000000 1.0000000
[Resample] iter 10: 0.0000000 1.0000000
[Resample] iter 11: 0.0000000 1.0000000
[Resample] iter 12: 0.0555556 0.9444444
[Resample] iter 13: 0.0000000 1.0000000
[Resample] iter 14: 0.0000000 1.0000000
[Resample] iter 15: 0.0000000 1.0000000
[Resample] iter 16: 0.0588235 0.9411765
[Resample] iter 17: 0.0555556 0.9444444
[Resample] iter 18: 0.0000000 1.0000000
[Resample] iter 19: 0.0526316 0.9473684
[Resample] iter 20: 0.0000000 1.0000000
[Resample] iter 21: 0.0000000 1.0000000
[Resample] iter 22: 0.0000000 1.0000000
[Resample] iter 23: 0.0555556 0.9444444
[Resample] iter 24: 0.0000000 1.0000000
[Resample] iter 25: 0.0000000 1.0000000
[Resample] iter 26: 0.0000000 1.0000000
[Resample] iter 27: 0.0555556 0.9444444
[Resample] iter 28: 0.0555556 0.9444444
[Resample] iter 29: 0.0000000 1.0000000
[Resample] iter 30: 0.0000000 1.0000000
[Resample] iter 31: 0.0555556 0.9444444
[Resample] iter 32: 0.0000000 1.0000000
[Resample] iter 33: 0.0000000 1.0000000
[Resample] iter 34: 0.0000000 1.0000000
[Resample] iter 35: 0.0000000 1.0000000
[Resample] iter 36: 0.0000000 1.0000000
[Resample] iter 37: 0.0555556 0.9444444
[Resample] iter 38: 0.0000000 1.0000000
[Resample] iter 39: 0.0555556 0.9444444
[Resample] iter 40: 0.0000000 1.0000000
[Resample] iter 41: 0.0000000 1.0000000
[Resample] iter 42: 0.0000000 1.0000000
[Resample] iter 43: 0.0000000 1.0000000
[Resample] iter 44: 0.0555556 0.9444444
[Resample] iter 45: 0.0000000 1.0000000
[Resample] iter 46: 0.0000000 1.0000000
[Resample] iter 47: 0.0000000 1.0000000
[Resample] iter 48: 0.0000000 1.0000000
[Resample] iter 49: 0.0000000 1.0000000
[Resample] iter 50: 0.0588235 0.9411765
Aggregated Result: mmce.test.mean=0.0156278,acc.test.mean=0.9843722
LDA_kfold$aggr
mmce.test.mean acc.test.mean
0.01562779 0.98437221
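The cross-validated accuracy is very high. To see where the few misclassifications fall, a confusion matrix can be computed from the pooled resampling predictions (a sketch; the exact counts will vary with the random splits):
calculateConfusionMatrix(LDA_kfold$pred, relative = TRUE) # Confusion matrix over all CV predictions
Next, a quadratic discriminant analysis (QDA) model is fitted on the same task for comparison.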
QDA_learner<-makeLearner("classif.qda") # Set the learner
train_QDA<-train(QDA_learner,LDA_task) # Train the quadratic discriminant analysis
QDA_check<-resample(QDA_learner, LDA_task, resampling = Kfold, measures = list(mmce, acc))
Resampling: repeated cross-validation
Measures: mmce acc
[Resample] iter 1: 0.0000000 1.0000000
[Resample] iter 2: 0.0000000 1.0000000
[Resample] iter 3: 0.0000000 1.0000000
[Resample] iter 4: 0.0555556 0.9444444
[Resample] iter 5: 0.0000000 1.0000000
[Resample] iter 6: 0.0588235 0.9411765
[Resample] iter 7: 0.0000000 1.0000000
[Resample] iter 8: 0.0000000 1.0000000
[Resample] iter 9: 0.0000000 1.0000000
[Resample] iter 10: 0.0000000 1.0000000
[Resample] iter 11: 0.0000000 1.0000000
[Resample] iter 12: 0.0555556 0.9444444
[Resample] iter 13: 0.0000000 1.0000000
[Resample] iter 14: 0.0000000 1.0000000
[Resample] iter 15: 0.0000000 1.0000000
[Resample] iter 16: 0.0000000 1.0000000
[Resample] iter 17: 0.0000000 1.0000000
[Resample] iter 18: 0.0000000 1.0000000
[Resample] iter 19: 0.0000000 1.0000000
[Resample] iter 20: 0.0555556 0.9444444
[Resample] iter 21: 0.0000000 1.0000000
[Resample] iter 22: 0.0000000 1.0000000
[Resample] iter 23: 0.0000000 1.0000000
[Resample] iter 24: 0.0000000 1.0000000
[Resample] iter 25: 0.0555556 0.9444444
[Resample] iter 26: 0.0000000 1.0000000
[Resample] iter 27: 0.0000000 1.0000000
[Resample] iter 28: 0.0000000 1.0000000
[Resample] iter 29: 0.0000000 1.0000000
[Resample] iter 30: 0.0000000 1.0000000
[Resample] iter 31: 0.0000000 1.0000000
[Resample] iter 32: 0.0000000 1.0000000
[Resample] iter 33: 0.0000000 1.0000000
[Resample] iter 34: 0.0555556 0.9444444
[Resample] iter 35: 0.0000000 1.0000000
[Resample] iter 36: 0.0588235 0.9411765
[Resample] iter 37: 0.0000000 1.0000000
[Resample] iter 38: 0.0526316 0.9473684
[Resample] iter 39: 0.0000000 1.0000000
[Resample] iter 40: 0.0000000 1.0000000
[Resample] iter 41: 0.0000000 1.0000000
[Resample] iter 42: 0.0000000 1.0000000
[Resample] iter 43: 0.0000000 1.0000000
[Resample] iter 44: 0.0000000 1.0000000
[Resample] iter 45: 0.0000000 1.0000000
[Resample] iter 46: 0.0000000 1.0000000
[Resample] iter 47: 0.0000000 1.0000000
[Resample] iter 48: 0.0555556 0.9444444
[Resample] iter 49: 0.0000000 1.0000000
[Resample] iter 50: 0.0000000 1.0000000
Aggregated Result: mmce.test.mean=0.0100722,acc.test.mean=0.9899278
QDA_check$aggr
mmce.test.mean acc.test.mean
0.01007224 0.98992776
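QDA reaches a slightly higher mean accuracy here (0.990 versus 0.984 for LDA), although the two resampling runs above used different random splits. To compare both learners in one call, mlr's benchmark() can be used (a sketch reusing the objects defined above; the resample description is instantiated once for the task, so both learners should be evaluated on the same folds):
bench <- benchmark(list(LDA_learner, QDA_learner), LDA_task,
                   resamplings = Kfold, measures = list(mmce, acc))
bench # Aggregated mmce and acc for both learners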