Building a classification model for the iris dataset using Random Forest.
#install.packages("randomForest")
#install.packages("Mass")
#install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Use set.seed so that we get the same results each time
set.seed(123)
data(iris)
View(iris)
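# View opens an interactive spreadsheet viewer; for a quick non-interactive look at the
# data, str and table are an optional sketch of the usual checks:
str(iris)
table(iris$Species) # 50 observations per species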
# Split the data into training and testing sets. Since the rows are ordered by species,
# split within each species so both sets contain all three classes.
iris_setosa<-iris[iris$Species=="setosa",] # 50
iris_versicolor <- iris[iris$Species=="versicolor",] # 50
iris_virginica <- iris[iris$Species=="virginica",] # 50
iris_train <- rbind(iris_setosa[1:25,],iris_versicolor[1:25,],iris_virginica[1:25,])
iris_test <- rbind(iris_setosa[26:50,],iris_versicolor[26:50,],iris_virginica[26:50,])
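# As an optional sketch, caret's createDataPartition does a stratified random split
# (sampling within each Species), which avoids the manual per-class indexing above;
# iris_train2/iris_test2 are hypothetical names used only for illustration.
idx <- createDataPartition(iris$Species, p = 0.5, list = FALSE)
iris_train2 <- iris[idx, ]
iris_test2  <- iris[-idx, ]
table(iris_train2$Species) # roughly 25 of each species, drawn at random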
rf <- randomForest(Species~., data=iris_train)
rf # Summary of the random forest: number of trees and mtry, the number of variables tried at each split of a tree node
##
## Call:
## randomForest(formula = Species ~ ., data = iris_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 25 0 0 0.00
## versicolor 0 24 1 0.04
## virginica 0 2 23 0.08
# The out-of-bag (OOB) estimate of the error rate for this Random Forest model is 4%.
attributes(rf)
## $names
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Prediction and Confusion Matrix - Training data
pred1 <- predict(rf, iris_train)
head(pred1)
## 1 2 3 4 5 6
## setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
head(iris_train$Species)
## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
# The first six predicted values match the original values.
confusionMatrix(pred1, iris_train$Species) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 25 0
## virginica 0 0 25
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.952, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
# The 95% confidence interval for accuracy is (0.952, 1).
# Sensitivity for all three species is 100%.
# Prediction and Confusion Matrix - Test data
pred2 <- predict(rf, iris_test)
confusionMatrix(pred2, iris_test$Species) # 94.67 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 23 2
## virginica 0 2 23
##
## Overall Statistics
##
## Accuracy : 0.9467
## 95% CI : (0.869, 0.9853)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.92
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9200 0.9200
## Specificity 1.0000 0.9600 0.9600
## Pos Pred Value 1.0000 0.9200 0.9200
## Neg Pred Value 1.0000 0.9600 0.9600
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3067 0.3067
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9400 0.9400
# Error Rate in Random Forest Model :
plot(rf)

# The error rate levels off at around 200 trees and does not change much beyond that.
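# plot(rf) draws the OOB error plus one line per class but no legend; one way to add
# one (a sketch, assuming the default matplot colours and line types):
plot(rf, main = "Error rate vs. number of trees")
legend("topright", legend = colnames(rf$err.rate), col = 1:ncol(rf$err.rate),
       lty = 1:ncol(rf$err.rate), cex = 0.8)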
# Tune Random Forest Model mtry
tune <- tuneRF(iris_train[,-5], iris_train[,5], stepFactor = 0.5, plot = TRUE, ntreeTry = 300,
trace = TRUE, improve = 0.05)
## mtry = 2 OOB error = 4%
## Searching left ...
## mtry = 4 OOB error = 4%
## 0 0.05
## Searching right ...
## mtry = 1 OOB error = 4%
## 0 0.05
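# tuneRF returns a matrix with columns mtry and OOBError; the best mtry can be read off
# programmatically (a small sketch; here all candidates tie at 4%):
tune
best_mtry <- tune[which.min(tune[, "OOBError"]), "mtry"]
best_mtry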

rf1 <- randomForest(Species~., data=iris_train, ntree = 140, mtry = 2, importance = TRUE,
proximity = TRUE)
rf1 # with the tuned values, the OOB error estimate is still 4 %
##
## Call:
## randomForest(formula = Species ~ ., data = iris_train, ntree = 140, mtry = 2, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 140
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 25 0 0 0.00
## versicolor 0 24 1 0.04
## virginica 0 2 23 0.08
pred1 <- predict(rf1, iris_train)
confusionMatrix(pred1, iris_train$Species) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 25 0
## virginica 0 0 25
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.952, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
# The 95% confidence interval for accuracy is (0.952, 1).
# Sensitivity for all three species is 100%.
# Test data prediction using the tuned rf1 model
pred2 <- predict(rf1, iris_test)
confusionMatrix(pred2, iris_test$Species) # 96 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 23 1
## virginica 0 2 24
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.8875, 0.9917)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9200 0.9600
## Specificity 1.0000 0.9800 0.9600
## Pos Pred Value 1.0000 0.9583 0.9231
## Neg Pred Value 1.0000 0.9608 0.9796
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3067 0.3200
## Detection Prevalence 0.3333 0.3200 0.3467
## Balanced Accuracy 1.0000 0.9500 0.9600
# The 95% confidence interval for accuracy is (0.8875, 0.9917).
# Sensitivity is 100% for setosa, 92% for versicolor and 96% for virginica.
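# Rather than reading the numbers off the printout, they can be extracted from the
# confusionMatrix object itself (a sketch; cm2 is a hypothetical name):
cm2 <- confusionMatrix(pred2, iris_test$Species)
cm2$overall["Accuracy"]      # 0.96
cm2$byClass[, "Sensitivity"] # per-class sensitivities quoted above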
# Number of nodes in the trees
hist(treesize(rf1), main = "No of Nodes for the trees", col = "green")

# The majority of the trees have close to 40 nodes.
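# A numeric summary of the tree sizes backs up the histogram (an optional sketch):
summary(treesize(rf1))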
# Variable Importance :
varImpPlot(rf1)

# The Mean Decrease Accuracy plot shows how much worse the model performs when each variable is left out:
# Petal.Length and Petal.Width are the most important variables for prediction, while Sepal.Width has the
# lowest score.
# The Mean Decrease Gini plot shows how much, on average, node impurity (Gini) decreases when a variable
# is used for splitting: Petal.Width is very important and Sepal.Width is the least important.
varImpPlot(rf1, sort = TRUE, n.var = 4, main = "Top 4 - Variable Importance")

# Quantitative values
importance(rf1)
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 3.193412 2.7848108 3.320912 4.199282
## Sepal.Width 2.082868 0.9835657 3.196945 2.454948
## Petal.Length 11.346951 15.1235002 13.314110 16.841821
## Petal.Width 12.239102 16.2323228 16.007636 19.431241
## MeanDecreaseGini
## Sepal.Length 2.897292
## Sepal.Width 1.431492
## Petal.Length 20.557043
## Petal.Width 24.442268
varUsed(rf) # which predictor variables are actually used in the random forest.
## [1] 355 316 858 813
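# varUsed returns one count per predictor, in column order; pairing the counts with the
# variable names makes the output easier to read (a sketch):
setNames(varUsed(rf), names(iris_train)[1:4])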
# Partial Dependence Plot
partialPlot(rf1, iris_train, Petal.Length, "versicolor")

partialPlot(rf1, iris_train, Petal.Length, "setosa")

partialPlot(rf1, iris_train, Petal.Length, "virginica")

# If Petal.Length is between about 2.5 and 5.5 cm, the prediction tends towards versicolor.
# If Petal.Length is between about 1 and 3 cm, the prediction tends towards setosa.
# If Petal.Length is greater than about 5 cm, the prediction tends towards virginica.
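# The three partial dependence plots can also be drawn side by side with a simple loop
# over the class labels (an optional sketch):
par(mfrow = c(1, 3))
for (sp in levels(iris_train$Species)) {
  partialPlot(rf1, iris_train, Petal.Length, which.class = sp,
              main = paste("Partial dependence -", sp))
}
par(mfrow = c(1, 1))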
# Extract single tree from the forest :
tr1 <- getTree(rf1, 1, labelVar = TRUE)
tr1
## left daughter right daughter split var split point status prediction
## 1 2 3 Petal.Length 2.60 1 <NA>
## 2 0 0 <NA> 0.00 -1 setosa
## 3 4 5 Petal.Length 4.95 1 <NA>
## 4 0 0 <NA> 0.00 -1 versicolor
## 5 0 0 <NA> 0.00 -1 virginica
# Multi Dimension scaling plot of proximity Matrix
MDSplot(rf1, iris_train$Species)
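# MDSplot is essentially cmdscale(1 - proximity); the coordinates can also be computed
# by hand when more control over the plot is needed (a sketch):
mds <- cmdscale(1 - rf1$proximity, k = 2)
plot(mds, col = as.integer(iris_train$Species), pch = 19,
     xlab = "Dim 1", ylab = "Dim 2", main = "MDS of random forest proximity")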
