Building a classification model for the iris dataset using Random Forest.
#install.packages("randomForest")
#install.packages("Mass")
#install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Use set.seed so that we get the same results each time
set.seed(123)
data(iris)
View(iris)
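# View opens an interactive spreadsheet viewer; for a quick non-interactive look at the
# data, str and table are an optional sketch of the usual checks:
str(iris)
table(iris$Species) # 50 observations per species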
# Split the data into training and testing sets. Since the rows are ordered by species,
# split within each species so both sets contain all three classes.
iris_setosa<-iris[iris$Species=="setosa",] # 50
iris_versicolor <- iris[iris$Species=="versicolor",] # 50
iris_virginica <- iris[iris$Species=="virginica",] # 50
iris_train <- rbind(iris_setosa[1:25,],iris_versicolor[1:25,],iris_virginica[1:25,])
iris_test <- rbind(iris_setosa[26:50,],iris_versicolor[26:50,],iris_virginica[26:50,])
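# As an optional sketch, caret's createDataPartition does a stratified random split
# (sampling within each Species), which avoids the manual per-class indexing above;
# iris_train2/iris_test2 are hypothetical names used only for illustration.
idx <- createDataPartition(iris$Species, p = 0.5, list = FALSE)
iris_train2 <- iris[idx, ]
iris_test2  <- iris[-idx, ]
table(iris_train2$Species) # roughly 25 of each species, drawn at random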
rf <- randomForest(Species~., data=iris_train)
rf # Summary of the random forest: number of trees and mtry, the number of variables tried at each split of a tree node
##
## Call:
## randomForest(formula = Species ~ ., data = iris_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 25 0 0 0.00
## versicolor 0 24 1 0.04
## virginica 0 2 23 0.08
# The out-of-bag (OOB) estimate of the error rate for this Random Forest model is 4%.
attributes(rf)
## $names
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Prediction and Confusion Matrix - Training data
pred1 <- predict(rf, iris_train)
head(pred1)
## 1 2 3 4 5 6
## setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
head(iris_train$Species)
## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
# The first six predicted values match the original values.
confusionMatrix(pred1, iris_train$Species) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 25 0
## virginica 0 0 25
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.952, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
# The 95% confidence interval for accuracy is (0.952, 1).
# Sensitivity for all three species is 100%.
# Prediction and Confusion Matrix - Test data
pred2 <- predict(rf, iris_test)
confusionMatrix(pred2, iris_test$Species) # 94.67 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 23 2
## virginica 0 2 23
##
## Overall Statistics
##
## Accuracy : 0.9467
## 95% CI : (0.869, 0.9853)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.92
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9200 0.9200
## Specificity 1.0000 0.9600 0.9600
## Pos Pred Value 1.0000 0.9200 0.9200
## Neg Pred Value 1.0000 0.9600 0.9600
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3067 0.3067
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9400 0.9400
# Error Rate in Random Forest Model :
plot(rf)

# The error rate levels off at around 200 trees and does not change much beyond that.
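# plot(rf) draws the OOB error plus one line per class but no legend; one way to add
# one (a sketch, assuming the default matplot colours and line types):
plot(rf, main = "Error rate vs. number of trees")
legend("topright", legend = colnames(rf$err.rate), col = 1:ncol(rf$err.rate),
       lty = 1:ncol(rf$err.rate), cex = 0.8)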
# Tune Random Forest Model mtry
tune <- tuneRF(iris_train[,-5], iris_train[,5], stepFactor = 0.5, plot = TRUE, ntreeTry = 300,
trace = TRUE, improve = 0.05)
## mtry = 2 OOB error = 4%
## Searching left ...
## mtry = 4 OOB error = 4%
## 0 0.05
## Searching right ...
## mtry = 1 OOB error = 4%
## 0 0.05
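# tuneRF returns a matrix with columns mtry and OOBError; the best mtry can be read off
# programmatically (a small sketch; here all candidates tie at 4%):
tune
best_mtry <- tune[which.min(tune[, "OOBError"]), "mtry"]
best_mtry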

rf1 <- randomForest(Species~., data=iris_train, ntree = 140, mtry = 2, importance = TRUE,
proximity = TRUE)
rf1 # with the tuned values, the OOB error estimate is still 4 %
##
## Call:
## randomForest(formula = Species ~ ., data = iris_train, ntree = 140, mtry = 2, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 140
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 25 0 0 0.00
## versicolor 0 24 1 0.04
## virginica 0 2 23 0.08
pred1 <- predict(rf1, iris_train)
confusionMatrix(pred1, iris_train$Species) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 25 0
## virginica 0 0 25
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.952, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
# The 95% confidence interval for accuracy is (0.952, 1).
# Sensitivity for all three species is 100%.
# Test data prediction using the tuned rf1 model
pred2 <- predict(rf1, iris_test)
confusionMatrix(pred2, iris_test$Species) # 96 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 25 0 0
## versicolor 0 23 1
## virginica 0 2 24
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.8875, 0.9917)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9200 0.9600
## Specificity 1.0000 0.9800 0.9600
## Pos Pred Value 1.0000 0.9583 0.9231
## Neg Pred Value 1.0000 0.9608 0.9796
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3067 0.3200
## Detection Prevalence 0.3333 0.3200 0.3467
## Balanced Accuracy 1.0000 0.9500 0.9600
# The 95% confidence interval for accuracy is (0.8875, 0.9917).
# Sensitivity is 100% for setosa, 92% for versicolor and 96% for virginica.
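# Rather than reading the numbers off the printout, they can be extracted from the
# confusionMatrix object itself (a sketch; cm2 is a hypothetical name):
cm2 <- confusionMatrix(pred2, iris_test$Species)
cm2$overall["Accuracy"]      # 0.96
cm2$byClass[, "Sensitivity"] # per-class sensitivities quoted above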
# Number of nodes in the trees
hist(treesize(rf1), main = "No of Nodes for the trees", col = "green")

# The majority of the trees have close to 40 nodes.
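# A numeric summary of the tree sizes backs up the histogram (an optional sketch):
summary(treesize(rf1))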
# Variable Importance :
varImpPlot(rf1)

# The Mean Decrease Accuracy plot shows how much worse the model performs when each variable is left out:
# Petal.Length and Petal.Width are the most important variables for prediction, while Sepal.Width has the
# lowest score.
# The Mean Decrease Gini plot shows how much, on average, node impurity (Gini) decreases when a variable
# is used for splitting: Petal.Width is very important and Sepal.Width is the least important.
varImpPlot(rf1, sort = TRUE, n.var = 4, main = "Top 4 - Variable Importance")

# Quantitative values
importance(rf1)
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 3.193412 2.7848108 3.320912 4.199282
## Sepal.Width 2.082868 0.9835657 3.196945 2.454948
## Petal.Length 11.346951 15.1235002 13.314110 16.841821
## Petal.Width 12.239102 16.2323228 16.007636 19.431241
## MeanDecreaseGini
## Sepal.Length 2.897292
## Sepal.Width 1.431492
## Petal.Length 20.557043
## Petal.Width 24.442268
varUsed(rf) # which predictor variables are actually used in the random forest.
## [1] 355 316 858 813
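# varUsed returns one count per predictor, in column order; pairing the counts with the
# variable names makes the output easier to read (a sketch):
setNames(varUsed(rf), names(iris_train)[1:4])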
# Partial Dependence Plot
partialPlot(rf1, iris_train, Petal.Length, "versicolor")

partialPlot(rf1, iris_train, Petal.Length, "setosa")

partialPlot(rf1, iris_train, Petal.Length, "virginica")

# If Petal.Length is between about 2.5 and 5.5 cm, the prediction tends towards versicolor.
# If Petal.Length is between about 1 and 3 cm, the prediction tends towards setosa.
# If Petal.Length is greater than about 5 cm, the prediction tends towards virginica.
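# The three partial dependence plots can also be drawn side by side with a simple loop
# over the class labels (an optional sketch):
par(mfrow = c(1, 3))
for (sp in levels(iris_train$Species)) {
  partialPlot(rf1, iris_train, Petal.Length, which.class = sp,
              main = paste("Partial dependence -", sp))
}
par(mfrow = c(1, 1))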
# Extract single tree from the forest :
tr1 <- getTree(rf1, 1, labelVar = TRUE)
tr1
## left daughter right daughter split var split point status prediction
## 1 2 3 Petal.Length 2.60 1 <NA>
## 2 0 0 <NA> 0.00 -1 setosa
## 3 4 5 Petal.Length 4.95 1 <NA>
## 4 0 0 <NA> 0.00 -1 versicolor
## 5 0 0 <NA> 0.00 -1 virginica
# Multi Dimension scaling plot of proximity Matrix
MDSplot(rf1, iris_train$Species)
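# MDSplot is essentially cmdscale(1 - proximity); the coordinates can also be computed
# by hand when more control over the plot is needed (a sketch):
mds <- cmdscale(1 - rf1$proximity, k = 2)
plot(mds, col = as.integer(iris_train$Species), pch = 19,
     xlab = "Dim 1", ylab = "Dim 2", main = "MDS of random forest proximity")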
