Iris Dataset
# Paul Brown Random Forest Project
# Load Libraries
library(stats)
library(caret)
library(randomForest)
library(mlbench)
library(dplyr)
library(tidyverse)
# Load and Inspect Data
data("iris")
head(iris)
View(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Variables
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
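Before splitting, a quick sanity check (a sketch, not part of the original output) confirms the dataset has no missing values and that the three species are balanced.
# Optional data checks (illustrative; output not shown)
colSums(is.na(iris))   # expect all zeros: iris has no missing values
table(iris$Species)    # expect 50 observations per species
summary(iris)          # ranges and quartiles of the four measurements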
# Split Data into Training and Testing
index <- sample(2,nrow(iris),replace = TRUE, prob=c(0.7,0.3))
# Training data
Training <- iris[index==1, ]
# Testing data
Testing <- iris[index==2, ]
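The sample()-based split above is neither seeded nor stratified, so each run can give slightly different class proportions. A stratified alternative using caret's createDataPartition() is sketched below; the object names (strat_idx, Training_strat, Testing_strat) are illustrative, and the results reported later were not produced with this split.
# Alternative: seeded, stratified 70/30 split with caret (not used for the output below)
set.seed(123)                                               # any fixed seed gives a reproducible split
strat_idx      <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
Training_strat <- iris[strat_idx, ]                         # ~35 rows per species
Testing_strat  <- iris[-strat_idx, ]                        # ~15 rows per species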
# Random Forest Model
rf <- randomForest(Species ~ ., data = Training)
print(rf)
## randomForest(formula = Species ~ ., data = Training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
## OOB estimate of error rate: 3.96%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 37 0 0 0.00000000
## versicolor 0 31 2 0.06060606
## virginica 0 2 29 0.06451613
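The model above relies on randomForest's defaults. For reference, the same fit with those defaults written out, plus permutation importance enabled, is sketched here (rf_explicit is an illustrative name, not part of the original script).
# Same model with the default hyperparameters made explicit (illustrative)
rf_explicit <- randomForest(Species ~ ., data = Training,
                            ntree = 500,        # default number of trees
                            mtry = 2,           # default for classification: floor(sqrt(4 predictors)) = 2
                            importance = TRUE)  # also compute permutation importance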
# Importance
importance(rf)
varImpPlot(rf)
## MeanDecreaseGini
## Sepal.Length 6.183899
## Sepal.Width 1.426547
## Petal.Length 30.243872
## Petal.Width 28.581184
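Because rf was fit without importance = TRUE, importance(rf) reports only MeanDecreaseGini. Permutation-based importance can be read from the rf_explicit sketch above.
# Permutation importance (requires importance = TRUE at fit time)
importance(rf_explicit, type = 1)   # type = 1: mean decrease in accuracy
varImpPlot(rf_explicit)             # plots both importance measures side by side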
# Evaluate Model Accuracy on Test Data
# Confusion Matrix
pred_1 <- predict(rf,Testing)
confusionMatrix(pred_1,Testing$Species)
# Confusion Matrix and Statistics (Testing Data)
## Reference
## Prediction setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 15 0
## virginica 0 2 19
## Overall Statistics
## Accuracy : 0.9592
## 95% CI : (0.8602, 0.995)
## No Information Rate : 0.3878
## P-Value [Acc > NIR] : < 2.2e-16
## Kappa : 0.9379
## Mcnemar's Test P-Value : NA
## Statistics by Class:
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8824           1.0000
## Specificity                 1.0000            1.0000           0.9333
## Pos Pred Value              1.0000            1.0000           0.9048
## Neg Pred Value              1.0000            0.9412           1.0000
## Prevalence                  0.2653            0.3469           0.3878
## Detection Rate              0.2653            0.3061           0.3878
## Detection Prevalence        0.2653            0.3061           0.4286
## Balanced Accuracy           1.0000            0.9412           0.9667
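The reported accuracy can also be checked directly against the predictions with a one-line calculation:
# Quick check of overall test accuracy
mean(pred_1 == Testing$Species)   # 47/49 correct, about 0.9592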
# Error Rate
plot(rf)
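plot(rf) draws the OOB error and one error curve per class against the number of trees, but it does not include a legend; one can be added to the plot just drawn:
# Add a legend to the error-rate plot drawn above
legend("topright",
       legend = colnames(rf$err.rate),    # "OOB" plus the three species
       col = 1:ncol(rf$err.rate),
       lty = 1:ncol(rf$err.rate))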
# Random Search
control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
seed <- 7                          # any fixed value; set.seed() makes the tuning run reproducible
metric <- "Accuracy"               # metric used by train() to select the best mtry
set.seed(seed)
mtry <- sqrt(ncol(Training) - 1)   # randomForest's default mtry for 4 predictors (= 2); random search explores other values
rf_random <- train(Species~., data=Training, method="rf", metric=metric, tuneLength=15, trControl=control)
print(rf_random)
plot(rf_random)
## Random Forest
## 101 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 90, 91, 91, 92, 90, 91, ...
## Resampling results across tuning parameters:
##   mtry  Accuracy   Kappa
##   2     0.9772054  0.9656726
##   3     0.9805387  0.9708008
##   4     0.9906061  0.9858582
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
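A grid search over every possible mtry value (1 through 4 for this dataset) is the usual complement to the random search above; a sketch using the same repeated cross-validation setup follows (control_grid and rf_grid are illustrative names).
# Grid search over mtry (illustrative; output not shown)
control_grid <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "grid")
tunegrid <- expand.grid(mtry = 1:4)        # all candidate values for 4 predictors
set.seed(seed)
rf_grid <- train(Species ~ ., data = Training, method = "rf", metric = metric,
                 tuneGrid = tunegrid, trControl = control_grid)
print(rf_grid)
plot(rf_grid)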
# Predicted vs. Observed Classes (Testing Data)
predicted_table <- predict(rf, Testing[,-5])
table(observed= Testing[,5], predicted = predicted_table)
## predicted
## observed setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 15 2
## virginica 0 0 19
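The overall test accuracy can also be recovered from this cross-tabulation by dividing the diagonal (correct predictions) by the total:
# Accuracy from the cross-tabulation
tab <- table(observed = Testing[, 5], predicted = predicted_table)
sum(diag(tab)) / sum(tab)   # 47/49, about 0.96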