1 Objective

To classify iris flowers by species based on their measured characteristics (sepal and petal length and width).

2 Algorithm development & Testing

2.1 Initial setup

2.1.1 Load libraries

Data cleansing, tidying, transformation libraries. Plotting libraries.

# Load required libraries
library(dplyr); library(tidyr); library(ggplot2)

Specialised libraries for machine learning

# Load the modelling packages, one per line
library(caret)
library(randomForest)
library(rattle)
library(rpart.plot)
# Set the seed so that the results are reproducible
set.seed(1000)

2.2 Model development

2.2.0.1 Splitting data into Training and Testing

# Load the built-in iris data and keep a working copy named `dataset`
data(iris)
dataset <- iris

# Partition the rows on Species:
# 70% go to the training dataset and 30% to the testing dataset
inTrain <- createDataPartition(y = dataset$Species, p = 0.7, list = FALSE)
training <- dataset[inTrain, ]
testing <- dataset[-inTrain, ]

# Report the dimensions (rows, columns) of each subset
dim(training)
dim(testing)
## [1] 105   5
## [1] 45  5

2.3 Model development using “rpart”

2.3.1 Model definition

# Fit an rpart classification tree through caret's train(), predicting
# Species from every other column of the training set
modFit <- train(Species ~ ., data = training, method = "rpart")

2.3.2 Plotting the classification tree, the fancy style

# Draw the final rpart model stored inside the caret fit
library(rattle)
library(rpart.plot)
fancyRpartPlot(model = modFit$finalModel)

2.3.3 Model validation

2.3.3.1 Training set accuracy (In-Sample)

# Predict on the same rows the model was fitted to, then cross-tabulate
# the predictions against the known species
pred.train <- predict(modFit, newdata = training)
print(confusionMatrix(data = pred.train, reference = training$Species))
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         33         1
##   virginica       0          2        34
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9714          
##                  95% CI : (0.9188, 0.9941)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9571          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9429           0.9714
## Specificity                 1.0000            0.9857           0.9714
## Pos Pred Value              1.0000            0.9706           0.9444
## Neg Pred Value              1.0000            0.9718           0.9855
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3143           0.3238
## Detection Prevalence        0.3333            0.3238           0.3429
## Balanced Accuracy           1.0000            0.9643           0.9714

2.3.3.2 Testing set accuracy (Out-of-Sample)

# Predict on the held-out testing rows, then cross-tabulate the
# predictions against the known species
pred.test <- predict(modFit, newdata = testing)
print(confusionMatrix(data = pred.test, reference = testing$Species))
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         13         2
##   virginica       0          2        13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 8.467e-16       
##                                           
##                   Kappa : 0.8667          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8667           0.8667
## Specificity                 1.0000            0.9333           0.9333
## Pos Pred Value              1.0000            0.8667           0.8667
## Neg Pred Value              1.0000            0.9333           0.9333
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.2889           0.2889
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9000           0.9000

2.4 Model development using “tree”

2.4.1 Model definition

# Fit a classification tree with the tree package and summarise it.
# NOTE: this reassigns modFit, replacing the caret/rpart fit created earlier.
library(tree)
modFit <- tree(formula = Species ~ ., data = training)
summary(modFit)
## 
## Classification tree:
## tree(formula = Species ~ ., data = training)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Sepal.Length" "Petal.Width" 
## Number of terminal nodes:  5 
## Residual mean deviance:  0.1173 = 11.73 / 100 
## Misclassification error rate: 0.02857 = 3 / 105

2.4.2 Plotting the classification tree

# plot() on a tree object draws only the branches; text() then adds the
# split rules and the predicted class at each leaf so the tree is readable
plot(modFit)
text(modFit, pretty = 0)

2.4.3 Model validation

2.4.3.1 Training set accuracy (In-Sample)

# Predict class labels for the rows the tree was fitted to, then
# cross-tabulate them against the known species
pred.train <- predict(modFit, newdata = training, type = "class")
print(confusionMatrix(data = pred.train, reference = training$Species))
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         33         1
##   virginica       0          2        34
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9714          
##                  95% CI : (0.9188, 0.9941)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9571          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9429           0.9714
## Specificity                 1.0000            0.9857           0.9714
## Pos Pred Value              1.0000            0.9706           0.9444
## Neg Pred Value              1.0000            0.9718           0.9855
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3143           0.3238
## Detection Prevalence        0.3333            0.3238           0.3429
## Balanced Accuracy           1.0000            0.9643           0.9714

2.4.3.2 Testing set accuracy (Out-of-Sample)

# Predict class labels for the held-out testing rows, then
# cross-tabulate them against the known species
pred.test <- predict(modFit, newdata = testing, type = "class")
print(confusionMatrix(data = pred.test, reference = testing$Species))
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         13         2
##   virginica       0          2        13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9111          
##                  95% CI : (0.7878, 0.9752)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 8.467e-16       
##                                           
##                   Kappa : 0.8667          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8667           0.8667
## Specificity                 1.0000            0.9333           0.9333
## Pos Pred Value              1.0000            0.8667           0.8667
## Neg Pred Value              1.0000            0.9333           0.9333
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.2889           0.2889
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9000           0.9000