Goal: classify the species of an iris plant from its measured characteristics.
Libraries for data cleansing, tidying and transformation, plus libraries for plotting.
# Load required libraries
library(dplyr); library(tidyr); library(ggplot2)
Specialised libraries for machine learning and for plotting decision trees.
# Load required libraries
library(caret); library(randomForest)
library(rattle); library(rpart.plot)
# Fix the random seed so that the results are reproducible between runs
set.seed(1000)
# Load the built-in iris data and take a working copy for modelling
data("iris")
dataset <- iris
# Partition the rows into a training set and a testing set:
# 70% of the rows go to training, the remaining 30% to testing
inTrain <- createDataPartition(
  y = dataset$Species,
  p = 0.7,
  list = FALSE
)
# inTrain holds the selected row indices; negative indexing takes the rest
training <- dataset[inTrain, ]
testing <- dataset[-inTrain, ]
# Report rows x columns of each partition
dim(training)
dim(testing)
## [1] 105 5
## [1] 45 5
# Fit an rpart decision tree through caret, predicting Species from all
# other columns of the training set
modFit <- train(Species ~ ., data = training, method = "rpart")
# Draw the final tree. fancyRpartPlot() comes from rattle, which (together
# with rpart.plot) is already attached by the library() calls at the top of
# the script, so the packages are not re-loaded here
fancyRpartPlot(modFit$finalModel)
# Resubstitution check: predict the training rows and tabulate the
# predictions against the true labels. This is an optimistic estimate
# because the model has already seen these rows.
pred.train <- predict(modFit, newdata = training)
print(confusionMatrix(data = pred.train, reference = training$Species))
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 33 1
## virginica 0 2 34
##
## Overall Statistics
##
## Accuracy : 0.9714
## 95% CI : (0.9188, 0.9941)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9571
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9429 0.9714
## Specificity 1.0000 0.9857 0.9714
## Pos Pred Value 1.0000 0.9706 0.9444
## Neg Pred Value 1.0000 0.9718 0.9855
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3143 0.3238
## Detection Prevalence 0.3333 0.3238 0.3429
## Balanced Accuracy 1.0000 0.9643 0.9714
# Hold-out evaluation: predict the testing rows with the fitted model and
# tabulate the predictions against the true species
pred.test <- predict(modFit, newdata = testing)
print(
  confusionMatrix(data = pred.test, reference = testing$Species)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 2
## virginica 0 2 13
##
## Overall Statistics
##
## Accuracy : 0.9111
## 95% CI : (0.7878, 0.9752)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.467e-16
##
## Kappa : 0.8667
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8667 0.8667
## Specificity 1.0000 0.9333 0.9333
## Pos Pred Value 1.0000 0.8667 0.8667
## Neg Pred Value 1.0000 0.9333 0.9333
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.2889 0.2889
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9000 0.9000
# Second approach: grow a classification tree directly with the tree package.
# NOTE: this reassigns modFit, replacing whatever model it held before
library(tree)
modFit <- tree(formula = Species ~ ., data = training)
# Show the variables used, the number of terminal nodes and the
# misclassification rate on the training data
summary(modFit)
##
## Classification tree:
## tree(formula = Species ~ ., data = training)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Sepal.Length" "Petal.Width"
## Number of terminal nodes: 5
## Residual mean deviance: 0.1173 = 11.73 / 100
## Misclassification error rate: 0.02857 = 3 / 105
# Draw the tree. plot() on a tree object draws only the branches, so text()
# is called afterwards to label the splits and the terminal nodes
plot(modFit)
text(modFit)
# Resubstitution check: type = "class" makes predict() return a predicted
# class label per row, which is then tabulated against the true labels
pred.train <- predict(modFit, newdata = training, type = "class")
print(confusionMatrix(data = pred.train, reference = training$Species))
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 33 1
## virginica 0 2 34
##
## Overall Statistics
##
## Accuracy : 0.9714
## 95% CI : (0.9188, 0.9941)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9571
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9429 0.9714
## Specificity 1.0000 0.9857 0.9714
## Pos Pred Value 1.0000 0.9706 0.9444
## Neg Pred Value 1.0000 0.9718 0.9855
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3143 0.3238
## Detection Prevalence 0.3333 0.3238 0.3429
## Balanced Accuracy 1.0000 0.9643 0.9714
# Hold-out evaluation: predicted class labels for the testing rows,
# tabulated against the true species
pred.test <- predict(modFit, newdata = testing, type = "class")
print(
  confusionMatrix(data = pred.test, reference = testing$Species)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 2
## virginica 0 2 13
##
## Overall Statistics
##
## Accuracy : 0.9111
## 95% CI : (0.7878, 0.9752)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.467e-16
##
## Kappa : 0.8667
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8667 0.8667
## Specificity 1.0000 0.9333 0.9333
## Pos Pred Value 1.0000 0.8667 0.8667
## Neg Pred Value 1.0000 0.9333 0.9333
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.2889 0.2889
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9000 0.9000