##Loading libraries
library(class)
library(ggplot2)
library(gmodels)
library(GGally)

1. Introduction

The Iris dataset contains the data for 50 flowers from each of the 3 species - Setosa, Versicolor and Virginica. The data gives the measurements in centimeters of the variables sepal length and width and petal length and width for each of the flowers.

The goal of this study is to perform exploratory analysis on the data, build KNN models with different values of k, and compare their out-of-sample prediction accuracy.

2. Summary statistics

The dataset has 150 observations distributed equally among the three species - Setosa, Versicolor and Virginica. The table below shows the summary statistics of all 4 variables.

Frequencies of Species variable
Variable Species Percentage of Total
Freq. of setosa 50 33.33%
Freq. of versicolor 50 33.33%
Freq. of virginica 50 33.33%
# Show the structure of iris: dimensions, column types and the leading values.
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles and mean for the numeric columns, and the
# number of observations per level for Species.
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Sample variance of each of the four measurements (column 5, Species, is
# dropped). vapply() pins the result to one numeric value per column, unlike
# sapply(), whose return type depends on its input.
vapply(iris[, -5], var, FUN.VALUE = numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.6856935    0.1899794    3.1162779    0.5810063
  1. Histogram and density plots

The histogram and density plots show clear demarcations between the three species of flowers with respect to all 4 variables: sepal length, sepal width, petal length and petal width. Also, sepal width appears approximately normally distributed, unlike the other three variables (sepal length, petal length and petal width).

# Histogram with overlaid kernel density curves for each of the four iris
# measurements, arranged in a 2 x 2 grid.
#
# The original attach(iris) call is dropped: every column here is reached
# through an explicit data reference, so attaching only polluted the search
# path.

# Draw one panel: density-scaled histogram of `variable`, the overall density
# (black) and one density curve per species.
#   data     - data frame with the measurement column and a Species column
#   variable - name of the measurement column, e.g. "Sepal.Length"
#   y_max    - upper limit of the y axis (lower limit is 0)
plot_hist_with_densities <- function(data, variable, y_max) {
  species_colours <- c(setosa = "blue", versicolor = "green", virginica = "red")
  values <- data[[variable]]

  hist(
    values,
    freq = FALSE,
    main = paste("Histogram of Iris", sub(".", " ", variable, fixed = TRUE)),
    # Same axis label that hist(iris$<variable>) would derive by default.
    xlab = paste0("iris$", variable),
    ylim = c(0, y_max)
  )
  lines(density(values), col = "black")
  for (species in names(species_colours)) {
    lines(
      density(values[which(data$Species == species)]),
      col = species_colours[[species]]
    )
  }
  legend(
    "topright",
    c(names(species_colours), "Overall"),
    lty = c(1, 1, 1, 1),
    col = c(unname(species_colours), "black"),
    bty = "n"
  )
  invisible(NULL)
}

# Upper y-axis limit for each panel, in plotting order.
density_y_max <- c(
  Sepal.Length = 2,
  Sepal.Width = 2,
  Petal.Length = 4,
  Petal.Width = 8
)

# par() returns the previous settings, so the layout can be restored afterwards
# and later base plots are not forced into the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
for (variable in names(density_y_max)) {
  plot_hist_with_densities(iris, variable, density_y_max[[variable]])
}
par(old_par)

  1. Scatter Plots

From the scatter plots below, we can see there is a positive correlation between petal width and petal length, and between sepal width and sepal length. We can also see that Setosa has the lowest petal widths and petal lengths, followed by Versicolor and then Virginica. With respect to sepal length and width, Setosa has slightly higher lengths and widths as compared to Versicolor and Virginica.
# Sepal length against sepal width, coloured by species, one panel per species.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, colour = Species)
) +
  geom_point() +
  facet_wrap(facets = ~ Species)

# Petal length against petal width, coloured by species, one panel per species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point() +
  facet_wrap(facets = ~ Species)

  1. K nearest neighbor model building

We standardized the data and divided it into 40% testing and 60% training data. We then built KNN models on the training data with varying K starting from 1 to 20 and checked the out-of-sample prediction error on the testing data.

The graph below shows the out-of-sample prediction error plotted against K.

# KNN: developing the model
# Standardise the four numeric measurements (columns 1-4) so that each has
# mean 0 and standard deviation 1; Species (column 5) is left untouched.
# NOTE(review): scaling is done on all 150 rows before the split, so the test
# rows contribute to the centring/scaling statistics. Kept as-is so the
# reported results do not change.
scale_iris <- iris
scale_iris[, 1:4] <- scale(iris[, 1:4])

# Splitting training and test dataset: 60% of the rows for training, the
# remainder for testing. The seed makes the split reproducible.
set.seed(12420328)
index <- sample(seq_len(nrow(scale_iris)), nrow(scale_iris) * 0.6)
iris_train <- scale_iris[index, ]
iris_test <- scale_iris[-index, ]

# KNN model: out-of-sample misclassification rate for k = 1, ..., max_k.
# The result vector is preallocated instead of being grown inside the loop.
# A plain loop is kept so that knn() is called in the same order as before
# and knn.fit still holds the last fit afterwards.
max_k <- 20L
error <- numeric(max_k)
for (k in seq_len(max_k)) {
  knn.fit <- knn(
    train = iris_train[, 1:4],
    test = iris_test[, 1:4],
    cl = iris_train$Species,
    k = k
  )
  error[k] <- 1 - mean(knn.fit == iris_test$Species)
}

# Plot the out-of-sample error rate against K. K is stored in the plotted
# data frame and derived from the length of `error`, rather than hard-coding
# 1:20 inside aes().
error_by_k <- data.frame(k = seq_along(error), error = error)
ggplot(data = error_by_k, aes(x = k, y = error)) +
  geom_line(color = "Blue") +
  ggtitle("Error Rate vs K") +
  labs(y = "Error", x = "K")

# Position of the smallest error. which.min() returns only the first minimum,
# so any larger K with the same error is not reported here.
which.min(error)
## [1] 14
#

From the plot above, we can see that the minimum out-of-sample prediction error is observed with K = 14 and K = 15. Since we prefer a less complex model when the prediction errors are similar, and a larger K gives a smoother, less flexible decision boundary, we chose K = 15 for the final model. The final model with K = 15 has an out-of-sample error of 1.7% and thus a prediction accuracy of 98.3%.