##Loading libraries
library(class)
library(ggplot2)
library(gmodels)
library(GGally)
The Iris dataset contains the data for 50 flowers from each of the 3 species - Setosa, Versicolor and Virginica. The data gives the measurements in centimeters of the variables sepal length and width and petal length and width for each of the flowers.
The goal of the study is to perform exploratory analysis on the data, build KNN models with different values of k, and compare their out-of-sample prediction accuracy.
The dataset has 150 observations distributed equally among the three species - Setosa, Versicolor and Virginica. The table below shows the number of observations for each species; the summary statistics of all 4 variables follow it.
| Variable | Count | Percentage of Total |
|---|---|---|
| Freq. of setosa | 50 | 33.33% |
| Freq. of versicolor | 50 | 33.33% |
| Freq. of virginica | 50 | 33.33% |
# Show the structure of the iris data frame: dimensions, column types and a
# preview of the first values in each column.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles and mean for the numeric measurements, and
# level counts for the Species factor.
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Variance of each numeric measurement (column 5, Species, is dropped).
# vapply() fixes the result to exactly one numeric value per column, whereas
# sapply() chooses its return type from whatever the input happens to be.
vapply(iris[, -5], var, numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.6856935 0.1899794 3.1162779 0.5810063
The histogram and density plots show clear demarcations between the three species of flowers with respect to all 4 variables: sepal length, sepal width, petal length and petal width. Also, sepal width looks approximately normally distributed, unlike the other three variables (sepal length, petal length and petal width).
# Histograms with overlaid kernel density curves ----
# One panel per measurement. Each panel shows the histogram of the whole data
# set, a black density curve for all flowers, and one coloured density curve
# per species.
#
# Every column is read through `iris` explicitly, so attach(iris) is not used:
# it would only put masked copies of the columns on the search path.

# Column to plot, words used in the panel title, and upper y-axis limit.
hist_specs <- list(
  list(column = "Sepal.Length", label = "Sepal Length", y_max = 2),
  list(column = "Sepal.Width", label = "Sepal Width", y_max = 2),
  list(column = "Petal.Length", label = "Petal Length", y_max = 4),
  list(column = "Petal.Width", label = "Petal Width", y_max = 8)
)

# Line colour for each species' density curve.
species_cols <- c(setosa = "blue", versicolor = "green", virginica = "red")

# par() returns the previous settings, so the 2 x 2 layout can be undone once
# all four panels have been drawn.
old_par <- par(mfrow = c(2, 2))

for (spec in hist_specs) {
  values <- iris[[spec$column]]

  hist(
    values,
    freq = FALSE,
    main = paste("Histogram of Iris", spec$label),
    # Same axis label that hist(iris$<column>) produces by default.
    xlab = paste0("iris$", spec$column),
    ylim = c(0, spec$y_max)
  )

  # Density over all 150 flowers, then one curve per species.
  lines(density(values), col = "black")
  for (sp in names(species_cols)) {
    lines(density(values[iris$Species == sp]), col = species_cols[[sp]])
  }

  legend(
    "topright",
    legend = c(names(species_cols), "Overall"),
    lty = c(1, 1, 1, 1),
    col = c(unname(species_cols), "black"),
    bty = "n"
  )
}

# Restore the layout that was in effect before this section.
par(old_par)
# Sepal length against sepal width, coloured by species, one panel per species.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, colour = Species)
) +
  geom_point() +
  facet_wrap(facets = ~ Species)

# Petal length against petal width, same layout.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point() +
  facet_wrap(facets = ~ Species)
We standardized the data and divided it into 40% testing and 60% training data. We then built KNN models on the training data with varying K starting from 1 to 20 and checked the out-of-sample prediction error on the testing data.
The graph below shows the out-of-sample prediction error plotted against K.
# KNN: developing the model ----
# Standardise the four numeric measurements (columns 1-4) so they are on a
# common scale before distances are computed; Species (column 5) is untouched.
scale_iris <- iris
scale_iris[, 1:4] <- scale(iris[, 1:4])

# Splitting training and test dataset ----
# Fixed seed so the 60% / 40% split is reproducible.
set.seed(12420328)
index <- sample(seq_len(nrow(scale_iris)), nrow(scale_iris) * 0.6)
iris_train <- scale_iris[index, ]
iris_test <- scale_iris[-index, ]

# KNN model ----
# Fit one model per k and record its error on the held-out test rows.
k_values <- 1:20
error <- numeric(length(k_values))  # preallocated rather than grown per step
for (i in k_values) {
  knn.fit <- knn(
    train = iris_train[, 1:4],
    test = iris_test[, 1:4],
    cl = as.factor(iris_train$Species),
    k = i
  )
  # Misclassification rate = 1 - share of correct test predictions.
  error[i] <- 1 - mean(knn.fit == iris_test$Species)
}

# k is a column of the plotted data frame, so the x aesthetic is resolved from
# the data instead of from a vector defined outside it.
ggplot(
  data = data.frame(k = k_values, error = error),
  aes(x = k, y = error)
) +
  geom_line(color = "Blue") +
  ggtitle("Error Rate vs K") +
  labs(y = "Error", x = "K")

# Index of the first minimum of `error`; this equals k because k runs from 1.
which.min(error)
## [1] 14
#
From the plot above, we can see that the minimum out-of-sample prediction error is observed at K = 14 and K = 15. When prediction errors are similar we prefer the less complex model, and in KNN a larger K averages over more neighbours and so gives a smoother, less complex decision boundary; we therefore chose K = 15 for the final model. The final model with K = 15 has an out-of-sample error of 1.7% and thus a prediction accuracy of 98.3%.