In this project we will use the K-nearest neighbors (KNN) algorithm to predict flower species in the Iris dataset.

# iris ships with base R (the datasets package), so no extra library is needed
data(iris)

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
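
Since we will later make a stratified split on the species, it's worth confirming up front that the classes are balanced; each species appears exactly 50 times:

table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50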

Standardize Data

Because KNN classifies observations by distance, features measured on larger scales would dominate the distance calculation. To put every feature on an equal footing, we standardize each one to mean 0 and standard deviation 1.

stand.features <- scale(iris[1:4])

var(stand.features[,1])
## [1] 1
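
As an additional sanity check, every standardized column should now have (numerically) zero mean; a quick way to confirm:

# round away floating-point noise; expect all zeros
round(colMeans(stand.features), 10)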

Now we will join the standardized features back with the target column.

final.data <- cbind(stand.features, iris[5])

head(final.data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
## 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
## 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
## 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
## 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa

Train Test Split

set.seed(101)  # make the split reproducible

library(caTools)

sample <- sample.split(final.data$Species, SplitRatio = 0.70)
train <- subset(final.data, sample == TRUE)
test <- subset(final.data, sample == FALSE)
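
sample.split() stratifies on the Species column, so both subsets keep the class balance. With 150 rows and a 0.70 ratio we expect 105 training rows and 45 test rows:

dim(train)
## [1] 105   5
dim(test)
## [1] 45  5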

Building the Model

library(class)

# knn(training features, test features, training labels, number of neighbors)
predicted.species <- knn(train[1:4], test[1:4], train$Species, k=1)

predicted.species
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor virginica  versicolor versicolor versicolor
## [25] versicolor versicolor virginica  versicolor versicolor versicolor
## [31] virginica  virginica  virginica  virginica  virginica  virginica 
## [37] virginica  virginica  virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica

Let’s see what our misclassification rate was.

mean(test$Species != predicted.species)
## [1] 0.04444444
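
A single error rate doesn't show which species get confused. A confusion matrix of the true labels against the predictions gives the full breakdown; with an error rate of 0.044 on 45 test rows, expect exactly two off-diagonal counts:

table(test$Species, predicted.species)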

Choosing a K Value

# Preallocate a vector to hold the test error for each k
error.rate <- numeric(10)

for (i in 1:10) {
  set.seed(101)  # knn() breaks ties at random, so reset the seed each pass
  predicted.species <- knn(train[1:4], test[1:4], train$Species, k=i)
  error.rate[i] <- mean(test$Species != predicted.species)
}

We can now plot the error rate for each k:

library(ggplot2)
k.values <- 1:10
error.df <- data.frame(error.rate, k.values)
p1 <- ggplot(error.df, aes(x=k.values, y=error.rate)) + geom_point()
p1 + geom_line(linetype='dotted', color='red')
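
Rather than eyeballing the plot, we can also pull the best-performing k directly (which.min returns the first minimum, i.e. the smallest such k):

which.min(error.rate)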

Boom. The test error bottoms out for k values between roughly 2 and 6, so any k in that range is a reasonable choice.