KNN tutorial

I am using the iris dataset in R to explain the k-nearest neighbours (KNN) technique.

Understanding the dataset

# Count how many observations belong to each species.
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
# Same counts expressed as percentages of all rows, rounded to one decimal.
round(prop.table(table(iris$Species)) * 100, digits = 1)
## 
##     setosa versicolor  virginica 
##       33.3       33.3       33.3

Visualize the dataset

library(ggvis)
## Warning: package 'ggvis' was built under R version 3.4.4
library(class)   # required for knn
# Scatter plot of the sepal measurements, points coloured by species.
iris %>%
  ggvis(~Sepal.Length, ~Sepal.Width, fill = ~Species) %>%
  layer_points()

# Scatter plot of the petal measurements, points coloured by species.
iris %>%
  ggvis(~Petal.Length, ~Petal.Width, fill = ~Species) %>%
  layer_points()

Preprocess the dataset

# Fix the RNG state so the random split below is reproducible.
set.seed(100)

# Label every row 1 (probability 0.7) or 2 (probability 0.3). Rows are drawn
# independently, so the resulting split is only approximately 70/30.
ind <- sample.int(
  2,
  size = nrow(iris),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

# Min-max normalization: linearly rescale a numeric vector to the range [0, 1].
#
# x: numeric vector.
# Returns a numeric vector of the same length, where min(x) maps to 0 and
# max(x) maps to 1. If every element of x is the same value the range is zero;
# a vector of zeros is returned in that case instead of the NaN values that
# dividing by zero would produce. Missing values are not removed, so any NA in
# x makes the whole result NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # isTRUE() keeps the NA case (span is NA) on the ordinary path below.
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}

# Rescale the four measurement columns (columns 1 to 4 of iris) with
# normalize() and collect them in a new data frame; Species is left out.
# NOTE(review): the scaling is computed over all rows, before the train/test
# split, so the test rows influence the min/max applied to the training rows.
iris.new <- as.data.frame(lapply(iris[, 1:4], normalize))
head(iris.new)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1   0.22222222   0.6250000   0.06779661  0.04166667
## 2   0.16666667   0.4166667   0.06779661  0.04166667
## 3   0.11111111   0.5000000   0.05084746  0.04166667
## 4   0.08333333   0.4583333   0.08474576  0.04166667
## 5   0.19444444   0.6666667   0.06779661  0.04166667
## 6   0.30555556   0.7916667   0.11864407  0.12500000
# Training set: rows labelled 1 in ind, with their species labels.
iris.train <- iris.new[ind == 1, ]
iris.train.target <- iris[ind == 1, "Species"]

# Test set: rows labelled 2 in ind, with their species labels.
iris.test <- iris.new[ind == 2, ]
iris.test.target <- iris[ind == 2, "Species"]

# Summary statistics of the normalized features.
summary(iris.new)
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333  
##  Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000  
##  Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806  
##  3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000
# Number of rows and columns in the training set.
dim(iris.train)
## [1] 107   4

Running the KNN Model for classification

library(class)
# Predict the species of every test row with class::knn, using k = 3 and the
# training rows / training labels as the reference set.
model1<- knn(train=iris.train, test=iris.test, cl=iris.train.target, k=3)

# Cross-tabulate true species (rows) against predicted species (columns).
table(iris.test.target, model1)
##                 model1
## iris.test.target setosa versicolor virginica
##       setosa         16          0         0
##       versicolor      0         10         2
##       virginica       0          2        13
# Accuracy: fraction of test rows whose predicted species equals the true one.
mean(iris.test.target== model1)
## [1] 0.9069767

Determining the right K

# Test-set accuracy of knn for every k from 1 to 50.
# vapply() builds the result in one pass instead of growing a vector with c()
# inside a loop, and the per-k predictions stay local to the helper function,
# so no global named `predict` is created that would mask stats::predict().
iris_acc <- vapply(
  seq_len(50),
  function(k) {
    predicted <- knn(iris.train, iris.test, iris.train.target, k = k)
    mean(predicted == iris.test.target)
  },
  numeric(1)
)

# Plot the error rate (1 - accuracy) for k = 1 through 50
plot(
  1 - iris_acc,
  type = "l",
  ylab = "Error Rate",
  xlab = "K",
  main = "Error Rate for Iris With Varying K"
)

# Re-run knn with k = 8; this overwrites the k = 3 predictions held in model1.
# NOTE(review): k = 8 appears to be picked from the error curve above, which
# was computed on this same test set, so the accuracy below is likely
# optimistic - confirm with a separate validation set or cross-validation.
model1<- knn(train=iris.train, test=iris.test, cl=iris.train.target, k=8)
# Cross-tabulate true species (rows) against predicted species (columns).
table(iris.test.target, model1)
##                 model1
## iris.test.target setosa versicolor virginica
##       setosa         16          0         0
##       versicolor      0         11         1
##       virginica       0          1        14
# Accuracy of the k = 8 predictions on the test set.
mean(iris.test.target== model1)
## [1] 0.9534884