I am using the iris dataset in R to explain the k-nearest neighbours (KNN) technique.
# Class balance: number of observations recorded for each species.
table(iris[["Species"]])
##
## setosa versicolor virginica
## 50 50 50
# The same distribution expressed as percentages, rounded to one decimal.
round(100 * prop.table(table(iris[["Species"]])), digits = 1)
##
## setosa versicolor virginica
## 33.3 33.3 33.3
library(ggvis)
## Warning: package 'ggvis' was built under R version 3.4.4
library(class) # required for knn
# Scatter plot of sepal width against sepal length, coloured by species.
iris %>%
  ggvis(x = ~Sepal.Length, y = ~Sepal.Width, fill = ~Species) %>%
  layer_points()
# Scatter plot of petal width against petal length, coloured by species.
iris %>%
  ggvis(x = ~Petal.Length, y = ~Petal.Width, fill = ~Species) %>%
  layer_points()
# Fix the RNG seed so the random assignment below is reproducible.
set.seed(100)
# Tag every row with 1 (probability 0.7) or 2 (probability 0.3). Rows are
# drawn independently, so the resulting split is only approximately 70-30.
ind <- sample(
  x = 2,
  size = nrow(iris),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
#' Min-max normalize a numeric vector to the range [0, 1]
#'
#' Each value is mapped to `(x - min(x)) / (max(x) - min(x))`.
#'
#' @param x A numeric vector.
#' @param na.rm Should `NA` values be ignored when computing the minimum and
#'   maximum? Defaults to `FALSE`, in which case any `NA` in `x` makes the
#'   whole result `NA` (the behaviour of the plain formula).
#' @return A vector the same length as `x`. A vector whose values are all
#'   equal is mapped to zeros instead of `NaN`; an empty vector yields
#'   `numeric(0)`.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Zero range: the formula would divide 0 by 0 and return NaN everywhere.
    # x * 0 yields zeros while keeping NA positions and names intact.
    return(x * 0)
  }
  (x - rng[1]) / span
}
# Apply normalize() to each of the four measurement columns (columns 1-4)
# and reassemble the results into a data frame; column 5 is not included.
iris.new <- as.data.frame(lapply(iris[, 1:4], normalize))
# Preview the first rows of the rescaled data.
head(iris.new)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 0.22222222 0.6250000 0.06779661 0.04166667
## 2 0.16666667 0.4166667 0.06779661 0.04166667
## 3 0.11111111 0.5000000 0.05084746 0.04166667
## 4 0.08333333 0.4583333 0.08474576 0.04166667
## 5 0.19444444 0.6666667 0.06779661 0.04166667
## 6 0.30555556 0.7916667 0.11864407 0.12500000
# Partition the rescaled features with the assignment vector `ind`:
# rows tagged 1 become the training set, rows tagged 2 the test set.
iris.train <- iris.new[ind == 1, ]
iris.test <- iris.new[ind == 2, ]
# Matching class labels, taken from column 5 of the original data.
iris.train.target <- iris[[5]][ind == 1]
iris.test.target <- iris[[5]][ind == 2]
# Per-column summary of the rescaled features.
summary(iris.new)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
# Rows and columns of the training set.
dim(iris.train)
## [1] 107 4
library(class)
# Classify every test row from its 3 nearest training neighbours.
model1 <- knn(
  train = iris.train,
  test = iris.test,
  cl = iris.train.target,
  k = 3
)
# Confusion matrix: rows are the actual species, columns the predictions.
table(iris.test.target, model1)
## model1
## iris.test.target setosa versicolor virginica
## setosa 16 0 0
## versicolor 0 10 2
## virginica 0 2 13
# Share of test rows whose prediction matches the actual species.
mean(model1 == iris.test.target)
## [1] 0.9069767
# Test-set accuracy of knn for every k from 1 to 50.
# vapply() builds the result vector in one go instead of growing it with c()
# on each iteration, and the per-k predictions stay local to the helper, so no
# global variable named `predict` is left behind to shadow stats::predict().
k_values <- 1:50
iris_acc <- vapply(
  k_values,
  function(k) {
    predicted <- knn(
      train = iris.train,
      test = iris.test,
      cl = iris.train.target,
      k = k
    )
    mean(predicted == iris.test.target)
  },
  numeric(1)
)
# Plot the error rate (1 - accuracy) for k = 1 through 50
plot(
  k_values,
  1 - iris_acc,
  type = "l",
  ylab = "Error Rate",
  xlab = "K",
  main = "Error Rate for Iris With Varying K"
)
# Repeat the classification using the 8 nearest training neighbours.
model1 <- knn(
  train = iris.train,
  test = iris.test,
  cl = iris.train.target,
  k = 8
)
# Confusion matrix: rows are the actual species, columns the predictions.
table(iris.test.target, model1)
## model1
## iris.test.target setosa versicolor virginica
## setosa 16 0 0
## versicolor 0 11 1
## virginica 0 1 14
# Share of test rows whose prediction matches the actual species.
mean(model1 == iris.test.target)
## [1] 0.9534884