Libraries

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(repr)
## Warning: package 'repr' was built under R version 3.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(kknn)
## Warning: package 'kknn' was built under R version 3.5.1

Data Cleaning & Preparation

# Load the built-in iris data set and take a first look at it.
# The data frame is deliberately not attach()ed: attaching places a copy of
# the columns on the search path, where they can mask (or be masked by) other
# objects and go stale if `iris` is modified later. Columns are reached
# through `iris` explicitly instead.
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Class balance: number of observations per species. with() evaluates the
# call inside `iris`, so the table keeps its "Species" header without attach().
with(iris, table(Species))
## Species
##     setosa versicolor  virginica 
##         50         50         50

Data Exploration

# Width and height of the plot area used by repr when rendering figures.
options(repr.plot.width = 5, repr.plot.height = 4)

# Sepal length against sepal width, points coloured by species.
ggplot(data = iris, mapping = aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(color = Species))

# Sepal length against petal width, points coloured by species.
ggplot(data = iris, mapping = aes(x = Petal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(color = Species))

We notice in the previous two plots that versicolor and virginica overlap, which lowers the accuracy of classifying the data with the k-nearest neighbors (KNN) algorithm.

Applying the K-Nearest Neighbors (KNN) Algorithm

# Reproducible 70/30 train/test split of iris, followed by a k = 3 kknn fit.
set.seed(2345)
# Draw the training row positions once and derive BOTH subsets from them.
# Recovering the test rows from rownames(train) only works if the sampling
# helper keeps the original row names, which is not guaranteed across dplyr
# versions; if they are renumbered, the "test" set silently becomes
# iris[-(1:105), ] and overlaps the training rows.
train.rows <- sample.int(nrow(iris), size = round(0.7 * nrow(iris)))
train.iris <- iris[train.rows, ]
test.iris <- iris[-train.rows, ]
# Predict Species from all remaining columns using the 3 nearest neighbours;
# predictions are produced for the held-out rows passed as `test`.
kkn.3 <- kknn(Species ~ ., train = train.iris, test = test.iris, k = 3)
summary(kkn.3)
## 
## Call:
## kknn(formula = Species ~ ., train = train.iris, test = test.iris,     k = 3)
## 
## Response: "nominal"
##           fit prob.setosa prob.versicolor prob.virginica
## 1      setosa           1       0.0000000     0.00000000
## 2      setosa           1       0.0000000     0.00000000
## 3      setosa           1       0.0000000     0.00000000
## 4      setosa           1       0.0000000     0.00000000
## 5      setosa           1       0.0000000     0.00000000
## 6      setosa           1       0.0000000     0.00000000
## 7      setosa           1       0.0000000     0.00000000
## 8      setosa           1       0.0000000     0.00000000
## 9      setosa           1       0.0000000     0.00000000
## 10     setosa           1       0.0000000     0.00000000
## 11     setosa           1       0.0000000     0.00000000
## 12     setosa           1       0.0000000     0.00000000
## 13     setosa           1       0.0000000     0.00000000
## 14     setosa           1       0.0000000     0.00000000
## 15     setosa           1       0.0000000     0.00000000
## 16 versicolor           0       1.0000000     0.00000000
## 17 versicolor           0       0.9113379     0.08866211
## 18 versicolor           0       1.0000000     0.00000000
## 19 versicolor           0       1.0000000     0.00000000
## 20 versicolor           0       1.0000000     0.00000000
## 21  virginica           0       0.3849002     0.61509982
## 22 versicolor           0       1.0000000     0.00000000
## 23 versicolor           0       1.0000000     0.00000000
## 24 versicolor           0       1.0000000     0.00000000
## 25 versicolor           0       1.0000000     0.00000000
## 26 versicolor           0       0.6150998     0.38490018
## 27 versicolor           0       1.0000000     0.00000000
## 28  virginica           0       0.0000000     1.00000000
## 29 versicolor           0       1.0000000     0.00000000
## 30 versicolor           0       1.0000000     0.00000000
## 31 versicolor           0       1.0000000     0.00000000
## 32 versicolor           0       1.0000000     0.00000000
## 33 versicolor           0       1.0000000     0.00000000
## 34 versicolor           0       1.0000000     0.00000000
## 35 versicolor           0       1.0000000     0.00000000
## 36  virginica           0       0.0000000     1.00000000
## 37  virginica           0       0.0000000     1.00000000
## 38  virginica           0       0.0000000     1.00000000
## 39  virginica           0       0.0000000     1.00000000
## 40  virginica           0       0.0000000     1.00000000
## 41  virginica           0       0.0000000     1.00000000
## 42  virginica           0       0.0000000     1.00000000
## 43  virginica           0       0.0000000     1.00000000
## 44  virginica           0       0.0000000     1.00000000
## 45  virginica           0       0.2962381     0.70376193

Testing Results

# Record the model's predicted species next to each held-out observation.
test.iris[["Predicted"]] <- predict(kkn.3)
# Flag the rows where the true species and the prediction agree.
test.iris[["Correct"]] <- test.iris[["Species"]] == test.iris[["Predicted"]]
# Percentage of held-out rows that were classified correctly.
with(test.iris, 100 * sum(Correct) / length(Correct))
## [1] 95.55556

And here are the results shown in plots:

# Held-out rows, sepal width vs. sepal length: colour shows the predicted
# species, point shape shows whether the prediction was correct.
ggplot(data = test.iris, mapping = aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(color = Predicted, shape = Correct))

# Same encoding, with petal width on the x axis.
ggplot(data = test.iris, mapping = aes(x = Petal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(color = Predicted, shape = Correct))