libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(repr)
## Warning: package 'repr' was built under R version 3.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kknn)
## Warning: package 'kknn' was built under R version 3.5.1
Data Cleaning & Preparation
# Load the built-in iris data set into the workspace.
data(iris)
# NOTE: iris is deliberately not attach()ed. Columns are always reached
# through the data frame itself, so the search path stays clean and a later
# modified copy of the data cannot be shadowed by stale attached columns.
# Peek at the first rows.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Inspect column types: four numeric measurements and one factor (Species).
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Class balance: number of observations per species. with() evaluates the
# call inside iris, so the table keeps its "Species" header without attach().
with(iris, table(Species))
## Species
## setosa versicolor virginica
## 50 50 50
Data Exploration
# Set the width and height of the plot area (repr plotting options).
options(repr.plot.width = 5, repr.plot.height = 4)
# Sepal length against sepal width, one colour per species.
ggplot(data = iris, mapping = aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(colour = Species))
# Sepal length against petal width, one colour per species.
ggplot(data = iris, mapping = aes(x = Petal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(colour = Species))
We notice in the previous two plots that versicolor and virginica overlap, which makes these two species harder to tell apart and lowers the accuracy of classifying the data with the k-nearest neighbors (kNN) algorithm.
Applying the k-Nearest Neighbors (kNN) Algorithm
# Fix the RNG seed so the random split below is reproducible.
set.seed(2345)
# Draw the positions of the training rows (70% of the data) once and derive
# both subsets from them. Indexing by position guarantees the two sets are
# disjoint and together cover iris. Recovering the positions afterwards from
# rownames() of the sample only works if the sampling function keeps the
# original row names, which is not guaranteed.
train.rows <- sample.int(nrow(iris), size = round(0.7 * nrow(iris)))
train.iris <- iris[train.rows, ]
test.iris <- iris[-train.rows, ]
# Fit a k-nearest-neighbours classifier (k = 3) for Species on all other
# columns and classify the held-out test rows.
kkn.3 <- kknn(Species ~ ., train = train.iris, test = test.iris, k = 3)
# Show the fitted class and the class probabilities for every test row.
summary(kkn.3)
##
## Call:
## kknn(formula = Species ~ ., train = train.iris, test = test.iris, k = 3)
##
## Response: "nominal"
## fit prob.setosa prob.versicolor prob.virginica
## 1 setosa 1 0.0000000 0.00000000
## 2 setosa 1 0.0000000 0.00000000
## 3 setosa 1 0.0000000 0.00000000
## 4 setosa 1 0.0000000 0.00000000
## 5 setosa 1 0.0000000 0.00000000
## 6 setosa 1 0.0000000 0.00000000
## 7 setosa 1 0.0000000 0.00000000
## 8 setosa 1 0.0000000 0.00000000
## 9 setosa 1 0.0000000 0.00000000
## 10 setosa 1 0.0000000 0.00000000
## 11 setosa 1 0.0000000 0.00000000
## 12 setosa 1 0.0000000 0.00000000
## 13 setosa 1 0.0000000 0.00000000
## 14 setosa 1 0.0000000 0.00000000
## 15 setosa 1 0.0000000 0.00000000
## 16 versicolor 0 1.0000000 0.00000000
## 17 versicolor 0 0.9113379 0.08866211
## 18 versicolor 0 1.0000000 0.00000000
## 19 versicolor 0 1.0000000 0.00000000
## 20 versicolor 0 1.0000000 0.00000000
## 21 virginica 0 0.3849002 0.61509982
## 22 versicolor 0 1.0000000 0.00000000
## 23 versicolor 0 1.0000000 0.00000000
## 24 versicolor 0 1.0000000 0.00000000
## 25 versicolor 0 1.0000000 0.00000000
## 26 versicolor 0 0.6150998 0.38490018
## 27 versicolor 0 1.0000000 0.00000000
## 28 virginica 0 0.0000000 1.00000000
## 29 versicolor 0 1.0000000 0.00000000
## 30 versicolor 0 1.0000000 0.00000000
## 31 versicolor 0 1.0000000 0.00000000
## 32 versicolor 0 1.0000000 0.00000000
## 33 versicolor 0 1.0000000 0.00000000
## 34 versicolor 0 1.0000000 0.00000000
## 35 versicolor 0 1.0000000 0.00000000
## 36 virginica 0 0.0000000 1.00000000
## 37 virginica 0 0.0000000 1.00000000
## 38 virginica 0 0.0000000 1.00000000
## 39 virginica 0 0.0000000 1.00000000
## 40 virginica 0 0.0000000 1.00000000
## 41 virginica 0 0.0000000 1.00000000
## 42 virginica 0 0.0000000 1.00000000
## 43 virginica 0 0.0000000 1.00000000
## 44 virginica 0 0.0000000 1.00000000
## 45 virginica 0 0.2962381 0.70376193
Testing Results
# Store the predicted class for every test row next to the true species.
test.iris[["Predicted"]] <- predict(kkn.3)
# Flag each test row whose prediction equals its true species.
test.iris[["Correct"]] <- test.iris[["Species"]] == test.iris[["Predicted"]]
# Accuracy on the test set, as a percentage of correctly classified rows.
n.correct <- sum(test.iris[["Correct"]])
100 * n.correct / nrow(test.iris)
## [1] 95.55556
And here are the new results in plots:
# Test rows by sepal width and length: colour shows the predicted species,
# point shape shows whether the prediction was correct.
ggplot(data = test.iris, mapping = aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(colour = Predicted, shape = Correct))
# The same view using petal width on the x axis.
ggplot(data = test.iris, mapping = aes(x = Petal.Width, y = Sepal.Length)) +
  geom_point(mapping = aes(colour = Predicted, shape = Correct))