# This project uses the famous iris data set: a small data set of flower measurements that can be used to predict the species of an iris flower.
# Load the ISLR library (note: the iris data set itself ships with base R's datasets package). Then, I will check the head of the iris data frame.
library(ISLR)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
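# As an additional quick look (my own addition, not required for the analysis),
# summary() shows the per-column distributions of the iris data:
summary(iris)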
# Standardize the Data
# Let's go ahead and standardize this data set, even though it's not strictly necessary here: the iris features are already on similar scales.
# I will use scale() to standardize the feature columns of the iris data set and assign the standardized data to a new variable (standardized.feature).
standardized.feature<-scale(iris[1:4])
# Verify that the scaling worked by checking the variance of the new columns.
var(standardized.feature[,1])
## [1] 1
var(standardized.feature[,2])
## [1] 1
The variances of the first and second columns are both exactly 1, confirming that the scaling worked.
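# As a complementary check (a quick sketch of my own), scale() also centers each
# column, so the column means should all be approximately zero:
round(colMeans(standardized.feature), 10)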
# Let's go ahead and join the standardized data with the response/target/label column (the column with the species names).
final.data<-cbind(standardized.feature,iris[5])
head(final.data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
## 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
## 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
## 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
## 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa
**Train and Test Splits: I will use the caTools library to split the standardized data into train and test sets, using a 70/30 split.**
library(caTools)
set.seed(101)
sample<-sample.split(final.data$Species, SplitRatio = 0.70)
train<-subset(final.data,sample==TRUE)
test<-subset(final.data, sample==FALSE)
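# A quick sanity check (my own addition): sample.split stratifies on the Species
# vector, so the class proportions in the train and test sets should roughly match.
prop.table(table(train$Species))
prop.table(table(test$Species))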
# I will go ahead and build a KNN model,
# then use the knn function from the class library to predict the Species of the test set, starting with k=1.
library(class)
predicted.species<-knn(train[1:4],test[1:4],train$Species, k=1)
predicted.species
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor virginica versicolor versicolor versicolor
## [25] versicolor versicolor virginica versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
# Check the misclassification rate.
mean(test$Species!=predicted.species)
## [1] 0.04444444
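# To see where the few errors occur (an optional check of my own), a confusion
# matrix cross-tabulates the predicted species against the actual species:
table(Predicted=predicted.species, Actual=test$Species)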
# The misclassification error is higher than I expected. Let me see whether I can select a K value that brings the error down.
# Choosing a K Value
# Although the data set is quite small for really getting a feel for a good K value, I will go ahead and create a plot of the error (misclassification) rate for k values ranging from 1 to 10.
predicted.species<-NULL
error.rate<-NULL
# Fit a KNN model for each k from 1 to 10 and record the test misclassification rate
for(i in 1:10){
  set.seed(101)
  predicted.species<-knn(train[1:4],test[1:4],train$Species, k=i)
  error.rate[i]<-mean(test$Species!=predicted.species)
}
library(ggplot2)
k.values<-1:10
error.df<-data.frame(error.rate,k.values)
pl<-ggplot(error.df,aes(x=k.values,y=error.rate)) + geom_point()
pl + geom_line(lty="dotted",color='blue')
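# Rather than eyeballing the plot, we can also read off the best k values
# programmatically (a small sketch of my own):
k.values[error.rate==min(error.rate)]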
**In order to minimize the error rate, I should select a K value between 2 and 6.**
# K-means Clustering
# Now I will take an unsupervised learning approach to the iris data set.
pl<-ggplot(iris, aes(Petal.Length, Petal.Width, color=Species))
pl+geom_point(size=4)
# Well, it looks like the setosa species is already clearly separated, so K-means clustering should have an easier time clustering setosa than the other two species.
set.seed(101)
iris.cluster<-kmeans(iris[,1:4],3,nstart = 20)
iris.cluster
## K-means clustering with 3 clusters of sizes 62, 50, 38
##
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     5.006000    3.428000     1.462000    0.246000
## 3     6.850000    3.073684     5.742105    2.071053
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3
## [106] 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3
## [141] 3 3 1 3 3 3 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 39.82097 15.15100 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
library(cluster)
clusplot(iris,iris.cluster$cluster,color=TRUE, labels=0, lines=0)
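# An aside (my own sketch): when passed the full iris data frame, clusplot()
# coerces the Species factor to numeric codes, so the label column itself can
# influence the 2-D projection. A variant using only the four numeric feature
# columns keeps the plot purely measurement-based:
clusplot(iris[,1:4], iris.cluster$cluster, color=TRUE, shade=TRUE, labels=0, lines=0)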