Execute by Neha Raut

Introduction:

The Iris dataset contains the data for 50 flowers from each of the 3 species - Setosa, Versicolor and Virginica. The data gives the measurements in centimeters of the variables sepal length and width and petal length and width for each of the flowers.

The goal of this study is to perform exploratory analysis on the data and build a K-means clustering model that groups the flowers into clusters. We assume the species column is not available while forming the clusters, and then use it afterwards to check the model's performance.

Step 1: Load Data

library(ggplot2)

# Load the built-in iris data set and preview its first six rows.
data("iris")
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Standardise the numeric measurements column by column (scale() centres and
# scales by default). The Species label is left out so that only the four
# measurements take part in the clustering.
irisScale <- scale(iris[, names(iris) != "Species"])
head(irisScale)

##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052
## [6,]   -0.5353840  1.93331463    -1.165809   -1.048667

# Sepal length against sepal width, one colour per species.
ggplot(data = iris,
       mapping = aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point()

# Petal length against petal width, one colour per species.
ggplot(data = iris,
       mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  geom_point()

The scatter plot of Petal.Length against Petal.Width clearly separates the Setosa species from the other two with very high confidence. However, the Versicolor and Virginica species overlap.

Finding the optimum number of clusters using Elbow method

K.max <- 10  # largest number of clusters to try

# Preallocate both result containers at their final size.
# (`rep(NA, K.max = 10)` passed K.max as an unused named argument and so
# produced a length-1 vector that was silently grown inside the loop.)
wss <- rep(NA_real_, K.max)       # total within-cluster sum of squares per k
nClust <- vector("list", K.max)   # cluster sizes per k

for (i in seq_len(K.max)) {
  # nstart > 1 keeps the best of several random initialisations, so one
  # unlucky start does not distort the elbow curve.
  irisclasses <- kmeans(irisScale, centers = i, nstart = 25)
  wss[i] <- irisclasses$tot.withinss   # total within-cluster SS for i clusters
  nClust[[i]] <- irisclasses$size      # number of observations in each cluster
}

Once the for loop has executed, you will have a vector of within-cluster sums of squares and a list of cluster sizes for each number of clusters.

# Elbow plot: total within-cluster sum of squares against the number of
# clusters. The point where the curve flattens suggests a suitable k.
plot(seq_len(K.max), wss,
     type = "b", pch = 19,   # "b" draws both points and lines; pch sets the symbol
     xlab = "Number of clusters",
     # wss holds kmeans()$tot.withinss, so the axis is labelled accordingly
     ylab = "Total within-cluster sum of squares")

# Fit the final k-means model with three clusters and inspect its structure.
fitK <- kmeans(x = irisScale, centers = 3)
str(fitK)

## List of 9
##  $ cluster     : int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
##  $ centers     : num [1:3, 1:4] 1.1322 -1.0112 -0.0501 0.0881 0.8504 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:3] 47.5 47.4 44.1
##  $ tot.withinss: num 139
##  $ betweenss   : num 457
##  $ size        : int [1:3] 47 50 53
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

# Plot the iris data frame with each point coloured by its assigned cluster.
plot(x = iris, col = fitK[["cluster"]])

# Cross-tabulate the cluster assignments against the true species labels.
table(Predicted = fitK[["cluster"]], Actual = iris[["Species"]])

##          Actual
## Predicted setosa versicolor virginica
##         1      0         11        36
##         2     50          0         0
##         3      0         39        14

# Derive the counts from the confusion table instead of typing them in by hand:
# kmeans() starts from random centres, so cluster labels and counts can change
# between runs and hard-coded numbers would go stale.
confusionTable <- table(Predicted = fitK$cluster, Actual = iris$Species)

# Total number of correctly classified instances are:
# each cluster is credited with its majority species (the row maximum).
nCorrect <- sum(apply(confusionTable, 1, max))
nCorrect

## [1] 125

# Total number of incorrectly classified instances are:
nIncorrect <- sum(confusionTable) - nCorrect
nIncorrect

## [1] 25

# Accuracy
nCorrect / (nCorrect + nIncorrect)    # about 83% for the run shown here

## [1] 0.8333333

k-Means Clustering

Introduction:

Step 1: Load Data

Finding the optimum number of clusters using Elbow method