Executed by Neha Raut
The Iris dataset contains the data for 50 flowers from each of the 3 species - Setosa, Versicolor and Virginica. The data gives the measurements in centimeters of the variables sepal length and width and petal length and width for each of the flowers.
The goal of this study is to perform exploratory analysis on the data and to build a K-means clustering model that groups the flowers into clusters. We form the clusters as if the Species column were not available, and then use that column only to check the model's performance.
library(ggplot2)
# Load the built-in iris data set and preview its first six rows.
data("iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Standardise the four numeric measurements (column 5, Species, is dropped):
# each column is centred on its mean and divided by its standard deviation.
irisScale <- scale(x = iris[, -5], center = TRUE, scale = TRUE)
head(irisScale, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.8976739 1.01560199 -1.335752 -1.311052
## [2,] -1.1392005 -0.13153881 -1.335752 -1.311052
## [3,] -1.3807271 0.32731751 -1.392399 -1.311052
## [4,] -1.5014904 0.09788935 -1.279104 -1.311052
## [5,] -1.0184372 1.24503015 -1.335752 -1.311052
## [6,] -0.5353840 1.93331463 -1.165809 -1.048667
# Scatter plots of the sepal and the petal measurements, coloured by species.
ggplot(data = iris,
       mapping = aes(x = Sepal.Length, y = Sepal.Width, colour = Species)) +
  geom_point()
ggplot(data = iris,
       mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  geom_point()
The scatter plot of Petal.Length against Petal.Width clearly separates the Setosa species from the other two with very high confidence. However, the Versicolor and Virginica species overlap.
# Elbow analysis: fit k-means for k = 1..K.max and record, for each k, the
# total within-cluster sum of squares and the cluster sizes.
K.max <- 10 # largest number of clusters to try

# Preallocate one slot per k. The previous `rep(NA, K.max = 10)` did not do
# this: `K.max` is not an argument of rep(), so it was silently ignored and a
# length-1 vector came back, which the loop then had to grow.
wss <- rep(NA_real_, K.max)
nClust <- vector("list", K.max)

# kmeans() starts from randomly chosen centres; fix the seed so the curve is
# reproducible from run to run.
set.seed(123)
for (i in seq_len(K.max)) {
  # nstart = 25 keeps the best of 25 random starts, which avoids recording a
  # poor local optimum for a given k.
  irisclasses <- kmeans(irisScale, centers = i, nstart = 25)
  wss[i] <- irisclasses$tot.withinss # total within-cluster SS for k = i
  nClust[[i]] <- irisclasses$size # observations in each of the i clusters
}
Once the for loop has executed, `wss` holds the total within-cluster sum of squares for each number of clusters, and `nClust` holds the corresponding cluster sizes.
# Elbow plot: total within-cluster sum of squares against the number of
# clusters.
plot(seq_len(K.max), wss,
  type = "b", pch = 19, # "b" draws both points and lines; pch picks the symbol
  xlab = "Number of clusters",
  # wss stores kmeans()$tot.withinss, so the axis is labelled as such (it was
  # previously mislabelled as the between-SS / total-SS ratio).
  ylab = "Total within-cluster sum of squares"
)
# Final model: k-means with three clusters on the standardised measurements.
# kmeans() starts from random centres, so a single unseeded start can return a
# different (and possibly worse) partition on every run. Fix the seed and keep
# the best of 25 starts so the fit is stable and reproducible.
# Note: cluster numbers are arbitrary labels; the same partition may be
# numbered differently from one run to another.
set.seed(123)
fitK <- kmeans(irisScale, centers = 3, nstart = 25)
str(fitK)
## List of 9
## $ cluster : int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## $ centers : num [1:3, 1:4] 1.1322 -1.0112 -0.0501 0.0881 0.8504 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:3] 47.5 47.4 44.1
## $ tot.withinss: num 139
## $ betweenss : num 457
## $ size : int [1:3] 47 50 53
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Scatter-plot matrix of every pair of iris columns, coloured by the cluster
# each flower was assigned to.
plot(iris, col = fitK[["cluster"]])
# Cross-tabulate the cluster assignments against the true species.
table(fitK[["cluster"]], iris[["Species"]], dnn = c("Predicted", "Actual"))
## Actual
## Predicted setosa versicolor virginica
## 1 0 11 36
## 2 50 0 0
## 3 0 39 14
# Score the clustering against the true species. The counts are derived from
# the confusion table rather than typed in by hand: k-means is randomly
# initialised, so the cluster numbering (and potentially the cell counts) can
# change between runs.
confusion <- table(Predicted = fitK$cluster, Actual = iris$Species)
# Label each cluster with its most frequent species; flowers of that species
# inside the cluster count as correctly classified.
nCorrect <- sum(apply(confusion, 1, max))
# Total number of correctly classified instances:
nCorrect
## [1] 125
# Total number of incorrectly classified instances:
nIncorrect <- sum(confusion) - nCorrect
nIncorrect
## [1] 25
# Accuracy (about 83% for the run recorded here)
nCorrect / sum(confusion)
## [1] 0.8333333