library(cluster)
##1) Hierarchical clustering
##upload IRIS dataset and tranform the data in the form of a distance matrix using the DIST function. If "method=" is omitted, the default metric- Eucleadean Distance (root sum-of-squares of differences)- is used.Another avaialable distance function is DAISY (automatic standardardization for mixed data types, based on Gower distance)
##select only numerical variables
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
irispetal<- iris[, 3:4]
iris.matrix<-dist(irispetal)
##In CLUSTER package, hierarchical agglomerative clustering can be achieved via HCLUST or AGNES (the latter provides the agglomerative coefficient and does not necessarily need dissimilarity matrix as input- it can be calculated in line by specifing the metric=eucledean or metric=Manhattan)
clusters <- agnes(iris.matrix) ##no need to specify diss=TRUE and metric=whatever, since input is a distance matrix
plot(clusters)

##cut tree at 3-cluster solution(height of about 2)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
hc<- as.dendrogram(clusters)
rect.dendrogram(hc, 3, border = 2)
abline(h = heights_per_k.dendrogram(hc)["3"] + .1, , col = "blue")

##explore memberships
clusters3 = cutree(hc,3)
table(clusters3)
## clusters3
## 1 2 3
## 50 46 54
aggregate(irispetal,list(clusters3),median)
## Group.1 Petal.Length Petal.Width
## 1 1 1.50 0.2
## 2 2 4.25 1.3
## 3 3 5.50 2.0
aggregate(irispetal,list(clusters3),mean)
## Group.1 Petal.Length Petal.Width
## 1 1 1.462000 0.246000
## 2 2 4.191304 1.302174
## 3 3 5.514815 1.994444
##2) k-means clustering
kcluster<- kmeans(irispetal, 3)
##adding nstart=i argument will change the number of iteration. R automatically chooses the assignments with lowest within-cluster variation.
kcluster
## K-means clustering with 3 clusters of sizes 54, 50, 46
##
## Cluster means:
## Petal.Length Petal.Width
## 1 4.292593 1.359259
## 2 1.462000 0.246000
## 3 5.626087 2.047826
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [106] 3 1 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 1 3 3 1 1 3 3 3 3 3 3 3 3 3 3 1 3
## [141] 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 14.22741 2.02200 15.16348
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
centroids=as.data.frame(kcluster$centers)
iris$cluster=factor(kcluster$cluster)
library(ggfortify)
## Loading required package: ggplot2
library(ggplot2)
##using ggplot2::autoplot to plot kmeans through ggfortify (automatically converts input to a df)
ggplot(data=iris, aes(x=Petal.Length, y=Petal.Width, color=cluster ))+
geom_point() +
geom_point(data=centroids, aes(Petal.Length,Petal.Width, color='Center'), size=5, alpha=.3, legend=FALSE)
## Warning: Ignoring unknown parameters: legend

##Or use ggfortfy
###autoplot(kcluster, data=iris, frame = TRUE, scale=0)
##frame=T draws cluster borders
## Use table to compare the results of agnes and kmeans solutions
table(clusters3,kcluster$cluster)
##
## clusters3 1 2 3
## 1 0 50 0
## 2 46 0 0
## 3 8 0 46
library(gridExtra)
agnest<- table(clusters3,iris$Species)
kt<- table(kcluster$cluster, iris$Species)
agnest
##
## clusters3 setosa versicolor virginica
## 1 50 0 0
## 2 0 45 1
## 3 0 5 49
kt
##
## setosa versicolor virginica
## 1 0 48 6
## 2 50 0 0
## 3 0 2 44
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
semi_join(as.data.frame(agnest), as.data.frame(kt))
## Joining, by = c("Var2", "Freq")
## clusters3 Var2 Freq
## 1 2 setosa 0
## 2 3 setosa 0
## 3 1 setosa 50
## 4 1 versicolor 0
## 5 1 virginica 0
##return rows in Kmeans clusters table that are not matched by the Agnes one
anti_join(as.data.frame(agnest), as.data.frame(kt))
## Joining, by = c("Var2", "Freq")
## clusters3 Var2 Freq
## 1 3 versicolor 5
## 2 2 versicolor 45
## 3 2 virginica 1
## 4 3 virginica 49
##3)PAM
pamcluster<- pam(iris.matrix,3)
pamcluster
## Medoids:
## ID
## [1,] 50 50
## [2,] 66 66
## [3,] 129 129
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 3 3 3 3 3 3 3 3
## Objective function:
## build swap
## 0.3713132 0.3663135
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call"
clusplot(pamcluster, color = TRUE)

##or with ggfortify
###autoplot(pam(irispetal,3), scale.default=FALSE)
##compare pam to agnes (run gridtable)
table(pamcluster$clustering, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 49 7
## 3 0 1 43