Clustering_presentation

library(cluster)

##1) Hierarchical clustering

##upload IRIS dataset and tranform the data in the form of a distance matrix using the DIST function. If "method=" is omitted, the default metric- Eucleadean Distance (root sum-of-squares of differences)- is used.Another avaialable distance function is DAISY (automatic standardardization for mixed data types, based on Gower distance)

##select only numerical variables 
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

irispetal<- iris[, 3:4]

iris.matrix<-dist(irispetal)

##In CLUSTER package, hierarchical agglomerative clustering can be achieved via HCLUST or AGNES (the latter provides the agglomerative coefficient and does not necessarily need dissimilarity matrix as input- it can be calculated in line by specifing the metric=eucledean or metric=Manhattan)

clusters <- agnes(iris.matrix) ##no need to specify diss=TRUE and metric=whatever, since input is a distance matrix
plot(clusters)

##cut tree at 3-cluster solution(height of about 2)

library(dendextend)

## 
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

hc<- as.dendrogram(clusters)
rect.dendrogram(hc, 3, border = 2)
abline(h = heights_per_k.dendrogram(hc)["3"] + .1, , col = "blue")

##explore memberships
clusters3 = cutree(hc,3)
table(clusters3)

## clusters3
##  1  2  3 
## 50 46 54

aggregate(irispetal,list(clusters3),median)

##   Group.1 Petal.Length Petal.Width
## 1       1         1.50         0.2
## 2       2         4.25         1.3
## 3       3         5.50         2.0

aggregate(irispetal,list(clusters3),mean)

##   Group.1 Petal.Length Petal.Width
## 1       1     1.462000    0.246000
## 2       2     4.191304    1.302174
## 3       3     5.514815    1.994444

##2) k-means clustering

kcluster<- kmeans(irispetal, 3)
##adding nstart=i argument will change the number of iteration. R automatically chooses the assignments with lowest within-cluster variation.

kcluster

## K-means clustering with 3 clusters of sizes 54, 50, 46
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.292593    1.359259
## 2     1.462000    0.246000
## 3     5.626087    2.047826
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [106] 3 1 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 1 3 3 1 1 3 3 3 3 3 3 3 3 3 3 1 3
## [141] 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 14.22741  2.02200 15.16348
##  (between_SS / total_SS =  94.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

centroids=as.data.frame(kcluster$centers)
iris$cluster=factor(kcluster$cluster)

library(ggfortify)

## Loading required package: ggplot2

library(ggplot2)

##using ggplot2::autoplot to plot kmeans through ggfortify (automatically converts input to a df)
ggplot(data=iris, aes(x=Petal.Length, y=Petal.Width, color=cluster ))+
  geom_point() +
geom_point(data=centroids, aes(Petal.Length,Petal.Width, color='Center'), size=5, alpha=.3, legend=FALSE)

## Warning: Ignoring unknown parameters: legend

##Or use ggfortfy
###autoplot(kcluster, data=iris, frame = TRUE, scale=0) 
##frame=T draws cluster borders

## Use table to compare the results of agnes and kmeans solutions 

table(clusters3,kcluster$cluster)

##          
## clusters3  1  2  3
##         1  0 50  0
##         2 46  0  0
##         3  8  0 46

library(gridExtra)

agnest<- table(clusters3,iris$Species)
kt<- table(kcluster$cluster, iris$Species)
agnest

##          
## clusters3 setosa versicolor virginica
##         1     50          0         0
##         2      0         45         1
##         3      0          5        49

kt

##    
##     setosa versicolor virginica
##   1      0         48         6
##   2     50          0         0
##   3      0          2        44

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

semi_join(as.data.frame(agnest), as.data.frame(kt))

## Joining, by = c("Var2", "Freq")

##   clusters3       Var2 Freq
## 1         2     setosa    0
## 2         3     setosa    0
## 3         1     setosa   50
## 4         1 versicolor    0
## 5         1  virginica    0

##return rows in Kmeans clusters table that are not matched by the Agnes one

anti_join(as.data.frame(agnest), as.data.frame(kt))

## Joining, by = c("Var2", "Freq")

##   clusters3       Var2 Freq
## 1         3 versicolor    5
## 2         2 versicolor   45
## 3         2  virginica    1
## 4         3  virginica   49

##3)PAM


pamcluster<- pam(iris.matrix,3)

pamcluster

## Medoids:
##       ID    
## [1,]  50  50
## [2,]  66  66
## [3,] 129 129
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 3 3 3 3 3 3 3 3
## Objective function:
##     build      swap 
## 0.3713132 0.3663135 
## 
## Available components:
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"

clusplot(pamcluster, color = TRUE)

##or with ggfortify 
###autoplot(pam(irispetal,3), scale.default=FALSE) 

##compare pam to agnes (run gridtable)

table(pamcluster$clustering, iris$Species)

##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         49         7
##   3      0          1        43

Clustering_presentation

Ambra

March 3, 2017