clustering techniques ii

library("cluster")
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

set.seed(1234)
# Generate 500 objects, divided into 2 clusters.
df <- rbind(cbind(rnorm(200,0,8), rnorm(200,0,8)),
cbind(rnorm(300,50,8), rnorm(300,50,8)))
# Specify column and row names
colnames(df) <- c("x", "y")
rownames(df) <- paste0("S", 1:nrow(df))
head(df, nrow = 6)

##             x        y
## S1  -9.656526 3.881815
## S2   2.219434 5.574150
## S3   8.675529 1.484111
## S4 -18.765582 5.605868
## S5   3.432998 2.493448
## S6   4.048447 6.083699

fviz_nbclust(df, clara, method = "silhouette")+
theme_classic()

# Compute CLARA
clara.res <- clara(df, 2, samples = 50, pamLike = TRUE)
print(clara.res)

## Call:     clara(x = df, k = 2, samples = 50, pamLike = TRUE) 
## Medoids:
##              x         y
## S121 -1.531137  1.145057
## S455 48.357304 50.233499
## Objective function:   9.87862
## Clustering vector:    Named int [1:500] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "names")= chr [1:500] "S1" "S2" "S3" "S4" "S5" "S6" "S7" ...
## Cluster sizes:            200 300 
## Best sample:
##  [1] S37  S49  S54  S63  S68  S71  S76  S80  S82  S101 S103 S108 S109 S118
## [15] S121 S128 S132 S138 S144 S162 S203 S210 S216 S231 S234 S249 S260 S261
## [29] S286 S299 S304 S305 S312 S315 S322 S350 S403 S450 S454 S455 S456 S465
## [43] S488 S497
## 
## Available components:
##  [1] "sample"     "medoids"    "i.med"      "clustering" "objective" 
##  [6] "clusinfo"   "diss"       "call"       "silinfo"    "data"

dd <- cbind(df, cluster = clara.res$cluster)
head(dd, n = 4)

##             x        y cluster
## S1  -9.656526 3.881815       1
## S2   2.219434 5.574150       1
## S3   8.675529 1.484111       1
## S4 -18.765582 5.605868       1

# Medoids
clara.res$medoids

##              x         y
## S121 -1.531137  1.145057
## S455 48.357304 50.233499

# Clustering
head(clara.res$clustering, 10)

##  S1  S2  S3  S4  S5  S6  S7  S8  S9 S10 
##   1   1   1   1   1   1   1   1   1   1

fviz_cluster(clara.res,
palette = c("#00AFBB", "#FC4E07"), # color palette
ellipse.type = "t", # Concentration ellipse
geom = "point", pointsize = 1,
ggtheme = theme_classic()
)

data("USArrests")
df <- scale(USArrests)
head(df, nrow = 6)

##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

# Compute the dissimilarity matrix
# df = the standardized data
res.dist <- dist(df, method = "euclidean")
as.matrix(res.dist)[1:6, 1:6]

##             Alabama   Alaska  Arizona Arkansas California Colorado
## Alabama    0.000000 2.703754 2.293520 1.289810   3.263110 2.651067
## Alaska     2.703754 0.000000 2.700643 2.826039   3.012541 2.326519
## Arizona    2.293520 2.700643 0.000000 2.717758   1.310484 1.365031
## Arkansas   1.289810 2.826039 2.717758 0.000000   3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641   0.000000 1.287619
## Colorado   2.651067 2.326519 1.365031 2.831051   1.287619 0.000000

res.hc <- hclust(d = res.dist, method = "ward.D2")
fviz_dend(res.hc, cex = 0.5)

# Compute cophentic distance
res.coph <- cophenetic(res.hc)
# Correlation between cophenetic distance and
# the original distance
cor(res.dist, res.coph)

## [1] 0.6975266

res.hc2 <- hclust(res.dist, method = "average")
cor(res.dist, cophenetic(res.hc2))

## [1] 0.7180382

# Cut tree into 4 groups
grp <- cutree(res.hc, k = 4)
head(grp, n = 4)

##  Alabama   Alaska  Arizona Arkansas 
##        1        2        2        3

# Number of members in each cluster
table(grp)

## grp
##  1  2  3  4 
##  7 12 19 12

# Get the names for the members of cluster 1
rownames(df)[grp == 1]

## [1] "Alabama"        "Georgia"        "Louisiana"      "Mississippi"   
## [5] "North Carolina" "South Carolina" "Tennessee"

# Cut in 4 groups and color by groups
fviz_dend(res.hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)

fviz_cluster(list(data = df, cluster = grp),
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "convex", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
show.clust.cent = FALSE, ggtheme = theme_minimal())

# Agglomerative Nesting (Hierarchical Clustering)
res.agnes <- agnes(x = USArrests, # data matrix
stand = TRUE, # Standardize the data
metric = "euclidean", # metric for distance matrix
method = "ward" # Linkage method
)
# DIvisive ANAlysis Clustering
res.diana <- diana(x = USArrests, # data matrix
stand = TRUE, # standardize the data
metric = "euclidean" # metric for distance matrix
)

fviz_dend(res.agnes, cex = 0.6, k = 4)

clustering techniques ii

Kushan De Silva

April 28, 2018