Chapter 8

Data preparation

library(datasets)
# Standardize every column of USArrests (scale() centers and scales by
# default) so the variables are on a common scale before distances are computed.
df <- scale(USArrests)

# Subset containing 10 rows, drawn reproducibly.
# The row indices come from the data itself rather than a hard-coded 1:50,
# so the sampling stays valid if the input data set changes size.
set.seed(123)
ss <- sample(seq_len(nrow(df)), 10)
print(ss)
##  [1] 31 15 14  3 42 43 37 48 25 26
# drop = FALSE keeps the result a matrix whatever the subset size is
df <- df[ss, , drop = FALSE]

Dendrograms comparison

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Pairwise Euclidean distances between the rows of df
res.dist <- dist(df, method = "euclidean")

# Cluster the same distance matrix with two different linkage methods
hc1 <- hclust(d = res.dist, method = "average")
print(hc1)
## 
## Call:
## hclust(d = res.dist, method = "average")
## 
## Cluster method   : average 
## Distance         : euclidean 
## Number of objects: 10
hc2 <- hclust(d = res.dist, method = "ward.D2")
print(hc2)
## 
## Call:
## hclust(d = res.dist, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 10
# Convert both hclust results to dendrogram objects
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)

# Collect the two dendrograms in a single dendlist
dend_list <- dendlist(dend1, dend2)

Visual comparison of two dendrograms

# Draw the two dendrograms side by side as a tanglegram
tanglegram(dend1 = dend1, dend2 = dend2)

* Customize the tanglegram using other options

# Entanglement quantifies the alignment quality of the two trees;
# a lower value means a better alignment.
alignment_score <- round(entanglement(dend_list), 2)

# Same tanglegram with the highlighting/colouring options switched off and
# the entanglement shown in the title
tanglegram(
  dend1, dend2,
  highlight_distinct_edges = FALSE,
  common_subtrees_color_lines = FALSE,
  common_subtrees_color_branches = FALSE,
  main = paste("entanglement =", alignment_score)
)

Correlations

# Correlation matrix between the trees, cophenetic method
print(cor.dendlist(dend_list, method = "cophenetic"))
##           [,1]      [,2]
## [1,] 1.0000000 0.9925544
## [2,] 0.9925544 1.0000000
# Correlation matrix between the trees, Baker method
print(cor.dendlist(dend_list, method = "baker"))
##           [,1]      [,2]
## [1,] 1.0000000 0.9895528
## [2,] 0.9895528 1.0000000

The cophenetic and Baker coefficients are close to each other (0.993 vs. 0.990), which suggests that the computed correlations are consistent.

Next, we compute the correlation between the two trees directly.

# Cophenetic correlation coefficient between the two dendrograms
print(cor_cophenetic(dend1, dend2))
## [1] 0.9925544

The two trees appear to be highly correlated; to confirm this, we also compute Baker's gamma correlation.

# Baker's gamma correlation coefficient between the two dendrograms
print(cor_bakers_gamma(dend1, dend2))
## [1] 0.9895528

We can also compare multiple dendrograms simultaneously. The resulting plot resembles a heat map of all pairwise correlations.

# Build one dendrogram per linkage method from a single shared distance matrix
# (dist() defaults to Euclidean distance).
# NOTE(review): the hclust documentation says "centroid" linkage is meant to be
# used with squared Euclidean distances; left unchanged here so the printed
# results below stay valid. Confirm whether this is intended.
dist_mat <- dist(df)
dend1 <- as.dendrogram(hclust(dist_mat, method = "complete"))
dend2 <- as.dendrogram(hclust(dist_mat, method = "single"))
dend3 <- as.dendrogram(hclust(dist_mat, method = "average"))
dend4 <- as.dendrogram(hclust(dist_mat, method = "centroid"))
# Gather the trees in a named dendlist and correlate every pair
dend_list <- dendlist(
  Complete = dend1,
  Single = dend2,
  Average = dend3,
  Centroid = dend4
)
cors <- cor.dendlist(dend_list)
# Show the correlation matrix rounded to two decimals
print(round(cors, digits = 2))
##          Complete Single Average Centroid
## Complete     1.00   0.46    0.45     0.30
## Single       0.46   1.00    0.23     0.17
## Average      0.45   0.23    1.00     0.31
## Centroid     0.30   0.17    0.31     1.00
# Visualize the correlation matrix using the corrplot package
library(corrplot)
## corrplot 0.92 loaded
corrplot(cors, method = "pie", type = "lower")