Renni Ekaputri
|Data Scientist|Business Analyst|BI Specialist|Senior Project Management Officer|Senior Project Planning|
|EASA Part M CAMO 145 Technical Service Engineer|Aircraft Asset Management Specialist|

Understand Cognition of Clustering Analysis

For better visualization and sensitivity analysis, this project uses several R packages (DT, dendextend, colorspace, MASS, gplots, corrplot, and cluster).

Four datasets are used in this project: iris (flower species), votes.repub (votes for the Republican presidential candidate), animals (animal attributes for clustering), and the khan microarray gene data (train and test expression sets).

# Preview the iris, votes.repub and animals data sets as paged tables
# (4 rows per page, horizontal scrolling enabled).
library("DT") # attached once; datatable() is used for all three previews
iris <- datasets::iris
datatable(iris, options = list(pageLength = 4, scrollX = TRUE))
votes.repub <- cluster::votes.repub
datatable(votes.repub, options = list(pageLength = 4, scrollX = TRUE))
animals <- cluster::animals
datatable(animals, options = list(pageLength = 4, scrollX = TRUE))

Study Cases

Flower Species

# Attach dendextend (dendrogram helpers used throughout the code below) and knitr.
library(dendextend)
library(knitr)
# Default chunk options for the rendered report: cache chunk results,
# dpi = 75, 6 x 6 figures, and keep the source formatting as written.
knitr::opts_chunk$set(
   cache = TRUE,
   dpi = 75,
   fig.width = 6, fig.height = 6,
   tidy = FALSE)
# Split iris into the species factor (column 5) and the four measurement
# columns, then draw a scatter-plot matrix coloured by species.
iris <- datasets::iris
species_labels <- iris[[5]]
iris2 <- iris[, 1:4]
library(colorspace) # colour palettes
hcl_palettes(plot = TRUE)

(palette <- "Dark 3")
## [1] "Dark 3"
# One colour per observation, looked up through the species factor codes.
species_col <- rev(rainbow_hcl(3))[as.integer(species_labels)]
# Scatter-plot matrix (SPLOM); the lower panels are left empty.
pairs(
   iris2,
   col = species_col,
   lower.panel = NULL,
   cex.labels = 1.1,
   pch = 20,
   cex = 1.2
)

# Legend: xpd = TRUE is set first, then the legend is drawn at (0.01, 0.2).
par(xpd = TRUE)
legend(
   x = 0.01, y = 0.2,
   legend = levels(species_labels),
   fill = unique(species_col),
   cex = 1
)

par(xpd = NA)
# Parallel-coordinates view of the four iris measurements, coloured by species.
par(las = 1, mar = c(4.5, 3, 3, 2) + 0.1, cex = .8)
MASS::parcoord(iris2, col = species_col, var.label = TRUE, lwd = 2)

# Title
title(main = "Parallel coordinates plot of the Iris data")
# Horizontal legend; xpd = TRUE is set before drawing it at y = -.25.
par(xpd = TRUE)
legend(
   x = 1.75, y = -.25,
   legend = levels(species_labels),
   fill = unique(species_col),
   horiz = TRUE,
   cex = 1
)

par(xpd = NA)
# Hierarchical clustering of the four iris measurements (default dist(),
# complete linkage), drawn as a horizontal dendrogram whose leaf labels carry
# the true species.
d_iris <- dist(iris2) # method="man" # is a bit better
hc_iris <- hclust(d_iris, method = "complete")
iris_species <- rev(levels(iris[, 5]))

library(dendextend)
dend <- as.dendrogram(hc_iris)
# Order the leaves as close as we can to the order of the observations.
# The target order is derived from the data instead of a hard-coded 1:150.
dend <- rotate(dend, seq_len(nrow(iris2)))

# Color the branches based on the clusters:
dend <- color_branches(dend, k = 3) #, groupLabels=iris_species)

# Manually match the label colors, as much as possible, to the real
# classification of the flowers (species codes taken in leaf order):
labels_colors(dend) <-
   rainbow_hcl(3)[sort_levels_values(
      as.numeric(iris[, 5])[order.dendrogram(dend)]
   )]

# Prefix each leaf label with the flower type, e.g. "setosa(1)":
labels(dend) <- paste0(
   as.character(iris[, 5])[order.dendrogram(dend)],
   "(", labels(dend), ")"
)
# We hang the dendrogram a bit:
dend <- hang.dendrogram(dend, hang_height = 0.1)
# reduce the size of the labels:
# dend <- assign_values_to_leaves_nodePar(dend, 0.5, "lab.cex")
dend <- set(dend, "labels_cex", 0.5)
# And plot:
par(mar = c(3, 3, 3, 7))
plot(dend, 
     main = "Clustered Iris data set
     (the labels give the true flower species)", 
     horiz = TRUE, nodePar = list(cex = .007))
legend("topleft", legend = iris_species, fill = rainbow_hcl(3))

# Circular layout of the same dendrogram, with all figure margins set to 0.
par(mar = rep(0, 4))
circlize_dendrogram(dend)

# Colour ramp for the heatmap: a reversed heat_hcl palette of n colours.
some_col_func <- function(n) {
  rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1 / 5, 1.5)))
}
library(gplots)
# Heatmap of the raw measurements; rows follow the dendrogram built above.
gplots::heatmap.2(as.matrix(iris2), 
          main = "Heatmap for the Iris data set",
          srtCol = 20,
          dendrogram = "row",
          Rowv = dend,
          Colv = FALSE, # do not reorder the columns (was the string "NA")
          trace = "none",
          margins = c(5, 0.1),
          key.xlab = "Cm",
          denscol = "grey",
          density.info = "density",
          RowSideColors = rev(labels_colors(dend)), # to add nice colored strips
          col = some_col_func
         )

hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty", 
        "median", "centroid", "ward.D2")
# Build one dendrogram per linkage method from the same distance matrix and
# collect them in a dendlist. Using lapply() avoids growing the list inside a
# loop and no longer overwrites the global hc_iris (the complete-linkage fit)
# or leaves a loop index behind.
iris_dendlist <- do.call(
   dendlist,
   lapply(hclust_methods, function(linkage) {
      as.dendrogram(hclust(d_iris, method = linkage))
   })
)
names(iris_dendlist) <- hclust_methods
iris_dendlist
## $ward.D
## 'dendrogram' with 2 branches and 150 members total, at height 199.6205 
## 
## $single
## 'dendrogram' with 2 branches and 150 members total, at height 1.640122 
## 
## $complete
## 'dendrogram' with 2 branches and 150 members total, at height 7.085196 
## 
## $average
## 'dendrogram' with 2 branches and 150 members total, at height 4.062683 
## 
## $mcquitty
## 'dendrogram' with 2 branches and 150 members total, at height 4.497283 
## 
## $median
## 'dendrogram' with 2 branches and 150 members total, at height 2.82744 
## 
## $centroid
## 'dendrogram' with 2 branches and 150 members total, at height 2.994307 
## 
## $ward.D2
## 'dendrogram' with 2 branches and 150 members total, at height 32.44761 
## 
## attr(,"class")
## [1] "dendlist"
# Pairwise correlation matrix between the dendrograms (cor.dendlist defaults).
iris_dendlist_cor <- cor.dendlist(iris_dendlist)
print(iris_dendlist_cor)
##             ward.D    single  complete   average  mcquitty    median
## ward.D   1.0000000 0.9836838 0.5774013 0.9841333 0.9641103 0.9451815
## single   0.9836838 1.0000000 0.5665529 0.9681156 0.9329029 0.9444723
## complete 0.5774013 0.5665529 1.0000000 0.6195121 0.6107473 0.6889092
## average  0.9841333 0.9681156 0.6195121 1.0000000 0.9828015 0.9449422
## mcquitty 0.9641103 0.9329029 0.6107473 0.9828015 1.0000000 0.9203374
## median   0.9451815 0.9444723 0.6889092 0.9449422 0.9203374 1.0000000
## centroid 0.9809088 0.9903934 0.5870062 0.9801444 0.9499123 0.9403569
## ward.D2  0.9911648 0.9682507 0.6096286 0.9895131 0.9829977 0.9445832
##           centroid   ward.D2
## ward.D   0.9809088 0.9911648
## single   0.9903934 0.9682507
## complete 0.5870062 0.6096286
## average  0.9801444 0.9895131
## mcquitty 0.9499123 0.9829977
## median   0.9403569 0.9445832
## centroid 1.0000000 0.9737886
## ward.D2  0.9737886 1.0000000
# Pie-style correlation plot (lower triangle) of the dendrogram correlations.
corrplot::corrplot(iris_dendlist_cor, method = "pie", type = "lower")

# Same comparison with method_coef = "spearman".
iris_dendlist_cor_spearman <- cor.dendlist(
   iris_dendlist,
   method_coef = "spearman"
)
corrplot::corrplot(iris_dendlist_cor_spearman, method = "pie", type = "lower")

# Tanglegram of dendrograms 1 and 8 (ward.D vs ward.D2).
iris_dendlist %>%
   dendlist(which = c(1, 8)) %>%
   ladderize() %>%
   set("branches_k_color", k = 3) %>%
   untangle(method = "step1side", k_seq = 3:20) %>%
   # Clear the branch colors; otherwise the single lines are not black, since
   # they retain the previous color from the branches_k_color.
   set("clear_branches") %>%
   tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE)

# Tanglegram of dendrograms 1 and 4 (ward.D vs average), branches colored for k = 2.
iris_dendlist %>%
   dendlist(which = c(1, 4)) %>%
   ladderize() %>%
   set("branches_k_color", k = 2) %>%
   untangle(method = "step1side", k_seq = 3:20) %>%
   tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE)

# Same pair, with ranked branches and common subtrees colored.
iris_dendlist %>%
   dendlist(which = c(1, 4)) %>%
   ladderize() %>%
   untangle(method = "step1side", k_seq = 3:20) %>%
   set("rank_branches") %>%
   tanglegram(common_subtrees_color_branches = TRUE)

# Number of distinct common-subtree clusters shared by dendrograms 1 and 4.
# The [-1] drops the "0" subtree, which indicates leaves that are singletons.
length(
   unique(
      common_subtrees_clusters(iris_dendlist[[1]], iris_dendlist[[4]])
   )[-1]
)
## [1] 39
# Tanglegram of dendrograms 3 and 4 (complete vs average), branches colored for k = 2.
iris_dendlist %>%
   dendlist(which = c(3, 4)) %>%
   ladderize() %>%
   untangle(method = "step1side", k_seq = 2:6) %>%
   set("branches_k_color", k = 2) %>%
   tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE)

# One panel per linkage method, two columns wide. The grid size and the loop
# range follow the length of iris_dendlist instead of a hard-coded 8.
par(mfrow = c(ceiling(length(iris_dendlist) / 2), 2))
for (i in seq_along(iris_dendlist)) {
   iris_dendlist[[i]] %>%
      set("branches_k_color", k = 2) %>%
      plot(axes = FALSE, horiz = TRUE)
   title(names(iris_dendlist)[i])
}

# Dendrogram-to-dendrogram similarity using method = "common".
iris_dendlist_cor2 <- cor.dendlist(iris_dendlist, method = "common")
iris_dendlist_cor2
##             ward.D    single  complete   average  mcquitty    median
## ward.D   1.0000000 0.7324415 0.8595318 0.8461538 0.8361204 0.7458194
## single   0.7324415 1.0000000 0.7324415 0.7491639 0.7458194 0.7591973
## complete 0.8595318 0.7324415 1.0000000 0.8060201 0.7993311 0.7491639
## average  0.8461538 0.7491639 0.8060201 1.0000000 0.8494983 0.7892977
## mcquitty 0.8361204 0.7458194 0.7993311 0.8494983 1.0000000 0.7859532
## median   0.7458194 0.7591973 0.7491639 0.7892977 0.7859532 1.0000000
## centroid 0.7324415 0.7625418 0.7290970 0.7725753 0.7759197 0.8528428
## ward.D2  0.8795987 0.7324415 0.8294314 0.8294314 0.8294314 0.7558528
##           centroid   ward.D2
## ward.D   0.7324415 0.8795987
## single   0.7625418 0.7324415
## complete 0.7290970 0.8294314
## average  0.7725753 0.8294314
## mcquitty 0.7759197 0.8294314
## median   0.8528428 0.7558528
## centroid 1.0000000 0.7357860
## ward.D2  0.7357860 1.0000000
corrplot::corrplot(iris_dendlist_cor2, "pie", "lower")

# Cut a dendrogram into k = 3 clusters and re-index the resulting vector by
# the dendrogram's leaf order.
# NOTE(review): the reference labels used in compare_clusters_to_iris() are in
# data order (rep(1:3, each = 50)), while this vector is re-indexed by
# order.dendrogram(); confirm that comparing the two element-wise is intended.
get_ordered_3_clusters <- function(dend) {
   cutree(dend, k = 3)[order.dendrogram(dend)]
}

dend_3_clusters <- lapply(iris_dendlist, get_ordered_3_clusters)

# Fowlkes-Mallows index between a cluster vector and three blocks of 50
# labels (1, 2, 3), treating both vectors as already aligned.
compare_clusters_to_iris <- function(clus) {
   FM_index(clus, rep(1:3, each = 50), assume_sorted_vectors = TRUE)
}

# vapply() pins the result to one number per method (sapply() would silently
# change shape if a comparison returned something else).
clusters_performance <- vapply(dend_3_clusters, compare_clusters_to_iris, numeric(1))
dotchart(sort(clusters_performance), xlim = c(0.7, 1),
         xlab = "Fowlkes-Mallows Index (from 0 to 1)",
         main = "Performance of clustering algorithms \n in detecting the 3 species",
         pch = 19)

  • We may conclude the following. The Iris data set is only 4-dimensional, making it possible to explore using a pairs plot (SPLOM) or a parallel coordinates plot. It is clear from these that two main clusters are visible, while the separation of the third cluster is difficult. In the above analysis, we learned that the complete method fails to properly separate the two main clusters when cut at k=2 (but succeeds when moving to k=3 clusters). This is different from the other 7 methods available in hclust, which do succeed in separating the 2 main clusters from the beginning (i.e., for k=2). We also noticed that all clustering algorithms share a relatively high proportion of common nodes (between 75% and 90%). Lastly, when it came to separating the flowers into 3 species, the median clustering method did the best, while the single method did the worst in this regard.

Gene

# Cluster the train and test parts of dendextend's khan data separately
# (default dist() and hclust() settings) and compare the two trees.
train <- dendextend::khan$train
test <- dendextend::khan$test
d_train <- train %>% dist %>% hclust %>% as.dendrogram
d_test <- test %>% dist %>% hclust %>% as.dendrogram
d_train_test <- dendlist(train = d_train, test = d_test)
d_train_test %>% cor.dendlist
##           train      test
## train 1.0000000 0.5708019
## test  0.5708019 1.0000000
d_train_test %>% cor.dendlist(method_coef = "spearman")
##           train      test
## train 1.0000000 0.4971936
## test  0.4971936 1.0000000
Bk_plot(d_train, d_test, k = 2:30, xlim = c(2,30))

# Color the branches for k = 7 and reuse the train tree's leaf-branch colors
# for the connecting lines of the tanglegram.
pre_tang_d_train_test <- d_train_test %>% ladderize %>% # untangle %>%
   set("branches_k_color", k = 7)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_test$train)
# The argument is spelled out as `faster` (as in the other tanglegram calls in
# this file) instead of relying on partial matching of `fast`.
pre_tang_d_train_test %>% tanglegram(faster = TRUE, color_lines = train_branches_colors)

# dput(d_train_test_common)
d_train_test_common <- structure(list(train = structure(list(structure(list(structure(171L, label = "491565", members = 1L, height = 0, leaf = TRUE), structure(178L, label = "505491", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 7.1369942952198), structure(list(structure(list(structure(8L, label = "283315", members = 1L, height = 0, leaf = TRUE), structure(9L, label = "897177", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.55936539399907), structure(list(structure(list(structure(106L, label = "345553", members = 1L, height = 0, leaf = TRUE), structure(112L, label = "307660", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 5.17910461856101), structure(list(structure(list(structure(268L, label = "504791", members = 1L, height = 0, leaf = TRUE), structure(306L, label = "782503", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 4.27052507661529), structure(list(structure(list(structure(246L, label = "81518", members = 1L, height = 0, leaf = TRUE), structure(290L, label = "280837", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.37572388944875), structure(list(structure(list(structure(266L, label = "866694", members = 1L, height = 0, leaf = TRUE), structure(277L, label = "811956", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.31301518861595), structure(list(structure(273L, label = "842918", members = 1L, height = 0, leaf = TRUE), structure(274L, label = "626555", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.71864544948399)), members = 4, midpoint = 1.5, height = 6.35097701381449)), members = 6, midpoint = 2, height = 8.7097033164167)), members = 8, midpoint = 2.25, height = 9.23807936424017)), members = 10, midpoint = 2.375, height = 11.6573350998416)), members = 12, midpoint = 2.4375, height = 17.5620766260713)), members = 14, 
midpoint = 2.46875, height = 30.2363452779928, class = "dendrogram"), test = structure(list(structure(list(structure(list(structure(171L, label = "491565", members = 1L, height = 0, leaf = TRUE), structure(178L, label = "505491", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.96666017450449), structure(list(structure(list(structure(list(structure(268L, label = "504791", members = 1L, height = 0, leaf = TRUE), structure(306L, label = "782503", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.31497882927685), structure(list(structure(list(structure(266L, label = "866694", members = 1L, height = 0, leaf = TRUE), structure(277L, label = "811956", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.75475236429532), structure(list(structure(273L, label = "842918", members = 1L, height = 0, leaf = TRUE), structure(274L, label = "626555", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.34617375921535)), members = 4, midpoint = 1.5, height = 2.76465021476497)), members = 6, midpoint = 2, height = 4.52927251774499), structure(list(structure(list(structure(246L, label = "81518", members = 1L, height = 0, leaf = TRUE), structure(290L, label = "280837", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 0.714433271901582), structure(list(structure(8L, label = "283315", members = 1L, height = 0, leaf = TRUE), structure(9L, label = "897177", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.71895552589356)), members = 4, midpoint = 1.5, height = 6.44143803354499)), members = 10, midpoint = 4.75, height = 7.736516720075)), members = 12, midpoint = 3.625, height = 11.0066972375913), structure(list(structure(106L, label = "345553", members = 1L, height = 0, leaf = TRUE), structure(112L, label = "307660", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 
3.6486307417989)), members = 14, midpoint = 8.0625, height = 18.2331742971431, class = "dendrogram")), class = "dendlist", .Names = c("train", 
"test"))
# The pruned pair d_train_test_common was calculated beforehand with:
#   d_train_test_common <- d_train_test %>% prune_common_subtrees.dendlist
#   d_train_test_common
d_train_test_common %>%
   untangle() %>%
   tanglegram(common_subtrees_color_branches = TRUE)

# Leaf counts before and after pruning.
nleaves(d_train_test)
## train  test 
##   306   306
nleaves(d_train_test_common)
## train  test 
##    14    14
  • We may conclude that the clustering algorithm produced trees that are significantly similar in the training and test data sets, beyond chance, but that this similarity is restricted to only a very small proportion of genes.

Presidential Candidate

# Line plot of the Republican vote share per state across election years.
votes.repub <- cluster::votes.repub
# Column names look like "X1856"; strip the "X" to get numeric years.
years <- as.numeric(gsub("X", "", colnames(votes.repub)))

par(las = 2, mar = c(4.5, 3, 3, 2) + 0.1, cex = .8)
# MASS::parcoord(votes.repub, var.label = FALSE, lwd = 1)
# The data are transposed so that each row of votes.repub becomes one line.
matplot(seq_len(ncol(votes.repub)), t(votes.repub), type = "l", col = 1, lty = 1,
        axes = FALSE, xlab = "", ylab = "")
axis(1, at = seq_along(years), labels = years)
axis(2)
# Add Title (each row of votes.repub is a US state, not a country)
title("Votes for Republican Candidate\n in Presidential Elections \n (each line is a state - over the years)")

# Rescale a percentage (0-100) to a proportion and return its arcsine.
arcsin_transformation <- function(x) {
  proportion <- x / 100
  asin(proportion)
}

# Dendrogram built from the missing-value pattern (is.na) of each row; its
# label order is used below as the rotation target.
dend_NA <- votes.repub %>% is.na %>%
   dist %>% hclust %>% as.dendrogram %>% ladderize

# Complete-linkage dendrogram of the arcsine-transformed votes, rotated towards
# the label order of dend_NA and colored for k = 3 clusters. The method name is
# spelled out rather than abbreviated, matching the names in hclust_methods.
dend <- votes.repub %>% arcsin_transformation %>%
   dist %>% hclust(method = "complete") %>% as.dendrogram %>%
   rotate(labels(dend_NA)) %>%
   color_branches(k = 3)

# some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
# Diverging palette for the heatmap colours.
some_col_func <- colorspace::diverge_hcl


# par(mar = c(3,3,3,3))
# library(gplots)
# Heatmap of the untransformed vote percentages; rows follow `dend`.
gplots::heatmap.2(as.matrix(votes.repub), 
          main = "Votes for\n Republican Presidential Candidate\n (clustered using complete)",
          srtCol = 60,
          dendrogram = "row",
          Rowv = dend,
          Colv = FALSE, # do not reorder the columns (was the string "NA")
          trace = "none",
          margins = c(3, 6),
          key.xlab = "% Votes for Republican\n Presidential Candidate",
          labCol = years,
          denscol = "grey",
          density.info = "density",
          col = some_col_func
         )

          # RowSideColors = rev(labels_colors(dend)), # to add nice colored strips      
hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty", 
        "median", "centroid", "ward.D2")
# One dendrogram of the arcsine-transformed votes per linkage method. Built
# with lapply() so the list is not grown inside a loop and no temporary
# dendrogram or loop index is left in the workspace.
votes.repub_dendlist <- do.call(
   dendlist,
   lapply(hclust_methods, function(linkage) {
      votes.repub %>%
         arcsin_transformation %>%
         dist %>%
         hclust(method = linkage) %>%
         as.dendrogram
   })
)
names(votes.repub_dendlist) <- hclust_methods
# votes.repub_dendlist
corrplot::corrplot(cor.dendlist(votes.repub_dendlist), "pie", "lower")

# Rescale a percentage (0-100) to a proportion and return its arcsine.
arcsin_transformation <- function(x) asin(x / 100)

# Dendrogram built from the missing-value pattern (is.na) of each row; its
# label order is used below as the rotation target.
dend_NA <- votes.repub %>% is.na %>%
   dist %>% hclust %>% as.dendrogram %>% ladderize

# Average-linkage dendrogram of the arcsine-transformed votes. The method name
# is spelled out rather than abbreviated, matching the names in hclust_methods.
dend <- votes.repub %>% arcsin_transformation %>%
   dist %>% hclust(method = "average") %>% as.dendrogram %>%
   rotate(labels(dend_NA)) %>%
   color_branches(k = 3)

# some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
some_col_func <- colorspace::diverge_hcl


# par(mar = c(3,3,3,3))
# library(gplots)
# Heatmap of the untransformed vote percentages; rows follow `dend`.
gplots::heatmap.2(as.matrix(votes.repub), 
          main = "Votes for\n Republican Presidential Candidate\n (clustered using average)",
          srtCol = 60,
          dendrogram = "row",
          Rowv = dend,
          Colv = FALSE, # do not reorder the columns (was the string "NA")
          trace = "none",
          margins = c(3, 6),
          key.xlab = "% Votes for Republican\n Presidential Candidate",
          labCol = years,
          denscol = "grey",
          density.info = "density",
          col = some_col_func
         )

          # RowSideColors = rev(labels_colors(dend)), # to add nice colored strips      
# Predefined leaf orders (state names) for the complete-linkage (ord1) and
# average-linkage (ord2) dendrograms; applied with rotate() further down.
ord1 <- c("North Carolina", "Virginia", "Tennessee", "Kentucky", "Maryland", 
"Delaware", "Oklahoma", "Missouri", "New Mexico", "Oregon", "Washington", 
"California", "West Virginia", "Hawaii", "Nevada", "Arizona", 
"Montana", "Idaho", "Wyoming", "Utah", "Colorado", "Alaska", 
"Illinois", "New York", "Indiana", "Ohio", "Connecticut", "New Hampshire", 
"New Jersey", "Pennsylvania", "Iowa", "South Dakota", "North Dakota", 
"Wisconsin", "Minnesota", "Nebraska", "Kansas", "Maine", "Michigan", 
"Massachusetts", "Rhode Island", "Vermont", "Alabama", "Georgia", 
"Louisiana", "Arkansas", "Florida", "Texas", "South Carolina", 
"Mississippi")

ord2 <- c("North Carolina", "Virginia", "Tennessee", "Oklahoma", "Kentucky", 
"Maryland", "Delaware", "Missouri", "New Mexico", "West Virginia", 
"Oregon", "Washington", "California", "Nevada", "Arizona", "Montana", 
"Colorado", "Alaska", "Idaho", "Wyoming", "Utah", "Hawaii", "Maine", 
"Illinois", "New York", "New Jersey", "Indiana", "Ohio", "Connecticut", 
"New Hampshire", "Pennsylvania", "Michigan", "Iowa", "South Dakota", 
"North Dakota", "Wisconsin", "Minnesota", "Massachusetts", "Rhode Island", 
"Nebraska", "Kansas", "Vermont", "Alabama", "Georgia", "Louisiana", 
"Arkansas", "Florida", "Texas", "South Carolina", "Mississippi"
)

# dput(lapply(dends, labels)[[2]])
# Complete- and average-linkage dendrograms of the arcsine-transformed votes.
# Method names are spelled out rather than abbreviated, matching the names in
# hclust_methods.
dend_com <- votes.repub %>% arcsin_transformation %>%
   dist %>% hclust(method = "complete") %>% as.dendrogram %>%
   rotate(labels(dend_NA)) %>%
   color_branches(k = 3) # %>% ladderize
dend_ave <- votes.repub %>% arcsin_transformation %>%
   dist %>% hclust(method = "average") %>% as.dendrogram %>%
   rotate(labels(dend_NA)) %>%
   color_branches(k = 3) # %>% ladderize

# The orders were predefined after using untangle("step2side")
# They are omitted here to save running time.
dend_com <- rotate(dend_com, ord1)
dend_ave <- rotate(dend_ave, ord2)

dends <- dendlist(complete = dend_com, average = dend_ave) # %>% untangle("step2side")
dends %>% tanglegram(margin_inner = 7)

  • We may conclude that the two clusterings give similar results for: “Alabama”, “Georgia”, “Louisiana”, “Arkansas”, “Florida”, “Texas”, “South Carolina”, “Mississippi”. There are also several other sub-trees which are identical between the two methods. The biggest difference lies in several “rogue” states that are placed differently by the two clustering algorithms. They are: Vermont, Michigan, Maine, Hawaii, New Jersey, West Virginia, and Oklahoma.

Animal Clustering

animals <- cluster::animals

# Replace the column names with readable attribute descriptions.
colnames(animals) <- c("warm-blooded", 
                       "can fly",
                       "vertebrate",
                       "endangered",
                       "live in groups",
                       "have hair")
# Row dendrogram (animals): Manhattan distance, ward.D linkage, k = 4 colors.
# Distance and linkage names are spelled out rather than abbreviated.
dend_r <- animals %>%
    dist(method = "manhattan") %>%
    hclust(method = "ward.D") %>%
    as.dendrogram %>%
    ladderize %>%
    color_branches(k = 4)

# Column dendrogram (attributes, via the transposed table): Manhattan distance,
# complete linkage, k = 3 colors.
dend_c <- t(animals) %>%
    dist(method = "manhattan") %>%
    hclust(method = "complete") %>%
    as.dendrogram %>%
    ladderize %>%
    color_branches(k = 3)


# Earlier palette candidates, kept for reference:
# some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
# some_col_func <- colorspace::diverge_hcl
# some_col_func <- colorspace::sequential_hcl
# Diverging palette of n colours used for the heatmap cells.
some_col_func <- function(n) {
  colorspace::diverge_hcl(n, h = c(246, 40), c = 96, l = c(65, 90))
}



# par(mar = c(3,3,3,3))
# library(gplots)
# Heatmap of the attribute table shifted by -1, with the row and column
# dendrograms built above.
gplots::heatmap.2(
  x = as.matrix(animals - 1),
  main = "Attributes of Animals",
  srtCol = 35,
  Rowv = dend_r,
  Colv = dend_c,
  trace = "row",
  hline = NA,
  tracecol = "darkgrey",
  margins = c(6, 3),
  key.xlab = "no / yes",
  denscol = "grey",
  density.info = "density",
  col = some_col_func
)

hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty", 
        "median", "centroid", "ward.D2")
# One dendrogram of the animals table (Manhattan distance) per linkage method.
# Built with lapply() so the list is not grown inside a loop and no temporary
# dendrogram or loop index is left in the workspace.
animals_dendlist <- do.call(
   dendlist,
   lapply(hclust_methods, function(linkage) {
      animals %>%
         dist(method = "manhattan") %>%
         hclust(method = linkage) %>%
         as.dendrogram
   })
)
names(animals_dendlist) <- hclust_methods
# animals_dendlist
cophenetic_cors <- cor.dendlist(animals_dendlist)
corrplot::corrplot(cophenetic_cors, "pie", "lower")

# Drop the "median" dendrogram by name (previously the hard-coded position 6)
# before comparing the remaining trees with the FM index at k = 4.
remove_median <- dendlist(animals_dendlist, which = which(hclust_methods != "median"))
FM_cors <- cor.dendlist(remove_median, method = "FM_index", k = 4)
corrplot::corrplot(FM_cors, "pie", "lower")

  • We removed the “median” method since a cut into k=4 clusters was not possible for it. In general, we may conclude that the results seem sensitive to the algorithm used, and the different methods do not seem to agree with one another (with regard to k=4), so further analyses may be needed in order to decide which algorithm and interpretation are most appropriate for these data.