For a better visualization and sensitivity analysis, on this project we used several R statistical packages:
Dedendextend-package {dendextend} is a function for extending dendrogram objects. For general tree structure there isDendrogram {stats}, class “dendrogram” provides general functions for handling tree-like structures. It is intended as a replacement for similar functions in hierarchical clustering and classification/regression trees, such that all of these can use the same engine for plotting or cutting trees. Hierarchical cluster analysis hclust {stats} on a set of dissimilarities and methods for analyzing cluster. Furthermore,stats-package {stats}, this package contains functions for statistical calculations and random number generation.
Knitr-package {knitr}takes an input file, extracts the R code in it according to a list of patterns, evaluates the code and writes the output in another file.
Gplot-package {gplots}is a vignette summarises the various formats that grid drawing functions take.
Corrplot-package (corrplot}is a graphical display of a correlation matrix, confidence interval. It also contains some algorithms to do matrix reordering. In addition, corrplot is good at details, including choosing color, text labels, color labels, layout, etc.
Iris for flower species, repub for presidential vote, animals for animal clustering, and microarrray gene for train and test expression, those 4 datasets will be used on this project.
library("DT")
iris <- datasets::iris
datatable(iris, options=list(pageLength = 4, scrollX=T))library("DT")
votes.repub <- cluster::votes.repub
datatable(votes.repub, options=list(pageLength = 4, scrollX=T))library("DT")
animals <- cluster::animals
datatable(animals, options=list(pageLength = 4, scrollX=T))library(dendextend)
library(knitr)
knitr::opts_chunk$set(
cache = TRUE,
dpi = 75,
fig.width = 6, fig.height = 6,
tidy = FALSE)iris <- datasets::iris
iris2 <- iris[,-5]
species_labels <- iris[,5]
library(colorspace) # get nice colors
hcl_palettes(plot = TRUE)(palette = "Dark 3")## [1] "Dark 3"
species_col <- rev(rainbow_hcl(3))[as.numeric(species_labels)]# Plot a SPLOM:
pairs(iris2, col = species_col,
lower.panel = NULL,
cex.labels=1.1, pch=20, cex = 1.2)
# Add a legend
par(xpd = TRUE)
legend(x = 0.01, y = 0.2, cex = 1,
legend = as.character(levels(species_labels)),
fill = unique(species_col))par(xpd = NA)par(las = 1, mar = c(4.5, 3, 3, 2) + 0.1, cex = .8)
MASS::parcoord(iris2, col = species_col, var.label = TRUE, lwd = 2)
# Add Title
title("Parallel coordinates plot of the Iris data")
# Add a legend
par(xpd = TRUE)
legend(x = 1.75, y = -.25, cex = 1,
legend = as.character(levels(species_labels)),
fill = unique(species_col), horiz = TRUE)par(xpd = NA)d_iris <- dist(iris2) # method="man" # is a bit better
hc_iris <- hclust(d_iris, method = "complete")
iris_species <- rev(levels(iris[,5]))
library(dendextend)
dend <- as.dendrogram(hc_iris)
# order it the closest we can to the order of the observations:
dend <- rotate(dend, 1:150)
# Color the branches based on the clusters:
dend <- color_branches(dend, k=3) #, groupLabels=iris_species)
# Manually match the labels, as much as possible, to the real classification of the flowers:
labels_colors(dend) <-
rainbow_hcl(3)[sort_levels_values(
as.numeric(iris[,5])[order.dendrogram(dend)]
)]
# We shall add the flower type to the labels:
labels(dend) <- paste(as.character(iris[,5])[order.dendrogram(dend)],
"(",labels(dend),")",
sep = "")
# We hang the dendrogram a bit:
dend <- hang.dendrogram(dend,hang_height=0.1)
# reduce the size of the labels:
# dend <- assign_values_to_leaves_nodePar(dend, 0.5, "lab.cex")
dend <- set(dend, "labels_cex", 0.5)
# And plot:
par(mar = c(3,3,3,7))
plot(dend,
main = "Clustered Iris data set
(the labels give the true flower species)",
horiz = TRUE, nodePar = list(cex = .007))
legend("topleft", legend = iris_species, fill = rainbow_hcl(3))par(mar = rep(0,4))
circlize_dendrogram(dend)some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))library(gplots)
gplots::heatmap.2(as.matrix(iris2),
main = "Heatmap for the Iris data set",
srtCol = 20,
dendrogram = "row",
Rowv = dend,
Colv = "NA", # this to make sure the columns are not ordered
trace="none",
margins =c(5,0.1),
key.xlab = "Cm",
denscol = "grey",
density.info = "density",
RowSideColors = rev(labels_colors(dend)), # to add nice colored strips
col = some_col_func
)hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty",
"median", "centroid", "ward.D2")
iris_dendlist <- dendlist()
for(i in seq_along(hclust_methods)) {
hc_iris <- hclust(d_iris, method = hclust_methods[i])
iris_dendlist <- dendlist(iris_dendlist, as.dendrogram(hc_iris))
}
names(iris_dendlist) <- hclust_methods
iris_dendlist## $ward.D
## 'dendrogram' with 2 branches and 150 members total, at height 199.6205
##
## $single
## 'dendrogram' with 2 branches and 150 members total, at height 1.640122
##
## $complete
## 'dendrogram' with 2 branches and 150 members total, at height 7.085196
##
## $average
## 'dendrogram' with 2 branches and 150 members total, at height 4.062683
##
## $mcquitty
## 'dendrogram' with 2 branches and 150 members total, at height 4.497283
##
## $median
## 'dendrogram' with 2 branches and 150 members total, at height 2.82744
##
## $centroid
## 'dendrogram' with 2 branches and 150 members total, at height 2.994307
##
## $ward.D2
## 'dendrogram' with 2 branches and 150 members total, at height 32.44761
##
## attr(,"class")
## [1] "dendlist"
iris_dendlist_cor <- cor.dendlist(iris_dendlist)
iris_dendlist_cor## ward.D single complete average mcquitty median
## ward.D 1.0000000 0.9836838 0.5774013 0.9841333 0.9641103 0.9451815
## single 0.9836838 1.0000000 0.5665529 0.9681156 0.9329029 0.9444723
## complete 0.5774013 0.5665529 1.0000000 0.6195121 0.6107473 0.6889092
## average 0.9841333 0.9681156 0.6195121 1.0000000 0.9828015 0.9449422
## mcquitty 0.9641103 0.9329029 0.6107473 0.9828015 1.0000000 0.9203374
## median 0.9451815 0.9444723 0.6889092 0.9449422 0.9203374 1.0000000
## centroid 0.9809088 0.9903934 0.5870062 0.9801444 0.9499123 0.9403569
## ward.D2 0.9911648 0.9682507 0.6096286 0.9895131 0.9829977 0.9445832
## centroid ward.D2
## ward.D 0.9809088 0.9911648
## single 0.9903934 0.9682507
## complete 0.5870062 0.6096286
## average 0.9801444 0.9895131
## mcquitty 0.9499123 0.9829977
## median 0.9403569 0.9445832
## centroid 1.0000000 0.9737886
## ward.D2 0.9737886 1.0000000
corrplot::corrplot(iris_dendlist_cor, "pie", "lower")iris_dendlist_cor_spearman <- cor.dendlist(iris_dendlist, method_coef = "spearman")
corrplot::corrplot(iris_dendlist_cor_spearman, "pie", "lower")iris_dendlist %>% dendlist(which = c(1,8)) %>% ladderize %>%
set("branches_k_color", k=3) %>%
untangle(method = "step1side", k_seq = 3:20) %>%
set("clear_branches") %>% #otherwise the single lines are not black, since they retain the previous color from the branches_k_color.
tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE)iris_dendlist %>% dendlist(which = c(1,4)) %>% ladderize %>%
set("branches_k_color", k=2) %>%
untangle(method = "step1side", k_seq = 3:20) %>%
tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE)iris_dendlist %>% dendlist(which = c(1,4)) %>% ladderize %>%
untangle(method = "step1side", k_seq = 3:20) %>%
set("rank_branches") %>%
tanglegram(common_subtrees_color_branches = TRUE)length(unique(common_subtrees_clusters(iris_dendlist[[1]], iris_dendlist[[4]]))[-1])## [1] 39
# -1 at the end is because we are ignoring the "0" subtree, which indicates leaves that are singletons.iris_dendlist %>% dendlist(which = c(3,4)) %>% ladderize %>%
untangle(method = "step1side", k_seq = 2:6) %>%
set("branches_k_color", k=2) %>%
tanglegram(faster = TRUE) # (common_subtrees_color_branches = TRUE)par(mfrow = c(4,2))
for(i in 1:8) {
iris_dendlist[[i]] %>% set("branches_k_color", k=2) %>% plot(axes = FALSE, horiz = TRUE)
title(names(iris_dendlist)[i])
}iris_dendlist_cor2 <- cor.dendlist(iris_dendlist, method = "common")
iris_dendlist_cor2## ward.D single complete average mcquitty median
## ward.D 1.0000000 0.7324415 0.8595318 0.8461538 0.8361204 0.7458194
## single 0.7324415 1.0000000 0.7324415 0.7491639 0.7458194 0.7591973
## complete 0.8595318 0.7324415 1.0000000 0.8060201 0.7993311 0.7491639
## average 0.8461538 0.7491639 0.8060201 1.0000000 0.8494983 0.7892977
## mcquitty 0.8361204 0.7458194 0.7993311 0.8494983 1.0000000 0.7859532
## median 0.7458194 0.7591973 0.7491639 0.7892977 0.7859532 1.0000000
## centroid 0.7324415 0.7625418 0.7290970 0.7725753 0.7759197 0.8528428
## ward.D2 0.8795987 0.7324415 0.8294314 0.8294314 0.8294314 0.7558528
## centroid ward.D2
## ward.D 0.7324415 0.8795987
## single 0.7625418 0.7324415
## complete 0.7290970 0.8294314
## average 0.7725753 0.8294314
## mcquitty 0.7759197 0.8294314
## median 0.8528428 0.7558528
## centroid 1.0000000 0.7357860
## ward.D2 0.7357860 1.0000000
corrplot::corrplot(iris_dendlist_cor2, "pie", "lower")get_ordered_3_clusters <- function(dend) {
cutree(dend, k = 3)[order.dendrogram(dend)]
}
dend_3_clusters <- lapply(iris_dendlist, get_ordered_3_clusters)
compare_clusters_to_iris <- function(clus) {FM_index(clus, rep(1:3, each = 50), assume_sorted_vectors = TRUE)}
clusters_performance <- sapply(dend_3_clusters, compare_clusters_to_iris)
dotchart(sort(clusters_performance), xlim = c(0.7,1),
xlab = "Fowlkes-Mallows Index (from 0 to 1)",
main = "Perormance of clustering algorithms \n in detecting the 3 species",
pch = 19)train <- dendextend::khan$train
test <- dendextend::khan$test
d_train <- train %>% dist %>% hclust %>% as.dendrogram
d_test <- test %>% dist %>% hclust %>% as.dendrogram
d_train_test <- dendlist(train = d_train, test = d_test)
d_train_test %>% cor.dendlist## train test
## train 1.0000000 0.5708019
## test 0.5708019 1.0000000
d_train_test %>% cor.dendlist(method_coef = "spearman")## train test
## train 1.0000000 0.4971936
## test 0.4971936 1.0000000
Bk_plot(d_train, d_test, k = 2:30, xlim = c(2,30))pre_tang_d_train_test <- d_train_test %>% ladderize %>% # untangle %>%
set("branches_k_color", k = 7)
train_branches_colors <- get_leaves_branches_col(pre_tang_d_train_test$train)
pre_tang_d_train_test %>% tanglegram(fast = TRUE, color_lines = train_branches_colors)# dput(d_train_test_common)
d_train_test_common <- structure(list(train = structure(list(structure(list(structure(171L, label = "491565", members = 1L, height = 0, leaf = TRUE), structure(178L, label = "505491", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 7.1369942952198), structure(list(structure(list(structure(8L, label = "283315", members = 1L, height = 0, leaf = TRUE), structure(9L, label = "897177", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.55936539399907), structure(list(structure(list(structure(106L, label = "345553", members = 1L, height = 0, leaf = TRUE), structure(112L, label = "307660", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 5.17910461856101), structure(list(structure(list(structure(268L, label = "504791", members = 1L, height = 0, leaf = TRUE), structure(306L, label = "782503", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 4.27052507661529), structure(list(structure(list(structure(246L, label = "81518", members = 1L, height = 0, leaf = TRUE), structure(290L, label = "280837", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.37572388944875), structure(list(structure(list(structure(266L, label = "866694", members = 1L, height = 0, leaf = TRUE), structure(277L, label = "811956", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.31301518861595), structure(list(structure(273L, label = "842918", members = 1L, height = 0, leaf = TRUE), structure(274L, label = "626555", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.71864544948399)), members = 4, midpoint = 1.5, height = 6.35097701381449)), members = 6, midpoint = 2, height = 8.7097033164167)), members = 8, midpoint = 2.25, height = 9.23807936424017)), members = 10, midpoint = 2.375, height = 11.6573350998416)), members = 12, midpoint = 2.4375, height = 17.5620766260713)), members = 14, midpoint = 2.46875, height = 30.2363452779928, class = "dendrogram"), test = structure(list(structure(list(structure(list(structure(171L, label = "491565", members = 1L, height = 0, leaf = TRUE), structure(178L, label = "505491", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.96666017450449), structure(list(structure(list(structure(list(structure(268L, label = "504791", members = 1L, height = 0, leaf = TRUE), structure(306L, label = "782503", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 2.31497882927685), structure(list(structure(list(structure(266L, label = "866694", members = 1L, height = 0, leaf = TRUE), structure(277L, label = "811956", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.75475236429532), structure(list(structure(273L, label = "842918", members = 1L, height = 0, leaf = TRUE), structure(274L, label = "626555", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.34617375921535)), members = 4, midpoint = 1.5, height = 2.76465021476497)), members = 6, midpoint = 2, height = 4.52927251774499), structure(list(structure(list(structure(246L, label = "81518", members = 1L, height = 0, leaf = TRUE), structure(290L, label = "280837", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 0.714433271901582), structure(list(structure(8L, label = "283315", members = 1L, height = 0, leaf = TRUE), structure(9L, label = "897177", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 1.71895552589356)), members = 4, midpoint = 1.5, height = 6.44143803354499)), members = 10, midpoint = 4.75, height = 7.736516720075)), members = 12, midpoint = 3.625, height = 11.0066972375913), structure(list(structure(106L, label = "345553", members = 1L, height = 0, leaf = TRUE), structure(112L, label = "307660", members = 1L, height = 0, leaf = TRUE)), members = 2L, midpoint = 0.5, height = 3.6486307417989)), members = 14, midpoint = 8.0625, height = 18.2331742971431, class = "dendrogram")), class = "dendlist", .Names = c("train",
"test"))# This was calculated before
# d_train_test_common <- d_train_test %>% prune_common_subtrees.dendlist
# d_train_test_common
d_train_test_common %>% untangle %>% tanglegram(common_subtrees_color_branches = TRUE)d_train_test %>% nleaves## train test
## 306 306
d_train_test_common %>% nleaves## train test
## 14 14
votes.repub <- cluster::votes.repubyears <- as.numeric(gsub("X", "", colnames(votes.repub)))
par(las = 2, mar = c(4.5, 3, 3, 2) + 0.1, cex = .8)
# MASS::parcoord(votes.repub, var.label = FALSE, lwd = 1)
matplot(1L:ncol(votes.repub), t(votes.repub), type = "l", col = 1, lty = 1,
axes = F, xlab = "", ylab = "")
axis(1, at = seq_along(years), labels = years)
axis(2)
# Add Title
title("Votes for Republican Candidate\n in Presidential Elections \n (each line is a country - over the years)")arcsin_transformation <- function(x) asin(x/100)
dend_NA <- votes.repub %>% is.na %>%
dist %>% hclust %>% as.dendrogram %>% ladderize
dend <- votes.repub %>% arcsin_transformation %>%
dist %>% hclust(method = "com") %>% as.dendrogram %>%
rotate(labels(dend_NA)) %>%
color_branches(k=3)
# some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
some_col_func <- colorspace::diverge_hcl
# par(mar = c(3,3,3,3))
# library(gplots)
gplots::heatmap.2(as.matrix(votes.repub),
main = "Votes for\n Republican Presidential Candidate\n (clustered using complete)",
srtCol = 60,
dendrogram = "row",
Rowv = dend,
Colv = "NA", # this to make sure the columns are not ordered
trace="none",
margins =c(3,6),
key.xlab = "% Votes for Republican\n Presidential Candidate",
labCol = years,
denscol = "grey",
density.info = "density",
col = some_col_func
) # RowSideColors = rev(labels_colors(dend)), # to add nice colored strips hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty",
"median", "centroid", "ward.D2")
votes.repub_dendlist <- dendlist()
for(i in seq_along(hclust_methods)) {
tmp_dend <- votes.repub %>% arcsin_transformation %>% dist %>% hclust(method = hclust_methods[i]) %>% as.dendrogram
votes.repub_dendlist <- dendlist(votes.repub_dendlist, tmp_dend)
}
names(votes.repub_dendlist) <- hclust_methods
# votes.repub_dendlistcorrplot::corrplot(cor.dendlist(votes.repub_dendlist), "pie", "lower")arcsin_transformation <- function(x) asin(x/100)
dend_NA <- votes.repub %>% is.na %>%
dist %>% hclust %>% as.dendrogram %>% ladderize
dend <- votes.repub %>% arcsin_transformation %>%
dist %>% hclust(method = "ave") %>% as.dendrogram %>%
rotate(labels(dend_NA)) %>%
color_branches(k=3)
# some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
some_col_func <- colorspace::diverge_hcl
# par(mar = c(3,3,3,3))
# library(gplots)
gplots::heatmap.2(as.matrix(votes.repub),
main = "Votes for\n Republican Presidential Candidate\n (clustered using average)",
srtCol = 60,
dendrogram = "row",
Rowv = dend,
Colv = "NA", # this to make sure the columns are not ordered
trace="none",
margins =c(3,6),
key.xlab = "% Votes for Republican\n Presidential Candidate",
labCol = years,
denscol = "grey",
density.info = "density",
col = some_col_func
) # RowSideColors = rev(labels_colors(dend)), # to add nice colored strips ord1 <- c("North Carolina", "Virginia", "Tennessee", "Kentucky", "Maryland",
"Delaware", "Oklahoma", "Missouri", "New Mexico", "Oregon", "Washington",
"California", "West Virginia", "Hawaii", "Nevada", "Arizona",
"Montana", "Idaho", "Wyoming", "Utah", "Colorado", "Alaska",
"Illinois", "New York", "Indiana", "Ohio", "Connecticut", "New Hampshire",
"New Jersey", "Pennsylvania", "Iowa", "South Dakota", "North Dakota",
"Wisconsin", "Minnesota", "Nebraska", "Kansas", "Maine", "Michigan",
"Massachusetts", "Rhode Island", "Vermont", "Alabama", "Georgia",
"Louisiana", "Arkansas", "Florida", "Texas", "South Carolina",
"Mississippi")
ord2 <- c("North Carolina", "Virginia", "Tennessee", "Oklahoma", "Kentucky",
"Maryland", "Delaware", "Missouri", "New Mexico", "West Virginia",
"Oregon", "Washington", "California", "Nevada", "Arizona", "Montana",
"Colorado", "Alaska", "Idaho", "Wyoming", "Utah", "Hawaii", "Maine",
"Illinois", "New York", "New Jersey", "Indiana", "Ohio", "Connecticut",
"New Hampshire", "Pennsylvania", "Michigan", "Iowa", "South Dakota",
"North Dakota", "Wisconsin", "Minnesota", "Massachusetts", "Rhode Island",
"Nebraska", "Kansas", "Vermont", "Alabama", "Georgia", "Louisiana",
"Arkansas", "Florida", "Texas", "South Carolina", "Mississippi"
)
# dput(lapply(dends, labels)[[2]])dend_com <- votes.repub %>% arcsin_transformation %>%
dist %>% hclust(method = "com") %>% as.dendrogram %>%
rotate(labels(dend_NA)) %>%
color_branches(k=3) # %>% ladderize
dend_ave <- votes.repub %>% arcsin_transformation %>%
dist %>% hclust(method = "ave") %>% as.dendrogram %>%
rotate(labels(dend_NA)) %>%
color_branches(k=3) # %>% ladderize
# The orders were predefined after using untangle("step2side")
# They are omitted here to save running time.
dend_com <- rotate(dend_com, ord1)
dend_ave <- rotate(dend_ave, ord2)
dends <- dendlist(complete = dend_com, average = dend_ave) # %>% untangle("step2side")
dends %>% tanglegram(margin_inner = 7)animals <- cluster::animals
colnames(animals) <- c("warm-blooded",
"can fly",
"vertebrate",
"endangered",
"live in groups",
"have hair")dend_r <- animals %>% dist(method = "man") %>% hclust(method = "ward.D") %>% as.dendrogram %>% ladderize %>%
color_branches(k=4)
dend_c <- t(animals) %>% dist(method = "man") %>% hclust(method = "com") %>% as.dendrogram %>% ladderize%>%
color_branches(k=3)
# some_col_func <- function(n) rev(colorspace::heat_hcl(n, c = c(80, 30), l = c(30, 90), power = c(1/5, 1.5)))
# some_col_func <- colorspace::diverge_hcl
# some_col_func <- colorspace::sequential_hcl
some_col_func <- function(n) (colorspace::diverge_hcl(n, h = c(246, 40), c = 96, l = c(65, 90)))
# par(mar = c(3,3,3,3))
# library(gplots)
gplots::heatmap.2(as.matrix(animals-1),
main = "Attributes of Animals",
srtCol = 35,
Rowv = dend_r,
Colv = dend_c,
trace="row", hline = NA, tracecol = "darkgrey",
margins =c(6,3),
key.xlab = "no / yes",
denscol = "grey",
density.info = "density",
col = some_col_func
)hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty",
"median", "centroid", "ward.D2")
animals_dendlist <- dendlist()
for(i in seq_along(hclust_methods)) {
tmp_dend <- animals %>% dist(method = "man") %>%
hclust(method = hclust_methods[i]) %>% as.dendrogram
animals_dendlist <- dendlist(animals_dendlist, tmp_dend)
}
names(animals_dendlist) <- hclust_methods
# votes.repub_dendlistcophenetic_cors <- cor.dendlist(animals_dendlist)
corrplot::corrplot(cophenetic_cors, "pie", "lower")remove_median <- dendlist(animals_dendlist, which = c(1:8)[-6] )
FM_cors <- cor.dendlist(remove_median, method = "FM_index", k = 4)
corrplot::corrplot(FM_cors, "pie", "lower")