Contents

suppressPackageStartupMessages({
    library(dplyr)
    library(factoextra)
    library(ComplexHeatmap)
})

1 Load Datasets

bs_dir <- "~/data2/Genomic_Super_Signature/refinebio/bootstrap/data"
pos <- readRDS(file.path(bs_dir, "bootstrap_PCs_rowNorm_Pos.rds"))
neg <- readRDS(file.path(bs_dir, "bootstrap_PCs_rowNorm_Neg.rds"))
data <- lapply(c(pos, neg), function(x) x$rotation) %>% Reduce(cbind,.) %>% t
               
topPCs = 20
numOfDataset = 20
PCsToInclude = 5
ind <- c()
for (i in 1:numOfDataset) {new_ind = c(1:PCsToInclude)+topPCs*(i-1); ind = c(ind, new_ind)}

dat <- data[ind,]
source('~/data2/Genomic_Super_Signature/refinebio/methods/4_Hierarchical_Clustering_func.R')

2 Euclidean Correlation

res.dist <- factoextra::get_dist(dat, method = "euclidean")
res.hclust <- stats::hclust(res.dist, method = "ward.D2")
for (i in 81:90) {
    res.hcut <- factoextra::hcut(res.dist, k = i, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "euclidean")
    eval <- evaluateCluster(res.hcut, PCsToInclude, controlType = "Neg")
    print(eval)
}
## 

k = 84

res.hcut <- factoextra::hcut(res.dist, k = k, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "euclidean")
col = col_vector[1:res.hcut$nbclust]
factoextra::fviz_dend(res.hclust, k = k, cex = 0.5, 
                      rect = TRUE, palette = col,
                      rect_fill = TRUE, rect_border = col) 
## Warning in if (color == "cluster") color <- "default": the condition has length
## > 1 and only the first element will be used

evaluateCluster(res.hcut, PCsToInclude, controlType = "Pos")

3 Manhattan Correlation

res.dist <- factoextra::get_dist(dat, method = "manhattan")
res.hclust <- stats::hclust(res.dist, method = "ward.D2")
for (i in 81:90) {
    res.hcut <- factoextra::hcut(res.dist, k = i, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "manhattan")
    eval <- evaluateCluster(res.hcut, PCsToInclude, controlType = "Neg")
    print(eval)
}

k = 87

res.hcut <- factoextra::hcut(res.dist, k = k, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "manhattan")
col = col_vector[1:res.hcut$nbclust]
factoextra::fviz_dend(res.hclust, k = k, cex = 0.5, 
                      rect = TRUE, palette = col,
                      rect_fill = TRUE, rect_border = col) 
## Warning in if (color == "cluster") color <- "default": the condition has length
## > 1 and only the first element will be used

evaluateCluster(res.hcut, PCsToInclude, controlType = "Pos")

4 Pearson Correlation

res.dist <- factoextra::get_dist(dat, method = "pearson")
res.hclust <- stats::hclust(res.dist, method = "ward.D2")
for (i in 71:80) {
    res.hcut <- factoextra::hcut(res.dist, k = i, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "pearson")
    eval <- evaluateCluster(res.hcut, PCsToInclude, controlType = "Neg")
    print(eval)
}

k = 77

res.hcut <- factoextra::hcut(res.dist, k = k, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "pearson")
col = col_vector[1:res.hcut$nbclust]
factoextra::fviz_dend(res.hclust, k = k, cex = 0.5, 
                      rect = TRUE, palette = col,
                      rect_fill = TRUE, rect_border = col) 
## Warning in if (color == "cluster") color <- "default": the condition has length
## > 1 and only the first element will be used

evaluateCluster(res.hcut, PCsToInclude, controlType = "Pos")

5 Spearman Correlation

res.dist <- factoextra::get_dist(dat, method = "spearman")
res.hclust <- stats::hclust(res.dist, method = "ward.D2")
for (i in 35:45) {
    res.hcut <- factoextra::hcut(res.dist, k = i, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "spearman")
    eval <- evaluateCluster(res.hcut, PCsToInclude, controlType = "Neg")
    print(eval)
}

k = 42

res.hcut <- factoextra::hcut(res.dist, k = k, hc_funct = "hclust", hc_method = "ward.D2", hc_metric = "spearman")
col = col_vector[1:res.hcut$nbclust]
factoextra::fviz_dend(res.hclust, k = k, cex = 0.5, 
                      rect = TRUE, palette = col,
                      rect_fill = TRUE, rect_border = col) 
## Warning in if (color == "cluster") color <- "default": the condition has length
## > 1 and only the first element will be used

evaluateCluster(res.hcut, PCsToInclude, controlType = "Pos")

5.1 different agglomeration methods

For hierarchical clustering based on spearman correlation, all 8 agglomeration methods separate negative controls’ PC1s with 42 clusters.

res.dist <- factoextra::get_dist(dat, method = "spearman")
ag_methods <- c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid")

for (ag in ag_methods) {
  res.hclust <- stats::hclust(res.dist, method = ag)
  for (i in 41:43) {
    res.hcut <- factoextra::hcut(res.dist, k = i, hc_funct = "hclust", hc_method = ag, hc_metric = "spearman")
    eval <- evaluateCluster(res.hcut, PCsToInclude, controlType = "Neg",
                            agglomeration.method = ag)
    print(eval)
  }
}