Contents

suppressPackageStartupMessages({
  library(dplyr)
  library(rsvd)
})

1 Synthetic datasets

bs_dir = "~/data2/Genomic_Super_Signature/refinebio/bootstrap/data"
pos <- readRDS(file.path(bs_dir, "bootstrap_PCs_rowNorm_Pos.rds"))
neg <- readRDS(file.path(bs_dir, "bootstrap_PCs_rowNorm_Neg.rds"))
data = lapply(c(pos, neg), function(x) x$rotation) %>% Reduce(cbind,.) %>% t
k = PLIER::num.pc(t(data))
## Warning in if ((class(data) != "list") & (class(data) != "rsvd")) {: the
## condition has length > 1 and only the first element will be used
## Computing svd
dim(data)
## [1]   400 11553
k
## [1] 47

1.1 Test seed_mode for KMeans_arma

  • Estimate centroids using KMeans_arma from different seed_mode
  • Compare the above output against stats::kmeans results the default algorithm, “Hartigan-Wong”
# Clustering with ClusterR functions
kmeans_arma = function(data, seed_mood) {
  km_cl = ClusterR::KMeans_arma(data, clusters = k, seed_mode = seed_mood)
  pred_cl = ClusterR::predict_KMeans(data = data, CENTROIDS = km_cl, threads = 12)
  return(pred_cl)
}
km_base = kmeans(data, centers = k, iter.max = 30)
km_cl = as.vector(km_base$cluster)

seed_mode = c("static_subset", "random_subset", "static_spread", "random_spread")

for (sm in seed_mode) {
  km_arma = kmeans_arma(data, sm)
  
  # compare two functions
  compare = table(km_arma, km_cl) %>% as.matrix
  heatmap(compare, 
          xlab = paste('KMeans_arma_seed_mode:', sm, '\n'),
          col = coul)
}

1.2 Test algorithms of stats::kmeans

  • Using centroids from KMeans_arma with seed_mode="random_subset", perform ClusterR::predict_KMeans
  • Compare the above output against stats::kmeans results with different algorithms.
  • Default algorithm for stats::kmeans is “Hartigan-Wong”
km_arma = kmeans_arma(data, "random_subset")

km_algos = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")

for (algo in km_algos) {
  km_base = kmeans(data, centers = k, iter.max = 70,                 
                   algorithm = algo)
  km_cl = as.vector(km_base$cluster)
  
  # compare two functions
  compare = table(km_arma, km_cl) %>% as.matrix
  heatmap(compare, 
          xlab = paste('base-kmeans-algo:', algo, '\n'),
          col = coul)
}

2 677 studies from refine.bio

Clustering top 20 PCs from 677 studies (total 13540 PCs) with k = 561

wd = "/home/sehyun/data2/Genomic_Super_Signature/refinebio/cluster/reports_kmeans_centroids_allRseq"

# mbkmeans without centroids
mbkeans_res = readRDS(file.path(wd, "5185.pts-3.res_avgLoading.rds"))
mbkmeans_cl = mbkeans_res$Clusters

# kmeans with centroids from "random_spread" 
kmeans_randomSpread_res = readRDS(file.path(wd, "4049.pts-3.res_avgLoading.rds"))
kmeans_cl_v1 = kmeans_randomSpread_res$cluster

# kmeans with centrodis from "random_subset"
kmeans_randomSubset_res = readRDS(file.path(wd, "30926.pts-0.kmeansRes.rds"))
kmeans_cl_v2 = kmeans_randomSubset_res$res$cluster

# ClusterR with centroids from "random_subset"
clusterR_res = readRDS(file.path(wd, "31508.pts-0.res.rds"))
class(clusterR_res$pr) = NULL
clusterR_cl = clusterR_res$pr

The number of 1-element clusters

data.frame(funcName = c("mbkmeans_cl", "kmeans_cl_v1", "kmeans_cl_v2", "clusterR_cl"),
           oneElementCl = c(sum(table(mbkmeans_cl) == 1),
                            sum(table(kmeans_cl_v1) == 1),
                            sum(table(kmeans_cl_v2) == 1),
                            sum(table(clusterR_cl) == 1))
           )
##       funcName oneElementCl
## 1  mbkmeans_cl          499
## 2 kmeans_cl_v1            0
## 3 kmeans_cl_v2            0
## 4  clusterR_cl           20

2.0.1 kmeans_cl_v1 vs. kmeans_cl_v2

compare1 = table(kmeans_cl_v1, kmeans_cl_v2) %>% as.matrix
heatmap(compare1, col = coul)

2.0.2 kmeans_cl_v2 vs. clusterR_cl

kmeans_cl_v2 (kmeans with centrodis from “random_subset”) and clusterR_cl doesn’t seem to be compatable.

compare2 = table(kmeans_cl_v2, clusterR_cl) %>% as.matrix
heatmap(compare2, col = coul)