suppressPackageStartupMessages({
library(dplyr)
library(rsvd)
})
bs_dir = "~/data2/Genomic_Super_Signature/refinebio/bootstrap/data"
pos <- readRDS(file.path(bs_dir, "bootstrap_PCs_rowNorm_Pos.rds"))
neg <- readRDS(file.path(bs_dir, "bootstrap_PCs_rowNorm_Neg.rds"))
data = lapply(c(pos, neg), function(x) x$rotation) %>% Reduce(cbind,.) %>% t
k = PLIER::num.pc(t(data))
## Warning in if ((class(data) != "list") & (class(data) != "rsvd")) {: the
## condition has length > 1 and only the first element will be used
## Computing svd
dim(data)
## [1] 400 11553
k
## [1] 47
KMeans_armaKMeans_arma from different seed_modestats::kmeans results the default algorithm, “Hartigan-Wong”# Clustering with ClusterR functions
kmeans_arma = function(data, seed_mood) {
km_cl = ClusterR::KMeans_arma(data, clusters = k, seed_mode = seed_mood)
pred_cl = ClusterR::predict_KMeans(data = data, CENTROIDS = km_cl, threads = 12)
return(pred_cl)
}
km_base = kmeans(data, centers = k, iter.max = 30)
km_cl = as.vector(km_base$cluster)
seed_mode = c("static_subset", "random_subset", "static_spread", "random_spread")
for (sm in seed_mode) {
km_arma = kmeans_arma(data, sm)
# compare two functions
compare = table(km_arma, km_cl) %>% as.matrix
heatmap(compare,
xlab = paste('KMeans_arma_seed_mode:', sm, '\n'),
col = coul)
}
stats::kmeansKMeans_arma with seed_mode="random_subset", perform ClusterR::predict_KMeansstats::kmeans results with different algorithms.stats::kmeans is “Hartigan-Wong”km_arma = kmeans_arma(data, "random_subset")
km_algos = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")
for (algo in km_algos) {
km_base = kmeans(data, centers = k, iter.max = 70,
algorithm = algo)
km_cl = as.vector(km_base$cluster)
# compare two functions
compare = table(km_arma, km_cl) %>% as.matrix
heatmap(compare,
xlab = paste('base-kmeans-algo:', algo, '\n'),
col = coul)
}
Clustering top 20 PCs from 677 studies (total 13540 PCs) with k = 561
wd = "/home/sehyun/data2/Genomic_Super_Signature/refinebio/cluster/reports_kmeans_centroids_allRseq"
# mbkmeans without centroids
mbkeans_res = readRDS(file.path(wd, "5185.pts-3.res_avgLoading.rds"))
mbkmeans_cl = mbkeans_res$Clusters
# kmeans with centroids from "random_spread"
kmeans_randomSpread_res = readRDS(file.path(wd, "4049.pts-3.res_avgLoading.rds"))
kmeans_cl_v1 = kmeans_randomSpread_res$cluster
# kmeans with centrodis from "random_subset"
kmeans_randomSubset_res = readRDS(file.path(wd, "30926.pts-0.kmeansRes.rds"))
kmeans_cl_v2 = kmeans_randomSubset_res$res$cluster
# ClusterR with centroids from "random_subset"
clusterR_res = readRDS(file.path(wd, "31508.pts-0.res.rds"))
class(clusterR_res$pr) = NULL
clusterR_cl = clusterR_res$pr
The number of 1-element clusters
data.frame(funcName = c("mbkmeans_cl", "kmeans_cl_v1", "kmeans_cl_v2", "clusterR_cl"),
oneElementCl = c(sum(table(mbkmeans_cl) == 1),
sum(table(kmeans_cl_v1) == 1),
sum(table(kmeans_cl_v2) == 1),
sum(table(clusterR_cl) == 1))
)
## funcName oneElementCl
## 1 mbkmeans_cl 499
## 2 kmeans_cl_v1 0
## 3 kmeans_cl_v2 0
## 4 clusterR_cl 20
compare1 = table(kmeans_cl_v1, kmeans_cl_v2) %>% as.matrix
heatmap(compare1, col = coul)
kmeans_cl_v2 (kmeans with centrodis from “random_subset”) and clusterR_cl doesn’t
seem to be compatable.
compare2 = table(kmeans_cl_v2, clusterR_cl) %>% as.matrix
heatmap(compare2, col = coul)