suppressMessages(library(mclust))
## Warning: package 'mclust' was built under R version 4.3.3
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(ggalluvial))
suppressMessages(library(tidyr))
set.seed(123)
sample <- 'TexTerm' ### 'TRM' or 'TexTerm'
tag <- '_subset_TFs_v2'
df_old <- read.csv(paste0("clustering_membership_copula_",sample,"_shrinkage_res0.9_s0.15",tag,".csv"))
df_new <- read.csv(paste0("clustering_membership_copula_",sample,"_shrinkage_res1.1_s0.15",tag,".csv"))
We merged two clustering into a single dataframe and removed the standalone clusters.
# Merge the two results by TF name
# We suffix columns with _old and _new
df_compare <- inner_join(df_old, df_new, by = c("name", "group"), suffix = c("_old", "_new"))
# Function to identify clusters of size 1 (Standalone)
get_singleton_clusters <- function(cluster_vec) {
counts <- table(cluster_vec)
names(counts)[counts == 1]
}
# Identify singleton clusters in both methods
singletons_old <- get_singleton_clusters(df_compare$cluster_old)
singletons_new <- get_singleton_clusters(df_compare$cluster_new)
# Filter: Exclude TFs that belong to a singleton cluster in EITHER method
df_clean <- df_compare %>%
filter(!cluster_old %in% singletons_old,
!cluster_new %in% singletons_new)
cat("TFs remaining after removing standalone clusters:", nrow(df_clean), "\n")
## TFs remaining after removing standalone clusters: 167
We manually checked some important genes, 6/8 remained stable clustering.
tmp <- c("Zscan20","Jdp2","Zfp324","Irf8","Hic1","Prdm1","Gfi1","Fli1")
knitr::kable(df_clean[df_clean$name %in% tmp,], caption = 'clustering comparison on key TFs') |> kableExtra::kable_styling(latex_options = 'scale_down')
| name | cluster_old | group | cluster_new | |
|---|---|---|---|---|
| 51 | Fli1 | 5 | Housekeeping | 4 |
| 62 | Gfi1 | 2 | TRM_Tex | 2 |
| 73 | Prdm1 | 2 | TRM_Tex | 2 |
| 75 | Hic1 | 3 | TRM_Tex | 3 |
| 142 | Irf8 | 2 | Tex_specific | 2 |
| 151 | Zscan20 | 2 | Tex_specific | 2 |
| 152 | Jdp2 | 2 | Tex_specific | 5 |
| 157 | Zfp324 | 3 | Tex_specific | 3 |
We compared the global clustering assignment regardless of TF group
# A. Quantitative Metric: Adjusted Rand Index (ARI)
# ARI = 1.0 means perfect match. ARI ~ 0 means random chance.
global_ari <- adjustedRandIndex(df_clean$cluster_old, df_clean$cluster_new)
cat("Global Adjusted Rand Index (ARI):", round(global_ari, 3), "\n")
## Global Adjusted Rand Index (ARI): 0.591
# B. Visualization: Confusion Matrix Heatmap
# This shows: "Did TFs in Old Cluster 1 move to New Cluster 3?"
confusion_mat <- df_clean %>%
count(cluster_old, cluster_new)
p1 <- ggplot(confusion_mat, aes(x = factor(cluster_new), y = factor(cluster_old), fill = n)) +
geom_tile() +
geom_text(aes(label = n), color = "white") +
scale_fill_viridis_c() +
labs(title = paste0("Global Cluster Comparison (ARI = ", round(global_ari, 3), ")"),
x = "New Method Clusters",
y = "Old Method Clusters",
fill = "# TFs") +
theme_minimal()
print(p1)
We compared the clustering within each TF group
# A. Calculate ARI per TF Group
group_metrics <- df_clean %>%
group_by(group) %>%
summarise(
n_tfs = n(),
group_ari = adjustedRandIndex(cluster_old, cluster_new)
)
print(group_metrics)
## # A tibble: 4 × 3
## group n_tfs group_ari
## <chr> <int> <dbl>
## 1 Housekeeping 54 0.468
## 2 TRM_Tex 28 0.531
## 3 Tex_important 52 0.751
## 4 Tex_specific 33 0.615
Tex_important and Tex_specific groups remained highly stable.
# B. Visualization: Bar Chart of Stability per Group
p2 <- ggplot(group_metrics, aes(x = reorder(group, group_ari), y = group_ari, fill = group)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylim(0, 1) +
labs(title = "Cluster Stability by TF Group",
subtitle = "Higher ARI = Clusters remained more consistent within this group",
y = "Adjusted Rand Index (Agreement)",
x = "TF Group") +
theme_classic()
print(p2)
If you see straight ribbons flowing from Left to Right, the clusters are stable. If the ribbons criss-cross violently, the TFs were reassigned.
# C. Visualization: Detailed Flow (Alluvial Plot) per Group
p3 <- ggplot(df_clean,
aes(axis1 = factor(cluster_old), axis2 = factor(cluster_new))) +
geom_alluvium(aes(fill = group), width = 1/12) +
geom_stratum(width = 1/12, fill = "grey80", color = "grey") +
geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("Old Clusters", "New Clusters"), expand = c(.05, .05)) +
scale_fill_brewer(palette = "Set1") +
labs(title = "TF Cluster Flow: Old vs New",
subtitle = "Colored by Biological Group") +
theme_minimal() +
facet_wrap(~group, scales = "free") # Facet by group to see individual alignments
print(p3)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.0 ggalluvial_0.12.5 dplyr_1.1.2 ggplot2_3.4.2
## [5] mclust_6.1.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.3 jsonlite_1.8.7 highr_0.10 compiler_4.3.1
## [5] webshot_0.5.5 tidyselect_1.2.0 stringr_1.5.0 xml2_1.3.4
## [9] jquerylib_0.1.4 systemfonts_1.0.4 scales_1.2.1 yaml_2.3.7
## [13] fastmap_1.1.1 R6_2.5.1 labeling_0.4.2 generics_0.1.3
## [17] knitr_1.43 tibble_3.2.1 kableExtra_1.3.4 munsell_0.5.0
## [21] svglite_2.1.1 RColorBrewer_1.1-3 bslib_0.5.0 pillar_1.9.0
## [25] rlang_1.1.1 utf8_1.2.3 stringi_1.7.12 cachem_1.0.8
## [29] xfun_0.39 sass_0.4.6 viridisLite_0.4.2 cli_3.6.1
## [33] withr_2.5.0 magrittr_2.0.3 digest_0.6.31 rvest_1.0.3
## [37] grid_4.3.1 rstudioapi_0.14 lifecycle_1.0.3 vctrs_0.6.3
## [41] evaluate_0.21 glue_1.6.2 farver_2.1.1 fansi_1.0.4
## [45] colorspace_2.1-0 rmarkdown_2.23 purrr_1.0.1 httr_1.4.6
## [49] tools_4.3.1 pkgconfig_2.0.3 htmltools_0.5.5