1 import packages and functions

suppressMessages(library(mclust))
## Warning: package 'mclust' was built under R version 4.3.3
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(ggalluvial))
suppressMessages(library(tidyr))

2 data loading

  • clustering 1: main manuscript clustering is based on correlation of correlation, “neighbor of neighbors”
  • clustering 2: more standard clustering based on correlation
set.seed(123)
sample <- 'TexTerm' ### 'TRM' or 'TexTerm'
tag <- '_subset_TFs_v2'
df_old <- read.csv(paste0("clustering_membership_copula_",sample,"_shrinkage_res0.9_s0.15",tag,".csv"))
df_new <- read.csv(paste0("clustering_membership_copula_",sample,"_shrinkage_res1.1_s0.15",tag,".csv"))

3 preprocessing & filtering

We merged two clustering into a single dataframe and removed the standalone clusters.

# Merge the two results by TF name
# We suffix columns with _old and _new
df_compare <- inner_join(df_old, df_new, by = c("name", "group"), suffix = c("_old", "_new"))

# Function to identify clusters of size 1 (Standalone)
get_singleton_clusters <- function(cluster_vec) {
  counts <- table(cluster_vec)
  names(counts)[counts == 1]
}

# Identify singleton clusters in both methods
singletons_old <- get_singleton_clusters(df_compare$cluster_old)
singletons_new <- get_singleton_clusters(df_compare$cluster_new)

# Filter: Exclude TFs that belong to a singleton cluster in EITHER method
df_clean <- df_compare %>%
  filter(!cluster_old %in% singletons_old,
         !cluster_new %in% singletons_new)

cat("TFs remaining after removing standalone clusters:", nrow(df_clean), "\n")
## TFs remaining after removing standalone clusters: 167

We manually checked some important genes, 6/8 remained stable clustering.

tmp <- c("Zscan20","Jdp2","Zfp324","Irf8","Hic1","Prdm1","Gfi1","Fli1")

knitr::kable(df_clean[df_clean$name %in% tmp,], caption = 'clustering comparison on key TFs') |> kableExtra::kable_styling(latex_options = 'scale_down')
clustering comparison on key TFs
name cluster_old group cluster_new
51 Fli1 5 Housekeeping 4
62 Gfi1 2 TRM_Tex 2
73 Prdm1 2 TRM_Tex 2
75 Hic1 3 TRM_Tex 3
142 Irf8 2 Tex_specific 2
151 Zscan20 2 Tex_specific 2
152 Jdp2 2 Tex_specific 5
157 Zfp324 3 Tex_specific 3

4 global comparison (level 1)

We compared the global clustering assignment regardless of TF group

# A. Quantitative Metric: Adjusted Rand Index (ARI)
# ARI = 1.0 means perfect match. ARI ~ 0 means random chance.
global_ari <- adjustedRandIndex(df_clean$cluster_old, df_clean$cluster_new)
cat("Global Adjusted Rand Index (ARI):", round(global_ari, 3), "\n")
## Global Adjusted Rand Index (ARI): 0.591
# B. Visualization: Confusion Matrix Heatmap
# This shows: "Did TFs in Old Cluster 1 move to New Cluster 3?"
confusion_mat <- df_clean %>%
  count(cluster_old, cluster_new)

p1 <- ggplot(confusion_mat, aes(x = factor(cluster_new), y = factor(cluster_old), fill = n)) +
  geom_tile() +
  geom_text(aes(label = n), color = "white") +
  scale_fill_viridis_c() +
  labs(title = paste0("Global Cluster Comparison (ARI = ", round(global_ari, 3), ")"),
       x = "New Method Clusters",
       y = "Old Method Clusters",
       fill = "# TFs") +
  theme_minimal()

print(p1)

5 group-wise comparison (level 2)

We compared the clustering within each TF group

# A. Calculate ARI per TF Group
group_metrics <- df_clean %>%
  group_by(group) %>%
  summarise(
    n_tfs = n(),
    group_ari = adjustedRandIndex(cluster_old, cluster_new)
  )

print(group_metrics)
## # A tibble: 4 × 3
##   group         n_tfs group_ari
##   <chr>         <int>     <dbl>
## 1 Housekeeping     54     0.468
## 2 TRM_Tex          28     0.531
## 3 Tex_important    52     0.751
## 4 Tex_specific     33     0.615

Tex_important and Tex_specific groups remained highly stable.

# B. Visualization: Bar Chart of Stability per Group
p2 <- ggplot(group_metrics, aes(x = reorder(group, group_ari), y = group_ari, fill = group)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylim(0, 1) +
  labs(title = "Cluster Stability by TF Group",
       subtitle = "Higher ARI = Clusters remained more consistent within this group",
       y = "Adjusted Rand Index (Agreement)",
       x = "TF Group") +
  theme_classic()

print(p2)  

If you see straight ribbons flowing from Left to Right, the clusters are stable. If the ribbons criss-cross violently, the TFs were reassigned.

# C. Visualization: Detailed Flow (Alluvial Plot) per Group
p3 <- ggplot(df_clean,
       aes(axis1 = factor(cluster_old), axis2 = factor(cluster_new))) +
  geom_alluvium(aes(fill = group), width = 1/12) +
  geom_stratum(width = 1/12, fill = "grey80", color = "grey") +
  geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("Old Clusters", "New Clusters"), expand = c(.05, .05)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "TF Cluster Flow: Old vs New",
       subtitle = "Colored by Biological Group") +
  theme_minimal() +
  facet_wrap(~group, scales = "free") # Facet by group to see individual alignments

print(p3)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

6 session info

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tidyr_1.3.0       ggalluvial_0.12.5 dplyr_1.1.2       ggplot2_3.4.2    
## [5] mclust_6.1.1     
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.3       jsonlite_1.8.7     highr_0.10         compiler_4.3.1    
##  [5] webshot_0.5.5      tidyselect_1.2.0   stringr_1.5.0      xml2_1.3.4        
##  [9] jquerylib_0.1.4    systemfonts_1.0.4  scales_1.2.1       yaml_2.3.7        
## [13] fastmap_1.1.1      R6_2.5.1           labeling_0.4.2     generics_0.1.3    
## [17] knitr_1.43         tibble_3.2.1       kableExtra_1.3.4   munsell_0.5.0     
## [21] svglite_2.1.1      RColorBrewer_1.1-3 bslib_0.5.0        pillar_1.9.0      
## [25] rlang_1.1.1        utf8_1.2.3         stringi_1.7.12     cachem_1.0.8      
## [29] xfun_0.39          sass_0.4.6         viridisLite_0.4.2  cli_3.6.1         
## [33] withr_2.5.0        magrittr_2.0.3     digest_0.6.31      rvest_1.0.3       
## [37] grid_4.3.1         rstudioapi_0.14    lifecycle_1.0.3    vctrs_0.6.3       
## [41] evaluate_0.21      glue_1.6.2         farver_2.1.1       fansi_1.0.4       
## [45] colorspace_2.1-0   rmarkdown_2.23     purrr_1.0.1        httr_1.4.6        
## [49] tools_4.3.1        pkgconfig_2.0.3    htmltools_0.5.5