1 import packages and functions

suppressMessages(library(data.table))
suppressMessages(library(ggplot2))
suppressMessages(library(ggforce))
suppressMessages(library(dplyr))
suppressMessages(library(amap))

suppressMessages(library(clusterProfiler))
suppressMessages(library(org.Mm.eg.db))
suppressMessages(library(enrichplot))
suppressMessages(library(xlsx))
suppressMessages(library(pheatmap))

# load functions
set.seed(42)
fl.sources <- list.files("../../scripts/utils/", full.names = T)
tmp <- sapply(fl.sources,source)

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:enrichplot':
## 
##     color_palette

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# set plot parameters, which is suitable for CNS publication
plot.format = theme(
    plot.background=element_blank(),
    panel.grid=element_blank(),
    panel.background=element_blank(),
    panel.border=element_rect(color="black", linewidth=0.5, fill=NA),
    axis.line=element_blank(),
    axis.ticks=element_line(color="black",linewidth=0.5),
    axis.text=element_text(color="black", size=7),
    axis.title=element_text(color="black", size=7),
    plot.title=element_text(color="black", size=7),
    legend.background=element_blank(),
    legend.key=element_blank(),
    legend.text=element_text(color="black", size=7),
    legend.title=element_text(color="black", size=7))

2 import data

We load the PageRank scores matrix along with the meta file containing the cell types.

Data_normed <- read.csv("pagerank.csv", row.names = 1)
group_sorted <- read.table('group_file.txt',header = T, row.names = 1)
group_sorted$oldName <- rownames(group_sorted)
rownames(group_sorted) <- group_sorted$newName

3 Kmeans clustering of TFs

3.1 pca plot

The following plot shows cumulative variance explained vs number of samples. After manually checking, we decide to use PC=10 to reduce the data based on “elbow” method, which explained ~75% variance.

data.pca <- prcomp(Data_normed)

# plot the cumulative variance explained vs # of components
s = (data.pca$sdev)^2/sum((data.pca$sdev)^2)
df2 <- data.frame(x=c(1:length(data.pca$sdev)),y=cumsum(s))
p2 <- ggplot(df2)+aes(x=x,y=y)+geom_point()+geom_line()+
  labs(x="Principal Component",y="Cumulative Proportion of Variance Explained")+
  plot.format
print(p2)

## save to file
pdf("pca.pdf", width=3, height=3)
print(p2)
dev.off()

## quartz_off_screen 
##                 2

3.2 hyperparams selection for Kmeans clustering

We need to determine the best distance metric and number of K for Kmeans clustering. To evaluate the clustering quality, we computed the average silhouette width which combines both cohesion (how close data points in a cluster are to each other) and separation (how distinct a cluster is from other clusters). A higher silhouette width value indicates better clustering quality.

We used five common distance metrics:

Euclidean distance
Manhattan distance
Kendall correlation
Pearson correlation

Number of k is selected from the range [3, 20]. The lower limit is considering the number of disease states while the upper limit is the balance of searching space and searching speed.

PCNo <- 10
data_reduced <- as.data.frame(prcomp(Data_normed, rank. = PCNo)$x)
df2 <- findOptimal(data_reduced, max_k = 20)

# save to file
write.csv(df2, 'kmeans_param.csv')

p2 <- ggplot(df2)+aes(x=clusters,y=y,color=method,group=method)+
    geom_line()+geom_point()+xlim(3,20)+
    labs(x='number of groups K',y='average silhouette width',color='distance metrics')+plot.format
print(p2)

## Warning: Removed 10 rows containing missing values (`geom_line()`).

## Warning: Removed 10 rows containing missing values (`geom_point()`).

# save to file
pdf("kmeans_param.pdf",width=3.5,height = 2)
print(p2)

## Warning: Removed 10 rows containing missing values (`geom_line()`).

## Warning: Removed 10 rows containing missing values (`geom_point()`).

dev.off()

## quartz_off_screen 
##                 2

3.3 Kmeans clustering

set.seed(42)
clusterNo <- 7
Cluster <- amap::Kmeans(data_reduced, centers = clusterNo, nstart = 25,iter.max = 50, method = "pearson")

# extract tfs from same cluster
df2 <- as.data.frame(Cluster$cluster)
names(df2) <- "cluster"

# save to file
write.csv(df2, paste0('cluster_k',clusterNo,'.csv'))

4 generate plots

We now import the pre-defined graph showing the differentiation path.

wavedf <- data.frame(x = c(1,2,2,3,3,4,4,5,5), 
                     y = c(3,5,1,3,1,3,0,4,2),
                     samplename = c("Naive","TE","TexProg","MP","TexInt","TRM","TexTerm","TEM","TCM"),
                     labelposx = c(1,2,2,3,3,4,4,5,5),
                     labelposy = c(3,5,1,3,1.4,3,0,4,2)-0.4)

tmp <- lapply(c(1:clusterNo), function(x) outputWave(x, wavedf, df2, Data_normed, group_sorted))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(tmp[[2]])

5 pathway analysis

You can skip this part if you already have enriched pathways. Running this part may take ~10 min.

# L <- list.files(path="./", pattern = "c[0-9]+.txt")
# tmp <- lapply(L, gsea)

6 selected pathways visualization

df <- read.xlsx("SuppFig4 wave GSEA pathways.xlsx", sheetIndex = 1) 
df2 <- df %>% dplyr::select(wave.cluster, Description, p.adjust) %>%
      tidyr::pivot_wider(names_from = wave.cluster, values_from = p.adjust) %>% as.data.frame() %>% replace(is.na(.), 1) %>%
      tibble::column_to_rownames("Description") 
knitr::kable(df[1:6,1:6], caption = "pathways result") |> kableExtra::kable_styling(latex_options = 'scale_down')

pathways result
wave.cluster	related.cell.state	ID	Description	GeneRatio	BgRatio
c1	TE TEM	GO:0007389	pattern specification process	71/198	480/23355
c1	TE TEM	GO:0045165	cell fate commitment	54/198	287/23355
c1	TE TEM	GO:0098727	maintenance of cell number	16/198	176/23355
c1	TE TEM	GO:0007219	Notch signaling pathway	15/198	180/23355
c1	TE TEM	GO:0023019	signal transduction involved in regulation of gene expression	6/198	23/23355
c1	TE TEM	mmu05202	Transcriptional misregulation in cancer	12/58	224/9010

knitr::kable(df2[1:6,1:6], caption = "data for visualization") |> kableExtra::kable_styling(latex_options = 'scale_down')

data for visualization
	c1	c3	c4	c5	c6	c7
pattern specification process	0e+00	1.0000000	1.0000000	1e+00	1.0e+00	1.0000000
cell fate commitment	0e+00	0.0003808	1.0000000	1e+00	6.2e-06	0.0003793
maintenance of cell number	0e+00	1.0000000	1.0000000	1e-07	1.0e+00	1.0000000
Notch signaling pathway	0e+00	1.0000000	1.0000000	1e+00	1.0e+00	1.0000000
signal transduction involved in regulation of gene expression	4e-07	1.0000000	0.0021083	1e+00	1.0e+00	1.0000000
Transcriptional misregulation in cancer	5e-07	1.0000000	1.0000000	1e+00	1.0e+00	1.0000000

cutoff <- 0.05
dt2 <- apply(df2,1:2, function(x) min(x, cutoff)) %>% as.data.frame()

## remove the rows with no variation
dt3 <- dt2[rowSums(dt2)!=cutoff*ncol(dt2),]
print(dim(dt3))

## [1] 49  7

## heatmap
p1 <- pheatmap(dt3, fontsize = 7, show_rownames = T,
                 angle_col = 45, show_colnames = T,
                 cellwidth = 8, cellheight = 8,
                 cluster_rows = T, cluster_cols = T,
                 clustering_distance_rows = "correlation",
                 clustering_distance_cols = "correlation",
                 clustering_method = "average",
                 cutree_rows = 7, 
                 treeheight_row = 10, treeheight_col = 10,
                 border_color = NA,
                 color = colorRampPalette(c("red","blue"))(20))
print(p1)

## save to file
pdf(paste0("hp_transcriptional_waves_selected_pathway_summary_",cutoff,"_max_hclust_cluster.pdf"))
print(p1)
dev.off()

## quartz_off_screen 
##                 2

7 session info

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] factoextra_1.0.7      ggpubr_0.6.0          RColorBrewer_1.1-3   
##  [4] pheatmap_1.0.12       xlsx_0.6.5            enrichplot_1.20.0    
##  [7] org.Mm.eg.db_3.17.0   AnnotationDbi_1.62.2  IRanges_2.34.1       
## [10] S4Vectors_0.38.1      Biobase_2.60.0        BiocGenerics_0.46.0  
## [13] clusterProfiler_4.9.1 amap_0.8-19           dplyr_1.1.2          
## [16] ggforce_0.4.1         ggplot2_3.4.2         data.table_1.14.8    
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.14         jsonlite_1.8.7          magrittr_2.0.3         
##   [4] farver_2.1.1            rmarkdown_2.23          fs_1.6.2               
##   [7] zlibbioc_1.46.0         vctrs_0.6.3             memoise_2.0.1          
##  [10] RCurl_1.98-1.12         ggtree_3.8.0            webshot_0.5.5          
##  [13] rstatix_0.7.2           htmltools_0.5.5         broom_1.0.5            
##  [16] gridGraphics_0.5-1      sass_0.4.6              bslib_0.5.0            
##  [19] plyr_1.8.8              cachem_1.0.8            igraph_1.5.0           
##  [22] lifecycle_1.0.3         pkgconfig_2.0.3         Matrix_1.6-5           
##  [25] R6_2.5.1                fastmap_1.1.1           gson_0.1.0             
##  [28] GenomeInfoDbData_1.2.10 digest_0.6.31           aplot_0.1.10           
##  [31] colorspace_2.1-0        patchwork_1.1.2         RSQLite_2.3.1          
##  [34] labeling_0.4.2          fansi_1.0.4             abind_1.4-5            
##  [37] httr_1.4.6              polyclip_1.10-4         compiler_4.3.1         
##  [40] bit64_4.0.5             withr_2.5.0             downloader_0.4         
##  [43] backports_1.4.1         BiocParallel_1.34.2     carData_3.0-5          
##  [46] viridis_0.6.3           DBI_1.1.3               highr_0.10             
##  [49] ggsignif_0.6.4          MASS_7.3-60             HDO.db_0.99.1          
##  [52] tools_4.3.1             ape_5.7-1               scatterpie_0.2.1       
##  [55] glue_1.6.2              nlme_3.1-162            GOSemSim_2.26.0        
##  [58] shadowtext_0.1.2        cluster_2.1.4           reshape2_1.4.4         
##  [61] fgsea_1.26.0            generics_0.1.3          gtable_0.3.3           
##  [64] tidyr_1.3.0             xml2_1.3.4              tidygraph_1.2.3        
##  [67] car_3.1-2               utf8_1.2.3              XVector_0.40.0         
##  [70] ggrepel_0.9.3           pillar_1.9.0            stringr_1.5.0          
##  [73] yulab.utils_0.0.6       rJava_1.0-6             splines_4.3.1          
##  [76] tweenr_2.0.2            treeio_1.24.1           lattice_0.21-8         
##  [79] bit_4.0.5               tidyselect_1.2.0        GO.db_3.17.0           
##  [82] Biostrings_2.68.1       knitr_1.43              gridExtra_2.3          
##  [85] svglite_2.1.1           xfun_0.39               graphlayouts_1.0.0     
##  [88] stringi_1.7.12          lazyeval_0.2.2          ggfun_0.1.1            
##  [91] yaml_2.3.7              kableExtra_1.3.4        evaluate_0.21          
##  [94] codetools_0.2-19        xlsxjars_0.6.1          ggraph_2.1.0           
##  [97] tibble_3.2.1            qvalue_2.32.0           ggplotify_0.1.1        
## [100] cli_3.6.1               systemfonts_1.0.4       munsell_0.5.0          
## [103] jquerylib_0.1.4         Rcpp_1.0.10             GenomeInfoDb_1.36.1    
## [106] png_0.1-8               parallel_4.3.1          blob_1.2.4             
## [109] DOSE_3.26.1             bitops_1.0-7            viridisLite_0.4.2      
## [112] tidytree_0.4.2          scales_1.2.1            purrr_1.0.1            
## [115] crayon_1.5.2            rlang_1.1.1             rvest_1.0.3            
## [118] cowplot_1.1.1           fastmatch_1.1-3         KEGGREST_1.40.0

TF transcriptional waves and pathway analysis

Cong

2024-06-28