suppressMessages(library(data.table))
suppressMessages(library(ggplot2))
suppressMessages(library(ggforce))
suppressMessages(library(dplyr))
suppressMessages(library(amap))
suppressMessages(library(clusterProfiler))
suppressMessages(library(org.Mm.eg.db))
suppressMessages(library(enrichplot))
suppressMessages(library(xlsx))
suppressMessages(library(pheatmap))
# load functions
set.seed(42)
fl.sources <- list.files("../../scripts/utils/", full.names = T)
tmp <- sapply(fl.sources,source)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:enrichplot':
##
## color_palette
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# set plot parameters, which is suitable for CNS publication
plot.format = theme(
plot.background=element_blank(),
panel.grid=element_blank(),
panel.background=element_blank(),
panel.border=element_rect(color="black", linewidth=0.5, fill=NA),
axis.line=element_blank(),
axis.ticks=element_line(color="black",linewidth=0.5),
axis.text=element_text(color="black", size=7),
axis.title=element_text(color="black", size=7),
plot.title=element_text(color="black", size=7),
legend.background=element_blank(),
legend.key=element_blank(),
legend.text=element_text(color="black", size=7),
legend.title=element_text(color="black", size=7))
We load the PageRank scores matrix along with the meta file containing the cell types.
Data_normed <- read.csv("pagerank.csv", row.names = 1)
group_sorted <- read.table('group_file.txt',header = T, row.names = 1)
group_sorted$oldName <- rownames(group_sorted)
rownames(group_sorted) <- group_sorted$newName
The following plot shows cumulative variance explained vs number of samples. After manually checking, we decide to use PC=10 to reduce the data based on “elbow” method, which explained ~75% variance.
data.pca <- prcomp(Data_normed)
# plot the cumulative variance explained vs # of components
s = (data.pca$sdev)^2/sum((data.pca$sdev)^2)
df2 <- data.frame(x=c(1:length(data.pca$sdev)),y=cumsum(s))
p2 <- ggplot(df2)+aes(x=x,y=y)+geom_point()+geom_line()+
labs(x="Principal Component",y="Cumulative Proportion of Variance Explained")+
plot.format
print(p2)
## save to file
pdf("pca.pdf", width=3, height=3)
print(p2)
dev.off()
## quartz_off_screen
## 2
We need to determine the best distance metric and number of K for Kmeans clustering. To evaluate the clustering quality, we computed the average silhouette width which combines both cohesion (how close data points in a cluster are to each other) and separation (how distinct a cluster is from other clusters). A higher silhouette width value indicates better clustering quality.
We used five common distance metrics:
Number of k is selected from the range [3, 20]. The lower limit is considering the number of disease states while the upper limit is the balance of searching space and searching speed.
PCNo <- 10
data_reduced <- as.data.frame(prcomp(Data_normed, rank. = PCNo)$x)
df2 <- findOptimal(data_reduced, max_k = 20)
# save to file
write.csv(df2, 'kmeans_param.csv')
p2 <- ggplot(df2)+aes(x=clusters,y=y,color=method,group=method)+
geom_line()+geom_point()+xlim(3,20)+
labs(x='number of groups K',y='average silhouette width',color='distance metrics')+plot.format
print(p2)
## Warning: Removed 10 rows containing missing values (`geom_line()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).
# save to file
pdf("kmeans_param.pdf",width=3.5,height = 2)
print(p2)
## Warning: Removed 10 rows containing missing values (`geom_line()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).
dev.off()
## quartz_off_screen
## 2
set.seed(42)
clusterNo <- 7
Cluster <- amap::Kmeans(data_reduced, centers = clusterNo, nstart = 25,iter.max = 50, method = "pearson")
# extract tfs from same cluster
df2 <- as.data.frame(Cluster$cluster)
names(df2) <- "cluster"
# save to file
write.csv(df2, paste0('cluster_k',clusterNo,'.csv'))
We now import the pre-defined graph showing the differentiation path.
wavedf <- data.frame(x = c(1,2,2,3,3,4,4,5,5),
y = c(3,5,1,3,1,3,0,4,2),
samplename = c("Naive","TE","TexProg","MP","TexInt","TRM","TexTerm","TEM","TCM"),
labelposx = c(1,2,2,3,3,4,4,5,5),
labelposy = c(3,5,1,3,1.4,3,0,4,2)-0.4)
tmp <- lapply(c(1:clusterNo), function(x) outputWave(x, wavedf, df2, Data_normed, group_sorted))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(tmp[[2]])
You can skip this part if you already have enriched pathways. Running this part may take ~10 min.
# L <- list.files(path="./", pattern = "c[0-9]+.txt")
# tmp <- lapply(L, gsea)
df <- read.xlsx("SuppFig4 wave GSEA pathways.xlsx", sheetIndex = 1)
df2 <- df %>% dplyr::select(wave.cluster, Description, p.adjust) %>%
tidyr::pivot_wider(names_from = wave.cluster, values_from = p.adjust) %>% as.data.frame() %>% replace(is.na(.), 1) %>%
tibble::column_to_rownames("Description")
knitr::kable(df[1:6,1:6], caption = "pathways result") |> kableExtra::kable_styling(latex_options = 'scale_down')
| wave.cluster | related.cell.state | ID | Description | GeneRatio | BgRatio |
|---|---|---|---|---|---|
| c1 | TE TEM | GO:0007389 | pattern specification process | 71/198 | 480/23355 |
| c1 | TE TEM | GO:0045165 | cell fate commitment | 54/198 | 287/23355 |
| c1 | TE TEM | GO:0098727 | maintenance of cell number | 16/198 | 176/23355 |
| c1 | TE TEM | GO:0007219 | Notch signaling pathway | 15/198 | 180/23355 |
| c1 | TE TEM | GO:0023019 | signal transduction involved in regulation of gene expression | 6/198 | 23/23355 |
| c1 | TE TEM | mmu05202 | Transcriptional misregulation in cancer | 12/58 | 224/9010 |
knitr::kable(df2[1:6,1:6], caption = "data for visualization") |> kableExtra::kable_styling(latex_options = 'scale_down')
| c1 | c3 | c4 | c5 | c6 | c7 | |
|---|---|---|---|---|---|---|
| pattern specification process | 0e+00 | 1.0000000 | 1.0000000 | 1e+00 | 1.0e+00 | 1.0000000 |
| cell fate commitment | 0e+00 | 0.0003808 | 1.0000000 | 1e+00 | 6.2e-06 | 0.0003793 |
| maintenance of cell number | 0e+00 | 1.0000000 | 1.0000000 | 1e-07 | 1.0e+00 | 1.0000000 |
| Notch signaling pathway | 0e+00 | 1.0000000 | 1.0000000 | 1e+00 | 1.0e+00 | 1.0000000 |
| signal transduction involved in regulation of gene expression | 4e-07 | 1.0000000 | 0.0021083 | 1e+00 | 1.0e+00 | 1.0000000 |
| Transcriptional misregulation in cancer | 5e-07 | 1.0000000 | 1.0000000 | 1e+00 | 1.0e+00 | 1.0000000 |
cutoff <- 0.05
dt2 <- apply(df2,1:2, function(x) min(x, cutoff)) %>% as.data.frame()
## remove the rows with no variation
dt3 <- dt2[rowSums(dt2)!=cutoff*ncol(dt2),]
print(dim(dt3))
## [1] 49 7
## heatmap
p1 <- pheatmap(dt3, fontsize = 7, show_rownames = T,
angle_col = 45, show_colnames = T,
cellwidth = 8, cellheight = 8,
cluster_rows = T, cluster_cols = T,
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
clustering_method = "average",
cutree_rows = 7,
treeheight_row = 10, treeheight_col = 10,
border_color = NA,
color = colorRampPalette(c("red","blue"))(20))
print(p1)
## save to file
pdf(paste0("hp_transcriptional_waves_selected_pathway_summary_",cutoff,"_max_hclust_cluster.pdf"))
print(p1)
dev.off()
## quartz_off_screen
## 2
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] factoextra_1.0.7 ggpubr_0.6.0 RColorBrewer_1.1-3
## [4] pheatmap_1.0.12 xlsx_0.6.5 enrichplot_1.20.0
## [7] org.Mm.eg.db_3.17.0 AnnotationDbi_1.62.2 IRanges_2.34.1
## [10] S4Vectors_0.38.1 Biobase_2.60.0 BiocGenerics_0.46.0
## [13] clusterProfiler_4.9.1 amap_0.8-19 dplyr_1.1.2
## [16] ggforce_0.4.1 ggplot2_3.4.2 data.table_1.14.8
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.14 jsonlite_1.8.7 magrittr_2.0.3
## [4] farver_2.1.1 rmarkdown_2.23 fs_1.6.2
## [7] zlibbioc_1.46.0 vctrs_0.6.3 memoise_2.0.1
## [10] RCurl_1.98-1.12 ggtree_3.8.0 webshot_0.5.5
## [13] rstatix_0.7.2 htmltools_0.5.5 broom_1.0.5
## [16] gridGraphics_0.5-1 sass_0.4.6 bslib_0.5.0
## [19] plyr_1.8.8 cachem_1.0.8 igraph_1.5.0
## [22] lifecycle_1.0.3 pkgconfig_2.0.3 Matrix_1.6-5
## [25] R6_2.5.1 fastmap_1.1.1 gson_0.1.0
## [28] GenomeInfoDbData_1.2.10 digest_0.6.31 aplot_0.1.10
## [31] colorspace_2.1-0 patchwork_1.1.2 RSQLite_2.3.1
## [34] labeling_0.4.2 fansi_1.0.4 abind_1.4-5
## [37] httr_1.4.6 polyclip_1.10-4 compiler_4.3.1
## [40] bit64_4.0.5 withr_2.5.0 downloader_0.4
## [43] backports_1.4.1 BiocParallel_1.34.2 carData_3.0-5
## [46] viridis_0.6.3 DBI_1.1.3 highr_0.10
## [49] ggsignif_0.6.4 MASS_7.3-60 HDO.db_0.99.1
## [52] tools_4.3.1 ape_5.7-1 scatterpie_0.2.1
## [55] glue_1.6.2 nlme_3.1-162 GOSemSim_2.26.0
## [58] shadowtext_0.1.2 cluster_2.1.4 reshape2_1.4.4
## [61] fgsea_1.26.0 generics_0.1.3 gtable_0.3.3
## [64] tidyr_1.3.0 xml2_1.3.4 tidygraph_1.2.3
## [67] car_3.1-2 utf8_1.2.3 XVector_0.40.0
## [70] ggrepel_0.9.3 pillar_1.9.0 stringr_1.5.0
## [73] yulab.utils_0.0.6 rJava_1.0-6 splines_4.3.1
## [76] tweenr_2.0.2 treeio_1.24.1 lattice_0.21-8
## [79] bit_4.0.5 tidyselect_1.2.0 GO.db_3.17.0
## [82] Biostrings_2.68.1 knitr_1.43 gridExtra_2.3
## [85] svglite_2.1.1 xfun_0.39 graphlayouts_1.0.0
## [88] stringi_1.7.12 lazyeval_0.2.2 ggfun_0.1.1
## [91] yaml_2.3.7 kableExtra_1.3.4 evaluate_0.21
## [94] codetools_0.2-19 xlsxjars_0.6.1 ggraph_2.1.0
## [97] tibble_3.2.1 qvalue_2.32.0 ggplotify_0.1.1
## [100] cli_3.6.1 systemfonts_1.0.4 munsell_0.5.0
## [103] jquerylib_0.1.4 Rcpp_1.0.10 GenomeInfoDb_1.36.1
## [106] png_0.1-8 parallel_4.3.1 blob_1.2.4
## [109] DOSE_3.26.1 bitops_1.0-7 viridisLite_0.4.2
## [112] tidytree_0.4.2 scales_1.2.1 purrr_1.0.1
## [115] crayon_1.5.2 rlang_1.1.1 rvest_1.0.3
## [118] cowplot_1.1.1 fastmatch_1.1-3 KEGGREST_1.40.0