CASCC github page: https://github.com/LingyiC/CASCC
conda create -n CASCC_env
source activate CASCC_env
conda install conda-forge::r-fs # necessary for devtools
conda install conda-forge::r-devtools
R
.libPaths("./anaconda3/envs/CASCC_env/lib/R/library") # set conda R library path
install.packages("rlang",version="1.1.0") # dependency:update rlang package
remotes::install_version("Seurat", "4.3.0") # dependency
devtools::install_github("weiyi-bitw/cafr") # dependency
devtools::install_github("LingyiC/CASCC")
CASCC::checkDependencies()In this tutorial, we will be analyzing an example dataset from a pancreatic ductal adenocarcinoma (PDAC) patient from the study by (Peng et al. 2019). This example includes 8 different cell types. The script of generating the example dataset is provided in the “Generating the example dataset” section.
Here we show running CASCC using main function
run.CASCC. Running CASCC step by step is shown in the
“Running CASCC step by step” section.
rm(list = ls()); gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 515009 27.6 1144262 61.2 654382 35.0
#> Vcells 944302 7.3 8388608 64.0 1771154 13.6
library(CASCC)
#> Warning: package 'CASCC' was built under R version 4.2.2
library(Seurat)
set.seed(1)
## load the example dataset
data("Data_PDAC_peng_2k")
dim(Data_PDAC_peng_2k) # 400 cells
#> [1] 2000 400
## running CASCC
CASCC::checkDependencies()
#> cafr is already installed.
#> Seurat is already installed.
#> SeuratObject is already installed.
res <- CASCC::run.CASCC(Data_PDAC_peng_2k) # inputDataType `data` is normalized and log-transformed.
#> Step 1: preparing the seed list...
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> Step 2: applying the adaptive attractor algorithm...
#> Step 3: selecting features...
#> Step 4: estimating the number of clusters...
#> Step 5: performing clustering...
#> Done.
## clustering results
labels <- res$mainType.output$clusteringResults## plot the results
adata <- res$res.predictK$adata # feature-selected UMAP
labels <- res$mainType.output$clusteringResults # clustering output: k-means with attractor centers
Idents(adata) <- labels
DimPlot(adata)The analysis identifies the presence of eight distinct clusters, each corresponding to a specific attractor, as shown in the table below. We showed the top 10 genes of each attractor, which closely align with recognized cell type markers. From left to right, these are macrophage cells, ductal cell type 1, B cells, endothelial cells, ductal cell type 2, T cells, stellate cells, and fibroblasts.
## results list
attr.raw <- res$fs.res$attr.raw # raw attractor scanning results, can be used if rerun CASCC
# res$fs.res$attrs # attractors removed identical ones
finalAttrs <- res$fs.res$finalAttrs # filtered attractors
## attractors used as the clustering centers
attrs <- res$mainType.output$attrs.center
df <- as.data.frame(do.call(cbind, lapply(attrs, function(x) {data.frame("gene" = names(x))})))
titles <- c()
for (attr.ID in paste0("attr_", unlist(lapply(attrs, function(x){names(head(x)[1])})))) {
titles <- c(titles, c(attr.ID))
}
colnames(df) <- titles
df[1:10, ] # top 10 genes (rows) of each attractor (column)The identified 8 attractors is using as centers for each cluster.
Idents(adata) <- res$mainType.output$clusteringResults
nClust <- length(unique(Idents(adata)))
mycolor <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(n = 8, name = "Dark2"))(nClust)
attractorDimplot(adata, attr.list = res$mainType.output$attrs.center, col = mycolor) # col can set as NULLWe color the cells according to the average expression of the top 5 genes in each attractor.
#> TableGrob (2 x 2) "arrange": 4 grobs
#> z cells name grob
#> 1 1 (1-1,1-1) arrange gtable[layout]
#> 2 2 (1-1,2-2) arrange gtable[layout]
#> 3 3 (2-2,1-1) arrange gtable[layout]
#> 4 4 (2-2,2-2) arrange gtable[layout]
We find differentially expressed genes (DEGs) between the clusters.
CASCC output object is a list in R:
Here is an example of CASCC output object:
rm(list = ls()); gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 3648272 194.9 6010114 321.0 6010114 321.0
#> Vcells 8169904 62.4 16932982 129.2 16932980 129.2
data("Res_PDAC_peng_2k")
str(Res_PDAC_peng_2k, max.level = 2)
#> List of 4
#> $ fs.res :List of 7
#> ..$ features : chr [1:506] "FCER1G" "AIF1" "TYROBP" "IGSF6" ...
#> ..$ adata :Formal class 'Seurat' [package "SeuratObject"] with 13 slots
#> ..$ dfMarkers :'data.frame': 756 obs. of 7 variables:
#> ..$ markers : chr [1:11] "TYROBP" "DEFB1" "CD79A" "SPRY1" ...
#> ..$ attrs.raw :List of 5
#> ..$ attrs :List of 11
#> ..$ finalAttrs:List of 10
#> $ res.predictK :List of 3
#> ..$ clusterK : Named num 8
#> .. ..- attr(*, "names")= chr "Number_clusters"
#> ..$ umap.cell.embeddings: num [1:400, 1:2] -9.09 -8.97 -8.32 -8.41 -8.79 ...
#> .. ..- attr(*, "scaled:center")= num [1:2] 2.04 1.26
#> .. ..- attr(*, "dimnames")=List of 2
#> ..$ adata :Formal class 'Seurat' [package "SeuratObject"] with 13 slots
#> $ res.kmeans :List of 2
#> ..$ obj :List of 9
#> .. ..- attr(*, "class")= chr "kmeans"
#> ..$ clusteringResults: Factor w/ 8 levels "1","2","3","4",..: 3 3 3 3 3 3 3 3 3 3 ...
#> .. ..- attr(*, "names")= chr [1:400] "T15_CGGACTGCATCGGAAG" "T15_TCAGCAATCCACTGGG" "T15_CAACTAGAGACAGACC" "T15_ACCAGTAAGTGTGGCA" ...
#> $ mainType.output:List of 2
#> ..$ clusteringResults: Factor w/ 8 levels "1","2","3","4",..: 3 3 3 3 3 3 3 3 3 3 ...
#> .. ..- attr(*, "names")= chr [1:400] "T15_CGGACTGCATCGGAAG" "T15_TCAGCAATCCACTGGG" "T15_CAACTAGAGACAGACC" "T15_ACCAGTAAGTGTGGCA" ...
#> ..$ attrs.center :List of 8We provide an adaptive version of attractor algorithm to find a converged attractor based on the seed gene provided. If users are interested in the co-expression attractor only, they can use the following code to find the co-expression without running the whole CASCC pipeline.
library("CASCC")
library("cafr")
data("Data_PDAC_peng_2k")
# Find AIF1-seeded attractor. AIF1 gene is a marker of macrophage cells.
attr.res <- findAttractor.adaptive(Data_PDAC_peng_2k, "AIF1")
#> 1st round start...
1st round end.
2nd round start...
2nd round end.
# co-expression attractor
head(names(attr.res$attractor.final))
#> [1] "FCER1G" "AIF1" "TYROBP" "IGSF6" "CD163" "C1QA"In this section, we show how to run CASCC step by step.
rm(list = ls()); gc()
library("CASCC")
data("Data_PDAC_peng_2k")
# Step 0 - Step 3
fs.res <- CASCC.featureSelection(Data_PDAC_peng_2k, inputDataType = "well-normalized", attr.raw = NULL, exponent.max = 10, exponent.min = 2,
generalSeedList = NULL, mc.cores = 1, topDEGs = 10, topAttr = 50,
removeMT.RP.ERCC = TRUE, removeNonProtein = FALSE,
topN.DEG.as.seed = 1,
overlapN = 10)
adata <- fs.res$adata;
SeuratVersionCheck = as.numeric(strsplit(as.character(packageVersion("Seurat")), "\\.")[[1]][1])
if (SeuratVersionCheck == 4) {
data <- adata[["RNA"]]@data
}else if (SeuratVersionCheck == 5) {
data <- adata[["RNA"]]$data
}
data <- as.matrix(data)
# Step 4
res.predictK <- predictClusterK(data, fs.res, method = "ward.D2", index = "silhouette", min.nc_fix = FALSE)
# results list
res <- list()
res$fs.res <- fs.res
res$res.predictK <- res.predictK
# Step 5
res$mainType.output <- kmeansCenterModify(res)
Generate the example dataset for the quick start.
set.seed(1)
rm(list = ls()); gc()
## Raw data from PRJCA001063
load("./adata.T15.RData")
load("./cell.type.RData")
cell.type <- cell.type[which(cell.type$cell.name %in% colnames(adata)), ]
table(cell.type$cluster)
library(dplyr)
sampled_cells <- cell.type %>%
group_by(cluster) %>%
## if a cell type does not have more than 50 cells, we removed this cell type
sample_n(size = if_else(n() >= 50, 50, 0)) %>%
ungroup()
print(sampled_cells)
genes <- Seurat::VariableFeatures(adata)
cells <- sampled_cells$cell.name
SeuratVersionCheck = as.numeric(strsplit(as.character(packageVersion("Seurat")), "\\.")[[1]][1])
if (SeuratVersionCheck == 4) {
data <- as.matrix(adata[["RNA"]]@data[genes, cells])
}else if (SeuratVersionCheck == 5) {
data <- as.matrix(adata[["RNA"]]$data[genes, cells])
}
save(data, file = "./CASCC/data/Data_PDAC_peng_2k.RData")
## ~ 1 min
start_time <- Sys.time()
res <- CASCC::run.CASCC(data)
end_time <- Sys.time()
end_time - start_time
save(res, file = "./CASCC/data/Res_PDAC_peng_2k.RData")Here we show how to run CASCC within Seurat workflow.
## after processed Seurat workflow
# Seurat V4
## Users' SeuratObject
SeuratVersionCheck = as.numeric(strsplit(as.character(packageVersion("Seurat")), "\\.")[[1]][1])
if (SeuratVersionCheck == 4) {
data <- SeuratObject[["RNA"]]@data
}else if (SeuratVersionCheck == 5) {
data <- SeuratObject[["RNA"]]$data
}
res <- CASCC::run.CASCC(data)
Idents(SeuratObject) <- res$mainType.output$clusteringResults
SeuratObjectUsers can submit questions and feedback to https://github.com/LingyiC/CASCC/issues.
We welcome any questions and feedback. We will provide a list of frequently asked questions and the answers.
Session info to replicate the clustering results of Table S1 datasets.
sessionInfo()
# R version 4.0.5 (2021-03-31)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Ubuntu 18.04.5 LTS
#
# Matrix products: default
# BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
# LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
# [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
# [9] LC_ADDRESS=C LC_TELEPHONE=C
# [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] cafr_0.312 dplyr_1.1.2 aricode_1.0.2 stringr_1.5.0
#
# loaded via a namespace (and not attached):
# [1] Seurat_4.3.0 Rtsne_0.16
# [3] colorspace_2.1-0 deldir_1.0-6
# [5] ellipsis_0.3.2 ggridges_0.5.4
# [7] XVector_0.30.0 GenomicRanges_1.42.0
# [9] spatstat.data_3.0-0 leiden_0.4.3
# [11] listenv_0.9.0 ggrepel_0.9.3
# [13] fansi_1.0.4 codetools_0.2-19
# [15] splines_4.0.5 polyclip_1.10-4
# [17] jsonlite_1.8.4 ica_1.0-3
# [19] cluster_2.1.4 png_0.1-8
# [21] uwot_0.1.14 shiny_1.7.4
# [23] sctransform_0.3.5 spatstat.sparse_3.0-0
# [25] compiler_4.0.5 httr_1.4.5
# [27] SeuratObject_4.1.3 Matrix_1.5-3
# [29] fastmap_1.1.1 lazyeval_0.2.2
# [31] limma_3.46.0 cli_3.6.1
# [33] later_1.3.0 htmltools_0.5.5
# [35] tools_4.0.5 igraph_1.4.2
# [37] gtable_0.3.3 glue_1.6.2
# [39] GenomeInfoDbData_1.2.4 RANN_2.6.1
# [41] reshape2_1.4.4 Rcpp_1.0.10
# [43] scattermore_0.8 Biobase_2.50.0
# [45] vctrs_0.6.2 nlme_3.1-162
# [47] spatstat.explore_3.0-6 progressr_0.13.0
# [49] lmtest_0.9-40 spatstat.random_3.1-3
# [51] globals_0.16.2 mime_0.12
# [53] miniUI_0.1.1.1 lifecycle_1.0.3
# [55] irlba_2.3.5.1 goftest_1.2-3
# [57] future_1.32.0 zlibbioc_1.36.0
# [59] MASS_7.3-58.3 zoo_1.8-12
# [61] scales_1.2.1 promises_1.2.0.1
# [63] MatrixGenerics_1.2.1 spatstat.utils_3.0-1
# [65] NbClust_3.0.1 parallel_4.0.5
# [67] SummarizedExperiment_1.20.0 RColorBrewer_1.1-3
# [69] reticulate_1.28 pbapply_1.7-0
# [71] gridExtra_2.3 ggplot2_3.4.2
# [73] stringi_1.7.12 S4Vectors_0.28.1
# [75] BiocGenerics_0.36.1 GenomeInfoDb_1.26.7
# [77] rlang_1.1.0 pkgconfig_2.0.3
# [79] matrixStats_0.63.0 bitops_1.0-7
# [81] lattice_0.20-45 tensor_1.5
# [83] ROCR_1.0-11 purrr_1.0.1
# [85] patchwork_1.1.2 htmlwidgets_1.6.2
# [87] cowplot_1.1.1 tidyselect_1.2.0
# [89] parallelly_1.35.0 RcppAnnoy_0.0.20
# [91] plyr_1.8.8 magrittr_2.0.3
# [93] R6_2.5.1 IRanges_2.24.1
# [95] generics_0.1.3 DelayedArray_0.16.3
# [97] pillar_1.9.0 fitdistrplus_1.1-11
# [99] abind_1.4-5 survival_3.5-3
# [101] RCurl_1.98-1.10 sp_1.6-0
# [103] tibble_3.2.1 future.apply_1.10.0
# [105] CASCC_0.1.0 KernSmooth_2.23-20
# [107] utf8_1.2.3 spatstat.geom_3.0-6
# [109] plotly_4.10.1 grid_4.0.5
# [111] data.table_1.14.8 digest_0.6.31
# [113] xtable_1.8-4 tidyr_1.3.0
# [115] httpuv_1.6.9 stats4_4.0.5
# [117] munsell_0.5.0 viridisLite_0.4.1