library(SingleCellExperiment)
library(here) # reproducible paths
library(scater) # plot reduced dims
library(tibble) # for `rownames_to_column`
library(scDblFinder) # detect doublets
library(pals) # for palettes with large n #kelly()22, #polychrome()#36, unname(alphabet())
library(dplyr) # df filtering
project <- "fire-mice"
if (!file.exists(here("processed", project, "sce_anno_01.RDS"))) {
sce <- readRDS(here("processed", project, "sce_clusters_01.RDS"))
}else{
sce <- readRDS(here("processed", project, "sce_anno_01.RDS"))
}
cols25 <- unname(cols25())
# remove the black and white from the pallete, and the similars to cols25
# assessed with pal.bands and pal.cube
kelly_col <- unname(kelly()[-c(1,2,5,6)])
# remove the colours that are similar to the the
cols25 <-
cols <- c(kelly_col, cols25)
As in [publication] we will perform a cluster QC to remove clusters of poorer quality. This will be assessed by the number of UMI counts, the mitochondrial percentage, doublet analysis, ribosomal genes and the number of mice that contribute to each cluster. Moreover we will keep in mind our experimental groups in order to ensure biological effects are not being lost.
plotTSNE(sce, colour_by = "cluster_names", point_size=0.5,
text_by = "cluster_names", text_size = 3) +
scale_color_manual(values = cols)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
plotTSNE(sce, colour_by = "total") +
ggtitle("Total number of umi counts per cell")
plotTSNE(sce, colour_by = "detected") +
ggtitle("Detected number of genes per cell")
Lower values of umi counts and detected genes can be associated to lower quality cells. Cells can also have lower expressed genes due to their biological state or celltype.
pct_cells<- 50
min_umi <- 3000
Select clusters with 50 % cells having less than 3000umi counts.
umi_df <- as.data.frame(cbind(sce$total, as.character(sce$cluster_names)))
colnames(umi_df) <- c("umi", "cluster_names")
# caculate the percentatges of cells that are "low umi"
umi_df <-
umi_df %>%
mutate(low_umi = as.numeric(umi) < min_umi) %>%
group_by(cluster_names) %>%
summarise(n_low_umi = sum(low_umi), n_cells = n()) %>%
mutate(pct_cell_low_umi = (n_low_umi/n_cells)*100)
umi_df %>% arrange(desc(pct_cell_low_umi))
# Filter the clusters that have a high proportion of "low umi"
low_umi_clusters <-
umi_df %>%
filter(pct_cell_low_umi > pct_cells) %>%
.$cluster_names
The clusters flagged are astrocyte-restricted_precursors, astrocyte_dead_1, astrocyte_dead_2, dead_2, microglia_dead, OPC_dead
High mithocondrial genes is associated with stressed, lower quality, cells.
plotTSNE(sce, colour_by = "subsets_mt_percent", text_by = "cluster_names") +
ggtitle("Percentatge mithocondrial genes")
plotTSNE(sce, colour_by = "subsets_mt_percent") +
ggtitle("Percentatge mithocondrial genes")
pct_cells<- 50
pct_mt <- 10
Select clusters with 50 % cells having more than 10% mithocondrial genes.
mt_df <- as.data.frame(cbind(sce$subsets_mt_percent, as.character(sce$cluster_names)))
colnames(mt_df) <- c("subsets_mt_percent", "cluster_names")
mt_df <-
mt_df %>%
mutate(high_pct = as.numeric(subsets_mt_percent) > 10) %>%
group_by(cluster_names) %>%
summarise(n_high_mt = sum(high_pct), n_cells = n()) %>%
mutate(pct_cell_high_mt = (n_high_mt/n_cells)*100) %>%
arrange(desc(pct_cell_high_mt))
mt_df
high_mt_clusters <-
mt_df %>%
filter(pct_cell_high_mt > pct_cells) %>%
.$cluster_names
The clusters flagged are oligo_dead.
To visualise the ribosomal content in the whole dataset we plotted the cells according to their ribosomal content. High ribosomal content in one cluster, that expresses a mix profile, could indicate that the cells are clustering based on ribosomal gene content.
# the cluster
# save ribosomal genes
is_ribo <- grepl("^Rp[sl]", rownames(sce))
# recompute the quality metrics, subseting the ribosomal genes
ribo_qc <- perCellQCMetrics(sce, subsets = list(ribo = is_ribo))
# we are only interested in ribo
sce$subsets_ribo_percent <- ribo_qc$subsets_ribo_percent
sce$subsets_ribo_sum <- ribo_qc$subsets_ribo_sum
sce$subsets_ribo_detected <- ribo_qc$subsets_ribo_detected
plotTSNE(sce, colour_by = "subsets_ribo_percent") +
ggtitle("Percentatge ribosomal genes")
How many mice contribute to each cluster?
# count how many cells from each gnt group there are per cluster
sum_per_gnt_cluster <- table(sce$genotype, sce$cluster_names )
# for each cluster count how many cells are from each mice, dividing by KO and WT mice
sum_per_mice_cluster <- table(sce$mouse, sce$genotype, sce$cluster_names )
# For each cluster sum of mice that do have cells on that cluster
colSums(sum_per_mice_cluster > 0)
##
## OPC_1 oligo_1 astrocyte_1 astrocyte_dead_1 oligo_2 microglia_1 oligo_3
## HET 2 2 2 2 2 2 2
## KO 2 2 2 2 2 0 2
## WT 2 2 2 2 2 2 2
##
## oligo_dead oligo_OPCs microglia_dead OPC_2 endothelial immature_neurons
## HET 2 2 2 2 2 2
## KO 2 2 1 2 2 2
## WT 2 2 2 2 2 2
##
## OPC_or_oligo OPC_dead mature_neurons_1 oligo_4 astrocyte_2
## HET 2 2 2 2 2
## KO 2 2 2 2 2
## WT 2 2 2 2 2
##
## astrocyte_dead_2 mature_neurons_2 immune pericytes microglia_2 BAMs
## HET 2 2 2 2 2 2
## KO 2 2 2 2 2 2
## WT 2 2 2 2 2 2
##
## astrocyte_OPCs microglia_3 astrocyte-restricted_precursors microglia_OPCs
## HET 2 2 2 2
## KO 2 0 2 2
## WT 2 2 2 2
##
## oligo_5 OPCs_3 ChP_epithelial oligo_astrocytes dead_2
## HET 2 2 2 2 2
## KO 2 2 2 2 2
## WT 2 2 2 2 2
# create a summary
summary <- as.data.frame(rbind(colSums(sum_per_mice_cluster > 0), sum_per_gnt_cluster, colSums(sum_per_gnt_cluster)))
row.names(summary) <- c("HET mice","KO mice", "WT mice", "HET cells","KO cells", "WT cells", "total cells")
summary
Except from the obvious microglia clusters, where the numbers are very low or even absent in the fire mice nothing stands out.
Detection of clusters formed by doublets/multiplets (i.e. multiple
cells captured within the same droplet or reaction volume). The function
test each cluster against the null hypothesis that it does
consist of doublets. The tool consider every possible triplet of
clusters consisting of a query cluster and two putative “source”
clusters. Under the null hypothesis that the query consists of doublets
from the two sources, it computes the number of genes
(num.de) that are differentially expressed in the same
direction in the query cluster compared to both of the source clusters.
Such genes would be unique markers for the query cluster and provide
evidence against the null hypothesis.
Clusters with the few unique genes (num.de) are more
likely to be composed of doublets.
Potential doublets should also have larger library size than the
cells from the source clusters, resulting in lib.size
ratios bellow unity.
if(!file.exists(here("outs",project, "doublet_clusters_res0.6.csv"))){
res_dbl <- findDoubletClusters(sce, sce$cluster_names)
res_dbl <- res_dbl %>%
as.data.frame() %>%
rownames_to_column("cluster")
write.csv(res_dbl, here("outs", project, "doublet_clusters_res0.6.csv"))
}else{
res_dbl <- read.csv(here("outs", project, "doublet_clusters_res0.6.csv"), row.names = 1)
}
res_dbl
Analyse the results:
microglia_OPCs OPCs_3 oligo_5 microglia_3 OPC_or_oligo microglia_2
and astrocyte_OPCs are the clusterst with less num.de.
microglia_OPCs, microglia_3 and microglia_2: These populations could be microglia pruning bits of astrocytes or oligodendrocytes, we do not keep these populations in the downstream analysis.
OPCs_3: Clear doublet as it expresses markers from neurons and OPCs.
oligo_5: Potential doublet between two other clusters that are formed
of doublets too, with very low num.de; we do not keep
it.
OPC_or_oligo: This could seem to be a transitioning state, however they do express multiple markers usually found in late Oligodendrocytes.
astrocyte_OPCs: These cells could also be transitioning state, and
the lib.size is not bellow the unity, so we first did a
preliminary analysis keeping them, but reran doubletfinder after
deleting the other doublet cluster and finaly decided to be stricter and
deleting them.
oligo_4: This cluster is kept, as there is supporting evidence for Astro-Oligos populations in the mouse brain, it will be renamed as astro-oligo in the next annotation. https://doi.org/10.1101/2021.04.11.439393
We also tested keeping them and rerunning doublet finder,
source1 source2 num.de median.de best p.value lib.size1 lib.size2 prop
OPC-Astrocyte OPC_1 Astrocyte_2 149 3603.0 Gnao1 7.180342e-83 0.6135593 0.4465331 0.005588435
Oligo-Astrocyte Oligo_1 Astrocyte_1 254 3264.0 mt-Atp6 3.848991e-35 0.8738298 0.4604424 0.013932279
OEG ChP_epithelia Astrocyte_2 623 2344.0 Npy 1.935603e-29 1.5961975 0.8223610 0.001940429
[...]
Code in commit 0b2c63cd from the github repo, in annotation_02.rmd
doublet<-c("microglia_OPCs", "OPCs_3", "oligo_5", "microglia_3", "OPC_or_oligo", "microglia_2", "astrocyte_OPCs")
We want to have a closer look at the clusters that do have a difference between the knockout and the wild type before deleting the clusters.
# divide the two objects
sce_ko <- sce[,sce$genotype == "KO"]
sce_ctr <- sce[,sce$genotype == "WT"]
# plot them side by side
gridExtra::grid.arrange(
plotTSNE(sce_ko, colour_by = "cluster_names", point_size=0.5,
point_alpha = 0.3, text_by = "cluster_names", text_size = 3) +
scale_color_manual(values = cols) +
ggtitle("fire mice"),
plotTSNE(sce_ctr, colour_by = "cluster_names", point_size=0.5,
point_alpha = 0.3, text_by = "cluster_names", text_size = 3) +
scale_color_manual(values = cols) +
ggtitle("control"),
ncol = 2
)
## Warning: Removed 2 rows containing missing values (geom_text).
In order to visualise the proportions from KO and WT for each cluster, we do not take in consideration the microglia clusters, as these are only present in the control, and we normalise per number of cells per cluster.
# delete the microglia
sce_no_mc <- sce[,!(sce$cluster_names %in% c(5, 9, 25))]
# count how many cells from each gnt group there are per cluster
sum_per_gnt_cluster_no_mc <- table(sce_no_mc$genotype, sce_no_mc$cluster_names )
# normalise per cluster, looking how the KO and WT are distributed
# across the clusters, to give to both groups the same weight
prop_per_cluster_no_mc <- prop.table(sum_per_gnt_cluster_no_mc, margin = 1)
# calculate the proportions for each cluster
prop_per_gnt_no_mc <- round(prop.table(prop_per_cluster_no_mc , margin = 2 )*100, 2)
# Display
prop_per_gnt_no_mc<- as.data.frame(prop_per_gnt_no_mc)
colnames(prop_per_gnt_no_mc) <- c("Genotype", "cluster", "Proportion")
prop_per_gnt_no_mc %>%
filter(Genotype == "KO") %>%
arrange(desc(Proportion))
visualise in a plot
levels(prop_per_gnt_no_mc$Genotype) <- c("KO", "HET", "WT")
ggplot(data = prop_per_gnt_no_mc, aes(x = cluster, y = Proportion, fill = Genotype)) +
geom_bar(position = "fill", stat = "identity") + theme_classic()
ko_pct <- 60
difference_KO_WT <-
prop_per_gnt_no_mc %>%
arrange(desc(Proportion)) %>%
# the microglia clusters will be the top differences, but this is not
# something we are specially interested
filter(!(cluster %in% c(21, 24, 1, 16))) %>%
filter(Genotype == "KO") %>%
# select the "interesting" clusters
filter(Proportion > ko_pct ) %>%
.$cluster
gt_ko_pct <- unique(sce$cluster_names[sce$cluster_names %in% c( high_mt_clusters, low_umi_clusters, doublet) &
(sce$cluster_names %in% difference_KO_WT)])
There are 0 clusters flagged as poor quality and also more abundant in the KO
the clusters highlighted as:
low umi: majority of cells having less than 3000 umi counts
high mt : majority of cells having more than 10 % mitochondrial genes
doublet: clusters composed by cells that express markers from different cell types.
There was no clusters flagged from the other categories
sce$filter_out <- sce$cluster_names %in% c( high_mt_clusters, low_umi_clusters, doublet)
filter_out <- sum(sce$filter_out)
plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
point_alpha = 0.3, text_by = "cluster_names", text_size = 3)
plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
point_alpha = 0.3, text_by = "celltype", text_size = 3)
plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
point_alpha = 0.3)
if (!file.exists(here("processed", project, "sce_clusterQC.RDS"))){
sce <- sce[, sce$filter_out == FALSE]
saveRDS(sce, here("processed", project, "sce_clusterQC.RDS"))
}
This would filter 15632 cells filtered out, leaving an object with 51194cells.
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dplyr_1.0.7 pals_1.7
## [3] scDblFinder_1.6.0 tibble_3.1.5
## [5] scater_1.20.1 ggplot2_3.3.5
## [7] scuttle_1.2.1 here_1.0.1
## [9] SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
## [11] Biobase_2.52.0 GenomicRanges_1.44.0
## [13] GenomeInfoDb_1.28.4 IRanges_2.26.0
## [15] S4Vectors_0.30.2 BiocGenerics_0.38.0
## [17] MatrixGenerics_1.4.3 matrixStats_0.61.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 rprojroot_2.0.2
## [3] tools_4.1.1 bslib_0.3.1
## [5] utf8_1.2.2 R6_2.5.1
## [7] irlba_2.3.3 vipor_0.4.5
## [9] DBI_1.1.1 colorspace_2.0-2
## [11] withr_2.4.2 tidyselect_1.1.1
## [13] gridExtra_2.3 compiler_4.1.1
## [15] BiocNeighbors_1.10.0 DelayedArray_0.18.0
## [17] labeling_0.4.2 sass_0.4.0
## [19] scales_1.1.1 stringr_1.4.0
## [21] digest_0.6.28 rmarkdown_2.11
## [23] XVector_0.32.0 dichromat_2.0-0
## [25] pkgconfig_2.0.3 htmltools_0.5.2
## [27] sparseMatrixStats_1.4.2 highr_0.9
## [29] maps_3.4.0 fastmap_1.1.0
## [31] limma_3.48.3 rlang_0.4.12
## [33] DelayedMatrixStats_1.14.3 farver_2.1.0
## [35] jquerylib_0.1.4 generics_0.1.1
## [37] jsonlite_1.7.2 BiocParallel_1.26.2
## [39] RCurl_1.98-1.5 magrittr_2.0.1
## [41] BiocSingular_1.8.1 GenomeInfoDbData_1.2.6
## [43] Matrix_1.3-4 Rcpp_1.0.7
## [45] ggbeeswarm_0.6.0 munsell_0.5.0
## [47] fansi_0.5.0 viridis_0.6.2
## [49] lifecycle_1.0.1 stringi_1.7.5
## [51] yaml_2.2.1 edgeR_3.34.1
## [53] zlibbioc_1.38.0 grid_4.1.1
## [55] dqrng_0.3.0 crayon_1.4.2
## [57] lattice_0.20-45 cowplot_1.1.1
## [59] beachmat_2.8.1 mapproj_1.2.7
## [61] locfit_1.5-9.4 metapod_1.0.0
## [63] knitr_1.36 pillar_1.6.4
## [65] igraph_1.2.7 xgboost_1.4.1.1
## [67] ScaledMatrix_1.0.0 glue_1.4.2
## [69] evaluate_0.14 scran_1.20.1
## [71] data.table_1.14.2 vctrs_0.3.8
## [73] gtable_0.3.0 purrr_0.3.4
## [75] assertthat_0.2.1 xfun_0.27
## [77] rsvd_1.0.5 viridisLite_0.4.0
## [79] beeswarm_0.4.0 cluster_2.1.2
## [81] bluster_1.2.1 statmod_1.4.36
## [83] ellipsis_0.3.2