Cluster quality control

set-up

library(SingleCellExperiment)
library(here) # reproducible paths
library(scater) # plot reduced dims
library(tibble)  # for `rownames_to_column`
library(scDblFinder) # detect doublets
library(pals) # for palettes with large n #kelly()22, #polychrome()#36, unname(alphabet())
library(dplyr) # df filtering

project <- "fire-mice"

if (!file.exists(here("processed", project, "sce_anno_01.RDS"))) {
  sce <- readRDS(here("processed", project, "sce_clusters_01.RDS"))
}else{
  sce <- readRDS(here("processed", project, "sce_anno_01.RDS"))
}

cols25 <- unname(cols25())
# remove the black and white from the pallete, and the similars to cols25
# assessed with pal.bands and pal.cube
kelly_col <- unname(kelly()[-c(1,2,5,6)])
# remove the colours that are similar to the the 
cols25 <- 
cols <- c(kelly_col, cols25)

Introduction

As in [publication] we will perform a cluster QC to remove clusters of poorer quality. This will be assessed by the number of UMI counts, the mitochondrial percentage, doublet analysis, ribosomal genes and the number of mice that contribute to each cluster. Moreover we will keep in mind our experimental groups in order to ensure biological effects are not being lost.

plotTSNE(sce, colour_by = "cluster_names", point_size=0.5,
          text_by = "cluster_names", text_size = 3) +
  scale_color_manual(values = cols)

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Number of molecules per cluster

plotTSNE(sce, colour_by = "total") + 
  ggtitle("Total number of umi counts per cell")

plotTSNE(sce, colour_by = "detected") +
  ggtitle("Detected number of genes per cell")

Lower values of umi counts and detected genes can be associated to lower quality cells. Cells can also have lower expressed genes due to their biological state or celltype.

pct_cells<- 50
min_umi  <- 3000

Select clusters with 50 % cells having less than 3000umi counts.

umi_df <- as.data.frame(cbind(sce$total, as.character(sce$cluster_names)))
colnames(umi_df) <- c("umi", "cluster_names")

# caculate the percentatges of cells that are "low umi"
umi_df <- 
  umi_df %>% 
  mutate(low_umi = as.numeric(umi) < min_umi) %>% 
  group_by(cluster_names) %>% 
  summarise(n_low_umi = sum(low_umi), n_cells = n()) %>%
  mutate(pct_cell_low_umi = (n_low_umi/n_cells)*100) 
umi_df %>% arrange(desc(pct_cell_low_umi))

# Filter the clusters that have a high proportion of "low umi"
low_umi_clusters <- 
  umi_df %>% 
  filter(pct_cell_low_umi > pct_cells) %>% 
  .$cluster_names

The clusters flagged are astrocyte-restricted_precursors, astrocyte_dead_1, astrocyte_dead_2, dead_2, microglia_dead, OPC_dead

Mithocondrial genes

High mithocondrial genes is associated with stressed, lower quality, cells.

plotTSNE(sce, colour_by = "subsets_mt_percent", text_by = "cluster_names") + 
  ggtitle("Percentatge mithocondrial genes")

plotTSNE(sce, colour_by = "subsets_mt_percent") + 
  ggtitle("Percentatge mithocondrial genes")

pct_cells<- 50
pct_mt  <- 10

Select clusters with 50 % cells having more than 10% mithocondrial genes.

mt_df <- as.data.frame(cbind(sce$subsets_mt_percent, as.character(sce$cluster_names)))
colnames(mt_df) <- c("subsets_mt_percent", "cluster_names")

mt_df <- 
  mt_df %>% 
  mutate(high_pct = as.numeric(subsets_mt_percent) > 10) %>% 
  group_by(cluster_names) %>% 
  summarise(n_high_mt = sum(high_pct), n_cells = n()) %>% 
  mutate(pct_cell_high_mt = (n_high_mt/n_cells)*100) %>%  
  arrange(desc(pct_cell_high_mt))
mt_df

high_mt_clusters <- 
  mt_df %>% 
  filter(pct_cell_high_mt > pct_cells) %>% 
  .$cluster_names

The clusters flagged are oligo_dead.

Ribosomal genes

To visualise the ribosomal content in the whole dataset we plotted the cells according to their ribosomal content. High ribosomal content in one cluster, that expresses a mix profile, could indicate that the cells are clustering based on ribosomal gene content.

# the cluster 
# save ribosomal genes
is_ribo <- grepl("^Rp[sl]", rownames(sce))
# recompute the quality metrics, subseting the ribosomal genes
ribo_qc <- perCellQCMetrics(sce, subsets = list(ribo = is_ribo))
# we are only interested in ribo
sce$subsets_ribo_percent <- ribo_qc$subsets_ribo_percent
sce$subsets_ribo_sum <- ribo_qc$subsets_ribo_sum
sce$subsets_ribo_detected <- ribo_qc$subsets_ribo_detected

plotTSNE(sce, colour_by = "subsets_ribo_percent") + 
  ggtitle("Percentatge ribosomal genes")

Number of mice per cluster

How many mice contribute to each cluster?

# count how many cells from each gnt group  there are per cluster
sum_per_gnt_cluster <- table(sce$genotype, sce$cluster_names )
# for each cluster count how many cells are from each mice, dividing by KO and WT mice
sum_per_mice_cluster <- table(sce$mouse, sce$genotype, sce$cluster_names )
# For each cluster sum of mice that do have cells on that cluster
colSums(sum_per_mice_cluster > 0)

##      
##       OPC_1 oligo_1 astrocyte_1 astrocyte_dead_1 oligo_2 microglia_1 oligo_3
##   HET     2       2           2                2       2           2       2
##   KO      2       2           2                2       2           0       2
##   WT      2       2           2                2       2           2       2
##      
##       oligo_dead oligo_OPCs microglia_dead OPC_2 endothelial immature_neurons
##   HET          2          2              2     2           2                2
##   KO           2          2              1     2           2                2
##   WT           2          2              2     2           2                2
##      
##       OPC_or_oligo OPC_dead mature_neurons_1 oligo_4 astrocyte_2
##   HET            2        2                2       2           2
##   KO             2        2                2       2           2
##   WT             2        2                2       2           2
##      
##       astrocyte_dead_2 mature_neurons_2 immune pericytes microglia_2 BAMs
##   HET                2                2      2         2           2    2
##   KO                 2                2      2         2           2    2
##   WT                 2                2      2         2           2    2
##      
##       astrocyte_OPCs microglia_3 astrocyte-restricted_precursors microglia_OPCs
##   HET              2           2                               2              2
##   KO               2           0                               2              2
##   WT               2           2                               2              2
##      
##       oligo_5 OPCs_3 ChP_epithelial oligo_astrocytes dead_2
##   HET       2      2              2                2      2
##   KO        2      2              2                2      2
##   WT        2      2              2                2      2

# create a summary 
summary <- as.data.frame(rbind(colSums(sum_per_mice_cluster > 0), sum_per_gnt_cluster, colSums(sum_per_gnt_cluster)))
row.names(summary) <- c("HET mice","KO mice", "WT mice", "HET cells","KO cells", "WT cells", "total cells")
summary

Except from the obvious microglia clusters, where the numbers are very low or even absent in the fire mice nothing stands out.

Clusters of doublet cells

Detection of clusters formed by doublets/multiplets (i.e. multiple cells captured within the same droplet or reaction volume). The function test each cluster against the null hypothesis that it does consist of doublets. The tool consider every possible triplet of clusters consisting of a query cluster and two putative “source” clusters. Under the null hypothesis that the query consists of doublets from the two sources, it computes the number of genes (num.de) that are differentially expressed in the same direction in the query cluster compared to both of the source clusters. Such genes would be unique markers for the query cluster and provide evidence against the null hypothesis.

Clusters with the few unique genes (num.de) are more likely to be composed of doublets.
Potential doublets should also have larger library size than the cells from the source clusters, resulting in lib.size ratios bellow unity.

if(!file.exists(here("outs",project,  "doublet_clusters_res0.6.csv"))){
  res_dbl <- findDoubletClusters(sce, sce$cluster_names)
  res_dbl <- res_dbl %>% 
    as.data.frame() %>% 
    rownames_to_column("cluster")
  write.csv(res_dbl, here("outs", project,  "doublet_clusters_res0.6.csv"))
}else{
  res_dbl <- read.csv(here("outs", project,  "doublet_clusters_res0.6.csv"), row.names = 1)
}
res_dbl

Analyse the results:

microglia_OPCs OPCs_3 oligo_5 microglia_3 OPC_or_oligo microglia_2 and astrocyte_OPCs are the clusterst with less num.de.

microglia_OPCs, microglia_3 and microglia_2: These populations could be microglia pruning bits of astrocytes or oligodendrocytes, we do not keep these populations in the downstream analysis.

OPCs_3: Clear doublet as it expresses markers from neurons and OPCs.

oligo_5: Potential doublet between two other clusters that are formed of doublets too, with very low num.de; we do not keep it.

OPC_or_oligo: This could seem to be a transitioning state, however they do express multiple markers usually found in late Oligodendrocytes.

astrocyte_OPCs: These cells could also be transitioning state, and the lib.size is not bellow the unity, so we first did a preliminary analysis keeping them, but reran doubletfinder after deleting the other doublet cluster and finaly decided to be stricter and deleting them.

oligo_4: This cluster is kept, as there is supporting evidence for Astro-Oligos populations in the mouse brain, it will be renamed as astro-oligo in the next annotation. https://doi.org/10.1101/2021.04.11.439393

Note on astrocyte-OPCs:

We also tested keeping them and rerunning doublet finder,

                      source1       source2 num.de median.de          best     p.value lib.size1 lib.size2        prop
OPC-Astrocyte           OPC_1   Astrocyte_2    149    3603.0         Gnao1    7.180342e-83   0.6135593 0.4465331 0.005588435            
Oligo-Astrocyte       Oligo_1   Astrocyte_1    254    3264.0       mt-Atp6    3.848991e-35   0.8738298 0.4604424 0.013932279
OEG             ChP_epithelia   Astrocyte_2    623    2344.0           Npy    1.935603e-29   1.5961975 0.8223610 0.001940429
[...]

Code in commit 0b2c63cd from the github repo, in annotation_02.rmd

doublet<-c("microglia_OPCs", "OPCs_3", "oligo_5", "microglia_3", "OPC_or_oligo", "microglia_2", "astrocyte_OPCs")

Control vs fire mice

We want to have a closer look at the clusters that do have a difference between the knockout and the wild type before deleting the clusters.

# divide the two objects
sce_ko <- sce[,sce$genotype == "KO"]
sce_ctr <- sce[,sce$genotype == "WT"]
# plot them side by side
gridExtra::grid.arrange(
plotTSNE(sce_ko, colour_by = "cluster_names", point_size=0.5,
         point_alpha = 0.3, text_by = "cluster_names", text_size = 3) +
  scale_color_manual(values = cols) +
  ggtitle("fire mice"),
plotTSNE(sce_ctr, colour_by = "cluster_names", point_size=0.5,
         point_alpha = 0.3, text_by = "cluster_names", text_size = 3) +
  scale_color_manual(values = cols) +
  ggtitle("control"), 
ncol = 2
)

## Warning: Removed 2 rows containing missing values (geom_text).

Proportion KO-WT

In order to visualise the proportions from KO and WT for each cluster, we do not take in consideration the microglia clusters, as these are only present in the control, and we normalise per number of cells per cluster.

# delete the microglia
sce_no_mc <- sce[,!(sce$cluster_names %in% c(5, 9, 25))]

# count how many cells from each gnt group  there are per cluster
sum_per_gnt_cluster_no_mc <- table(sce_no_mc$genotype, sce_no_mc$cluster_names )

# normalise per cluster, looking how the KO and WT are distributed
# across the clusters, to give to both groups the same weight
prop_per_cluster_no_mc <- prop.table(sum_per_gnt_cluster_no_mc, margin = 1)

# calculate the proportions for each cluster
prop_per_gnt_no_mc <- round(prop.table(prop_per_cluster_no_mc , margin = 2 )*100, 2)

# Display
prop_per_gnt_no_mc<- as.data.frame(prop_per_gnt_no_mc)
colnames(prop_per_gnt_no_mc) <- c("Genotype", "cluster", "Proportion")

prop_per_gnt_no_mc %>% 
  filter(Genotype == "KO") %>% 
  arrange(desc(Proportion))

visualise in a plot

levels(prop_per_gnt_no_mc$Genotype) <- c("KO", "HET", "WT")
ggplot(data = prop_per_gnt_no_mc, aes(x = cluster, y = Proportion, fill = Genotype)) +
  geom_bar(position = "fill", stat = "identity") + theme_classic()

ko_pct <- 60

difference_KO_WT <-
  prop_per_gnt_no_mc %>% 
  arrange(desc(Proportion)) %>% 
# the microglia clusters will be the top differences, but this is not 
    # something we are specially interested
  filter(!(cluster %in% c(21, 24, 1, 16))) %>% 
    filter(Genotype == "KO") %>% 
    # select the "interesting" clusters
  filter(Proportion > ko_pct ) %>% 
  .$cluster

gt_ko_pct <- unique(sce$cluster_names[sce$cluster_names %in% c( high_mt_clusters, low_umi_clusters, doublet) &
    (sce$cluster_names %in% difference_KO_WT)])

There are 0 clusters flagged as poor quality and also more abundant in the KO

Cluster QC

the clusters highlighted as:

low umi: majority of cells having less than 3000 umi counts
high mt : majority of cells having more than 10 % mitochondrial genes
doublet: clusters composed by cells that express markers from different cell types.
There was no clusters flagged from the other categories

sce$filter_out <- sce$cluster_names %in% c( high_mt_clusters, low_umi_clusters, doublet) 
filter_out <- sum(sce$filter_out)
plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
         point_alpha = 0.3, text_by = "cluster_names", text_size = 3)

plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
         point_alpha = 0.3, text_by = "celltype", text_size = 3)

plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
         point_alpha = 0.3)

if (!file.exists(here("processed", project, "sce_clusterQC.RDS"))){
 sce <- sce[, sce$filter_out == FALSE]
 saveRDS(sce, here("processed", project, "sce_clusterQC.RDS"))
}

This would filter 15632 cells filtered out, leaving an object with 51194cells.

Session Info

Click to expand

sessionInfo()

## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dplyr_1.0.7                 pals_1.7                   
##  [3] scDblFinder_1.6.0           tibble_3.1.5               
##  [5] scater_1.20.1               ggplot2_3.3.5              
##  [7] scuttle_1.2.1               here_1.0.1                 
##  [9] SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
## [11] Biobase_2.52.0              GenomicRanges_1.44.0       
## [13] GenomeInfoDb_1.28.4         IRanges_2.26.0             
## [15] S4Vectors_0.30.2            BiocGenerics_0.38.0        
## [17] MatrixGenerics_1.4.3        matrixStats_0.61.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7              rprojroot_2.0.2          
##  [3] tools_4.1.1               bslib_0.3.1              
##  [5] utf8_1.2.2                R6_2.5.1                 
##  [7] irlba_2.3.3               vipor_0.4.5              
##  [9] DBI_1.1.1                 colorspace_2.0-2         
## [11] withr_2.4.2               tidyselect_1.1.1         
## [13] gridExtra_2.3             compiler_4.1.1           
## [15] BiocNeighbors_1.10.0      DelayedArray_0.18.0      
## [17] labeling_0.4.2            sass_0.4.0               
## [19] scales_1.1.1              stringr_1.4.0            
## [21] digest_0.6.28             rmarkdown_2.11           
## [23] XVector_0.32.0            dichromat_2.0-0          
## [25] pkgconfig_2.0.3           htmltools_0.5.2          
## [27] sparseMatrixStats_1.4.2   highr_0.9                
## [29] maps_3.4.0                fastmap_1.1.0            
## [31] limma_3.48.3              rlang_0.4.12             
## [33] DelayedMatrixStats_1.14.3 farver_2.1.0             
## [35] jquerylib_0.1.4           generics_0.1.1           
## [37] jsonlite_1.7.2            BiocParallel_1.26.2      
## [39] RCurl_1.98-1.5            magrittr_2.0.1           
## [41] BiocSingular_1.8.1        GenomeInfoDbData_1.2.6   
## [43] Matrix_1.3-4              Rcpp_1.0.7               
## [45] ggbeeswarm_0.6.0          munsell_0.5.0            
## [47] fansi_0.5.0               viridis_0.6.2            
## [49] lifecycle_1.0.1           stringi_1.7.5            
## [51] yaml_2.2.1                edgeR_3.34.1             
## [53] zlibbioc_1.38.0           grid_4.1.1               
## [55] dqrng_0.3.0               crayon_1.4.2             
## [57] lattice_0.20-45           cowplot_1.1.1            
## [59] beachmat_2.8.1            mapproj_1.2.7            
## [61] locfit_1.5-9.4            metapod_1.0.0            
## [63] knitr_1.36                pillar_1.6.4             
## [65] igraph_1.2.7              xgboost_1.4.1.1          
## [67] ScaledMatrix_1.0.0        glue_1.4.2               
## [69] evaluate_0.14             scran_1.20.1             
## [71] data.table_1.14.2         vctrs_0.3.8              
## [73] gtable_0.3.0              purrr_0.3.4              
## [75] assertthat_0.2.1          xfun_0.27                
## [77] rsvd_1.0.5                viridisLite_0.4.0        
## [79] beeswarm_0.4.0            cluster_2.1.2            
## [81] bluster_1.2.1             statmod_1.4.36           
## [83] ellipsis_0.3.2