Cluster quality control

set-up

library(SingleCellExperiment)
library(here) # reproducible paths
library(scater) # plot reduced dims
library(tibble)  # for `rownames_to_column`
library(scDblFinder) # detect doublets
library(dplyr) # df filtering

project <- "fire-mice"

if (!file.exists(here("processed", project, "sce_anno_01.RDS"))) {
  sce <- readRDS(here("processed", project, "sce_clusters_01.RDS"))
}else{
  sce <- readRDS(here("processed", project, "sce_anno_01.RDS"))
}

source(here("src/colours.R"))

Introduction

As in [publication] we will perform a cluster QC to remove clusters of poorer quality. This will be assessed by the number of UMI counts, the mitochondrial percentage, doublet analysis, ribosomal genes and the number of samples that contribute to each cluster. Moreover we will keep in mind our experimental groups in order to ensure biological effects are not being lost.

plotTSNE(sce, colour_by = "clusters_named", point_size=0.5,
          text_by = "clusters_named", text_size = 3) +
  scale_color_manual(values = cols)

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Number of molecules per cluster

plotTSNE(sce, colour_by = "total") + 
  ggtitle("Total number of umi counts per cell")

plotTSNE(sce, colour_by = "detected") +
  ggtitle("Detected number of genes per cell")

Lower values of umi counts and detected genes can be associated to lower quality cells. Cells can also have lower expressed genes due to their biological state or celltype.

pct_cells<- 50
min_umi  <- 3000

Select clusters with 50 % cells having less than 3000umi counts.

umi_df <- as.data.frame(cbind(sce$total, as.character(sce$clusters_named)))
colnames(umi_df) <- c("umi", "clusters_named")

# caculate the percentatges of cells that are "low umi"
umi_df <- 
  umi_df %>% 
  mutate(low_umi = as.numeric(umi) < min_umi) %>% 
  group_by(clusters_named) %>% 
  summarise(n_low_umi = sum(low_umi), n_cells = n()) %>%
  mutate(pct_cell_low_umi = (n_low_umi/n_cells)*100) 
umi_df %>% arrange(desc(pct_cell_low_umi))

# Filter the clusters that have a high proportion of "low umi"
low_umi_clusters <- 
  umi_df %>% 
  filter(pct_cell_low_umi > pct_cells) %>% 
  .$clusters_named

The clusters flagged are Astro5, DeadAstro1, DeadAstro2, DeadOPC, Erythrocytes, Microglia2, Oligo3. For the Erythrocytes low number of genes is to be expected, as Erythrocytes have low gene expression.

Mithocondrial genes

High mithocondrial genes is associated with stressed, lower quality, cells.

plotTSNE(sce, colour_by = "subsets_mt_percent") +#, text_by = "clusters_named") + 
  ggtitle("Percentatge mithocondrial genes")

pct_cells<- 50
pct_mt  <- 10

Select clusters with 50 % cells having more than 10% mithocondrial genes.

mt_df <- as.data.frame(cbind(sce$subsets_mt_percent, as.character(sce$clusters_named)))
colnames(mt_df) <- c("subsets_mt_percent", "clusters_named")

mt_df <- 
  mt_df %>% 
  mutate(high_pct = as.numeric(subsets_mt_percent) > 10) %>% 
  group_by(clusters_named) %>% 
  summarise(n_high_mt = sum(high_pct), n_cells = n()) %>% 
  mutate(pct_cell_high_mt = (n_high_mt/n_cells)*100) %>%  
  arrange(desc(pct_cell_high_mt))
mt_df

high_mt_clusters <- 
  mt_df %>% 
  filter(pct_cell_high_mt > pct_cells) %>% 
  .$clusters_named

The clusters flagged are DeadAstro1, DeadOligo, DeadAstro2, DeadAstro3, DeadOPC, Astro6, mNeurons, Microglia2, Astro4.

Ribosomal genes

To visualise the ribosomal content in the whole dataset we plotted the cells according to their ribosomal content. High ribosomal content in one cluster, that expresses a mix profile, could indicate that the cells are clustering based on ribosomal gene content.

# the cluster 
# save ribosomal genes
is_ribo <- grepl("^Rp[sl]", rownames(sce))
# recompute the quality metrics, subseting the ribosomal genes
ribo_qc <- perCellQCMetrics(sce, subsets = list(ribo = is_ribo))
# we are only interested in ribo
sce$subsets_ribo_percent <- ribo_qc$subsets_ribo_percent
sce$subsets_ribo_sum <- ribo_qc$subsets_ribo_sum
sce$subsets_ribo_detected <- ribo_qc$subsets_ribo_detected

plotTSNE(sce, colour_by = "subsets_ribo_percent") + 
  ggtitle("Percentatge ribosomal genes")

Number of mice per cluster

How many mice contribute to each cluster?

# count how many cells from each gnt group  there are per cluster
sum_per_gnt_cluster <- table(sce$genotype, sce$clusters_named )
# for each cluster count how many cells are from each mice, dividing by KO and WT mice
sum_per_mice_cluster <- table(sce$mouse, sce$genotype, sce$clusters_named )
# For each cluster sum of mice that do have cells on that cluster
#colSums(sum_per_mice_cluster > 0)



# create a summary
summary <- as.data.frame(rbind(colSums(sum_per_mice_cluster > 0), sum_per_gnt_cluster, colSums(sum_per_gnt_cluster)))
row.names(summary) <- c("KO mice", "WT mice", "KO cells", "WT cells", "total cells")
summary

# create a summary per mouse
sum_per_mouse_cluster <- table(sce$mouse, sce$clusters_named )
rownames(sum_per_mouse_cluster) <- paste(rownames(sum_per_mouse_cluster), c("WT","KO"))
sum_per_mouse_cluster <- sum_per_mouse_cluster[c(1,3,5,2,4,6),]
as.data.frame.matrix(sum_per_mouse_cluster)

# create a summary per sample
sum_per_mouse_cluster <- table(sce$Sample, sce$clusters_named )
as.data.frame.matrix(sum_per_mouse_cluster)

# count how many cells from each gnt group  there are per cluster
sum_per_smp_cluster <- table(sce$Sample, sce$clusters_named )

# normalise per cluster, looking how the KO and WT are distributed
# across the clusters, to give to both groups the same weight
prop_per_cluster <- prop.table(sum_per_smp_cluster, margin = 1)

# calculate the proportions for each cluster
prop_per_smp <- round(prop.table(prop_per_cluster , margin = 2 )*100, 2)

# Display
prop_per_smp<- as.data.frame(prop_per_smp)
colnames(prop_per_smp) <- c("Sample", "cluster", "Proportion")

ggplot(data = prop_per_smp, aes(x = cluster, y = Proportion, fill = Sample)) +
  geom_bar(position = "fill", stat = "identity") + theme(axis.text.x = element_text(angle = 45, hjust=1, vjust = 1)) +
  scale_fill_manual(values = brewer.paired(12))

Paired colour values are from the same mice. Erythrocytes should had been filtered when gating, two of the samples seem to have been more bloody than the others and more Erythrocytes have come through. From these same samples come the cluster Astro5, that was already flagged as it has low number of umi counts and will be deleted as it is probably just an artefact.

low_n_mice <- c("Astro5", "Erythrocytes")

Clusters of doublet cells

Detection of clusters formed by doublets/multiplets (i.e. multiple cells captured within the same droplet or reaction volume). The function test each cluster against the null hypothesis that it does consist of doublets. The tool consider every possible triplet of clusters consisting of a query cluster and two putative “source” clusters. Under the null hypothesis that the query consists of doublets from the two sources, it computes the number of genes (num.de) that are differentially expressed in the same direction in the query cluster compared to both of the source clusters. Such genes would be unique markers for the query cluster and provide evidence against the null hypothesis.

Clusters with the few unique genes (num.de) are more likely to be composed of doublets.
Potential doublets should also have larger library size than the cells from the source clusters, resulting in lib.size ratios bellow unity.

if(!file.exists(here("outs",project,  "doublet_clusters_res0.7.csv"))){
  res_dbl <- findDoubletClusters(sce, sce$clusters_named)
  res_dbl <- res_dbl %>% 
    as.data.frame() %>% 
    rownames_to_column("cluster")
  write.csv(res_dbl, here("outs", project,  "doublet_clusters_res0.7.csv"))
}else{
  res_dbl <- read.csv(here("outs", project,  "doublet_clusters_res0.7.csv"), row.names = 1)
}
res_dbl

All clusters have a significant p-value to reject the null hypothesis that they do consist of doublets.

The clusters with less DE genes with the potential sources are clusters with already identified as dead cells and the erythrocytes ( that have expression of few genes, making it a good candidate to have little genes differentially expressed with other cells)

doublet<-c("DeadAstro3", "DeadAstro2" )

Control vs fire mice

We want to have a closer look at the clusters that do have a difference between the knockout and the wild type before deleting the clusters.

# divide the two objects
sce_ko <- sce[,sce$genotype == "KO"]
sce_ctr <- sce[,sce$genotype == "WT"]
# plot them side by side
gridExtra::grid.arrange(
plotTSNE(sce_ko, colour_by = "clusters_named", point_size=0.5,
         point_alpha = 0.3, text_by = "clusters_named", text_size = 3) +
  scale_color_manual(values = cols) +
  ggtitle("fire mice"),
plotTSNE(sce_ctr, colour_by = "clusters_named", point_size=0.5,
         point_alpha = 0.3, text_by = "clusters_named", text_size = 3) +
  scale_color_manual(values = cols) +
  ggtitle("control"), 
ncol = 1
)

## Warning: Removed 1 rows containing missing values (geom_text_repel).

Proportion KO-WT

In order to visualise the proportions from KO and WT for each cluster, we do not take in consideration the microglia clusters, as these are only present in the control, and we normalise per number of cells per cluster.

# delete the microglia
sce_no_mc <- sce[,!(sce$celltype %in% "Microglia")]

# count how many cells from each gnt group  there are per cluster
sum_per_gnt_cluster_no_mc <- table(sce_no_mc$genotype, sce_no_mc$clusters_named )

# normalise per cluster, looking how the KO and WT are distributed
# across the clusters, to give to both groups the same weight
prop_per_cluster_no_mc <- prop.table(sum_per_gnt_cluster_no_mc, margin = 1)

# calculate the proportions for each cluster
prop_per_gnt_no_mc <- round(prop.table(prop_per_cluster_no_mc , margin = 2 )*100, 2)

# Display
prop_per_gnt_no_mc<- as.data.frame(prop_per_gnt_no_mc)
colnames(prop_per_gnt_no_mc) <- c("Genotype", "cluster", "Proportion")

prop_per_gnt_no_mc %>% 
  filter(Genotype == "KO") %>% 
  arrange(desc(Proportion))

visualise in a plot

ggplot(data = prop_per_gnt_no_mc, aes(x = cluster, y = Proportion, fill = Genotype)) +
  geom_bar(position = "fill", stat = "identity") + theme(axis.text.x = element_text(angle = 45, hjust=1, vjust = 1)) +
  scale_fill_manual(values = col_wt_ko)

## Warning: Removed 4 rows containing missing values (position_stack).

ko_pct <- 65

difference_KO_WT <-
  prop_per_gnt_no_mc %>% 
  arrange(desc(Proportion)) %>% 
# the microglia clusters will be the top differences, but this is not 
    # something we are specially interested
  filter(!(cluster %in% c("Microglia1", "Microglia2"))) %>% 
    filter(Genotype == "KO") %>% 
    # select the "interesting" clusters
  filter(Proportion > ko_pct) %>% 
  .$cluster

gt_ko_pct <- unique(sce$clusters_named[sce$clusters_named %in% c( high_mt_clusters, low_umi_clusters, doublet) &
    (sce$clusters_named %in% difference_KO_WT)])
gt_ko_pct

## [1] mNeurons Oligo3  
## 24 Levels: Astro1 Astro2 Oligo1 Astro3 Astro4 DeadOligo Astro5 ... ChP epithelia

There are 2 clusters flagged as poor quality and also more abundant (greater than 65 %) in the KO animals, these are mNeurons, Oligo3.

The two clusters that seem much more abundant in the WT are an artefact coming mostly from only one ore two samples.

Cluster QC

the clusters delted from downstream analysis are highlighted bellow, and are composed by clusters with:

low umi: majority of cells having less than 3000 umi counts. These are:

Astro5, DeadAstro1, DeadAstro2, DeadOPC, Erythrocytes, Microglia2, Oligo3

high mt : majority of cells having more than 10 % mitochondrial genes

DeadAstro1, DeadOligo, DeadAstro2, DeadAstro3, DeadOPC, Astro6, mNeurons, Microglia2, Astro4

doublet: clusters composed by cells that express markers from different cell types.

DeadAstro3, DeadAstro2

artefacts coming from few samples:

Astro5, Erythrocytes

Clusters with different abundance between KO and WT were kept, as they could be different due to the biological condition.

mNeurons, Oligo3

sce$filter_out <- sce$clusters_named %in% c( high_mt_clusters, low_umi_clusters, doublet, low_n_mice) & !(sce$clusters_named %in% difference_KO_WT)
filter_out <- sum(sce$filter_out)
plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
         point_alpha = 0.3, text_by = "clusters_named", text_size = 3)

plotTSNE(sce, colour_by = "filter_out", point_size=0.5,
         point_alpha = 0.3)

if (!file.exists(here("processed", project, "sce_clusterQC.RDS"))){
 sce <- sce[, sce$filter_out == FALSE]
 saveRDS(sce, here("processed", project, "sce_clusterQC.RDS"))
}

This filters 11691 cells, leaving an object with 18481cells.

Session Info

Click to expand

sessionInfo()

## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] pals_1.7                    dplyr_1.0.9                
##  [3] scDblFinder_1.10.0          tibble_3.1.7               
##  [5] scater_1.24.0               ggplot2_3.3.6              
##  [7] scuttle_1.6.2               here_1.0.1                 
##  [9] SingleCellExperiment_1.18.0 SummarizedExperiment_1.26.1
## [11] Biobase_2.56.0              GenomicRanges_1.48.0       
## [13] GenomeInfoDb_1.32.3         IRanges_2.30.0             
## [15] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [17] MatrixGenerics_1.8.1        matrixStats_0.62.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7              rprojroot_2.0.3          
##  [3] tools_4.2.1               bslib_0.3.1              
##  [5] utf8_1.2.2                R6_2.5.1                 
##  [7] irlba_2.3.5               vipor_0.4.5              
##  [9] DBI_1.1.3                 colorspace_2.0-3         
## [11] withr_2.5.0               tidyselect_1.1.2         
## [13] gridExtra_2.3             compiler_4.2.1           
## [15] cli_3.3.0                 BiocNeighbors_1.14.0     
## [17] DelayedArray_0.22.0       labeling_0.4.2           
## [19] rtracklayer_1.56.1        sass_0.4.1               
## [21] scales_1.2.0              stringr_1.4.0            
## [23] digest_0.6.29             Rsamtools_2.12.0         
## [25] rmarkdown_2.14            XVector_0.36.0           
## [27] dichromat_2.0-0.1         pkgconfig_2.0.3          
## [29] htmltools_0.5.2           sparseMatrixStats_1.8.0  
## [31] highr_0.9                 maps_3.4.0               
## [33] limma_3.52.2              fastmap_1.1.0            
## [35] rlang_1.0.3               rstudioapi_0.13          
## [37] DelayedMatrixStats_1.18.0 farver_2.1.1             
## [39] BiocIO_1.6.0              jquerylib_0.1.4          
## [41] generics_0.1.3            jsonlite_1.8.0           
## [43] BiocParallel_1.30.3       RCurl_1.98-1.8           
## [45] magrittr_2.0.3            BiocSingular_1.12.0      
## [47] GenomeInfoDbData_1.2.8    Matrix_1.4-1             
## [49] Rcpp_1.0.9                ggbeeswarm_0.6.0         
## [51] munsell_0.5.0             fansi_1.0.3              
## [53] viridis_0.6.2             lifecycle_1.0.1          
## [55] edgeR_3.38.4              stringi_1.7.8            
## [57] yaml_2.3.5                MASS_7.3-57              
## [59] zlibbioc_1.42.0           grid_4.2.1               
## [61] dqrng_0.3.0               parallel_4.2.1           
## [63] ggrepel_0.9.1             crayon_1.5.1             
## [65] lattice_0.20-45           cowplot_1.1.1            
## [67] Biostrings_2.64.0         beachmat_2.12.0          
## [69] mapproj_1.2.8             locfit_1.5-9.6           
## [71] metapod_1.4.0             knitr_1.39               
## [73] pillar_1.7.0              igraph_1.3.4             
## [75] rjson_0.2.21              xgboost_1.6.0.1          
## [77] codetools_0.2-18          ScaledMatrix_1.4.0       
## [79] XML_3.99-0.10             glue_1.6.2               
## [81] evaluate_0.15             data.table_1.14.2        
## [83] scran_1.24.0              vctrs_0.4.1              
## [85] gtable_0.3.0              purrr_0.3.4              
## [87] assertthat_0.2.1          xfun_0.31                
## [89] rsvd_1.0.5                restfulr_0.0.15          
## [91] viridisLite_0.4.0         GenomicAlignments_1.32.1 
## [93] beeswarm_0.4.0            cluster_2.1.3            
## [95] statmod_1.4.37            bluster_1.6.0            
## [97] ellipsis_0.3.2