Set-up

library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qcs
library(org.Mm.eg.db) # To annotate the genenames
library(ggplot2) # for the bin2 density
library(pals) # viridis colour
sce <- readRDS(here("processed", "sce_vold.RDS"))
project <- "fire-mice"

The object has 32285 genes and 38921 cells before filtering

Add cell QC metrics to the sce

First we need to sort the gene names and gene symbols, because the default ensembl notation is not very handy. And then save the mitochondrial genes as such.

if (!file.exists(here("processed", project, "sce_preliminary.RDS"))) {
  # obtain full genenames
  genename <- mapIds(org.Mm.eg.db,
                     keys = rownames(sce),
                     keytype = "ENSEMBL",
                     column = c("GENENAME")
                     )
  # Use the symbols as rownames
  # first make gene names unique
  # TODO: save duplicate gene name list
  symb_unique <- uniquifyFeatureNames(rownames(sce), rowData(sce)[, "Symbol"])
  # Now they can be used as rownames
  rownames(sce) <- symb_unique
  # Add full gene names and the uniuqe symbols to the rowdata
  rowData(sce)$symb_uniq <- symb_unique
  rowData(sce)$gene_name <- genename
  # Subset the mitochondrial genes
  is_mito <- grepl("^mt-", rownames(sce))
} else {
  sce <- readRDS(here( "processed", project, "sce_preliminary.RDS"))
}

Then we can use the scater package to add the quality per cell. This computes for each cell some useful metrics such as the number of umi counts (library size), the number of detected genes and the percentage of mitochondiral genes.

Then we use the automatic isOutlier function from the same package that determine which values in a numeric vector are outliers based on the median absolute deviation (MAD). When using this function with low number a log transformation is added, that prevents negative thresholds. We also take only in consideration the first three batches, as the last one is lower quality

if (!file.exists(here( "processed", project, "sce_preliminary.RDS"))) {
  sce <- addPerCellQC(sce, subsets = list(mt = is_mito))

  # Automated outlier detection
  outlier_lib_low <- isOutlier(sce$total, log = TRUE, type = "lower")
  outlier_expr_low <-
    isOutlier(sce$detected, log = TRUE, type = "lower")
  outlier_lib_high <- isOutlier(sce$total, type = "higher")
  outlier_expr_high <-
    isOutlier(sce$detected, type = "higher")
  outlier_mt <- isOutlier(sce$subsets_mt_percent, type = "higher")
  # total
  outlier <-
    outlier_lib_low |
      outlier_expr_low |
      outlier_lib_high | outlier_expr_high | outlier_mt

  # Visualize the thresholds and the cells deleted by each parametre
  summary_outlier <- data.frame(
    lib_size_high = c(sum(outlier_lib_high),
                      attr(outlier_lib_high, "thresholds")[2]),
    expression_high = c(sum(outlier_expr_high),
                        attr(outlier_expr_high, "thresholds")[2]),
    lib_size_low = c(sum(outlier_lib_low),
                     attr(outlier_lib_low, "thresholds")[1]),
    expression_low = c(sum(outlier_expr_low),
                       attr(outlier_expr_low, "thresholds")[1]),
    mt_pct = c(sum(outlier_mt), attr(outlier_mt, "thresholds")[2]),
    total = c(sum(outlier), NA)
  )
  row.names(summary_outlier) <- c("Cells filtered", "Threshold")
  write.csv(summary_outlier, here("outs", project, "autofilter_summary.csv"))
  # Add if it is an outlier to the metadata
  sce$outlier <- outlier
} else {
  summary_outlier <- read.csv(here("outs", project, "autofilter_summary.csv"))
}
summary_outlier
##                X lib_size_high expression_high lib_size_low expression_low
## 1 Cells filtered       1561.00          96.000       0.0000       116.0000
## 2      Threshold      17346.98        6293.816     156.1676       108.9684
##       mt_pct total
## 1 5618.00000  7291
## 2   31.10665    NA

This data is saved in outs/fire-mice/autofilter_summary.csv

Plots before QC

Diagnostic plots to visualize the data distribution. The orange cells are marked as outliers by the automatic detection from scater.

Violin plots

plotColData(sce, x = "Sample", y = "sum", colour_by = "outlier") +
  ggtitle("Total count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "sum", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "detected", colour_by = "outlier") +
  scale_y_log10() + ggtitle("Detected Genes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
  scale_y_log10() + ggtitle("total count by batch") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "subsets_mt_percent", colour_by = "outlier") +
  ggtitle("Mitocchondrial percentatge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Histograms

In the x axis we can see the total number of umi (library size) per cell, the number of detected genes per cell and the mitochondrial percentage per cell; with the number of cells for each measure in the y axis.

hist(
  sce$total,
  breaks = 100
)

This object had already been filtrated with the cell-calling algorithm from CellRanger, that is meant to remove empty droplets. Therefore it is expected to see the total sum of umi skewed as in the plot above.

hist(
  sce$detected,
  breaks = 100
)

The bimodality present in the number of counts was already visible in the violin plots.

hist(
  sce$subsets_mt_percent,
  breaks = 100
)

There is a very heavy tail of cells with high mitochondrial genes.

Scatter plots

plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "outlier")

plotColData(sce, x = "sum", y = "detected", colour_by = "outlier")

plotColData(sce, x = "sum", y = "detected", colour_by = "Sample")

Density plots:

plotColData(sce, x = "sum", y="subsets_mt_percent") + geom_bin_2d(bins=c(100,100)) + scale_fill_gradientn(colours = viridis(200))

plotColData(sce, x = "sum", y="detected") + geom_bin_2d(bins=c(100,100)) + scale_fill_gradientn(colours = viridis(200))

PCA

Here we run a PCA using the information in the metadata instead of the gene expression. It is useful to visualize the QC parametres.

if (!file.exists(here( "processed", project, "sce_preliminary.RDS"))) {
  sce <- runColDataPCA(sce, variables = c("sum", "detected", "subsets_mt_percent"))
}
plotReducedDim(sce, dimred = "PCA_coldata", colour_by = "Sample")

plotReducedDim(sce, dimred = "PCA_coldata", colour_by = "chip")

Ratio between sum and gene counts

This measures the number of detected genes per cell divided by its library size. This will be very useful to delete the cells that have low gene counts but a relatively high umi count (visible in the scatter plots).

if (!file.exists(here( "processed", project, "sce_preliminary.RDS"))) {
  sce$ratio_detected_sum <- sce$detected / sce$sum
  sce$outlier_ratio <- isOutlier(sce$ratio_detected_sum, type = "both")
}

summary(sce$ratio_detected_sum)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005381 0.333367 0.385602 0.398436 0.454288 0.829308
plotColData(sce, x = "sum", y = "detected", colour_by = "ratio_detected_sum")

# Plot an histogram with the ratio between umi and gene counts
hist(
  sce$ratio_detected_sum,
  breaks = 100
)

The isoutlier function can be used to find the outliers of any distribution, as far as it is roughly normal. Bellow we use it with the ratio between the number of genes expressed and the number of umi. Again, only the first 3 chips are considered to calculate the cut-offs

# Use the is outlier function from scater to see the cutoffs suggestions
plotColData(sce, x = "sum", y = "detected", colour_by = "outlier_ratio")

attr(sce$outlier_ratio, "thresholds")
##    lower   higher 
## 0.123428 0.647776

This filters out 1706 cells. From them 1458 cells were not already outlier for other reasons.

Thresholds for cell filtering

The thresholds set with isOutlier are quite relaxed, we will keep them like this for this first round of QC and first exploratory analysis, and then be stricter in a subsequent round.

# save the object with all cells and genes but with outlier parametres saved
if (!file.exists(here( "processed", project, "sce_preliminary.RDS"))) {
saveRDS(sce, here( "processed", project, "sce_preliminary.RDS"))
}
# filter cells
sce <- sce[, sce$outlier == FALSE & sce$outlier_ratio == FALSE]

Gene QC

It is typically a good idea to remove genes whose expression level is considered “undetectable”. Here we define a gene as detectable if at least two cells contain a transcript from the gene. It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor quality cells.

# filter genes
genes_beforeqc <- dim(sce)[1]
keep_feature <- rowSums(counts(sce) > 0) > 1
sce <- sce[keep_feature, ]
genes_beforeqc - dim(sce)[1]
## [1] 9273

This way we deleted 9273 genes and kept 23012 genes

We can look at a plot that shows the top 50 (by default) most-expressed features. Each row in the plot below corresponds to a gene; each bar corresponds to the expression of a gene in a single cell; and the circle indicates the median expression of each gene, with which genes are sorted. We expect to see the “usual suspects”, i.e., mitochondrial genes, actin, ribosomal protein, MALAT1. A large number of pseudo-genes or predicted genes may indicate problems with alignment.

plotHighestExprs(sce, exprs_values = "counts")

Save new filetered object

if (!file.exists(here( "processed", project, "sce_QC_01.RDS"))) {
  saveRDS(sce, here( "processed", project, "sce_QC_01.RDS"))
}

The object has 23012 genes and 30172 cells after filtering

Session Info

Click to expand
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] pals_1.7                    org.Mm.eg.db_3.14.0        
##  [3] AnnotationDbi_1.56.2        scater_1.23.5              
##  [5] ggplot2_3.3.5               scuttle_1.4.0              
##  [7] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
##  [9] Biobase_2.54.0              GenomicRanges_1.46.1       
## [11] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [13] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [15] MatrixGenerics_1.6.0        matrixStats_0.61.0         
## [17] here_1.0.1                 
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7              bit64_4.0.5              
##  [3] httr_1.4.2                rprojroot_2.0.2          
##  [5] tools_4.1.1               bslib_0.3.1              
##  [7] utf8_1.2.2                R6_2.5.1                 
##  [9] irlba_2.3.5               vipor_0.4.5              
## [11] DBI_1.1.2                 colorspace_2.0-2         
## [13] withr_2.4.3               tidyselect_1.1.1         
## [15] gridExtra_2.3             bit_4.0.4                
## [17] compiler_4.1.1            cli_3.2.0                
## [19] BiocNeighbors_1.12.0      DelayedArray_0.20.0      
## [21] labeling_0.4.2            sass_0.4.0               
## [23] scales_1.1.1              stringr_1.4.0            
## [25] digest_0.6.29             rmarkdown_2.11           
## [27] XVector_0.34.0            dichromat_2.0-0          
## [29] pkgconfig_2.0.3           htmltools_0.5.2          
## [31] sparseMatrixStats_1.6.0   highr_0.9                
## [33] maps_3.4.0                fastmap_1.1.0            
## [35] rlang_1.0.1               rstudioapi_0.13          
## [37] RSQLite_2.2.9             DelayedMatrixStats_1.16.0
## [39] farver_2.1.0              jquerylib_0.1.4          
## [41] generics_0.1.2            jsonlite_1.7.3           
## [43] BiocParallel_1.28.3       dplyr_1.0.8              
## [45] RCurl_1.98-1.6            magrittr_2.0.2           
## [47] BiocSingular_1.10.0       GenomeInfoDbData_1.2.7   
## [49] Matrix_1.4-0              Rcpp_1.0.8               
## [51] ggbeeswarm_0.6.0          munsell_0.5.0            
## [53] fansi_1.0.2               viridis_0.6.2            
## [55] lifecycle_1.0.1           stringi_1.7.6            
## [57] yaml_2.2.2                zlibbioc_1.40.0          
## [59] grid_4.1.1                blob_1.2.2               
## [61] parallel_4.1.1            ggrepel_0.9.1            
## [63] crayon_1.5.0              lattice_0.20-45          
## [65] cowplot_1.1.1             Biostrings_2.62.0        
## [67] beachmat_2.10.0           mapproj_1.2.8            
## [69] KEGGREST_1.34.0           knitr_1.37               
## [71] pillar_1.7.0              ScaledMatrix_1.2.0       
## [73] glue_1.6.1                evaluate_0.14            
## [75] vctrs_0.3.8               png_0.1-7                
## [77] gtable_0.3.0              purrr_0.3.4              
## [79] assertthat_0.2.1          cachem_1.0.6             
## [81] xfun_0.29                 rsvd_1.0.5               
## [83] viridisLite_0.4.0         tibble_3.1.6             
## [85] beeswarm_0.4.0            memoise_2.0.1            
## [87] ellipsis_0.3.2