Set-up

library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qcs
library(ggplot2) # To add titles to plots
library(patchwork) # agregate plots
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_clusterQC.RDS"))
source(here("src/colours.R"))

The object has 23012 genes and 18481 cells.

Cell QC

Previous thresholds were:

lib_size_high expression_high lib_size_low expression_low mt_pct total
Cells filtered 1561 96 0 116 5618 7291
Threshold 17346.98 6293.816 156.1676 108.9684 31.10665 NA
Click to expand plots

Violin plots

Split by sample

plotColData(sce, x = "Sample", y = "subsets_mt_percent") +
  scale_y_log10() + ggtitle("mt genes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +  scale_color_manual(values = cols)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Removed 6 rows containing missing values (geom_point).

plotColData(sce, x = "Sample", y = "detected") +
  scale_y_log10() + ggtitle("Detected genes log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +  scale_color_manual(values = cols)

plotColData(sce, x = "Sample", y = "sum") +
  scale_y_log10() + ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +  scale_color_manual(values = cols)

Split by sample and cluster, facet wrap

# detected
plotColData(sce, x = "Sample", y = "detected",  colour_by = "genotype", other_fields = "clusters_named") +
  scale_y_log10(breaks = scales::pretty_breaks(n = 12)) + 
  ggtitle("Detected genes log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +           scale_color_manual(values = col_wt_ko)  + facet_wrap(~clusters_named)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

# sum
plotColData(sce, x = "Sample", y = "sum",  colour_by = "genotype", other_fields = "clusters_named") +
 scale_y_log10(breaks = scales::pretty_breaks(n = 12))+
  ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
  scale_color_manual(values = col_wt_ko)+ facet_wrap(~clusters_named)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

#mt
plotColData(sce, x = "Sample", y = "subsets_mt_percent",  colour_by = "genotype", other_fields = "clusters_named") +
 scale_y_log10(breaks = scales::pretty_breaks(n = 12))+
  ggtitle("Mt. percentage") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
  scale_color_manual(values = col_wt_ko)+ facet_wrap(~clusters_named)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 6 rows containing missing values (geom_point).

hist(
  sce$detected,
  breaks = 100
)

hist(
  sce$sum,
  breaks = 100
)

hist(
  sce$subsets_mt_percent,
  breaks = 100
)

Scatter plots

plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "genotype", other_fields = "clusters_named") + facet_wrap(~clusters_named) + 
  scale_color_manual(values = col_wt_ko)

plotColData(sce, x = "sum", y = "detected", colour_by = "genotype", other_fields = "clusters_named") + facet_wrap(~clusters_named) + 
  scale_color_manual(values = col_wt_ko)
plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "Sample", other_fields = "clusters_named") + facet_wrap(~clusters_named) + 
  scale_color_manual(values = cols)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

plotColData(sce, x = "sum", y = "detected", colour_by = "Sample", other_fields = "clusters_named") + facet_wrap(~clusters_named) + 
  scale_color_manual(values = cols)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Sample WT3_2 and WT2_1 are lower quality. When deciding the thresholds they will be ignored.

We set the thresholds for each celltype:

Sum

OPCs 6000 -na

oligo 6000- exept olito 3 -

Astrocyte 4000

Microglia and immune 3000

vascular 3000

Ependymocytes&NRPs 2000

mature_neurons 2000

Gran&Mono 2000

ChP_epithelial 4000

Mt

all 15%

# create vector with same levels as the clusters_named to replace by thresholds for each
sce$clusters_named <- droplevels(sce$clusters_named)
sce$threshold_umi <- (sce$clusters_named)
#to get the order:
#dput(levels(sce$threshold_umi))
#c("Astro1", "Astro2", "Oligo1", "Astro3", "Vascular", "OPC", "Microglia1", "Immune", "Oligo2", "mNeurons", "Ependymocytes&NRPs", "Gran&Mono", "Oligo3", "ChP epithelia")

# replace 
levels(sce$threshold_umi) <- c(4000, 4000, 6000, 4000, 3000, 6000, 
3000, 3000, 6000, 2000, 2000, 
2000, 0, 4000)


# transform the factor into numeric
sce$threshold_umi <- as.numeric(as.character(sce$threshold_umi))

# set thresholds
discard_umi <- sce$sum < sce$threshold_umi
discard_mt <- sce$subsets_mt_percent > 15
discard <- discard_umi | discard_mt

# subset
sce <- sce[,!discard]

We discard 2666 cells due to umi counts, and 1122 cells for high mt percentatge.

in total we discard 3063 cells

Gene QC

In the preliminary analysis we only deleted genes that had less than 2 cells expressing it, now we filter all genes with less than 10 cells expressing it

# at least 10 cells should express the gene
keep_feature <- rowSums(counts(sce) > 0) > 10
sce <- sce[keep_feature,]

We keep 19148 genes

if(!file.exists(here("processed", project,  "sce_QC_02.RDS"))){
  saveRDS(sce, here("processed", project,  "sce_QC_02.RDS"))
  }

Final dimension is 19148, 15418