library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qcs
library(ggplot2) # To add titles to plots
library(patchwork) # agregate plots
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_clusterQC.RDS"))
source(here("src/colours.R"))
The object has 23012 genes and 18481 cells.
Previous thresholds were:
| lib_size_high | expression_high | lib_size_low | expression_low | mt_pct | total | |
| Cells filtered | 1561 | 96 | 0 | 116 | 5618 | 7291 |
| Threshold | 17346.98 | 6293.816 | 156.1676 | 108.9684 | 31.10665 | NA |
Split by sample
plotColData(sce, x = "Sample", y = "subsets_mt_percent") +
scale_y_log10() + ggtitle("mt genes") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + scale_color_manual(values = cols)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Removed 6 rows containing missing values (geom_point).
plotColData(sce, x = "Sample", y = "detected") +
scale_y_log10() + ggtitle("Detected genes log scale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + scale_color_manual(values = cols)
plotColData(sce, x = "Sample", y = "sum") +
scale_y_log10() + ggtitle("Total count log scale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + scale_color_manual(values = cols)
Split by sample and cluster, facet wrap
# detected
plotColData(sce, x = "Sample", y = "detected", colour_by = "genotype", other_fields = "clusters_named") +
scale_y_log10(breaks = scales::pretty_breaks(n = 12)) +
ggtitle("Detected genes log scale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + scale_color_manual(values = col_wt_ko) + facet_wrap(~clusters_named)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
# sum
plotColData(sce, x = "Sample", y = "sum", colour_by = "genotype", other_fields = "clusters_named") +
scale_y_log10(breaks = scales::pretty_breaks(n = 12))+
ggtitle("Total count log scale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_color_manual(values = col_wt_ko)+ facet_wrap(~clusters_named)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
#mt
plotColData(sce, x = "Sample", y = "subsets_mt_percent", colour_by = "genotype", other_fields = "clusters_named") +
scale_y_log10(breaks = scales::pretty_breaks(n = 12))+
ggtitle("Mt. percentage") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_color_manual(values = col_wt_ko)+ facet_wrap(~clusters_named)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 6 rows containing missing values (geom_point).
hist(
sce$detected,
breaks = 100
)
hist(
sce$sum,
breaks = 100
)
hist(
sce$subsets_mt_percent,
breaks = 100
)
plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "genotype", other_fields = "clusters_named") + facet_wrap(~clusters_named) +
scale_color_manual(values = col_wt_ko)
plotColData(sce, x = "sum", y = "detected", colour_by = "genotype", other_fields = "clusters_named") + facet_wrap(~clusters_named) +
scale_color_manual(values = col_wt_ko)
plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "Sample", other_fields = "clusters_named") + facet_wrap(~clusters_named) +
scale_color_manual(values = cols)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
plotColData(sce, x = "sum", y = "detected", colour_by = "Sample", other_fields = "clusters_named") + facet_wrap(~clusters_named) +
scale_color_manual(values = cols)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
Sample WT3_2 and WT2_1 are lower quality. When deciding the thresholds they will be ignored.
We set the thresholds for each celltype:
Sum
OPCs 6000 -na
oligo 6000- exept olito 3 -
Astrocyte 4000
Microglia and immune 3000
vascular 3000
Ependymocytes&NRPs 2000
mature_neurons 2000
Gran&Mono 2000
ChP_epithelial 4000
Mt
all 15%
# create vector with same levels as the clusters_named to replace by thresholds for each
sce$clusters_named <- droplevels(sce$clusters_named)
sce$threshold_umi <- (sce$clusters_named)
#to get the order:
#dput(levels(sce$threshold_umi))
#c("Astro1", "Astro2", "Oligo1", "Astro3", "Vascular", "OPC", "Microglia1", "Immune", "Oligo2", "mNeurons", "Ependymocytes&NRPs", "Gran&Mono", "Oligo3", "ChP epithelia")
# replace
levels(sce$threshold_umi) <- c(4000, 4000, 6000, 4000, 3000, 6000,
3000, 3000, 6000, 2000, 2000,
2000, 0, 4000)
# transform the factor into numeric
sce$threshold_umi <- as.numeric(as.character(sce$threshold_umi))
# set thresholds
discard_umi <- sce$sum < sce$threshold_umi
discard_mt <- sce$subsets_mt_percent > 15
discard <- discard_umi | discard_mt
# subset
sce <- sce[,!discard]
We discard 2666 cells due to umi counts, and 1122 cells for high mt percentatge.
in total we discard 3063 cells
In the preliminary analysis we only deleted genes that had less than 2 cells expressing it, now we filter all genes with less than 10 cells expressing it
# at least 10 cells should express the gene
keep_feature <- rowSums(counts(sce) > 0) > 10
sce <- sce[keep_feature,]
We keep 19148 genes
if(!file.exists(here("processed", project, "sce_QC_02.RDS"))){
saveRDS(sce, here("processed", project, "sce_QC_02.RDS"))
}
Final dimension is 19148, 15418