library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qcs
library(dplyr) # df manipulation
library(scran) # For feature selcetion
library(ggplot2) # To add titles to plots
library(batchelor) # Batch correction
We import the dataset and subset for the clusters that had highest expression of Plp1 (they also had higher expression of other typical oligo genes such as Mog, Mag, Mbp). This includes clusters that also express genes from other celltypes, such as the OligoAstros, that express Astrocyte markers; a cluster that express immune cell markers and another that also expresses OPC markers.
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_clusters_01.RDS"))
sce <- sce[, sce$cluster_k60 %in% c(1,6,5,7, 12, 15, 16)]
We need to bear in mind this dataset comes from a bigger dataset where outliers have already been excluded.
plotColData(sce, x = "Sample", y = "sum") +
ggtitle("Total count")
plotColData(sce, x = "Sample", y = "sum") +
scale_y_log10() + ggtitle("Total count log scale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plotColData(sce, x = "Sample", y = "detected") +
scale_y_log10() + ggtitle("Detected Genes") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
scale_y_log10() + ggtitle("total count by batch") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plotColData(sce, x = "Sample", y = "sum", colour_by = "tissue") +
scale_y_log10() + ggtitle("total count by tissue") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plotColData(sce, x = "Sample", y = "sum", colour_by = "genotype") +
scale_y_log10() + ggtitle("total count by genotype") +
scale_colour_manual(values = c("#E25822", "#888888")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
plotColData(sce, x = "Sample", y = "subsets_mt_percent") +
ggtitle("Mitocchondrial percentatge") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
hist(
sce$detected,
breaks = 100
)
hist(
sce$subsets_mt_percent,
breaks = 100
)
plotColData(sce, x = "sum", y = "detected", colour_by = "Sample")
plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "Sample")
plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "chip")
plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "genotype")
I do not trust the big red blob from sample 2660, cutting at minimum 5000 umi counts will get rid of it. We can also cut at 10% mt genes.
This is the dimensional reduction done with the whole dataset, not as accurate as the one we will compute later, with only the oligos, but good enough for this quality control
plotTSNE(sce, colour_by = "chip", point_size = 0.2)
plotTSNE(sce, colour_by = "detected", point_size = 0.2)
plotTSNE(sce, colour_by = "sum", point_size = 0.2)
plotTSNE(sce, colour_by = "subsets_mt_percent", point_size = 0.2)
There is a small cluster, lower quality (lower umi and detected genes and higer mt genes pct) than the other clusters. We will delete this cluster.
table(sce$Sample, sce$cluster_k60) %>%
as.data.frame.matrix() %>%
select(1,6,5,7, 12, 15, 16)
## 1 6 5 7 12 15 16
## EKDL2651 153 93 2019 74 409 440 61
## EKDL2653 218 75 2311 107 539 337 1
## EKDL2654 14 47 1038 35 333 150 23
## EKDL2656 30 40 802 49 298 99 0
## EKDL2657 27 25 832 46 228 134 9
## EKDL2659 79 65 1592 98 378 208 0
## EKDL2660 59 108 1379 74 279 161 40
## EKDL2662 2 43 727 24 231 189 1
table(sce$genotype, sce$cluster_k60) %>%
as.data.frame.matrix() %>%
select(1,6,5,7, 12, 15, 16)
## 1 6 5 7 12 15 16
## KO 329 223 5432 278 1446 833 2
## WT 253 273 5268 229 1249 885 133
The small cluster only present in the WT also has microglia markers. Most lickeley it comes from microglia “eating” oligodendrocytes. We will remove this cluster.
if (!file.exists(here("processed", project, "sce_oligo.RDS"))) {
print("before filtering")
dim(sce)
# filter cells
keep_cells <- sce$sum >= 5000 &
sce$detected >= 2000 &
sce$subsets_mt_percent <= 10 &
!(sce$cluster_k60 %in% c(15, 16))
sce <- sce[, keep_cells ]
# filter genes
keep_feature <- rowSums(counts(sce) > 0) > 1
sce <- sce[keep_feature, ]
print("after filtering")
dim(sce)
# save results
saveRDS(sce, here("processed", project, "sce_oligo.RDS") )
}
## [1] "before filtering"
## [1] "after filtering"
before filtering: 23155 genes 16358 cells
after filtering : 19506 genes 13583 cells
result:
if (!file.exists(here("processed", project, "sce_oligo.RDS"))) {
plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
scale_y_log10() + ggtitle("total count by batch after filering") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}else{
sce <- readRDS( here("processed", project, "sce_oligo.RDS") )
plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
scale_y_log10() + ggtitle("total count by batch after filering") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}
We quantify per-gene variation computing the variance of the log-normalized expression values (referred to as “log-counts” for simplicity) for each gene across all cells in the population (A. T. L. Lun, McCarthy, and Marioni 2016). We use modelGeneVar() that does also corrects for the abundance of each gene.
#The density weights are removed because the genes
# with highest mean abundance are also HVG, this avoids overfiting
gene_var_df <- modelGeneVar(sce, density.weights=FALSE )
gene_var <- metadata(gene_var_df)
plot(gene_var$mean, gene_var$var, xlab= "Mean of log-expression", ylab= "Variance of log-expression")
curve(gene_var$trend(x), lwd=2, add=T, col = "red")
The next step is to select the subset of HVGs to use in downstream analyses. The simplest HVG selection strategy is to take the top X genes with the largest values for the relevant variance metric. Here I select the top 15 % of genes.
hvgs <- getTopHVGs(gene_var_df, prop=0.15)
# save them in the object
rowSubset(sce) <- hvgs
This leaves us with 1374 highly variable genes.
plot(gene_var$mean, gene_var$var, xlab= "Mean of log-expression", ylab= "Variance of log-expression")
points(gene_var$mean[hvgs], gene_var$var[hvgs], col = "red")
curve(gene_var$trend(x), lwd=2, add=T, col = "red")
Here we recompute the dimensional reduction to better fit our subsetted oligo data. This will remove the dimensional batch correction performed earlier, that will be recomputed.
# Delete all previous dimensions, done with all genes, to not use the wrong ones
reducedDims(sce) <- NULL
# re-run pca
set.seed(1000)
sce <- runPCA(sce)
pct_var <- attr(reducedDim(sce, "PCA"), "percentVar")
plot(pct_var, log="y", xlab="PC", ylab="pct variance explained")
# rerun PCA
reducedDim(sce, "PCA") <- reducedDim(sce, "PCA")[,1:25]
# rerun TSNE if it's not already done
if (!(file.exists(
here("processed", project, "sce_oligo_corrected.RDS")
))) {
set.seed(1000)
sce <- runTSNE(sce)
plotReducedDim(sce, "TSNE", colour_by = "chip")
}else{ # load teh corrected object
sce <- readRDS(here("processed", project, "sce_oligo_corrected.RDS"))
plotReducedDim(sce, "TSNE_uncorrected")
}
if (!(file.exists(
here("processed", project, "sce_oligo_corrected.RDS")
))) {
set.seed(100)
sce <- correctExperiments(sce,
batch = factor(sce$chip),
subset.row = rowSubset(sce),
correct.all=TRUE,
PARAM = FastMnnParam(
merge.order =
list(list("3","5"), list("4","6")),
d = 25,
prop.k=0.10
)
)
# recompute dimensional reduction
#keeping the previous dimensional reduction
reducedDim(sce, "TSNE_uncorrected") <- reducedDim(sce, "TSNE")
set.seed(100)
sce <- runTSNE(sce, dimred="corrected")
# save
saveRDS( sce, here("processed", project, "sce_oligo_corrected.RDS"))
}
## Warning in .eliminate_overlaps(assayNames(merged), assayNames(original), :
## ignoring 'assays' with same name as 'batchCorrect' output
## Warning in .eliminate_overlaps(colnames(colData(merged)),
## colnames(colData(original)), : ignoring 'colData' fields with same name as
## 'batchCorrect' output
## Warning in .eliminate_overlaps(colnames(rowData(merged)),
## colnames(rowData(original)), : ignoring 'rowData' fields with same name as
## 'batchCorrect' output
## Warning in .eliminate_overlaps(names(metadata(merged)),
## names(metadata(original)), : ignoring 'metadata' with same name as
## 'batchCorrect' output
plotReducedDim(sce, colour_by= "chip", dimred = "TSNE") +
ggtitle("TSNE dimensional reduction corrected")