Set-up

library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qcs
library(dplyr) # df manipulation
library(scran) # For feature selcetion
library(ggplot2) # To add titles to plots
library(batchelor) # Batch correction

Import

We import the dataset and subset for the clusters that had highest expression of Plp1 (they also had higher expression of other typical oligo genes such as Mog, Mag, Mbp). This includes clusters that also express genes from other celltypes, such as the OligoAstros, that express Astrocyte markers; a cluster that express immune cell markers and another that also expresses OPC markers.

project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_clusters_01.RDS"))
sce <- sce[, sce$cluster_k60 %in% c(1,6,5,7, 12, 15, 16)]

Cell and gene QC

We need to bear in mind this dataset comes from a bigger dataset where outliers have already been excluded.

Violin plots

plotColData(sce, x = "Sample", y = "sum") +
  ggtitle("Total count") 

plotColData(sce, x = "Sample", y = "sum") +
  scale_y_log10() + ggtitle("Total count log scale") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "detected") +
  scale_y_log10() + ggtitle("Detected Genes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
  scale_y_log10() + ggtitle("total count by batch") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "sum", colour_by = "tissue") +
  scale_y_log10() + ggtitle("total count by tissue") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plotColData(sce, x = "Sample", y = "sum", colour_by = "genotype") +
  scale_y_log10() + ggtitle("total count by genotype") +
  scale_colour_manual(values = c("#E25822", "#888888")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

plotColData(sce, x = "Sample", y = "subsets_mt_percent") +
  ggtitle("Mitocchondrial percentatge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Histograms

hist(
  sce$detected,
  breaks = 100
)

hist(
  sce$subsets_mt_percent,
  breaks = 100
)

Scatter plots

plotColData(sce, x = "sum", y = "detected", colour_by = "Sample")

plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "Sample")

plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "chip")

plotColData(sce, x = "sum", y = "subsets_mt_percent", colour_by = "genotype")

I do not trust the big red blob from sample 2660, cutting at minimum 5000 umi counts will get rid of it. We can also cut at 10% mt genes.

Dimensional redution

This is the dimensional reduction done with the whole dataset, not as accurate as the one we will compute later, with only the oligos, but good enough for this quality control

plotTSNE(sce, colour_by = "chip", point_size = 0.2)

plotTSNE(sce, colour_by = "detected", point_size = 0.2)

plotTSNE(sce, colour_by = "sum", point_size = 0.2)

plotTSNE(sce, colour_by = "subsets_mt_percent", point_size = 0.2)

There is a small cluster, lower quality (lower umi and detected genes and higer mt genes pct) than the other clusters. We will delete this cluster.

Tables

  table(sce$Sample, sce$cluster_k60) %>% 
  as.data.frame.matrix() %>% 
  select(1,6,5,7, 12, 15, 16)
##            1   6    5   7  12  15 16
## EKDL2651 153  93 2019  74 409 440 61
## EKDL2653 218  75 2311 107 539 337  1
## EKDL2654  14  47 1038  35 333 150 23
## EKDL2656  30  40  802  49 298  99  0
## EKDL2657  27  25  832  46 228 134  9
## EKDL2659  79  65 1592  98 378 208  0
## EKDL2660  59 108 1379  74 279 161 40
## EKDL2662   2  43  727  24 231 189  1
table(sce$genotype, sce$cluster_k60) %>% 
  as.data.frame.matrix() %>% 
  select(1,6,5,7, 12, 15, 16)
##      1   6    5   7   12  15  16
## KO 329 223 5432 278 1446 833   2
## WT 253 273 5268 229 1249 885 133

The small cluster only present in the WT also has microglia markers. Most lickeley it comes from microglia “eating” oligodendrocytes. We will remove this cluster.

Subset to the best quality cells and delete non detectable genes

if (!file.exists(here("processed", project, "sce_oligo.RDS"))) {
  print("before filtering")
  dim(sce)
  # filter cells
  keep_cells <- sce$sum >= 5000 &
    sce$detected >= 2000 &
    sce$subsets_mt_percent <= 10 &
    !(sce$cluster_k60 %in% c(15, 16))
  sce <- sce[, keep_cells ]
  # filter genes
  keep_feature <- rowSums(counts(sce) > 0) > 1
  sce <- sce[keep_feature, ]
  print("after filtering")
  dim(sce)
  # save results
  saveRDS(sce, here("processed", project, "sce_oligo.RDS") )
}
## [1] "before filtering"
## [1] "after filtering"

before filtering: 23155 genes 16358 cells

after filtering : 19506 genes 13583 cells

result:

if (!file.exists(here("processed", project, "sce_oligo.RDS"))) {
plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
  scale_y_log10() + ggtitle("total count by batch after filering") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}else{
sce <- readRDS( here("processed", project, "sce_oligo.RDS") )
plotColData(sce, x = "Sample", y = "sum", colour_by = "chip") +
  scale_y_log10() + ggtitle("total count by batch after filering") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

Feature selection and dimensional reduction

Quantify per-gene variation

We quantify per-gene variation computing the variance of the log-normalized expression values (referred to as “log-counts” for simplicity) for each gene across all cells in the population (A. T. L. Lun, McCarthy, and Marioni 2016). We use modelGeneVar() that does also corrects for the abundance of each gene.

#The density weights are removed because the genes
# with highest mean abundance are also HVG, this avoids overfiting
gene_var_df <- modelGeneVar(sce, density.weights=FALSE )
gene_var <- metadata(gene_var_df)
plot(gene_var$mean, gene_var$var, xlab= "Mean of log-expression", ylab= "Variance of log-expression")
curve(gene_var$trend(x), lwd=2, add=T, col = "red")

Select the HVGs

The next step is to select the subset of HVGs to use in downstream analyses. The simplest HVG selection strategy is to take the top X genes with the largest values for the relevant variance metric. Here I select the top 15 % of genes.

hvgs <- getTopHVGs(gene_var_df, prop=0.15)
# save them in the object
rowSubset(sce) <- hvgs

This leaves us with 1374 highly variable genes.

plot(gene_var$mean, gene_var$var, xlab= "Mean of log-expression", ylab= "Variance of log-expression")
points(gene_var$mean[hvgs], gene_var$var[hvgs], col = "red")
curve(gene_var$trend(x), lwd=2, add=T, col = "red")

Run PCA and choose PCs

Here we recompute the dimensional reduction to better fit our subsetted oligo data. This will remove the dimensional batch correction performed earlier, that will be recomputed.

# Delete all previous dimensions, done with all genes, to not use the wrong ones
reducedDims(sce) <- NULL
# re-run pca
set.seed(1000)
sce <- runPCA(sce)
pct_var <- attr(reducedDim(sce, "PCA"), "percentVar")
plot(pct_var, log="y", xlab="PC", ylab="pct variance explained")

# rerun PCA
reducedDim(sce, "PCA") <- reducedDim(sce, "PCA")[,1:25]
# rerun TSNE if it's not already done
if (!(file.exists(
  here("processed", project,  "sce_oligo_corrected.RDS")
))) {
set.seed(1000)
sce <- runTSNE(sce)
plotReducedDim(sce, "TSNE", colour_by = "chip")
}else{ # load teh corrected object
  sce <- readRDS(here("processed", project,  "sce_oligo_corrected.RDS"))
  plotReducedDim(sce, "TSNE_uncorrected")
}

Rerun batch correction

if (!(file.exists(
  here("processed", project,  "sce_oligo_corrected.RDS")
))) {
set.seed(100)
sce <- correctExperiments(sce,
  batch = factor(sce$chip),
  subset.row = rowSubset(sce),
  correct.all=TRUE,
  PARAM = FastMnnParam(
  merge.order = 
    list(list("3","5"), list("4","6")),
  d = 25,
  prop.k=0.10
  )
)
# recompute dimensional reduction 
#keeping  the previous dimensional reduction
reducedDim(sce, "TSNE_uncorrected") <- reducedDim(sce, "TSNE")
set.seed(100)
sce <- runTSNE(sce,  dimred="corrected")
# save
saveRDS( sce, here("processed", project,  "sce_oligo_corrected.RDS"))
}
## Warning in .eliminate_overlaps(assayNames(merged), assayNames(original), :
## ignoring 'assays' with same name as 'batchCorrect' output
## Warning in .eliminate_overlaps(colnames(colData(merged)),
## colnames(colData(original)), : ignoring 'colData' fields with same name as
## 'batchCorrect' output
## Warning in .eliminate_overlaps(colnames(rowData(merged)),
## colnames(rowData(original)), : ignoring 'rowData' fields with same name as
## 'batchCorrect' output
## Warning in .eliminate_overlaps(names(metadata(merged)),
## names(metadata(original)), : ignoring 'metadata' with same name as
## 'batchCorrect' output
plotReducedDim(sce, colour_by= "chip", dimred = "TSNE") + 
      ggtitle("TSNE dimensional reduction corrected")