Differential Expressino OPC clusters

set-up

library(scran) # for scDE

## Loading required package: SingleCellExperiment

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## Loading required package: scuttle

library(scater) # for aggregate counts

## Loading required package: ggplot2

library(edgeR) #for De

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:scater':
## 
##     plotMDS

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

## 
## Attaching package: 'edgeR'

## The following object is masked from 'package:SingleCellExperiment':
## 
##     cpm

library(here) # reproducible paths

## here() starts at U:/Datastore/CMVM/scs/groups/jpriller-GROUP/scRNAseq/R-analysis-vold

project <- "fire-mice"
sce <- readRDS(here("processed", "fire-mice", "sce_anno_02.RDS"))

OPC 1 vs OPC 2

sce <- sce[,sce$clusters_named %in% c("OPC1", "OPC2")]
sce$clusters_named <- droplevels(sce$clusters_named)

Pre processed

Sum the counts

summed <- aggregateAcrossCells(sce, 
    id=colData(sce)[,c("Sample", "clusters_named")])

# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <-  sample_cluster
# Filter genes with specific function from edgeR
keep <- filterByExpr(dge_summed, group=summed$clusters_named)
dge_summed <- dge_summed[keep,]
summary(keep)

##    Mode   FALSE    TRUE 
## logical   11687    7461

Run edgeR pipeline

edgeR allows to add a design matrix, with the batch as a covariate, to account for batch differences in the differential expression

dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
    plotMD(dge_summed, column=i)
}

Artefacts at low counts are caused by samples with low number of cells.

#evaluate where to cut.
View(as.data.frame(colData(summed)))

We will filter all samples with less than 10 cells. WTvo1_1_OPC1 is kept with 12 cells.

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(summed$ncells >= 10, "darkgreen", "red"), main = "filter out")

summed <- summed[, summed$ncells >= 10]
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <-  sample_cluster
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
    plotMD(dge_summed, column=i)
}

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"), main = "genotype")

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$clusters_named == "OPC1", "darkblue", "darkred"), main = "cluster")

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$chip == "7", "orange", "green"),
    main = "chip")

# Build teh design
design <- model.matrix(~factor(chip) + factor(clusters_named), dge_summed$samples)

# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)

# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))

# Add the ambient info
de_result_ambient_all <- readRDS(here("processed",project, "DE_results_ambient_edgeR_.RDS"))
oligo_ambient <- de_result_ambient_all$mOligo1[c("maxAmbient","microgliaAmbient", "minAmbient")]
de_results_ambient <- cbind(as.data.frame(topTags(de_results, n = length(rownames(de_results)), sort.by = "none")), oligo_ambient)

# save results
write.csv(de_results_ambient, here("outs", project, "DE_edgeR", "de_OPC2vsOPC1_edgeR.csv"))

saveRDS(de_results, here("processed", project, "DE_OPC_edgeR_de_results.RDS"))

Output:

LogFC is the log fold-change, which is the log difference between both groups

LogCPM are the log counts per million, which can be understood as measuring expression level.

F is the F-statistic from the quasi-likelihood F-test.

PValue is the nominal p-value derived from F without any multiple testing correction

FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.

Differential Expressino OPC clusters

Nadine Bestard

2023-01-20

set-up

OPC 1 vs OPC 2

Pre processed

Run edgeR pipeline