set-up

library(scran) # for scDE
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: scuttle
library(scater) # for aggregate counts
## Loading required package: ggplot2
library(edgeR) #for De
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:scater':
## 
##     plotMDS
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
## 
## Attaching package: 'edgeR'
## The following object is masked from 'package:SingleCellExperiment':
## 
##     cpm
library(here) # reproducible paths
## here() starts at U:/Datastore/CMVM/scs/groups/jpriller-GROUP/scRNAseq/R-analysis-VM
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_oligo_corrected.RDS")) 

Pre processed

Sum the counts

summed <- aggregateAcrossCells(sce, 
    id=colData(sce)[,"Sample"])
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# filter out samples with low number of cells (with such big groups this shouldn't be a problem)
dge_summed <- dge_summed[, summed$ncells >= 10]
# Filter genes with specific function from edgeR
keep <- filterByExpr(dge_summed, group=summed$genotype)
dge_summed <- dge_summed[keep,]
summary(keep)
##    Mode   FALSE    TRUE 
## logical    6559   12274

Run edgeR pipeline

edgeR allows to add a design matrix, with the batch as a covariate, to account for batch differences in the differential expression

dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
    plotMD(dge_summed, column=i)
}

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"))

# Build teh design
# Reordering the genotype factor, treated should be second level
dge_summed$samples$genotype <- factor(dge_summed$samples$genotype, levels = c("WT", "KO"))
design <- model.matrix(~factor(chip) + factor(genotype), dge_summed$samples)

# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)

# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))

# save results
write.csv(topTags(de_results, n = 500), here("outs", project, "DE_oligo_edgeR", "de_oligo_edgeR.csv"))

saveRDS(de_results, here("processed", project, "DE_oligo_edgeR_de_results.RDS"))

Output:

LogFC is the log fold-change, which is the log difference between both groups

LogCPM are the log counts per million, which can be understood as measuring expression level.

F is the F-statistic from the quasi-likelihood F-test.

PValue is the nominal p-value derived from F without any multiple testing correction

FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.