library(scran) # for scDE
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Loading required package: scuttle
library(scater) # for aggregate counts
## Loading required package: ggplot2
library(edgeR) #for De
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:scater':
##
## plotMDS
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
##
## Attaching package: 'edgeR'
## The following object is masked from 'package:SingleCellExperiment':
##
## cpm
library(here) # reproducible paths
## here() starts at U:/Datastore/CMVM/scs/groups/jpriller-GROUP/scRNAseq/R-analysis-vold
project <- "fire-mice"
sce <- readRDS(here("processed", "fire-mice", "sce_anno_02.RDS"))
sce <- sce[,sce$clusters_named %in% c("OPC1", "OPC2")]
sce$clusters_named <- droplevels(sce$clusters_named)
Sum the counts
summed <- aggregateAcrossCells(sce,
id=colData(sce)[,c("Sample", "clusters_named")])
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <- sample_cluster
# Filter genes with specific function from edgeR
keep <- filterByExpr(dge_summed, group=summed$clusters_named)
dge_summed <- dge_summed[keep,]
summary(keep)
## Mode FALSE TRUE
## logical 11687 7461
edgeR allows to add a design matrix, with the batch as a covariate, to account for batch differences in the differential expression
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
plotMD(dge_summed, column=i)
}
Artefacts at low counts are caused by samples with low number of
cells.
#evaluate where to cut.
View(as.data.frame(colData(summed)))
We will filter all samples with less than 10 cells. WTvo1_1_OPC1 is kept with 12 cells.
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(summed$ncells >= 10, "darkgreen", "red"), main = "filter out")
summed <- summed[, summed$ncells >= 10]
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <- sample_cluster
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
plotMD(dge_summed, column=i)
}
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"), main = "genotype")
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$clusters_named == "OPC1", "darkblue", "darkred"), main = "cluster")
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$chip == "7", "orange", "green"),
main = "chip")
# Build teh design
design <- model.matrix(~factor(chip) + factor(clusters_named), dge_summed$samples)
# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)
# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))
# Add the ambient info
de_result_ambient_all <- readRDS(here("processed",project, "DE_results_ambient_edgeR_.RDS"))
oligo_ambient <- de_result_ambient_all$mOligo1[c("maxAmbient","microgliaAmbient", "minAmbient")]
de_results_ambient <- cbind(as.data.frame(topTags(de_results, n = length(rownames(de_results)), sort.by = "none")), oligo_ambient)
# save results
write.csv(de_results_ambient, here("outs", project, "DE_edgeR", "de_OPC2vsOPC1_edgeR.csv"))
saveRDS(de_results, here("processed", project, "DE_OPC_edgeR_de_results.RDS"))
Output:
LogFC is the log fold-change, which is the log difference between both groups
LogCPM are the log counts per million, which can be understood as measuring expression level.
F is the F-statistic from the quasi-likelihood F-test.
PValue is the nominal p-value derived from F without any multiple testing correction
FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.