set-up

library(scran) # for scDE
library(scater) # for aggregate counts
library(edgeR) #for De
library(here) # reproducible paths
project <- "fire-mice"
sce <- readRDS(here("processed", "fire-mice", "sce_anno_02.RDS")) 

Oligo 1 vs Oligo 3

sce <- sce[,sce$clusters_named %in% c("mOligo1", "mOligo3")]
sce$clusters_named <- droplevels(sce$clusters_named)

Pre processed

Sum the counts

summed <- aggregateAcrossCells(sce, 
    id=colData(sce)[,c("Sample", "clusters_named")])
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <-  sample_cluster
# Filter genes with specific function from edgeR
keep <- filterByExpr(dge_summed, group=summed$clusters_named)
dge_summed <- dge_summed[keep,]
summary(keep)
##    Mode   FALSE    TRUE 
## logical   13750    5398

Run edgeR pipeline

Samples with low number of cells are not kept.

#evaluate where to cut.
View(as.data.frame(colData(summed)))

We will filter all samples with less than 10 cells.

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(summed$ncells >= 10, "darkgreen", "red"), main = "filter out")

# only oligo3
plotMDS(cpm(dge_summed[,summed$clusters_named == "mOligo3"], log=TRUE), 
    col=ifelse(summed[,summed$clusters_named == "mOligo3"]$ncells >= 10, "darkgreen", "red"), main = "filter out only oligo3")

summed <- summed[, summed$ncells >= 10]
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <-  sample_cluster
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
    plotMD(dge_summed, column=i)
}

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"), main = "genotype")

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$clusters_named == "mOligo1", "darkblue", "darkred"), main = "cluster")

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$chip == "7", "orange", "green"),
    main = "chip")

# Build teh design
design <- model.matrix(~factor(chip) + factor(clusters_named), dge_summed$samples)

# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)

# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))

# save results
write.csv(topTags(de_results, n = 500), here("outs", project, "DE_edgeR", "de_mOligo3vsmOligo1_edgeR.csv"))

saveRDS(de_results, here("processed", project, "DE_oligo3_edgeR_de_results.RDS"))

DE oligo3 vs 2

sce <- readRDS(here("processed", "fire-mice", "sce_anno_02.RDS")) 
sce <- sce[,sce$clusters_named %in% c("mOligo3", "mOligo2")]
sce$clusters_named <- droplevels(sce$clusters_named)

Pre processed

Sum the counts

summed <- aggregateAcrossCells(sce, 
    id=colData(sce)[,c("Sample", "clusters_named")])

Run edgeR pipeline

Samples with low number of cells are not kept.

summed <- summed[, summed$ncells >= 15]
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <-  sample_cluster
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
    plotMD(dge_summed, column=i)
}

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"), main = "genotype")

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$clusters_named == "mOligo2", "darkblue", "darkred"), main = "cluster")

plotMDS(cpm(dge_summed, log=TRUE), 
    col=ifelse(dge_summed$samples$chip == "7", "orange", "green"),
    main = "chip")

# Build teh design
design <- model.matrix(~factor(chip) + factor(clusters_named), dge_summed$samples)

# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)

# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))

# save results
write.csv(topTags(de_results, n = 500), here("outs", project, "DE_edgeR", "de_mOligo3vsmOligo2_edgeR.csv"))

saveRDS(de_results, here("processed", project, "DE_oligo3vs2_edgeR_de_results.RDS"))

Output:

LogFC is the log fold-change, which is the log difference between both groups

LogCPM are the log counts per million, which can be understood as measuring expression level.

F is the F-statistic from the quasi-likelihood F-test.

PValue is the nominal p-value derived from F without any multiple testing correction

FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.