library(scran) # for scDE
library(scater) # for aggregate counts
library(edgeR) #for De
library(here) # reproducible paths
project <- "fire-mice"
sce <- readRDS(here("processed", "fire-mice", "sce_anno_02.RDS"))
sce <- sce[,sce$clusters_named %in% c("mOligo1", "mOligo3")]
sce$clusters_named <- droplevels(sce$clusters_named)
Sum the counts
summed <- aggregateAcrossCells(sce,
id=colData(sce)[,c("Sample", "clusters_named")])
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <- sample_cluster
# Filter genes with specific function from edgeR
keep <- filterByExpr(dge_summed, group=summed$clusters_named)
dge_summed <- dge_summed[keep,]
summary(keep)
## Mode FALSE TRUE
## logical 13750 5398
Samples with low number of cells are not kept.
#evaluate where to cut.
View(as.data.frame(colData(summed)))
We will filter all samples with less than 10 cells.
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(summed$ncells >= 10, "darkgreen", "red"), main = "filter out")
# only oligo3
plotMDS(cpm(dge_summed[,summed$clusters_named == "mOligo3"], log=TRUE),
col=ifelse(summed[,summed$clusters_named == "mOligo3"]$ncells >= 10, "darkgreen", "red"), main = "filter out only oligo3")
summed <- summed[, summed$ncells >= 10]
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <- sample_cluster
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
plotMD(dge_summed, column=i)
}
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"), main = "genotype")
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$clusters_named == "mOligo1", "darkblue", "darkred"), main = "cluster")
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$chip == "7", "orange", "green"),
main = "chip")
# Build teh design
design <- model.matrix(~factor(chip) + factor(clusters_named), dge_summed$samples)
# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)
# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))
# save results
write.csv(topTags(de_results, n = 500), here("outs", project, "DE_edgeR", "de_mOligo3vsmOligo1_edgeR.csv"))
saveRDS(de_results, here("processed", project, "DE_oligo3_edgeR_de_results.RDS"))
sce <- readRDS(here("processed", "fire-mice", "sce_anno_02.RDS"))
sce <- sce[,sce$clusters_named %in% c("mOligo3", "mOligo2")]
sce$clusters_named <- droplevels(sce$clusters_named)
Sum the counts
summed <- aggregateAcrossCells(sce,
id=colData(sce)[,c("Sample", "clusters_named")])
Samples with low number of cells are not kept.
summed <- summed[, summed$ncells >= 15]
# create DGElist
dge_summed <- DGEList(counts(summed), samples=colData(summed))
# add names
sample_cluster <- paste0(dge_summed$samples$Sample, "_", dge_summed$samples$clusters_named)
rownames(dge_summed$samples) <- sample_cluster
colnames(dge_summed$counts) <- sample_cluster
dge_summed <- calcNormFactors(dge_summed)
par(mfrow=c(2,4))
for (i in seq_len(ncol(dge_summed))) {
plotMD(dge_summed, column=i)
}
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$genotype == "KO", "red", "blue"), main = "genotype")
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$clusters_named == "mOligo2", "darkblue", "darkred"), main = "cluster")
plotMDS(cpm(dge_summed, log=TRUE),
col=ifelse(dge_summed$samples$chip == "7", "orange", "green"),
main = "chip")
# Build teh design
design <- model.matrix(~factor(chip) + factor(clusters_named), dge_summed$samples)
# estimate dispersions
dge_summed <- estimateDisp(dge_summed, design)
fit <- glmQLFit(dge_summed, design, robust=TRUE)
# Run DE
de_results <- glmQLFTest(fit, coef=ncol(design))
# save results
write.csv(topTags(de_results, n = 500), here("outs", project, "DE_edgeR", "de_mOligo3vsmOligo2_edgeR.csv"))
saveRDS(de_results, here("processed", project, "DE_oligo3vs2_edgeR_de_results.RDS"))
Output:
LogFC is the log fold-change, which is the log difference between both groups
LogCPM are the log counts per million, which can be understood as measuring expression level.
F is the F-statistic from the quasi-likelihood F-test.
PValue is the nominal p-value derived from F without any multiple testing correction
FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.