library(scran) # for scDE
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Loading required package: scuttle
library(scater) # for aggregate counts
## Loading required package: ggplot2
library(edgeR) #for De
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:scater':
##
## plotMDS
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
##
## Attaching package: 'edgeR'
## The following object is masked from 'package:SingleCellExperiment':
##
## cpm
library(here) # reproducible paths
## here() starts at U:/Datastore/CMVM/scs/groups/jpriller-GROUP/scRNAseq/R-analysis-VM
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_oligo_clusters_01.RDS"))
We perform the DE analysis separately for each label to identify cell type-specific transcriptional effects of KO condition. The actual DE testing is performed on “pseudo-bulk” expression profiles (Tung et al. 2017), generated by summing counts together for all cells with the same combination of label and sample. This leverages the resolution offered by single-cell technologies to define the labels, and combines it with the statistical rigor of existing methods for DE analyses involving a small number of samples.
# each column represents unique combination for cluster_oligo and each sample replicate
summed <- aggregateAcrossCells(sce,
id=colData(sce)[,c("cluster_oligo", "Sample")]) ### change cluster_oligo by whatever resolution
summed
## class: SingleCellExperiment
## dim: 18833 29
## metadata(3): merge.info pca.info Samples
## assays(1): counts
## rownames(18833): Xkr4 Gm19938 ... CAAA01147332.1 AC149090.1
## rowData names(7): rotation ID ... gene_name subset
## colnames: NULL
## colData names(34): batch Sample ... Sample ncells
## reducedDimNames(4): corrected PCA TSNE TSNE_uncorrected
## mainExpName: NULL
## altExpNames(0):
We first filter all the combinations sample-label that resulted in less than 10 cells. These are saved in /outs/fire-mice/DE_oligo_cluster/fail_min_cells_DE.csv
# Removing all pseudo-bulk samples with 'insufficient' cells.
summed_filt <- summed[, summed$ncells >= 10]
# and saving to file which one there are
fail_n_cells <- colData(summed)[ colData(summed)$ncells < 10, c("Sample", "mouse", "genotype", "tissue", "chip", "cluster_oligo", "ncells")]
write.csv(fail_n_cells, here("outs", project,"DE_oligo_cluster", "fail_min_cells_DE.csv"), row.names = FALSE)
We then compute the DE with edgeR, one of the best methods according to (Soneson et al. 2018). The results are saved in /outs/fire-mice/DE_oligo_cluster/de_results
Output:
LogFC is the log fold-change, which is the log difference between both groups
LogCPM are the log counts per million, which can be understood as measuring expression level.
F is the F-statistic from the quasi-likelihood F-test.
PValue is the nominal p-value derived from F without any multiple testing correction
FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.
If the comparison was not possible for a particular cluster, most commonly due to lack of residual degrees of freedom ( e.g. an absence of enough samples from both conditions) it is listed in /outs/fire-mice/DE_oligo_cluster/fail_min_cells_DE.csv
# Reordering the genotype factor, treated should be second level
summed_filt$genotype <- factor(summed_filt$genotype, levels = c("WT", "KO"))
# run de
de_results <- pseudoBulkDGE(summed_filt,
label=summed_filt$cluster_oligo,
design= ~factor(chip) + genotype,
coef="genotypeKO",
condition=summed_filt$genotype
)
# save result
saveRDS(de_results, here("processed",project, "DE_oligo_cluster_de_results.RDS"))
lapply(names(de_results), function(x){
de_results_clu <- de_results[[x]]
write.csv(de_results_clu, here("outs", project, "DE_oligo_cluster", "de_results", paste0("de_results_", x, ".csv")), quote = FALSE)
}
)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
# save fail
write.csv(metadata(de_results), here("outs", project,"DE_oligo_cluster", "fail_contrast_DE.csv"), row.names = FALSE)
For each one of the labels a list is produced with the DE genes at a FDR of 5%. Summaries of these results are saved in /outs/fire-mice/DE_oligo_cluster/
Values of 1, -1 and 0 indicate that the gene is significantly upregulated, downregulated or not significant, respectively. Genes listed as NA were either filtered out as low-abundance genes for a given label’s analysis, or the comparison of interest was not possible for that particular label.
de_filtered_genecounts.csv contains a small summary with the number of genes DE for each one of the clusters.
de_filtered_genes.csv contains the list of genes DE.
is_de <- decideTestsPerLabel(de_results, threshold=0.05)
summary_de <- summarizeTestsPerLabel(is_de)
# virtually 0 or NA we do not care, we want only + or -
is_de_0 <- is_de
is_de_0[is.na(is_de_0)]<- 0
# but filter the original object to get them back
is_de_flt <- is_de[(rowSums(abs(is_de_0)) != 0), ]
write.csv(summary_de, here("outs", project,"DE_oligo_cluster", "de_filtered_genecounts.csv"))
write.csv(is_de_flt, here("outs", project, "DE_oligo_cluster", "de_filtered_genes.csv" ))
Saved in /outs/fire-mice/DE_oligo_cluster/de_filtered_plots
is_de_genes <- rownames(is_de_flt)
for( gene in is_de_genes){
pdf(file = here("outs", project, "DE_oligo_cluster", "de_filtered_plots", paste0(gene, ".pdf")))
plot <- plotExpression(logNormCounts(sce),
features=gene,
x="genotype", colour_by="genotype",
other_fields="cluster_oligo") +
facet_wrap(~cluster_oligo) +
scale_color_manual(values = c("#666666", "#E25822")) +
ggtitle(gene)
print(plot)
dev.off()
}