Differential Expression

set-up

library(scran) # for scDE

## Loading required package: SingleCellExperiment

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## Loading required package: scuttle

library(scater) # for aggregate counts

## Loading required package: ggplot2

library(edgeR) #for De

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:scater':
## 
##     plotMDS

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

## 
## Attaching package: 'edgeR'

## The following object is masked from 'package:SingleCellExperiment':
## 
##     cpm

library(here) # reproducible paths

## here() starts at U:/Datastore/CMVM/scs/groups/jpriller-GROUP/scRNAseq/R-analysis-VM

project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_oligo_clusters_01.RDS"))

Use of pseudo-bulk samples

We perform the DE analysis separately for each label to identify cell type-specific transcriptional effects of KO condition. The actual DE testing is performed on “pseudo-bulk” expression profiles (Tung et al. 2017), generated by summing counts together for all cells with the same combination of label and sample. This leverages the resolution offered by single-cell technologies to define the labels, and combines it with the statistical rigor of existing methods for DE analyses involving a small number of samples.

# each column represents unique combination for cluster_oligo and each sample replicate
summed <- aggregateAcrossCells(sce, 
    id=colData(sce)[,c("cluster_oligo", "Sample")]) ### change cluster_oligo by whatever resolution
summed

## class: SingleCellExperiment 
## dim: 18833 29 
## metadata(3): merge.info pca.info Samples
## assays(1): counts
## rownames(18833): Xkr4 Gm19938 ... CAAA01147332.1 AC149090.1
## rowData names(7): rotation ID ... gene_name subset
## colnames: NULL
## colData names(34): batch Sample ... Sample ncells
## reducedDimNames(4): corrected PCA TSNE TSNE_uncorrected
## mainExpName: NULL
## altExpNames(0):

Perform DE

We first filter all the combinations sample-label that resulted in less than 10 cells. These are saved in /outs/fire-mice/DE_oligo_cluster/fail_min_cells_DE.csv

# Removing all pseudo-bulk samples with 'insufficient' cells.
summed_filt <- summed[, summed$ncells >= 10]

# and saving to file which one there are
fail_n_cells  <- colData(summed)[ colData(summed)$ncells < 10, c("Sample", "mouse", "genotype", "tissue", "chip", "cluster_oligo", "ncells")]
write.csv(fail_n_cells, here("outs", project,"DE_oligo_cluster", "fail_min_cells_DE.csv"), row.names = FALSE)

We then compute the DE with edgeR, one of the best methods according to (Soneson et al. 2018). The results are saved in /outs/fire-mice/DE_oligo_cluster/de_results

Output:

LogFC is the log fold-change, which is the log difference between both groups

LogCPM are the log counts per million, which can be understood as measuring expression level.

F is the F-statistic from the quasi-likelihood F-test.

PValue is the nominal p-value derived from F without any multiple testing correction

FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.

If the comparison was not possible for a particular cluster, most commonly due to lack of residual degrees of freedom ( e.g. an absence of enough samples from both conditions) it is listed in /outs/fire-mice/DE_oligo_cluster/fail_min_cells_DE.csv

# Reordering the genotype factor, treated should be second level
summed_filt$genotype <- factor(summed_filt$genotype, levels = c("WT", "KO"))

# run de
de_results <- pseudoBulkDGE(summed_filt, 
    label=summed_filt$cluster_oligo,
    design= ~factor(chip) + genotype,
    coef="genotypeKO",
    condition=summed_filt$genotype 
)

# save result
saveRDS(de_results, here("processed",project, "DE_oligo_cluster_de_results.RDS"))
lapply(names(de_results), function(x){
  de_results_clu <- de_results[[x]]
  write.csv(de_results_clu, here("outs", project, "DE_oligo_cluster", "de_results", paste0("de_results_", x, ".csv")), quote = FALSE)
}
)

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL

# save fail
write.csv(metadata(de_results), here("outs", project,"DE_oligo_cluster", "fail_contrast_DE.csv"), row.names = FALSE)

Filter the results

For each one of the labels a list is produced with the DE genes at a FDR of 5%. Summaries of these results are saved in /outs/fire-mice/DE_oligo_cluster/

Values of 1, -1 and 0 indicate that the gene is significantly upregulated, downregulated or not significant, respectively. Genes listed as NA were either filtered out as low-abundance genes for a given label’s analysis, or the comparison of interest was not possible for that particular label.

de_filtered_genecounts.csv contains a small summary with the number of genes DE for each one of the clusters.

de_filtered_genes.csv contains the list of genes DE.

is_de <- decideTestsPerLabel(de_results, threshold=0.05)
summary_de <- summarizeTestsPerLabel(is_de)

# virtually 0 or NA we do not care, we want only + or -
is_de_0 <- is_de
is_de_0[is.na(is_de_0)]<- 0
# but filter the original object to get them back
is_de_flt <- is_de[(rowSums(abs(is_de_0)) != 0), ]

write.csv(summary_de, here("outs", project,"DE_oligo_cluster", "de_filtered_genecounts.csv"))

write.csv(is_de_flt, here("outs", project, "DE_oligo_cluster", "de_filtered_genes.csv" ))

Plot the results

Saved in /outs/fire-mice/DE_oligo_cluster/de_filtered_plots

is_de_genes <- rownames(is_de_flt)
for( gene in is_de_genes){
pdf(file = here("outs", project, "DE_oligo_cluster", "de_filtered_plots", paste0(gene, ".pdf")))
plot <- plotExpression(logNormCounts(sce),
    features=gene,
    x="genotype", colour_by="genotype",
    other_fields="cluster_oligo") +
    facet_wrap(~cluster_oligo) +
    scale_color_manual(values = c("#666666", "#E25822")) +
    ggtitle(gene)
print(plot)
dev.off()
}