library(scran) # for scDE
library(scater) # for aggregate counts
library(edgeR) #for De
library(here) # reproducible paths
library(DropletUtils) # ambient RNA
library(scuttle) # modify gene names
source(here("src/colours.R"))
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_anno_02.RDS"))
We perform the DE analysis separately for each label to identify cell type-specific transcriptional effects of KO condition. The actual DE testing is performed on “pseudo-bulk” expression profiles (Tung et al. 2017), generated by summing counts together for all cells with the same combination of label and sample. This leverages the resolution offered by single-cell technologies to define the labels, and combines it with the statistical rigor of existing methods for DE analyses involving a small number of samples.
# each column represents unique combination for each cluster and each sample replicate
summed <- aggregateAcrossCells(sce,
id=colData(sce)[,c("clusters_named", "Sample")])
summed
## class: SingleCellExperiment
## dim: 19148 213
## metadata(1): Samples
## assays(1): counts
## rownames(19148): Xkr4 Gm19938 ... CAAA01147332.1 AC149090.1
## rowData names(6): ID Symbol ... gene_name subset
## colnames: NULL
## colData names(41): Sample Barcode ... Sample ncells
## reducedDimNames(5): PCA_coldata PCA PCA_all UMAP TSNE
## mainExpName: NULL
## altExpNames(0):
We first filter out all the combinations sample-label that resulted in less than 10 cells. These are saved in /outs/fire-mice/DE_cluster/fail_min_cells_DE.csv
# Removing all pseudo-bulk samples with 'insufficient' cells.
summed_filt <- summed[, summed$ncells > 15]
# and saving to file which one there are
fail_n_cells <- colData(summed)[ colData(summed)$ncells < 15, c("Sample", "mouse", "genotype", "chip", "clusters_named", "ncells")]
write.csv(fail_n_cells, here("outs", project,"DE_edgeR", "fail_min_cells_DE.csv"), row.names = FALSE)
We then compute the DE with edgeR, one of the best methods according to (Soneson et al. 2018). The results are saved in /outs/fire-mice/DE_edgeR/de_results
Output:
LogFC is the log fold-change, which is the log difference between both groups
LogCPM are the log counts per million, which can be understood as measuring expression level.
F is the F-statistic from the quasi-likelihood F-test.
PValue is the nominal p-value derived from F without any multiple testing correction
FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.
If the comparison was not possible for a particular cluster, most commonly due to lack of residual degrees of freedom ( e.g. an absence of enough samples from both conditions) it is listed in /outs/fire-mice/DE_edgeR/fail_min_cells_DE.csv
# Reordering the genotype factor, treated should be second level
summed_filt$genotype <- factor(summed_filt$genotype, levels = c("WT", "KO"))
# run de
de_results <- pseudoBulkDGE(summed_filt,
label=summed_filt$clusters_named,
design= ~factor(chip) + genotype,
coef="genotypeKO",
condition=summed_filt$genotype
)
#save R objects
dir.create(here("processed",project), showWarnings = FALSE)
saveRDS(de_results, here("processed",project, "DE_results_edgeR.RDS"))
# save failed test
write.csv(metadata(de_results), here("outs", project,"DE_edgeR", "fail_contrast_DE.csv"), row.names = FALSE)
Extracellular RNA (most commonly released upon cell lysis) is captured along with each cell in its reaction chamber, contributing counts to genes that are not otherwise expressed in that cell. This is problematic for DE analyses between conditions, as DEGs detected for a particular cell type may be driven by differences in the ambient profiles rather than any intrinsic change in gene regulation. Here he observed microglia genes are shown as down regulated in cell types were their expression is not expected (such as oligodendrocytes and astrocytes). We will use these known genes as previous knowledge to build an ambient model and remove it from the DE.
The list of microglia genes used are in top 100 of middle-aged microglia markers and significantly down in at least one cluster in young and/or middle-aged datasets.
if (!file.exists(here("processed", project, "max_ambient.RDS"))) {
# import data
# to calculate the ambient the raw data should be used
samples <- dir(here("data/raw/"))
# create empty vector to hold ambient
ambient <- vector("list", length(samples))
names(ambient) <- samples
microglia_genes <- readLines(here("data", "microglia_genes.csv"))
# generate profile for each sample
for(sample in samples){
matrix <- here("data/raw/", sample, "raw_feature_bc_matrix")
sce_sample <- read10xCounts(matrix, sample, version = "auto", col.names = TRUE)
# calculate ambient counts, with good turning FALSE because we will use the counts for further estimations and round false because we already have integers.
ambient[[sample]] <- ambientProfileEmpty(counts(sce_sample), good.turing = FALSE, round = FALSE)
}
# clean up ambient output
ambient <- do.call(cbind, ambient)
# use symbol nomenclature
rownames(ambient) <- uniquifyFeatureNames(
ID=rowData(sce_sample)$ID,
names=rowData(sce_sample)$Symbol
)
# maximum proportion for each gene that could be due to ambient
# I need here to sort out the ambient vecotr, so I can do the 19 clusters at once, repeating n (often 12) times
# the ambient profile. the ambient profile needs to be calculated on groups of cells that are similar: We'll use
# the clusters we run the DEG on, like this it can be added to each of these. It also needs to run per sample (obviously you don't calculate the ambient with another sample), but we can take for each gene the avareage of the proportion obtained in each sample. For this again we will need to sort out the fact I'm doing this for all clusters at once, as we only want to average for each cluster.
# create an aggregate ambient, duplicating the information to match the format of the summed sce
n=0
aggregate_ambient <- matrix(nrow = nrow(ambient), ncol=ncol(summed_filt))
for( sample in summed_filt$Sample){
n = n+1
aggregate_ambient[,n] <- ambient[,sample]
}
# add the gene names again
rownames(aggregate_ambient) <- rownames(ambient)
# subset for same genes as in the summed sce
aggregate_ambient <- aggregate_ambient[row.names(summed_filt),]
# upper bound of gene contribution from ambient contamination
max_ambient <- ambientContribMaximum(counts(summed_filt), ambient= aggregate_ambient, mode = "proportion")
microglia_ambient <- ambientContribNegative(counts(summed_filt), ambient = aggregate_ambient, features = microglia_genes, mode = "proportion")
# save for later use
saveRDS(max_ambient, here("processed", project, "max_ambient.RDS"))
saveRDS(microglia_ambient, here("processed", project, "microglia_ambient.RDS"))
}else{
max_ambient <- readRDS(here("processed", project, "max_ambient.RDS"))
microglia_ambient <- readRDS(here("processed", project, "microglia_ambient.RDS"))
}
# mean of result obtained between all samples -> needs to be done for each cluster
# create a list to hold the updated DFs ( i tried with lapply but not there yet)
# add to the list of dataframes with results for each cluster the ambient info
de_result_ambient <- lapply(names(de_results), function(cluster) {
de_results_clu <- de_results[[cluster]]
# calculate mean of all samples for that cluster
cluster_max_ambient <- rowMeans(max_ambient[, summed_filt$clusters_named == cluster], na.rm = TRUE)
# for the ambient calculated from microglia genes exlude the immune and microglia clusters
if(cluster %in% c("Microglia1", "Microglia2", "BAMs&DCs&Mono")){
cluster_microglia_ambient <- NA
}else{
cluster_microglia_ambient <- rowMeans(microglia_ambient[, summed_filt$clusters_named == cluster], na.rm = TRUE)
}
# combine with previous dataframe
cbind(de_results_clu, maxAmbient = cluster_max_ambient, microgliaAmbient = cluster_microglia_ambient, minAmbient = pmin(cluster_max_ambient, cluster_microglia_ambient))
})
names(de_result_ambient) <- names(de_results)
For each one of the labels a list is produced with the DE genes at a FDR of 5%. Summaries of these results are saved in /outs/fire-mice/DE_edgeR/
Values of 1, -1 and 0 indicate that the gene is significantly upregulated, downregulated or not significant, respectively. Genes listed as NA were either filtered out as low-abundance genes for a given label’s analysis, or the comparison of interest was not possible for that particular label.
de_filtered_genecounts.csv contains a small summary with the number of genes DE for each one of the clusters.
de_filtered_genes.csv contains the list of genes DE.
## SAVE per cluster
#save R objects
dir.create(here("processed",project), showWarnings = FALSE)
saveRDS(de_result_ambient, here("processed",project, "DE_results_ambient_edgeR_.RDS"))
# save results DE tables, diferent files for each cluster
lapply(names(de_result_ambient), function(cluster){
de_results_clu <- de_result_ambient[[cluster]]
dir.create(here("outs", project, "DE_edgeR", "de_results"), showWarnings = FALSE)
write.csv(de_results_clu, here("outs", project, "DE_edgeR", "de_results", paste0("de_results_", cluster, ".csv")), quote = FALSE)
}
)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
## FILTER and SUMMARISE
is_de <- decideTestsPerLabel(de_results, threshold=0.05)
summary_de <- summarizeTestsPerLabel(is_de)
# virtually 0 or NA we do not care, we want only + or -
is_de_0 <- is_de
is_de_0[is.na(is_de_0)]<- 0
# but filter the original object to get them back
is_de_flt <- is_de[(rowSums(abs(is_de_0)) != 0), ]
write.csv(summary_de, here("outs", project,"DE_edgeR", "de_filtered_genecounts.csv"))
write.csv(is_de_flt, here("outs", project, "DE_edgeR", "de_filtered_genes.csv" ))
for(gnt in c("KO", "HET", "KOvsHET")){
# read the filtered table
is_de_flt <- read.csv(here("outs", project, "DE_edgeR", paste0( "de_", gnt, "_filtered_genes.csv" ))
# plots
dir.create(here("outs", project, "DE_edgeR", "plots", paste0( "de_", gnt, "_filtered_plots")))
is_de_genes <- rownames(is_de_flt)
for( gene in is_de_genes){
pdf(file = here("outs", project, "DE_edgeR", "plots", paste0( "de_", gnt, "_filtered_plots"), paste0(gene, ".pdf")))
plot <- plotExpression(logNormCounts(sce),
features=gene,
x="genotype", colour_by="genotype",
other_fields="clusters_named") +
facet_wrap(~clusters_named) +
scale_color_manual(values = col_wt_het_ko) +
ggtitle(gene)
print(plot)
dev.off()
}
}
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] pals_1.7 DropletUtils_1.16.0
## [3] here_1.0.1 edgeR_3.38.4
## [5] limma_3.52.2 scater_1.24.0
## [7] ggplot2_3.3.6 scran_1.24.0
## [9] scuttle_1.6.2 SingleCellExperiment_1.18.0
## [11] SummarizedExperiment_1.26.1 Biobase_2.56.0
## [13] GenomicRanges_1.48.0 GenomeInfoDb_1.32.3
## [15] IRanges_2.30.0 S4Vectors_0.34.0
## [17] BiocGenerics_0.42.0 MatrixGenerics_1.8.1
## [19] matrixStats_0.62.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 rprojroot_2.0.3
## [3] tools_4.2.1 bslib_0.3.1
## [5] utf8_1.2.2 R6_2.5.1
## [7] irlba_2.3.5 HDF5Array_1.24.2
## [9] vipor_0.4.5 DBI_1.1.3
## [11] colorspace_2.0-3 rhdf5filters_1.8.0
## [13] withr_2.5.0 gridExtra_2.3
## [15] tidyselect_1.1.2 compiler_4.2.1
## [17] cli_3.3.0 BiocNeighbors_1.14.0
## [19] DelayedArray_0.22.0 sass_0.4.1
## [21] scales_1.2.0 stringr_1.4.0
## [23] digest_0.6.29 R.utils_2.12.0
## [25] rmarkdown_2.14 XVector_0.36.0
## [27] dichromat_2.0-0.1 pkgconfig_2.0.3
## [29] htmltools_0.5.2 sparseMatrixStats_1.8.0
## [31] maps_3.4.0 fastmap_1.1.0
## [33] rlang_1.0.3 rstudioapi_0.13
## [35] DelayedMatrixStats_1.18.0 farver_2.1.1
## [37] jquerylib_0.1.4 generics_0.1.3
## [39] jsonlite_1.8.0 BiocParallel_1.30.3
## [41] R.oo_1.25.0 dplyr_1.0.9
## [43] RCurl_1.98-1.8 magrittr_2.0.3
## [45] BiocSingular_1.12.0 GenomeInfoDbData_1.2.8
## [47] Matrix_1.4-1 Rhdf5lib_1.18.2
## [49] Rcpp_1.0.9 ggbeeswarm_0.6.0
## [51] munsell_0.5.0 fansi_1.0.3
## [53] viridis_0.6.2 R.methodsS3_1.8.2
## [55] lifecycle_1.0.1 stringi_1.7.8
## [57] yaml_2.3.5 zlibbioc_1.42.0
## [59] rhdf5_2.40.0 grid_4.2.1
## [61] ggrepel_0.9.1 parallel_4.2.1
## [63] dqrng_0.3.0 crayon_1.5.1
## [65] lattice_0.20-45 splines_4.2.1
## [67] beachmat_2.12.0 mapproj_1.2.8
## [69] locfit_1.5-9.6 metapod_1.4.0
## [71] knitr_1.39 pillar_1.7.0
## [73] igraph_1.3.4 codetools_0.2-18
## [75] ScaledMatrix_1.4.0 glue_1.6.2
## [77] evaluate_0.15 vctrs_0.4.1
## [79] gtable_0.3.0 purrr_0.3.4
## [81] assertthat_0.2.1 xfun_0.31
## [83] rsvd_1.0.5 viridisLite_0.4.0
## [85] tibble_3.1.7 beeswarm_0.4.0
## [87] cluster_2.1.3 bluster_1.6.0
## [89] statmod_1.4.37 ellipsis_0.3.2