Differential Expression

set-up

library(scran) # for scDE
library(scater) # for aggregate counts
library(edgeR) #for De
library(here) # reproducible paths
library(DropletUtils) # ambient RNA
library(scuttle) # modify gene names

source(here("src/colours.R"))
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_anno_02.RDS"))
clusters_name <- "clusters_named"

Use of pseudo-bulk samples

We perform the DE analysis separately for each label to identify cell type-specific transcriptional effects of KO condition. The actual DE testing is performed on “pseudo-bulk” expression profiles (Tung et al. 2017), generated by summing counts together for all cells with the same combination of label and sample. This leverages the resolution offered by single-cell technologies to define the labels, and combines it with the statistical rigor of existing methods for DE analyses involving a small number of samples.

# each column represents unique combination for each cluster and each sample replicate
summed <- aggregateAcrossCells(sce, 
                               id=colData(sce)[,c(clusters_name, "Sample")]) 
summed

## class: SingleCellExperiment 
## dim: 19957 217 
## metadata(3): merge.info pca.info Samples
## assays(1): counts
## rownames(19957): Xkr4 Gm19938 ... CAAA01147332.1 AC149090.1
## rowData names(7): rotation ID ... gene_name subset
## colnames: NULL
## colData names(53): batch Sample ... Sample ncells
## reducedDimNames(8): corrected PCA_coldata ... TSNE_uncorrected
##   UMAP_uncorrected
## mainExpName: NULL
## altExpNames(0):

Perform DE

Filter replicates that do not have enough cells

We first filter out all the combinations sample-label that resulted in less than 10 cells. These are saved in /outs/fire-mice/DE_cluster/fail_min_cells_DE.csv

# Removing all pseudo-bulk samples with 'insufficient' cells.
summed_filt <- summed[, summed$ncells >= 10]

# and saving to file which one there are
fail_n_cells  <- colData(summed)[ colData(summed)$ncells < 10, c("Sample", "mouse", "genotype", "tissue", "chip", clusters_name, "ncells")]

write.csv(fail_n_cells, here("outs", project,"DE_edgeR", "fail_min_cells_DE.csv"), row.names = FALSE)

Design

tissue and chip are confounded, meaning it is not possible to correct for both; and seen there is greater variation in chip than tissue correcting for chip seems more sensible.

table(sce$tissue, sce$chip)

##              
##                   3     4     5     6
##   cortex          0 11853     0  8156
##   hippocampus 14468     0 13061     0

table(sce$genotype, sce$chip)

##      
##          3    4    5    6
##   WT  6015 3766 3135 3996
##   HET 2458 4481 5843 2351
##   KO  5995 3606 4083 1809

table(sce$genotype, sce$tissue)

##      
##       cortex hippocampus
##   WT    7762        9150
##   HET   6832        8301
##   KO    5415       10078

design=~factor(chip) + genotype
sce$chip <- factor(sce$chip)
design_KOvsHET=~0+ chip + genotype

We then compute the DE with edgeR, one of the best methods according to (Soneson et al. 2018). The results are saved in /outs/fire-mice/DE_edgeR/de_results

Output:

LogFC is the log fold-change, which is the log difference between both groups

LogCPM are the log counts per million, which can be understood as measuring expression level.

F is the F-statistic from the quasi-likelihood F-test.

PValue is the nominal p-value derived from F without any multiple testing correction

FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.

If the comparison was not possible for a particular cluster, most commonly due to lack of residual degrees of freedom ( e.g. an absence of enough samples from both conditions) it is listed in /outs/fire-mice/DE_edgeR/fail_min_cells_DE.csv

# Reordering the genotype factor, treated should be second level
summed_filt$genotype <- factor(summed_filt$genotype, levels = c("WT", "HET", "KO"))

# KO ones
# run de
de_results_KO <- pseudoBulkDGE(summed_filt, 
                               label=summed_filt$clusters_named,
                               design= design,
                               coef="genotypeKO",
                               condition=summed_filt$genotype 
)

# HET ones
de_results_HET <- pseudoBulkDGE(summed_filt, 
                                label=summed_filt$clusters_named,
                                design= design,
                                coef="genotypeHET",
                                condition=summed_filt$genotype 
)
# KOvsHET ones
de_results_KOvsHET <- pseudoBulkDGE(summed_filt, 
                                    label=summed_filt$clusters_named,
                                    design= design_KOvsHET,
                                    contrast=c("genotypeKO - genotypeHET"),
                                    condition=summed_filt$genotype 
)

# create a list to loop through the 3 de_results
de_results <- list(KO=de_results_KO, HET=de_results_HET, KOvsHET=de_results_KOvsHET)
for (gnt in c("KO", "HET", "KOvsHET")) {
    #extract each genotype dataframe list
  de_results_gnt <- de_results[[gnt]]
  
  #save R objects
  dir.create(here("processed",project), showWarnings = FALSE)
  saveRDS(de_results_gnt, here("processed",project, paste0("DE_results_edgeR_", gnt, ".RDS")))
     # save failed test
  write.csv(metadata(de_results_gnt), here("outs", project,"DE_edgeR", paste0("fail_contrast_DE_",gnt,".csv")), row.names = FALSE)
}

Filter ambient RNA background

Extracellular RNA (most commonly released upon cell lysis) is captured along with each cell in its reaction chamber, contributing counts to genes that are not otherwise expressed in that cell. This is problematic for DE analyses between conditions, as DEGs detected for a particular cell type may be driven by differences in the ambient profiles rather than any intrinsic change in gene regulation. Here he observed microglia genes are shown as down regulated in cell types were their expression is not expected (such as oligodendrocytes and astrocytes). We will use these known genes as previous knowledge to build an ambient model and remove it from the DE.

The list of microglia genes used are in top 100 of middle-aged microglia markers and significantly down in at least one cluster in young and/or middle-aged datasets.

# import data
# to calculate the ambient the raw data should be used
samples <- dir(here("data/raw/"))
# create empty vector to hold ambient
ambient <- vector("list", length(samples))
names(ambient) <- samples
microglia_genes <- readLines(here("data", "microglia_genes.csv"))
# generate profile for each sample
for(sample in samples){
  matrix <- here("data/raw/", sample, "raw_feature_bc_matrix")
  sce_sample <- read10xCounts(matrix, sample, version = "auto", col.names = TRUE)
  # calculate ambient counts, with good turning FALSE because we will use the counts for further estimations and round false because we already have integers. 
  ambient[[sample]] <- ambientProfileEmpty(counts(sce_sample), good.turing = FALSE, round = FALSE)
}

# clean up ambient output
ambient <-  do.call(cbind, ambient)

# use symbol nomenclature
rownames(ambient) <- uniquifyFeatureNames(
  ID=rowData(sce_sample)$ID,
  names=rowData(sce_sample)$Symbol
)

# maximum proportion for each gene that could be due to ambient
# I need here to sort out the ambient vecotr, so I can do the 19 clusters at once, repeating n (often 12) times
# the ambient profile. the ambient profile needs to be calculated on groups of cells that are similar: We'll use 
# the clusters we run the DEG on, like this it can be added to each of these. It also needs to run per sample (obviously you don't calculate the ambient with another sample), but we can take for each gene the avareage of the proportion obtained in each sample. For this again we will need to sort out the fact I'm doing this for all clusters at once, as we only want to average for each cluster. 

# create an aggregate ambient, duplicating the information to match the format of the summed sce
n=0
aggregate_ambient <- matrix(nrow = nrow(ambient), ncol=ncol(summed_filt))
for( sample in summed_filt$Sample){
  n = n+1
  aggregate_ambient[,n] <-  ambient[,sample]
}
# add the gene names again
rownames(aggregate_ambient) <- rownames(ambient)

# subset for same genes as in the summed sce
aggregate_ambient <- aggregate_ambient[row.names(summed_filt),]

# upper bound of gene contribution from ambient contamination
max_ambient <- ambientContribMaximum(counts(summed_filt), ambient= aggregate_ambient, mode = "proportion")
microglia_ambient <- ambientContribNegative(counts(summed_filt), ambient = aggregate_ambient, features =  microglia_genes, mode = "proportion")

# mean of result obtained between all samples -> needs to be done for each cluster

# create a list to hold the updated DFs ( i tried with lapply but not there yet)
de_results_ambient <- list(KO="", HET="", KOvsHET="")
for (gnt in c("KO", "HET", "KOvsHET")) {
    #extract each genotype dataframe list
  de_results_gnt <- de_results[[gnt]]
  
  # add to the list of dataframes with results for each cluster the ambient info
  de_result_gnt_ambient <- lapply(names(de_results_gnt), function(cluster) {
    de_results_clu <- de_results_gnt[[cluster]]
    # calculate mean of all samples for that cluster
    cluster_max_ambient <- rowMeans(max_ambient[, summed_filt$clusters_named == cluster], na.rm = TRUE)
    # for the ambient calculated from microglia genes exlude the immune and microglia clusters
    if(cluster %in% c("Microglia", "BAMs", "Immune")){
      cluster_microglia_ambient <- NA
    }else{
      cluster_microglia_ambient <- rowMeans(microglia_ambient[, summed_filt$clusters_named == cluster], na.rm = TRUE)
    }
    # combine with previous dataframe
    cbind(de_results_clu, maxAmbient = cluster_max_ambient, microgliaAmbient = cluster_microglia_ambient, minAmbient = pmin(cluster_max_ambient, cluster_microglia_ambient))
  })
  names(de_result_gnt_ambient) <- names(de_results_gnt)
   # save in the genotypes list
  de_results_ambient[[gnt]] <- de_result_gnt_ambient
}

Save, filter and summarise the results

For each one of the labels a list is produced with the DE genes at a FDR of 5%. Summaries of these results are saved in /outs/fire-mice/DE_edgeR/

Values of 1, -1 and 0 indicate that the gene is significantly upregulated, downregulated or not significant, respectively. Genes listed as NA were either filtered out as low-abundance genes for a given label’s analysis, or the comparison of interest was not possible for that particular label.

de_filtered_genecounts.csv contains a small summary with the number of genes DE for each one of the clusters.

de_filtered_genes.csv contains the list of genes DE.

## SAVE per cluster 

for(gnt in c("KO", "HET", "KOvsHET")){
  
  #extract each table
  de_results_gnt <- de_results_ambient[[gnt]]

  #save R objects
  dir.create(here("processed",project), showWarnings = FALSE)
  saveRDS(de_results_gnt, here("processed",project, paste0("DE_results_ambient_edgeR_", gnt, ".RDS")))

  
   # save results DE tables, diferent files for each cluster
   lapply(names(de_results_gnt), function(cluster){
     de_results_clu <- de_results_gnt[[cluster]]
     dir.create(here("outs", project, "DE_edgeR", paste0("de_results_", gnt)), showWarnings = FALSE)
     write.csv(de_results_clu, here("outs", project, "DE_edgeR", paste0("de_results_", gnt), paste0("de_results_", gnt, "_", cluster, ".csv")), quote = FALSE)
   }
   )
            
}

## FILTER and SUMMARISE
de_results <- list(KO=de_results_KO, HET=de_results_HET, KOvsHET=de_results_KOvsHET)
cols <- list(KO=col_wt_het_ko[3], HET=col_wt_het_ko[2], WT=col_wt_het_ko[1])
for(gnt in c("KO", "HET", "KOvsHET")){
  
  de_results_gnt <- de_results[[gnt]]
  is_de <- decideTestsPerLabel(de_results_gnt, threshold=0.05)
  summary_de <- summarizeTestsPerLabel(is_de)
  
  # virtually 0 or NA we do not care, we want only + or -
  is_de_0 <- is_de
  is_de_0[is.na(is_de_0)]<- 0
  # but filter the original object to get them back
  is_de_flt <- is_de[(rowSums(abs(is_de_0)) != 0), ]
  
  write.csv(summary_de, here("outs", project,"DE_edgeR", paste0( "de_", gnt, "_filtered_genecounts.csv")))
  write.csv(is_de_flt, here("outs", project, "DE_edgeR", paste0( "de_", gnt, "_filtered_genes.csv" )))
}

Plot the results

Saved in /outs/fire-mice/DE_edgeR/de_filtered_plots

for(gnt in c("KO", "HET", "KOvsHET")){
  # read the filtered table
  is_de_flt <-  read.csv(here("outs", project, "DE_edgeR", paste0( "de_", gnt, "_filtered_genes.csv" ))
                         
                         # plots
                         dir.create(here("outs", project, "DE_edgeR", "plots", paste0( "de_", gnt, "_filtered_plots")))
                         is_de_genes <- rownames(is_de_flt)
                         for( gene in is_de_genes){
                           pdf(file = here("outs", project, "DE_edgeR", "plots", paste0( "de_", gnt, "_filtered_plots"), paste0(gene, ".pdf")))
                           plot <- plotExpression(logNormCounts(sce),
                                                  features=gene,
                                                  x="genotype", colour_by="genotype",
                                                  other_fields="clusters_named") +
                             facet_wrap(~clusters_named) +
                             scale_color_manual(values = col_wt_het_ko) +
                             ggtitle(gene)
                           print(plot)
                           dev.off()
                         }
                         
}

Session Info

sessionInfo()

## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pals_1.7                    DropletUtils_1.12.3        
##  [3] here_1.0.1                  edgeR_3.34.1               
##  [5] limma_3.48.3                scater_1.20.1              
##  [7] ggplot2_3.3.5               scran_1.20.1               
##  [9] scuttle_1.2.1               SingleCellExperiment_1.14.1
## [11] SummarizedExperiment_1.22.0 Biobase_2.52.0             
## [13] GenomicRanges_1.44.0        GenomeInfoDb_1.28.4        
## [15] IRanges_2.26.0              S4Vectors_0.30.2           
## [17] BiocGenerics_0.38.0         MatrixGenerics_1.4.3       
## [19] matrixStats_0.61.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7              rprojroot_2.0.2          
##  [3] tools_4.1.1               bslib_0.3.1              
##  [5] utf8_1.2.2                R6_2.5.1                 
##  [7] irlba_2.3.3               HDF5Array_1.20.0         
##  [9] vipor_0.4.5               DBI_1.1.1                
## [11] colorspace_2.0-2          rhdf5filters_1.4.0       
## [13] withr_2.4.2               tidyselect_1.1.1         
## [15] gridExtra_2.3             compiler_4.1.1           
## [17] BiocNeighbors_1.10.0      DelayedArray_0.18.0      
## [19] sass_0.4.0                scales_1.1.1             
## [21] stringr_1.4.0             digest_0.6.28            
## [23] R.utils_2.11.0            rmarkdown_2.11           
## [25] XVector_0.32.0            dichromat_2.0-0          
## [27] pkgconfig_2.0.3           htmltools_0.5.2          
## [29] sparseMatrixStats_1.4.2   maps_3.4.0               
## [31] fastmap_1.1.0             rlang_0.4.12             
## [33] DelayedMatrixStats_1.14.3 farver_2.1.0             
## [35] jquerylib_0.1.4           generics_0.1.1           
## [37] jsonlite_1.7.2            BiocParallel_1.26.2      
## [39] R.oo_1.24.0               dplyr_1.0.7              
## [41] RCurl_1.98-1.5            magrittr_2.0.1           
## [43] BiocSingular_1.8.1        GenomeInfoDbData_1.2.6   
## [45] Matrix_1.3-4              Rhdf5lib_1.14.2          
## [47] Rcpp_1.0.7                ggbeeswarm_0.6.0         
## [49] munsell_0.5.0             fansi_0.5.0              
## [51] viridis_0.6.2             R.methodsS3_1.8.1        
## [53] lifecycle_1.0.1           stringi_1.7.5            
## [55] yaml_2.2.1                zlibbioc_1.38.0          
## [57] rhdf5_2.36.0              grid_4.1.1               
## [59] dqrng_0.3.0               crayon_1.4.2             
## [61] lattice_0.20-45           splines_4.1.1            
## [63] beachmat_2.8.1            mapproj_1.2.7            
## [65] locfit_1.5-9.4            metapod_1.0.0            
## [67] knitr_1.36                pillar_1.6.4             
## [69] igraph_1.2.7              ScaledMatrix_1.0.0       
## [71] glue_1.4.2                evaluate_0.14            
## [73] vctrs_0.3.8               gtable_0.3.0             
## [75] purrr_0.3.4               assertthat_0.2.1         
## [77] xfun_0.27                 rsvd_1.0.5               
## [79] viridisLite_0.4.0         tibble_3.1.5             
## [81] beeswarm_0.4.0            cluster_2.1.2            
## [83] bluster_1.2.1             statmod_1.4.36           
## [85] ellipsis_0.3.2