Differential Expression

set-up

library(scran) # for scDE
library(scater) # for aggregate counts
library(edgeR) #for De
library(here) # reproducible paths
library(DropletUtils) # ambient RNA
library(scuttle) # modify gene names

source(here("src/colours.R"))
project <- "fire-mice"
sce <- readRDS(here("processed", project, "sce_anno_02.RDS"))

Use of pseudo-bulk samples

We perform the DE analysis separately for each label to identify cell type-specific transcriptional effects of KO condition. The actual DE testing is performed on “pseudo-bulk” expression profiles (Tung et al. 2017), generated by summing counts together for all cells with the same combination of label and sample. This leverages the resolution offered by single-cell technologies to define the labels, and combines it with the statistical rigor of existing methods for DE analyses involving a small number of samples.

# each column represents unique combination for each cluster and each sample replicate
summed <- aggregateAcrossCells(sce, 
                               id=colData(sce)[,c("clusters_named", "Sample")]) 
summed

## class: SingleCellExperiment 
## dim: 19148 213 
## metadata(1): Samples
## assays(1): counts
## rownames(19148): Xkr4 Gm19938 ... CAAA01147332.1 AC149090.1
## rowData names(6): ID Symbol ... gene_name subset
## colnames: NULL
## colData names(41): Sample Barcode ... Sample ncells
## reducedDimNames(5): PCA_coldata PCA PCA_all UMAP TSNE
## mainExpName: NULL
## altExpNames(0):

Perform DE

Filter replicates that do not have enough cells

We first filter out all the combinations sample-label that resulted in less than 10 cells. These are saved in /outs/fire-mice/DE_cluster/fail_min_cells_DE.csv

# Removing all pseudo-bulk samples with 'insufficient' cells.
summed_filt <- summed[, summed$ncells > 15]

# and saving to file which one there are
fail_n_cells  <- colData(summed)[ colData(summed)$ncells < 15, c("Sample", "mouse", "genotype", "chip", "clusters_named", "ncells")]

write.csv(fail_n_cells, here("outs", project,"DE_edgeR", "fail_min_cells_DE.csv"), row.names = FALSE)

We then compute the DE with edgeR, one of the best methods according to (Soneson et al. 2018). The results are saved in /outs/fire-mice/DE_edgeR/de_results

Output:

LogFC is the log fold-change, which is the log difference between both groups

LogCPM are the log counts per million, which can be understood as measuring expression level.

F is the F-statistic from the quasi-likelihood F-test.

PValue is the nominal p-value derived from F without any multiple testing correction

FDR (False discovery rate) is the PValue after Benjamini-Hochberg correction. For example, the set of genes with adjusted p value less than 0.1 should contain no more than 10% false positives.

If the comparison was not possible for a particular cluster, most commonly due to lack of residual degrees of freedom ( e.g. an absence of enough samples from both conditions) it is listed in /outs/fire-mice/DE_edgeR/fail_min_cells_DE.csv

# Reordering the genotype factor, treated should be second level
summed_filt$genotype <- factor(summed_filt$genotype, levels = c("WT", "KO"))

# run de
de_results <- pseudoBulkDGE(summed_filt, 
                               label=summed_filt$clusters_named,
                               design= ~factor(chip) + genotype,
                               coef="genotypeKO",
                               condition=summed_filt$genotype 
)

 #save R objects
dir.create(here("processed",project), showWarnings = FALSE)
saveRDS(de_results, here("processed",project, "DE_results_edgeR.RDS"))
 # save failed test
write.csv(metadata(de_results), here("outs", project,"DE_edgeR", "fail_contrast_DE.csv"), row.names = FALSE)

Filter ambient RNA background

Extracellular RNA (most commonly released upon cell lysis) is captured along with each cell in its reaction chamber, contributing counts to genes that are not otherwise expressed in that cell. This is problematic for DE analyses between conditions, as DEGs detected for a particular cell type may be driven by differences in the ambient profiles rather than any intrinsic change in gene regulation. Here he observed microglia genes are shown as down regulated in cell types were their expression is not expected (such as oligodendrocytes and astrocytes). We will use these known genes as previous knowledge to build an ambient model and remove it from the DE.

The list of microglia genes used are in top 100 of middle-aged microglia markers and significantly down in at least one cluster in young and/or middle-aged datasets.

if (!file.exists(here("processed", project, "max_ambient.RDS"))) {
  # import data
  # to calculate the ambient the raw data should be used
  samples <- dir(here("data/raw/"))
  # create empty vector to hold ambient
  ambient <- vector("list", length(samples))
  names(ambient) <- samples
  microglia_genes <- readLines(here("data", "microglia_genes.csv"))
  # generate profile for each sample
  for(sample in samples){
    matrix <- here("data/raw/", sample, "raw_feature_bc_matrix")
    sce_sample <- read10xCounts(matrix, sample, version = "auto", col.names = TRUE)
    # calculate ambient counts, with good turning FALSE because we will use the counts for further estimations and round false because we already have integers. 
    ambient[[sample]] <- ambientProfileEmpty(counts(sce_sample), good.turing = FALSE, round = FALSE)
  }
  
  # clean up ambient output
  ambient <-  do.call(cbind, ambient)
  
  # use symbol nomenclature
  rownames(ambient) <- uniquifyFeatureNames(
    ID=rowData(sce_sample)$ID,
    names=rowData(sce_sample)$Symbol
  )
  
  # maximum proportion for each gene that could be due to ambient
  # I need here to sort out the ambient vecotr, so I can do the 19 clusters at once, repeating n (often 12) times
  # the ambient profile. the ambient profile needs to be calculated on groups of cells that are similar: We'll use 
  # the clusters we run the DEG on, like this it can be added to each of these. It also needs to run per sample (obviously you don't calculate the ambient with another sample), but we can take for each gene the avareage of the proportion obtained in each sample. For this again we will need to sort out the fact I'm doing this for all clusters at once, as we only want to average for each cluster. 
  
  # create an aggregate ambient, duplicating the information to match the format of the summed sce
  n=0
  aggregate_ambient <- matrix(nrow = nrow(ambient), ncol=ncol(summed_filt))
  for( sample in summed_filt$Sample){
    n = n+1
    aggregate_ambient[,n] <-  ambient[,sample]
  }
  # add the gene names again
  rownames(aggregate_ambient) <- rownames(ambient)
  
  # subset for same genes as in the summed sce
  aggregate_ambient <- aggregate_ambient[row.names(summed_filt),]
  
  # upper bound of gene contribution from ambient contamination
  max_ambient <- ambientContribMaximum(counts(summed_filt), ambient= aggregate_ambient, mode = "proportion")
  microglia_ambient <- ambientContribNegative(counts(summed_filt), ambient = aggregate_ambient, features =  microglia_genes, mode = "proportion")
  
  # save for later use
  saveRDS(max_ambient, here("processed", project, "max_ambient.RDS"))
  saveRDS(microglia_ambient, here("processed", project, "microglia_ambient.RDS"))
  
}else{
  max_ambient <- readRDS(here("processed", project, "max_ambient.RDS"))
  microglia_ambient <- readRDS(here("processed", project, "microglia_ambient.RDS"))
}

# mean of result obtained between all samples -> needs to be done for each cluster

# create a list to hold the updated DFs ( i tried with lapply but not there yet)

# add to the list of dataframes with results for each cluster the ambient info
de_result_ambient <- lapply(names(de_results), function(cluster) {
  de_results_clu <- de_results[[cluster]]
  # calculate mean of all samples for that cluster
  cluster_max_ambient <- rowMeans(max_ambient[, summed_filt$clusters_named == cluster], na.rm = TRUE)
  # for the ambient calculated from microglia genes exlude the immune and microglia clusters
  if(cluster %in% c("Microglia1", "Microglia2", "BAMs&DCs&Mono")){
    cluster_microglia_ambient <- NA
  }else{
    cluster_microglia_ambient <- rowMeans(microglia_ambient[, summed_filt$clusters_named == cluster], na.rm = TRUE)
  }
  # combine with previous dataframe
  cbind(de_results_clu, maxAmbient = cluster_max_ambient, microgliaAmbient = cluster_microglia_ambient, minAmbient = pmin(cluster_max_ambient, cluster_microglia_ambient))
})
names(de_result_ambient) <- names(de_results)

Save, filter and summarise the results

For each one of the labels a list is produced with the DE genes at a FDR of 5%. Summaries of these results are saved in /outs/fire-mice/DE_edgeR/

Values of 1, -1 and 0 indicate that the gene is significantly upregulated, downregulated or not significant, respectively. Genes listed as NA were either filtered out as low-abundance genes for a given label’s analysis, or the comparison of interest was not possible for that particular label.

de_filtered_genecounts.csv contains a small summary with the number of genes DE for each one of the clusters.

de_filtered_genes.csv contains the list of genes DE.

## SAVE per cluster 


  #save R objects
  dir.create(here("processed",project), showWarnings = FALSE)
  saveRDS(de_result_ambient, here("processed",project, "DE_results_ambient_edgeR_.RDS"))

  
   # save results DE tables, diferent files for each cluster
   lapply(names(de_result_ambient), function(cluster){
     de_results_clu <- de_result_ambient[[cluster]]
     dir.create(here("outs", project, "DE_edgeR", "de_results"), showWarnings = FALSE)
     write.csv(de_results_clu, here("outs", project, "DE_edgeR", "de_results", paste0("de_results_", cluster, ".csv")), quote = FALSE)
   }
   )

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL

## FILTER and SUMMARISE

  is_de <- decideTestsPerLabel(de_results, threshold=0.05)
  summary_de <- summarizeTestsPerLabel(is_de)
  
  # virtually 0 or NA we do not care, we want only + or -
  is_de_0 <- is_de
  is_de_0[is.na(is_de_0)]<- 0
  # but filter the original object to get them back
  is_de_flt <- is_de[(rowSums(abs(is_de_0)) != 0), ]
  
  write.csv(summary_de, here("outs", project,"DE_edgeR",  "de_filtered_genecounts.csv"))
  write.csv(is_de_flt, here("outs", project, "DE_edgeR", "de_filtered_genes.csv" ))

for(gnt in c("KO", "HET", "KOvsHET")){
  # read the filtered table
  is_de_flt <-  read.csv(here("outs", project, "DE_edgeR", paste0( "de_", gnt, "_filtered_genes.csv" ))
                         
                         # plots
                         dir.create(here("outs", project, "DE_edgeR", "plots", paste0( "de_", gnt, "_filtered_plots")))
                         is_de_genes <- rownames(is_de_flt)
                         for( gene in is_de_genes){
                           pdf(file = here("outs", project, "DE_edgeR", "plots", paste0( "de_", gnt, "_filtered_plots"), paste0(gene, ".pdf")))
                           plot <- plotExpression(logNormCounts(sce),
                                                  features=gene,
                                                  x="genotype", colour_by="genotype",
                                                  other_fields="clusters_named") +
                             facet_wrap(~clusters_named) +
                             scale_color_manual(values = col_wt_het_ko) +
                             ggtitle(gene)
                           print(plot)
                           dev.off()
                         }
                         
}

Session Info

sessionInfo()

## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] pals_1.7                    DropletUtils_1.16.0        
##  [3] here_1.0.1                  edgeR_3.38.4               
##  [5] limma_3.52.2                scater_1.24.0              
##  [7] ggplot2_3.3.6               scran_1.24.0               
##  [9] scuttle_1.6.2               SingleCellExperiment_1.18.0
## [11] SummarizedExperiment_1.26.1 Biobase_2.56.0             
## [13] GenomicRanges_1.48.0        GenomeInfoDb_1.32.3        
## [15] IRanges_2.30.0              S4Vectors_0.34.0           
## [17] BiocGenerics_0.42.0         MatrixGenerics_1.8.1       
## [19] matrixStats_0.62.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7              rprojroot_2.0.3          
##  [3] tools_4.2.1               bslib_0.3.1              
##  [5] utf8_1.2.2                R6_2.5.1                 
##  [7] irlba_2.3.5               HDF5Array_1.24.2         
##  [9] vipor_0.4.5               DBI_1.1.3                
## [11] colorspace_2.0-3          rhdf5filters_1.8.0       
## [13] withr_2.5.0               gridExtra_2.3            
## [15] tidyselect_1.1.2          compiler_4.2.1           
## [17] cli_3.3.0                 BiocNeighbors_1.14.0     
## [19] DelayedArray_0.22.0       sass_0.4.1               
## [21] scales_1.2.0              stringr_1.4.0            
## [23] digest_0.6.29             R.utils_2.12.0           
## [25] rmarkdown_2.14            XVector_0.36.0           
## [27] dichromat_2.0-0.1         pkgconfig_2.0.3          
## [29] htmltools_0.5.2           sparseMatrixStats_1.8.0  
## [31] maps_3.4.0                fastmap_1.1.0            
## [33] rlang_1.0.3               rstudioapi_0.13          
## [35] DelayedMatrixStats_1.18.0 farver_2.1.1             
## [37] jquerylib_0.1.4           generics_0.1.3           
## [39] jsonlite_1.8.0            BiocParallel_1.30.3      
## [41] R.oo_1.25.0               dplyr_1.0.9              
## [43] RCurl_1.98-1.8            magrittr_2.0.3           
## [45] BiocSingular_1.12.0       GenomeInfoDbData_1.2.8   
## [47] Matrix_1.4-1              Rhdf5lib_1.18.2          
## [49] Rcpp_1.0.9                ggbeeswarm_0.6.0         
## [51] munsell_0.5.0             fansi_1.0.3              
## [53] viridis_0.6.2             R.methodsS3_1.8.2        
## [55] lifecycle_1.0.1           stringi_1.7.8            
## [57] yaml_2.3.5                zlibbioc_1.42.0          
## [59] rhdf5_2.40.0              grid_4.2.1               
## [61] ggrepel_0.9.1             parallel_4.2.1           
## [63] dqrng_0.3.0               crayon_1.5.1             
## [65] lattice_0.20-45           splines_4.2.1            
## [67] beachmat_2.12.0           mapproj_1.2.8            
## [69] locfit_1.5-9.6            metapod_1.4.0            
## [71] knitr_1.39                pillar_1.7.0             
## [73] igraph_1.3.4              codetools_0.2-18         
## [75] ScaledMatrix_1.4.0        glue_1.6.2               
## [77] evaluate_0.15             vctrs_0.4.1              
## [79] gtable_0.3.0              purrr_0.3.4              
## [81] assertthat_0.2.1          xfun_0.31                
## [83] rsvd_1.0.5                viridisLite_0.4.0        
## [85] tibble_3.1.7              beeswarm_0.4.0           
## [87] cluster_2.1.3             bluster_1.6.0            
## [89] statmod_1.4.37            ellipsis_0.3.2