Single Cell Deconvolution of Rat Bulk-RNAseq using Mouse Celltype References

We will be using Dampended Weighted Least Squares (DWLS). This algorithm was developed by Tsuocas et al. in 2018 (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6611906/) to improve estimation for cells with very little representation.

library(DWLS)
library(stringr)

Set working directory and make results folder. In your working diectory, you need:
1. The single cell reference matrix
2. The metadata containing celltype labels for each column of sc reference matrix
3. Your bulk RNA-seq data (in this case with rows as Ensembl Gene IDs for rats)
4. Results folder for the signature matrix/marker gene computations

working.dir <- "/sc/arion/scratch/rompag01/single.decon/scDECONfiles"
setwd(working.dir)
list.files(working.dir)
## [1] "amygdala.raw.counts.csv"         "DWLS.results"                   
## [3] "GSE124952_expression_matrix.csv" "GSE124952_meta_data.csv"

We will be using single cell RNA-sequencing data from this 2019 study using the dataset avaiable in Gene Expression Omnibus (GEO) (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE124952)

# Run the following command in bash to download the GEO data (expression matrix and metadata) into your working directory and unzip the files
# wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE124nnn/GSE124952/suppl/GSE124952_meta_data.csv.gz
# wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE124nnn/GSE124952/suppl/GSE124952_meta_data.csv.gz
# gunzip GSE124952*

Read single cell data into R and prep counts matix and celltype labels to make the DWLS Signature Matrix.
NOTE: the format that the publically available single cell data from GEO comes in will vary. Thus, different data wrangling steps may be needed for each unique single cell dataset downloaed from GEO/other sources.


# Load your single cell reference matrix
data <- read.csv("GSE124952_expression_matrix.csv",header=T,row.names=1)
dim(data) # observe number of rows(genes) and columns(cells)
## [1] 21000 35360

# Extract the celltype labels
meta <- read.csv("GSE124952_meta_data.csv",header=T,row.names=1)
dim(meta)
## [1] 35360    10
colnames(meta)
##  [1] "nGene"        "nUMI"         "percent.mito" "Sample"      
##  [5] "treatment"    "Period"       "stage"        "CellType"    
##  [9] "L2_clusters"  "DevStage"

label <- as.vector(meta$CellType) # Grab celltype labels from metadata celltype column
labels <- str_replace_all(label,"NF Oligo","NF_Oligo")  # Fixing the string label for one celltype with a space

Build the signature matrix– this took several hours and requires a lot of memory- I used 160 GB for this dataset. DWLS does not support multi-threading so this was run on one compute core. You will likely need to run this on a HPC cluster/AWS if you are working with ~30K cells like we are here. After running, in the DWLS.results folder, you’ll find marker gene differential expresison tables for each celltype.

Signature<-buildSignatureMatrixMAST(scdata=data,
                                    id=labels,
                                    path="DWLS.results",
                                    diff.cutoff=0.5,
                                    pval.cutoff=0.01)

save(Signature,file="DWLS.RData")

list.files("DWLS.results")

##  [1] "Astro_lrTest.csv"      "Astro_MIST.RData"     
##  [3] "Endo_lrTest.csv"       "Endo_MIST.RData"      
##  [5] "Excitatory_lrTest.csv" "Excitatory_MIST.RData"
##  [7] "Inhibitory_lrTest.csv" "Inhibitory_MIST.RData"
##  [9] "Microglia_lrTest.csv"  "Microglia_MIST.RData" 
## [11] "NF Oligo_lrTest.csv"   "NF Oligo_MIST.RData"  
## [13] "NF_Oligo_lrTest.csv"   "NF_Oligo_MIST.RData"  
## [15] "Oligo_lrTest.csv"      "Oligo_MIST.RData"     
## [17] "OPC_lrTest.csv"        "OPC_MIST.RData"       
## [19] "Sig.RData"

Read in your RNA-seq data.

COUNTS <- read.csv("amygdala.raw.counts.csv",header=T,row.names=1)

Use biomaRt to get mouse orthologs from your rat Ensembl Gene IDs.

library("biomaRt") 
ensembl = useMart("ensembl", dataset="rnorvegicus_gene_ensembl")
values <- unique(rownames(COUNTS))

data <- getBM(
  
  attributes=c(
    "ensembl_gene_id",
    "mmusculus_homolog_ensembl_gene",
    "mmusculus_homolog_associated_gene_name"),
  
  filters = "ensembl_gene_id", 
  values = values, 
  mart= ensembl
)

data <- data[duplicated(data$ensembl_gene_id)==F & 
               duplicated(data$mmusculus_homolog_associated_gene_name)==F ,]

rownames(data) <- data$ensembl_gene_id

data <- data[!data$mmusculus_homolog_associated_gene_name=="",]

Read in your counts, filter them for mouse orthologs, and nomalize to counts per million with a handy-dandy sweep function.

COUNTS <- read.csv("amygdala.raw.counts.csv",header=T,row.names=1)

mouse.COUNTS <- COUNTS[rownames(data),]

CPM <-sweep(mouse.COUNTS,2,as.numeric(colSums(mouse.COUNTS)),FUN="/")*1000000

Fitting bulk data to the model based on psuedobulk expression: let’s iterate through each bulk-RNAseq subject and fit the data with three different models in the DWLS package (DWLS,OLS,SVR).

# 2 step process of making your mouse gene symbols (to avoding a formatting error)
Name <- data$mmusculus_homolog_associated_gene_name 
rownames(CPM) <- Name 
#

CPM.trim <- CPM[rownames(Signature),] # filters your counts for genes maintained in the Signature Matrix

samples<- colnames(CPM.trim) # grabbing a vector of your subject names

# Initializing Results Data Frames for Each Algorithm that DWLS employs (DWLS being the superstar)
DWLS <- data.frame()
SVR <- data.frame()
OLS <- data.frame()

# Computing Cell Fractions for Each Sample and Adding to Results Data Frames
for(sample in samples){
  bulk <- CPM.trim[,sample]
  names(bulk) <- rownames(CPM.trim)
  tr<-trimData(Signature,bulk)
  solDWLS<-solveDampenedWLS(tr$sig,tr$bulk)
  solSVR <- solveSVR(tr$sig,tr$bulk)
  solOLS <- solveOLS(tr$sig,tr$bulk)
  DWLS <-rbind(DWLS,solDWLS)
  SVR <- rbind(SVR,solSVR)
  OLS <- rbind(OLS,solOLS)
}

model<- list(DWLS,SVR,OLS) # List of dataframes

# Adding the rownames and columns back to each dataframe in list
L <- lapply(model, function(df)
            {
              colnames(df) <- colnames(Signature)
              rownames(df) <- samples
              df
            } 
           )

Write your data out to files.

# Writing each dataframe in list to file
#write.csv(L[1],file="CellFractions_PFCref_DWLS.csv")
#write.csv(L[2],file="CellFractions_PFCref_SVR.csv")
#write.csv(L[3],file="CellFractions_PFCref_OLS.csv")

Single Cell Deconvolution of Rat Bulk-RNAseq using Mouse Celltype References

Gregory Rompala

12/5/2020