Main learning objective: perform cell-type deconvolution to estimate how proportions of cell-types in the brain change in response to various conditions. You can learn more about the Marker Gene Profiles tool by reading the documentation and by reading Mancarci et al. 2017 that describes it in greater detail.
We will use Marker Gene Profiles, a computational technique that attempts to decompose bulk tissue gene expression into a list of cell-types proportion estimates.
Cell-type deconvolution requires three components:
tutorial 01 from human scRNA-seq data.Conceptualizing Cell Type Proportions in Bulk Tissue.
We will begin by loading the packages necessary for this tutorial, which are already installed in your docker container. If the package is not loading, uncomment the line for installing it.
#BiocManager::install("edgeR")
library(edgeR)
#devtools::install_github('oganm/markerGeneProfile', force = T) # install marker gene profile tool from github
library(markerGeneProfile)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("matrixStats")
library(matrixStats)
#install.packages("cowplot")
library(cowplot)
#install.packages("broom")
library(broom)
#install.packages("knitr")
library(knitr)
#install.packages("ggpubr")
library(ggpubr)
theme_set(theme_cowplot())
Next, import the metadata and counts matrix for the bulk RNA-seq we’re analyzing.
# import metadata
labonte_meta = read_csv(file = "./data/GSE102556-metadata.csv") %>%
# grab specific columns and rename for clarity
select(geo_accession,
expr_names,
age = age.ch1,
gender = gender.ch1,
pmi = pmi.ch1,
rin = rin.ch1,
ph = ph.ch1,
phenotype = phenotype.ch1)
# preview
kable(labonte_meta[1:5,])
| geo_accession | expr_names | age | gender | pmi | rin | ph | phenotype |
|---|---|---|---|---|---|---|---|
| GSM2740657 | X14.BA8_9 | 47 | Male | 12.00 | 7.8 | 6.49 | CTRL |
| GSM2740658 | X17.BA8_9 | 41 | Male | 24.00 | 7.6 | 6.00 | CTRL |
| GSM2740659 | X20.BA8_9 | 31 | Male | 29.50 | 6.9 | 6.67 | CTRL |
| GSM2740660 | X23.BA8_9 | 19 | Male | 27.75 | 7.7 | 6.74 | CTRL |
| GSM2740661 | X28.BA8_9 | 46 | Male | 19.50 | 7.7 | 6.42 | CTRL |
# import expression matrix
labonte_counts = read_tsv("./data/GSE102556-expression-counts.txt") %>%
# grab column with gene names and samples from the metadata file
select(c("gene_symbol", labonte_meta$geo_accession)) %>%
# remove periods and numbers in gene names used to avoid duplicates
mutate(gene_symbol = gsub("\\..*", "", gene_symbol)) %>%
# filter duplicated genes
filter(duplicated(gene_symbol) == FALSE) %>%
# convert gene_symbol column to rownames
column_to_rownames(var = "gene_symbol")
# preview
kable(labonte_counts[1:5, 1:10])
| GSM2740657 | GSM2740658 | GSM2740659 | GSM2740660 | GSM2740661 | GSM2740662 | GSM2740663 | GSM2740664 | GSM2740665 | GSM2740666 | |
|---|---|---|---|---|---|---|---|---|---|---|
| A1BG | 147 | 194 | 268 | 175 | 133 | 288 | 162 | 235 | 238 | 155 |
| A1CF | 37 | 70 | 61 | 19 | 71 | 92 | 51 | 66 | 54 | 26 |
| A2M | 2539 | 2719 | 3594 | 3405 | 2087 | 2374 | 2619 | 5051 | 3779 | 4542 |
| A2ML1 | 129 | 162 | 208 | 169 | 133 | 205 | 125 | 217 | 214 | 171 |
| A2MP1 | 36 | 87 | 89 | 67 | 56 | 80 | 71 | 113 | 79 | 58 |
Because of possible differences in sequencing depth between different bulk brain gene expression samples, we will normalize for these differences using the Counts Per Million metric in the edgeR package..
# normalize using CPM, add 0.1 to every observation, and log2 transform
labonte_cpm = edgeR::cpm(labonte_counts, log = TRUE, prior.count = 0.1)
# preview
kable(labonte_cpm[1:5, 1:10])
| GSM2740657 | GSM2740658 | GSM2740659 | GSM2740660 | GSM2740661 | GSM2740662 | GSM2740663 | GSM2740664 | GSM2740665 | GSM2740666 | |
|---|---|---|---|---|---|---|---|---|---|---|
| A1BG | 3.216664 | 2.933249 | 3.596465 | 2.8763379 | 2.424683 | 3.588944 | 2.5975636 | 2.997711 | 3.0433369 | 2.3916043 |
| A1CF | 1.227975 | 1.463728 | 1.462455 | -0.3216042 | 1.519926 | 1.943428 | 0.9318589 | 1.167122 | 0.9053833 | -0.1795624 |
| A2M | 7.326552 | 6.741614 | 7.341384 | 7.1579475 | 6.395781 | 6.631774 | 6.6117720 | 7.422976 | 7.0317638 | 7.2637107 |
| A2ML1 | 3.028291 | 2.673310 | 3.230930 | 2.8260294 | 2.424683 | 3.098660 | 2.2237319 | 2.882795 | 2.8900512 | 2.5332471 |
| A2MP1 | 1.188503 | 1.777050 | 2.006905 | 1.4922662 | 1.177981 | 1.741981 | 1.4084738 | 1.942020 | 1.4534678 | 0.9749854 |
Find genes with low standard deviations and remove them from the expression matrix.
# calculate standard deviation per row
gene_sds = rowSds(labonte_cpm, na.rm = T)
# keep rows (genes) with sd greater than 0.1 across all samples
gene_mat = labonte_cpm[gene_sds > .1, ] %>%
as.data.frame() %>%
rownames_to_column(var = "gene_symbol")
Import the marker gene list you created in tutorial 1 and reformat.
# import from folder
marker_data = read.csv(file = "./data/human_markers.csv")[-1] %>%
# convert class labels to abbreviations
mutate(class_label = case_when(
class_label == "Glutamatergic" ~ "Exc",
class_label == "GABAergic" ~ "Inh")
) %>%
# add as prefix to subclass labels
unite(subclass_label, c(class_label, subclass_label), sep = "_", remove = F, na.rm = T)
# fix any spaces
marker_data$subclass_label = gsub(" ", "_", marker_data$subclass)
# add non-neuronal label back
marker_data$class_label[is.na(marker_data$class_label)] = "NonN"
# check matches between markers and bulk RNA-seq genes
paste("marker matches in data: ", length(intersect(unlist(gene_mat$gene), unlist(marker_data$gene))), "/", nrow(marker_data))
## [1] "marker matches in data: 2255 / 3203"
# get vector of unique cell types
cell_types = marker_data$subclass_label %>% unique()
print(cell_types)
## [1] "Astrocyte" "Endothelial" "Exc_IT" "Exc_L4_IT"
## [5] "Exc_L5_ET" "Exc_L5/6_IT_Car3" "Exc_L5/6_NP" "Exc_L6_CT"
## [9] "Exc_L6b" "Inh_LAMP5" "Microglia" "Oligodendrocyte"
## [13] "OPC" "Inh_PAX6" "Pericyte" "Inh_PVALB"
## [17] "Inh_SST" "Inh_VIP" "VLMC"
# organize markers into a list, which is the format will need for next steps
marker_list = lapply(cell_types, function(cell_type){
return(marker_data %>% filter(subclass_label == cell_type) %>% pull(gene) %>% unlist())
})
names(marker_list) = cell_types
Run mgpEstimate to get cell type proportions for each bulk tissue sample.
# Run marker gene profile (MGP) analysis
estimations = mgpEstimate(
exprData = gene_mat,
genes = marker_list,
geneColName = 'gene_symbol',
outlierSampleRemove = FALSE, # should outlier samples removed. This is done using boxplot stats
geneTransform = NULL, # this is the default option for geneTransform
groups = NULL, # if there are experimental groups provide them here. if not desired set to NULL
seekConsensus = FALSE, # ensures gene rotations are positive in both of the groups
removeMinority = TRUE)
# get proportion estimates as data frame
estimations_df = as.data.frame(estimations$estimates) %>%
rownames_to_column(var = "geo_accession")
# proportions are unit-less and negative numbers are just a result of PCA
# we can scale proportions between 0-1 for visualization purposes
scale0 = function(x){
(x-min(x))/(max(x)-min(x))
}
estimations_df_norm = scale0(estimations_df[,-1])
estimations_df_norm = estimations_df_norm %>%
mutate(geo_accession = estimations_df$geo_accession) %>%
dplyr::select(geo_accession, everything())
# merge cell type proportions with sample metadata
mgp_df = inner_join(labonte_meta, estimations_df_norm, by = "geo_accession") %>%
# pivot longer so there is one row per sample and cell type
# this type of data structure is preferred by ggplot
pivot_longer(-colnames(labonte_meta),
names_to = "cell_type",
values_to = "cell_proportion")
# fix labels
mgp_df$cell_type = gsub("\\.", "/", mgp_df$cell_type)
# preview
kable(mgp_df[1:10,])
| geo_accession | expr_names | age | gender | pmi | rin | ph | phenotype | cell_type | cell_proportion |
|---|---|---|---|---|---|---|---|---|---|
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Astrocyte | 0.4190530 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Endothelial | 0.4236108 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_IT | 0.3883149 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_L4_IT | 0.4157877 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_L5_ET | 0.3699714 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_L5/6_IT_Car3 | 0.4012421 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_L5/6_NP | 0.3751158 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_L6_CT | 0.3622663 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Exc_L6b | 0.3989537 |
| GSM2740657 | X14.BA8_9 | 47 | Male | 12 | 7.8 | 6.49 | CTRL | Inh_LAMP5 | 0.4302652 |
Let’s take a look at some of the quality metrics we have available for MGP calculations when we use the mgpEstimate function.
We define our QC algorithm such that it returns a dataframe with the cell type, markers_used (list of marker genes used per cell type), removed_marker_ratios (list of removed marker ratios per cell type), and percent_variance_PC1 (list of variance explained by the first PC per cell type)
# loop through each cell type
for(i in 1:length(cell_types)){
# get the expression values of markers kept by mgpEstimate() per bulk sample
cells_df = estimations$usedMarkerExpression[i] %>% as.data.frame()
# get list of markers kept by mgpEstimate()
masterlist = paste0(rownames(cells_df), collapse=', ')
# number of markers kept by mgpEstimate()
num_markers = length(rownames(cells_df))
# ratio of markers removed
rm_marker_ratios = estimations$removedMarkerRatios[i]
# check if the trimmed PCs are not(!) empty (NULL)
if(!is.null(estimations$trimmedPCAs[[i]])){
# get the percent variance that each principal component (PC) captures in the data
percent_variance = ((summary(estimations$trimmedPCAs[[i]]))[6]) %>% as.data.frame()
# get the percent variance that the first principal component (PC1) captures in the data
percent_variance_PC1 = percent_variance[2,1]
}
else{
# otherwise, the set this to NA
percent_variance_PC1 = NA
}
# build a dataframe with all our metrics
# initialize dataframe if this is the first iteration of the for() loop
if(i==1){
master_df = data.frame("markers_used" = masterlist,
"removed_marker_ratios" = rm_marker_ratios,
"percent_variance_PC1" = percent_variance_PC1,
"num_markers" = num_markers)
}
# bind previous iterations results
else{
df = data.frame("markers_used" = masterlist,
"removed_marker_ratios" = rm_marker_ratios,
"percent_variance_PC1" = percent_variance_PC1,
"num_markers" = num_markers)
master_df = rbind(master_df, df)
}
}
QC_metrics = rownames_to_column(master_df, var = "celltype")
Let’s plot these QC metrics to get a better idea of whats going on in the mgpEstimate function.
QC_metrics %>% ggplot(aes(x = celltype, y = num_markers)) +
theme_minimal() +
geom_bar(stat = "identity", fill = "#e0abf5") +
geom_hline(yintercept = 4) +
labs(title="Plot of Number of Markers Per Celltype",
x="Cell Type", y = "Markers Used")+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
coord_flip()
We’ve drawn a line here where we consider there to be too few markers used to calculate the cell-type proportion estimate for it to be considered robust. Let’s look at another measure of the quality of the MGPs, the percent variance explained by PC (principal component) 1.
QC_metrics %>% ggplot(aes(x=celltype, y=percent_variance_PC1))+
geom_bar(stat = "identity", fill =ifelse(QC_metrics$percent_variance_PC1 > 0.35, "#AFEEEE", "#808080")) +
geom_hline(yintercept = 0.35) +
labs(title="Percent Variance Explained by Each MGP",
x="MGPs", y = "Percent Variance Explained by PC 1")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 90)) + coord_flip()
We consider more than 35% variance explained by PC1 (indicated by the horizontal line) to be a robust cell type proportion estimate. This means all the blue cell type proportion estimates (MGPs) are likely to be good surrogates for cell counts.
Now, let’s plot marker gene proportions (MGPs) vs. sample age
# create a vector of cell types to plot
plot_genes = c("Exc_IT","Inh_SST","Oligodendrocyte")
# create a list to store plots generated within the for loop
plot_list = list()
# loop through vector of genes to plot
for(i in 1:length(plot_genes)){
plot_list[[i]] = ggplot(
# subset by cell type
mgp_df %>% filter(cell_type == plot_genes[i]),
aes(x = age, y = cell_proportion, color = phenotype)) +
geom_smooth(method = "lm", se = F) +
geom_point() +
ggtitle(paste(plot_genes[i])) +
ylab(paste(plot_genes[i], " MGP")) + xlab ("sample age (years)") +
theme_bw()
}
# bring plots together
plot_grid(plotlist = plot_list, nrow = 1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
We will use R’s linear modeling function lm to fit a statistical model against each cell type proportion and a number of covariates, including gender, pH, RIN (RNA integrity number), PMI (post mortem interval), age, and phenotype (control / MDD).
The model form here is: cell_type_prop ~ phenotype + gender + ph + rin + pmi + age
To fit many models with broom, it’s useful to use pivot_longer to stack the data into one column - then group_by() to “split” the stacked data.
# Linear modeling
lm_df = mgp_df %>%
# each step performed after this line is done with each cell type
group_by(cell_type) %>%
# fit cell type proportions according to this model
# using the broom package to tidy the results
do(tidy(lm(scale(cell_proportion) ~ phenotype + gender + scale(pmi) + scale(ph) + scale(rin) + scale(age), data = .))) %>%
# ungroup the data
ungroup() %>%
# adjust for multiple comparisons using the Benjamini-Hochberg method
mutate(padj = p.adjust(`p.value`, method = 'BH')) %>%
# clean up variable names
mutate(term = recode(term,
`(Intercept)` = "Intercept",
`genderMale` = "gender",
`scale(rin)` = "rin",
`scale(pmi)` = "pmi",
`scale(ph)` = "ph",
`scale(age)` = "age",
`phenotypeMDD` = "MDD")) %>%
mutate(class = case_when(
str_detect(cell_type, "Inh") ~ "Inhibitory",
str_detect(cell_type, "Exc") ~ "Excitatory",
TRUE ~ "Non-Neuronal"
))
# print data frame for just MDD beta coefficients
kable(lm_df %>% filter(term == 'MDD'))
| cell_type | term | estimate | std.error | statistic | p.value | padj | class |
|---|---|---|---|---|---|---|---|
| Astrocyte | MDD | -0.1814554 | 0.2824241 | -0.6424927 | 0.5241301 | 0.7464357 | Non-Neuronal |
| Endothelial | MDD | -0.1048819 | 0.2890632 | -0.3628339 | 0.7185929 | 0.8115858 | Non-Neuronal |
| Exc_IT | MDD | -0.3973346 | 0.2481806 | -1.6009901 | 0.1170572 | 0.3706812 | Excitatory |
| Exc_L4_IT | MDD | 0.1088939 | 0.2665840 | 0.4084790 | 0.6850477 | 0.8115858 | Excitatory |
| Exc_L5/6_IT_Car3 | MDD | -0.0459116 | 0.2718776 | -0.1688687 | 0.8667303 | 0.9076782 | Excitatory |
| Exc_L5/6_NP | MDD | -0.4382747 | 0.2592019 | -1.6908622 | 0.0984528 | 0.3538978 | Excitatory |
| Exc_L5_ET | MDD | -0.3762635 | 0.2543732 | -1.4791788 | 0.1467354 | 0.4247805 | Excitatory |
| Exc_L6_CT | MDD | -0.2164139 | 0.2703507 | -0.8004932 | 0.4280394 | 0.7298620 | Excitatory |
| Exc_L6b | MDD | 0.0150031 | 0.2954186 | 0.0507858 | 0.9597428 | 0.9654755 | Excitatory |
| Inh_LAMP5 | MDD | -0.2134051 | 0.2241255 | -0.9521680 | 0.3465917 | 0.6880104 | Inhibitory |
| Inh_PAX6 | MDD | -0.2114222 | 0.2803350 | -0.7541769 | 0.4550529 | 0.7380736 | Inhibitory |
| Inh_PVALB | MDD | -0.1346649 | 0.2169222 | -0.6207982 | 0.5381684 | 0.7534358 | Inhibitory |
| Inh_SST | MDD | -0.1950828 | 0.1687478 | -1.1560613 | 0.2543484 | 0.5545628 | Inhibitory |
| Inh_VIP | MDD | -0.4210047 | 0.2534309 | -1.6612209 | 0.1042983 | 0.3650441 | Inhibitory |
| Microglia | MDD | -0.1511913 | 0.2963048 | -0.5102558 | 0.6126085 | 0.7910382 | Non-Neuronal |
| Oligodendrocyte | MDD | 0.1867213 | 0.2559009 | 0.7296625 | 0.4697449 | 0.7437628 | Non-Neuronal |
| OPC | MDD | -0.2178394 | 0.2853736 | -0.7633480 | 0.4496262 | 0.7380736 | Non-Neuronal |
| Pericyte | MDD | -0.1880874 | 0.2943114 | -0.6390762 | 0.5263279 | 0.7464357 | Non-Neuronal |
| VLMC | MDD | -0.2168155 | 0.2857047 | -0.7588797 | 0.4522654 | 0.7380736 | Non-Neuronal |
# print data frame for just age beta coefficients
kable(lm_df %>% filter(term == 'age'))
| cell_type | term | estimate | std.error | statistic | p.value | padj | class |
|---|---|---|---|---|---|---|---|
| Astrocyte | age | 0.2463604 | 0.1522435 | 1.6182001 | 0.1132880 | 0.3706812 | Non-Neuronal |
| Endothelial | age | 0.3306262 | 0.1558224 | 2.1218146 | 0.0399426 | 0.2043218 | Non-Neuronal |
| Exc_IT | age | 0.0526654 | 0.1337842 | 0.3936595 | 0.6958723 | 0.8115858 | Excitatory |
| Exc_L4_IT | age | -0.3500279 | 0.1437047 | -2.4357442 | 0.0192950 | 0.1243466 | Excitatory |
| Exc_L5/6_IT_Car3 | age | -0.5041654 | 0.1465583 | -3.4400332 | 0.0013496 | 0.0128211 | Excitatory |
| Exc_L5/6_NP | age | -0.0445024 | 0.1397254 | -0.3184994 | 0.7517230 | 0.8262741 | Excitatory |
| Exc_L5_ET | age | -0.0906562 | 0.1371224 | -0.6611337 | 0.5122249 | 0.7464357 | Excitatory |
| Exc_L6_CT | age | -0.1007912 | 0.1457352 | -0.6916050 | 0.4930834 | 0.7464357 | Excitatory |
| Exc_L6b | age | -0.2295658 | 0.1592483 | -1.4415589 | 0.1570218 | 0.4314175 | Excitatory |
| Inh_LAMP5 | age | -0.5033480 | 0.1208170 | -4.1662003 | 0.0001556 | 0.0029513 | Inhibitory |
| Inh_PAX6 | age | 0.0088710 | 0.1511174 | 0.0587025 | 0.9534743 | 0.9654755 | Inhibitory |
| Inh_PVALB | age | -0.5229318 | 0.1169341 | -4.4720226 | 0.0000602 | 0.0018698 | Inhibitory |
| Inh_SST | age | -0.6418557 | 0.0909652 | -7.0560600 | 0.0000000 | 0.0000018 | Inhibitory |
| Inh_VIP | age | -0.5961769 | 0.1366144 | -4.3639374 | 0.0000844 | 0.0018698 | Inhibitory |
| Microglia | age | 0.1600091 | 0.1597260 | 1.0017719 | 0.3223297 | 0.6495432 | Non-Neuronal |
| Oligodendrocyte | age | 0.5420832 | 0.1379459 | 3.9296794 | 0.0003198 | 0.0042527 | Non-Neuronal |
| OPC | age | 0.1343605 | 0.1538335 | 0.8734152 | 0.3875239 | 0.7172334 | Non-Neuronal |
| Pericyte | age | 0.2199408 | 0.1586515 | 1.3863141 | 0.1731442 | 0.4428497 | Non-Neuronal |
| VLMC | age | 0.2481138 | 0.1540120 | 1.6110036 | 0.1148519 | 0.3706812 | Non-Neuronal |
# beta coeffs per cell type for phenotype
beta_plot = lm_df %>%
filter(term %in% 'MDD') %>%
mutate(cell_type = fct_reorder(cell_type, estimate)) %>%
ggplot(aes(x = cell_type, y = estimate)) +
geom_hline(yintercept = 0) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error)) +
ggtitle("MDD vs. controls") +
ylab('Std. Beta coeff.') +
xlab('Cell type proportions') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
facet_wrap(~class, drop = T, scale = "free")
beta_plot
# beta coeffs per cell type for age effects
beta_plot = lm_df %>%
filter(term %in% 'age') %>%
mutate(cell_type = fct_reorder(cell_type, estimate)) %>%
ggplot(aes(x = cell_type, y = estimate)) +
geom_hline(yintercept = 0) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error)) +
ggtitle("Aging Effects") +
ylab('Std. Beta coeff.') +
xlab('Cell type proportions') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
facet_wrap(~class, drop = T, scale = "free")
beta_plot
That concludes the mgpEstimate tutorial.