Setup

Set working directory

knitr::opts_knit$set(root.dir = "/research/labs/neurology/fryer/m214960/aducanumab/scripts/R")

Load libraries

# load libraries
library(cowplot)    # plot_grid()
library(dplyr)      # left_join()
library(ggplot2)    # ggplot()
library(gridExtra)  # grid.arrange()
library(harmony)    # RunHarmony()
library(parallel)   # detectCores()
library(plotly)     # plot_ly()
library(Seurat)     # Read10X_h5()
library(ShinyCell)  # makeShinyApp()
library(stringr)    # str_match()

Set variables and thresholds

# variables
sample_order <- c("IgG.F.939A","IgG.F.939B","IgG.F.959B",
                  "IgG.M.823A","IgG.M.851A",
                  "Adu.F.736B","Adu.F.738A","Adu.F.738B",
                  "Adu.M.705A","Adu.M.734A")
group_order <- c("IgG","Adu")
sex_order <- c("Male","Female")
sample_colors <- c("gray","red","orange","yellow","green","forestgreen","cyan","blue","purple","pink")
group_colors <- c("gray","cornflowerblue")
sex_colors <- c("darkgray","purple")

# thresholds
nCount.min <- 200
nCount.max <- 10000
nFeature.min <- 100
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 20
hb.cutoff <- 3
ttr.cutoff <- 1

# set seed
set.seed(8)

# work in parallel
options(mc.cores = detectCores() - 1)

Save functions

These functions with help simultaneously save plots as a png and pdf.

saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}
saveToPNG <- function(...) {
    d = dev.copy(png,...)
    dev.off(d)
}

Load data

Merge h5

  • We are using CellBender filtered output with false positive rate of 0.05.
  • Two of the files say ‘pass2’ because the CellBender -total-droplets-included argument was adjusted.
prefix <- "../../cellbender/"
suffix <- "_fpr_0.05_filtered.h5"

if (file.exists("../../rObjects/mouse_merged_h5.rds")) {
  mouse <- readRDS("../../rObjects/mouse_merged_h5.rds")
} else {
  # individual sample objects
  IgG.F.939A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"939A_IgG_Female",suffix)))
  IgG.F.939B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"939B_IgG_Female",suffix)))
  IgG.F.959B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"959B_IgG_Female",suffix)))
  IgG.M.823A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"823A_IgG_Male",suffix)))
  IgG.M.851A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"851A_IgG_Male",suffix)))
  Adu.F.736B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"736B_Adu_Female",suffix)))
  Adu.F.738A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"738A_Adu_Female",suffix)))
  Adu.F.738B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"738B_Adu_Female",suffix)))
  Adu.M.705A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"705A_Adu_Male",suffix)))
  Adu.M.734A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"734A_Adu_Male",suffix)))

  # merge objects
  mouse <- merge(x = IgG.F.939A, 
                y = c(IgG.F.939B, IgG.F.959B, IgG.M.823A, IgG.M.851A,
                      Adu.F.736B, Adu.F.738A, Adu.F.738B, Adu.M.705A, Adu.M.734A), 
                add.cell.ids = c("IgG.F.939A","IgG.F.939B","IgG.F.959B","IgG.M.823A",
                                 "IgG.M.851A","Adu.F.736B","Adu.F.738A","Adu.F.738B",
                                 "Adu.M.705A","Adu.M.734A"), 
                project = "Aducanumab Mice scRNAseq")
  
  # cleanup and save
  remove(IgG.F.939A, IgG.F.939B, IgG.F.959B, IgG.M.823A, IgG.M.851A,
         Adu.F.736B, Adu.F.738A, Adu.F.738B, Adu.M.705A, Adu.M.734A)
  saveRDS(mouse, "../../rObjects/mouse_merged_h5.rds")
}

# preview
mouse
## An object of class Seurat 
## 32285 features across 70028 samples within 1 assay 
## Active assay: RNA (32285 features, 0 variable features)

Annotation

  • Gm* genes are originally annotated by MGI and the *Rik genes are annotated by RIKEN
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
  genes <- readRDS("../../rObjects/annotation.rds")
} else {
  gtf.file <- "../../refs/mouse_genes.gtf"
  genes <- rtracklayer::import(gtf.file)
  genes <- as.data.frame(genes)
  genes <- genes[genes$type == "gene",]
  saveRDS(genes, "../../rObjects/annotation.rds")
}

Metadata columns

nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique transcripts (features)

# create sample column
barcodes <- colnames(mouse)
sample <- str_match(barcodes, "(.+)_[ACGT]+-(\\d+)")[,2]
mouse$sample <- factor(sample, levels = sample_order)
Idents(mouse) <- mouse$sample

# sex column
sex <- str_match(mouse$sample, "[IgGAdu]+\\.([FM]).[0-9]+[AB]")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)

# group column
group <- str_match(mouse$sample, "([IgGAdu]+)\\.[FM].[0-9]+[AB]")[,2]
mouse$group <- factor(group, levels = group_order)

# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)

# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
##   [1] "Mrpl15"     "Mrpl30"     "Mrps9"      "Mrpl44"     "Mrps14"    
##   [6] "Mrpl41"     "Mrps2"      "Mrps5"      "Mrps26"     "Mrps28"    
##  [11] "Mrpl47"     "Mrpl24"     "Mrpl9"      "Mrps21"     "Mrpl50"    
##  [16] "Mrpl37"     "Mrps15"     "Mrpl20"     "Mrpl33"     "Mrpl1"     
##  [21] "Mrps18c"    "Mrps17"     "Mrps33"     "Mrpl35"     "Mrpl19"    
##  [26] "Mrpl53"     "Mrps25"     "Mrpl51"     "Mrps35"     "Mrps12"    
##  [31] "Mrpl46"     "Mrps11"     "Mrpl48"     "Mrpl17"     "Mrpl23"    
##  [36] "Mrps31"     "Mrpl34"     "Mrpl4"      "Mrps22"     "Mrpl3"     
##  [41] "Mrpl54"     "Mrpl42"     "Mrps24"     "Mrpl22"     "Mrpl55"    
##  [46] "Mrps23"     "Mrpl27"     "Mrpl10"     "Mrpl45"     "Mrpl58"    
##  [51] "Mrps7"      "Mrpl38"     "Mrpl12"     "Mrpl32"     "Mrpl36"    
##  [56] "Mrps27"     "Mrps36"     "Mrps30"     "Mrps16"     "Mrpl52"    
##  [61] "Mrpl57"     "Mrpl13"     "Mrpl40"     "Mrpl39"     "Mrps6"     
##  [66] "Mrpl18"     "Mrps34"     "Mrpl28"     "Mrps18b"    "Mrpl14"    
##  [71] "Mrps18a"    "Mrpl2"      "Mrps10"     "Mrpl21"     "Mrpl11"    
##  [76] "Mrpl49"     "Mrpl16"     "Mrpl43"     "Rpl7"       "Rpl31"     
##  [81] "Rpl37a"     "Rps6kc1"    "Rpl7a"      "Rpl12"      "Rpl35"     
##  [86] "Rps21"      "Rpl22l1"    "Rps3a1"     "Rps27"      "Rpl34"     
##  [91] "Rps20"      "Rps6"       "Rps8"       "Rps6ka1"    "Rpl11"     
##  [96] "Rpl22"      "Rpl9"       "Rpl5"       "Rplp0"      "Rpl6"      
## [101] "Rpl21"      "Rpl32"      "Rps9"       "Rpl28"      "Rps5"      
## [106] "Rps19"      "Rps16"      "Rps11"      "Rpl13a"     "Rpl18"     
## [111] "Rps17"      "Rps3"       "Rpl27a"     "Rps13"      "Rps15a"    
## [116] "Rplp2"      "Rpl18a"     "Rpl13"      "Rps25"      "Rpl10-ps3" 
## [121] "Rplp1"      "Rpl4"       "Rps27l"     "Rpl29"      "Rps27rt"   
## [126] "Rpsa"       "Rpl14"      "Rps12"      "Rps15"      "Rpl41"     
## [131] "Rps26"      "Rps27a"     "Rpl26"      "Rpl23a"     "Rpl9-ps1"  
## [136] "Rps6kb1"    "Rpl23"      "Rpl19"      "Rpl27"      "Rpl38"     
## [141] "Rps7"       "Rpl10l"     "Rps29"      "Rpl36al"    "Rps6kl1"   
## [146] "Rps6ka5"    "Rps23"      "Rpl15"      "Rps24"      "Rpl36a-ps1"
## [151] "Rpl37"      "Rpl30"      "Rpl8"       "Rpl3"       "Rps19bp1"  
## [156] "Rpl39l"     "Rpl35a"     "Rpl24"      "Rps6ka2"    "Rps2"      
## [161] "Rpl3l"      "Rps10"      "Rpl10a"     "Rps28"      "Rps18"     
## [166] "Rpl7l1"     "Rpl36"      "Rpl36-ps4"  "Rps14"      "Rpl17"     
## [171] "Rps6kb2"    "Rps6ka4"    "Rpl9-ps6"   "Rpl39"      "Rpl10"     
## [176] "Rps4x"      "Rps6ka6"    "Rpl36a"     "Rps6ka3"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"
# percent Ttr
mouse$percent.ttr <- PercentageFeatureSet(mouse, features = "Ttr")

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")

ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,20000, by = 2000), limits = c(0,20000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = ribo.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Violin plots

# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v1

#  percent violins
v2 <- VlnPlot(mouse,
              features = c("percent.mt","percent.ribo.protein","percent.hb","percent.ttr"),
              ncol = 4,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v2

Scatter plots

s1 <- ggplot(
  mouse@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_hline(yintercept = nFeature.min) + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula 'y ~ x'

s2 <- FeatureScatter(mouse,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
s2

Filtering

Cell-level filtering

# filter
mouse.filtered <- subset(mouse,
                        subset = (nCount_RNA > nCount.min) &
                          (nCount_RNA < nCount.max) &
                          (nFeature_RNA > nFeature.min) &
                          (cell.complexity > complexity.cutoff) &
                          (percent.mt < mt.cutoff) &
                          (percent.ribo.protein < ribo.cutoff) &
                          (percent.hb < hb.cutoff) &
                          (percent.ttr < ttr.cutoff))

# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "18965 cells removed"

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
nonzero <- counts > 0  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "11246 features removed"

Multiple passes were made to assess whether mitochondrial, ribosomal, and immunoglobulin genes should be removed. Ultimately, removal of these genes enhanced clustering.

# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]

# remove ribo.genes
#keep <- !rownames(counts.filtered) %in% ribo.combined
#counts.filtered <- counts.filtered[keep,]

# remove Ig genes + Jchain but keep Igha + Ighd to enahnce clustering
#gene.types <- c("IG_C_gene","IG_C_pseudogene","IG_D","IG_J_gene","IG_LV_gene",
#                "IG_V_gene","IG_V_pseudogene")
#keep <- (genes$gene_type) %in% gene.types
#ig.genes <- genes[keep,]
#ig.genes <- c(ig.genes$gene_name, "Jchain")
#ig.genes <- ig.genes[-c(185,192)] # keep Igha and Ighd
#ig.genes
#keep <- !rownames(counts.filtered) %in% ig.genes
#counts.filtered <- counts.filtered[keep,]

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "13 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero)

Post-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")

ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,20000, by = 2000), limits = c(0,20000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))

# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = ribo.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Violin plots

# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v3

#  percent violins
v4 <- VlnPlot(mouse.filtered,
              features = c("percent.mt","percent.ribo.protein","percent.hb","percent.ttr"),
              ncol = 4,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v4

Scatter plots

s3 <- ggplot(
  mouse.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_hline(yintercept = nFeature.min) + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula 'y ~ x'

s4 <- FeatureScatter(mouse.filtered,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
s4

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Sample")
b1

Top transcripts

df <- data.frame(row.names = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum, decreasing = TRUE),]
colnames(df)[1] <- "rsum_raw_count"
head(df, 30)
##            rsum_raw_count  gene_name
## Malat1            8852580     Malat1
## Meg3               803471       Meg3
## Gm42418            632351    Gm42418
## Snhg11             589849     Snhg11
## Cst3               543699       Cst3
## Srrm2              171775      Srrm2
## Apoe               169222       Apoe
## Kcnq1ot1           149220   Kcnq1ot1
## Gria2              138148      Gria2
## R3hdm1             136552     R3hdm1
## Nrxn1              122678      Nrxn1
## Ctsd               117293       Ctsd
## Ctss               114740       Ctss
## Gm26917            114606    Gm26917
## Ahi1               111865       Ahi1
## Rian               107692       Rian
## Ptprd              103415      Ptprd
## Hexb               102375       Hexb
## C1qa               101719       C1qa
## C1qb                99022       C1qb
## Macf1               95535      Macf1
## Syne1               90737      Syne1
## Syt1                90044       Syt1
## Ank2                86509       Ank2
## Mycbp2              85895     Mycbp2
## Snhg14              85811     Snhg14
## AC149090.1          85717 AC149090.1
## Celf2               79533      Celf2
## Son                 79097        Son
## Ube3a               77287      Ube3a

Unwanted variation

Cell cycle

Score

  • cell cycle scoring guide
  • The most common biological data correction is to remove the effects of the cell cycle on the transcriptome.
  • Raw counts are not comparable between cells. Each cell has a different nCount_RNA. The log normalization, ((nCount_RNA / nFeature_RNA) * log1p, is taken in order to explore variation.
  • We will not ultimately use Seurat’s method of normalization; we will use SCTransform.
# log normalization
mouse.phase <- NormalizeData(mouse.filtered)
  • Each cell is given score based on expression of G1, G2/M, and S phase markers.
  • G1: ~10 hrs, S: ~5-6 hrs, G2: ~3-4 hrs, M: ~2 hrs
  • G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
  • If the score is negative for both S.Score and G2M.Score the phase is G1. Otherwise, the the greatest positive value between S.Score and G2M.Score determines the phase.
# load mouse cell cycle markers
phase.markers <- read.delim("../../refs/mouse_cell_cycle.txt")
colnames(phase.markers)[2] <- "gene_id"
phase.markers <- left_join(phase.markers, genes[,c(10,13)], by = "gene_id")
g2m <- phase.markers[phase.markers$phase == "G2/M", 4]
g2m
##  [1] "Ube2c"   "Lbr"     "Ctcf"    "Cdc20"   "Cbx5"    "Kif11"   "Anp32e" 
##  [8] "Birc5"   "Cdk1"    "Tmpo"    "Hmmr"    "Jpt1"    "Pimreg"  "Aurkb"  
## [15] "Top2a"   "Gtse1"   "Rangap1" "Cdca3"   "Ndc80"   "Kif20b"  "Cenpf"  
## [22] "Nek2"    "Nuf2"    "Nusap1"  "Bub1"    "Tpx2"    "Aurka"   "Ect2"   
## [29] "Cks1b"   "Kif2c"   "Cdca8"   "Cenpa"   "Mki67"   "Ccnb2"   "Kif23"  
## [36] "Smc4"    "G2e3"    "Tubb4b"  "Anln"    "Tacc3"   "Dlgap5"  "Ckap2"  
## [43] "Ncapd2"  "Ttk"     "Ckap5"   "Cdc25c"  "Hjurp"   "Cenpe"   "Ckap2l" 
## [50] "Cdca2"   "Hmgb2"   "Cks2"    "Psrc1"   "Gas2l3"
s <- phase.markers[phase.markers$phase == "S", 4]
s
##  [1] "Cdc45"    "Uhrf1"    "Mcm2"     "Slbp"     "Mcm5"     "Pola1"   
##  [7] "Gmnn"     "Cdc6"     "Rrm2"     "Atad2"    "Dscc1"    "Mcm4"    
## [13] "Chaf1b"   "Rfc2"     "Msh2"     "Fen1"     "Hells"    "Prim1"   
## [19] "Tyms"     "Mcm6"     "Wdr76"    "Rad51"    "Pcna"     "Ccne2"   
## [25] "Casp8ap2" "Usp1"     "Nasp"     "Rpa2"     "Ung"      "Rad51ap1"
## [31] "Blm"      "Pold3"    "Rrm1"     "Cenpu"    "Gins2"    "Tipin"   
## [37] "Brip1"    "Dtl"      "Exo1"     "Ubr7"     "Clspn"    "E2f8"    
## [43] "Cdca7"
# write table
write.table(phase.markers, 
            "../../results/ten_samples/cellcycle/mouse_cell_cycle_phase_markers.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# score cells
mouse.phase <- CellCycleScoring(mouse.phase,
                                g2m.features = g2m,
                                s.features = s,
                                set.ident = TRUE)
mouse.filtered[["phase"]] <- mouse.phase$Phase

Top variable genes

Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.

# Identify the most variable genes
mouse.phase <- FindVariableFeatures(mouse.phase, verbose = FALSE)

# Preview top 40
head(VariableFeatures(mouse.phase), 40)
##  [1] "Spp1"     "Apoe"     "Cd74"     "Lyz2"     "Ptgds"    "Ccl4"    
##  [7] "Tmsb4x"   "Ccl12"    "Flt1"     "Fth1"     "Bsg"      "H2-Aa"   
## [13] "Cxcl12"   "Cldn5"    "H2-Ab1"   "H2-Eb1"   "Plp1"     "Rgs5"    
## [19] "Cxcl10"   "Ccl3"     "Cst7"     "Ctsd"     "Vtn"      "Ftl1"    
## [25] "Rps29"    "Mal"      "Sst"      "Actb"     "Camk2n1"  "Ccl5"    
## [31] "Fau"      "Tyrobp"   "Slco1a4"  "Itm2a"    "Eef1a1"   "Ifi27l2a"
## [37] "Cryab"    "Selenow"  "Vip"      "Ifitm3"
# Scale the counts
mouse.phase <- ScaleData(mouse.phase)
## Centering and scaling data matrix

PCA

If the PCA plots for each phase do not look similar you may want to regress out variation due to cell cycle phase. Otherwise, nothing needs to be done. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

# Run PCA
mouse.phase <- RunPCA(mouse.phase, nfeatures.print = 10)
## PC_ 1 
## Positive:  R3hdm1, Atp2b1, Phactr1, Pde10a, Gria1, Homer1, Cit, Gm3764, Slc17a7, Pex5l 
## Negative:  Cst3, Tmsb4x, Ctsd, C1qb, C1qa, Itm2b, C1qc, Ctss, Hexb, Tyrobp 
## PC_ 2 
## Positive:  Cx3cr1, Gpr34, Csf1r, Hexb, Lgmn, C1qc, Ctss, Lpcat2, C1qa, Tgfbr1 
## Negative:  Cox8a, Selenow, Rpl9, Cox4i1, Dbi, Mt3, Chchd2, Calm1, Rpl38, Rps19 
## PC_ 3 
## Positive:  Atp1a2, Slc1a2, Slc1a3, Ntsr2, Ptprz1, Qk, Bcan, Appl2, Ndrg2, Sparcl1 
## Negative:  R3hdm1, Atp2b1, Homer1, Phactr1, Pde10a, Slc17a7, Plk2, Gria1, Cit, A830018L16Rik 
## PC_ 4 
## Positive:  Camk2n1, Mt3, Pcsk1n, Selenow, Cox8a, Cpe, Dbi, Aldoc, 2900097C17Rik, Cox6c 
## Negative:  Flt1, Ahnak, Slco1a4, Rgs5, Pltp, Cldn5, Ly6a, Cxcl12, Ptprb, Egfl7 
## PC_ 5 
## Positive:  Plp1, Mag, Mbp, Enpp2, Mal, Gjc3, Aspa, Gatm, Ptgds, Sept4 
## Negative:  Ntsr2, Gm6145, Gm3764, Slc1a2, Nwd1, Phkg1, Fgfr3, Slc1a3, Atp13a4, Slc7a10
# Plot
pca1 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "Phase",
               split.by = "Phase")
pca1

pca2 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "Phase",
               shuffle = TRUE)
pca2

Bar graph

data <- as.data.frame(table(mouse.phase$Phase))
colnames(data) <- c("Phase","frequency")

ncells3 <- ggplot(data, aes(x = Phase, y = frequency, fill = Phase)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  ggtitle("Cells per phase")
ncells3

Percent cells per phase

percent.phase <- mouse.phase@meta.data %>%
  group_by(sample, Phase) %>%
  dplyr::count() %>%
  group_by(sample) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sample, y = percent, fill = Phase)) +
  geom_col() +
  ggtitle("Percentage of phase per sample") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
percent.phase

Mitochondrial fraction

PCA

Evaluating effects of mitochondrial expression

# Check quartile values and store
summary(mouse.phase$percent.mt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  1.4398  0.7092 19.9800
first <- as.numeric(summary(mouse.phase$percent.mt)[2])
mean <- as.numeric(summary(mouse.phase$percent.mt)[4])
third <- as.numeric(summary(mouse.phase$percent.mt)[5])

# Turn percent.mt into factor based on quartile value
mouse.phase@meta.data$mito.factor <- cut(mouse.phase@meta.data$percent.mt, 
                   breaks=c(-Inf, first, mean, third, Inf), 
                   labels=c("Low","Medium","Medium high", "High"))
mouse.filtered[["mito.factor"]] <- mouse.phase$mito.factor

# Plot
pca1 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "mito.factor",
               split.by = "mito.factor")
pca1

pca2 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "mito.factor",
               shuffle = TRUE)
pca2

Bar graph

data <- as.data.frame(table(mouse.phase$mito.factor))
colnames(data) <- c("mito.factor","frequency")

ncells4 <- ggplot(data, aes(x = mito.factor, y = frequency, fill = mito.factor)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  ggtitle("Cells per mitochondria level")
ncells4

Percent cells per quartile

percent <- mouse.phase@meta.data %>%
  group_by(sample, mito.factor) %>%
  dplyr::count() %>%
  group_by(sample) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sample, y = percent, fill = mito.factor)) +
  geom_col() +
  ggtitle("Mitochondrial fraction per sample") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
percent

SCTransform

  • Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model.

  • Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase or percent.mito and so we chose not to regress this out of our data.

  • Since we have 8 samples in our dataset we want to keep them as separate objects and transform them as that is what is required for integration.

# split
mouse.split <- SplitObject(mouse.filtered, split.by = "sample")
  • We will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.

  • Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb).

options(future.globals.maxSize = 4000 * 1024^5)

for (i in 1:length(mouse.split)) {
  print(paste0("Sample ", i))
  mouse.split[[i]] <- SCTransform(mouse.split[[i]], 
                                  verbose = FALSE, 
                                  vars.to.regress = "percent.mt")
}
## [1] "Sample 1"
## [1] "Sample 2"
## [1] "Sample 3"
## [1] "Sample 4"
## [1] "Sample 5"
## [1] "Sample 6"
## [1] "Sample 7"
## [1] "Sample 8"
## [1] "Sample 9"
## [1] "Sample 10"
remove(mouse.filtered)
  • NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument. Additionally the last line of output specifies “Set default assay to SCT”.
  • It is suggested to not regress out batch, and instead use a data integration method: github link

Integration

Run harmony

  • Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.
  • To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.
  • Now, using our SCTransform object as input, let’s perform the integration across conditions.
  • First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.
# Choose the features to use when integrating multiple data sets 
# We will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = mouse.split, 
                                          nfeatures = 3000)

# merge the mouse
mouse.merged <- merge(x = mouse.split[[1]],
                      y = c(mouse.split[[2]], mouse.split[[3]], 
                            mouse.split[[4]], mouse.split[[5]], 
                            mouse.split[[6]], mouse.split[[7]],
                            mouse.split[[8]], mouse.split[[9]],
                            mouse.split[[10]]))

# define the variable features 
VariableFeatures(mouse.merged) <- var.features

# run PCA on the merged object
mouse.merged <- RunPCA(object = mouse.merged, assay = "SCT")

# harmony dimensional reduction
mouse.integrated <- RunHarmony(object = mouse.merged,
                               group.by.vars = "sample", 
                               assay.use = "SCT",
                               reduction = "pca", 
                               plot_convergence = TRUE)

# save and cleanup
saveRDS(mouse.integrated, "../../rObjects/mouse_ten_samples_integrated.rds")
remove(mouse.split, var.features, mouse.merged)

Check output

# Reset idents and levels
DefaultAssay(mouse.integrated) <- "SCT"
Idents(mouse.integrated) <- "sample"
mouse.integrated$sample <- factor(mouse.integrated$sample, 
                                  levels = sample_order)
mouse.integrated$sex <- factor(mouse.integrated$sex, 
                               levels = sex_order)
mouse.integrated$group <- factor(mouse.integrated$group,
                                 levels = group_order)

# check PCA
p1 <- DimPlot(object = mouse.integrated, 
              reduction = "harmony",
              group.by = "sample",
              cols = sample_colors,
              shuffle = TRUE) + NoLegend()
p1

p2 <- VlnPlot(object = mouse.integrated, 
              features = "harmony_1", 
              group.by = "sample", 
              pt.size = 0, 
              cols = sample_colors) + NoLegend()
p2

Top variable features

Top 20 variable features

top20 <- mouse.integrated@assays$SCT@var.features[1:20]
top20
##  [1] "Cst3"    "Apoe"    "Ctss"    "Tmsb4x"  "Ctsd"    "Fth1"    "C1qa"   
##  [8] "C1qb"    "Hexb"    "Gm42418" "Plp1"    "C1qc"    "Cst7"    "Snhg11" 
## [15] "Trem2"   "Cdr1os"  "Ptgds"   "Itm2b"   "Mbp"     "Ctsz"

PCA plot

After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.

To generate these visualizations with the harmony output, use reduction = “harmony”

# Plot PCA
pca1 <- DimPlot(mouse.integrated,
                reduction = "harmony",
                ncol = 3,
                split.by = "sample",
                group.by = "sample",
                cols = sample_colors)
pca1

pca2 <- DimPlot(mouse.integrated,
                reduction = "harmony",
                split.by = "group",
                group.by = "group",
                cols = group_colors)
pca2

pca3 <- DimPlot(mouse.integrated,
                reduction = "harmony",
                split.by = "sex",
                group.by = "sex",
                cols = sex_colors)
pca3

Clustering

Find significant PCs

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# Printing out the most variable genes driving PCs
print(x = mouse.integrated[["pca"]], 
      dims = 1:10,
      nfeatures = 10)
## PC_ 1 
## Positive:  Cst3, C1qa, Ctss, C1qb, Hexb, Ctsd, C1qc, Itm2b, Trem2, Fcer1g 
## Negative:  Gm42418, R3hdm1, Ptprd, Gria2, Ahi1, Gm26917, Kcnq1ot1, Rian, Syt1, Nrxn1 
## PC_ 2 
## Positive:  Tmsb4x, Fth1, Rpl13, Rps24, Eef1a1, Rps29, Fau, Rpl19, Rplp1, Tpt1 
## Negative:  Hexb, Ctss, C1qb, Cst3, C1qa, Gpr34, Ctsd, C1qc, Cx3cr1, Csf1r 
## PC_ 3 
## Positive:  Slc1a2, Atp1a2, Slc1a3, Qk, Ptprz1, Bcan, Apoe, Appl2, Ntsr2, Ndrg2 
## Negative:  R3hdm1, Ahi1, Syt1, Rian, Arpp21, Snhg11, Celf2, Snhg14, Atp2b1, Trank1 
## PC_ 4 
## Positive:  Plp1, Mbp, Neat1, Mag, Enpp2, P2ry12, Cx3cr1, Qk, Ptgds, Gatm 
## Negative:  Apoe, Cst7, Lyz2, Ctsb, Ctsz, Tyrobp, Cd63, Ctsl, Trem2, Slc1a2 
## PC_ 5 
## Positive:  Plp1, Mbp, Neat1, Apoe, Mag, Cst7, Enpp2, Lyz2, Trf, Ptgds 
## Negative:  Cx3cr1, Slc1a2, P2ry12, Tmem119, Selplg, Serinc3, Lpcat2, Cst3, Csf1r, Ifngr1 
## PC_ 6 
## Positive:  Tmsb4x, Actb, Marcks, Rgs10, Pfn1, Fau, Aif1, Rps9, Ftl1, Sh3bgrl3 
## Negative:  Gm42418, Camk2n1, Pcsk1n, Calm1, Selenow, Mt3, Cpe, Cox8a, Dbi, 2900097C17Rik 
## PC_ 7 
## Positive:  Gm42418, Bsg, Flt1, Ahnak, H2-D1, Clec2d, H2-K1, Ifitm3, Slc2a1, Myl6 
## Negative:  Cst3, Plp1, C1qb, Mbp, C1qa, Ctss, Slc1a2, Neat1, Hexb, C1qc 
## PC_ 8 
## Positive:  Gm42418, AY036118, Gm26917, C130073E24Rik, Rgs9, Erbb4, Dpp6, Usp29, Cacna2d2, Pde10a 
## Negative:  R3hdm1, Mef2c, Slc17a7, Ptprd, Arpp21, Camk2a, A830036E02Rik, Gpm6b, Homer1, Gria3 
## PC_ 9 
## Positive:  Gm42418, Snhg11, Tcf4, Ptprd, Miat, Nrxn1, Peg3, Ntng1, Cpne7, Elavl2 
## Negative:  Rgs9, Pde10a, Phactr1, Gnal, Foxp1, Meis2, Pde1b, Dgkb, Cacna2d3, Unc13c 
## PC_ 10 
## Positive:  Bsg, Flt1, Ahnak, Ptma, Slc2a1, Ifitm3, Atp1a2, Ptn, Spock2, Myl6 
## Negative:  Apoe, Fth1, Camk2n1, Cyth4, Gm42418, Pcsk1n, Hpgds, Mafb, Cst7, Maf

Quantitative approach to an elbow plot
- The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation.
- The point where the percent change in variation between the consecutive PCs is less than 0.1%.

First metric

# Determine percent of variation associated with each PC
stdv <- mouse.integrated[["pca"]]@stdev
sum.stdv <- sum(mouse.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100

# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)

# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 41

Second metric

# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
  (percent.stdv[1:length(percent.stdv) - 1] - 
     percent.stdv[2:length(percent.stdv)]) > 0.1), 
  decreasing = T)[1] + 1

# last point where change of % of variation is more than 0.1%.
co2
## [1] 15

Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.

# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 15

Elbow plot

Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

UMAP

# Run UMAP
mouse.integrated <- RunUMAP(mouse.integrated,
                           dims = 1:min.pc,
                           reduction = "harmony",
                           n.components = 3) # set to 3 to use with VR

# plot UMAP
DimPlot(mouse.integrated,
        shuffle = TRUE)

Find clusters

Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].

We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.

The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.

# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.integrated,
                                   assay = "SCT", # set as default after SCTransform
                                   reduction = "harmony",
                                   dims = 1:min.pc)

# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
                                  algorithm = 1, # 1 = Louvain
                                  resolution = seq(0.1,0.5,by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 51063
## Number of edges: 1617833
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9621
## Number of communities: 7
## Elapsed time: 17 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 51063
## Number of edges: 1617833
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9435
## Number of communities: 9
## Elapsed time: 19 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 51063
## Number of edges: 1617833
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9273
## Number of communities: 9
## Elapsed time: 19 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 51063
## Number of edges: 1617833
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9152
## Number of communities: 15
## Elapsed time: 20 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 51063
## Number of edges: 1617833
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9053
## Number of communities: 16
## Elapsed time: 20 seconds
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.3

Explore resolutions

# 0.1
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE)

# 0.2
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE)

# 0.3
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE)

# 0.4
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE)

# 0.5
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.5",
        label = TRUE)

3D UMAP

embeddings <- mouse.unannotated@reductions$umap@cell.embeddings
embeddings <- cbind(embeddings, as.character(mouse.unannotated$seurat_clusters))
colnames(embeddings)[4] <- "seurat_clusters"
embeddings <- as.data.frame(embeddings)
embeddings$seurat_clusters <- factor(embeddings$seurat_clusters,
                                     levels = c("0","1","2","3","4",
                                                "5","6","7","8"))
cluster_colors <- c("gray","red1","yellow","green", "darkgreen","cyan",
                    "blue","plum1","magenta1")

three.dim <- plot_ly(embeddings,
                     x = ~UMAP_1, 
                     y = ~UMAP_2, 
                     z = ~UMAP_3, 
                     color = ~seurat_clusters,
                     colors = cluster_colors,
                     size = 1) 
three.dim <- three.dim %>% add_markers() 
three.dim <- three.dim %>% layout(scene = list(xaxis = list(title = 'UMAP_1'), 
                                     yaxis = list(title = 'UMAP_2'), 
                                     zaxis = list(title = 'UMAP_3')))
three.dim

Clustering QC

Split UMAPs

# not split
Idents(mouse.unannotated) <- "SCT_snn_res.0.3"
u0 <- DimPlot(mouse.unannotated,
              label = FALSE,
              cols = cluster_colors)
u0

# sample
u1 <- DimPlot(mouse.unannotated,
              label = FALSE,
              cols = cluster_colors,
              split.by = "sample",
              ncol = 3)
u1

# group
u2 <- DimPlot(mouse.unannotated,
              label = FALSE, 
              cols = cluster_colors,
              split.by = "group")
u2

# sex
u3 <- DimPlot(mouse.unannotated,
              label = FALSE, 
              cols = cluster_colors,
              split.by = "sex")
u3

# phase
u4 <- DimPlot(mouse.unannotated,
              label = FALSE, 
              cols = cluster_colors,
              split.by = "phase")
u4

# mito.factor
u5 <- DimPlot(mouse.unannotated,
              label = FALSE, 
              cols = cluster_colors,
              split.by = "mito.factor",ncol = 2)
u5

Revisit QC metrics

# nCount
f1 <- FeaturePlot(mouse.unannotated, 
                  features = "nCount_RNA",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f1

# nFeature
f2 <- FeaturePlot(mouse.unannotated, 
                  features = "nFeature_RNA",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f2

# percent.mt
f3 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.mt",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f3

# cell.complexity
f4 <- FeaturePlot(mouse.unannotated, 
                  features = "cell.complexity",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f4

# percent.ribo
f5 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.ribo.protein",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f5

# percent.hb
f6 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.hb",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f6

# percent.ttr
f7 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.ttr",
                  pt.size = 0.4, 
                  order = TRUE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f7

Percent cells per cluster

# sample
b1 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, sample) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
  theme_classic() +
  geom_col() +
  scale_fill_manual(values = sample_colors) +
  ggtitle("Percentage of sample per cluster")
b1

# group
b2 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, group) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=group)) +
  theme_classic() +
  geom_col() +
  scale_fill_manual(values = group_colors) +
  ggtitle("Percentage of group per cluster")
b2

# sex
b3 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, sex) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=sex)) +
  theme_classic() +
  geom_col() +
  scale_fill_manual(values = sex_colors) +
  ggtitle("Percentage of sex per cluster")
b3

# phase
b4 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, phase) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=phase)) +
  theme_classic() +
  geom_col() +
  ggtitle("Percentage of phase per cluster")
b4

# mito.factor
b5 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, mito.factor) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=mito.factor)) +
  theme_classic() +
  geom_col() +
  ggtitle("Percentage of mito.factor per cluster")
b5

Number cells per cluster

# sample
sample_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "sample")) %>%
  dplyr::count(ident, sample) %>%
  tidyr::spread(ident, n)
sample_ncells
##        sample    0    1    2    3   4   5   6   7   8
## 1  IgG.F.939A 1146 1351  932  586 327 344 661 294 216
## 2  IgG.F.939B  158   62  286   50   8   8  44  82  11
## 3  IgG.F.959B  238   99  421   72  20  15  78 137  11
## 4  IgG.M.823A 4597 2556 1194 1607 923 449 361 330 242
## 5  IgG.M.851A 5245 2995 1321 1951 861 724 595 313 252
## 6  Adu.F.736B  153   60  215   31  12   6  43  77   5
## 7  Adu.F.738A   16    4   28    5   1   2   6  18   1
## 8  Adu.F.738B  574  292  824  169  50  46  88 266  33
## 9  Adu.M.705A 1702  950  804  771 403 407 200 551 193
## 10 Adu.M.734A 2294 1573 1102 1009 704 504 316 146 236
write.table(sample_ncells, 
            "../../results/ten_samples/ncells/cells_per_cluster_per_sample_uannotated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# group
group_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "group")) %>%
  dplyr::count(ident, group) %>%
  tidyr::spread(ident, n)
group_ncells
##   group     0    1    2    3    4    5    6    7   8
## 1   IgG 11384 7063 4154 4266 2139 1540 1739 1156 732
## 2   Adu  4739 2879 2973 1985 1170  965  653 1058 468
write.table(group_ncells, 
            "../../results/ten_samples/ncells/cells_per_cluster_per_group_unannotated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# sex
sex_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "sex")) %>%
  dplyr::count(ident, sex) %>%
  tidyr::spread(ident, n)
sex_ncells
##      sex     0    1    2    3    4    5    6    7   8
## 1   Male 13838 8074 4421 5338 2891 2084 1472 1340 923
## 2 Female  2285 1868 2706  913  418  421  920  874 277
write.table(sex_ncells, 
            "../../results/ten_samples/ncells/cells_per_cluster_per_sex_unannoated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# mito.factor
mito.factor_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "mito.factor")) %>%
  dplyr::count(ident, mito.factor) %>%
  tidyr::spread(ident, n)
mito.factor_ncells
##   mito.factor     0    1    2    3    4    5    6    7   8
## 1        High   635  577 4375  339  103  599 2072 1103 294
## 2         Low 14055 7279  666 4990 2789 1015   91  100 516
## 3      Medium  1176 1653 1267  730  323  647   73  654 278
## 4 Medium high   257  433  819  192   94  244  156  357 112
write.table(mito.factor_ncells, 
            "../../results/ten_samples/ncells/cells_per_cluster_per_mito_factor_unannotated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# phase
phase_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "phase")) %>%
  dplyr::count(ident, phase) %>%
  tidyr::spread(ident, n)
phase_ncells
##       phase    0    1    2    3    4    5    6    7   8
## 1        G1 5657 3550 4032 2470 1425  783 1254 1116 422
## 2       G2M 4891 2999 1505 2436  811 1117  699  600 426
## 3         S 5575 3391 1590 1345 1073  605  439  498 352
## 4 Undecided   NA    2   NA   NA   NA   NA   NA   NA  NA
write.table(phase_ncells, 
            "../../results/ten_samples/ncells/cells_per_cluster_per_phase_unannotated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

Potential Markers

Astrocytes

DefaultAssay(mouse.unannotated) <- "RNA"
mouse.unannotated <- NormalizeData(mouse.unannotated)
Idents(mouse.unannotated) <- "SCT_snn_res.0.3"
VlnPlot(mouse.unannotated,
        features = "Aqp4",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Gfap",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Gja1",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

Endothelial cells

VlnPlot(mouse.unannotated,
        features = "Pecam1",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cdh5",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Kdr",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Flt1",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

Fibroblasts

VlnPlot(mouse.unannotated,
        features = "Col1a1",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Col1a2",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Dcn",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

Macrophage/Microglia

VlnPlot(mouse.unannotated,
        features = "Tmem119",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Itgam", # aka Cd11b
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd14",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd68",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Ccr5",
        cols = cluster_colors,
        group.by = "SCT_snn_res.0.3")

Neurons

VlnPlot(mouse.unannotated,
        features = "Gad1",
        cols = cluster_colors,
        split.by = "seurat_clusters")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.

VlnPlot(mouse.unannotated,
        features = "Gad2",
        cols = cluster_colors,
        split.by = "seurat_clusters")

VlnPlot(mouse.unannotated,
        features = "Slc32a1",
        cols = cluster_colors,
        split.by = "seurat_clusters")

Oligodendrocyte precursor cells

VlnPlot(mouse.unannotated,
        features = "Cspg5",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Gpr17",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Olig1",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

Pericytes & SMCs

VlnPlot(mouse.unannotated,
        features = "Acta2", # actin alpha 2, smooth muscle
        cols = cluster_colors)

T-cell

  • Trac/Cd3d/Cd3e/Cd3g components of the TCR
VlnPlot(mouse.unannotated,
        features = "Trac",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd3d",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd3e",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd3g",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd8a",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

VlnPlot(mouse.unannotated,
        features = "Cd4",
        cols = cluster_colors,
        split.by = "SCT_snn_res.0.3")

Shiny App

Cleanup object

mouse.unannotated@assays$RNA@var.features <- mouse.unannotated@assays$SCT@var.features
metadata <- mouse.unannotated@meta.data
metadata <- metadata[,c(1,21,2:20)]
mouse.unannotated@meta.data <- metadata
mouse.unannotated@assays$SCT@meta.features <- metadata
mouse.unannotated@assays$RNA@meta.features <- metadata

Output directory

# make shiny folder
DefaultAssay(mouse.unannotated) <- "RNA"
Idents(mouse.unannotated) <- mouse.unannotated$seurat_clusters
sc.config <- createConfig(mouse.unannotated)
makeShinyApp(mouse.unannotated, sc.config, gene.mapping = TRUE,
             shiny.title = "ten_samples")