Background

Tanner’s notes

  • Based on findings in earlier paper, Clu knockout mice expressing APP saw a unique amyloid pathology where plaques moved from brain parenmycha (where they are usually found) to within blood vessels. Using a cell extraction method to enrich for vasculature and mural cells, we investigate using single cell RNA sequencing, the transcriptional states of these cells across differing Clu and APP genotypes. Mural/Vasculature tissue was collected from 16 mice and pooled into 8 samples. Two samples each had the following genotypes (APP/Clu): WT/WT, WT/KO, APP/WT, APP/KO. We expect to see transcriptionally different vasculature cells in the APP_plus/Clu_KO mice as this genotype was the only one associated with Amyloid plaque localization to vasculature. APP_WT mice do not have any amyloid plaques. APP_plus mice with WT Clu should have plaques but located in the parenmycha not the vasculature.
  • Notes from m199515_Tanner_Jensen/Petrucelli/Ola_sc_mural/Ola_sc_Mural.Rmd
    • Sample name is the last numeric character of the cell barcode name stored in colnames.
    • Genotypes are determined from sample names based on mapping that was sent in an Email from Ola:
    • WT/KO1 - Lane 1
    • WT/WT1 - Lane 2
    • APP/KO1 - Lane 3
    • APP/WT1 - Lane 4
    • WT/WT2 - Lane 5
    • WT/KO2 - Lane 6
    • APP/KO2 - Lane 7 ( For project Davis-US-Mayo-2-premade-libseq-2-lane-WOBI-NVUS2018092827)
    • APP/WT2 - Lane 8 ( For project Davis-US-Mayo-2-premade-libseq-2-lane-WOBI-NVUS2018092827)
  • Notes from m199515_Tanner_Jensen/Petrucelli/Ola_sc_mural/APP_vasc.Rmd
    • WT/WT1 - Lane 2
    • APP/WT1 - Lane 4
    • WT/WT2 - Lane 5
    • APP/WT2 - Lane 8 ( For project Davis-US-Mayo-2-premade-libseq-2-lane-WOBI-NVUS2018092827)
    • Lane 2 and Lane 4 = Batch1
    • Lane 5 and Lane 8 = Batch2

Kennedi and Kimberly’s notes

Old absolute paths to R1 fastq file.

  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read1/128.120.88.242/C202SC18071782/USPD16090210/Lane2_S1_L001_R1_001.fastq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read1/128.120.88.242/C202SC18071782/USPD16090211/Lane1_S1_L001_R1_001.fastq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read1/128.120.88.242/C202SC18071782/USPD16090212/Lane4_S1_L001_R1_001.fastq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read1/128.120.88.242/C202SC18071782/USPD16090213/Lane3_S1_L001_R1_001.fastq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read1/128.120.88.242/C202SC18071782/USPD16090214/USPD16090214_L5_1.fq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read1/128.120.88.242/C202SC18071782/USPD16090215/USPD16090215_L6_1.fq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read2/128.120.88.242/P202SC18100486-01-01_data_release/USPD16090216_L8_1.fq.gz
  • /research/labs/neurology/fryer/project_archive/m163373_Jonathan_Sens/data/Ola_sc_mural/Lane1Read2/128.120.88.242/P202SC18100486-01-01_data_release/USPD16090217_L7_1.fq.gz

New path to R1 fastq files.

  • All fastq files have been renamed and moved to the following directory:
    • /research/labs/neurology/fryer/projects/Ola_sc_mural/mouse/scRNA
  • File names are now more intuitive and named from APP_pathology_Clu_genotype_batch_sample_lane_read_001.fastq.gz
    • APP_WT_Clu_KO_B1_S2_L001_R1_001.fastq.gz
    • APP_WT_Clu_KO_B2_S6_L006_R1_001.fastq.gz
    • APP_WT_Clu_WT_B1_S1_L002_R1_001.fastq.gz
    • APP_WT_Clu_WT_B2_S5_L005_R1_001.fastq.gz
    • APP_pos_Clu_KO_B1_S4_L003_R1_001.fastq.gz
    • APP_pos_Clu_KO_B2_S8_L007_R1_001.fastq.gz
    • APP_pos_Clu_WT_B1_S3_L004_R1_001.fastq.gz
    • APP_pos_Clu_WT_B2_S7_L008_R1_001.fastq.gz

Setup

Set working directory

knitr::opts_knit$set(root.dir = "/research/labs/neurology/fryer/m214960/Ola_mural/scRNA/scripts/R")

Load libraries

# load libraries
library(cowplot)    # plot_grid()
library(dplyr)      # left_join()
library(ggplot2)    # ggplot()
library(gridExtra)  # grid.arrange()
library(harmony)    # RunHarmony()
library(parallel)   # detectCores()
library(Seurat)     # Read10X_h5()
library(stringr)    # str_match()

Set variables and thresholds

# variables
sample_order <- c("APP.WT.Clu.WT.B1", "APP.WT.Clu.WT.B2", 
                  "APP.pos.Clu.WT.B1", "APP.pos.Clu.WT.B2",
                  "APP.WT.Clu.KO.B1", "APP.WT.Clu.KO.B2", 
                  "APP.pos.Clu.KO.B1", "APP.pos.Clu.KO.B2")
sample_colors <- c("gray","red","orange","yellow","green","blue","purple","pink")
group_order <- c("APP.WT.Clu.WT","APP.pos.Clu.WT",
                  "APP.WT.Clu.KO", "APP.pos.Clu.KO")
group_colors <- c("firebrick1","gold","chartreuse3","dodgerblue")

# thresholds
nCount.min <- 400
nCount.max <- 7500
nFeature.min <- 200
complexity.cutoff <- 0.85
mt.cutoff <- 5
ribo.cutoff <- 20
hb.cutoff <- 1
ttr.cutoff <- 1

# set seed
set.seed(8)

# work in parallel
options(mc.cores = detectCores() - 1)

Save functions

These functions with help simultaneously save plots as a png and pdf.

saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}
saveToPNG <- function(...) {
    d = dev.copy(png,...)
    dev.off(d)
}

Load data

Merge h5

  • We are using CellBender filtered output with false positive rate of 0.05.
prefix <- "../../cellbender/"
suffix <- "_fpr_0.05_filtered.h5"

if (file.exists("../../rObjects/mouse_merged_h5.rds")) {
  mouse <- readRDS("../../rObjects/mouse_merged_h5.rds")
} else {
  # individual sample objects
  APP.WT.Clu.WT.B1 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_WT_Clu_WT_B1",suffix)))
  APP.WT.Clu.WT.B2 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_WT_Clu_WT_B2",suffix)))
  APP.pos.Clu.WT.B1 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_pos_Clu_WT_B1",suffix)))
  APP.pos.Clu.WT.B2 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_pos_Clu_WT_B2",suffix)))
  APP.WT.Clu.KO.B1 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_WT_Clu_KO_B1",suffix)))
  APP.WT.Clu.KO.B2 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_WT_Clu_KO_B2",suffix)))
  APP.pos.Clu.KO.B1 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_pos_Clu_KO_B1",suffix)))
  APP.pos.Clu.KO.B2 <- CreateSeuratObject(Read10X_h5(paste0(prefix,"APP_pos_Clu_KO_B2",suffix)))

  # merge objects
  mouse <- merge(x = APP.WT.Clu.WT.B1, 
                y = c(APP.WT.Clu.WT.B2, 
                      APP.pos.Clu.WT.B1, APP.pos.Clu.WT.B2,
                      APP.WT.Clu.KO.B1, APP.WT.Clu.KO.B2, 
                      APP.pos.Clu.KO.B1, APP.pos.Clu.KO.B2), 
                add.cell.ids = c("APP.WT.Clu.WT.B1", "APP.WT.Clu.WT.B2", 
                                 "APP.pos.Clu.WT.B1", "APP.pos.Clu.WT.B2",
                                 "APP.WT.Clu.KO.B1", "APP.WT.Clu.KO.B2", 
                                 "APP.pos.Clu.KO.B1", "APP.pos.Clu.KO.B2"), 
                project = "APP/Clu Mural scRNAseq")
  
  # cleanup and save
  remove(APP.WT.Clu.WT.B1,APP.WT.Clu.WT.B2,
         APP.pos.Clu.WT.B1, APP.pos.Clu.WT.B2,
         PP.WT.Clu.KO.B1, APP.WT.Clu.KO.B2, 
         APP.pos.Clu.KO.B1, APP.pos.Clu.KO.B2)
  saveRDS(mouse, "../../rObjects/mouse_merged_h5.rds")
}

# preview
mouse
## An object of class Seurat 
## 33885 features across 156280 samples within 1 assay 
## Active assay: RNA (33885 features, 0 variable features)

Annotation

  • Gm* genes are originally annotated by MGI and *Rik genes are annotated by RIKEN
  • The original gtf file was filtered to include specific biotypes only.
# read in annotation file, GRCm39 v107
if (file.exists("../../rObjects/annotation.rds")) {
  genes <- readRDS("../../rObjects/annotation.rds")
} else {
  gtf.file <- "../../refs/genes.gtf"
  genes <- rtracklayer::import(gtf.file)
  genes <- as.data.frame(genes)
  genes <- genes[genes$type == "gene",]
  saveRDS(genes, "../../rObjects/annotation.rds")
}

# check
all.equal(rownames(mouse),genes$gene_name) # 1378 mismatches
## [1] "'is.NA' value mismatch: 1378 in current 0 in target"
# make gene names with NA field the gene_id (cellranger does this)
na.indices <- which(is.na(genes$gene_name))
replacement.geneids <- genes$gene_id[na.indices]
genes$gene_name[na.indices] <- replacement.geneids
all.equal(rownames(mouse),genes$gene_name) # 38 mismatches
## [1] "38 string mismatches"
# cellranger does not handle duplicate gene names well
# it appends the gene name with .1 or .2 with each additional occurence
dup.genes <- genes$gene_name[which(duplicated(genes$gene_name))]
dup.indices <- which(duplicated(genes$gene_name))
new.names <- paste0(dup.genes,".1")
# manually fix a triplicated genes
new.names[6:7] <- c("Ndor1.1","Ndor1.2") 
new.names[10:11] <- c("Pakap.1","Pakap.2") 
all.names <- genes$gene_name
all.names[dup.indices] <- new.names
genes$gene_name <- all.names
all.equal(rownames(mouse),genes$gene_name) # no more mismatches
## [1] TRUE

Metadata columns

nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique transcripts (features)

# create sample column
barcodes <- colnames(mouse)
sample <- str_match(barcodes, "([WTAPPCluB12KOpos.]+).[ACGT]+-[0-9]")[,2]
table(sample)
## sample
## APP.pos.Clu.KO.B1 APP.pos.Clu.KO.B2 APP.pos.Clu.WT.B1 APP.pos.Clu.WT.B2 
##             19999             19999             14825             15927 
##  APP.WT.Clu.KO.B1  APP.WT.Clu.KO.B2  APP.WT.Clu.WT.B1  APP.WT.Clu.WT.B2 
##             25000             17811             24910             17809
mouse$sample <- factor(sample, levels = sample_order)
Idents(mouse) <- mouse$sample

# group column
group <- str_match(barcodes, "([WTAPPCluB12KOpos.]+).[B12]+_[ACGT]+-[0-9]")[,2]
group <- gsub(".$","",group)
table(group)
## group
## APP.pos.Clu.KO APP.pos.Clu.WT  APP.WT.Clu.KO  APP.WT.Clu.WT 
##          39998          30752          42811          42719
mouse$group <- factor(group, levels = group_order)

# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)

# percent.mt
mt.genes <- genes[genes$seqnames == "MT", 12]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
##   [1] "Mrpl44"     "Mrps14"     "Mrpl30"     "Mrpl15"     "Mrps9"     
##   [6] "Mrps2"      "Mrps5"      "Mrpl41"     "Mrps26"     "Mrpl47"    
##  [11] "Mrps21"     "Mrpl24"     "Mrps28"     "Mrpl9"      "Mrpl37"    
##  [16] "Mrpl20"     "Mrps15"     "Mrpl50"     "Mrpl1"      "Mrpl33"    
##  [21] "Mrps17"     "Mrps18c"    "Mrpl35"     "Mrps25"     "Mrpl19"    
##  [26] "Mrpl51"     "Mrps35"     "Mrps33"     "Mrpl53"     "Mrpl17"    
##  [31] "Mrpl46"     "Mrps11"     "Mrpl48"     "Mrpl23"     "Mrps12"    
##  [36] "Mrpl42"     "Mrpl54"     "Mrps31"     "Mrpl34"     "Mrpl57"    
##  [41] "Mrps16"     "Mrpl52"     "Mrpl4"      "Mrps22"     "Mrpl3"     
##  [46] "Mrps7"      "Mrpl38"     "Mrpl12"     "Mrpl22"     "Mrps23"    
##  [51] "Mrps24"     "Mrpl58"     "Mrpl27"     "Mrpl10"     "Mrpl45"    
##  [56] "Mrpl55"     "Mrps27"     "Mrpl36"     "Mrps36"     "Mrps30"    
##  [61] "Mrpl32"     "Mrpl13"     "Mrps6"      "Mrpl40"     "Mrpl39"    
##  [66] "Mrps34"     "Mrps18b"    "Mrpl18"     "Mrpl14"     "Mrps18a"   
##  [71] "Mrpl2"      "Mrps10"     "Mrpl28"     "Mrpl11"     "Mrpl21"    
##  [76] "Mrpl43"     "Mrpl16"     "Mrpl49"     "Rpl7"       "Rps6kc1"   
##  [81] "Rpl31"      "Rpl37a"     "Rps21"      "Rpl12"      "Rpl7a"     
##  [86] "Rpl35"      "Rpl39"      "Rpl36a"     "Rps6ka3"    "Rpl10"     
##  [91] "Rps4x"      "Rps6ka6"    "Rps3a1"     "Rpl34"      "Rpl22l1"   
##  [96] "Rps27"      "Rpl22"      "Rps6ka1"    "Rps6"       "Rpl11"     
## [101] "Rps8"       "Rps20"      "Rpl5"       "Rpl21"      "Rplp0"     
## [106] "Rpl9"       "Rpl6"       "Rpl32"      "Rps16"      "Rpl27a"    
## [111] "Rps9"       "Rps11"      "Rpl13a"     "Rpl28"      "Rps19"     
## [116] "Rps17"      "Rps3"       "Rps13"      "Rpl18"      "Rps15a"    
## [121] "Rps5"       "Rplp2"      "Rpl41"      "Rps26"      "Rps12"     
## [126] "Rps15"      "Rpl18a"     "Rps23rg1"   "Rpl13"      "Rpl36a-ps1"
## [131] "Rpl15"      "Rps24"      "Rpl29"      "Rpl10-ps3"  "Rps25"     
## [136] "Rplp1"      "Rpl4"       "Rps27l"     "Rps27rt"    "Rpsa"      
## [141] "Rpl14"      "Rpl23"      "Rpl19"      "Rps27a"     "Rpl27"     
## [146] "Rpl23a"     "Rpl26"      "Rps6kb1"    "Rpl38"      "Rps18-ps4" 
## [151] "Rps18-ps6"  "Rps18-ps5"  "Rps23"      "Rps6kl1"    "Rps7"      
## [156] "Rpl10l"     "Rps29"      "Rpl36al"    "Rps6ka5"    "Rplp2-ps1" 
## [161] "Rpl37"      "Rpl30"      "Rpl8"       "Rpl3"       "Rps19bp1"  
## [166] "Rpl24"      "Rpl39l"     "Rpl35a"     "Rpl36-ps4"  "Rpl36"     
## [171] "Rps2"       "Rpl3l"      "Rps6ka2"    "Rps10"      "Rpl10a"    
## [176] "Rpl7l1"     "Rps28"      "Rps18"      "Rps14"      "Rpl17"     
## [181] "Rpl9-ps6"   "Rps6kb2"    "Rps6ka4"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hbb-bt"  "Hbb-bs"  "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"
# Ttr nCount column
counts.df <- FetchData(mouse, vars = "Ttr")
mouse$Ttr_nCount <- counts.df$Ttr

# Percent Ttr
mouse$percent.choroid <- PercentageFeatureSet(mouse, features = "Ttr")

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")

ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = ribo.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Violin plots

# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
v1

#  percent violins
v2 <- VlnPlot(mouse,
              features = c("percent.mt","percent.ribo.protein","percent.hb","percent.choroid"),
              ncol = 4,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
v2

Scatter plots

s1 <- ggplot(
  mouse@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_hline(yintercept = nFeature.min) + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula 'y ~ x'

s2 <- FeatureScatter(mouse,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
s2

Filtering

Cell-level filtering

# filter
mouse.filtered <- subset(mouse,
                        subset = (nCount_RNA > nCount.min) &
                          (nCount_RNA < nCount.max) &
                          (nFeature_RNA > nFeature.min) &
                          (cell.complexity > complexity.cutoff) &
                          (percent.mt < mt.cutoff) &
                          (percent.ribo.protein < ribo.cutoff) &
                          (percent.hb < hb.cutoff) &
                          (percent.choroid < ttr.cutoff))

# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "55188 cells removed"

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
nonzero <- counts > 0  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "16143 features removed"

Remove specific genes.

# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]

# remove ribo.genes
#keep <- !rownames(counts.filtered) %in% ribo.combined
#counts.filtered <- counts.filtered[keep,]

# remove Ig genes + Jchain but keep Igha + Ighd to enahnce clustering
#gene.types <- c("IG_C_gene","IG_C_pseudogene","IG_D","IG_J_gene","IG_LV_gene",
#                "IG_V_gene","IG_V_pseudogene")
#keep <- (genes$gene_type) %in% gene.types
#ig.genes <- genes[keep,]
#ig.genes <- c(ig.genes$gene_name, "Jchain")
#ig.genes <- ig.genes[-c(185,192)] # keep Igha and Ighd
#ig.genes
#keep <- !rownames(counts.filtered) %in% ig.genes
#counts.filtered <- counts.filtered[keep,]

# remove Ttr
keep <- !rownames(counts.filtered) %in% c("Ttr")
counts.filtered <- counts.filtered[keep,]

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "14 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero,data)

Post-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")

ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))

# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- cbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min) +
  geom_vline(xintercept = nCount.max) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff) +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = ribo.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff) +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Violin plots

# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
v3

#  percent violins
v4 <- VlnPlot(mouse.filtered,
              features = c("percent.mt","percent.ribo.protein","percent.hb","percent.choroid"),
              ncol = 4,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
v4

Scatter plots

s3 <- ggplot(
  mouse.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_hline(yintercept = nFeature.min) + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula 'y ~ x'

s4 <- FeatureScatter(mouse.filtered,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
s4

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Sample")
b1

Top transcripts

df <- data.frame(row.names = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum, decreasing = TRUE),]
head(df, 30)
##            rsum gene_name
## Malat1  9735251    Malat1
## Bsg     3701960       Bsg
## Tmsb4x  1384605    Tmsb4x
## Klf2    1227725      Klf2
## Actb    1174071      Actb
## Cldn5   1118762     Cldn5
## Ly6c1   1050026     Ly6c1
## Itm2a    876213     Itm2a
## Ubb      793028       Ubb
## Rps27    723876     Rps27
## Rps29    701791     Rps29
## Jund     687893      Jund
## Rpl41    685783     Rpl41
## Junb     681692      Junb
## Fos      634931       Fos
## Rps14    629531     Rps14
## Rpl13    589204     Rpl13
## Rps19    577301     Rps19
## Rpl18a   555796    Rpl18a
## Pltp     549357      Pltp
## Flt1     537776      Flt1
## Calm1    530642     Calm1
## Ptma     528637      Ptma
## Rpl37a   527804    Rpl37a
## Tsc22d1  512065   Tsc22d1
## Rpl38    508340     Rpl38
## Ftl1     483200      Ftl1
## Eif1     473950      Eif1
## Itm2b    470502     Itm2b
## Hspa1a   456567    Hspa1a

Unwanted variation

Cell cycle

Score

  • cell cycle scoring guide
  • The most common biological data correction is to remove the effects of the cell cycle on the transcriptome.
  • Raw counts are not comparable between cells. Each cell has a different nCount_RNA. The log normalization, ((nCount_RNA / nFeature_RNA) * log1p, is taken in order to explore variation.
# log normalization
mouse.phase <- NormalizeData(mouse.filtered)
  • Each cell is given score based on expression of G1, G2/M, and S phase markers.
  • G1: ~10 hrs, S: ~5-6 hrs, G2: ~3-4 hrs, M: ~2 hrs
  • G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
  • If the score is negative for both S.Score and G2M.Score the phase is G1. Otherwise, the the greatest positive value between S.Score and G2M.Score determines the phase.
# load mouse cell cycle markers
phase.markers <- read.delim("../../refs/mouse_cell_cycle.txt")
colnames(phase.markers)[2] <- "gene_id"
phase.markers <- left_join(phase.markers, genes[,c(10,12)], by = "gene_id")
g2m <- phase.markers[phase.markers$phase == "G2/M", 4]
g2m
##  [1] "Ube2c"   "Lbr"     "Ctcf"    "Cdc20"   "Cbx5"    "Kif11"   "Anp32e" 
##  [8] "Birc5"   "Cdk1"    "Tmpo"    "Hmmr"    "Jpt1"    "Pimreg"  "Aurkb"  
## [15] "Top2a"   "Gtse1"   "Rangap1" "Cdca3"   "Ndc80"   "Kif20b"  "Cenpf"  
## [22] "Nek2"    "Nuf2"    "Nusap1"  "Bub1"    "Tpx2"    "Aurka"   "Ect2"   
## [29] "Cks1b"   "Kif2c"   "Cdca8"   "Cenpa"   "Mki67"   "Ccnb2"   "Kif23"  
## [36] "Smc4"    "G2e3"    "Tubb4b"  "Anln"    "Tacc3"   "Dlgap5"  "Ckap2"  
## [43] "Ncapd2"  "Ttk"     "Ckap5"   "Cdc25c"  "Hjurp"   "Cenpe"   "Ckap2l" 
## [50] "Cdca2"   "Hmgb2"   "Cks2"    "Psrc1"   "Gas2l3"
s <- phase.markers[phase.markers$phase == "S", 4]
s
##  [1] "Cdc45"    "Uhrf1"    "Mcm2"     "Slbp"     "Mcm5"     "Pola1"   
##  [7] "Gmnn"     "Cdc6"     "Rrm2"     "Atad2"    "Dscc1"    "Mcm4"    
## [13] "Chaf1b"   "Rfc2"     "Msh2"     "Fen1"     "Hells"    "Prim1"   
## [19] "Tyms"     "Mcm6"     "Wdr76"    "Rad51"    "Pcna"     "Ccne2"   
## [25] "Casp8ap2" "Usp1"     "Nasp"     "Rpa2"     "Ung"      "Rad51ap1"
## [31] "Blm"      "Pold3"    "Rrm1"     "Cenpu"    "Gins2"    "Tipin"   
## [37] "Brip1"    "Dtl"      "Exo1"     "Ubr7"     "Clspn"    "E2f8"    
## [43] "Cdca7"
# write table
write.table(phase.markers, 
            "../../results/strict_threshold/cellcycle/mouse_phase_markers.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# score cells
mouse.phase <- CellCycleScoring(mouse.phase,
                                g2m.features = g2m,
                                s.features = s,
                                set.ident = TRUE)

# save
mouse.filtered[["phase"]] <- mouse.phase$Phase

Top variable genes

Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.

# Identify the most variable genes
mouse.phase <- FindVariableFeatures(mouse.phase, verbose = FALSE)

# Preview top 40
head(VariableFeatures(mouse.phase), 40)
##  [1] "Ccl5"    "Acta2"   "Cd74"    "Pcp4"    "Ptgds"   "Vtn"     "Tagln"  
##  [8] "Rasl11a" "Myl9"    "Meg3"    "Mt3"     "Dcn"     "Cebpb"   "Aldoc"  
## [15] "Npy"     "Ccl4"    "Tpm2"    "Spp1"    "Gkn3"    "Tmem212" "Gm13889"
## [22] "Igkc"    "Mgp"     "Lyz2"    "Ecrg4"   "Lrg1"    "Vcam1"   "Gzma"   
## [29] "Rgs5"    "Fabp7"   "Ccdc153" "Pclaf"   "Slc1a2"  "Gpx3"    "Rrad"   
## [36] "Prg4"    "Ppbp"    "Olig1"   "Cspg5"   "Snap25"
# Scale the counts
mouse.phase <- ScaleData(mouse.phase)
## Centering and scaling data matrix

PCA

If the PCA plots for each phase do not look similar you may want to regress out variation due to cell cycle phase. Otherwise, nothing needs to be done.

# Run PCA
mouse.phase <- RunPCA(mouse.phase, nfeatures.print = 10)
## PC_ 1 
## Positive:  Gm13889, Cd63, Myl9, Atp1a2, Ckb, Plac9, Atp1b1, Acta2, Pcsk1n, Meg3 
## Negative:  Cldn5, Ly6c1, Klf2, Pltp, Flt1, Pglyrp1, Selenop, Cxcl12, Ly6a, Spock2 
## PC_ 2 
## Positive:  Myl9, Acta2, Plac9, Tpm2, Tagln, Rasl11a, Rgs4, Myh11, Fxyd1, Mylk 
## Negative:  Pcsk1n, Cspg5, Gpm6a, Gria2, Rtn1, Olig1, Ptprz1, Gpm6b, Cntn1, Bex2 
## PC_ 3 
## Positive:  Car4, Tmsb10, Myl9, Rgs4, Aspn, Tpm2, Ndufa4l2, Plac9, Fxyd1, Tagln 
## Negative:  Gkn3, Timp3, Stmn2, Tm4sf1, Glul, Apold1, Slc6a6, Egr1, Klf4, Cdk19 
## PC_ 4 
## Positive:  Snap25, Syt1, Gad1, Ndrg4, Meis2, Pcp4, Atp1b1, Snhg11, Grin2b, Camk2b 
## Negative:  Olig1, Ptprz1, Bcan, Pdgfra, Cspg5, C1ql1, Matn4, Gpr37l1, S100b, Olig2 
## PC_ 5 
## Positive:  Gkn3, Stmn2, Ccnd2, Mgp, Vegfc, Cd9, Bmx, Glul, Egfl8, S100a6 
## Negative:  Car4, Tmsb10, Ctla2a, Vtn, Slc16a1, Slc7a5, Slc22a8, Kcnj8, Atp13a5, Slc38a5
# Plot
pca1 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "Phase",
               split.by = "Phase")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
pca1

pca2 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "Phase",
               shuffle = TRUE)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
pca2

Bar graph

data <- as.data.frame(table(mouse.phase$Phase))
colnames(data) <- c("Phase","frequency")

ncells3 <- ggplot(data, aes(x = Phase, y = frequency, fill = Phase)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  ggtitle("Cells per phase")
ncells3

Percent cells per phase

percent.phase <- mouse.phase@meta.data %>%
  group_by(sample, Phase) %>%
  dplyr::count() %>%
  group_by(sample) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sample, y = percent, fill = Phase)) +
  geom_col() +
  ggtitle("Percentage of phase per sample") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
percent.phase

Mitochondrial fraction

PCA

Evaluating effects of mitochondrial expression

# Check quartile values and store
summary(mouse.phase$percent.mt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.7417  1.4384  1.6871  2.4007  4.9971
first <- as.numeric(summary(mouse.phase$percent.mt)[2])
mean <- as.numeric(summary(mouse.phase$percent.mt)[4])
third <- as.numeric(summary(mouse.phase$percent.mt)[5])

# Turn percent.mt into factor based on quartile value
mouse.phase@meta.data$mito.factor <- cut(mouse.phase@meta.data$percent.mt, 
                   breaks=c(-Inf, first, mean, third, Inf), 
                   labels=c("Low","Medium","Medium high", "High"))
mouse.filtered[["mito.factor"]] <- mouse.phase$mito.factor

# Plot
pca1 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "mito.factor",
               split.by = "mito.factor")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
pca1

pca2 <- DimPlot(mouse.phase,
               reduction = "pca",
               group.by = "mito.factor",
               shuffle = TRUE)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
pca2

Percent cells per quartile

percent <- mouse.phase@meta.data %>%
  group_by(sample, mito.factor) %>%
  dplyr::count() %>%
  group_by(sample) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = sample, y = percent, fill = mito.factor)) +
  geom_col() +
  ggtitle("Mitochondrial fraction per sample") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
percent

Integrate and cluster

Normalize

  • Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model.

  • Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase or percent.mito and so we chose not to regress this out of our data.

  • We will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.

  • Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb).

# split
mouse.split <- SplitObject(mouse.filtered, split.by = "sample")

# transform
options(future.globals.maxSize = 4000 * 1024^5)
for (i in 1:length(mouse.split)) {
  print(paste0("Sample ", i))
  mouse.split[[i]] <- SCTransform(mouse.split[[i]], 
                                  vars.to.regress = c("percent.mt","percent.choroid"),
                                  verbose = FALSE)
}
mouse.split

# cleanup
remove(mouse.filtered,mouse.phase)
  • NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument. Additionally the last line of output specifies “Set default assay to SCT”.

  • It is suggested to not regress out batch, and instead use a data integration method: github link

Integrate

  • Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.

  • To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.

  • Now, using our SCTransform object as input, let’s perform the integration across conditions.

  • First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.

# Choose the features to use when integrating multiple datasets 
# will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = mouse.split, 
                                          nfeatures = 3000)

# merge the mouse
mouse.merged <- merge(x = mouse.split[[1]],
                      y = c(mouse.split[[2]], mouse.split[[3]], 
                            mouse.split[[4]], mouse.split[[5]], 
                            mouse.split[[6]], mouse.split[[7]],
                            mouse.split[[8]]))

# define the variable features 
VariableFeatures(mouse.merged) <- var.features

# run PCA on the merged object
mouse.merged <- RunPCA(object = mouse.merged, assay = "SCT")

# harmony dimensional reduction
mouse.integrated <- RunHarmony(object = mouse.merged, 
                               group.by.vars = "sample", 
                               assay.use = "SCT",
                               reduction = "pca",
                               plot_convergence = TRUE)

# save and cleanup
saveRDS(mouse.integrated, "../../rObjects/mouse_integrated_strict.rds")
remove(mouse.split, var.features, mouse.merged)
# Reset idents and levels
DefaultAssay(mouse.integrated) <- "SCT"
Idents(mouse.integrated) <- "sample"
mouse.integrated$sample <- factor(mouse.integrated$sample, 
                                  levels = sample_order)
mouse.integrated$group <- factor(mouse.integrated$group, 
                                 levels = group_order)

# check PCA
p1 <- DimPlot(object = mouse.integrated, 
              reduction = "harmony",
              group.by = "sample",
              cols = sample_colors,
              shuffle = TRUE) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
p1

p2 <- VlnPlot(object = mouse.integrated, 
              features = "harmony_1", 
              group.by = "sample", 
              pt.size = 0, 
              cols = sample_colors) + NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
p2

Top 20 variable features

top20 <- mouse.integrated@assays$SCT@var.features[1:20]
top20
##  [1] "Acta2"   "Vtn"     "Myl9"    "Tagln"   "Pcp4"    "Rasl11a" "Cebpb"  
##  [8] "Actb"    "Gm13889" "Gkn3"    "Cd74"    "Rgs5"    "Tpm2"    "Meg3"   
## [15] "Rrad"    "Mt3"     "Cspg5"   "Mgp"     "Snap25"  "Crip1"
  • After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.

  • To generate these visualizations with the harmony output, use reduction = “harmony”

# Plot PCA
pca1 <- DimPlot(mouse.integrated,
        reduction = "harmony",
        split.by = "sample",
        ncol = 2,
        group.by = "sample",
        cols = sample_colors)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
pca1

pca2 <- DimPlot(mouse.integrated,
        reduction = "harmony",
        split.by = "group",
        group.by = "group",
        ncol = 2,
        cols = group_colors)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
pca2

PCA

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# Printing out the most variable genes driving PCs
print(x = mouse.integrated[["pca"]], 
      dims = 1:10,
      nfeatures = 10)
## PC_ 1 
## Positive:  Itm2a, Cldn5, Klf2, Ly6c1, Tmsb4x, Tmsb10, Pltp, Pglyrp1, Flt1, Rpl38 
## Negative:  Myl9, Acta2, Tagln, Gm13889, Plac9, Cebpb, Vtn, Tpm2, Rgs5, Crip1 
## PC_ 2 
## Positive:  Myl9, Acta2, Tagln, Crip1, Cebpb, Tpm2, Gm13889, Plac9, Rgs5, Vtn 
## Negative:  Cspg5, Pcsk1n, Olig1, Ptprz1, Gpm6b, Bcan, Meg3, Gpm6a, Rtn1, Gria2 
## PC_ 3 
## Positive:  Cspg5, Olig1, Ptprz1, Bcan, Pdgfra, Serpine2, C1ql1, Matn4, Gpr37l1, Apoe 
## Negative:  Snap25, Atp1b1, Pcp4, Gad1, Syt1, Ndrg4, Meis2, Snhg11, Meg3, Calm2 
## PC_ 4 
## Positive:  Tagln, Acta2, Cebpb, Crip1, Tpm2, Rasl11a, Myh11, Pln, Mylk, Errfi1 
## Negative:  Vtn, Rgs5, Atp1a2, Higd1b, Ndufa4l2, P2ry14, Atp13a5, Kcnj8, Art3, Ifitm1 
## PC_ 5 
## Positive:  Gkn3, Mgp, Tm4sf1, Glul, Fos, Gadd45g, Stmn2, Ly6c1, Timp3, Egfl8 
## Negative:  Tmsb10, Car4, Ctla2a, Rpl38, Itm2a, Pglyrp1, Actb, Slc16a1, Tmsb4x, Gatm 
## PC_ 6 
## Positive:  Itm2a, Cldn5, Car4, Tmsb10, Tagln, Apoe, Acta2, Hspa1a, Rpl38, Cst3 
## Negative:  Actb, Abhd2, Pam, Slc39a1, Nfia, Lrrc8a, Alkbh5, Dusp3, Cbx3, Wtap 
## PC_ 7 
## Positive:  Klf2, Fos, Junb, Jund, Nfkbia, Dusp1, Gm26532, Zfp36, Atf3, Vcam1 
## Negative:  Gkn3, Apoe, Cst3, Slc1a2, Slc1a3, Aldoc, Ntsr2, Mt3, Mgp, Id4 
## PC_ 8 
## Positive:  Olig1, Hspa1a, Gkn3, C1ql1, Pdgfra, Ly6c1, Cntn1, Matn4, Tnr, Cxcl12 
## Negative:  Mt1, Slc1a2, Junb, Slc1a3, Klf2, Fos, Vcam1, Aldoc, Mt3, Ntsr2 
## PC_ 9 
## Positive:  Fos, Klf2, Cxcl12, Apold1, Ccn1, Plk2, Depp1, Rgcc, Glul, Btg2 
## Negative:  Vcam1, Tmsb10, Gkn3, Mgp, Icam1, Cyp2e1, Car4, Cfh, Slc38a5, Nfkbia 
## PC_ 10 
## Positive:  Sox11, Marcksl1, Pfn2, Hmgn2, Ccnd2, Stmn1, Igfbpl1, Tubb3, Nnat, Dlx2 
## Negative:  Gkn3, Vcam1, Meg3, Gria2, Snhg11, Bcan, Apoe, Cspg5, Mgp, Abhd2

Quantitative approach to an elbow plot
- The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation.
- The point where the percent change in variation between the consecutive PCs is less than 0.1%.

First metric

# Determine percent of variation associated with each PC
stdv <- mouse.integrated[["pca"]]@stdev
sum.stdv <- sum(mouse.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100

# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)

# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 42

Second metric

# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
  (percent.stdv[1:length(percent.stdv) - 1] - 
     percent.stdv[2:length(percent.stdv)]) > 0.1), 
  decreasing = T)[1] + 1

# last point where change of % of variation is more than 0.1%.
co2
## [1] 15

Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data

# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 15

Use min.pc we just calculated to generate the clusters
Plot elbow and overlay the information determined using our metrics

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

UMAP

# Run UMAP
mouse.integrated <- RunUMAP(mouse.integrated,
                           dims = 1:min.pc,
                           reduction = "harmony",
                           n.components = 3) # set to 3 to use with VR

# Plot UMAP
DimPlot(mouse.integrated,
        shuffle = TRUE,
        raster = FALSE)

Cluster

Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].

We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.

The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.

# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.integrated,
                                   assay = "SCT", # set as default after SCTransform
                                   reduction = "harmony",
                                   dims = 1:min.pc)

# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
                                  algorithm = 1, # 1 = Louvain
                                  resolution = seq(0.1,0.5,by = 0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 101092
## Number of edges: 2761570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9384
## Number of communities: 8
## Elapsed time: 63 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 101092
## Number of edges: 2761570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9103
## Number of communities: 13
## Elapsed time: 64 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 101092
## Number of edges: 2761570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8902
## Number of communities: 14
## Elapsed time: 57 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 101092
## Number of edges: 2761570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8736
## Number of communities: 17
## Elapsed time: 59 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 101092
## Number of edges: 2761570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8598
## Number of communities: 17
## Elapsed time: 62 seconds
mouse.unannotated$seurat_clusters <- factor(mouse.unannotated$SCT_snn_res.0.2,
                                            levels = c("0","1","2","3","4","5",
                                                       "6","7","8","9","10"))
Idents(mouse.unannotated) <- "seurat_clusters"
DefaultAssay(mouse.unannotated) <- "RNA"

Explore resolutions

# 0.1
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE,
        raster = FALSE)

# 0.2
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE,
        raster = FALSE)

# 0.3
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE,,
        raster = FALSE)

# 0.4
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE,
        raster = FALSE)

# 0.5
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.5",
        label = TRUE,
        raster = FALSE)

# Set params
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.2
DefaultAssay(mouse.unannotated) <- "RNA"

Clustering QC

Treatment, sample, phase

# set params
cluster_colors <- c("chocolate4","lightgray","firebrick1","gold","green","darkgreen","cyan",
                    "blue","pink","purple","magenta")
Idents(mouse.unannotated) <- "seurat_clusters"

# plot
u1 <- DimPlot(mouse.unannotated,
              cols = cluster_colors,
              label = TRUE,
              raster = FALSE)
u2 <- DimPlot(mouse.unannotated,
              dims = c(2,3),
              label = TRUE,
              cols = cluster_colors,
              raster = FALSE)
plots <- list(u1,u2)
layout <- rbind(c(1,2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# sample
u3 <- DimPlot(mouse.unannotated,
              label = TRUE, 
              split.by = "sample",
              group.by = "seurat_clusters",
              cols = cluster_colors,
              ncol = 2,
              raster = FALSE)
u3

# group
u4 <- DimPlot(mouse.unannotated,
              label = TRUE, 
              split.by = "group",
              group.by = "seurat_clusters",
              ncol = 2,
              cols = cluster_colors,
              raster = FALSE)
u4

Revisit QC metrics

# nCount
f1 <- FeaturePlot(mouse.unannotated, 
                  features = "nCount_RNA",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f1

# nFeature
f2 <- FeaturePlot(mouse.unannotated, 
                  features = "nFeature_RNA",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f2

# percent.mt
f3 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.mt",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f3

# cell.complexity
f4 <- FeaturePlot(mouse.unannotated, 
                  features = "cell.complexity",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f4

# percent.ribo
f5 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.ribo.protein",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f5

# percent.hb
f6 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.hb",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f6

# Ttr
f7 <- FeaturePlot(mouse.unannotated, 
                  features = "percent.choroid",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f8 <- FeaturePlot(mouse.unannotated, 
                  dims = c(2,3),
                  features = "percent.choroid",
                  pt.size = 0.4, 
                  order = TRUE,
                  raster = FALSE) + 
  scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
plots <- list(f7,f8)
layout <- rbind(c(1,2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Percent cells per cluster

# sample
b1 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, sample) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
  theme_classic() +
  geom_col() +
  scale_fill_manual(values = sample_colors) +
  ggtitle("Percentage of sample per cluster")
b1

# group
b2 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, group) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=group)) +
  theme_classic() +
  geom_col() +
  scale_fill_manual(values = group_colors) +
  ggtitle("Percentage of group per cluster")
b2

# mito.factor
b4 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, mito.factor) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=mito.factor)) +
  theme_classic() +
  geom_col() +
  ggtitle("Percentage of mito.factor per cluster")
b4

# phase
b5 <- mouse.unannotated@meta.data %>%
  group_by(seurat_clusters, phase) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=phase)) +
  theme_classic() +
  geom_col() +
  ggtitle("Percentage of phase per cluster")
b5

Number cells per cluster

# sample
sample_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "sample")) %>%
  dplyr::count(ident, sample) %>%
  tidyr::spread(ident, n)
sample_ncells
##              sample    0    1 10    2    3   4   5   6   7   8  9
## 1  APP.WT.Clu.WT.B1 5879 3952 42 3913 1154 316 256 200 107 135 56
## 2  APP.WT.Clu.WT.B2 4175 2825 25 2572  771 232 256 172  81  NA 11
## 3 APP.pos.Clu.WT.B1 3912 2460 14 2587  425 244 156 143 329   3 29
## 4 APP.pos.Clu.WT.B2 4370 2850 13 2677  367 252 110 115  24   1  9
## 5  APP.WT.Clu.KO.B1 5604 3823 19 4221 1019 332 235 288  90 187 45
## 6  APP.WT.Clu.KO.B2 3063 2321 12 1833  400 205 182  80   8   1  4
## 7 APP.pos.Clu.KO.B1 5312 3276 43 3215  444 298 299 287 454   7 30
## 8 APP.pos.Clu.KO.B2 5997 3988 29 4013  486 388  98 189  31   2 14
write.table(sample_ncells, 
            "../../results/strict_threshold/ncells/cells_per_cluster_per_sample_uannotated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)

# group
group_ncells <- FetchData(mouse.unannotated, 
                     vars = c("ident", "group")) %>%
  dplyr::count(ident, group) %>%
  tidyr::spread(ident, n)
group_ncells
##            group     0    1 10    2    3   4   5   6   7   8  9
## 1  APP.WT.Clu.WT 10054 6777 67 6485 1925 548 512 372 188 135 67
## 2 APP.pos.Clu.WT  8282 5310 27 5264  792 496 266 258 353   4 38
## 3  APP.WT.Clu.KO  8667 6144 31 6054 1419 537 417 368  98 188 49
## 4 APP.pos.Clu.KO 11309 7264 72 7228  930 686 397 476 485   9 44
write.table(group_ncells, 
            "../../results/strict_threshold/ncells/cells_per_cluster_per_group_uannotated.tsv",
            quote = FALSE, sep = "\t", row.names = FALSE)