knitr::opts_knit$set(root.dir = "/research/labs/neurology/fryer/m214960/aducanumab/scripts/R")
# load libraries
library(cowplot) # plot_grid()
library(dplyr) # left_join()
library(ggplot2) # ggplot()
library(gridExtra) # grid.arrange()
library(harmony) # RunHarmony()
library(parallel) # detectCores()
library(plotly) # plot_ly()
library(Seurat) # Read10X_h5()
library(ShinyCell) # makeShinyApp()
library(stringr) # str_match()
# variables
sample_order <- c("IgG.F.939A","IgG.F.939B","IgG.F.959B",
"IgG.M.823A","IgG.M.851A",
"Adu.F.736B","Adu.F.738A","Adu.F.738B",
"Adu.M.705A","Adu.M.734A")
group_order <- c("IgG","Adu")
sex_order <- c("Male","Female")
sample_colors <- c("gray","red","orange","yellow","green","forestgreen","cyan","blue","purple","pink")
group_colors <- c("gray","cornflowerblue")
sex_colors <- c("darkgray","purple")
# thresholds
nCount.min <- 200
nCount.max <- 10000
nFeature.min <- 100
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 20
hb.cutoff <- 3
ttr.cutoff <- 1
# set seed
set.seed(8)
# work in parallel
options(mc.cores = detectCores() - 1)
These functions with help simultaneously save plots as a png and pdf.
saveToPDF <- function(...) {
d = dev.copy(pdf,...)
dev.off(d)
}
saveToPNG <- function(...) {
d = dev.copy(png,...)
dev.off(d)
}
prefix <- "../../cellbender/"
suffix <- "_fpr_0.05_filtered.h5"
if (file.exists("../../rObjects/mouse_merged_h5.rds")) {
mouse <- readRDS("../../rObjects/mouse_merged_h5.rds")
} else {
# individual sample objects
IgG.F.939A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"939A_IgG_Female",suffix)))
IgG.F.939B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"939B_IgG_Female",suffix)))
IgG.F.959B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"959B_IgG_Female",suffix)))
IgG.M.823A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"823A_IgG_Male",suffix)))
IgG.M.851A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"851A_IgG_Male",suffix)))
Adu.F.736B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"736B_Adu_Female",suffix)))
Adu.F.738A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"738A_Adu_Female",suffix)))
Adu.F.738B <- CreateSeuratObject(Read10X_h5(paste0(prefix,"738B_Adu_Female",suffix)))
Adu.M.705A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"705A_Adu_Male",suffix)))
Adu.M.734A <- CreateSeuratObject(Read10X_h5(paste0(prefix,"734A_Adu_Male",suffix)))
# merge objects
mouse <- merge(x = IgG.F.939A,
y = c(IgG.F.939B, IgG.F.959B, IgG.M.823A, IgG.M.851A,
Adu.F.736B, Adu.F.738A, Adu.F.738B, Adu.M.705A, Adu.M.734A),
add.cell.ids = c("IgG.F.939A","IgG.F.939B","IgG.F.959B","IgG.M.823A",
"IgG.M.851A","Adu.F.736B","Adu.F.738A","Adu.F.738B",
"Adu.M.705A","Adu.M.734A"),
project = "Aducanumab Mice scRNAseq")
# cleanup and save
remove(IgG.F.939A, IgG.F.939B, IgG.F.959B, IgG.M.823A, IgG.M.851A,
Adu.F.736B, Adu.F.738A, Adu.F.738B, Adu.M.705A, Adu.M.734A)
saveRDS(mouse, "../../rObjects/mouse_merged_h5.rds")
}
# preview
mouse
## An object of class Seurat
## 32285 features across 70028 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
genes <- readRDS("../../rObjects/annotation.rds")
} else {
gtf.file <- "../../refs/mouse_genes.gtf"
genes <- rtracklayer::import(gtf.file)
genes <- as.data.frame(genes)
genes <- genes[genes$type == "gene",]
saveRDS(genes, "../../rObjects/annotation.rds")
}
nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique transcripts (features)
# create sample column
barcodes <- colnames(mouse)
sample <- str_match(barcodes, "(.+)_[ACGT]+-(\\d+)")[,2]
mouse$sample <- factor(sample, levels = sample_order)
Idents(mouse) <- mouse$sample
# sex column
sex <- str_match(mouse$sample, "[IgGAdu]+\\.([FM]).[0-9]+[AB]")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)
# group column
group <- str_match(mouse$sample, "([IgGAdu]+)\\.[FM].[0-9]+[AB]")[,2]
mouse$group <- factor(group, levels = group_order)
# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)
# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
## [1] "Mrpl15" "Mrpl30" "Mrps9" "Mrpl44" "Mrps14"
## [6] "Mrpl41" "Mrps2" "Mrps5" "Mrps26" "Mrps28"
## [11] "Mrpl47" "Mrpl24" "Mrpl9" "Mrps21" "Mrpl50"
## [16] "Mrpl37" "Mrps15" "Mrpl20" "Mrpl33" "Mrpl1"
## [21] "Mrps18c" "Mrps17" "Mrps33" "Mrpl35" "Mrpl19"
## [26] "Mrpl53" "Mrps25" "Mrpl51" "Mrps35" "Mrps12"
## [31] "Mrpl46" "Mrps11" "Mrpl48" "Mrpl17" "Mrpl23"
## [36] "Mrps31" "Mrpl34" "Mrpl4" "Mrps22" "Mrpl3"
## [41] "Mrpl54" "Mrpl42" "Mrps24" "Mrpl22" "Mrpl55"
## [46] "Mrps23" "Mrpl27" "Mrpl10" "Mrpl45" "Mrpl58"
## [51] "Mrps7" "Mrpl38" "Mrpl12" "Mrpl32" "Mrpl36"
## [56] "Mrps27" "Mrps36" "Mrps30" "Mrps16" "Mrpl52"
## [61] "Mrpl57" "Mrpl13" "Mrpl40" "Mrpl39" "Mrps6"
## [66] "Mrpl18" "Mrps34" "Mrpl28" "Mrps18b" "Mrpl14"
## [71] "Mrps18a" "Mrpl2" "Mrps10" "Mrpl21" "Mrpl11"
## [76] "Mrpl49" "Mrpl16" "Mrpl43" "Rpl7" "Rpl31"
## [81] "Rpl37a" "Rps6kc1" "Rpl7a" "Rpl12" "Rpl35"
## [86] "Rps21" "Rpl22l1" "Rps3a1" "Rps27" "Rpl34"
## [91] "Rps20" "Rps6" "Rps8" "Rps6ka1" "Rpl11"
## [96] "Rpl22" "Rpl9" "Rpl5" "Rplp0" "Rpl6"
## [101] "Rpl21" "Rpl32" "Rps9" "Rpl28" "Rps5"
## [106] "Rps19" "Rps16" "Rps11" "Rpl13a" "Rpl18"
## [111] "Rps17" "Rps3" "Rpl27a" "Rps13" "Rps15a"
## [116] "Rplp2" "Rpl18a" "Rpl13" "Rps25" "Rpl10-ps3"
## [121] "Rplp1" "Rpl4" "Rps27l" "Rpl29" "Rps27rt"
## [126] "Rpsa" "Rpl14" "Rps12" "Rps15" "Rpl41"
## [131] "Rps26" "Rps27a" "Rpl26" "Rpl23a" "Rpl9-ps1"
## [136] "Rps6kb1" "Rpl23" "Rpl19" "Rpl27" "Rpl38"
## [141] "Rps7" "Rpl10l" "Rps29" "Rpl36al" "Rps6kl1"
## [146] "Rps6ka5" "Rps23" "Rpl15" "Rps24" "Rpl36a-ps1"
## [151] "Rpl37" "Rpl30" "Rpl8" "Rpl3" "Rps19bp1"
## [156] "Rpl39l" "Rpl35a" "Rpl24" "Rps6ka2" "Rps2"
## [161] "Rpl3l" "Rps10" "Rpl10a" "Rps28" "Rps18"
## [166] "Rpl7l1" "Rpl36" "Rpl36-ps4" "Rps14" "Rpl17"
## [171] "Rps6kb2" "Rps6ka4" "Rpl9-ps6" "Rpl39" "Rpl10"
## [176] "Rps4x" "Rps6ka6" "Rpl36a" "Rps6ka3"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt" "Hbb-bs" "Hbb-bh2" "Hbb-bh1" "Hbb-y" "Hba-x" "Hba-a1"
## [8] "Hba-a2"
# percent Ttr
mouse$percent.ttr <- PercentageFeatureSet(mouse, features = "Ttr")
Remove sample Adu.F.738A due to low cell count and abberant QC metrics.
# remove Adu.F.738A
mouse <- subset(mouse, sample != "Adu.F.738A")
# reset order
sample_order <- c("IgG.F.939A","IgG.F.939B","IgG.F.959B",
"IgG.M.823A","IgG.M.851A",
"Adu.F.736B","Adu.F.738B",
"Adu.M.705A","Adu.M.734A")
mouse$sample <- factor(mouse$sample, levels = sample_order)
sample_colors <- c("gray","red","orange","yellow","green","forestgreen","blue",
"purple","pink")
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")
ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,20000, by = 2000), limits = c(0,20000)) +
ggtitle("Raw: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1
# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = ribo.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v1
# percent violins
v2 <- VlnPlot(mouse,
features = c("percent.mt","percent.ribo.protein","percent.hb","percent.ttr"),
ncol = 4,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v2
s1 <- ggplot(
mouse@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min) +
geom_hline(yintercept = nFeature.min) +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula 'y ~ x'
s2 <- FeatureScatter(mouse,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s2
# filter
mouse.filtered <- subset(mouse,
subset = (nCount_RNA > nCount.min) &
(nCount_RNA < nCount.max) &
(nFeature_RNA > nFeature.min) &
(cell.complexity > complexity.cutoff) &
(percent.mt < mt.cutoff) &
(percent.ribo.protein < ribo.cutoff) &
(percent.hb < hb.cutoff) &
(percent.ttr < ttr.cutoff))
# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "18815 cells removed"
Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.
# filter genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
nonzero <- counts > 0 # produces logical
keep <- Matrix::rowSums(nonzero) >= 10 # sum the true/false
counts.filtered <- counts[keep,] # keep certain genes
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "11269 features removed"
Multiple passes were made to assess whether mitochondrial, ribosomal, and immunoglobulin genes should be removed. Ultimately, removal of these genes enhanced clustering.
# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]
# remove ribo.genes
#keep <- !rownames(counts.filtered) %in% ribo.combined
#counts.filtered <- counts.filtered[keep,]
# remove Ig genes + Jchain but keep Igha + Ighd to enahnce clustering
#gene.types <- c("IG_C_gene","IG_C_pseudogene","IG_D","IG_J_gene","IG_LV_gene",
# "IG_V_gene","IG_V_pseudogene")
#keep <- (genes$gene_type) %in% gene.types
#ig.genes <- genes[keep,]
#ig.genes <- c(ig.genes$gene_name, "Jchain")
#ig.genes <- ig.genes[-c(185,192)] # keep Igha and Ighd
#ig.genes
#keep <- !rownames(counts.filtered) %in% ig.genes
#counts.filtered <- counts.filtered[keep,]
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "13 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero)
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")
ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,20000, by = 2000), limits = c(0,20000)) +
ggtitle("Filtered: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = ribo.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v3
# percent violins
v4 <- VlnPlot(mouse.filtered,
features = c("percent.mt","percent.ribo.protein","percent.hb","percent.ttr"),
ncol = 4,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v4
s3 <- ggplot(
mouse.filtered@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min) +
geom_hline(yintercept = nFeature.min) +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula 'y ~ x'
s4 <- FeatureScatter(mouse.filtered,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s4
# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
aes(x = sample,
y = log10(nFeature_RNA),
fill=sample)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ggtitle("Unique Genes / Cell / Sample") +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("Sample")
b1
df <- data.frame(row.names = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum, decreasing = TRUE),]
colnames(df)[1] <- "rsum_raw_count"
head(df, 30)
## rsum_raw_count gene_name
## Malat1 8815978 Malat1
## Meg3 800075 Meg3
## Gm42418 620311 Gm42418
## Snhg11 586740 Snhg11
## Cst3 532592 Cst3
## Srrm2 170959 Srrm2
## Apoe 164760 Apoe
## Kcnq1ot1 148645 Kcnq1ot1
## Gria2 137485 Gria2
## R3hdm1 135961 R3hdm1
## Nrxn1 122347 Nrxn1
## Ctsd 114336 Ctsd
## Gm26917 114066 Gm26917
## Ctss 112414 Ctss
## Ahi1 111442 Ahi1
## Rian 107241 Rian
## Ptprd 103018 Ptprd
## Hexb 100381 Hexb
## C1qa 99541 C1qa
## C1qb 96931 C1qb
## Macf1 94906 Macf1
## Syne1 90413 Syne1
## Syt1 89682 Syt1
## Ank2 86064 Ank2
## Snhg14 85484 Snhg14
## Mycbp2 85463 Mycbp2
## AC149090.1 85272 AC149090.1
## Celf2 79188 Celf2
## Son 78609 Son
## Ube3a 76987 Ube3a
# log normalization
mouse.phase <- NormalizeData(mouse.filtered)
# load mouse cell cycle markers
phase.markers <- read.delim("../../refs/mouse_cell_cycle.txt")
colnames(phase.markers)[2] <- "gene_id"
phase.markers <- left_join(phase.markers, genes[,c(10,13)], by = "gene_id")
g2m <- phase.markers[phase.markers$phase == "G2/M", 4]
g2m
## [1] "Ube2c" "Lbr" "Ctcf" "Cdc20" "Cbx5" "Kif11" "Anp32e"
## [8] "Birc5" "Cdk1" "Tmpo" "Hmmr" "Jpt1" "Pimreg" "Aurkb"
## [15] "Top2a" "Gtse1" "Rangap1" "Cdca3" "Ndc80" "Kif20b" "Cenpf"
## [22] "Nek2" "Nuf2" "Nusap1" "Bub1" "Tpx2" "Aurka" "Ect2"
## [29] "Cks1b" "Kif2c" "Cdca8" "Cenpa" "Mki67" "Ccnb2" "Kif23"
## [36] "Smc4" "G2e3" "Tubb4b" "Anln" "Tacc3" "Dlgap5" "Ckap2"
## [43] "Ncapd2" "Ttk" "Ckap5" "Cdc25c" "Hjurp" "Cenpe" "Ckap2l"
## [50] "Cdca2" "Hmgb2" "Cks2" "Psrc1" "Gas2l3"
s <- phase.markers[phase.markers$phase == "S", 4]
s
## [1] "Cdc45" "Uhrf1" "Mcm2" "Slbp" "Mcm5" "Pola1"
## [7] "Gmnn" "Cdc6" "Rrm2" "Atad2" "Dscc1" "Mcm4"
## [13] "Chaf1b" "Rfc2" "Msh2" "Fen1" "Hells" "Prim1"
## [19] "Tyms" "Mcm6" "Wdr76" "Rad51" "Pcna" "Ccne2"
## [25] "Casp8ap2" "Usp1" "Nasp" "Rpa2" "Ung" "Rad51ap1"
## [31] "Blm" "Pold3" "Rrm1" "Cenpu" "Gins2" "Tipin"
## [37] "Brip1" "Dtl" "Exo1" "Ubr7" "Clspn" "E2f8"
## [43] "Cdca7"
# write table
write.table(phase.markers,
"../../results/nine_samples/cellcycle/mouse_cell_cycle_phase_markers.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# score cells
mouse.phase <- CellCycleScoring(mouse.phase,
g2m.features = g2m,
s.features = s,
set.ident = TRUE)
mouse.filtered[["phase"]] <- mouse.phase$Phase
Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.
# Identify the most variable genes
mouse.phase <- FindVariableFeatures(mouse.phase, verbose = FALSE)
# Preview top 40
head(VariableFeatures(mouse.phase), 40)
## [1] "Spp1" "Apoe" "Cd74" "Ptgds" "Lyz2" "Tmsb4x"
## [7] "Ccl12" "Ccl4" "Fth1" "Flt1" "H2-Aa" "Bsg"
## [13] "Cxcl12" "Cldn5" "H2-Ab1" "H2-Eb1" "Plp1" "Rgs5"
## [19] "Cst7" "Cxcl10" "Ctsd" "Vtn" "Ccl3" "Ftl1"
## [25] "Rps29" "Sst" "Actb" "Camk2n1" "Mal" "Tyrobp"
## [31] "Slco1a4" "Ccl5" "Ifi27l2a" "Selenow" "Fau" "Vip"
## [37] "Eef1a1" "Itm2a" "Ifitm3" "Cryab"
# Scale the counts
mouse.phase <- ScaleData(mouse.phase)
## Centering and scaling data matrix
If the PCA plots for each phase do not look similar you may want to regress out variation due to cell cycle phase. Otherwise, nothing needs to be done. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
# Run PCA
mouse.phase <- RunPCA(mouse.phase, nfeatures.print = 10)
## PC_ 1
## Positive: R3hdm1, Erc2, Atp2b1, Phactr1, Pde10a, Gria1, Homer1, Cit, Gm3764, Slc17a7
## Negative: Cst3, Tmsb4x, Ctsd, C1qb, C1qa, Itm2b, C1qc, Ctss, Hexb, Tyrobp
## PC_ 2
## Positive: Cx3cr1, Gpr34, Csf1r, Hexb, Lgmn, C1qc, Ctss, Lpcat2, Tgfbr1, C1qa
## Negative: Cox8a, Selenow, Rpl9, Cox4i1, Dbi, Mt3, Chchd2, Rpl38, Calm1, Rpl3
## PC_ 3
## Positive: Atp1a2, Slc1a2, Slc1a3, Ntsr2, Ptprz1, Qk, Bcan, Appl2, Ndrg2, Sparcl1
## Negative: R3hdm1, Atp2b1, Erc2, Homer1, Phactr1, Pde10a, Slc17a7, Plk2, Gria1, Cit
## PC_ 4
## Positive: Camk2n1, Mt3, Selenow, Pcsk1n, Cox8a, Cpe, Dbi, Aldoc, 2900097C17Rik, Cox6c
## Negative: Flt1, Ahnak, Rgs5, Slco1a4, Pltp, Cldn5, Ly6a, Cxcl12, Ptprb, Egfl7
## PC_ 5
## Positive: Plp1, Mbp, Mag, Enpp2, Mal, Gjc3, Aspa, Gatm, Ptgds, Sept4
## Negative: Ntsr2, Gm6145, Gm3764, Nwd1, Slc1a2, Phkg1, Fgfr3, Slc1a3, Atp13a4, Slc7a10
# Plot
pca1 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "Phase",
split.by = "Phase")
pca1
pca2 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "Phase",
shuffle = TRUE)
pca2
data <- as.data.frame(table(mouse.phase$Phase))
colnames(data) <- c("Phase","frequency")
ncells3 <- ggplot(data, aes(x = Phase, y = frequency, fill = Phase)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
ggtitle("Cells per phase")
ncells3
percent.phase <- mouse.phase@meta.data %>%
group_by(sample, Phase) %>%
dplyr::count() %>%
group_by(sample) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x = sample, y = percent, fill = Phase)) +
geom_col() +
ggtitle("Percentage of phase per sample") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
percent.phase
Evaluating effects of mitochondrial expression
# Check quartile values and store
summary(mouse.phase$percent.mt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 1.4390 0.6983 19.9800
first <- as.numeric(summary(mouse.phase$percent.mt)[2])
mean <- as.numeric(summary(mouse.phase$percent.mt)[4])
third <- as.numeric(summary(mouse.phase$percent.mt)[5])
# Turn percent.mt into factor based on quartile value
mouse.phase@meta.data$mito.factor <- cut(mouse.phase@meta.data$percent.mt,
breaks=c(-Inf, first, mean, third, Inf),
labels=c("Low","Medium","Medium high", "High"))
mouse.filtered[["mito.factor"]] <- mouse.phase$mito.factor
# Plot
pca1 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "mito.factor",
split.by = "mito.factor")
pca1
pca2 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "mito.factor",
shuffle = TRUE)
pca2
data <- as.data.frame(table(mouse.phase$mito.factor))
colnames(data) <- c("mito.factor","frequency")
ncells4 <- ggplot(data, aes(x = mito.factor, y = frequency, fill = mito.factor)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
ggtitle("Cells per phase")
ncells4
percent <- mouse.phase@meta.data %>%
group_by(sample, mito.factor) %>%
dplyr::count() %>%
group_by(sample) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x = sample, y = percent, fill = mito.factor)) +
geom_col() +
ggtitle("Mitochondrial fraction per sample") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
percent
Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model.
Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase or percent.mito and so we chose not to regress this out of our data.
Since we have 8 samples in our dataset we want to keep them as separate objects and transform them as that is what is required for integration.
# split
mouse.split <- SplitObject(mouse.filtered, split.by = "sample")
We will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.
Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb).
options(future.globals.maxSize = 4000 * 1024^5)
for (i in 1:length(mouse.split)) {
print(paste0("Sample ", i))
mouse.split[[i]] <- SCTransform(mouse.split[[i]],
verbose = FALSE,
vars.to.regress = "percent.mt")
}
## [1] "Sample 1"
## [1] "Sample 2"
## [1] "Sample 3"
## [1] "Sample 4"
## [1] "Sample 5"
## [1] "Sample 6"
## [1] "Sample 7"
## [1] "Sample 8"
## [1] "Sample 9"
remove(mouse.filtered)
# Choose the features to use when integrating multiple data sets
# We will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = mouse.split,
nfeatures = 3000)
# merge the mouse
mouse.merged <- merge(x = mouse.split[[1]],
y = c(mouse.split[[2]], mouse.split[[3]],
mouse.split[[4]], mouse.split[[5]],
mouse.split[[6]], mouse.split[[7]],
mouse.split[[8]], mouse.split[[9]]))
# define the variable features
VariableFeatures(mouse.merged) <- var.features
# run PCA on the merged object
mouse.merged <- RunPCA(object = mouse.merged, assay = "SCT")
# harmony dimensional reduction
mouse.integrated <- RunHarmony(object = mouse.merged,
group.by.vars = "sample",
assay.use = "SCT",
reduction = "pca",
plot_convergence = TRUE)
# save and cleanup
saveRDS(mouse.integrated, "../../rObjects/mouse_nine_samples_integrated.rds")
remove(mouse.split, var.features, mouse.merged)
# Reset idents and levels
DefaultAssay(mouse.integrated) <- "SCT"
Idents(mouse.integrated) <- "sample"
mouse.integrated$sample <- factor(mouse.integrated$sample,
levels = sample_order)
mouse.integrated$sex <- factor(mouse.integrated$sex,
levels = sex_order)
mouse.integrated$group <- factor(mouse.integrated$group,
levels = group_order)
# check PCA
p1 <- DimPlot(object = mouse.integrated,
reduction = "harmony",
group.by = "sample",
cols = sample_colors,
shuffle = TRUE) + NoLegend()
p1
p2 <- VlnPlot(object = mouse.integrated,
features = "harmony_1",
group.by = "sample",
pt.size = 0,
cols = sample_colors) + NoLegend()
p2
Top 20 variable features
top20 <- mouse.integrated@assays$SCT@var.features[1:20]
top20
## [1] "Cst3" "Apoe" "Ctss" "Ctsd" "Fth1" "Tmsb4x" "C1qa"
## [8] "C1qb" "Hexb" "Gm42418" "Plp1" "C1qc" "Ptgds" "Trem2"
## [15] "Cdr1os" "Cst7" "Snhg11" "Itm2b" "Ctsb" "Ctsz"
After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.
To generate these visualizations with the harmony output, use reduction = “harmony”
# Plot PCA
pca1 <- DimPlot(mouse.integrated,
reduction = "harmony",
ncol = 3,
split.by = "sample",
group.by = "sample",
cols = sample_colors)
pca1
pca2 <- DimPlot(mouse.integrated,
reduction = "harmony",
split.by = "group",
group.by = "group",
cols = group_colors)
pca2
pca3 <- DimPlot(mouse.integrated,
reduction = "harmony",
split.by = "sex",
group.by = "sex",
cols = sex_colors)
pca3
To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
# Printing out the most variable genes driving PCs
print(x = mouse.integrated[["pca"]],
dims = 1:10,
nfeatures = 10)
## PC_ 1
## Positive: Gm42418, R3hdm1, Ptprd, Gria2, Ahi1, Gm26917, Kcnq1ot1, Rian, Syt1, Nrxn1
## Negative: Cst3, C1qa, Ctss, C1qb, Hexb, Ctsd, C1qc, Itm2b, Trem2, Fcer1g
## PC_ 2
## Positive: Hexb, Ctss, C1qb, Cst3, C1qa, Gpr34, Ctsd, Cx3cr1, C1qc, Csf1r
## Negative: Tmsb4x, Fth1, Rpl13, Rps24, Eef1a1, Rps29, Fau, Rpl19, Rplp1, Tpt1
## PC_ 3
## Positive: R3hdm1, Ahi1, Syt1, Rian, Arpp21, Snhg11, Celf2, Snhg14, Atp2b1, Trank1
## Negative: Slc1a2, Atp1a2, Slc1a3, Qk, Ptprz1, Bcan, Apoe, Appl2, Ntsr2, Ndrg2
## PC_ 4
## Positive: Plp1, Mbp, Neat1, Mag, Enpp2, P2ry12, Cx3cr1, Qk, Ptgds, Gatm
## Negative: Apoe, Cst7, Lyz2, Ctsb, Ctsz, Tyrobp, Cd63, Ctsl, Trem2, Slc1a2
## PC_ 5
## Positive: Cx3cr1, Slc1a2, P2ry12, Tmem119, Selplg, Serinc3, Cst3, Lpcat2, Csf1r, Ifngr1
## Negative: Plp1, Mbp, Neat1, Apoe, Mag, Cst7, Enpp2, Lyz2, Trf, Ptgds
## PC_ 6
## Positive: Gm42418, Camk2n1, Pcsk1n, Calm1, Selenow, Mt3, Cpe, Cox8a, 2900097C17Rik, Dbi
## Negative: Tmsb4x, Actb, Marcks, Rgs10, Pfn1, Fau, Aif1, Rps9, Ftl1, Sh3bgrl3
## PC_ 7
## Positive: Gm42418, Bsg, Flt1, Ahnak, Ly6a, Clec2d, H2-D1, H2-K1, Ifitm3, Pecam1
## Negative: Cst3, Plp1, C1qb, Mbp, C1qa, Ctss, Slc1a2, Neat1, Hexb, C1qc
## PC_ 8
## Positive: R3hdm1, Mef2c, Slc17a7, Ptprd, Arpp21, Camk2a, A830036E02Rik, Homer1, Gpm6b, Gria3
## Negative: Gm42418, AY036118, Gm26917, C130073E24Rik, Erbb4, Cst3, Rgs9, Dpp6, Usp29, Cacna2d2
## PC_ 9
## Positive: Rgs9, Pde10a, Phactr1, Gnal, Foxp1, Meis2, Dgkb, Unc13c, Cacna2d3, Cpne5
## Negative: Gm42418, Snhg11, Ptprd, Tcf4, Miat, Nrxn1, Peg3, Elavl2, Ntng1, Srrm4
## PC_ 10
## Positive: Bsg, Flt1, Ahnak, Ly6a, Pecam1, Slc2a1, Ptma, Ptn, Spock2, Ifitm3
## Negative: Apoe, Fth1, Camk2n1, Cyth4, Hpgds, Gm42418, Pcsk1n, Mafb, Cst7, Maf
Quantitative approach to an elbow plot
- The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation.
- The point where the percent change in variation between the consecutive PCs is less than 0.1%.
First metric
# Determine percent of variation associated with each PC
stdv <- mouse.integrated[["pca"]]@stdev
sum.stdv <- sum(mouse.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)
# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 41
Second metric
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
(percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
## [1] 15
Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 15
Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
# Run UMAP
mouse.integrated <- RunUMAP(mouse.integrated,
dims = 1:min.pc,
reduction = "harmony",
n.components = 3) # set to 3 to use with VR
# plot UMAP
DimPlot(mouse.integrated,
shuffle = TRUE)
Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].
We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.
The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.
# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.integrated,
assay = "SCT", # set as default after SCTransform
reduction = "harmony",
dims = 1:min.pc)
# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
algorithm = 1, # 1 = Louvain
resolution = seq(0.1,0.5,by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 50982
## Number of edges: 1614632
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9616
## Number of communities: 7
## Elapsed time: 18 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 50982
## Number of edges: 1614632
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9434
## Number of communities: 9
## Elapsed time: 18 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 50982
## Number of edges: 1614632
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9270
## Number of communities: 10
## Elapsed time: 22 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 50982
## Number of edges: 1614632
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9146
## Number of communities: 16
## Elapsed time: 19 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 50982
## Number of edges: 1614632
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9054
## Number of communities: 16
## Elapsed time: 21 seconds
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.3
# 0.1
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.1",
label = TRUE)
# 0.2
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE)
# 0.3
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.3",
label = TRUE)
# 0.4
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.4",
label = TRUE)
# 0.5
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.5",
label = TRUE)
embeddings <- mouse.unannotated@reductions$umap@cell.embeddings
embeddings <- cbind(embeddings, as.character(mouse.unannotated$seurat_clusters))
colnames(embeddings)[4] <- "seurat_clusters"
embeddings <- as.data.frame(embeddings)
embeddings$seurat_clusters <- factor(embeddings$seurat_clusters,
levels = c("0","1","2","3","4",
"5","6","7","8","9"))
cluster_colors <- c("chocolate4","gray","red1","yellow","green", "darkgreen","cyan",
"blue","plum1","magenta1")
three.dim <- plot_ly(embeddings,
x = ~UMAP_1,
y = ~UMAP_2,
z = ~UMAP_3,
color = ~seurat_clusters,
colors = cluster_colors,
size = 1)
three.dim <- three.dim %>% add_markers()
three.dim <- three.dim %>% layout(scene = list(xaxis = list(title = 'UMAP_1'),
yaxis = list(title = 'UMAP_2'),
zaxis = list(title = 'UMAP_3')))
three.dim
# not split
Idents(mouse.unannotated) <- "SCT_snn_res.0.3"
u0 <- DimPlot(mouse.unannotated,
label = FALSE,
cols = cluster_colors)
u0
# sample
u1 <- DimPlot(mouse.unannotated,
label = FALSE,
cols = cluster_colors,
split.by = "sample",
ncol = 3)
u1
# group
u2 <- DimPlot(mouse.unannotated,
label = FALSE,
cols = cluster_colors,
split.by = "group")
u2
# sex
u3 <- DimPlot(mouse.unannotated,
label = FALSE,
cols = cluster_colors,
split.by = "sex")
u3
# phase
u4 <- DimPlot(mouse.unannotated,
label = FALSE,
cols = cluster_colors,
split.by = "phase")
u4
# mito.factor
u5 <- DimPlot(mouse.unannotated,
label = FALSE,
cols = cluster_colors,
split.by = "mito.factor",ncol = 2)
u5
# nCount
f1 <- FeaturePlot(mouse.unannotated,
features = "nCount_RNA",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f1
# nFeature
f2 <- FeaturePlot(mouse.unannotated,
features = "nFeature_RNA",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f2
# percent.mt
f3 <- FeaturePlot(mouse.unannotated,
features = "percent.mt",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f3
# cell.complexity
f4 <- FeaturePlot(mouse.unannotated,
features = "cell.complexity",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f4
# percent.ribo
f5 <- FeaturePlot(mouse.unannotated,
features = "percent.ribo.protein",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f5
# percent.hb
f6 <- FeaturePlot(mouse.unannotated,
features = "percent.hb",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f6
# percent.ttr
f7 <- FeaturePlot(mouse.unannotated,
features = "percent.ttr",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f7
# sample
b1 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, sample) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = sample_colors) +
ggtitle("Percentage of sample per cluster")
b1
# group
b2 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, group) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=group)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = group_colors) +
ggtitle("Percentage of group per cluster")
b2
# sex
b3 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, sex) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sex)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = sex_colors) +
ggtitle("Percentage of sex per cluster")
b3
# phase
b4 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, phase) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=phase)) +
theme_classic() +
geom_col() +
ggtitle("Percentage of phase per cluster")
b4
# mito.factor
b5 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, mito.factor) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=mito.factor)) +
theme_classic() +
geom_col() +
ggtitle("Percentage of mito.factor per cluster")
b5
# sample
sample_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "sample")) %>%
dplyr::count(ident, sample) %>%
tidyr::spread(ident, n)
sample_ncells
## sample 0 1 2 3 4 5 6 7 8 9
## 1 IgG.F.939A 1092 1384 917 438 337 354 343 391 558 43
## 2 IgG.F.939B 135 88 282 36 9 19 8 90 39 3
## 3 IgG.F.959B 182 154 412 50 20 30 13 151 75 4
## 4 IgG.M.823A 4278 2809 1191 1105 986 740 464 367 280 39
## 5 IgG.M.851A 4939 3281 1324 1273 909 974 716 328 474 39
## 6 Adu.F.736B 110 104 211 22 13 9 5 86 38 4
## 7 Adu.F.738B 466 389 805 124 60 60 46 296 84 12
## 8 Adu.M.705A 1505 1135 807 562 425 394 401 576 157 19
## 9 Adu.M.734A 2056 1813 1092 748 733 485 502 173 259 23
write.table(sample_ncells,
"../../results/nine_samples/ncells/cells_per_cluster_per_sample_uannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# group
group_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "group")) %>%
dplyr::count(ident, group) %>%
tidyr::spread(ident, n)
group_ncells
## group 0 1 2 3 4 5 6 7 8 9
## 1 IgG 10626 7716 4126 2902 2261 2117 1544 1327 1426 128
## 2 Adu 4137 3441 2915 1456 1231 948 954 1131 538 58
write.table(group_ncells,
"../../results/nine_samples/ncells/cells_per_cluster_per_group_unannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# sex
sex_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "sex")) %>%
dplyr::count(ident, sex) %>%
tidyr::spread(ident, n)
sex_ncells
## sex 0 1 2 3 4 5 6 7 8 9
## 1 Male 12778 9038 4414 3688 3053 2593 2083 1444 1170 120
## 2 Female 1985 2119 2627 670 439 472 415 1014 794 66
write.table(sex_ncells,
"../../results/nine_samples/ncells/cells_per_cluster_per_sex_unannoated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# mito.factor
mito.factor_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "mito.factor")) %>%
dplyr::count(ident, mito.factor) %>%
tidyr::spread(ident, n)
mito.factor_ncells
## mito.factor 0 1 2 3 4 5 6 7 8 9
## 1 High 514 699 4277 432 112 288 582 1327 1673 144
## 2 Low 13166 8047 682 3376 2941 2062 1031 102 85 9
## 3 Medium 909 1882 1265 423 338 551 639 643 65 20
## 4 Medium high 174 529 817 127 101 164 246 386 141 13
write.table(mito.factor_ncells,
"../../results/nine_samples/ncells/cells_per_cluster_per_mito_factor_unannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# phase
phase_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "phase")) %>%
dplyr::count(ident, phase) %>%
tidyr::spread(ident, n)
phase_ncells
## phase 0 1 2 3 4 5 6 7 8 9
## 1 G1 5013 4154 4003 1894 1480 1071 793 1168 999 58
## 2 G2M 4514 3264 1487 1780 874 1047 1090 807 586 77
## 3 S 5236 3738 1551 684 1138 947 615 483 379 51
## 4 Undecided NA 1 NA NA NA NA NA NA NA NA
write.table(phase_ncells,
"../../results/nine_samples/ncells/cells_per_cluster_per_phase_unannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
DefaultAssay(mouse.unannotated) <- "RNA"
mouse.unannotated <- NormalizeData(mouse.unannotated)
Idents(mouse.unannotated) <- "SCT_snn_res.0.3"
VlnPlot(mouse.unannotated,
features = "Aqp4",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Gfap",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Gja1",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Pecam1",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cdh5",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Kdr",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Flt1",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Col1a1",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Col1a2",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Dcn",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Tmem119",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Itgam", # aka Cd11b
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd14",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd68",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Ccr5",
cols = cluster_colors,
group.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Gad1",
cols = cluster_colors,
split.by = "seurat_clusters")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
VlnPlot(mouse.unannotated,
features = "Gad2",
cols = cluster_colors,
split.by = "seurat_clusters")
VlnPlot(mouse.unannotated,
features = "Slc32a1",
cols = cluster_colors,
split.by = "seurat_clusters")
VlnPlot(mouse.unannotated,
features = "Cspg5",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Gpr17",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Olig1",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Acta2", # actin alpha 2, smooth muscle
cols = cluster_colors)
VlnPlot(mouse.unannotated,
features = "Trac",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd3d",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd3e",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd3g",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd8a",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
VlnPlot(mouse.unannotated,
features = "Cd4",
cols = cluster_colors,
split.by = "SCT_snn_res.0.3")
mouse.unannotated@assays$RNA@var.features <- mouse.unannotated@assays$SCT@var.features
metadata <- mouse.unannotated@meta.data
metadata <- metadata[,c(1,21,2:20)]
mouse.unannotated@meta.data <- metadata
mouse.unannotated@assays$SCT@meta.features <- metadata
mouse.unannotated@assays$RNA@meta.features <- metadata
# make shiny folder
DefaultAssay(mouse.unannotated) <- "RNA"
Idents(mouse.unannotated) <- mouse.unannotated$seurat_clusters
sc.config <- createConfig(mouse.unannotated)
makeShinyApp(mouse.unannotated, sc.config, gene.mapping = TRUE,
shiny.title = "nine_samples")