knitr::opts_knit$set(root.dir = ".")
# load libraries
library(cowplot) # plot_grid()
library(DoubletFinder) # paramSweep()
library(dplyr) # ungroup()
library(ggrepel) # geom_text_repel()
library(ggplot2) # ggplot()
library(grid) # grid.arrange()
library(gridExtra) # grid.arrange()
library(harmony) # RunHarmony()
library(reshape2) # melt()
library(Seurat) # Read10X_h5()
library(Seurat.utils) # RenameGenesSeurat()
library(stringr) # str_match()
# ---- project configuration ----
# directory holding the pig reference files
pathToRef <- "/research/labs/neurology/fryer/projects/references/pig"

# experimental design
tissue <- "Brain"
control <- "Saline"
treatment <- "LPS"
myContrasts <- "LPS - Control"
tool <- "cellranger"

# plot colors: one per treatment group, then one per sample
# (two control samples followed by two treated samples)
control_color <- "darkgray"
treatment_color <- "purple"
treatment_colors <- c(control_color, treatment_color)
sample_colors <- rep(treatment_colors, each = 2)

# QC thresholds
nCount.min <- 500         # lower bound on nCount_RNA
nFeature.min <- 250       # lower bound on nFeature_RNA
complexity.cutoff <- 0.85 # lower bound on cell.complexity
mt.cutoff <- 1            # upper bound on percent.mt
hb.cutoff <- 5            # upper bound on percent.hb
These functions will help simultaneously save plots as a png and pdf.
# Copy the contents of the current graphics device into a new PDF device,
# then close that PDF device.
# Arguments in `...` are handed to dev.copy(), which forwards them to pdf().
saveToPDF <- function(...) {
  pdf_device <- dev.copy(pdf, ...)
  dev.off(which = pdf_device)
}
# Copy the contents of the current graphics device into a new PNG device,
# then close that PNG device.
# Arguments in `...` are handed to dev.copy(), which forwards them to png().
saveToPNG <- function(...) {
  png_device <- dev.copy(png, ...)
  dev.off(which = png_device)
}
Using CellBender filtered output.
# CellBender files are addressed as <prefix><sample id><suffix>
prefix <- "../../cellbender/"
suffix <- "_cellbender_fpr_0.05_filtered.h5"
if (tissue == "Brain") {
  if (file.exists("../../rObjects/brain_merged_h5.rds")) {
    # a merged object was saved by an earlier run: reuse it
    pigs <- readRDS("../../rObjects/brain_merged_h5.rds")
  } else {
    # individual sample objects, in the same order as the cell-id prefixes below
    h5.ids <- c("4_BR_R", "10_BR", "8_BR_R", "12_BR")
    sample.objects <- lapply(h5.ids, function(id) {
      CreateSeuratObject(Read10X_h5(paste0(prefix, id, suffix)))
    })
    # merge objects
    pigs <- merge(x = sample.objects[[1]],
                  y = sample.objects[-1],
                  add.cell.ids = c("4.R.Saline","10.Saline","8.R.LPS", "12.LPS"),
                  project = paste0("LPS Pigs ", tissue, " Single Nucleus"))
    # cleanup and save
    remove(h5.ids, sample.objects)
    saveRDS(pigs, "../../rObjects/brain_merged_h5.rds")
  }
}
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# add manual pig annotations (KK_gene_name)
genes <- read.csv(paste0(pathToRef, "/manual_pig_annotations.csv"))
# Build an ENSEMBL_ID -> OUR_ID lookup and hand it to str_replace_all() as a
# named vector, so that every ID is tried against every feature name.
# Supplying the two columns as separate `pattern`/`replacement` vectors pairs
# them with the feature names by position (with recycling), which only renames
# a gene when its row happens to line up with the matching annotation row.
id.lookup <- setNames(as.character(genes$OUR_ID),
                      as.character(genes$ENSEMBL_ID))
new.names <- str_replace_all(rownames(pigs), id.lookup)
pigs <- RenameGenesSeurat(pigs, new.names)
## [1] "Run this before integration. It only changes obj@assays$RNA@counts, @data and @scale.data."
# preview
pigs
## An object of class Seurat
## 30490 features across 38007 samples within 1 assay
## Active assay: RNA (30490 features, 0 variable features)
nCount_RNA = total number of transcripts (UMIs) in a single cell; nFeature_RNA = number of unique genes (features) in a single cell
# create sample column
# cell names look like "<sample>_<barcode>-<number>"; the sample part is the
# first capture group of the pattern
barcodes <- colnames(x = pigs)
pattern <- "(.+)_[ACGT]+-(\\d+)"
barcode.parts <- str_match(string = barcodes, pattern = pattern)
sample <- barcode.parts[, 2] # column 1 is the full match, column 2 the sample
table(sample)
## sample
## 10.Saline 12.LPS 4.R.Saline 8.R.LPS
## 24959 10002 2360 686
# store as a factor so samples are ordered controls first, then treated
sample.levels <- c("4.R.Saline", "10.Saline", "8.R.LPS", "12.LPS")
pigs$sample <- factor(x = sample, levels = sample.levels)
table(pigs$sample) # check
##
## 4.R.Saline 10.Saline 8.R.LPS 12.LPS
## 2360 24959 686 10002
# use the sample as the active identity
Idents(object = pigs) <- pigs$sample
Add treatment column to metadata
# create treatment column
# Map each sample to its treatment group with an exact-match lookup. The
# sample names contain ".", which gsub() would interpret as a regex wildcard,
# so a lookup is used instead of pattern substitution.
sample.treatment <- c("4.R.Saline" = control,
                      "10.Saline" = control,
                      "8.R.LPS" = treatment,
                      "12.LPS" = treatment)
treat <- unname(sample.treatment[as.character(pigs$sample)])
# levels come from the configured group names: control first, then treatment
pigs$treatment <- factor(treat, levels = c(control, treatment))
table(pigs$treatment)
##
## Saline LPS
## 27319 10688
# cell.complexity: log10 of detected genes relative to log10 of UMIs
pigs$cell.complexity <- log10(pigs$nFeature_RNA) / log10(pigs$nCount_RNA)
# percent.mt
mt.genes <- c("ND1","ND2","COX1","COX2","ATP8","ATP6","COX3","ND3",
              "ND4L","ND4","ND5","ND6","CYTB")
pigs$percent.mt <- PercentageFeatureSet(pigs, features = mt.genes)
# percent.ribo
# ribosomal protein genes are named RPL* (large subunit) or RPS* (small
# subunit); the pattern must be "^RP[SL]" -- "^R[SP]L" would match RPL*/RSL*
# and miss every RPS* gene
gene.names <- rownames(pigs)
ribo.genes <- grep("^RP[SL]", gene.names, value = TRUE)
pigs$percent.ribo <- PercentageFeatureSet(pigs, features = ribo.genes)
# percent.hb
# hemoglobin genes start with 'HB'; the character class excludes names that
# continue with 'P' (i.e. 'HBP...') or a literal parenthesis
hb.genes <- grep("^HB[^(P)]", gene.names, value = TRUE)
pigs$percent.hb <- PercentageFeatureSet(pigs, features = hb.genes)
# Visualize the number of cells per sample as a labelled bar chart
data <- setNames(as.data.frame(table(pigs$sample)), c("sample", "frequency"))
ncells1 <- ggplot(data = data,
                  mapping = aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  # print the count just above each bar
  geom_text(mapping = aes(label = frequency),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_fill_manual(values = sample_colors) +
  scale_y_continuous(limits = c(0, 30000), breaks = seq(0, 30000, by = 5000)) +
  ggtitle("Raw: cells per sample") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))
ncells1
# set graphical parameter
# NOTE(review): par() configures base graphics; the plots below are ggplot
# objects arranged with grid.arrange(), so this probably has no visible
# effect here -- confirm before relying on it
par(mfrow = c(3,1))

# Shared skeleton of the per-sample QC density plots. `mapping` supplies the
# x variable (plus color/fill); everything else is common to the three plots.
qc_density <- function(mapping) {
  ggplot(pigs@meta.data, mapping) +
    geom_density(alpha = 0.2) +
    theme_classic() +
    scale_color_manual(values = sample_colors) +
    scale_fill_manual(values = sample_colors) +
    ylab("Density")
}

# Visualize nCount_RNA (log10 x axis, vertical line at the minimum count)
den1 <- qc_density(aes(x = nCount_RNA, color = sample, fill = sample)) +
  scale_x_log10() +
  xlab("nCount_RNA") +
  geom_vline(xintercept = nCount.min)

# Visualize percent.mt (vertical line at the mitochondrial cutoff)
den2 <- qc_density(aes(x = percent.mt, color = sample, fill = sample)) +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  xlab("% Mitochondrial Genes")

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- qc_density(aes(x = cell.complexity, color = sample, fill = sample)) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  geom_vline(xintercept = complexity.cutoff)

# Arrange graphs in grid: a single column with one plot per row
plots1 <- list(den1, den2, den3)
layout1 <- matrix(c(1, 2, 3), ncol = 1)
grid1 <- grid.arrange(grobs = plots1, layout_matrix = layout1)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Removed 17 rows containing non-finite values (stat_density).
## Removed 17 rows containing non-finite values (stat_density).
# nFeature, nCount, and cell.complexity violins
# one violin panel per metric, grouped by sample, without individual points
count.features <- c("nFeature_RNA", "nCount_RNA", "cell.complexity")
v1 <- VlnPlot(object = pigs,
              features = count.features,
              group.by = "sample",
              cols = sample_colors,
              pt.size = 0,
              ncol = 3)
v1
## Warning: Removed 17 rows containing non-finite values (stat_ydensity).
# percent violins
# one violin panel per percentage metric, grouped by sample, without points
percent.features <- c("percent.mt", "percent.ribo", "percent.hb")
v2 <- VlnPlot(object = pigs,
              features = percent.features,
              group.by = "sample",
              cols = sample_colors,
              pt.size = 0,
              ncol = 3)
v2
## Warning: Removed 17 rows containing non-finite values (stat_ydensity).
## Removed 17 rows containing non-finite values (stat_ydensity).
## Removed 17 rows containing non-finite values (stat_ydensity).
# nFeature_RNA against nCount_RNA (both on log10 axes), one facet per sample,
# points shaded by percent.mt; the straight lines mark the filtering minimums
s1.mapping <- aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)
s1 <- ggplot(data = pigs@meta.data, mapping = s1.mapping) +
  geom_point() +
  stat_smooth(method = lm) +
  geom_vline(xintercept = nCount.min) +
  geom_hline(yintercept = nFeature.min) +
  scale_x_log10() +
  scale_y_log10() +
  scale_colour_gradient(low = "gray90", high = "black", limits = c(0, 100)) +
  facet_wrap(~sample) +
  theme_classic()
# disabled highlight rectangle, kept for reference:
#geom_rect(aes(xmin=300, xmax=300, ymin=1000,
#              ymax=3000), color="transparent", fill="orange", alpha=0.3)
s1
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 17 rows containing non-finite values (stat_smooth).
# percent.mt against nCount_RNA, colored by sample
s2 <- FeatureScatter(object = pigs,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     cols = sample_colors,
                     group.by = "sample",
                     shuffle = TRUE)
s2
## Warning: Removed 17 rows containing missing values (geom_point).
We want to be careful filtering because removing things can easily lead to misinterpretation. For example, cells with high percent.mt could actually just be involved in respiratory processes.
We will filter based on 5 conditions:
– nCount_RNA > 500 – nFeature_RNA > 250 – cell.complexity > 0.85 – percent.mt < 1 – percent.hb < 5
We will also remove MT genes so as not to alter downstream differential expression.
# filter: keep only the nuclei that pass every QC threshold
pigs.filtered <- subset(
  x = pigs,
  subset = (nCount_RNA > nCount.min) &
    (nFeature_RNA > nFeature.min) &
    (cell.complexity > complexity.cutoff) &
    (percent.mt < mt.cutoff) &
    (percent.hb < hb.cutoff)
)
# print cells removed (columns of a Seurat object are cells)
print(paste0(ncol(pigs) - ncol(pigs.filtered), " cells removed"))
## [1] "26910 cells removed"
Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.
# filter genes: keep features with a non-zero count in at least 10 cells
counts <- GetAssayData(object = pigs.filtered, slot = "counts")
nonzero <- counts > 0 # logical matrix, TRUE where a gene was detected
keep <- Matrix::rowSums(nonzero) >= 10 # number of cells detecting each gene
counts.filtered <- counts[keep, ] # keep certain genes
# overwrite pigs.filtered with an object rebuilt from the retained genes,
# carrying over the existing per-cell metadata
pigs.filtered <- CreateSeuratObject(counts = counts.filtered,
                                    meta.data = pigs.filtered@meta.data)
# print features removed (rows of the count matrix are genes)
print(paste0(nrow(counts) - nrow(counts.filtered), " features removed"))
## [1] "8672 features removed"
https://github.com/chris-mcginnis-ucsf/DoubletFinder
Heterotypic - doublets derived from transcriptionally distinct cells. DoubletFinder works best on this type of doublet.
Homotypic - doublets derived from transcriptionally similar cells. DoubletFinder does not work as well on this type of doublet.
pANN - proportion of artificial nearest neighbors (pANN)
BCMVN - mean-variance normalized bimodality coefficient of pANN distributions produced during pN -pK parameter sweeps. The BCMVN may be used to identify the pK parameter.
Overview of steps:
A. Prepare each sample
B. pK Identification (no ground-truth) - defines the PC neighborhood size used to compute pANN
C. Homotypic Doublet Proportion Estimate - homotypic doublets may not be a problem depending on the type of analysis you are performing. If you have some doublets of the same type and their counts are normalized, they will generally represent the profile of single cells of the same type.
D. DoubletFinder
E. Visualize where the doublets are located
# Run DoubletFinder on each sample separately and keep only the singlets.
# Per sample: normalize/PCA/cluster -> pK sweep -> homotypic adjustment ->
# doubletFinder_v3() -> diagnostic plots and tables -> store the singlets back
# into pigs.split.

# Expected doublet formation rate used to derive nExp for doubletFinder_v3().
# nExp is a number of expected doublets, so it is derived from a doublet rate
# and not from pK (pK is the neighborhood size used to compute pANN).
# 7.5% is the example value from the DoubletFinder README -- tailor it to the
# number of nuclei loaded per sample.
doublet.rate <- 0.075

# common prefix of every file written below
df.prefix <- paste0("../../results/doubletFinder/", treatment, "_",
                    tolower(tissue), "_doubletFinder_")

# Print one plot into a PDF file; on.exit() closes the device even when
# printing the plot fails, so no device is left open.
save_pdf <- function(plot, file, width, height) {
  pdf(file, width = width, height = height)
  on.exit(dev.off(), add = TRUE)
  print(plot)
  invisible(file)
}

# split object by sample
pigs.split <- SplitObject(pigs.filtered, split.by = "sample")
for (i in seq_along(pigs.split)) {
  # normalize and find PCs
  print(i)
  pig_sample <- NormalizeData(pigs.split[[i]])
  sampleID <- levels(droplevels(pig_sample@meta.data$sample))
  pig_sample <- FindVariableFeatures(pig_sample,
                                     selection.method = "vst",
                                     nfeatures = 2000)
  pig_sample <- ScaleData(pig_sample)
  pig_sample <- RunPCA(pig_sample)

  # get significant PCs
  stdv <- pig_sample[["pca"]]@stdev
  sum.stdv <- sum(stdv)
  percent.stdv <- (stdv / sum.stdv) * 100
  cumulative <- cumsum(percent.stdv)
  # co1: first PC with > 90% cumulative share that itself contributes < 5%
  co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
  # co2: one past the last PC whose share exceeds the next PC's by > 0.1
  pc.drop <- -diff(percent.stdv) # percent.stdv[k] - percent.stdv[k + 1]
  co2 <- rev(which(pc.drop > 0.1))[1] + 1
  # use the smaller cutoff; ignore a cutoff that could not be determined and
  # fall back to all computed PCs when neither is available
  pc.candidates <- c(co1, co2)
  pc.candidates <- pc.candidates[!is.na(pc.candidates)]
  min.pc <- if (length(pc.candidates) > 0) min(pc.candidates) else length(stdv)

  # run umap
  pig_sample <- RunUMAP(pig_sample, dims = 1:min.pc, reduction = "pca")

  # cluster
  pig_sample <- FindNeighbors(object = pig_sample, dims = 1:min.pc)
  pig_sample <- FindClusters(object = pig_sample, resolution = 0.2)

  # Assign identity of clusters
  Idents(object = pig_sample) <- "RNA_snn_res.0.2"
  d1 <- DimPlot(pig_sample,
                reduction = "umap",
                label = TRUE,
                label.size = 6)
  # NOTE(review): the file name says "res0.02" although the clustering
  # resolution is 0.2; the name is kept so existing output paths do not change
  save_pdf(d1, paste0(df.prefix, "UMAP_res0.02_", sampleID, ".pdf"),
           width = 5, height = 4)

  # number of cells in each cluster (fixed set of cluster levels so the
  # before/after tables always have the same columns)
  cluster.levels <- levels(Idents(pig_sample))
  n_cells <- table(factor(Idents(pig_sample), levels = cluster.levels))

  ## pK Identification (no ground-truth)
  sweep.res.list <- paramSweep_v3(pig_sample, PCs = 1:min.pc, sct = FALSE)
  sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
  bcmvn <- find.pK(sweep.stats)

  # Optimal pK is taken as the maximum of the BCmvn distribution;
  # bcmvn$pK is a factor, so convert through character to get the number
  bcmvn_max <- bcmvn[which.max(bcmvn$BCmetric), ]
  pK_value <- as.numeric(as.character(bcmvn_max$pK))

  # Homotypic Doublet Proportion Estimate
  annotations <- pig_sample@meta.data$seurat_clusters
  homotypic.prop <- modelHomotypic(annotations)
  nExp_poi <- round(doublet.rate * nrow(pig_sample@meta.data))
  nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))

  # Run DoubletFinder
  pig_sample <- doubletFinder_v3(pig_sample, PCs = 1:min.pc,
                                 pN = 0.25, pK = pK_value, nExp = nExp_poi.adj,
                                 reuse.pANN = FALSE, sct = FALSE)

  # set DF class for calling doublets
  df.column <- grep("DF.classifications", colnames(pig_sample@meta.data),
                    value = TRUE)
  stopifnot(length(df.column) == 1)
  DF_class <- pig_sample@meta.data[[df.column]]

  # table showing the number of doublets and singlets
  write.table(table(DF_class),
              paste0(df.prefix, "table_", sampleID),
              sep = "\t", row.names = FALSE, quote = FALSE)
  pig_sample@meta.data[, "CellTypes_DF"] <- DF_class

  # plot doublets on top of singlets
  d2 <- DimPlot(pig_sample, group.by = "CellTypes_DF", reduction = "umap",
                order = "Doublet",
                cols = c("#66C2A5", "black"))
  save_pdf(d2, paste0(df.prefix, "UMAP_", sampleID, ".pdf"),
           width = 5, height = 4)

  # plot QC metrics on the UMAP
  f1 <- FeaturePlot(pig_sample,
                    reduction = "umap",
                    features = c("nFeature_RNA", "nCount_RNA",
                                 "cell.complexity", "percent.mt"),
                    pt.size = 0.4,
                    order = TRUE,
                    label = TRUE)
  save_pdf(f1, paste0(df.prefix, "FeaturePlot_", sampleID, ".pdf"),
           width = 7, height = 7)

  # only keep singlets
  pig_sample_singlets <- subset(pig_sample, subset = CellTypes_DF == "Singlet")

  # inspect
  d3 <- DimPlot(pig_sample_singlets, group.by = "CellTypes_DF",
                reduction = "umap",
                order = "Doublet",
                cols = c("#66C2A5", "black"))
  save_pdf(d3, paste0(df.prefix, "UMAP_singlets_", sampleID, ".pdf"),
           width = 5, height = 4)

  # number of cells in each cluster pre and post removing doublets
  n_cells_singlets <- table(factor(Idents(pig_sample_singlets),
                                   levels = cluster.levels))
  ncells_per_cluster <- rbind("Doublets and singlets" = n_cells,
                              "Singlets only" = n_cells_singlets)
  write.table(ncells_per_cluster,
              paste0(df.prefix, "table_ncells_per_cluster", sampleID, ".txt"),
              sep = "\t", row.names = FALSE, quote = FALSE)

  # plot the number of cells in each cluster pre and post doubletFinder
  ncells_melt <- melt(ncells_per_cluster)
  colnames(ncells_melt) <- c("doublet type", "cluster", "number of cells")
  # head room so that the figure doesn't cut off the text
  cellmax <- max(ncells_melt$`number of cells`) + 800
  b1 <- ggplot(ncells_melt, aes(x = factor(cluster), y = `number of cells`,
                                fill = `doublet type`)) +
    geom_bar(stat = "identity", colour = "black", width = 1,
             position = position_dodge(width = 0.8)) +
    geom_text(aes(label = `number of cells`),
              position = position_dodge(width = 0.9),
              vjust = -0.25, angle = 45, hjust = -.01) +
    theme_classic() +
    scale_fill_manual(values = c("gray", "#66C2A5")) +
    ggtitle("Number of cells per cluster") +
    xlab("cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(limits = c(0, cellmax))
  save_pdf(b1,
           paste0(df.prefix, "barplot_ncells_per_cluster", sampleID, ".pdf"),
           width = 7, height = 5)

  f2 <- FeaturePlot(pig_sample_singlets,
                    reduction = "umap",
                    features = c("nFeature_RNA", "nCount_RNA",
                                 "cell.complexity", "percent.mt"),
                    pt.size = 0.4,
                    order = TRUE,
                    label = TRUE)
  save_pdf(f2, paste0(df.prefix, "FeaturePlot_singlets", sampleID, ".pdf"),
           width = 7, height = 7)

  # put the pigs together again
  pigs.split[[i]] <- pig_sample_singlets
}
## [1] 1
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1296
## Number of edges: 36836
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9665
## Number of communities: 10
## Elapsed time: 0 seconds
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## NULL
## [1] "Creating 432 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] 2
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1413
## Number of edges: 44397
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9575
## Number of communities: 9
## Elapsed time: 0 seconds
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## NULL
## [1] "Creating 471 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] 3
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 501
## Number of edges: 12790
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9479
## Number of communities: 6
## Elapsed time: 0 seconds
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## NULL
## [1] "Creating 167 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
## [1] 4
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 7887
## Number of edges: 271196
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9593
## Number of communities: 18
## Elapsed time: 0 seconds
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## NULL
## [1] "Creating 2629 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
# Recombine the per-sample objects held in pigs.split into a single object.
# The first element anchors the merge and every remaining element is passed as
# the list of objects to merge in, so this works for any number of samples
# instead of exactly four.
pigs.singlets <- merge(x = pigs.split[[1]],
                       y = unname(pigs.split[-1]),
                       project = paste0("LPS Pigs ", tissue, " Single Nucleus"))
# print how many cells were dropped between pigs.filtered and pigs.singlets
print(paste0(ncol(pigs.filtered) - ncol(pigs.singlets), " cells removed"))
## [1] "2163 cells removed"
# Hypothetical check only: how many cells would be lost, relative to
# pigs.filtered, if upper bounds on nCount_RNA and nFeature_RNA were applied
pigs.upper <- subset(pigs.singlets,
                     subset = nFeature_RNA < 5000 & nCount_RNA < 10000)
print(paste0(ncol(pigs.filtered) - ncol(pigs.upper),
             " cells would have been removed if upper bound applied"))
## [1] "2869 cells would have been removed if upper bound applied"
# From here on pigs.filtered refers to the merged pigs.singlets object
pigs.filtered <- pigs.singlets
# Re-declare the factor levels so samples and treatments keep a fixed order
pigs.filtered$sample <- factor(
  pigs.filtered$sample,
  levels = c("4.R.Saline", "10.Saline","8.R.LPS", "12.LPS")
)
pigs.filtered$treatment <- factor(
  pigs.filtered$treatment,
  levels = c("Saline","LPS")
)
# Drop intermediate objects that are no longer needed
rm(list = c(
  "pigs.singlets", "pigs.upper", "pigs.split", "pig_sample_singlets",
  "pig_sample",
  "n_cells", "n_cells_singlets", "ncell_matrix", "ncell_max",
  "ncells_per_cluster", "ncells_melt",
  "sweep.res.list", "sweep.stats", "bcmvn", "bcmvn_max", "difference",
  "d1", "d2", "d3", "f1", "f2",
  "counts", "counts.filtered", "nonzero"
))
# Drop every feature listed in mt.genes from the raw count matrix
counts <- GetAssayData(object = pigs.filtered, slot = "counts")
# TRUE for features to keep, i.e. those with no match in mt.genes
keep <- is.na(match(rownames(counts), mt.genes))
counts.filtered <- counts[keep, ]
# Rebuild pigs.filtered from the reduced matrix, carrying the metadata over
pigs.filtered <- CreateSeuratObject(
  counts.filtered,
  meta.data = pigs.filtered@meta.data
)
# Report how many features were dropped
print(paste0(nrow(counts) - nrow(counts.filtered), " features removed"))
## [1] "13 features removed"
# User-adjustable settings: gene of interest and the count cutoff to display
goi <- "KK-MALAT1"
threshold <- 5
# Per-cell values for the gene of interest, as a one-column data frame
counts.df <- setNames(FetchData(pigs.filtered, vars = goi), "counts")
# log2 versions of the values and of the cutoff; the 0.01 offset keeps
# log2() finite when a value is zero
log2.counts.df <- log2(counts.df + 0.01)
log2.threshold <- log2(threshold + 0.01)
# Shared title for both histograms
title <- paste0("LPS Brain Nuclei: ", goi, "\nnCount_RNA > ", threshold)

# Build a 100-bin histogram of the `counts` column of `df`.
#   df         data frame with a numeric column named `counts` (one row per
#              nucleus)
#   cutoff     x position of the blue vertical line; the region left of it is
#              shaded "chocolate4" and the region right of it "deepskyblue"
#   x_label    x-axis label
#   plot_title plot title
# Returns the ggplot object.
plot_goi_histogram <- function(df, cutoff, x_label, plot_title) {
  ggplot(df, aes(x = counts)) +
    geom_histogram(bins = 100, fill = "gray", color = "black") +
    # each row of df is one nucleus, so the bar height is a number of nuclei
    labs(title = plot_title, x = x_label, y = "# of Nuclei") +
    theme_bw() +
    geom_vline(xintercept = cutoff, col = "blue") +
    annotate("rect",
             xmin = -Inf, xmax = cutoff,
             ymin = 0, ymax = Inf,
             alpha = 0.2, fill = "chocolate4") +
    annotate("rect",
             xmin = cutoff, xmax = Inf,
             ymin = 0, ymax = Inf,
             alpha = 0.2, fill = "deepskyblue")
}

# Histogram on the original scale
hist1 <- plot_goi_histogram(counts.df,
                            threshold,
                            paste0(goi, " nCount_RNA"),
                            title)
# Histogram log transformed
hist2 <- plot_goi_histogram(log2.counts.df,
                            log2.threshold,
                            paste0("Log2(",goi, " nCount_RNA)"),
                            title)
# Stack the two histograms vertically (one column, two rows)
plots1 <- list(hist1, hist2)
layout1 <- matrix(c(1, 2), ncol = 1)
grid1 <- grid.arrange(grobs = plots1, layout_matrix = layout1)
# Tabulate how many cells fall above (TRUE) or at/below (FALSE) the threshold;
# no cells are filtered here
table(counts.df$counts > threshold)
##
## FALSE TRUE
## 456 8478
# Bar chart of the number of cells per sample after filtering
data <- as.data.frame(table(sample = pigs.filtered$sample),
                      responseName = "frequency")
ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
  geom_col() +
  # print the count just above each bar
  geom_text(aes(label = frequency),
            position = position_dodge(width = 0.9),
            vjust = -0.25) +
  scale_fill_manual(values = sample_colors) +
  # scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
  ggtitle("Filtered: cells per sample") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))
# Place ncells1 and the filtered plot side by side (one row, two columns)
plots2 <- list(ncells1, ncells2)
layout2 <- matrix(c(1, 2), nrow = 1)
grid2 <- grid.arrange(grobs = plots2, layout_matrix = layout2)
# Density plots of per-cell QC metrics after filtering, one curve per sample.
# No par(mfrow = ...) call is needed: it only affects base graphics, and the
# panel layout for these ggplot objects is set by layout3 below.

# Visualize the number of counts per cell
den4 <- ggplot(pigs.filtered@meta.data,
               aes(color = sample,
                   x = nCount_RNA,
                   fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  # vertical line marks the minimum-count cutoff
  geom_vline(xintercept = nCount.min)
# Visualize percent.mt
# The log10 x-axis cannot show cells with percent.mt == 0; ggplot drops those
# rows with a "non-finite values" warning
den5 <- ggplot(pigs.filtered@meta.data,
               aes(color = sample,
                   x = percent.mt,
                   fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  # vertical line marks the mitochondrial cutoff
  geom_vline(xintercept = mt.cutoff)
# Visualize cell complexity
# Quality cells are usually above 0.80
# NOTE(review): the axis label describes cell.complexity as
# log10(nFeature/nCount); confirm against where cell.complexity is computed
den6 <- ggplot(pigs.filtered@meta.data,
               aes(color = sample,
                   x = cell.complexity,
                   fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  # vertical line marks the complexity cutoff
  geom_vline(xintercept = complexity.cutoff)
# Arrange graphs in grid: den1-den3 in the left column, den4-den6 in the right
plots3 <- list(den1,den2,den3,den4,den5,den6)
layout3 <- rbind(c(1,4),c(2,5),c(3,6))
grid3 <- grid.arrange(grobs = plots3, layout_matrix = layout3)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 17 rows containing non-finite values (stat_density).
## Removed 17 rows containing non-finite values (stat_density).
## Removed 17 rows containing non-finite values (stat_density).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5254 rows containing non-finite values (stat_density).
# Violin plots of the filtered data, grouped by sample, without points

# feature count, UMI count and complexity
v3 <- VlnPlot(
  pigs.filtered,
  features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
  group.by = "sample",
  cols = sample_colors,
  pt.size = 0,
  ncol = 3
)
v3
# percentage metrics (mitochondrial, ribosomal, hemoglobin)
v4 <- VlnPlot(
  pigs.filtered,
  features = c("percent.mt","percent.ribo","percent.hb"),
  group.by = "sample",
  cols = sample_colors,
  pt.size = 0,
  ncol = 3
)
v4
# nCount_RNA vs nFeature_RNA on log10 axes, one panel per sample, with points
# shaded by percent.mt and the minimum cutoffs drawn as reference lines
s3 <- ggplot(pigs.filtered@meta.data,
             aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
  geom_point() +
  stat_smooth(method = lm) +
  geom_vline(xintercept = nCount.min) +
  geom_hline(yintercept = nFeature.min) +
  scale_x_log10() +
  scale_y_log10() +
  scale_colour_gradient(low = "gray90", high = "black", limits = c(0, 100)) +
  facet_wrap(~sample) +
  theme_classic()
#geom_rect(aes(xmin=300, xmax=300, ymin=1000,
# ymax=3000), color="transparent", fill="orange", alpha=0.3)
s3
## `geom_smooth()` using formula 'y ~ x'
# Scatter of nCount_RNA against percent.mt, colored by sample; points are
# shuffled so that no single sample is always drawn on top
s4 <- FeatureScatter(
  pigs.filtered,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt",
  group.by = "sample",
  shuffle = TRUE,
  cols = sample_colors
)
s4
# Boxplot of log10(genes detected per cell), one box per sample
b1 <- ggplot(pigs.filtered@meta.data,
             aes(x = sample, y = log10(nFeature_RNA), fill = sample)) +
  geom_boxplot() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  ggtitle("Unique Genes / Cell / Sample") +
  xlab("Sample") +
  theme_classic() +
  # tilt the sample names and center the bold title
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"))
b1
# Total raw counts per gene across all cells, most abundant first
df <- data.frame(
  rsum = rowSums(x = pigs.filtered, slot = "counts"),
  gene_name = rownames(pigs.filtered),
  row.names = rownames(pigs.filtered),
  stringsAsFactors = FALSE
)
df <- df[order(df$rsum, decreasing = TRUE), ]
# show the ten genes with the highest totals
head(df, 10)
## rsum gene_name
## KK-MALAT1 499029 KK-MALAT1
## CADM2 168914 CADM2
## JAZF1 166875 JAZF1
## NRXN3 162131 NRXN3
## PCDH9 155648 PCDH9
## OPCML 132072 OPCML
## DLG2 131668 DLG2
## RBFOX1 129691 RBFOX1
## KCNIP4 127731 KCNIP4
## NRXN1 123550 NRXN1
For something to be informative, it needs to exhibit variation, but not all variation is informative. The goal of our clustering analysis is to keep the major sources of variation in our dataset that should define our cell types, while restricting the variation due to uninteresting sources of variation (sequencing depth, cell cycle differences, mitochondrial expression, batch effects, etc.). Then, to determine the cell types present, we will perform a clustering analysis using the most variable genes to define the major sources of variation in the dataset.
The most common biological data correction is to remove the effects of the cell cycle on the transcriptome. This data correction can be performed by a simple linear regression against a cell cycle score.
Check cell cycle phase BEFORE running SCTransform. Counts need to be comparable between cells, but each cell has a different total nCount_RNA, so we first normalize for sequencing depth.
# Five-number summary (plus mean) of the total counts per cell
summary(pigs.filtered$nCount_RNA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 501 1484 2718 3994 5206 31826
Use the NormalizeData() function with normalization.method = "LogNormalize" to account for sequencing depth. The count for each gene is divided by the total count (nCount_RNA) for that cell. This is done for all cells. The result is then multiplied by the scale.factor so we don’t have to deal with a tiny number, and natural-log transformed using log1p. log1p is the natural logarithm (base e) of 1 + x. Adding 1 prevents taking the log of 0.
# EXAMPLE: reproduce LogNormalize by hand for one gene in one cell
# KK-MALAT1 expression in cell 1, extracted as a plain numeric value
MALAT1_cell1_count <- FetchData(pigs.filtered,
                                vars = "KK-MALAT1",
                                cells = 1)[["KK-MALAT1"]]
# total counts in cell 1
cell1_count <- pigs.filtered$nCount_RNA[1]
# fraction of the cell's counts, scaled by 10000, then log1p transformed
log1p((MALAT1_cell1_count/cell1_count)*10000)
## 4.R.Saline_AAAGGGCGTACTCCGG-1
## 3.440406
# Log-normalize into a new object, pigs.phase; both arguments are spelled out
# even though they match the defaults
pigs.phase <- NormalizeData(
  pigs.filtered,
  normalization.method = "LogNormalize", # default
  scale.factor = 10000 # default
)
# check: should equal the value computed by hand for cell 1
FetchData(pigs.phase, vars = "KK-MALAT1", cells = 1)
## KK-MALAT1
## 4.R.Saline_AAAGGGCGTACTCCGG-1 3.440406
Give each cell a score based on its expression of G2/M and S phase markers; cells expressing neither set are assigned to G1. A list of markers is provided for humans. Since the pig genome is poorly annotated, we will use the human list. We will use the CellCycleScoring() function in Seurat.
Below is a resource for acquiring cell markers for other organisms https://hbctraining.github.io/scRNA-seq_online/lessons/cell_cycle_scoring.html
Approximate phase durations: G1 ~10 hrs; S ~5-6 hrs; G2 ~3-4 hrs; M ~2 hrs.
G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
If the score is negative for both S.Score and G2M.Score, the phase is G1. Otherwise, the greater of S.Score and G2M.Score determines the phase.
# Marker gene symbols used for cell cycle scoring.
# Some symbols carry a "KK-" prefix. The CellCycleScoring() warnings in the
# rendered output list several of these prefixed symbols, and a few unprefixed
# ones, as not present in the object.
# NOTE(review): confirm each "KK-" prefix against the feature names in the
# reference annotation.

# G2/M phase markers
g2m_genes <- c("NCAPD2","ANLN","KK-TACC3","HMMR","GTSE1","NDC80","KK-AURKA",
"TPX2","BIRC5","G2E3","CBX5","RANGAP1","CTCF","CDCA3","TTK",
"SMC4","ECT2","CENPA","CDC20","NEK2","CENPF","TMPO","HJURP",
"CKS2","DLGAP5","PIMREG","TOP2A","PSRC1","CDCA8","CKAP2",
"KK-NUSAP1","KIF23","KIF11","KIF20B","CENPE","GAS2L3","KIF2C",
"NUF2","KK-ANP32E","LBR","MKI67","CCNB2","CDC25C","HMGB2",
"CKAP2L","BUB1","CDK1","CKS1B","UBE2C","CKAP5","AURKB","CDCA2",
"TUBB4B","JPT1")
# S phase markers
s_genes <- c("KK-UBR7","RFC2","RAD51","MCM2","TIPIN","MCM6","UNG","POLD3",
"WDR76","CLSPN","CDC45","CDC6","MSH2","MCM5","POLA1","MCM4",
"RAD51AP1","GMNN","RPA2","CASP8AP2","HELLS","E2F8","GINS2","PCNA",
"NASP","BRIP1","DSCC1","DTL","CDCA7","CENPU","ATAD2","CHAF1B",
"USP1","KK-SLBP","RRM1","FEN1","KK-RRM2","EXO1","CCNE2","TYMS",
"BLM","KK-PRIM1","UHRF1")
# Score every cell with the S and G2/M marker lists; set.ident = TRUE also
# makes the assigned phase the active identity of the object
pigs.phase <- CellCycleScoring(
  pigs.phase,
  s.features = s_genes,
  g2m.features = g2m_genes,
  set.ident = TRUE
)
## Warning: The following features are not present in the object: KK-UBR7, E2F8,
## KK-SLBP, FEN1, KK-RRM2, KK-PRIM1, not searching for symbol synonyms
## Warning: The following features are not present in the object: KK-TACC3, KK-
## AURKA, CDCA3, CENPA, CDC20, KK-ANP32E, MKI67, CKS1B, UBE2C, not searching for
## symbol synonyms
# Bar chart: number of cells assigned to each cell cycle phase
cellcyclecount_barplot <- ggplot(as_tibble(pigs.phase[[]]),
                                 aes(Phase, fill = Phase)) +
  geom_bar()
cellcyclecount_barplot
# pie point: S.Score against G2M.Score, colored by the assigned phase
cellcyclecount_piepoint <- ggplot(as_tibble(pigs.phase[[]]),
                                  aes(x = S.Score, y = G2M.Score,
                                      color = Phase)) +
  geom_point()
cellcyclecount_piepoint
# Select the 2000 most variable genes with the "vst" method
pigs.phase <- FindVariableFeatures(
  pigs.phase,
  nfeatures = 2000, # default 2000
  selection.method = "vst", # default vst
  verbose = FALSE
)
# keep and display the first 40 variable genes for labelling
top40 <- head(x = VariableFeatures(pigs.phase), n = 40)
top40
## [1] "SLC16A9" "IL1R1" "CEMIP"
## [4] "ENSSSCG00000049542" "ENSSSCG00000004830" "BCAS1"
## [7] "MECOM" "VWF" "ENSSSCG00000009672"
## [10] "ENSSSCG00000047530" "ENSSSCG00000044602" "RELN"
## [13] "SLC47A1" "SLC26A2" "COLEC12"
## [16] "ALDH1A2" "LEF1" "ALPL"
## [19] "ADAM12" "TTR" "BMP6"
## [22] "SLC2A7" "C16orf89" "COL8A1"
## [25] "CALCR" "ENSSSCG00000016527" "ENSSSCG00000002749"
## [28] "TMSB10" "GRIK1" "NXPH1"
## [31] "ADARB2" "ADGRL4" "GPR17"
## [34] "PRDM6" "TRPM3" "ADGRF5"
## [37] "MBP" "BNC2" "LHFPL3"
## [40] "HS3ST4"
# Variable-feature plot, with the top40 genes labelled in italics
VarFeatPlot <- VariableFeaturePlot(pigs.phase, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(
  plot = VarFeatPlot,
  points = top40,
  repel = TRUE,
  xnudge = 0,
  ynudge = 0,
  fontface = "italic",
  max.overlaps = 12
)
VarFeatPlotLabel
# The variability information can be accessed using the HVFInfo method.
# The names of the variable features can be accessed with VariableFeatures().
# One row per gene, with a flag marking the selected variable features.
variance.data <- as_tibble(HVFInfo(pigs.phase), rownames = "Gene") %>%
  mutate(hypervariable = Gene %in% VariableFeatures(pigs.phase))
# We can plot out a graph of the variance vs mean and highlight the selected genes
# this way, we can see whether we think we’re likely to capture what we need.
subset_data <- subset(variance.data, hypervariable == TRUE)
varGeneslog <- variance.data %>%
  ggplot(aes(log(mean), log(variance), color = hypervariable)) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  # Label only the variable genes. x, y and color are inherited from the
  # ggplot() mapping; fontface is a constant, so it is set outside aes()
  geom_text_repel(data = subset_data,
                  aes(label = Gene),
                  fontface = "italic",
                  max.overlaps = 20,
                  segment.alpha = 1,
                  size = 4) +
  theme(legend.position = "bottom")
varGeneslog
See if the cell cycle is a major source of variation using PCA. Choose the most variable gene features (which we have already done), then scale the data. We scale the data because highly expressed genes exhibit the highest amount of variation and we don’t want our ‘highly variable genes’ only to reflect high expression.
vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then, feature values are standardized using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
The ScaleData() function in Seurat will shift the expression of each gene so that its mean expression across cells is 0. It will also scale the expression of each gene so that its variance across cells is 1.
# Center and scale the normalized data, using ScaleData()'s default features
pigs.phase <- ScaleData(object = pigs.phase)
## Centering and scaling data matrix
# Print the assay summary of the object
slot(pigs.phase, "assays")
## $RNA
## Assay data with 21805 features for 8934 cells
## Top 10 variable features:
## SLC16A9, IL1R1, CEMIP, ENSSSCG00000049542, ENSSSCG00000004830, BCAS1,
## MECOM, VWF, ENSSSCG00000009672, ENSSSCG00000047530
# PCA restricted to the cell cycle markers.
# RunPCA() can only use features that have been scaled. ScaleData() was run
# with its default feature set, so most cell cycle markers were missing from
# the scaled data and were silently left out of the reduction. Scale the
# markers that exist in the object on a copy first, so the PCA really reflects
# the cell cycle genes; pigs.phase itself is left untouched.
cc_genes <- intersect(c(s_genes, g2m_genes), rownames(pigs.phase))
pigs.phase.pca <- ScaleData(pigs.phase, features = cc_genes)
pigs.phase.pca <- RunPCA(pigs.phase.pca, features = cc_genes)
## Warning in PrepDR(object = object, features = features, verbose = verbose): The
## following 88 features requested have not been scaled (running reduction without
## them): KK-UBR7, RFC2, RAD51, MCM2, TIPIN, MCM6, UNG, POLD3, WDR76, CLSPN, CDC45,
## CDC6, MSH2, MCM5, POLA1, MCM4, RAD51AP1, GMNN, RPA2, CASP8AP2, HELLS, E2F8,
## GINS2, PCNA, NASP, BRIP1, DSCC1, CDCA7, CENPU, ATAD2, CHAF1B, USP1, KK-SLBP,
## RRM1, FEN1, KK-RRM2, CCNE2, TYMS, BLM, KK-PRIM1, NCAPD2, KK-TACC3, HMMR, GTSE1,
## KK-AURKA, TPX2, BIRC5, G2E3, CBX5, RANGAP1, CTCF, CDCA3, TTK, SMC4, ECT2, CENPA,
## CDC20, NEK2, TMPO, HJURP, CKS2, DLGAP5, PIMREG, PSRC1, CDCA8, CKAP2, KK-NUSAP1,
## KIF23, KIF20B, GAS2L3, KIF2C, NUF2, KK-ANP32E, LBR, MKI67, CCNB2, CDC25C, HMGB2,
## CKAP2L, BUB1, CDK1, CKS1B, UBE2C, CKAP5, AURKB, CDCA2, TUBB4B, JPT1
## Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
## a percentage of total singular values, use a standard svd instead.
## Warning: Requested number is larger than the number of available items (9).
## Setting to 9.
## Warning: Requested number is larger than the number of available items (9).
## Setting to 9.
## Warning: Requested number is larger than the number of available items (9).
## Setting to 9.
## Warning: Requested number is larger than the number of available items (9).
## Setting to 9.
## Warning: Requested number is larger than the number of available items (9).
## Setting to 9.
## PC_ 1
## Positive: ANLN, UHRF1, CENPE, KIF11
## Negative: EXO1, CENPF, TOP2A, NDC80
## PC_ 2
## Positive: ANLN, DTL, TOP2A, CENPF
## Negative: UHRF1, KIF11, CENPE, EXO1
## PC_ 3
## Positive: EXO1, CENPE, KIF11, CENPF
## Negative: UHRF1, ANLN, DTL, TOP2A
## PC_ 4
## Positive: EXO1, UHRF1, ANLN, DTL
## Negative: CENPE, KIF11, TOP2A, NDC80
## PC_ 5
## Positive: CENPE, EXO1, UHRF1, ANLN
## Negative: KIF11, DTL, CENPF, NDC80
# Plot the cell-cycle-gene PCA, colored by the object's active identity
DimPlot(object = pigs.phase.pca)
If the plots for each phase look very similar to each other, do not regress out variation due to cell cycle. You can plot PC1 vs PC2 before and after regression to see how effective it was. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)
# PCA on pigs.phase with RunPCA()'s default feature set
pigs.phase <- RunPCA(object = pigs.phase)
# One panel per cell cycle phase, colored by phase, to compare distributions
cycle.pca <- DimPlot(
  pigs.phase,
  reduction = "pca",
  split.by = "Phase",
  group.by = "Phase"
)
cycle.pca
Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model. The model parameters are regularized by pooling information across genes with similar abundances, which stabilizes the variance estimates.
Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase and so we chose not to regress this out of our data. We observed some effect of mitochondrial expression and so we choose to regress this out from the data.
Since we have four samples in our dataset (from two conditions), we want to keep them as separate objects and transform each one individually, as that is what is required for integration. We will first split the cells in the pigs.phase object by sample.
# Split the Seurat object by sample so SCTransform can be run per sample
pigs.split <- SplitObject(object = pigs.phase, split.by = "sample")
Now we will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.
Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb) using the following code:
# Raise the limit on the size of globals exported to futures to 4000 MiB.
# The value is in bytes, so MiB is 1024^2 (the default is 500 * 1024^2).
options(future.globals.maxSize = 4000 * 1024^2)
# Run SCTransform on each sample separately, regressing out percent.mt.
# seq_along() keeps the loop from running when pigs.split is empty.
for (i in seq_along(pigs.split)) {
  print(paste0("Sample ", i))
  pigs.split[[i]] <- SCTransform(pigs.split[[i]],
                                 vars.to.regress = c("percent.mt"))
}
## [1] "Sample 1"
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|======= | 9%
|
|========= | 12%
|
|=========== | 16%
|
|============= | 19%
|
|=============== | 22%
|
|================== | 25%
|
|==================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 38%
|
|============================ | 41%
|
|=============================== | 44%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 56%
|
|========================================== | 59%
|
|============================================ | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 72%
|
|==================================================== | 75%
|
|======================================================= | 78%
|
|========================================================= | 81%
|
|=========================================================== | 84%
|
|============================================================= | 88%
|
|=============================================================== | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|======= | 9%
|
|========= | 12%
|
|=========== | 16%
|
|============= | 19%
|
|=============== | 22%
|
|================== | 25%
|
|==================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 38%
|
|============================ | 41%
|
|=============================== | 44%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 56%
|
|========================================== | 59%
|
|============================================ | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 72%
|
|==================================================== | 75%
|
|======================================================= | 78%
|
|========================================================= | 81%
|
|=========================================================== | 84%
|
|============================================================= | 88%
|
|=============================================================== | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## [1] "Sample 2"
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 21%
|
|================ | 24%
|
|=================== | 26%
|
|===================== | 29%
|
|======================= | 32%
|
|========================= | 35%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================= | 71%
|
|=================================================== | 74%
|
|====================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================== | 88%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 21%
|
|================ | 24%
|
|=================== | 26%
|
|===================== | 29%
|
|======================= | 32%
|
|========================= | 35%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================= | 71%
|
|=================================================== | 74%
|
|====================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================== | 88%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## [1] "Sample 3"
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 4%
|
|===== | 8%
|
|======== | 12%
|
|=========== | 15%
|
|============= | 19%
|
|================ | 23%
|
|=================== | 27%
|
|====================== | 31%
|
|======================== | 35%
|
|=========================== | 38%
|
|============================== | 42%
|
|================================ | 46%
|
|=================================== | 50%
|
|====================================== | 54%
|
|======================================== | 58%
|
|=========================================== | 62%
|
|============================================== | 65%
|
|================================================ | 69%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|========================================================= | 81%
|
|=========================================================== | 85%
|
|============================================================== | 88%
|
|================================================================= | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
##
|
| | 0%
|
|=== | 4%
|
|===== | 8%
|
|======== | 12%
|
|=========== | 15%
|
|============= | 19%
|
|================ | 23%
|
|=================== | 27%
|
|====================== | 31%
|
|======================== | 35%
|
|=========================== | 38%
|
|============================== | 42%
|
|================================ | 46%
|
|=================================== | 50%
|
|====================================== | 54%
|
|======================================== | 58%
|
|=========================================== | 62%
|
|============================================== | 65%
|
|================================================ | 69%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|========================================================= | 81%
|
|=========================================================== | 85%
|
|============================================================== | 88%
|
|================================================================= | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
## [1] "Sample 4"
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 2%
|
|=== | 5%
|
|===== | 7%
|
|======= | 10%
|
|======== | 12%
|
|========== | 14%
|
|============ | 17%
|
|============= | 19%
|
|=============== | 21%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================= | 33%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 45%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 55%
|
|======================================== | 57%
|
|========================================== | 60%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 2%
|
|=== | 5%
|
|===== | 7%
|
|======= | 10%
|
|======== | 12%
|
|========== | 14%
|
|============ | 17%
|
|============= | 19%
|
|=============== | 21%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================= | 33%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 45%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 55%
|
|======================================== | 57%
|
|========================================== | 60%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 79%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument.
Note, the last line of output specifies “Set default assay to SCT”. We can view the different assays that we have stored in our seurat object.
A thread about whether or not to regress out batch: https://github.com/satijalab/seurat/issues/3270. It is suggested not to regress out batch, and to use a data integration method instead.
# Check: print the list of per-sample objects to confirm that each one now
# has SCT as its active assay
pigs.split
## $`4.R.Saline`
## An object of class Seurat
## 37614 features across 1273 samples within 2 assays
## Active assay: SCT (15809 features, 3000 variable features)
## 1 other assay present: RNA
## 1 dimensional reduction calculated: pca
##
## $`10.Saline`
## An object of class Seurat
## 38573 features across 1232 samples within 2 assays
## Active assay: SCT (16768 features, 3000 variable features)
## 1 other assay present: RNA
## 1 dimensional reduction calculated: pca
##
## $`8.R.LPS`
## An object of class Seurat
## 34348 features across 489 samples within 2 assays
## Active assay: SCT (12543 features, 3000 variable features)
## 1 other assay present: RNA
## 1 dimensional reduction calculated: pca
##
## $`12.LPS`
## An object of class Seurat
## 42490 features across 5940 samples within 2 assays
## Active assay: SCT (20685 features, 3000 variable features)
## 1 other assay present: RNA
## 1 dimensional reduction calculated: pca
Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.
To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.
Now, using our SCTransform object as input, let’s perform the integration across conditions.
First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.
# Choose the features to use when integrating multiple datasets.
# nfeatures = 3000 matches the number of variable features reported for each
# sample after running SCTransform above.
var.features <- SelectIntegrationFeatures(object.list = pigs.split,
                                          nfeatures = 3000)

# Merge the per-sample objects back into a single object. Every element after
# the first is passed as `y`, so the number of samples is no longer hard-coded;
# unname() hands merge() a plain unnamed list, exactly as c(obj, obj, ...) did.
pigs.sct.merged <- merge(x = pigs.split[[1]],
                         y = unname(pigs.split[-1]),
                         project = paste0("LPS Pigs ", tissue, " Single Nucleus"))

# Use the shared integration features as the variable features of the merged
# object
VariableFeatures(pigs.sct.merged) <- var.features

# Run PCA on the SCT assay of the merged object
pigs.sct.merged <- RunPCA(object = pigs.sct.merged, assay = "SCT")

# Run Harmony on the PCA reduction, harmonizing over samples; the corrected
# embedding is read back below as the "harmony" reduction
pigs.integrated <- RunHarmony(object = pigs.sct.merged,
                              group.by.vars = "sample",
                              assay.use = "SCT",
                              reduction = "pca",
                              plot_convergence = TRUE)
# Make the sample label the active identity of the integrated object
Idents(pigs.integrated) <- pigs.integrated$sample

# Fix the display order of the sample and treatment factors
# (controls first, then LPS)
pigs.integrated$sample <- factor(
  pigs.integrated$sample,
  levels = c("4.R.Saline", "10.Saline", "8.R.LPS", "12.LPS")
)
pigs.integrated$treatment <- factor(
  pigs.integrated$treatment,
  levels = c("Saline", "LPS")
)

# Pull the Harmony embedding and look at its top-left 5 x 5 corner
harmony_embeddings <- Embeddings(pigs.integrated, reduction = "harmony")
harmony_embeddings[seq_len(5), seq_len(5)]
## harmony_1 harmony_2 harmony_3 harmony_4
## 4.R.Saline_AAAGGGCGTACTCCGG-1 -41.31665 24.93789 15.138772 5.7822519
## 4.R.Saline_GTCAGCGGTCTGTCAA-1 -32.65086 18.29518 4.824791 0.5360655
## 4.R.Saline_GATTGGTGTTAGGCCC-1 -30.69634 21.30152 6.355259 -1.2883544
## 4.R.Saline_GACGCTGTCTCGTCAC-1 -32.97359 19.22914 4.627168 1.0038339
## 4.R.Saline_TTTCACACATGATAGA-1 -48.14160 28.12002 16.902541 7.8662559
## harmony_5
## 4.R.Saline_AAAGGGCGTACTCCGG-1 7.015794
## 4.R.Saline_GTCAGCGGTCTGTCAA-1 -10.898717
## 4.R.Saline_GATTGGTGTTAGGCCC-1 -21.102502
## 4.R.Saline_GACGCTGTCTCGTCAC-1 -10.961952
## 4.R.Saline_TTTCACACATGATAGA-1 11.976632
# Check the Harmony embedding by treatment: scatter plot of the "harmony"
# reduction next to a violin plot of the harmony_1 coordinate
p1 <- DimPlot(pigs.integrated,
              group.by = "treatment",
              reduction = "harmony",
              cols = treatment_colors) +
  NoLegend()
p2 <- VlnPlot(pigs.integrated,
              group.by = "treatment",
              features = "harmony_1",
              cols = treatment_colors,
              pt.size = 0) +
  NoLegend()
plot_grid(p1, p2)
Top 20 variable features
# First 20 entries of the variable-feature list stored on the SCT assay.
# The VariableFeatures() accessor (the same function used earlier to assign
# the list) replaces direct access to the assay's var.features slot, so the
# code does not depend on the internal layout of the object.
top20 <- VariableFeatures(pigs.integrated, assay = "SCT")[1:20]
top20
## [1] "SLC1A2" "TNR" "SLC16A9"
## [4] "LHFPL3" "NOL11" "ENSSSCG00000049542"
## [7] "CALCR" "MBP" "RNF220"
## [10] "ENSSSCG00000044602" "XYLT1" "ST18"
## [13] "ARHGAP24" "CEMIP" "GALNTL6"
## [16] "C10orf90" "RBFOX1" "KCNIP4"
## [19] "ARHGAP15" "ANK1"
After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.
To generate these visualizations with the harmony output, use reduction = “harmony”
# Plot the Harmony embedding (note: reduction = "harmony", not the raw PCA)

# one panel per treatment, colored by treatment
pca1 <- DimPlot(pigs.integrated,
                group.by = "treatment",
                split.by = "treatment",
                reduction = "harmony",
                cols = treatment_colors)
pca1

# one panel per sample, colored by sample
pca2 <- DimPlot(pigs.integrated,
                group.by = "sample",
                split.by = "sample",
                reduction = "harmony",
                cols = sample_colors)
pca2

# all samples overlaid in one panel, drawn in shuffled order
pca3 <- DimPlot(pigs.integrated,
                group.by = "sample",
                reduction = "harmony",
                shuffle = TRUE)
pca3
To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
# Show, for each of the first 10 PCs, the 10 genes listed as positive and the
# 10 genes listed as negative drivers of that PC
print(pigs.integrated[["pca"]],
      nfeatures = 10,
      dims = 1:10)
## PC_ 1
## Positive: SLC1A2, PREX2, NOL11, NHSL1, SLC1A3, EYA2, IRAG1, POLR3B, LRIG1, ZBTB20
## Negative: KCNIP4, RBFOX1, DPP10, ROBO2, OPCML, NRXN3, TENM2, LRRTM4, GALNTL6, HS6ST3
## PC_ 2
## Positive: SLC1A2, PREX2, NOL11, NHSL1, LSAMP, CADM2, IRAG1, POLR3B, CTNND2, NTM
## Negative: ENSSSCG00000017146, MX2, ENSSSCG00000033089, GAB2, RBMS3, HERC6, EPSTI1, BNC2, ARHGAP24, LRMDA
## PC_ 3
## Positive: CEMIP, BNC2, RBMS3, ADAM12, SVIL, EYA1, COLEC12, GPC6, USP53, BMP6
## Negative: RNF220, C10orf90, ST18, MBP, OPALIN, PLP1, PLEKHH1, ENSSSCG00000049542, DOCK10, ZNF536
## PC_ 4
## Positive: CALCR, INPP5D, ARHGAP24, ARHGAP15, DOCK8, GAB2, PDE3B, LCP2, RUNX1, VAV1
## Negative: RBMS3, CEMIP, BNC2, ADAM12, EYA1, COLEC12, FBXL7, BMP6, SVIL, BICC1
## PC_ 5
## Positive: KCNIP4, NKAIN2, TAFA1, ENSSSCG00000011729, RNF220, C10orf90, MBP, SLIT3, PLEKHH1, OPALIN
## Negative: LHFPL3, NXPH1, SOX6, TNR, MMP16, NELL1, ITGA9, KCNIP1, AGAP1, THSD7B
## PC_ 6
## Positive: LHFPL3, DSCAM, TNR, XYLT1, AGAP1, MMP16, KCNB2, ITGA9, STK32A, KCNIP4
## Negative: ERBB4, NXPH1, GALNTL6, PTCHD4, KIAA1217, ZNF536, BTBD11, ZNF804B, KCNC2, GAD2
## PC_ 7
## Positive: SLC16A9, GPC6, FOXP1, NTN1, SLC26A2, SLC47A1, KIAA1755, BNC2, ZFHX3, ADAMTS9
## Negative: CEMIP, MX2, ENSSSCG00000033089, EPSTI1, RSAD2, ENSSSCG00000017146, COLEC12, HERC6, ARHGAP28, PARP14
## PC_ 8
## Positive: ENSSSCG00000033089, MX2, RSAD2, ENSSSCG00000017146, RBMS3, ROR2, PARP14, C3, PLEKHG1, ADAMTS9
## Negative: TRPM3, GPC5, CEMIP, ATP1A4, ENSSSCG00000044580, SLC6A20, KCNK2, ADAM12, PRKAG2, MAPK4
## PC_ 9
## Positive: CA10, TAFA1, ENC1, RFX3, GRIA4, RASGRF2, CCK, SLIT2, FSTL5, ENSSSCG00000014097
## Negative: HS3ST4, DPP10, GRIK3, IL1RAPL2, KIAA1217, SEMA3E, ZFPM2, CLSTN2, SLC35F4, RASGRP1
## PC_ 10
## Positive: NHSL1, EYA2, SLC1A2, EYA1, EFEMP1, PREX2, GPC5, RBFOX1, ENSSSCG00000051274, ROBO2
## Negative: HS3ST4, DPP10, ESRRG, GRM8, WWC1, GALNT9, GPM6A, SEMA3E, KIRREL3, SEZ6L
Quantitative approach to an elbow plot: - The point where an individual principal component contributes less than 5% of the standard deviation and the principal components cumulatively contribute more than 90% of the standard deviation. - The point where the percent change in variation between consecutive PCs is less than 0.1%.
First metric
# Share of the summed PCA standard deviations carried by each PC, in percent
stdv <- pigs.integrated[["pca"]]@stdev
sum.stdv <- sum(stdv)
percent.stdv <- (stdv / sum.stdv) * 100

# Running total of those percentages
cumulative <- cumsum(percent.stdv)

# First PC whose cumulative percentage exceeds 90 while the PC itself
# accounts for less than 5 percent
co1 <- which(percent.stdv < 5 & cumulative > 90)[1]
co1
## [1] 41
Second metric
# Determine the difference between the variation of each PC and the next PC.
# -diff(x) is x[i] - x[i + 1]. The earlier index expression `1:length(x) - 1`
# parses as `(1:length(x)) - 1` and only gave the right elements because R
# silently drops a 0 index; diff() states the intent directly.
# co2 is the PC immediately after the last drop larger than 0.1 (NA when no
# drop exceeds 0.1).
co2 <- sort(which(-diff(percent.stdv) > 0.1),
            decreasing = TRUE)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
## [1] 17
Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
# Take the smaller of the two cutoffs (co1, co2); min.pc is the number of
# dimensions passed to RunUMAP() and FindNeighbors() further down
min.pc <- min(co1, co2)
min.pc
## [1] 17
Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:
# Create a data frame with one row per PC: its percent of standard deviation,
# the cumulative percent, and its rank
plot_df <- data.frame(pct = percent.stdv,
                      cumu = cumulative,
                      rank = seq_along(percent.stdv))

# Elbow plot to visualize the cutoffs. The aesthetics now map the plot_df
# columns (cumu, pct) instead of the global vectors they were built from, so
# the plot is driven by the data frame handed to ggplot(). labs() keeps the
# axis titles the plot had before.
ggplot(plot_df, aes(x = cumu, y = pct, label = rank, color = rank > min.pc)) +
  geom_text() +
  # cumulative cutoff used for the first metric
  geom_vline(xintercept = 90, color = "grey") +
  # smallest per-PC percentage that is still above 5
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  labs(x = "cumulative", y = "percent.stdv") +
  theme_bw()
# Compute the UMAP from the first min.pc dimensions of the Harmony reduction
pigs.integrated <- RunUMAP(pigs.integrated,
                           reduction = "harmony",
                           dims = 1:min.pc,
                           n.components = 3) # set to 3 to use with VR

# UMAP with one panel per treatment, colored by treatment
DimPlot(pigs.integrated,
        split.by = "treatment",
        group.by = "treatment",
        cols = treatment_colors,
        shuffle = TRUE)
Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].
We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.
The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.
# Build the nearest-neighbor graph on the first min.pc Harmony dimensions
pigs.unannotated <- FindNeighbors(pigs.integrated,
                                  reduction = "harmony",
                                  dims = 1:min.pc,
                                  assay = "SCT") # set as default after SCTransform

# Cluster the graph at every resolution from 0.1 to 0.8 in steps of 0.1
pigs.unannotated <- FindClusters(pigs.unannotated,
                                 resolution = seq(0.1, 0.8, by = 0.1),
                                 algorithm = 1) # 1 = Louvain
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9779
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9642
## Number of communities: 17
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9542
## Number of communities: 18
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9453
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9374
## Number of communities: 23
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9300
## Number of communities: 24
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9230
## Number of communities: 24
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 8934
## Number of edges: 318511
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9165
## Number of communities: 26
## Elapsed time: 0 seconds
# Manual palette for the cluster UMAP below.
# NOTE(review): the vector has 14 entries and "lightgray" appears twice
# (6th and 13th entry), so two clusters can be drawn in the same color --
# confirm whether a distinct color was intended for one of them.
colors <- c("dodgerblue","yellow","firebrick1","gold","gray40","lightgray",
"cyan","chocolate4","pink","orange","darkgreen","purple","lightgray","tan")
# UMAP colored by the resolution 0.1 clustering, using the manual palette
DimPlot(pigs.unannotated,
group.by = "SCT_snn_res.0.1",
label = TRUE,
cols = colors)
# Labeled cluster UMAPs, one per clustering resolution, for side-by-side
# comparison of granularity

# resolution 0.1
umap0.1 <- DimPlot(pigs.unannotated,
                   label = TRUE,
                   group.by = "SCT_snn_res.0.1")
umap0.1

# resolution 0.2
umap0.2 <- DimPlot(pigs.unannotated,
                   label = TRUE,
                   group.by = "SCT_snn_res.0.2")
umap0.2

# resolution 0.3
umap0.3 <- DimPlot(pigs.unannotated,
                   label = TRUE,
                   group.by = "SCT_snn_res.0.3")
umap0.3

# resolution 0.4
umap0.4 <- DimPlot(pigs.unannotated,
                   label = TRUE,
                   group.by = "SCT_snn_res.0.4")
umap0.4
# Resolution 0.1 clusters, unlabeled and without legend, split into panels by
# a metadata column

# panels by treatment
u1 <- DimPlot(pigs.unannotated,
              group.by = "SCT_snn_res.0.1",
              split.by = "treatment",
              label = FALSE) +
  NoLegend()
u1

# panels by sample
u2 <- DimPlot(pigs.unannotated,
              group.by = "SCT_snn_res.0.1",
              split.by = "sample",
              label = FALSE) +
  NoLegend()
u2

# panels by cell-cycle phase
u3 <- DimPlot(pigs.unannotated,
              group.by = "SCT_snn_res.0.1",
              split.by = "Phase",
              label = FALSE) +
  NoLegend()
u3
# Per-cell metadata values drawn on the embedding; all six plots share the
# same settings (point size 0.4, ordered points, lower cutoff at the 10th
# percentile, labels on)

# nCount
f1 <- FeaturePlot(pigs.unannotated,
                  features = "nCount_RNA",
                  label = TRUE,
                  order = TRUE,
                  min.cutoff = "q10",
                  pt.size = 0.4)
f1

# nFeature
f2 <- FeaturePlot(pigs.unannotated,
                  features = "nFeature_RNA",
                  label = TRUE,
                  order = TRUE,
                  min.cutoff = "q10",
                  pt.size = 0.4)
f2

# percent.mt
f3 <- FeaturePlot(pigs.unannotated,
                  features = "percent.mt",
                  label = TRUE,
                  order = TRUE,
                  min.cutoff = "q10",
                  pt.size = 0.4)
f3

# cell.complexity
f4 <- FeaturePlot(pigs.unannotated,
                  features = "cell.complexity",
                  label = TRUE,
                  order = TRUE,
                  min.cutoff = "q10",
                  pt.size = 0.4)
f4

# S.Score
f5 <- FeaturePlot(pigs.unannotated,
                  features = "S.Score",
                  label = TRUE,
                  order = TRUE,
                  min.cutoff = "q10",
                  pt.size = 0.4)
f5

# G2M.Score
f6 <- FeaturePlot(pigs.unannotated,
                  features = "G2M.Score",
                  label = TRUE,
                  order = TRUE,
                  min.cutoff = "q10",
                  pt.size = 0.4)
f6
# Expression of selected genes (ATF3, CCL2, CXCL10, ICAM1) on the embedding,
# with the lower cutoff at the 1st percentile and no labels.
# NOTE(review): this reassigns f5, replacing the S.Score plot stored under the
# same name earlier.
f5 <- FeaturePlot(pigs.unannotated,
features = c("ATF3","CCL2","CXCL10","ICAM1"),
pt.size = 0.4,
order = TRUE,
min.cutoff = 'q1',
label = FALSE)
f5
# Expression of the feature named "KK-MALAT1".
# NOTE(review): this reassigns f6, replacing the G2M.Score plot stored under
# the same name earlier.
f6 <- FeaturePlot(pigs.unannotated,
features = "KK-MALAT1",
pt.size = 0.4,
order = TRUE,
min.cutoff = 'q1',
label = TRUE)
f6
# Store the resolution 0.1 clustering in the seurat_clusters metadata column
pigs.unannotated@meta.data[["seurat_clusters"]] <-
  pigs.unannotated@meta.data[["SCT_snn_res.0.1"]]

# Stacked bars: within each cluster, the percentage of cells coming from each
# treatment
b1 <- pigs.unannotated@meta.data %>%
  dplyr::count(seurat_clusters, treatment) %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = seurat_clusters, y = percent, fill = treatment)) +
  geom_col() +
  scale_fill_manual(values = treatment_colors) +
  ggtitle("Percentage of treatment per cluster")
b1

# Stacked bars: within each cluster, the percentage of cells coming from each
# sample (default ggplot fill colors)
b2 <- pigs.unannotated@meta.data %>%
  dplyr::count(seurat_clusters, sample) %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = seurat_clusters, y = percent, fill = sample)) +
  geom_col() +
  #scale_fill_manual(values = sample_colors) +
  ggtitle("Percentage of sample per cluster")
b2
# Number of cells per cluster, broken down by treatment (one row per
# treatment, one column per cluster).
# The counts are taken from the seurat_clusters metadata column (set above to
# the resolution 0.1 clustering) rather than from "ident": FindClusters() was
# run over several resolutions, which leaves the active identity at the last
# one, so "ident" would not match the clusters shown in the plots above.
# pivot_wider() replaces the superseded spread(); arrange() keeps the rows in
# factor-level order.
treatment_ncells <- FetchData(pigs.unannotated,
                              vars = c("seurat_clusters", "treatment")) %>%
  dplyr::count(seurat_clusters, treatment) %>%
  tidyr::pivot_wider(names_from = seurat_clusters, values_from = n) %>%
  dplyr::arrange(treatment)
write.table(treatment_ncells,
            paste0("../../results/nCells/",
                   treatment, "_", tolower(tissue),
                   "_cells_per_cluster_treatment.txt"),
            quote = FALSE, sep = "\t")

# Same table broken down by sample (one row per sample)
sample_ncells <- FetchData(pigs.unannotated,
                           vars = c("seurat_clusters", "sample")) %>%
  dplyr::count(seurat_clusters, sample) %>%
  tidyr::pivot_wider(names_from = seurat_clusters, values_from = n) %>%
  dplyr::arrange(sample)
write.table(sample_ncells,
            paste0("../../results/nCells/",
                   treatment, "_", tolower(tissue),
                   "_cells_per_cluster_sample.txt"),
            quote = FALSE, sep = "\t")