Single cell replicate 1 group only

Note: make results directory strucutre first mkdir PCA UMAP boxPlots cellCycle density doubletFinder featurePlots geneLists nCells scatterPlots topTranscripts vlnPlots cd UMAP mkdir annotated unannotated

Samples in replicate group 1 WT_0_hr WT_3_hr PS19_0_hr PS19_3_hr

Set working directory

knitr::opts_knit$set(root.dir = ".")

Load libraries

# load libraries
library(cowplot)  # plot_grid()
library(DoubletFinder)  # paramSweep()
library(dplyr)  # ungroup()
library(ggrepel) # geom_text_repel()
library(ggplot2)  # ggplot()
library(grid)  # grid.arrange()
library(gridExtra)  # grid.arrange()
library(harmony) # RunHarmony()
library(reshape2) # melt()
library(Seurat)  # Read10X_h5()
library(Seurat.utils) # RenameGenesSeurat()
library(stringr) # str_match()

Set variables and thresholds

pathToRef <- "/research/labs/neurology/fryer/projects/references/mouse/refdata-gex-mm10-2020-A"
pathToTestType <- "scRNA/cell_rep_1/"
tissue <- "Brain"
seqType <- "Cell"
treatment <- "PMI"
tool <- "cellbender"

# replicate groups
replicate <- "1"
replicate1 <- "replicate1"
#replicate2 <- "replicate2"

# genotype
WT <- "WT"
PS19 <- "PS19"

# timepoints
treatment_0hr <- "0_hr"
treatment_3hr <- "3_hr"

# colors
treatment_color <- "#C1FFC1"
control_color <- "gray"
treatment_color_PS19 <- "#9933FF"
treatment_color_WT <- "blue"
WT_treatment_color_0hr_rep1 <- "gray" # light blue grey
WT_treatment_color_3hr_rep1 <- "#6699FF" # light blue grey
#WT_treatment_color_3hr_rep2 <- "blue" # light blue grey
PS19_treatment_color_0hr_rep1 <- "#FFCCFF" # pink/purple
PS19_treatment_color_3hr_rep1 <- "#CC99FF" # pink/purple
#PS19_treatment_color_3hr_rep2 <- "purple" # pink/purple
sample_colors <- c(WT_treatment_color_0hr_rep1, WT_treatment_color_3hr_rep1, PS19_treatment_color_0hr_rep1, PS19_treatment_color_3hr_rep1)
treatment_colors <- c(control_color, treatment_color)
genotype_colors <- c("thistle", "skyblue")
replicate_colors <- c("tan3", "turquoise")

# contrast
myContrasts <- c("0 hr - 3 hr")

nCount.min <- 250
nFeature.min <- 250
complexity.cutoff <- 0.85
mt.cutoff <- 25
hb.cutoff <- 5

Save functions

These functions with help simultaneously save plots as a png and pdf.

saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}
saveToPNG <- function(...) {
    d = dev.copy(png,...)
    dev.off(d)
}

Setup seurat object

Using CellBender filtered output.

prefix <- "../../cellbender/scRNA/"
suffix <- "/.h5_filtered.h5"

if (tissue == "Brain" && file.exists(paste0("../../rObjects/", pathToTestType, "PMI_merged_h5.rds"))) {
  PMI <- readRDS(paste0("../../rObjects/", pathToTestType, "PMI_merged_h5.rds"))
} else if (tissue == "Brain") {
  # individual sample objects
  rep1_WT_0_hr <- CreateSeuratObject(Read10X_h5(paste0(prefix,"rep1_WT_0_hr",suffix)))
  rep1_WT_3_hr <- CreateSeuratObject(Read10X_h5(paste0(prefix,"rep1_WT_3_hr",suffix)))
  rep1_PS19_0_hr <- CreateSeuratObject(Read10X_h5(paste0(prefix,"rep1_PS19_0_hr",suffix)))
  rep1_PS19_3_hr <- CreateSeuratObject(Read10X_h5(paste0(prefix,"rep1_PS19_3_hr",suffix)))
  
  # merge objects
  PMI <- merge(x = rep1_WT_0_hr, 
                y = c(rep1_WT_3_hr, rep1_PS19_0_hr, rep1_PS19_3_hr), 
                add.cell.ids = c("rep1_WT_0_hr", "rep1_WT_3_hr", "rep1_PS19_0_hr", "rep1_PS19_3_hr"), 
                project = paste0("PMI replicate group ", replicate, " - Single Cell"))
  
  # cleanup and save
  remove(rep1_WT_0_hr, rep1_WT_3_hr, rep1_PS19_0_hr, rep1_PS19_3_hr)
  saveRDS(PMI, paste0("../../rObjects/", pathToTestType, "PMI_merged_h5.rds"))
} 

# preview
PMI

## An object of class Seurat 
## 32285 features across 46711 samples within 1 assay 
## Active assay: RNA (32285 features, 0 variable features)

nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique genes (features)

# create sample column
barcodes <- colnames(PMI)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[,2]
table(sample)

## sample
## rep1_PS19_0_hr rep1_PS19_3_hr   rep1_WT_0_hr   rep1_WT_3_hr 
##           7458          14280          15890           9083

PMI$sample <- factor(sample, levels = c("rep1_WT_0_hr", "rep1_WT_3_hr", 
                                        "rep1_PS19_0_hr", "rep1_PS19_3_hr"))
table(PMI$sample)  # check

## 
##   rep1_WT_0_hr   rep1_WT_3_hr rep1_PS19_0_hr rep1_PS19_3_hr 
##          15890           9083           7458          14280

Idents(PMI) <- PMI$sample

Add treatment column to metadata

# create treatment column
treat <- gsub("rep1_WT_0_hr", treatment_0hr, PMI$sample)
treat <- gsub("rep1_WT_3_hr", treatment_3hr, treat)
treat <- gsub("rep1_PS19_0_hr", treatment_0hr, treat)
treat <- gsub("rep1_PS19_3_hr", treatment_3hr, treat)

PMI$treatment <- factor(treat, levels = c("0_hr","3_hr"))
table(PMI$treatment)

## 
##  0_hr  3_hr 
## 23348 23363

Add genotype column to metadata

# create treatment column
geno <- gsub("rep1_WT_0_hr", WT, PMI$sample)
geno <- gsub("rep1_WT_3_hr", WT, geno)
geno <- gsub("rep1_PS19_0_hr", PS19, geno)
geno <- gsub("rep1_PS19_3_hr", PS19, geno)

PMI$genotype <- factor(geno, levels = c("WT","PS19"))
table(PMI$genotype)

## 
##    WT  PS19 
## 24973 21738

Add replicate column to metadata

# create treatment column
repl <- gsub("rep1_WT_0_hr", replicate1, PMI$sample)
repl <- gsub("rep1_WT_3_hr", replicate1, repl)
repl <- gsub("rep1_PS19_0_hr", replicate1, repl)
repl <- gsub("rep1_PS19_3_hr", replicate1, repl)

PMI$replicate <- factor(repl, levels = c("replicate1"))
table(PMI$replicate )

## 
## replicate1 
##      46711

# cell.complexity
PMI$cell.complexity <- log10(PMI$nFeature_RNA) / log10(PMI$nCount_RNA)

# percent MT
grep("^mt-",rownames(PMI@assays$RNA@counts),value = TRUE)

##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"

mt.genes <- c("mt-Nd1", "mt-Nd2", "mt-Co1", "mt-Co2", "mt-Atp8", "mt-Atp6", "mt-Co3", "mt-Nd3",
              "mt-Nd4l", "mt-Nd4", "mt-Nd5", "mt-Nd6", "mt-Cytb")
PMI$percent.mt <- PercentageFeatureSet(PMI, pattern = "^mt-")

# ribosomal proteins begin with 'RPL' or 'RSL' for this annotation file
PMI$percent.ribo <- PercentageFeatureSet(PMI, pattern = "^R[sp]l")

# percent.hb - hemoglobin proteins begin with 'HB' or 'HBP' 
PMI$percent.hb <- PercentageFeatureSet(PMI, pattern = "^Hb[^(p)]")

# percent Xist in each cell 
PMI$percent.Xist <- PercentageFeatureSet(PMI, features = "Xist")

# percent Ddx3x in each cell 
PMI$percent.Ddx3x <- PercentageFeatureSet(PMI, features = "Ddx3x")

# percent Ttr in each cell 
PMI$percent.Ttr <- PercentageFeatureSet(PMI, features = "Ttr")

# percent Apoe in each cell 
PMI$percent.Apoe <- PercentageFeatureSet(PMI, features = "Apoe")

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(PMI$sample))
colnames(data) <- c("sample","frequency")

ncellsRaw <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
ncellsRaw

Density plots

# set graphical parameter
par(mfrow = c(3,1))

# Visualize nCount_RNA
denCount <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nCount.min)

# Visualize percent.mt
denMt <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density")

# Visualize cell complexity
# Quality cells are usually above 0.85
denCellComplexity <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff)

# Arrange graphs in grid
plots <- list(denCount,denMt,denCellComplexity)
layout <- rbind(c(1),c(2),c(3))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 39 rows containing non-finite values (stat_density).
## Removed 39 rows containing non-finite values (stat_density).
## Removed 39 rows containing non-finite values (stat_density).

# Visualize percent.Ttr
denTtr <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = log2(percent.Ttr),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
 # geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Ttr") +
  ylab("Density")
denTtr

## Warning: Removed 31389 rows containing non-finite values (stat_density).

# Visualize percent.Xist
denXist <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = log2(percent.Xist),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
 # geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Xist") +
  ylab("Density")
denXist

## Warning: Removed 14687 rows containing non-finite values (stat_density).

# Visualize percent.Ddx3x
denDdx3x <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = log2(percent.Ddx3x),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
 # geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Ddx3x") +
  ylab("Density")
denDdx3x

## Warning: Removed 25419 rows containing non-finite values (stat_density).

# Visualize percent.Ddx3x
denApoe <- ggplot(PMI@meta.data,
       aes(color = sample,
           x = log2(percent.Apoe),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
 # geom_vline(xintercept = mt.cutoff) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Apoe") +
  ylab("Density")
denApoe

## Warning: Removed 9271 rows containing non-finite values (stat_density).

Violin plots

# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(PMI,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vCellInfo

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).

#  percent violins
vMtRiboHb <- VlnPlot(PMI,
              features = c("percent.mt","percent.ribo","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vMtRiboHb

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).
## Removed 39 rows containing non-finite values (stat_ydensity).
## Removed 39 rows containing non-finite values (stat_ydensity).

#  percent violins
vXist <- VlnPlot(PMI,
              features = c("percent.Xist"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vXist

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).

#  percent violins
vDdx3x <- VlnPlot(PMI,
              features = c("percent.Ddx3x"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vDdx3x

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).

#  percent violins
vTtr <- VlnPlot(PMI,
              features = c("percent.Ttr"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vTtr

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).

#  percent violins
vApoe<- VlnPlot(PMI,
              features = c("percent.Apoe"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vApoe

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).

vGenes <- VlnPlot(PMI,
              features = c("percent.Xist", "percent.Ddx3x", 
                           "percent.Ttr", "percent.Apoe"),
              ncol = 4,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vGenes

## Warning: Removed 39 rows containing non-finite values (stat_ydensity).
## Removed 39 rows containing non-finite values (stat_ydensity).
## Removed 39 rows containing non-finite values (stat_ydensity).
## Removed 39 rows containing non-finite values (stat_ydensity).

Scatter plots

sMt <- ggplot(
  PMI@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  stat_smooth(method=lm) +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_hline(yintercept = nFeature.min) + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 39 rows containing non-finite values (stat_smooth).

sMt <- FeatureScatter(PMI,
               feature1 = "nCount_RNA",
               feature2 = "percent.mt",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sMt

## Warning: Removed 39 rows containing missing values (geom_point).

sTtr <- FeatureScatter(PMI,
               feature1 = "percent.Ttr",
               feature2 = "nCount_RNA",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sTtr

## Warning: Removed 39 rows containing missing values (geom_point).

sTtrMt <- FeatureScatter(PMI,
               feature1 = "percent.Ttr",
               feature2 = "percent.mt",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sTtrMt

## Warning: Removed 39 rows containing missing values (geom_point).

sXist <- FeatureScatter(PMI,
               feature1 = "percent.Xist",
               feature2 = "nCount_RNA",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sXist

## Warning: Removed 39 rows containing missing values (geom_point).

Filtering

Cell-level filtering

We want to be careful filtering because removing things can easily lead to misinterpretation. For example, cells with high percent.mt could actually just be involved in respiratory processes.

We will filter based on 6 conditions:
– nCount_RNA > 500 – nFeature_RNA > 250 – cell.complexity > 0.85 – percent.mt < 1 – percent.hb < 5

And removing MT genes as to not alter down stream differential expression

# filter
PMI.filtered <- subset(PMI,
                        subset = (nCount_RNA > nCount.min) &
                          (nFeature_RNA > nFeature.min) &
                          (cell.complexity > complexity.cutoff) &
                          (percent.mt < mt.cutoff))

# print cells removed
print(paste0(dim(PMI)[2] - dim(PMI.filtered)[2]," cells removed"))

## [1] "10004 cells removed"

data <- as.data.frame(table(PMI.filtered$sample))
colnames(data) <- c("sample","frequency")

ncellsFilter1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
ncellsFilter1

# Arrange graphs in grid
plots <- list(ncellsRaw, ncellsFilter1)
layout <- cbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# cleanup
remove(plots, layout, grid, ncellsFilter1)

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
counts <- GetAssayData(object = PMI.filtered, slot = "counts")
nonzero <- counts > 0  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite PMI.filtered
PMI.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = PMI.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))

## [1] "11791 features removed"

Doublet filtering

https://github.com/chris-mcginnis-ucsf/DoubletFinder\

Heterotypic - doublets derived from transcriptionally distinct cells. DoubletFinder works best on this type of doublet.

Homotopic - Transcriptionally similar cell doublets. DoubletFinder does not work as great on this type of doublet.

pANN - proportion of artificial nearest neighbors (pANN)

BCMVN - mean-variance normalized bimodality coefficient of pANN distributions produced during pN -pK parameter sweeps. The BCMVN may be used to identify the pK parameter.

Overview of steps:
A. Prepare each sample
B. pK Identification (no ground-truth) - defines the PC neighborhood size used to compute pANN
C. Homotypic Doublet Proportion Estimate - homotypic doublets may not be a problem depending on the type of analysis you are performing. If you have some doublets of the same type and their counts are normalized, they will generally represent the profile of single cells of the same type.
D. DoubletFinder
E. Visualize where the doublets are located

# split object by sample
PMI.split <- SplitObject(PMI.filtered, split.by = "sample")

for (i in 1:length(PMI.split)) {
  # normalize and find PCs
  print(i)
  PMI_sample <- NormalizeData(PMI.split[[i]])
  sampleID <- levels(droplevels(PMI_sample@meta.data$sample))
  PMI_sample <- FindVariableFeatures(PMI_sample, selection.method = "vst", nfeatures = 2000)
  PMI_sample <- ScaleData(PMI_sample)
  PMI_sample <- RunPCA(PMI_sample)
  
  # get significant PCs
  stdv <- PMI_sample[["pca"]]@stdev
  sum.stdv <- sum(PMI_sample[["pca"]]@stdev)
  percent.stdv <- (stdv / sum.stdv) * 100
  cumulative <- cumsum(percent.stdv)
  co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
  co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] - 
                       percent.stdv[2:length(percent.stdv)]) > 0.1), 
              decreasing = T)[1] + 1
  min.pc <- min(co1, co2)
  min.pc
  
  # run umap
  PMI_sample <- RunUMAP(PMI_sample, dims = 1:min.pc, reduction = "pca")
  
  # cluster
  PMI_sample <- FindNeighbors(object = PMI_sample, dims = 1:min.pc)                           
  PMI_sample <- FindClusters(object = PMI_sample, resolution = 0.2)
  
  # Assign identity of clusters
  Idents(object = PMI_sample) <- "RNA_snn_res.0.4"
  d1 <- DimPlot(PMI_sample,
        reduction = "umap",
        label = TRUE,
        label.size = 6)
  path <- paste0("../../results/", pathToTestType, "doubletFinder/",
                 treatment,"_",tolower(tissue),
               "_doubletFinder_UMAP_res0.02_",sampleID)
  pdf(paste0(path, ".pdf"), width = 5, height = 4)
  print(d1)
  dev.off()
  
  # number of cells in each cluster
  n_cells <- FetchData(PMI_sample, vars = c("ident")) %>% dplyr::count(ident) %>%tidyr::spread(ident, n)
  
  ## pK Identification (no ground-truth) 
  sweep.res.list <- paramSweep_v3(PMI_sample, PCs = 1:min.pc, sct = FALSE)
  sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
  bcmvn <- find.pK(sweep.stats)
  
  # Optimal pK for any scRNA-seq data can be manually discerned as maxima in BCmvn distributions
  bcmvn_max <- bcmvn[which.max(bcmvn$BCmetric),]
  pK_value <- bcmvn_max$pK
  pK_value <- as.numeric(levels(pK_value))[pK_value]
  
  # Homotypic Doublet Proportion Estimate 
  annotations <- PMI_sample@meta.data$seurat_clusters
  homotypic.prop <- modelHomotypic(annotations) 
  nExp_poi <- round(pK_value*nrow(PMI_sample@meta.data))
  nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
  # Run DoubletFinder with varying classification  
  PMI_sample <- doubletFinder_v3(PMI_sample, PCs = 1:min.pc, 
                pN = 0.25, pK = pK_value, nExp = nExp_poi.adj, 
                reuse.pANN = FALSE, sct = FALSE)
  
  # set DF class for calling doublets
  DF_class <- PMI_sample@meta.data[, grep("DF.classifications", colnames(PMI_sample@meta.data)),]
  DF_class[which(DF_class == "Doublet")] <- "Doublet"
  table(DF_class)
  
  # table showing the number of doublets and singlets
  write.table(table(DF_class), paste0("../../results/", pathToTestType, "doubletFinder/",
                                      treatment,"_",tolower(tissue),
               "_doubletFinder_table_",sampleID), sep = "\t", 
              row.names = FALSE, quote = FALSE)
  PMI_sample@meta.data[,"CellTypes_DF"] <- DF_class
  
  # plot
  d2 <- DimPlot(PMI_sample, group.by="CellTypes_DF", reduction="umap",
          order=c("Coll.Duct.TC","Doublet"), 
          cols=c("#66C2A5","black"))
  path <- paste0("../../results/", pathToTestType, "doubletFinder/",
                 treatment,"_",tolower(tissue),
               "_doubletFinder_UMAP_",sampleID)
  pdf(paste0(path, ".pdf"), width = 5,height = 4)
  print(d2)
  dev.off()
  
  # plot
  f1 <- FeaturePlot(PMI_sample, 
            reduction = "umap", 
            features = c("nFeature_RNA", "nCount_RNA", 
                         "cell.complexity", "percent.mt"),
            pt.size = 0.4, 
            order = TRUE,
            label = TRUE)
  path <- paste0("../../results/", pathToTestType, "doubletFinder/",
                 treatment,"_",tolower(tissue),
               "_doubletFinder_FeaturePlot_",sampleID)
  pdf(paste0(path, ".pdf"), width = 7, height = 7)
  print(f1)
  dev.off()
  
  #only keep singlets
  PMI_sample_singlets <- subset(PMI_sample, subset = CellTypes_DF == "Singlet")
  
  # inspect
  d3 <- DimPlot(PMI_sample_singlets, group.by="CellTypes_DF", reduction="umap",
          order=c("Coll.Duct.TC","Doublet"),
          cols=c("#66C2A5","black"))
  path <- paste0("../../results/", pathToTestType, "doubletFinder/",
                 treatment,"_",tolower(tissue),
               "_doubletFinder_UMAP_singlets_",sampleID)
  pdf(paste0(path, ".pdf"), width = 5, height = 4)
  print(d3)
  dev.off()
  
  # number of cells in each cluster per and post removing doublets
  n_cells_singlets <- FetchData(PMI_sample_singlets, vars = c("ident")) %>% 
    dplyr::count(ident) %>% tidyr::spread(ident, n)
  n_cells_singlets
  ncells_per_cluster <- rbind(n_cells, n_cells_singlets)
  row.names(ncells_per_cluster) <- c("Doublets and singlets", "Singlets only")
  ncells_per_cluster
  difference <- diff(as.matrix(ncells_per_cluster))
  difference <- as.data.frame(difference)
  row.names(difference) <- c("difference")
  cbind(difference, ncells_per_cluster)
  write.table(ncells_per_cluster, paste0(
    "../../results/", pathToTestType, "doubletFinder/",
    treatment,"_",tolower(tissue),
    "_doubletFinder_table_ncells_per_cluster",sampleID, ".txt"), sep = "\t", 
    row.names = FALSE, quote = FALSE)
  
  # plot the number of cells in each cluster per and post doubletFinder
  ncell_matrix <- as.matrix(ncells_per_cluster)
  ncells_melt <- melt(ncell_matrix)
  colnames(ncells_melt) <- c("doublet type","cluster","number of cells")
  ncell_max <- ncells_melt[which.max(ncells_melt$`number of cells`),]
  ncell_max_value <- ncell_max$`number of cells`
  cellmax <- ncell_max_value + 800 # so that the figure doesn't cut off the text
  b1 <- ggplot(ncells_melt, aes(x = factor(cluster), y = `number of cells`,
                          fill = `doublet type`)) + 
    geom_bar(stat="identity", colour="black", width=1, position = position_dodge(width=0.8)) +
    geom_text(aes(label = `number of cells`), 
              position=position_dodge(width=0.9), vjust=-0.25, angle = 45, hjust=-.01) + 
    theme_classic() + scale_fill_manual(values = c("gray", "#66C2A5")) +
    ggtitle("Number of cells per cluster") +  xlab("cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust=1)) +
    scale_y_continuous(limits = c(0,cellmax))
  path <- paste0("../../results/", pathToTestType, "doubletFinder/",
                 treatment,"_",tolower(tissue),
               "_doubletFinder_barplot_ncells_per_cluster",sampleID)
  pdf(paste0(path, ".pdf"), width = 7,height = 5)
  print(b1)
  dev.off()
  f2 <- FeaturePlot(PMI_sample_singlets, 
            reduction = "umap", 
            features = c("nFeature_RNA", "nCount_RNA", 
                         "cell.complexity", "percent.mt"),
            pt.size = 0.4, 
            order = TRUE,
            label = TRUE)
  path <- paste0("../../results/", pathToTestType, "doubletFinder/",
                 treatment,"_",tolower(tissue),
               "_doubletFinder_FeaturePlot_singlets",sampleID)
  pdf(paste0(path, ".pdf"), width = 7,height = 7)
  print(f2)
  dev.off()
  
  # put the PMI together again
  PMI.split[[i]] <- PMI_sample_singlets
}

## [1] 1
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 13449
## Number of edges: 500320
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9654
## Number of communities: 20
## Elapsed time: 1 seconds

## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."

## NULL
## [1] "Creating 4483 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

## [1] 2
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7305
## Number of edges: 253508
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9629
## Number of communities: 14
## Elapsed time: 0 seconds

## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."

## NULL
## [1] "Creating 2435 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

## [1] 3
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5543
## Number of edges: 187038
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9621
## Number of communities: 14
## Elapsed time: 0 seconds

## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."

## NULL
## [1] "Creating 1848 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

## [1] 4
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10410
## Number of edges: 372851
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9656
## Number of communities: 17
## Elapsed time: 0 seconds

## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.001..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."

## NULL
## [1] "Creating 3470 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

# converge PMI.split
PMI.singlets <- merge(x = PMI.split[[1]],
                       y = c(PMI.split[[2]], PMI.split[[3]],PMI.split[[4]]),
                       project = paste0("LPS PMI ", tissue, " Single Cell"))

# print how many cells removed
print(paste0(dim(PMI.filtered)[2] - dim(PMI.singlets)[2]," cells removed"))

## [1] "4963 cells removed"

# how many removed if we had an upper nCount and nFeature
PMI.upper <- subset(PMI.singlets,
                     subset = (nCount_RNA < 10000) & (nFeature_RNA < 5000))
print(paste0(dim(PMI.filtered)[2] - dim(PMI.upper)[2],
             " cells would have been removed if upper bound applied"))

## [1] "7491 cells would have been removed if upper bound applied"

# overwrite PMI.filtered
PMI.filtered <- PMI.singlets

# reset levels
PMI.filtered$treatment <- factor(PMI.filtered$treatment,
                                  levels = c("0_hr", "3_hr"))
PMI.filtered$replicate <- factor(PMI.filtered$replicate,
                                 levels = c("replicate1"))
PMI.filtered$genotype <- factor(PMI.filtered$genotype,
                                    levels = c("WT", "PS19"))
PMI.filtered$sample <- factor(PMI.filtered$sample,
                               levels = c("rep1_WT_0_hr", "rep1_WT_3_hr", 
                                          "rep1_PS19_0_hr", "rep1_PS19_3_hr"))

# cleanup
remove(PMI.singlets, PMI.upper, PMI.split, PMI_sample_singlets, PMI_sample)
remove(n_cells,n_cells_singlets,ncell_matrix,ncell_max,ncells_per_cluster,ncells_melt)
remove(sweep.res.list, sweep.stats,bcmvn,bcmvn_max,difference)
remove(d1,d2,d3,f1,f2)
remove(counts,counts.filtered, nonzero)

Mitochondrial gene filtering

# remove mt.genes
counts <- GetAssayData(object = PMI.filtered, slot = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]

# overwrite PMI.filtered
PMI.filtered <- CreateSeuratObject(counts.filtered, 
                                    meta.data = PMI.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))

## [1] "13 features removed"

Histogram

All samples together

# User params
goi <- "Ttr"
sample_oi <- "ALL"
threshold <- 2
# Split seurat object by timepoint to perform SCT on all samples
#PMI.split <- SplitObject(PMI.filtered, split.by = "sample")

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(PMI.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <- paste0("PMI Brain Cell: ", goi, " - ", sample_oi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0("Log2(",goi, " nCount_RNA)")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = log2.threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = log2.threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")

# plot
plots <- list(hist1,hist2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)

## 
## FALSE  TRUE 
## 27928  3816

Malat1 all samples

# User params
goi <- "Malat1"
threshold <- 0.1

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(PMI.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <- paste0("LPS Brain Cell: ", goi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of Samples") + theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4") +
  annotate("rect", 
           xmin = threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue")

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0("Log2(",goi, " nCount_RNA)")) + ylab("# of Samples") + theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = log2.threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4") +
  annotate("rect", 
           xmin = log2.threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue")
# plot
plots <- list(hist1,hist2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)

## 
## FALSE  TRUE 
##  3586 28158

Xist all samples

# User params
goi <- "Xist"
sample_oi <- "ALL"
threshold <- 2
# Split seurat object by timepoint to perform SCT on all samples
#PMI.split <- SplitObject(PMI.filtered, split.by = "sample")

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(PMI.filtered, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <- paste0("PMI Brain Cell: ", goi, " - ", sample_oi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")

# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0("Log2(",goi, " nCount_RNA)")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = log2.threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = log2.threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")

# plot
plots <- list(hist1,hist2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

# number removed
table(counts.df$counts > threshold)

## 
## FALSE  TRUE 
## 11530 20214

Split by sample

PMI.filtered.split <- SplitObject(PMI.filtered, split.by = "sample")

Ttr by sample

# histogram of Ttr for each sample
for (i in 1:length(PMI.filtered.split)) {
  print(i)
  PMI_sample <- PMI.filtered.split[[i]]
  goi <- "Ttr"
  sample_oi <- print(i)
  threshold <- 2

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(PMI_sample, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <- paste0("PMI Brain Cell: ", goi, " - ", sample_oi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")
# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0("Log2(",goi, " nCount_RNA)")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = log2.threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = log2.threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")
# plot
plots <- list(hist1,hist2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# number removed
table(counts.df$counts > threshold)
}

## [1] 1
## [1] 1

## [1] 2
## [1] 2

## [1] 3
## [1] 3

## [1] 4
## [1] 4

Xist by sample

# histogram of Ttr for each sample
for (i in 1:length(PMI.filtered.split)) {
  print(i)
  PMI_sample <- PMI.filtered.split[[i]]
  goi <- "Xist"
  sample_oi <- print(i)
  threshold <- 2

# Subset data
log2.threshold <- log2(threshold + 0.01)
counts.df <- FetchData(PMI_sample, vars = goi)
colnames(counts.df) <- "counts"
log2.counts.df <- log2(counts.df + 0.01)

# Histogram
title <- paste0("PMI Brain Cell: ", goi, " - ", sample_oi, "\nnCount_RNA > ", threshold)
hist1 <- ggplot(counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0(goi, " nCount_RNA")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")
# Histogram log transformed
hist2 <- ggplot(log2.counts.df, aes(x = counts)) + 
  geom_histogram(bins = 100, fill = "gray", color = "black") + 
  labs(title = title, x=NULL, y=NULL) +
  xlab(paste0("Log2(",goi, " nCount_RNA)")) + ylab("# of cells") + theme_bw() +
  geom_vline(xintercept = log2.threshold, col = "blue") +
  annotate("rect", 
           xmin = -Inf,
           xmax = log2.threshold, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="deepskyblue") +
  annotate("rect", 
           xmin = log2.threshold,
           xmax = Inf, 
           ymin = 0, 
           ymax=Inf, 
           alpha=0.2, 
           fill="chocolate4")
# plot
plots <- list(hist1,hist2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# number removed
table(counts.df$counts > threshold)
}

## [1] 1
## [1] 1

## [1] 2
## [1] 2

## [1] 3
## [1] 3

## [1] 4
## [1] 4

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(PMI.filtered$sample))
colnames(data) <- c("sample","frequency")

ncellsFiltered <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,30000, by = 5000), limits = c(0,30000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))

# Arrange graphs in grid
plots <- list(ncellsRaw,ncellsFiltered)
layout <- cbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# set graphical parameter
par(mfrow = c(3,1))

# Visualize the number of counts per cell
denCount_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nCount.min)

# Visualize percent.mt
denMt_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  geom_vline(xintercept = mt.cutoff)

# Visualize cell complexity
# Quality cells are usually above 0.80
denCellComplexity_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff)

# Arrange graphs in grid
plots <- list(denCount,denMt,denCellComplexity,denCount_filter,
              denMt_filter,denCellComplexity_filter)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid_den_raw_filter <- grid.arrange(grobs = plots, layout_matrix = layout)

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 39 rows containing non-finite values (stat_density).
## Removed 39 rows containing non-finite values (stat_density).
## Removed 39 rows containing non-finite values (stat_density).

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 31 rows containing non-finite values (stat_density).

Density plots for genes of interest

# set graphical parameter
par(mfrow = c(4,1))

# Visualize the number of counts per cell
denApoe_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Apoe),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Apoe") +
  ylab("Density") 

# Visualize percent.mt
denXist_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Xist),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Xist") +
  ylab("Density")

# Visualize cell complexity
# Quality cells are usually above 0.80
denDdx3x_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Ddx3x),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Ddx3x") +
  ylab("Density") 

denTtr_filter <- ggplot(PMI.filtered@meta.data,
       aes(color = sample,
           x = log2(percent.Ttr),
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("percent Ttr") +
  ylab("Density") 

# Arrange graphs in grid
plots <- list(denApoe, denXist, denDdx3x, denTtr,  denApoe_filter, 
              denXist_filter, denDdx3x_filter, denTtr_filter)
layout <- rbind(c(1,5),c(2,6),c(3,7), c(4,8))
grid_den_raw_filter_genes <- grid.arrange(grobs = plots, layout_matrix = layout)

## Warning: Removed 9271 rows containing non-finite values (stat_density).

## Warning: Removed 14687 rows containing non-finite values (stat_density).

## Warning: Removed 25419 rows containing non-finite values (stat_density).

## Warning: Removed 31389 rows containing non-finite values (stat_density).

## Warning: Removed 6077 rows containing non-finite values (stat_density).

## Warning: Removed 7894 rows containing non-finite values (stat_density).

## Warning: Removed 16486 rows containing non-finite values (stat_density).

## Warning: Removed 22248 rows containing non-finite values (stat_density).

Violin plots

# nFeature, nCount, and cell.complexity violins
vCellInfo <- VlnPlot(PMI.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vCellInfo

#  percent violins
vMtRiboHb <- VlnPlot(PMI.filtered,
              features = c("percent.mt","percent.ribo","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vMtRiboHb

#  percent violins
vXist <- VlnPlot(PMI.filtered,
              features = c("percent.Xist"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vXist

#  percent violins
vDdx3x <- VlnPlot(PMI.filtered,
              features = c("percent.Ddx3x"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vDdx3x

#  percent violins
vTtr <- VlnPlot(PMI.filtered,
              features = c("percent.Ttr"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vTtr

#  percent violins
vApoe<- VlnPlot(PMI.filtered,
              features = c("percent.Apoe"),
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vApoe

vGenes <- VlnPlot(PMI.filtered,
              features = c("percent.Xist", "percent.Ddx3x", 
                           "percent.Ttr", "percent.Apoe"),
              ncol = 4,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
vGenes

Scatter plots

sMt <- ggplot(
  PMI.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  stat_smooth(method=lm) +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min) + 
  geom_hline(yintercept = nFeature.min) + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
sMt

## `geom_smooth()` using formula 'y ~ x'

sMt <- FeatureScatter(PMI.filtered,
               feature1 = "nCount_RNA",
               feature2 = "percent.mt",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sMt

sTtr <- FeatureScatter(PMI.filtered,
               feature1 = "percent.Ttr",
               feature2 = "nCount_RNA",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sTtr

sTtrMt <- FeatureScatter(PMI.filtered,
               feature1 = "percent.Ttr",
               feature2 = "percent.mt",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sTtrMt

sXist <- FeatureScatter(PMI.filtered,
               feature1 = "percent.Xist",
               feature2 = "nCount_RNA",
               group.by = 'sample',
               cols = sample_colors,
               shuffle = TRUE)
sXist

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(PMI.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Sample")
b1

Top transcripts

df <- data.frame(row.names = rownames(PMI.filtered))
df$rsum <- rowSums(x = PMI.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum,decreasing = TRUE),]
head(df, 10)

##            rsum gene_name
## Malat1  7098221    Malat1
## Gm42418 4101833   Gm42418
## Cst3    3634426      Cst3
## Apoe    1612206      Apoe
## Tmsb4x  1222268    Tmsb4x
## Fth1    1085016      Fth1
## Ttr      781458       Ttr
## Plp1     682194      Plp1
## Actb     553819      Actb
## C1qa     552419      C1qa

Next steps

For something to be informative, it needs to exhibit variation, but not all variation is informative. The goal of our clustering analysis is to keep the major sources of variation in our dataset that should define our cell types, while restricting the variation due to uninteresting sources of variation (sequencing depth, cell cycle differences, mitochondrial expression, batch effects, etc.). Then, to determine the cell types present, we will perform a clustering analysis using the most variable genes to define the major sources of variation in the dataset.

Explain sources of unwanted variation: cell cycle, percent.mt (cell stress)
Normalizing and regressing out sources of unwanted variation: sctransform
Integration (OPTIONAL STEP): recommended when comparing groups, uses shared sources of high variation to identify shared sub-populations across conditions, we visualize after integration to ensure it’s good before clustering
- Stuart and Butler et al. (2018)
Clustering cells: cluster cells based on similarity of their gene expression profiles, cells are assigned to a cluster based on their PCA scores based on expression of integrated highly variable genes
Cluster quality evaluation: see what clusters aren’t influenced by sources of uninteresting variation, check major PCs are driving clusters, explore cell identities by looking a known markers

Cell cycle

The most common biological data correction is to remove the effects of the cell cycle on the transcriptome. This data correction can be performed by a simple linear regression against a cell cycle score.

Check cell cycle phase BEFORE doing sctransform. Counts to need to be comparable between cells and each cell has a different number for nCount_RNA.

# summary of counts per cell
summary(PMI.filtered@meta.data$nCount_RNA)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     377    1959    4352    4837    6713   34701

Use the NormalizeData() function with the argument LogNormalze to account for sequencing depth. nCount_RNA for each gene is divided by the total nCount_RNA for that cell. This is done for all cells. This number is then multiplied by the scale.factor so we don’t have to deal with a tiny number. This number is then natural-log transformed using log1p. log1p is the natural logarithm (base e) of 1 + count. The 1 will prevent taking the log of 0.

PMI.phase <- NormalizeData(PMI.filtered,
                            scale.factor = 10000, # default
                            normalization.method = "LogNormalize" # default
                            )

Give each cell a score based on expression of G1, G2/M, and S phase markers. A list of markers is provided for humans. We will need to convert human gene names to mouse gene names. We will use the CellCycleScoring() function in seurat.

Below is a resource for acquiring cell markers for other organisms https://hbctraining.github.io/scRNA-seq_online/lessons/cell_cycle_scoring.html

G1 ~10 hrs S ~5-6 hrs G2 ~3-4 hrs M ~2 hrs

G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

If the score is negative for both S.Score and G2M.Score the phase is G1. Otherwise the the greatest positive value between S.Score and G2M.Score determines the phase.

Using a human list for mouse https://github.com/satijalab/seurat/issues/2493

# Basic function to convert human to mouse gene names
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", 
                 values = x , mart = human, attributesL = c("mgi_symbol"), 
                 martL = mouse, uniqueRows=T)
humanx <- unique(genesV2[, 2])
# Print the first 6 genes found to the screen
print(head(humanx))
return(humanx)
}

m.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
m.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
saveRDS(m.s.genes, "../../results/geneLists/m.s.genes.rds")
saveRDS(m.g2m.genes, "../../results/geneLists/m.g2m.genes.rds")

m.s.genes <- readRDS("../../results/geneLists/m.s.genes.rds")
m.g2m.genes <- readRDS("../../results/geneLists/m.g2m.genes.rds")

# score cells for cell cycle
PMI.phase <- CellCycleScoring(PMI.phase,
                               g2m.features = m.g2m.genes,
                               s.features = m.s.genes,
                               set.ident = TRUE)

Bar graph

cellcyclecount_barplot <- 
  as_tibble(PMI.phase[[]]) %>%
  ggplot(aes(Phase, fill = Phase)) + geom_bar()
cellcyclecount_barplot

Pie chart

# pie point 
cellcyclecount_piepoint <- 
  as_tibble(PMI.phase[[]]) %>%
  ggplot(aes(x=S.Score, y=G2M.Score, color=Phase)) + 
  geom_point()
cellcyclecount_piepoint

Top variable genes

# Identify the most variable genes
PMI.phase <- FindVariableFeatures(PMI.phase,
                                   selection.method = "vst",  # default vst
                                   nfeatures = 2000,  # default 2000
                                   verbose = FALSE)

# view top variable genes
top40 <- head(VariableFeatures(PMI.phase), 40)
top40

##  [1] "S100a9"    "S100a8"    "Mgp"       "Retnlg"    "Hist1h2ap" "Ccl5"     
##  [7] "Hist1h1b"  "Dcn"       "Tmem212"   "Vtn"       "Cxcl10"    "Acta2"    
## [13] "Ttr"       "Chil3"     "Ccn3"      "H2-Eb1"    "Ccdc153"   "Gzma"     
## [19] "Igkc"      "Crym"      "H2-Aa"     "Tagln"     "Rarres2"   "Pf4"      
## [25] "H2-Ab1"    "Cxcl2"     "Hist1h2ae" "Vpreb3"    "Cd74"      "Lcn2"     
## [31] "Spp1"      "Rgs5"      "Tpm2"      "Top2a"     "Ly6c2"     "Ngp"      
## [37] "Myoc"      "Hbb-bs"    "Ube2c"     "Plac8"

# plot variable features with labels
VarFeatPlot <- VariableFeaturePlot(PMI.phase, cols = c("gray47","red"))
VarFeatPlotLabel <- LabelPoints(plot = VarFeatPlot, 
                    points = top40, repel = TRUE, fontface="italic", 
                    xnudge = 0, ynudge = 0, max.overlaps = 12)
VarFeatPlotLabel

Mean-variance

# The variability information can be accessed using the HVFInfo method. 
# The names of the variable features can be accessed with VariableFeatures().
variance.data <- as_tibble(HVFInfo(PMI.phase),rownames = "Gene")
variance.data <- variance.data %>% mutate(hypervariable=Gene %in% VariableFeatures(PMI.phase))

# We can plot out a graph of the variance vs mean and highlight the selected genes 
# this way, we can see whether we think we’re likely to capture what we need.
subset_data <- subset(variance.data, hypervariable == TRUE)
varGeneslog <- variance.data %>% 
  ggplot(aes(log(mean),log(variance),color=hypervariable)) + 
  geom_point() + 
  scale_color_manual(values=c("black","red")) +  geom_text_repel(
    data = subset_data, max.overlaps = 20,
    aes(
      x = log(mean),
      y = log(variance),
      label = Gene, 
      fontface="italic",),segment.alpha = 1,size = 4) +
  theme(legend.position="bottom")
varGeneslog

PCA

See if the cell cycle is a major source of variation using PCA. Choose the most variable gene features (we have already done), then e the data. We scale the data because highly expressed genes exhibit the highest amount of variation and we don’t want our ‘highly variable genes’ only to reflect high expression.

vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then, feature values are standardized using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).

The ScaleData() function in Seurat will adjust gene expressions so that the mean expression in each cell is 0. It will also scale each gene to give a variance of 1 for each cell.

# Scale the counts
PMI.phase <- ScaleData(PMI.phase)

## Centering and scaling data matrix

PMI.phase@assays

## $RNA
## Assay data with 20481 features for 31744 cells
## Top 10 variable features:
##  S100a9, S100a8, Mgp, Retnlg, Hist1h2ap, Ccl5, Hist1h1b, Dcn, Tmem212,
## Vtn

PMI.phase.pca <- RunPCA(PMI.phase, features = c(m.s.genes, m.g2m.genes))

## Warning in PrepDR(object = object, features = features, verbose = verbose): The
## following 45 features requested have not been scaled (running reduction without
## them): Mcm4, Mrpl36, Chaf1b, Exo1, Gmnn, Msh2, Cdc45, Fen1, Slbp, Cdc6, Cenpu,
## Dscc1, Ubr7, Rad51ap1, Gins2, Tipin, Prim1, Wdr76, Mcm5, Pola1, Usp1, Ung,
## Blm, Casp8ap2, Rfc2, Polr1b, Rrm1, Mcm7, Rad51, Ccne2, E2f8, Cbx5, Gtse1, Ctcf,
## Anp32e, G2e3, Ncapd2, Gas2l3, Tacc3, Rangap1, Lbr, Kif2c, Hjurp, Nek2, Ckap5

## Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
## a percentage of total singular values, use a standard svd instead.

## Warning in irlba(A = t(x = object), nv = npcs, ...): did not converge--results
## might be invalid!; try increasing work or maxit

## Warning: Requested number is larger than the number of available items (49).
## Setting to 49.

## Warning: Requested number is larger than the number of available items (49).
## Setting to 49.

## Warning: Requested number is larger than the number of available items (49).
## Setting to 49.

## Warning: Requested number is larger than the number of available items (49).
## Setting to 49.

## Warning: Requested number is larger than the number of available items (49).
## Setting to 49.

## PC_ 1 
## Positive:  Anln, Tubb4b, Pcna, Nasp, Jpt1, Cdc25c, Nuf2, Cdca7, Mcm6, Tyms 
##     Psrc1, Hells, Dtl, Aurka, Cdca2, Ttk, Ect2, Uhrf1, Pimreg, Kif20b 
##     Clspn, Ckap2, Ndc80, Dlgap5 
## Negative:  Top2a, Mki67, Birc5, Cdca8, Cdca3, Nusap1, Cenpf, Ube2c, Cdk1, Tpx2 
##     Kif11, Cks1b, Kif23, Cenpe, Cks2, Hmmr, Rrm2, Ccnb2, Cenpa, Ckap2l 
##     Aurkb, Cdc20, Smc4, Hmgb2 
## PC_ 2 
## Positive:  Cdca7, Jpt1, Mcm6, Pcna, Nasp, Hells, Hmgb2, Dtl, Tyms, Clspn 
##     Uhrf1, Rrm2, Smc4, Cks2, Top2a, Ttk, Cks1b, Cdk1, Aurkb, Ckap2l 
##     Bub1, Ndc80, Cdca2, Mki67 
## Negative:  Tubb4b, Cenpa, Cenpe, Cdc20, Ccnb2, Anln, Hmmr, Nuf2, Ube2c, Cenpf 
##     Tpx2, Dlgap5, Aurka, Cdca3, Ckap2, Kif11, Kif23, Psrc1, Birc5, Pimreg 
##     Nusap1, Cdca8, Ect2, Cdc25c 
## PC_ 3 
## Positive:  Jpt1, Hmgb2, Cks2, Cenpa, Cdca7, Ccnb2, Ube2c, Cdc20, Cenpe, Cenpf 
##     Cks1b, Psrc1, Ckap2, Hmmr, Dlgap5, Ckap2l, Pimreg, Aurka, Kif23, Cdc25c 
##     Cdca3, Ect2, Cdca8, Tpx2 
## Negative:  Anln, Tubb4b, Pcna, Mcm6, Hells, Clspn, Uhrf1, Rrm2, Dtl, Nasp 
##     Smc4, Tyms, Nuf2, Top2a, Ttk, Kif11, Cdk1, Mki67, Aurkb, Bub1 
##     Nusap1, Cdca2, Ndc80, Birc5 
## PC_ 4 
## Positive:  Tyms, Tubb4b, Hells, Nuf2, Clspn, Rrm2, Mcm6, Uhrf1, Dtl, Kif11 
##     Cdk1, Top2a, Ttk, Nusap1, Aurkb, Bub1, Mki67, Cdca2, Ndc80, Birc5 
##     Ckap2l, Cdca3, Tpx2, Cks1b 
## Negative:  Anln, Pcna, Smc4, Nasp, Cenpa, Jpt1, Cks2, Cdc20, Cenpe, Psrc1 
##     Cdca7, Ccnb2, Hmgb2, Ckap2, Aurka, Hmmr, Ube2c, Kif20b, Cenpf, Cdca8 
##     Ect2, Dlgap5, Cdc25c, Pimreg 
## PC_ 5 
## Positive:  Hells, Clspn, Pcna, Tyms, Cks2, Uhrf1, Mcm6, Top2a, Mki67, Cdk1 
##     Dtl, Rrm2, Birc5, Aurkb, Kif23, Ckap2l, Hmgb2, Anln, Cdca3, Nusap1 
##     Bub1, Kif20b, Ccnb2, Dlgap5 
## Negative:  Tubb4b, Nasp, Nuf2, Cdca7, Jpt1, Smc4, Psrc1, Cenpf, Kif11, Ckap2 
##     Tpx2, Cdc20, Pimreg, Cenpa, Aurka, Cdc25c, Ube2c, Ect2, Cks1b, Cenpe 
##     Hmmr, Cdca8, Ndc80, Cdca2

DimPlot(PMI.phase.pca)

If the plots for each phase look very similar to each other, do not regress out variation due to cell cycle. You can plot PC1 vs PC2 before and after regression to see how effective it was. G1 (10 hrs) > G2/M (5-6 hrs) = S (5-6 hrs)

# Perform PCA
PMI.phase <- RunPCA(PMI.phase)

# Plot the PCA colored by cell cycle phase
cycle.pca <- DimPlot(PMI.phase,
        reduction = "pca",
        group.by= "Phase",
        split.by = "Phase")
cycle.pca

SCTransform

Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model. ??Variance is also adjusted based on pooling information across genes with similar abundances??

Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase and so we chose not to regress this out of our data. We observed some effect of mitochondrial expression and so we choose to regress this out from the data.

Since we have four samples in our dataset (from two conditions), we want to keep them as separate objects and transform them as that is what is required for integration. We will first split the cells in seurat.phase object by sample.

# Split seurat object by timepoint to perform SCT on all samples
PMI.split <- SplitObject(PMI.phase, split.by = "sample")

Now we will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.

Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb) using the following code:

options(future.globals.maxSize = 4000 * 1024^5)

for (i in 1:length(PMI.split)) {
  print(paste0("Sample ", i))
  PMI.split[[i]] <- SCTransform(PMI.split[[i]]
                                # vars.to.regress = c("percent.mt") 
                                 )
}

## [1] "Sample 1"
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## [1] "Sample 2"
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## [1] "Sample 3"
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## [1] "Sample 4"
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%

NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument.

Note, the last line of output specifies “Set default assay to SCT”. We can view the different assays that we have stored in our seurat object.

A thread about whether or not regress out batch: https://github.com/satijalab/seurat/issues/3270 It is suggested to not regress out batch, and instead use a data integration method

# Check
PMI.split

## $rep1_WT_0_hr
## An object of class Seurat 
## 39755 features across 11657 samples within 2 assays 
## Active assay: SCT (19274 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $rep1_WT_3_hr
## An object of class Seurat 
## 39005 features across 6221 samples within 2 assays 
## Active assay: SCT (18524 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $rep1_PS19_0_hr
## An object of class Seurat 
## 37868 features across 4891 samples within 2 assays 
## Active assay: SCT (17387 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## $rep1_PS19_3_hr
## An object of class Seurat 
## 39099 features across 8975 samples within 2 assays 
## Active assay: SCT (18618 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca

Integration

Run harmony

Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.

To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.

Now, using our SCTransform object as input, let’s perform the integration across conditions.

First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.

# Choose the features to use when integrating multiple datasets. 
# will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = PMI.split, 
                                          nfeatures = 3000)

# merge the PMI
PMI.sct.merged <- merge(x = PMI.split[[1]],
                         y = c(PMI.split[[2]],PMI.split[[3]],PMI.split[[4]]),
                         project = paste0("LPS PMI ", tissue, " Single Cell"))

# define the variable features 
VariableFeatures(PMI.sct.merged) <- var.features

save rObject

insepct data

PMI.sct.merged <- RunPCA(PMI.sct.merged)

## PC_ 1 
## Positive:  Slc1a2, Atp1a2, Sparcl1, Plpp3, Clu, Aldoc, Mt3, Slc4a4, Gja1, Mt1 
##     Apoe, Gpr37l1, Atp1b2, Ptprz1, Pla2g7, Slc1a3, Cpe, Htra1, Acsl3, Cspg5 
##     Slc6a11, Mfge8, Cldn10, Ptn, Ntsr2, F3, Luzp2, Ttyh1, Ndrg2, Bcan 
## Negative:  C1qa, C1qb, Hexb, Tmsb4x, C1qc, Ctss, Cx3cr1, Jun, Fos, Ccl4 
##     Atf3, Junb, Ctsd, Tyrobp, Zfp36, Btg2, Klf2, Egr1, Csf1r, P2ry12 
##     Trem2, Fcer1g, Ccl3, Nfkbia, Kctd12, Gpr34, B2m, Mafb, Selplg, Laptm5 
## PC_ 2 
## Positive:  Plp1, Mbp, Mal, Cryab, Apod, Mobp, Ptgds, Ermn, Mag, Cnp 
##     Qdpr, Fth1, Tubb4a, Cldn11, Dbndd2, Stmn1, Trf, Sept4, Aplp1, Stmn4 
##     Enpp2, Ppp1r14a, Tspan2, Car2, Mog, Gatm, App, Ttll7, Opalin, Ugt8a 
## Negative:  Slc1a2, Atp1a2, Sparcl1, Aldoc, Clu, Plpp3, C1qa, Mt3, C1qb, Apoe 
##     Gja1, Slc4a4, Mt1, Gpr37l1, Slc1a3, Pla2g7, Hexb, Atp1b2, Cldn10, Slc6a11 
##     C1qc, Ntsr2, Mfge8, Ctss, Cpe, F3, S1pr1, Acsl3, Aqp4, Cx3cr1 
## PC_ 3 
## Positive:  Flt1, Cldn5, Pltp, Ly6c1, Spock2, Ly6a, Slco1a4, Bsg, Cxcl12, Itm2a 
##     Igfbp7, Abcb1a, Egfl7, Adgrl4, Ptprb, Adgrf5, Pglyrp1, Hspb1, Ctla2a, Fn1 
##     Jcad, Sptbn1, Slc9a3r2, Esam, Ahnak, Podxl, Car4, Ramp2, Vim, Pcp4l1 
## Negative:  C1qa, C1qb, Hexb, Slc1a2, Plp1, C1qc, Mbp, Ctss, Cx3cr1, Mobp 
##     Trf, Ccl4, Fth1, Ermn, Ctsd, Atf3, Mag, Cnp, Plpp3, Aldoc 
##     Cryab, Aplp1, Apoe, P2ry12, Atp1a2, Jun, Dbndd2, Fos, Csf1r, Qdpr 
## PC_ 4 
## Positive:  Flt1, Cldn5, Slc1a2, Mal, Atp1a2, Ly6c1, Mbp, Slco1a4, Ly6a, Cxcl12 
##     Plp1, Sparcl1, Itm2a, Apod, Spock2, Plpp3, Bsg, Egfl7, Mobp, C1qa 
##     Aldoc, Abcb1a, Adgrl4, Ptprb, Ermn, Hexb, C1qb, Atf3, Mag, Adgrf5 
## Negative:  Meg3, Tmsb10, Sox11, Rtn1, Stmn2, Nnat, Nrxn3, Map1b, Vcan, Pdgfra 
##     Dlx6os1, Dcx, Ccnd2, Stmn3, Igfbpl1, Tuba1a, Sox4, Tnr, Cd24a, Tubb3 
##     Meis2, Dpysl3, Stmn1, Lhfpl3, Ttr, Celf4, Nsg2, Ecrg4, Dlx1, Cacng4 
## PC_ 5 
## Positive:  Pdgfra, Tnr, Vcan, Olig1, Lhfpl3, C1ql1, Gpr17, Cacng4, Rtn1, Cspg4 
##     Cntn1, Opcml, Matn4, Stmn3, Sox11, Ptprz1, Meg3, Olig2, Nrxn3, Stmn2 
##     Cspg5, Neu4, Epn2, Vxn, Dcx, Gng3, Dpysl3, Sox4, Pcdh15, Nxph1 
## Negative:  Ecrg4, Ttr, Calml4, Rsph1, Chchd10, Dynlrb2, Gm19935, Ccdc153, Tmem212, Rarres2 
##     1110017D15Rik, Mia, 1700094D03Rik, Fam183b, 2900040C04Rik, Cfap126, Dbi, Meig1, 1700012B09Rik, Rbp1 
##     Ak7, Mlf1, Enkur, Pcp4l1, Lbp, Tubb4b, Foxj1, Folr1, Enpp2, Pifo

PMI.sct.merged <- RunUMAP(PMI.sct.merged, dims = 1:15, reduction = "pca")

## 16:52:53 UMAP embedding parameters a = 0.9922 b = 1.112

## 16:52:53 Read 31744 rows and found 15 numeric columns

## 16:52:53 Using Annoy for neighbor search, n_neighbors = 30

## 16:52:53 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 16:52:57 Writing NN index file to temp file /tmp/RtmpIqSb0g/file12d084625ee8
## 16:52:57 Searching Annoy index using 1 thread, search_k = 3000
## 16:53:09 Annoy recall = 100%
## 16:53:10 Commencing smooth kNN distance calibration using 1 thread
## 16:53:13 Initializing from normalized Laplacian + noise
## 16:53:15 Commencing optimization for 200 epochs, with 1280738 positive edges
## 16:53:52 Optimization finished

Plot PCA

pcatreatment <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        split.by = "treatment",
        group.by = "treatment",
        cols = treatment_colors)
pcatreatment

pcareplicate <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        group.by = "replicate",
        cols = replicate_colors)
pcareplicate

pcareplicate_split <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        split.by = "replicate",
        group.by = "replicate",
        cols = replicate_colors)
pcareplicate_split

pcagenotype <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        group.by = "genotype", 
        cols = genotype_colors)
pcagenotype

pcagenotype_split <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        split.by = "genotype",
        group.by = "genotype", 
        cols = genotype_colors)
pcagenotype_split

pcasample <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        split.by = "sample",
        group.by = "sample",
        cols = sample_colors)
pcasample

pcasample_split <- DimPlot(PMI.sct.merged,
        reduction = "umap",
        group.by = "sample",
        shuffle = TRUE)
pcasample_split

# clean up
remove(pca1, pca2, pca3, pca4, pca5, pca6)

## Warning in remove(pca1, pca2, pca3, pca4, pca5, pca6): object 'pca1' not found

## Warning in remove(pca1, pca2, pca3, pca4, pca5, pca6): object 'pca2' not found

## Warning in remove(pca1, pca2, pca3, pca4, pca5, pca6): object 'pca3' not found

## Warning in remove(pca1, pca2, pca3, pca4, pca5, pca6): object 'pca4' not found

## Warning in remove(pca1, pca2, pca3, pca4, pca5, pca6): object 'pca5' not found

## Warning in remove(pca1, pca2, pca3, pca4, pca5, pca6): object 'pca6' not found

run harmony

# run PCA on the merged object
# run harmony to harmonize over samples 
PMI.integrated <- RunHarmony(object = PMI.sct.merged, 
                              group.by.vars = "sample", 
                              assay.use = "SCT",
                              reduction = "pca", 
                              plot_convergence = TRUE)

## Harmony 1/10

## Harmony 2/10

## Harmony 3/10

## Harmony 4/10

## Harmony 5/10

## Harmony 6/10

## Harmony 7/10

## Harmony 8/10

## Harmony converged after 8 iterations

## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

Check output

# first put the PMI back in place
Idents(PMI.integrated) <- PMI.integrated$sample
PMI.integrated$treatment <- factor(PMI.integrated$treatment,
                                    levels = c("0_hr", "3_hr"))
PMI.integrated$replicate <- factor(PMI.integrated$replicate,
                                    levels = c("replicate1"))
PMI.integrated$genotype <- factor(PMI.integrated$genotype,
                                    levels = c("WT", "PS19"))
PMI.integrated$sample <- factor(PMI.integrated$sample,
                                 levels = c("rep1_WT_0_hr", "rep1_WT_3_hr", 
                                            "rep1_PS19_0_hr", "rep1_PS19_3_hr"))
# check the embedding
harmony_embeddings <- Embeddings(PMI.integrated, 'harmony')
harmony_embeddings[1:5, 1:5]

##                                 harmony_1  harmony_2 harmony_3 harmony_4
## rep1_WT_0_hr_ATCACGAAGTACCGGA-1  16.27297  1.7593391  35.39635 -68.78883
## rep1_WT_0_hr_GCTGCAGAGCTCCGAC-1  16.24401  2.1675571  35.43971 -64.81111
## rep1_WT_0_hr_AGACAGGTCAAGTGGG-1  17.03499  1.6557100  33.56699 -64.89926
## rep1_WT_0_hr_TTCATGTTCATGCTAG-1  14.88158  0.7963308  39.57828 -61.03813
## rep1_WT_0_hr_GTACAACGTCCATAGT-1  14.35989 79.9560269  -8.41787  14.76841
##                                   harmony_5
## rep1_WT_0_hr_ATCACGAAGTACCGGA-1 -115.167309
## rep1_WT_0_hr_GCTGCAGAGCTCCGAC-1 -107.518833
## rep1_WT_0_hr_AGACAGGTCAAGTGGG-1 -102.853375
## rep1_WT_0_hr_TTCATGTTCATGCTAG-1 -102.997543
## rep1_WT_0_hr_GTACAACGTCCATAGT-1   -1.032315

# check the PCA plot
p1 <- DimPlot(object = PMI.integrated, 
              reduction = "harmony",
              group.by = "replicate", 
              cols = treatment_colors) + NoLegend()
p2 <- VlnPlot(object = PMI.integrated, 
              features = "harmony_1", 
              group.by = "replicate", 
              pt.size = 0, 
              cols = treatment_colors) + NoLegend()
plot_grid(p1,p2)

# clean up
remove(p1, p2)

Top variable features

Top 20 variable features

top20 <- PMI.integrated@assays$SCT@var.features[1:20]
top20

##  [1] "Plp1"   "Ttr"    "Ptgds"  "Cldn5"  "Cd74"   "Ccl4"   "Apod"   "Vtn"   
##  [9] "Rgs5"   "Mbp"    "Mal"    "Flt1"   "Ccl5"   "H2-Aa"  "Cryab"  "Igkc"  
## [17] "H2-Eb1" "Il1b"   "Cxcl12" "H2-Ab1"

PCA plot

After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.

To generate these visualizations with the harmony output, use reduction = “harmony”

# Plot PCA
pcaTreatment <- DimPlot(PMI.integrated,
        reduction = "harmony",
        split.by = "treatment",
        group.by = "treatment",
        cols = treatment_colors)
pcaTreatment

pcaTreatmentSplit <- DimPlot(PMI.integrated,
        reduction = "harmony",
        group.by = "treatment",
        cols = treatment_colors)
pcaTreatmentSplit

pcaReplicate <- DimPlot(PMI.integrated,
        reduction = "harmony",
        group.by = "replicate",
        cols = replicate_colors)
pcaReplicate

pcaReplicateSplit <- DimPlot(PMI.integrated,
        reduction = "harmony",
        split.by = "replicate",
        group.by = "replicate",
        cols = replicate_colors)
pcaReplicateSplit

pcaGenotype <- DimPlot(PMI.integrated,
        reduction = "harmony",
        group.by = "genotype",
        cols = genotype_colors,
        shuffle = TRUE)
pcaGenotype

pcaGenotypeSplit <- DimPlot(PMI.integrated,
        reduction = "harmony",
        split.by = "genotype",
        group.by = "genotype",
        cols = genotype_colors,
        shuffle = TRUE)
pcaGenotypeSplit

pcaSample <- DimPlot(PMI.integrated,
        reduction = "harmony",
        group.by = "sample",
        cols = sample_colors, 
        shuffle = TRUE)
pcaSample

pcaSampleSplit <- DimPlot(PMI.integrated,
        reduction = "harmony",
        split.by = "sample",
        group.by = "sample",
        cols = sample_colors)
pcaSampleSplit

Find significant PCs

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# Printing out the most variable genes driving PCs
print(x = PMI.integrated[["pca"]], 
      dims = 1:10, 
      nfeatures = 10)

## PC_ 1 
## Positive:  Slc1a2, Atp1a2, Sparcl1, Plpp3, Clu, Aldoc, Mt3, Slc4a4, Gja1, Mt1 
## Negative:  C1qa, C1qb, Hexb, Tmsb4x, C1qc, Ctss, Cx3cr1, Jun, Fos, Ccl4 
## PC_ 2 
## Positive:  Plp1, Mbp, Mal, Cryab, Apod, Mobp, Ptgds, Ermn, Mag, Cnp 
## Negative:  Slc1a2, Atp1a2, Sparcl1, Aldoc, Clu, Plpp3, C1qa, Mt3, C1qb, Apoe 
## PC_ 3 
## Positive:  Flt1, Cldn5, Pltp, Ly6c1, Spock2, Ly6a, Slco1a4, Bsg, Cxcl12, Itm2a 
## Negative:  C1qa, C1qb, Hexb, Slc1a2, Plp1, C1qc, Mbp, Ctss, Cx3cr1, Mobp 
## PC_ 4 
## Positive:  Flt1, Cldn5, Slc1a2, Mal, Atp1a2, Ly6c1, Mbp, Slco1a4, Ly6a, Cxcl12 
## Negative:  Meg3, Tmsb10, Sox11, Rtn1, Stmn2, Nnat, Nrxn3, Map1b, Vcan, Pdgfra 
## PC_ 5 
## Positive:  Pdgfra, Tnr, Vcan, Olig1, Lhfpl3, C1ql1, Gpr17, Cacng4, Rtn1, Cspg4 
## Negative:  Ecrg4, Ttr, Calml4, Rsph1, Chchd10, Dynlrb2, Gm19935, Ccdc153, Tmem212, Rarres2 
## PC_ 6 
## Positive:  Pdgfra, Tnr, Olig1, Lhfpl3, C1ql1, Gpr17, Cspg4, Cntn1, Matn4, Vcan 
## Negative:  Tmsb10, Stmn1, Stmn2, Ccnd2, Sox11, Dlx6os1, Igfbpl1, Meis2, Rps24, S100a10 
## PC_ 7 
## Positive:  Stmn1, Stmn2, Sox11, Dlx6os1, Meis2, Igfbpl1, Egr1, Jun, Tuba1a, Atf3 
## Negative:  Ttr, Pdgfra, Tnr, Cd52, Cd74, Lhfpl3, Olig1, C1ql1, Ifi27l2a, Gpr17 
## PC_ 8 
## Positive:  Crip1, Vim, S100a11, S100a6, Rarres2, Cd74, Fos, Cd52, Ifitm3, Ccdc153 
## Negative:  Ttr, 2900040C04Rik, Enpp2, Ecrg4, Rbp1, Kl, Calml4, Cox8b, Pcp4, Kcnj13 
## PC_ 9 
## Positive:  Tmsb4x, C1qb, C1qa, Pltp, Tyrobp, Cldn5, Ctss, Flt1, Hexb, Spock2 
## Negative:  Vtn, Rgs5, Cald1, Ndufa4l2, Mgp, Igfbp7, Myl9, Pdgfrb, Rgs4, Higd1b 
## PC_ 10 
## Positive:  C1qb, C1qa, Hexb, Tmsb4x, Tyrobp, Vtn, Ctss, Rgs5, C1qc, Ctsd 
## Negative:  Fos, Atf3, Egr1, Junb, Klf2, Jun, Zfp36, Dusp1, Nfkbia, Btg2

Quantitative approach to an elbow plot - The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation. - The point where the percent change in variation between the consecutive PCs is less than 0.1%.

First metric

# Determine percent of variation associated with each PC
stdv <- PMI.integrated[["pca"]]@stdev
sum.stdv <- sum(PMI.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100

# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)

# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1

## [1] 39

Second metric

# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
  (percent.stdv[1:length(percent.stdv) - 1] - 
     percent.stdv[2:length(percent.stdv)]) > 0.1), 
  decreasing = T)[1] + 1

# last point where change of % of variation is more than 0.1%.
co2

## [1] 19

Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.

# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc

## [1] 19

Elbow plot

Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:

# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv, 
           cumu = cumulative, 
           rank = 1:length(percent.stdv))

# Elbow plot to visualize 
  ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
  theme_bw()

UMAP

# Run UMAP
PMI.integrated <- RunUMAP(PMI.integrated,
                           dims = 1:min.pc,
                           reduction = "harmony",
                           n.components = 3) # set to 3 to use with VR

DimPlot(PMI.integrated,
        group.by = "treatment",
        shuffle = TRUE,
        cols = treatment_colors)

DimPlot(PMI.integrated,
        group.by = "treatment",
        split.by = "treatment",
        shuffle = TRUE,
        cols = treatment_colors)

DimPlot(PMI.integrated,
        group.by = "replicate",
        shuffle = TRUE,
        cols = replicate_colors)

DimPlot(PMI.integrated,
        group.by = "replicate",
        split.by = "replicate",
        shuffle = TRUE,
        cols = replicate_colors)

DimPlot(PMI.integrated,
        group.by = "genotype",
        cols = genotype_colors,
        shuffle = TRUE)

DimPlot(PMI.integrated,
        group.by = "genotype",
        split.by = "genotype",
        cols = genotype_colors,
        shuffle = TRUE)

DimPlot(PMI.integrated,
        group.by = "sample",
        shuffle = TRUE,
        cols = sample_colors)

DimPlot(PMI.integrated,
        group.by = "sample",
        split.by = "sample",
        shuffle = TRUE,
        cols = sample_colors)

FeaturePlot(PMI.integrated, 
            features = "Apoe")

FeaturePlot(PMI.integrated, 
            features = "Ttr")

FeaturePlot(PMI.integrated, 
            features = "Xist")

Cluster the cells

Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].

We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.

The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.

# Determine the K-nearest neighbor graph
PMI.unannotated <- FindNeighbors(object = PMI.integrated, 
                                 assay = "SCT", # set as default after SCTransform
                                 reduction = "harmony",
                                 dims = 1:min.pc)

# Determine the clusters for various resolutions
PMI.unannotated <- FindClusters(object = PMI.unannotated,
                                 algorithm = 1, # 1= Louvain
                                 resolution = seq(0.1,0.8,by=0.1))

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9773
## Number of communities: 14
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9653
## Number of communities: 16
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9563
## Number of communities: 19
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9479
## Number of communities: 19
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9400
## Number of communities: 20
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9316
## Number of communities: 21
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9241
## Number of communities: 23
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 31744
## Number of edges: 1068614
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9178
## Number of communities: 25
## Elapsed time: 3 seconds

Explore resolutions

# 0.1
umap0.1 <- DimPlot(PMI.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE)
umap0.1

# 0.2
umap0.2 <- DimPlot(PMI.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE)
umap0.2

# 0.3
umap0.3 <- DimPlot(PMI.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE)
umap0.3

# 0.4
umap0.4 <- DimPlot(PMI.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE)
umap0.4

Clustering QC

Treatment, sample, phase

# replicate
utreatment <- DimPlot(PMI.unannotated, 
        label = FALSE, 
        group.by = "SCT_snn_res.0.1",
        split.by = "treatment") +
  NoLegend()
utreatment

# replicate
ureplicate <- DimPlot(PMI.unannotated, 
        label = FALSE, 
        group.by = "SCT_snn_res.0.1",
        split.by = "replicate") +
  NoLegend()
ureplicate

# sample
usample <- DimPlot(PMI.unannotated, 
        label = FALSE,
        group.by = "SCT_snn_res.0.1",
        split.by = "sample") +
  NoLegend()
usample

# phase
uphase <- DimPlot(PMI.unannotated, 
        label = FALSE,
        group.by = "SCT_snn_res.0.1",
        split.by = "Phase") +
  NoLegend()
uphase

Percent of cell type markers

PMI.unannotated[["percent.neuron"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("Snap25", "Syt1", "Gad1", "Gad2"))
PMI.unannotated[["percent.astrocytes"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("Clu", "Gfap", "Aqp4"))
PMI.unannotated[["percent.microglia"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("Hexb", "C1qa"))
PMI.unannotated[["percent.DAM"]] <-
  PercentageFeatureSet(PMI.unannotated,
                       features = c("Apoe", "Cst7", "Lyz2", "Lpl", "Cd9", "Trem2"))
PMI.unannotated[["percent.homeostatic"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("P2ry12", "Cx3cr1"))
PMI.unannotated[["percent.oligodendrocytes"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("Plp1", "Mbp", "Mag"))
PMI.unannotated[["percent.OPCs"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("Olig1", "Pdgfra", "Vcan"))
PMI.unannotated[["percent.macrophages"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("F13a1", "Mrc1"))
PMI.unannotated[["percent.endothelial"]] <-
  PercentageFeatureSet(PMI.unannotated, features = c("Igfbp7", "Fn1", "Sox17"))

FeaturePlot(PMI.unannotated, features = "percent.neuron")

FeaturePlot(PMI.unannotated, features = "percent.astrocytes")

FeaturePlot(PMI.unannotated, features = "percent.microglia")

FeaturePlot(PMI.unannotated, features = "percent.DAM")

FeaturePlot(PMI.unannotated, features = "percent.homeostatic")

FeaturePlot(PMI.unannotated, features = "percent.oligodendrocytes")

FeaturePlot(PMI.unannotated, features = "percent.OPCs")

FeaturePlot(PMI.unannotated, features = "percent.macrophages")

FeaturePlot(PMI.unannotated, features = "percent.endothelial")

## Revist QC metrics

# nCount
fnCount <- FeaturePlot(PMI.unannotated, features = "nCount_RNA", order = TRUE)
fnCount

# nFeature
fnFeature <- FeaturePlot(PMI.unannotated, features = "nFeature_RNA", order = TRUE)
fnFeature

# percent.mt
fpercent.mt <- FeaturePlot(PMI.unannotated, features = "percent.mt", order = TRUE)
fpercent.mt

# cell.complexity
fcell.complexity <- FeaturePlot(PMI.unannotated, features = "cell.complexity", order = TRUE)
fcell.complexity

fS.Score <- FeaturePlot(PMI.unannotated, features = "S.Score", order = TRUE)
fS.Score

fG2M.Score <- FeaturePlot(PMI.unannotated, features = "G2M.Score", order = TRUE)
fG2M.Score

FeaturePlots split by sample

if order is set to TRUE this means the cells are in order of expression. Can be useful if cells expressing given feature are getting buried.

# cell information
fnCount <-
  FeaturePlot(
    PMI.unannotated,
    features = "nCount_RNA",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fnCount

fnFeature <-
  FeaturePlot(
    PMI.unannotated,
    features = "nFeature_RNA",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fnFeature

fcell.complextity <-
  FeaturePlot(
    PMI.unannotated,
    features = "cell.complexity",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fcell.complextity

fpercent.mt <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.mt",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.mt

fG2M.Score <-
  FeaturePlot(
    PMI.unannotated,
    features = "G2M.Score",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fG2M.Score

fS.Score <-
  FeaturePlot(
    PMI.unannotated,
    features = "S.Score",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fS.Score

# Gene sets
fpercent.neuron <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.neuron",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.neuron

fpercent.astrocytes <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.astrocytes",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.astrocytes

fpercent.microglia <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.microglia",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.microglia

fpercent.DAM <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.DAM",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.DAM

fpercent.homeostatic <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.homeostatic",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.homeostatic

fpercent.oligodendrocytes <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.oligodendrocytes",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.oligodendrocytes

fpercent.OPCs <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.OPCs",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.OPCs

fpercent.macrophages <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.macrophages",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.macrophages

fpercent.endothelial <-
  FeaturePlot(
    PMI.unannotated,
    features = "percent.endothelial",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fpercent.endothelial

# Genes
fMalat1 <-
  FeaturePlot(
    PMI.unannotated,
    features = "Malat1",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fMalat1

fXist <-
  FeaturePlot(
    PMI.unannotated,
    features = "Xist",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fXist

fTtr <-
  FeaturePlot(
    PMI.unannotated,
    features = "Ttr",
    split.by = "sample",
    keep.scale = "all",
    order = TRUE
  )
fTtr

save plotes

Percent cells per cluster

PMI.unannotated@meta.data$seurat_clusters <- 
  PMI.unannotated@meta.data$SCT_snn_res.0.1

# treatment
btreatment <- PMI.unannotated@meta.data %>%
  group_by(seurat_clusters, treatment) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=treatment)) +
  geom_col() +
  scale_fill_manual(values = treatment_colors) +
  ggtitle("Percentage of treatment per cluster")
btreatment

# replicate
breplicate <- PMI.unannotated@meta.data %>%
  group_by(seurat_clusters, replicate) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=replicate)) +
  geom_col() +
  scale_fill_manual(values = replicate_colors) +
  ggtitle("Percentage of replicate per cluster")
breplicate

# genotype
bgenotype <- PMI.unannotated@meta.data %>%
  group_by(seurat_clusters, genotype) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=genotype)) +
  geom_col() +
  scale_fill_manual(values = genotype_colors) +
  ggtitle("Percentage of genotype per cluster")
bgenotype

# sample
bsample <- PMI.unannotated@meta.data %>%
  group_by(seurat_clusters, sample) %>%
  dplyr::count() %>%
  group_by(seurat_clusters) %>%
  dplyr::mutate(percent = 100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
  geom_col() +
  scale_fill_manual(values = sample_colors) +
  ggtitle("Percentage of sample per cluster")
bsample

Number cells per cluster

treatment_ncells <- FetchData(PMI.unannotated, 
                     vars = c("ident", "treatment")) %>%
  dplyr::count(ident, treatment) %>%
  tidyr::spread(ident, n)
write.table(treatment_ncells, 
            paste0("../../results/", pathToTestType, "nCells/",
                   treatment, "_",tolower(tissue),
                   "_cells_per_cluster_treatment.txt"),
            quote = FALSE, sep = "\t")

sample_ncells <- FetchData(PMI.unannotated, 
                     vars = c("ident", "sample")) %>%
  dplyr::count(ident,sample) %>%
  tidyr::spread(ident, n)
write.table(sample_ncells, 
            paste0("../../results/", pathToTestType, "nCells/",
                   treatment, "_",tolower(tissue),
                   "_cells_per_cluster_sample.txt"),
            quote = FALSE, sep = "\t")

PMI - cell replicate 1 only

Cadiz, Todd, Olney

04/24/2022

Single cell replicate 1 group only

Set working directory

Load libraries

Set variables and thresholds

Save functions

Setup seurat object

Pre-filtering QC

Number of cells

Density plots

Violin plots

Scatter plots

Filtering

Cell-level filtering

Gene-level filtering

Doublet filtering

Mitochondrial gene filtering

Histogram

All samples together

Split by sample

Number of cells

Density plots

Violin plots

Scatter plots

Box plot

Top transcripts

Next steps

Cell cycle

Bar graph

Pie chart

Top variable genes

Mean-variance

PCA

SCTransform

Integration

Run harmony

Plot PCA

Check output

Top variable features

PCA plot

Find significant PCs

Elbow plot

UMAP

Cluster the cells

Explore resolutions

Clustering QC

Treatment, sample, phase

Percent of cell type markers

FeaturePlots split by sample

Percent cells per cluster

Number cells per cluster