<div class="m_Page m_layout-narrow">
  <div class="m_Wrap">
    <div class="m_Sidebar">
      <div class="m_SidebarItems" id="m_toc">
        <ul>
        <li><a href="#m__purpose" rel="noreferrer">1 - Purpose</a></li>
        <li><a href="#m__load-libraries-functions-and-themes" rel="noreferrer">2 - Load libraries, functions, and themes</a>
        <ul>
        <li><a href="#m__libraries" rel="noreferrer">2.1 - Libraries</a></li>
        <li><a href="#m__functions" rel="noreferrer">2.2 - Functions</a></li>
        <li><a href="#m__themes" rel="noreferrer">2.3 - Themes</a></li>
        </ul></li>
        <li><a href="#m__load-data" rel="noreferrer">3 - Load data</a>
        <ul>
        <li><a href="#m__expression-and-metadata" rel="noreferrer">3.1 - Expression and metadata</a>
        <ul>
        <li><a href="#m__setup-the-colors-to-be-used" rel="noreferrer">3.1.1 - Setup the colors to be used</a></li>
        </ul></li>
        </ul></li>
        <li><a href="#m__run-quality-control-qc-steps" rel="noreferrer">4 - Run quality control (QC) steps</a>
        <ul>
        <li><a href="#m__sample-level-qc" rel="noreferrer">4.1 - Sample level QC</a>
        <ul>
        <li><a href="#m__estimate-the-abundance-of-dead-cells" rel="noreferrer">4.1.1 - Estimate the abundance of dead cells</a></li>
        <li><a href="#m__examine-distributions" rel="noreferrer">4.1.2 - Examine distributions</a></li>
        <li><a href="#m__test-modality-of-features" rel="noreferrer">4.1.3 - Test modality of features</a></li>
        <li><a href="#m__inspect-the-modality" rel="noreferrer">4.1.3.1 - Inspect the modality</a></li>
        <li><a href="#m__mean-and-mad-of-features" rel="noreferrer">4.1.4 - Mean and MAD of features</a></li>
        <li><a href="#m__identify-sample-level-outliers" rel="noreferrer">4.1.5 - Identify sample-level outliers</a></li>
        </ul></li>
        <li><a href="#m__cell-level-qc" rel="noreferrer">4.2 - Cell level QC</a>
        <ul>
        <li><a href="#m__doublet-detection" rel="noreferrer">4.2.1 - Doublet detection</a>
        <ul>
        <li><a href="#m__plot-density-of-doublet-scores" rel="noreferrer">4.2.1.1 - Plot density of doublet scores</a></li>
        <li><a href="#m__plot-density-of-doublet-scores-sample-level" rel="noreferrer">4.2.1.2 - Plot density of doublet scores (sample-level)</a></li>
        <li><a href="#m__plot-boxplot-doublet-scores" rel="noreferrer">4.2.1.3 - Plot boxplot doublet scores</a></li>
        <li><a href="#m__call-doublets" rel="noreferrer">4.2.1.4 - Call doublets</a></li>
        </ul></li>
        <li><a href="#m__decontamination" rel="noreferrer">4.2.2 - Decontamination</a>
        <ul>
        <li><a href="#m__plot-density-of-contamination-score" rel="noreferrer">4.2.2.1 - Plot density of contamination score</a></li>
        <li><a href="#m__plot-density-of-contaminations-core-sample-level" rel="noreferrer">4.2.2.2 - Plot density of contaminations core (sample-level)</a></li>
        <li><a href="#m__plot-boxplot-of-contamination" rel="noreferrer">4.2.2.3 - Plot boxplot of contamination</a></li>
        <li><a href="#m__call-ambient-rna-contamination" rel="noreferrer">4.2.2.4 Call ambient RNA contamination</a></li>
        </ul></li>
        <li><a href="#m__establish-ncount_rna-nfeature_rna-and-mt.percent-acceptable-levels" rel="noreferrer">4.2.3 - Establish nCount_RNA, nFeature_RNA, and mt.percent acceptable levels</a></li>
        <li><a href="#m__flag-cells-that-should-be-removed" rel="noreferrer">4.2.4 - Flag cells that should be removed</a>
        <ul>
        <li><a href="#m__add-flag-to-metadata" rel="noreferrer">4.2.4.1 - Add “flag” to metadata</a></li>
        <li><a href="#m__visualize-the-flagged-cells-and-their-cutoffs" rel="noreferrer">4.2.4.2 - Visualize the flagged cells and their cutoffs</a></li>
        <li><a href="#m__remove-flagged-cells-optional" rel="noreferrer">4.2.4.3 - Remove flagged cells (optional)</a></li>
        <li><a href="#m__check-filtering-wscatterplots" rel="noreferrer">4.2.3.4 - Check filtering w/scatterplots</a></li>
        <li><a href="#m__density-plots" rel="noreferrer">4.8.3.2 - Density plots</a></li>
        <li><a href="#m__boxplot-after-filtering" rel="noreferrer">4.8.3.3 - Boxplot after filtering</a></li>
        </ul></li>
        </ul></li>
        </ul></li>
        <li><a href="#m__normalize-and-scale-data" rel="noreferrer">5 - Normalize and scale data</a>
        <ul>
        <li><a href="#m__density-plot-beforeafter-normalizationscaling" rel="noreferrer">5.1 - Density plot before/after normalization/scaling</a></li>
        <li><a href="#m__boxplot-plot-beforeafter-normalizationscaling" rel="noreferrer">5.2 - Boxplot plot before/after normalization/scaling</a></li>
        </ul></li>
        <li><a href="#m__cell-cycle-score" rel="noreferrer">6 - Cell Cycle score</a>
        <ul>
        <li><a href="#m__proportion-of-cells-per-sample-in-each-phase" rel="noreferrer">6.1 - Proportion of cells per sample in each phase</a></li>
        <li><a href="#m__barplot-cell-cycle-phase" rel="noreferrer">6.2 - Barplot cell cycle phase</a></li>
        <li><a href="#m__cell-cycle-regression" rel="noreferrer">6.3 - Cell cycle regression</a></li>
        </ul></li>
        <li><a href="#m__pca" rel="noreferrer">7 - PCA</a>
        <ul>
        <li><a href="#m__plot-elbow-of-pcs" rel="noreferrer">7.1 - Plot Elbow of PCs</a></li>
        <li><a href="#m__dimension-loadings" rel="noreferrer">7.2 - Dimension loadings</a></li>
        <li><a href="#m__interogate-pcs" rel="noreferrer">7.3 - Interogate PCs</a></li>
        </ul></li>
        <li><a href="#m__clustering-umap" rel="noreferrer">8 - Clustering &amp; UMAP</a></li>
        <li><a href="#m__investigate-nfeatures_rna" rel="noreferrer">9 - Investigate nFeatures_RNA</a></li>
        <li><a href="#m__final-conclusions" rel="noreferrer">10 - Final conclusions</a></li>
        <li><a href="#m__section" rel="noreferrer"></a></li>
        <li><a href="#m__r-session-information" rel="noreferrer">R Session Information</a></li>
        </ul>
      </div>
      <div class="m_InjectedComponents"><div class="m_dark-theme-toggler"><div class="m_toggle"><div class="m_toggle-track"><div class="m_toggle-track-check"><img role="presentation" width="16" height="16"></div> <div class="m_toggle-track-x"><img role="presentation" width="16" height="16"></div></div> <div class="m_toggle-thumb"></div></div> <input type="checkbox" aria-label="Switch between Dark and Default theme" class="m_toggler-screen-reader-only"></div></div>
    </div>
    <div class="m_Main">
      <div class="m_Content" id="m_content"> 

  <div class="m_btn-group m_pull-right">
 <button type="button" class="m_btn m_btn-default m_btn-xs m_dropdown-toggle" aria-haspopup="true" aria-expanded="false"><span>Code</span> <span class="m_caret"></span></button>
 <ul class="m_dropdown-menu" style="min-width:50px">
            <li><a id="m_rmd-show-all-code" href="#m__" rel="noreferrer">Show All Code</a></li>
    <li><a id="m_rmd-hide-all-code" href="#m__" rel="noreferrer">Hide All Code</a></li>
                         </ul>

  <h1 class="m_title">Getting to know BioSkryb’s scRNA-seq data</h1>
  
  <p class="m_authors">
       <span class="m_glyphicon m_glyphicon-user"></span> Vincent Perez, Ph.D | QuantBio, LLC; Jen Modliszewski, Ph.D | QuantBio, LLC
  </p>
     <p class="m_date"><span class="m_glyphicon m_glyphicon-calendar"></span> 06 April 2022</p>

1 - Purpose

This script is intended to assess BioSkryb’s single cell RNA-sequencing datasets. The following will be assessed: [1] - Building the seurat object; [2] - Running quality control at the cellular and sample level both. This include 2.1 - estimating dead cells; 2.2 - examining distibution of gene/UMI counts, as well as %MT genes; 2.3 - testing modality of gene/UMI counts and %MT genes; 2.4 - identifying samples w/outlying gene/UMI counts and %MT genes; 2.5 - identifying doublets present; 2.6 - identifying cells with a high abundance of ambient RNA (contamination); 2.7 - and flagging cells w/outlying gene/UMI counts and %MT genes, as well having been detected as doublets or having a high abundance of contaminating RNA.; [3] - Normalizing and scaling counts; [4] - Running PCA on scaled counts; [5] - Cluster identifying and UMAP calculation; [6] - Identifying confounding variables.

2 - Load libraries, functions, and themes

2.1 - Libraries

All libraries are required for this analysis.

begin <- Sys.time()
library(ggplot2)
library(data.table)
library(plyr)
library(FactoMineR)
library(DESeq2)
library(dplyr)
library(BiocParallel)
library(kableExtra)
library(unikn)
library(nord)
library(ggpointdensity)
library(Seurat)
library(microViz)
library(scds)
library(celda)
library(ggridges)
library(paletteer)

2.2 - Functions

These function are required to assess confounding variables in the metadata.

anova_PCA <- function(obj_name, pca.df, metadata, merge_ID, cat_var, numPCs) {
  names(cat_var) <- cat_var
  ind.pca.df <- as.data.frame(pca.df[["ind"]][["coord"]][,1:numPCs])
  ind.pca.df[[merge_ID]] <- rownames(ind.pca.df)
  ind.pca.wcat.df <- merge(ind.pca.df, metadata, by=merge_ID, all.x=TRUE, all.y=FALSE)
  endCol <- length(names(ind.pca.df))
  ind.pca.an <- ind.pca.wcat.df[,2:endCol]
  anovaPvalueList <- lapply(cat_var, function(f) apply(ind.pca.an, 2, function(x) anova(aov(x ~ as.factor(ind.pca.wcat.df[[f]])))$`Pr(>F)`[1]))
  res.pval <- plyr::ldply(anovaPvalueList, .id="Factor")
  setnames(res.pval, names(res.pval), c("Factor", paste0("Dim", seq(1:numPCs),  "_pvalue")))
  endCol2 <- numPCs+1
  if (length(cat_var) == 1) {
    res.padj <- data.frame(t(apply(res.pval[,2:endCol2], 2, function(x) p.adjust(x, method="BH"))))
  } else {
    res.padj <- data.frame(apply(res.pval[,2:endCol2], 2, function(x) p.adjust(x, method="BH")))
  }
  names(res.padj) <- gsub("_pvalue", "_adj_pvalue", names(res.padj))
  res.padj$Factor <- res.pval$Factor
  res_all.df <- list(res.pval, res.padj) %>% purrr::reduce(full_join, by="Factor")
  res_all_ord.df <- res_all.df[,order(colnames(res_all.df))]
  return(res_all_ord.df)
}

glm_PCA <- function(obj_name, pca.list, metadata, merge_ID, num_var, numPCs=10) {
  names(num_var) <- num_var
  ind.pca.df <- as.data.frame(pca.list[["ind"]][["coord"]][,1:numPCs])
  ind.pca.df[[merge_ID]] <- rownames(ind.pca.df)
  ind.pca.wnum.df <- merge(ind.pca.df, metadata, by=merge_ID, all.x=TRUE, all.y=FALSE)
  endCol <- length(names(ind.pca.df))
  ind.pca.an <- ind.pca.wnum.df[,2:endCol]
  glmPvalueList <-lapply(num_var, function(f) apply(ind.pca.an, 2, function(x) summary(glm(x ~ as.numeric(ind.pca.wnum.df[[f]])))$coefficients[2,4]))
  res.df <- plyr::ldply(glmPvalueList, .id="Factor")
  setnames(res.df, names(res.df), c("Factor", paste0("Dim", seq(1:numPCs),  "_pvalue")))
  endCol2 <- numPCs+1
  if (length(num_var) == 1) {
    res.padj <- data.frame(t(apply(res.df[,2:endCol2], 2, function(x) p.adjust(x, method="BH"))))
  } else {
    res.padj <- data.frame(apply(res.df[,2:endCol2], 2, function(x) p.adjust(x, method="BH")))
  }
  names(res.padj) <- gsub("_pvalue", "_adj_pvalue", names(res.padj))
  res.padj$Factor <- res.df$Factor
  setcolorder(res.padj, "Factor")
  res_all.df <- list(res.df, res.padj) %>% purrr::reduce(full_join, by="Factor")
  res_all_ord.df <- res_all.df[,order(colnames(res_all.df))]
  return(res_all_ord.df)
}

2.3 - Themes

These themes are generic to ensure all Bioskryb’s figures have the same theme.

theme_bioskryb <- function(){
  theme_bw(base_size=12)+
    theme(axis.text=element_text(color="black"),
          panel.background=element_rect(color="white"),
          strip.text = element_text(size=12),
          strip.background = element_rect(fill="white"))
}

theme_bioskryb_90 <- function() {
  theme_bw(base_size=12)+
    theme(axis.text.x=element_text(angle=90,hjust = 1,vjust = 0.5),
          axis.text=element_text(color="black"),
          panel.background=element_rect(color="black"),
          strip.text = element_text(size=14),
          strip.background = element_rect(fill="white"))
}

3 - Load data

3.1 - Expression and metadata

Here we load the data and create the seurat object. Note, the gene symbols have several (>20k) that are blank because the ensembl ID has no associated gene symbols. However, gene symbols are required to identify mitochondrial gene contamination and inferring cell cycle. So the blanks get renamed as “Unknown” and made unique.

meta<-as.data.frame(fread(file="df_metadata_quantbio.tsv"))
genes<-as.data.frame(fread(file="df_metadata_gene_id_to_gene_symbol.tsv"))
exp<-as.data.frame(fread(file="tab_quantbio_nreads_all.tsv"))
exp100k<-as.data.frame(fread(file="tab_quantbio_nreads_100kpairs.tsv"))

### Clean the data up a little
rownames(meta)<-meta$SampleId
rownames(exp)<-exp$V1
exp$V1<-NULL
rownames(exp100k)<-exp100k$V1
exp100k$V1<-NULL

### Examine duplicated genes. It seems like there are 19K genes with duplicated
### gene symbols; however, 18,500 of those are duplicated because there is
### no gene symbol present.
dups<-genes[which(duplicated(genes$gene_symbol)),]
blanks<-genes[which(genes$gene_symbol==""),]
print(paste(length(which(genes$gene_symbol=="")),"of",
            length(genes$gene_id), 
            "genes have no associated gene symbol."))

## [1] "20055 of 60664 genes have no associated gene symbol."

### Subset the genes list by those found in exp/exp100k
#identical(rownames(exp), rownames(exp100k))
genes<-genes[genes$gene_id %in% rownames(exp), ]
#identical(genes$gene_id, rownames(exp))

### Add gene symbols to the counts. 
genes$gene_symbol[which(genes$gene_symbol=="")]<-"no_symbol"
exp$gene_symbol<-genes$gene_symbol
exp$gene_symbol[which(exp$gene_symbol=="no_symbol")]<-make.unique(exp$gene_symbol[which(exp$gene_symbol=="no_symbol")])
exp<-aggregate(. ~ gene_symbol, data=exp, FUN=sum)
rownames(exp)<-exp$gene_symbol
exp$gene_symbol<-NULL

### Create seurats
seurat<-CreateSeuratObject(counts=exp, project = "BioSkryb", assay = "RNA",
                           meta.data = meta)
seurat100k<-CreateSeuratObject(counts=exp100k, project = "BioSkryb", assay = "RNA",
                           meta.data = meta)

### Set factor order
order<-unique(seurat@meta.data$CellLine)
seurat@meta.data$CellLine<-factor(seurat@meta.data$CellLine, levels=order)

3.1.1 - Setup the colors to be used

4 - Run quality control (QC) steps

With traditional scRNA-seq (plate or droplet-base), sample-level QC should be performed prior to conducting cell-level QC, so here we start with sample-level QC in section 4.1 then move on to cell-level QC in section 4.2.

4.1 - Sample level QC

4.1.1 - Estimate the abundance of dead cells

The % of mitochondrial genes in a cells usually suggests a high number of dead cells in the samples.

seurat <- PercentageFeatureSet(seurat, pattern = "^MT-|^mt-", col.name = "percent.mt")

summary(seurat@meta.data$percent.mt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5306  2.8801  3.9072  4.8609  5.4257 65.1223

4.1.2 - Examine distributions

Look for samples with skewed, abnormal or bimodal distributions in key variables of gene count (nFeature_RNA), UMI count (nCount_RNA), and % mitochondria genes (mt.percent)

### Gene count
p1 <- RidgePlot(seurat, group.by="SampleID", 
                features="nFeature_RNA", 
                same.y.lims=TRUE, sort=FALSE, log=TRUE) + 
  scale_fill_manual(values=sample.cols) +
  scale_x_continuous(labels = scales::scientific)

### UMI count
p2 <- RidgePlot(seurat, group.by="SampleID", 
                features="nCount_RNA", 
                same.y.lims=TRUE, sort=FALSE, log=TRUE) +
  scale_fill_manual(values=sample.cols) +
  scale_x_continuous(labels = scales::scientific) +
  theme(axis.text.x=element_text(size=9))

### Percent mitochondrial genes
p3 <- RidgePlot(seurat, group.by="SampleID",
                features="percent.mt", y.max=50, sort=FALSE) +
  scale_fill_manual(values=sample.cols) + 
  xlim(0, max(seurat@meta.data$percent.mt)+5)

### Plot them all together
cowplot::plot_grid(p1 + theme(legend.position="none", 
                              axis.title.y=element_blank(),
                              axis.text.x=element_text(angle=90,hjust = 1,vjust = 0.5)),
                   p2 + theme(legend.position="none", 
                              axis.title.y=element_blank(),
                              axis.text.x=element_text(angle=90,hjust = 1,vjust = 0.5)),
                   p3 + theme(legend.position="none", 
                              axis.title.y=element_blank(),
                              axis.text.x=element_text(angle=90,hjust = 1,vjust = 0.5)),
                   align = 'vh',
                   labels = c("A", "B", "C"),
                   hjust = -1,
                   nrow = 1)

4.1.3 - Test modality of features

features <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
names(features) <- features

modetest.list <- BiocParallel::bplapply(features, 
                   function(f) multimode::modetest(seurat@meta.data[[f]]),
                   BPPARAM=SerialParam())

pvalues_logical.list <- lapply(modetest.list, function(mt) mt[["p.value"]] <= 0.05)
                   
lapply(modetest.list, function(t) t)

## $nFeature_RNA
## 
##  Ameijeiras-Alonso et al. (2019) excess mass test
## 
## data:  seurat@meta.data[[f]]
## Excess mass = 0.13776, p-value < 2.2e-16
## alternative hypothesis: true number of modes is greater than 1
## 
## 
## $nCount_RNA
## 
##  Ameijeiras-Alonso et al. (2019) excess mass test
## 
## data:  seurat@meta.data[[f]]
## Excess mass = 0.044221, p-value = 0.034
## alternative hypothesis: true number of modes is greater than 1
## 
## 
## $percent.mt
## 
##  Ameijeiras-Alonso et al. (2019) excess mass test
## 
## data:  seurat@meta.data[[f]]
## Excess mass = 0.030617, p-value = 0.75
## alternative hypothesis: true number of modes is greater than 1

features.test <- features[unlist(pvalues_logical.list)]

modes.list <- BiocParallel::bplapply(features.test, 
                     function(f) multimode::locmodes(seurat@meta.data[[f]],
                                                     mod0=2,
                                                     display=FALSE),
                     BPPARAM=SerialParam())

antimode.list <- lapply(modes.list, 
                        function(m) round(m$locations[[2]], digits=2))

4.1.3.1 - Inspect the modality

If the antimode falls in the valley, it is potentially a good cutoff.

### Gene count
p1 <- ggplot(data=seurat@meta.data, aes(x=nFeature_RNA)) 
p1 <- p1 + geom_density(fill="slateblue4") + labs(x="Number of genes", y="Density")
if (isTRUE(pvalues_logical.list[["nFeature_RNA"]])) {
  p1 <- p1 + geom_vline(xintercept=antimode.list[["nFeature_RNA"]])
  p1 <- p1 + labs(caption=paste("antimode =", antimode.list[["nFeature_RNA"]]))
} else {
  p1 <- p1 + labs(caption="No antimode to display") 
}
p1 <- p1 + theme(plot.caption=element_text(face="italic"))
#s1 <- p1 + facet_wrap(~FlowCell)

### UMI
p2 <- ggplot(data=seurat@meta.data, aes(x=nCount_RNA))
p2 <- p2 + geom_density(fill="magenta4") + labs(x="Number of UMIs") + theme(axis.title.y=element_blank())
if (isTRUE(pvalues_logical.list[["nCount_RNA"]])) {
  p2 <- p2 + geom_vline(xintercept=antimode.list[["nCount_RNA"]])
  p2 <- p2 + labs(caption=paste("antimode =", antimode.list[["nCount_RNA"]]))
} else {
  p2 <- p2 + labs(caption="No antimode to display")
}
p2 <- p2 + theme(plot.caption=element_text(face="italic"))
#s2 <- p2 + facet_wrap(~FlowCell)

### %MT
mt.ul <- max(seurat@meta.data$percent.mt)+2
p3 <- ggplot(data=seurat@meta.data, aes(x=percent.mt))
p3 <- p3 + geom_density(fill="olivedrab3") + labs(x="% mitochondrial genes") + theme(axis.title.y=element_blank())
if (isTRUE(pvalues_logical.list[["percent.mt"]])) {
  p3 <- p3 + geom_vline(xintercept=antimode.list[["percent.mt"]])
  p3 <- p3 + labs(caption=paste("antimode =", antimode.list[["percent.mt"]]))
} else {
  p3 <- p3 + labs(caption="No antimode to display")
}
p3 <- p3 + theme(plot.caption=element_text(face="italic"))
p3 <- p3 +xlim(0, mt.ul)
#s3 <- p3 + facet_wrap(~FlowCell)

cowplot::plot_grid(p1,
                   p2,
                   p3,
                   align = 'vh',
                   labels = c("A", "B", "C"),
                   hjust = -1,
                   nrow = 1)

# cowplot::plot_grid(s1,
#                    s2,
#                    s3,
#                    align = 'vh',
#                    labels = c("A", "B", "C"),
#                    hjust = -1,
#                    nrow = 1)

4.1.4 - Mean and MAD of features

Calculate median, and median absolute deviation (mad) for gene counts, UMI counts, and %MT for entire dataset

median.list <- lapply(features, 
                       function(f) median(seurat@meta.data[[f]], 
                                          na.rm=TRUE))

mad.list <- lapply(features,
                   function(f) mad(seurat@meta.data[[f]],
                                      na.rm=TRUE))

### Print for the report
lapply(features, function(f) median(seurat@meta.data[[f]], 
                                          na.rm=TRUE))

## $nFeature_RNA
## [1] 6978
## 
## $nCount_RNA
## [1] 270603.5
## 
## $percent.mt
## [1] 3.907208

lapply(features,
                   function(f) mad(seurat@meta.data[[f]],
                                      na.rm=TRUE))

## $nFeature_RNA
## [1] 3149.784
## 
## $nCount_RNA
## [1] 180342.7
## 
## $percent.mt
## [1] 1.726394

4.1.5 - Identify sample-level outliers

Calculate median, and median absolute deviation (mad) at the sample level Remove samples that meet the 3 criteria: Gene count median above 75% quantile of entire dataset, UMI count median above 75% quantile of entire dataset, %MT below 25% median of entire dataset.

sample.median.df <- seurat@meta.data %>%
                       group_by(SampleID) %>%
                       summarize(nFeature_RNA_median=median(nFeature_RNA, na.rm=TRUE),
                                 nCount_RNA_median=median(nCount_RNA, na.rm=TRUE),
                                 percent.mt_median=median(percent.mt, na.rm=TRUE))  

### Flag a sample if it meets the following parameters:
sample.median.df$nFeature_RNA_outlr <- ifelse(sample.median.df$nFeature_RNA_median <= median.list[["nFeature_RNA"]] - 3*mad.list[["nFeature_RNA"]] || sample.median.df$nFeature_RNA_median >= median.list[["nFeature_RNA"]] + 3*mad.list[["nFeature_RNA"]],
                                               "FLAG",
                                               "OK")



sample.median.df$nCount_RNA_outlr <- ifelse(sample.median.df$nCount_RNA_median <= median.list[["nCount_RNA"]] - 3*mad.list[["nCount_RNA"]] || sample.median.df$nFeature_RNA_median >= median.list[["nCount_RNA"]] + 3*mad.list[["nCount_RNA"]],
                                               "FLAG",
                                               "OK")

sample.median.df$percent.mt_outlr <- ifelse(sample.median.df$percent.mt_median >= median.list[["percent.mt"]] + 3*mad.list[["percent.mt"]],
                                               "FLAG",
                                               "OK")

### Write to a table
kable(sample.median.df, booktabs = TRUE,table.attr = "style = \"color: black;\"") %>% kable_styling(font_size = 12)

SampleID	nFeature_RNA_median	nCount_RNA_median	percent.mt_median	nFeature_RNA_outlr	nCount_RNA_outlr	percent.mt_outlr
tme_epcamh	3480.0	156315.0	5.359060	OK	OK	OK
tme_epcaml	1835.5	82116.5	4.765005	OK	OK	OK
molm13	7164.0	345293.5	4.014256	OK	OK	OK
tme	8395.0	207120.0	2.929828	OK	OK	OK
Sum159	7550.0	278201.0	3.494873	OK	OK	OK

4.2 - Cell level QC

4.2.1 - Doublet detection

Doublets are identified from each sample individually. Currently, doublet detection is performed via the R package scds. The hybrid score utilized here incorporates two doublet scores: - bcds - artificial doublets are created and a doublet classifier is trained. Cells are assigned a probability of falling into the artificial doublet class. - cxds - expected co-expression of gene-pairs based on frequency is utilized to assign a rank-order based score to all gene-pairs. Scores for co-occuring gene-pairs for each cell are summed.
The hybrid score is derived by summing the normalized bcds and cxdx scores.

# Seurat was the only package that provided workable code for conversion between sce and seurat objects
seurat_dbl <- seurat
seurat_dbl@meta.data$rmv <- ifelse(seurat_dbl@meta.data$nFeature_RNA < 200, "remove", "Keep")
seurat_dbl@meta.data$rmv <- ifelse(seurat_dbl@meta.data$percent.mt >= 20, "remove", seurat_dbl@meta.data$rmv)
#seurat_dbl <- subset(seurat_dbl, subset=`rmv`=="Keep")

samples <- seurat_dbl@meta.data$SampleID
seurat.list <- SplitObject(seurat_dbl, split.by="SampleID")
sce.list <- lapply(seurat.list, 
                   function(s) Seurat::as.SingleCellExperiment(s))
sce.list <- lapply(sce.list, 
                   function(sce) scds::cxds_bcds_hybrid(sce))

## [09:49:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [09:49:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [09:49:51] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [09:49:52] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [09:49:53] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

seurat.list <- lapply(sce.list,
                      function(sce) Seurat::as.Seurat(sce,
                                                      counts = "counts"))
num_seurat_obj <- length(seurat.list)
seurat <- merge(x=seurat.list[[1]], y=seurat.list[2:num_seurat_obj])
hcutoff<-1.5
seurat@meta.data$hybrid_call <- ifelse(seurat@meta.data$hybrid_score >= hcutoff, "eDoublet", "eSinglet")

4.2.1.1 - Plot density of doublet scores

### Plot density of the hybrid scores
p <- ggplot(data=seurat@meta.data, aes(x=hybrid_score))
p <- p + geom_density(fill=dbl.cols[["eDoublet"]], color="#AA005FFF")
p <- p + theme_bioskryb() + labs(x="SCDS doublet score", y="Density") + geom_vline(xintercept=hcutoff)
p

4.2.1.2 - Plot density of doublet scores (sample-level)

dat <- as.data.frame(seurat@meta.data)
dbl_data.df <- data.frame(SampleID=dat$SampleID,
                          hybrid_score=dat$hybrid_score,
                          hybrid_call=dat$hybrid_call)

median.df <- dbl_data.df %>%
              group_by(SampleID) %>%
              summarize(median_h=median(hybrid_score, na.rm=TRUE))
median.dt <- setDT(median.df)

# Get x-axis orders
# setorder(median.dt, median_h)
# h.ord <- median.dt$SampleID
dbl_data.df$SampleID<-factor(dbl_data.df$SampleID, levels=order)

p1 <- ggplot(data=dbl_data.df, aes(x=hybrid_score, y=SampleID, fill=SampleID))
p1 <- p1 + ggridges::geom_density_ridges()
p1 <- p1 + theme_bioskryb() + labs(x="SCDS doublet score")
p1 <- p1 + theme(axis.title.y=element_blank(),
                 legend.position="none")
p1 <- p1 + scale_fill_manual(values=sample.cols)
p1

4.2.1.3 - Plot boxplot doublet scores

A look at the distribution of doublet scores

p1 <- ggplot(data=dbl_data.df, 
             aes(y=hybrid_score, 
                 x=SampleID, 
                 fill=SampleID))
p1 <- p1 + geom_boxplot() + labs(y="scds hybrid score")
p1 <- p1 + theme_bioskryb() #+ scale_x_discrete(limits=h.ord)
p1 <- p1 + theme(axis.text.x=element_text(angle=90),
                 axis.title.x=element_blank(),
                 legend.position="none")
p1 <- p1 + scale_fill_manual(values=sample.cols) + ylab("Doublet score")
p1

4.2.1.4 - Call doublets

dbl_data.df$hybrid_call <- factor(dbl_data.df$hybrid_call, levels=c("eSinglet", "eDoublet"))
p <-  ggplot(data=dbl_data.df, aes(x=SampleID, fill=hybrid_call))
p <- p + geom_bar(color="black") 
p <- p + scale_fill_manual(values=dbl.cols) 
p <- p + theme_bioskryb()
p <- p + theme(axis.title.x = element_blank())
p

4.2.2 - Decontamination

This function is recently incorporated and not completely recommended unless you know what you are doing. The DecontX package is utilized for contaminant detection. A threshold of 0.5 is recommended. DecontX can be run with all samples together (as is done here) or with the raw (unfiltered data) as background.

sce <- as.SingleCellExperiment(seurat)
sce <- celda::decontX(sce)
seurat <- CreateSeuratObject(counts=counts(sce),
                                 meta.data=as.data.frame(colData(sce)))
seurat@meta.data$decontX_contamination <- sce$decontX_contamination
contam_cutoff<-0.5
seurat@meta.data$contam_status <- ifelse(seurat@meta.data$decontX_contamination >= contam_cutoff, # Thresholding is experimental, proceed at your own risk.
                                           "Ambient", 
                                           "Native")

4.2.2.1 - Plot density of contamination score

p <- ggplot(data=seurat@meta.data, aes(x=decontX_contamination))
p <- p + geom_density(fill=dcx.cols[["Ambient"]], color="#62C707FF")
p <- p + theme_bioskryb() + labs(x="Percent contamination", y="Density") + geom_vline(xintercept = contam_cutoff)
p

4.2.2.2 - Plot density of contaminations core (sample-level)

dat <- as.data.frame(seurat@meta.data)

median.df <- dat %>%
              group_by(SampleID) %>%
              summarize(median_X=median(decontX_contamination, na.rm=TRUE))
median.dt <- setDT(median.df)

# Get x-axis orders
#setorder(median.dt, median_X)
#X.ord <- median.dt$SampleID
dat$SampleID<-factor(dat$SampleID, levels=order)

p1 <- ggplot(data=dat, aes(x=decontX_contamination, y=SampleID, fill=SampleID))
p1 <- p1 + ggridges::geom_density_ridges()
p1 <- p1 + labs(x="Percent contamination")
p1 <- p1 + theme_bioskryb()
p1 <- p1 + theme(axis.title.y=element_blank(),
                 legend.position="none")
p1 <- p1 + scale_fill_manual(values=sample.cols)
p1

4.2.2.3 - Plot boxplot of contamination

A look at the distribution of decontamination scores. The score ranges from 0 to 1 and refers to the percentage of cells that are contaminated.

dat <- as.data.frame(seurat@meta.data)

median.df <- dat %>%
              group_by(SampleID) %>%
              summarize(median_X=median(decontX_contamination, na.rm=TRUE))
median.dt <- setDT(median.df)

# Get x-axis orders
# setorder(median.dt, median_X)
# X.ord <- median.dt$SampleID
dat$SampleID<-factor(dat$SampleID, levels=order)

p1 <- ggplot(data=dat, aes(y=decontX_contamination, x=SampleID, fill=SampleID))
p1 <- p1 + geom_boxplot()
p1 <- p1 + labs(y="Percent contamination")
p1 <- p1 + theme_bioskryb() #+ scale_x_discrete(limits=X.ord)
p1 <- p1 + theme(axis.text.x=element_text(angle=90),
                 axis.title.x=element_blank(),
                 legend.position="none")
p1 <- p1 + scale_fill_manual(values=sample.cols)
p1

4.2.2.4 Call ambient RNA contamination

p <-  ggplot(data=dat, aes(x=SampleID, fill=contam_status))
p <- p + labs(y="Contaminated cell count")
p <- p + geom_bar(color="black") #+ scale_x_discrete(limits=X.ord)
p <- p + scale_fill_manual(values=dcx.cols) 
p <- p + theme_bioskryb()
p <- p + theme(axis.title.x = element_blank()) + guides(fill=guide_legend(title="Contamination status"))
p

4.2.3 - Establish nCount_RNA, nFeature_RNA, and mt.percent acceptable levels

We calculate the 3*MAD of nCount_RNA, nFeature_RNA, and mt.percent to assess whether a cell is an outlier.Lower limites of nFeature and nCount are hard cut-offs of 500 and 1,000.

### Lower limit of acceptable gene counts before  a cell is removed
nFeature_RNA.ll <- 500

### Lower limit of acceptable UMI counts before  a cell is removed
nCount_RNA.ll <-  1000

### Upper limit of acceptable gene counts before  a cell is removed
nFeature_RNA.ul <-  round(median.list[["nFeature_RNA"]]+3*mad.list[["nFeature_RNA"]],
                            digits=0)

### Upper limit of acceptable UMI counts before  a cell is removed
percent.mt.ul <- round(median.list[["percent.mt"]]+3*mad.list[["percent.mt"]],
                         digits=0)

### Upper limit of acceptable % mitochondria genes before  a cell is removed
nCount_RNA.ul <-  round(median.list[["nCount_RNA"]]+3*mad.list[["nCount_RNA"]],
                          digits=0)

4.2.4 - Flag cells that should be removed

Now that doublet scores, contamination scores, and nFeature_RNA/nCount_RNA/percent.mt cut-offs have been calculated and added to the metadata,we can begin flagging cells for removal. Here we will remove cells that meet cut-offs for hybrid_call, contam_status, nFeature_RNA, nCount_RNA, and mt.percent.

4.2.4.1 - Add “flag” to metadata

Create metadata variable and plot filtered vs non-filtered cells.
If doublet detection was performed (and data will be downsampled), the cells are already downsampled to 80k and filtered to exclude percent mitochondrial genes above 20% and number of genes below 200.

### nFeature_RNA
seurat@meta.data$Filter <- ifelse(seurat@meta.data$nFeature_RNA < nFeature_RNA.ll | 
                                    seurat@meta.data$nFeature_RNA > nFeature_RNA.ul,
                                  "nFeature",
                                  "")
### Percent mt
seurat@meta.data$Filter <- ifelse(seurat@meta.data$percent.mt > percent.mt.ul,
                                  paste(seurat@meta.data$Filter,"percent.mt"),
                                  seurat@meta.data$Filter)


### nCount_RNA
seurat@meta.data$Filter <- ifelse(seurat@meta.data$nCount_RNA < nCount_RNA.ll | 
                                    seurat@meta.data$nCount_RNA > nCount_RNA.ul,
                                      paste(seurat@meta.data$Filter,"nCount"), 
                                  seurat@meta.data$Filter)


### Doublet status
seurat@meta.data$Filter <-  ifelse(seurat@meta.data$hybrid_call=="eDoublet",
                                   paste(seurat@meta.data$Filter,"doublet"), 
                                   seurat@meta.data$Filter)
 

### Contamination status
seurat@meta.data$Filter <-  ifelse(seurat@meta.data$contam_status=="Ambient",
                                   paste(seurat@meta.data$Filter,"contamination"), 
                                   seurat@meta.data$Filter)

### Clean it up a little
seurat@meta.data$Filter[which(seurat@meta.data$Filter=="")]<-"Keep"
seurat@meta.data$Filter[which(seurat@meta.data$Filter=="NA contamination")]<-"contamination"

### Doublet and contamination algorithms did not calculate anything for the
### two TME cells, so its listed as NA; however, they do not meet the
### nCount/nFeature/percent.mt cut-offs, so I'll keep them.
seurat@meta.data$Filter[which(is.na(seurat@meta.data$Filter))]<-"Keep"

### Summarize
summary(as.factor(seurat@meta.data$Filter))

##             doublet              nCount      nCount doublet          percent.mt 
##                   9                  27                   1                  12 
##   percent.mt nCount                Keep            nFeature     nFeature nCount 
##                   1                 140                  47                   2 
## nFeature percent.mt 
##                   1

4.2.4.2 - Visualize the flagged cells and their cutoffs

keep_remove.cols <-  paletteer_d("pals::alphabet",n=length(unique(seurat@meta.data$Filter)))

p1 <- ggplot(data=seurat@meta.data, aes(x=nCount_RNA, y=nFeature_RNA, color=Filter))
p1 <- p1 + geom_point() + geom_hline(yintercept=nFeature_RNA.ll) + geom_hline(yintercept=nFeature_RNA.ul) + geom_vline(xintercept=nCount_RNA.ul)
p1 <- p1 + theme_bioskryb_90()
p1 <- p1 + scale_color_manual(values=keep_remove.cols)

p2 <- ggplot(data=seurat@meta.data, aes(x=percent.mt, y=nFeature_RNA, color=Filter))
p2 <- p2 + geom_point() + geom_hline(yintercept=nFeature_RNA.ll) + geom_hline(yintercept=nFeature_RNA.ul) + geom_vline(xintercept = percent.mt.ul)
p2 <- p2 + theme_bioskryb_90()
p2 <- p2 + scale_color_manual(values=keep_remove.cols)

p3 <- ggplot(data=seurat@meta.data, aes(x=percent.mt, y=nCount_RNA, color=Filter))
p3 <- p3 + geom_point() + geom_hline(yintercept=nCount_RNA.ul) + geom_hline(yintercept=nCount_RNA.ll) + geom_vline(xintercept = percent.mt.ul)
p3 <- p3 + theme_bioskryb_90()
p3 <- p3 + scale_color_manual(values=keep_remove.cols)

legend <- cowplot::get_legend(p1 + theme(legend.box.margin = margin(0, 0, 0, 12)))

prow <- cowplot::plot_grid(p1 + theme(legend.position="none",
                                      ),
                           p2 + theme(legend.position="none"),
                           p3 + theme(legend.position="none"),
                           align = 'vh',
                           labels = c("A", "B", "C"),
                           hjust = -1,
                           nrow = 1)
cowplot::plot_grid(prow, legend, rel_widths = c(4, .6))

4.2.4.3 - Remove flagged cells (optional)

This step is skipped until further direction is given by Bioskryb

#seurat <- subset(seurat, subset=`Filter`=="Keep")

4.2.3.4 - Check filtering w/scatterplots

This is commented out for now, until BioSkryb instructs for us to remove them.

4.8.3.2 - Density plots

This is commented out for now, until BioSkryb instructs for us to remove them.

4.8.3.3 - Boxplot after filtering

This is commented out for now, until BioSkryb instructs for us to remove them.

5 - Normalize and scale data

By default, Seurat implements a global-scaling normalization method “LogNormalize”. The definition is as follows: Feature (gene) counts for each cell are divided by the total counts (UMI) for that cell and multiplied by the scale.factor. This is then natural-log transformed using log1p. Only standard transformation is recommended at this time, due to statistical suitability and computational efficiency. SCTransformation is possible but has a very long run time.

num_cells <- ncol(seurat)
num_genes <- nrow(seurat)

5.1 - Density plot before/after normalization/scaling

For the next 2 plots (density and box), we’ll only plot 12 samples to get an idea of how normalization impacts the data. Samples 149:151, as well as 191:193 are samples from flow cells Al1 and Al2, respectively, which have high gene counts (nFeature_RNA) overall.

The seurat dataset used for this analysis has:
- 240 cells
- 56334 genes

### Examine distribution of genes in the first 10 samples before normalizing
beforenorm10<-as.data.frame(seurat@assays$RNA@counts[,c(1:6,100:102,142:144)])
mbeforenorm10<-melt(beforenorm10)

p<-ggplot(data=mbeforenorm10,
       aes(x=value,
           y=variable,
           fill=variable))+
  geom_density_ridges(quantile_lines=TRUE, quantiles=4) +
  theme_bioskryb() +
  ylab("Samples") +
  xlab("") +
  ggtitle("Before normalization") +
  scale_fill_discrete(name = "Samples") 
p2 <- p + xlim(c(1,150)) + ggtitle("Before normalization (shortened x-axis)")

#std_prefix <- "RNA_snn_res."
### [1] Normalize
seurat <- NormalizeData(seurat, 
                        normalization.method = "LogNormalize", 
                        scale.factor = 10000)

### Examine distribution of genes in the first 10 samples after normalizing
afternorm10<-as.data.frame(seurat@assays$RNA@data[,c(1:6,100:102,142:144)])
mafternorm10<-melt(afternorm10)
p3<-ggplot(data=mafternorm10,
       aes(x=value,
           y=variable,
           fill=variable))+
  geom_density_ridges(quantile_lines=TRUE, quantiles=4) +
  theme_bioskryb() +
  ylab("Samples") +
  xlab("Expression") +
  ggtitle("After normalization") +
  scale_fill_discrete(name = "Samples")
p4<-p3+xlim(c(1,15)) + ggtitle("After normalization (shortened x-axis)")

### [2] Find variable features 
seurat <- FindVariableFeatures(object=seurat, 
                               selection.method = "vst", 
                               nfeatures = 2000)

### [2.1] Scale the data
seurat <- ScaleData(object = seurat,
                        vars.to.regress=NULL,
                        verbose = TRUE)

### Plot
prow <- cowplot::plot_grid(p + theme(legend.position="none"),
                           p2 + theme(legend.position="none"),
                           p3 + theme(legend.position="none"),
                           p4 + theme(legend.position="none"),
                           align = 'vh',
                           labels = c("A", "B", "C","D"),
                           hjust = -1,
                           nrow = 2)
cowplot::plot_grid(prow, rel_widths = c(4, .6))

5.2 - Boxplot plot before/after normalization/scaling

p1<-ggplot(data=mbeforenorm10,
       aes(x=variable,
           y=value,
           fill=variable))+
  geom_boxplot() +
  theme_bioskryb_90() +
  ylab("Gene expression") +
  xlab("") +
  ggtitle("Before normalization") +
  scale_fill_discrete(name = "") 
p2 <- p1 + ylim(c(1,100)) + ggtitle("Before normalization (shortened y-axis)")

p3<-ggplot(data=mafternorm10,
     aes(x=variable,
         y=value,
         fill=variable))+
geom_boxplot() +
theme_bioskryb_90() +
ylab("Gene expression") +
xlab("Cells (only showing 12)") +
ggtitle("After normalization") +
scale_fill_discrete(name = "") 
p4 <- p3 + ylim(c(1,5)) + ggtitle("After normalization (shortened y-axis)")


### Put together cowplot
prow <- cowplot::plot_grid(p + theme(legend.position="none"),
                           p2 + theme(legend.position="none"),
                           p3 + theme(legend.position="none"),
                           p4 + theme(legend.position="none"),
                           align = 'vh',
                           labels = c("A", "B", "C","D"),
                           hjust = -1,
                           nrow = 2)
cowplot::plot_grid(prow, rel_widths = c(4, .6))

6 - Cell Cycle score

Cell cycle is often a confounding variable, so we will annotate the metadata with the suggested phase of each cell.

### [1] Create G2M marker gene list (i.e. genes associated with the G2M phase of the 
### cell cycle using Seurat's built-in cc.genes (cell cycle) genes list
g2m.genes <- cc.genes$g2m.genes
s.genes <- cc.genes$s.genes
g2m <- rownames(seurat)[rownames(seurat) %in% g2m.genes]

### [2] Calculate G2M marker module score.
seurat <- CellCycleScoring(seurat, 
                           s.features = s.genes, ### Genes associated with s-phase
                           g2m.features = g2m.genes, ### Genes associated with G2M phase
                           set.ident = TRUE)

6.1 - Proportion of cells per sample in each phase

tab<-prop.table(table(seurat@meta.data$Phase, seurat@meta.data$SampleID),2)

nr <- length(levels(as.factor(seurat@meta.data$Phase)))
nc <- length(levels(as.factor(seurat@meta.data$SampleID)))

### Plot
ComplexHeatmap::Heatmap(tab,
                        cluster_columns = TRUE,
                        show_column_dend = TRUE,
                        row_names_gp = grid::gpar(fontsize = 12),
                        column_names_gp = grid::gpar(fontsize=12),
                        cluster_rows = TRUE,
                        show_row_dend = FALSE,
                        col = heatmap.colfun,
                        height = unit(5, "mm")*nr,
                        width = unit(5, "mm")*nc,
                        name="Proportion")

6.2 - Barplot cell cycle phase

cell_cycle.cols <- c("#01617E","#8B9934","#E04883")
names(cell_cycle.cols) <- levels(as.factor(seurat@meta.data$Phase))
p <- ggplot(data=seurat@meta.data, aes(x=SampleID, fill=Phase))
p <- p + geom_bar(color="black")
p <- p + theme_bioskryb_90()
p <- p + scale_fill_manual("", values=cell_cycle.cols)
p <- p + labs(y="Count")
p <- p + theme(axis.title.x=element_blank(),
               axis.text=element_text(size=9),
               legend.text=element_text(size=9),
               legend.title=element_text(size=9),
               axis.title.y=element_text(size=10),
               strip.text=element_text(size=10),
               legend.position="bottom")
p

6.3 - Cell cycle regression

If cell cycle was previously identified as having an effect on the clustering and the differences are not biological, the difference between G2M and S phase scores will be calculated and used as the variable to regress. For most cases, it is not recommended to regress out cell cycle. Other variables will also be regressed out at this time.

seurat@meta.data$cc_dif <- seurat@meta.data$S.Score - seurat@meta.data$G2M.Score

#var_regress <- gsub("cc", "cc_dif", params$var_regress)

#seurat <- ScaleData(object = seurat,
#                        vars.to.regress=var_regress,
#                        verbose = TRUE)

7 - PCA

7.1 - Plot Elbow of PCs

seurat <- RunPCA(seurat)

#The elbow plot gives us an idea of how many PCs will be significant.  When the plot plateaus, that is the point of diminishing returns.
dat <- data.frame(`Standard_deviation`=seurat@reductions[["pca"]]@stdev,
                  PC=c(1:length(seurat@reductions[["pca"]]@stdev)))
p <- ggplot(data=dat, aes(x=PC, y=Standard_deviation))
p <- p + geom_bar(stat="identity", fill="#6AAAB7", color=NA)
p <- p + theme_bioskryb()
p <- p + labs(y="Standard deviation")
p

7.2 - Dimension loadings

The dimension loadings give us an idea of the top 10 genes contributing to each PC.

plot.list <- VizDimLoadings(seurat, dims=1:9, ncol=3, nfeatures=10,
                            combine=FALSE, col=NA)
for (i in 1:length(plot.list)) {
  pc <- paste("PC", i, sep="_")
  #scale_lim <- max(a)
  p <- plot.list[[i]]
  xscale_lim <- max(abs(p$data[[pc]]))
  p <- p + geom_bar(stat="identity", aes(fill=.data[[pc]])) 
  p<- p + scale_fill_gradient2(low=unikn_blue_red.cols[1], 
                               mid="white",
                               high=unikn_blue_red.cols[3],
                               limits=c(-xscale_lim, xscale_lim))
  p <- p + theme_bioskryb() 
  p <- p + theme(axis.title = element_blank(),
                 axis.text.x=element_text(angle=90))
  plot.list[[i]] <- p
}

cowplot::plot_grid(plotlist=plot.list,
                   align="vh",
                   labels="AUTO",
                   label_size=12,
                   ncol=3)

7.3 - Interogate PCs

colnames(seurat@meta.data)

##  [1] "orig.ident"            "nCount_RNA"            "nFeature_RNA"         
##  [4] "SampleId"              "FlowCell"              "CellLine"             
##  [7] "Condition"             "SCBulk"                "SampleID"             
## [10] "percent.mt"            "rmv"                   "ident"                
## [13] "cxds_score"            "bcds_score"            "hybrid_score"         
## [16] "hybrid_call"           "decontX_contamination" "decontX_clusters"     
## [19] "contam_status"         "Filter"                "S.Score"              
## [22] "G2M.Score"             "Phase"                 "old.ident"            
## [25] "cc_dif"

seurat@meta.data$FlowCell[which(seurat@meta.data$FlowCell=="")]<-"Unknown"
categorical_var <- c("FlowCell","CellLine","Condition","SCBulk","Phase")
numeric_var <- c("nCount_RNA","nFeature_RNA","percent.mt","hybrid_score","decontX_contamination")

### Calculate PCs
mat<-as.data.frame(seurat@assays$RNA@data)
pca.list<-list(All=PCA(t(mat), ncp=10, graph=FALSE, scale=FALSE))

### Run ANOVA of categorical variables
anova.results <- anova_PCA(obj_name="All", 
                           pca.df=pca.list[["All"]], 
                           metadata=seurat@meta.data,
                           merge_ID="SampleId", 
                           cat_var=categorical_var,
                           numPCs=10)
rownames(anova.results)<-anova.results$Factor

### Run GLM for continuous variables
glm.results <- glm_PCA(obj_name="All",
                       pca.list=pca.list[["All"]],
                       metadata=seurat@meta.data,
                       merge_ID="SampleId",
                       num_var=numeric_var, 
                       numPCs=10)
rownames(glm.results)<-glm.results$Factor

### Melt results for plotting
results <- rbind(anova.results, glm.results)
df.melt <- melt(results)
df.melt <- cbind(df.melt, PC=as.numeric(gsub("Dim|_adj_pvalue", "", df.melt$variable)))
df.melt$ajp <- ifelse(df.melt$value <= 0.001, "<=0.001", df.melt$value)
df.melt$ajp <- ifelse(df.melt$ajp <= 0.01 &  df.melt$ajp > 0.001, "<=0.01",  df.melt$ajp)
df.melt$ajp <- ifelse(df.melt$ajp <= 0.05 &  df.melt$ajp > 0.01, "<=0.05",  df.melt$ajp)
df.melt$ajp <- ifelse(df.melt$ajp > 0.05, "NS",  df.melt$ajp)

### Plot
ggplot(df.melt, aes(x=PC, y=Factor)) + 
  geom_tile(aes(fill=ajp),color="white") + 
  scale_fill_manual(values=c("<=0.001"="#4c1d4bff",
                                        "<=0.01"="#bd1655ff",
                                        "<=0.05"="#f47f58ff",
                                        "NS"="#FAEBddff"), 
                    name="FDR adjusted p-value") + 
  labs(title="ANOVA FDR-adjusted pvalues - Categorical Factors vs. PCs", 
       subtitle="Test") + 
  theme_bioskryb()

8 - Clustering & UMAP

### Find neighbors
seurat <- FindNeighbors(seurat, dims = 1:30)

### Find clusters
seurat <- FindClusters(seurat, algorithm=1, 
                           resolution=1.0,
                           verbose=FALSE)
#names(seurat@meta.data) <- gsub(std_prefix, "Louvain2_", names(seurat@meta.data))

### Run UMAP
seurat <- RunUMAP(seurat, dims = 1:30, reduction.key="UMAP_")

### Plot PCA
groups<-c(categorical_var, "seurat_clusters","hybrid_call")

for (i in 1: length(groups)) {
  print(
    DimPlot(seurat, reduction = "pca",
        group.by = groups[i],
#        cols = ct.colors2,
#        shuffle = TRUE,
        repel=TRUE,
        label=TRUE, #label.size = 1.5,label.box = TRUE,
        pt.size = 2) +
  ggtitle(paste("PCA of cells colored by",groups[i] )) +
  theme_bioskryb()+
  ylab("PC 2")+
  xlab("PC 1")+
  theme(axis.title.y = element_text(),
        axis.title.x = element_text()) 
  )
}

### Highlight those with high nFeature RNA
print("Highlight those with high nFeature RNA")

## [1] "Highlight those with high nFeature RNA"

tmp<-cbind(seurat@meta.data, seurat@reductions$pca@cell.embeddings[,c("PC_1","PC_2")])
tmp$high_feats<-"Low gene count"
tmp$high_feats[which(tmp$nFeature_RNA>nFeature_RNA.ul)]<-"High gene count"

ggplot(tmp,
       aes(PC_1,PC_2, 
           colour=high_feats))+
  geom_point(size=3)+ 
  scale_color_manual("High nFeature_RNA",values=c("Low gene count"="white",
                              "High gene count"="black"))+
  geom_point(aes(fill=FlowCell),
             shape=21,
             size=2.5) +
  theme_bioskryb()+
  xlab("PC 1")+
  ylab("PC 2")+
  ggtitle("PCA of cells colored by FlowCell")

### Plot UMAP
# for (i in 1: length(groups)) {
#   print(
#     DimPlot(seurat, reduction = "umap",
#         group.by = groups[i],
# #        cols = ct.colors2,
# #        shuffle = TRUE,
#         repel=TRUE,
#         label=TRUE, #label.size = 1.5,label.box = TRUE,
#         pt.size = 2) +
#   ggtitle(paste("UMAP of cells colored by",groups[i] )) +
#   theme_bioskryb()+
#   ylab("UMAP 2")+
#   xlab("UMAP 1")+
#   theme(axis.title.y = element_text(),
#         axis.title.x = element_text()) 
#   )
# }

9 - Investigate nFeatures_RNA

p.list <- list()

tmp<-cbind(seurat@meta.data, seurat@reductions$pca@cell.embeddings[,c("PC_1","PC_2")])
mtmp<-melt(tmp[,c("SampleId","nFeature_RNA","nCount_RNA","percent.mt")], 
           id.vars=c("SampleId"))

labs<-c("gene count (nFeatures_RNA)", "UMI count (nCount_RNA)")
for(i in 1:length(features[1:2])){
  feat <- features[[i]]
   p <- ggplot(data=tmp, 
              aes(x=PC_1, y=PC_2, 
                  color=mtmp$value[which(mtmp$variable==feat)]))
  p <- p + geom_point(size=1.5)
  p <- p + theme_bioskryb()
  if(feat == "percent.mt") {
      p <- p + scale_color_viridis_c(option="rocket", 
                                 direction=-1)
  } else {
    p <- p + scale_color_viridis_c(option="rocket", 
                                 direction=-1,
                                 labels=scales::label_scientific())
  
  }
  p <- p + theme(axis.title.y.right = element_blank(),
                 legend.position="right",
                 legend.title=element_blank()) +
    ggtitle(paste("PCA of cells colored by",labs[i])) +
    ylab("PC 2")+xlab("PC 1")
print(p)
}

10 - Final conclusions

The main findings are as follows: [1] Several of the ensembl gene IDs are not associated with gene symbols (>20k); [2] At the sample-level, no samples (CellLines) were flagged for removal upon performing sample-level QC; [3] at the cell-level, 100 cells were flagged for removal due to doublets (9 cells), high/low UMIs (27 cells), doublets/UMIs (1 cell), high percentage of mitochondria genes (12 cells), high percentage of mitochondria genes and UMIs (1 cell), high gene counts (47 cells), high UMI/gene counts (2), and high percentage of mitochondria genes and gene counts (1 cell). Use table(seurat@meta.data$Filter) for details; [4] the data was scaled by a scaling factor of 10,000, and logarithmically transformed by a natural log; [5] dimension reduction was performed with PCA, and confounding variables include doublet (hybrid_score), mitochondrial gene percentage (percent.mt), gene count (nFeature_RNA), cell cycle phase (Phase), CellLine, and FlowCell; and [6] finally, CellLine, FlowCell, Phase, and gene counts (nFeature_RNA) are dependent variables that affect clustering of cells.

end <- Sys.time()

That’s it! Spend some time looking through the report to determine next steps.

Analysis started: 2022-04-06 09:48:19
Analysis finished: 2022-04-06 09:51:09

R Session Information

Information about R, the OS and attached or loaded packages

pander::pander(sessionInfo(), compact = FALSE)

R version 4.1.1 (2021-08-10)

Platform: x86_64-w64-mingw32/x64 (64-bit)

locale: LC_COLLATE=English_United States.1252, LC_CTYPE=English_United States.1252, LC_MONETARY=English_United States.1252, LC_NUMERIC=C and LC_TIME=English_United States.1252

attached base packages:

parallel
stats4
stats
graphics
grDevices
utils
datasets
methods
base

other attached packages:

paletteer(v.1.4.0)
ggridges(v.0.5.3)
celda(v.1.8.1)
scds(v.1.8.0)
microViz(v.0.9.0)
SeuratObject(v.4.0.4)
Seurat(v.4.1.0)
ggpointdensity(v.0.1.0)
nord(v.1.0.0)
unikn(v.0.4.0)
kableExtra(v.1.3.4)
BiocParallel(v.1.26.2)
dplyr(v.1.0.8)
DESeq2(v.1.32.0)
SummarizedExperiment(v.1.22.0)
Biobase(v.2.52.0)
MatrixGenerics(v.1.4.3)
matrixStats(v.0.61.0)
GenomicRanges(v.1.44.0)
GenomeInfoDb(v.1.28.4)
IRanges(v.2.26.0)
S4Vectors(v.0.30.2)
BiocGenerics(v.0.38.0)
FactoMineR(v.2.4)
plyr(v.1.8.6)
data.table(v.1.14.2)
ggplot2(v.3.3.5)

loaded via a namespace (and not attached):

MCMCprecision(v.0.4.0)
scattermore(v.0.8)
tidyr(v.1.2.0)
bit64(v.4.0.5)
knitr(v.1.37)
multimode(v.1.5)
irlba(v.2.3.5)
DelayedArray(v.0.18.0)
rpart(v.4.1-15)
KEGGREST(v.1.32.0)
RCurl(v.1.98-1.6)
doParallel(v.1.0.17)
generics(v.0.1.2)
ScaledMatrix(v.1.0.0)
cowplot(v.1.1.1)
RSQLite(v.2.2.10)
RANN(v.2.6.1)
combinat(v.0.0-8)
future(v.1.24.0)
bit(v.4.0.4)
spatstat.data(v.2.1-2)
webshot(v.0.5.2)
xml2(v.1.3.3)
httpuv(v.1.6.5)
assertthat(v.0.2.1)
viridis(v.0.6.2)
xfun(v.0.29)
jquerylib(v.0.1.4)
evaluate(v.0.15)
promises(v.1.2.0.1)
fansi(v.1.0.2)
assertive.files(v.0.0-2)
igraph(v.1.2.11)
DBI(v.1.1.2)
geneplotter(v.1.70.0)
htmlwidgets(v.1.5.4)
spatstat.geom(v.2.3-2)
purrr(v.0.3.4)
ellipsis(v.0.3.2)
RSpectra(v.0.16-0)
ks(v.1.13.4)
bookdown(v.0.24)
permute(v.0.9-7)
annotate(v.1.70.0)
prismatic(v.1.1.0)
sparseMatrixStats(v.1.4.2)
deldir(v.1.0-6)
vctrs(v.0.3.8)
SingleCellExperiment(v.1.14.1)
Cairo(v.1.5-14)
ROCR(v.1.0-11)
abind(v.1.4-5)
cachem(v.1.0.6)
RcppEigen(v.0.3.3.9.1)
withr(v.2.4.3)
sctransform(v.0.3.3)
vegan(v.2.5-7)
scran(v.1.20.1)
mclust(v.5.4.9)
goftest(v.1.2-3)
svglite(v.2.1.0)
cluster(v.2.1.2)
ape(v.5.6-2)
lazyeval(v.0.2.2)
crayon(v.1.5.0)
genefilter(v.1.74.1)
edgeR(v.3.34.1)
labeling(v.0.4.2)
pkgconfig(v.2.0.3)
vipor(v.0.4.5)
nlme(v.3.1-152)
diptest(v.0.76-0)
rlang(v.1.0.1)
globals(v.0.14.0)
lifecycle(v.1.0.1)
miniUI(v.0.1.1.1)
dbscan(v.1.1-10)
enrichR(v.3.0)
phyloseq(v.1.38.0)
rsvd(v.1.0.5)
ggrastr(v.1.0.1)
polyclip(v.1.10-0)
lmtest(v.0.9-39)
Matrix(v.1.3-4)
Rhdf5lib(v.1.14.2)
zoo(v.1.8-9)
beeswarm(v.0.4.0)
GlobalOptions(v.0.1.2)
png(v.0.1-7)
viridisLite(v.0.4.0)
rjson(v.0.2.21)
rootSolve(v.1.8.2.3)
bitops(v.1.0-7)
KernSmooth(v.2.23-20)
rhdf5filters(v.1.4.0)
pROC(v.1.18.0)
pander(v.0.6.4)
Biostrings(v.2.60.2)
DelayedMatrixStats(v.1.14.3)
blob(v.1.2.2)
shape(v.1.4.6)
stringr(v.1.4.0)
parallelly(v.1.30.0)
spatstat.random(v.2.1-0)
gridGraphics(v.0.5-1)
beachmat(v.2.8.1)
scales(v.1.1.1)
leaps(v.3.1)
memoise(v.2.0.1)
magrittr(v.2.0.2)
ica(v.1.0-2)
zlibbioc(v.1.38.0)
compiler(v.4.1.1)
dqrng(v.0.3.0)
RColorBrewer(v.1.1-2)
clue(v.0.3-60)
fitdistrplus(v.1.1-6)
cli(v.3.1.1)
ade4(v.1.7-18)
XVector(v.0.32.0)
listenv(v.0.8.0)
patchwork(v.1.1.1)
pbapply(v.1.5-0)
MASS(v.7.3-54)
mgcv(v.1.8-36)
tidyselect(v.1.1.1)
stringi(v.1.7.6)
highr(v.0.9)
yaml(v.2.2.2)
BiocSingular(v.1.8.1)
assertive.numbers(v.0.0-2)
locfit(v.1.5-9.4)
ggrepel(v.0.9.1)
grid(v.4.1.1)
sass(v.0.4.0)
tools(v.4.1.1)
future.apply(v.1.8.1)
circlize(v.0.4.14)
rstudioapi(v.0.13)
bluster(v.1.2.1)
foreach(v.1.5.2)
metapod(v.1.0.0)
gridExtra(v.2.3)
assertive.types(v.0.0-3)
scatterplot3d(v.0.3-41)
rmdformats(v.1.0.3)
farver(v.2.1.0)
Rtsne(v.0.15)
digest(v.0.6.27)
FNN(v.1.1.3)
pracma(v.2.3.8)
shiny(v.1.7.1)
Rcpp(v.1.0.8)
scuttle(v.1.2.1)
later(v.1.3.0)
RcppAnnoy(v.0.0.19)
httr(v.1.4.2)
AnnotationDbi(v.1.54.1)
ComplexHeatmap(v.2.8.0)
assertive.properties(v.0.0-4)
colorspace(v.2.0-2)
rvest(v.1.0.2)
XML(v.3.99-0.8)
tensor(v.1.5)
reticulate(v.1.24)
splines(v.4.1.1)
statmod(v.1.4.36)
uwot(v.0.1.11)
rematch2(v.2.1.2)
spatstat.utils(v.2.3-0)
scater(v.1.20.1)
multtest(v.2.50.0)
xgboost(v.1.5.2.1)
plotly(v.4.10.0)
systemfonts(v.1.0.4)
xtable(v.1.8-4)
jsonlite(v.1.7.3)
assertive.base(v.0.0-9)
rcartocolor(v.2.0.0)
flashClust(v.1.01-2)
R6(v.2.5.1)
pillar(v.1.7.0)
htmltools(v.0.5.2)
mime(v.0.12)
glue(v.1.6.1)
fastmap(v.1.1.0)
DT(v.0.20)
BiocNeighbors(v.1.10.0)
codetools(v.0.2-18)
mvtnorm(v.1.1-3)
utf8(v.1.2.2)
lattice(v.0.20-44)
bslib(v.0.3.1)
spatstat.sparse(v.2.1-0)
tibble(v.3.1.6)
ggbeeswarm(v.0.6.0)
multipanelfigure(v.2.1.2)
leiden(v.0.3.9)
magick(v.2.7.3)
limma(v.3.48.3)
survival(v.3.2-11)
rmarkdown(v.2.11)
biomformat(v.1.22.0)
munsell(v.0.5.0)
GetoptLong(v.1.0.5)
rhdf5(v.2.36.0)
GenomeInfoDbData(v.1.2.6)
iterators(v.1.0.14)
reshape2(v.1.4.4)
gtable(v.0.3.0)
spatstat.core(v.2.4-0)

06 April 2022

          </div>