SingleCell RNA-seq Workshop

Author

Luis P Iniguez

Load Packages

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
if (!require("pacman", quietly = TRUE))
    BiocManager::install("pacman")
p_load("SingleCellExperiment")
p_load("Seurat")
p_load("hdf5r")
p_load("DropletUtils")
p_load("dplyr")
p_load("scater")
p_load("Matrix")
p_load("scran")
p_load("ggplot2")

if (!require("devtools"))
install.packages("devtools")
if(!require("findPC"))
devtools::install_github("haotian-zhuang/findPC")

Download Data and read it.

system("wget -q https://cf.10xgenomics.com/samples/cell-exp/6.1.0/10k_PBMC_3p_nextgem_Chromium_X/10k_PBMC_3p_nextgem_Chromium_X_filtered_feature_bc_matrix.h5")


sce_data<- DropletUtils::read10xCounts("10k_PBMC_3p_nextgem_Chromium_X_filtered_feature_bc_matrix.h5",sample.names = "pbmc")

mat<-Seurat::Read10X_h5("10k_PBMC_3p_nextgem_Chromium_X_filtered_feature_bc_matrix.h5")
seurat_data <- CreateSeuratObject(counts = mat, project = "pbmc", min.cells = 0, min.features =0)

Explore datasets

sce_data
class: SingleCellExperiment 
dim: 36601 11996 
metadata(1): Samples
assays(1): counts
rownames(36601): ENSG00000243485 ENSG00000237613 ... ENSG00000278817
  ENSG00000277196
rowData names(3): ID Symbol Type
colnames: NULL
colData names(2): Sample Barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
seurat_data
An object of class Seurat 
36601 features across 11996 samples within 1 assay 
Active assay: RNA (36601 features, 0 variable features)
colData(sce_data) %>% head()
DataFrame with 6 rows and 2 columns
       Sample            Barcode
  <character>        <character>
1        pbmc AAACCCAAGGCCCAAA-1
2        pbmc AAACCCAAGTAATACG-1
3        pbmc AAACCCAAGTCACACT-1
4        pbmc AAACCCACAAAGCGTG-1
5        pbmc AAACCCACAATCGAAA-1
6        pbmc AAACCCACAGATCACT-1
rownames(sce_data) %>% head()
[1] "ENSG00000243485" "ENSG00000237613" "ENSG00000186092" "ENSG00000238009"
[5] "ENSG00000239945" "ENSG00000239906"
rowData(sce_data) %>% head()
DataFrame with 6 rows and 3 columns
                             ID      Symbol            Type
                    <character> <character>     <character>
ENSG00000243485 ENSG00000243485 MIR1302-2HG Gene Expression
ENSG00000237613 ENSG00000237613     FAM138A Gene Expression
ENSG00000186092 ENSG00000186092       OR4F5 Gene Expression
ENSG00000238009 ENSG00000238009  AL627309.1 Gene Expression
ENSG00000239945 ENSG00000239945  AL627309.3 Gene Expression
ENSG00000239906 ENSG00000239906  AL627309.2 Gene Expression
rownames(seurat_data) %>% head()
[1] "MIR1302-2HG" "FAM138A"     "OR4F5"       "AL627309.1"  "AL627309.3" 
[6] "AL627309.2" 
identical(rownames(seurat_data), rowData(sce_data)$Symbol)
[1] FALSE
setdiff(rownames(seurat_data),rowData(sce_data)$Symbol)
 [1] "TBCE.1"           "LINC01238.1"      "CYB561D2.1"       "MATR3.1"         
 [5] "LINC01505.1"      "HSPA14.1"         "GOLGA8M.1"        "GGT1.1"          
 [9] "ARMCX5-GPRASP2.1" "TMSB15B.1"       
grep("LINC01238", rownames(seurat_data),value=T)
[1] "LINC01238"   "LINC01238.1"
rowData(sce_data)[which(rowData(sce_data)$Symbol == "LINC01238"),]
DataFrame with 2 rows and 3 columns
                             ID      Symbol            Type
                    <character> <character>     <character>
ENSG00000237940 ENSG00000237940   LINC01238 Gene Expression
ENSG00000261186 ENSG00000261186   LINC01238 Gene Expression

Quality Control

Seurat

seurat_data[["Mitochondrial"]] <- PercentageFeatureSet(seurat_data, pattern = "^MT-")
VlnPlot(seurat_data, features = c( "nCount_RNA","nFeature_RNA", "Mitochondrial"), ncol = 3,pt.size = 0.01,log = T)

Bioconductor

is.mito <-grepl("^MT-", rowData(sce_data)$Symbol, perl = T) %>% which
sce_data<-scater::addPerCellQC(sce_data,subsets=list("Mitochondrial"=is.mito))
gridExtra::grid.arrange(
    plotColData(sce_data, y="sum",) +
        scale_y_log10() + ggtitle("Total count"),
    plotColData(sce_data, y="detected") +
        scale_y_log10() + ggtitle("Detected features"),
    plotColData(sce_data, y="subsets_Mitochondrial_percent") + ggtitle("Mito percent")+scale_y_log10(),
    ncol=3
)

Identify thresholds

qc.lib2 <- scater::isOutlier(seurat_data$nCount_RNA, log=TRUE, type="lower",nmads=3) #sce_data$sum
qc.nexprs2 <- isOutlier(seurat_data$nFeature_RNA, log=TRUE,nmads=3, type="lower") #sce_data$detected
qc.mito2 <- isOutlier(seurat_data$Mitochondrial ,nmads=3,type="higher") #sce_data$subsets_Mitochondrial_percent
discard2 <- qc.lib2 | qc.nexprs2  | qc.mito2
data.frame("Total count"=attr(qc.lib2, "thresholds"),
"Detected features"=attr(qc.nexprs2, "thresholds"),
"Mito percent"=attr(qc.mito2, "thresholds"))
       Total.count Detected.features Mito.percent
lower     1679.027          591.9659         -Inf
higher         Inf               Inf     13.47941
data.frame("Total count"=sum(qc.lib2), "Detected features"=sum(qc.nexprs2),
  "Mito percent"=sum(qc.mito2),"Total"=sum(discard2))
  Total.count Detected.features Mito.percent Total
1         307               245          525   571
sce_data$discard<-discard2
seurat_data$discard<-discard2
gridExtra::grid.arrange(
    plotColData(sce_data, y="sum",colour_by="discard") +
        scale_y_log10() + ggtitle("Total count"),
    plotColData(sce_data, y="detected",colour_by="discard") +
        scale_y_log10() + ggtitle("Detected features"),
    plotColData(sce_data, y="subsets_Mitochondrial_percent",colour_by="discard") + ggtitle("Mito percent")+scale_y_log10(),
    ncol=3
)

seurat_data<-subset(seurat_data,subset= discard!=TRUE)
sce_data<-sce_data[,!sce_data$discard]

Normalization

Seurat

seurat_data <- NormalizeData(seurat_data, normalization.method = "LogNormalize", scale.factor = 10000, verbose=F)
seurat_data <- SCTransform(seurat_data, verbose=F) #could regress for multiple factors, including batch
Assays(seurat_data)
[1] "RNA" "SCT"
DefaultAssay(seurat_data)
[1] "SCT"
colSums(GetAssayData(seurat_data,slot = "counts")[,1:5])
AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1 
              8026               7459               7797               6491 
AAACCCACAATCGAAA-1 
              7242 
colSums(GetAssayData(seurat_data,slot = "data")[,1:5])
AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1 
          2727.448           2719.696           2846.016           1608.979 
AAACCCACAATCGAAA-1 
          2070.074 
colSums(GetAssayData(seurat_data,assay = "RNA",slot = "counts")[,1:5])
AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1 
             12143               7811               9741               5561 
AAACCCACAATCGAAA-1 
              7399 
colSums(GetAssayData(seurat_data,assay = "RNA",slot = "data")[,1:5])
AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1 
          3026.689           3154.815           3159.359           2112.424 
AAACCCACAATCGAAA-1 
          2459.633 

Bioconductor

Warning:

Might take a little bit!

set.seed(1000)
clusters <- scran::quickCluster(sce_data)
sce_data <- scran::computeSumFactors(sce_data, cluster=clusters)
sce_data <- scater::logNormCounts(sce_data)
assayNames(sce_data)
[1] "counts"    "logcounts"
colSums(logcounts(sce_data)[,1:5])
[1] 3961.312 3783.641 3897.069 3186.119 3533.383

Feature Selection

Seurat

DefaultAssay(seurat_data)<-"RNA"
seurat_data <- FindVariableFeatures(seurat_data, assay="RNA",selection.method = "vst", nfeatures = 2000, verbose=F)

top10 <- head(VariableFeatures(seurat_data), 10)
plot1 <- VariableFeaturePlot(seurat_data)
LabelPoints(plot = plot1, points = top10, repel = TRUE)+theme(legend.position = "none")
When using repel, set xnudge and ynudge to 0 for optimal results
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Removed 11053 rows containing missing values (geom_point).

Bioconductor

dec.pbmc <- modelGeneVar(sce_data)
chosen <- getTopHVGs(dec.pbmc, n=2000)
rowSubset(sce_data) <- chosen

fit.pbmc <- metadata(dec.pbmc)
plot2<-data.frame(mean=fit.pbmc$mean,var=fit.pbmc$var, color=rownames(sce_data)%in% chosen)%>%
ggplot(aes(x=mean,y=var))+geom_point(aes(color=color))+scale_color_manual(values = c("black","red"))+geom_function(color="dodgerblue",fun =fit.pbmc$trend ,size=1.5)+theme(legend.position = "none")#+scale_x_continuous(trans='log2')+scale_y_continuous(trans='log2')

t<-which(rownames(sce_data)%in% chosen[1:10])
td<-data.frame(mean=fit.pbmc$mean[t],var=fit.pbmc$var[t],symbol=rowData(sce_data)$Symbol[t])
plot2+ggrepel::geom_text_repel(data=td,aes(label=symbol))+ylab("Variance of log-expression")+xlab("Mean of log-expression")

Dimension Reduction

PCA

Seurat

all.genes <- rownames(seurat_data)
seurat_data <- ScaleData(seurat_data, features = all.genes)
Centering and scaling data matrix
seurat_data <- RunPCA(seurat_data,
                      features = VariableFeatures(object = seurat_data),
                      verbose = F,npcs = 50)
DimPlot(seurat_data,reduction =  "pca",)

ElbowPlot(seurat_data)

stdev_pca_seurat<-seurat_data@reductions$pca@stdev 
findPC(sdev = stdev_pca_seurat,number = c(30,40,50),method = 'all',figure = T)

      Piecewise linear model First derivative Second derivative
30PCs                      3                6                 6
40PCs                      3               11                 6
50PCs                      4               15                11
      Preceding residual Perpendicular line K-means clustering
30PCs                  6                  6                  2
40PCs                  6                  6                  6
50PCs                 11                  7                  6

Bioconductor

sce_data <- runPCA(sce_data,ncomponents = 50)
plotPCA(sce_data)

stdev_pca_sce <- attr(reducedDim(sce_data, "PCA"),
  "varExplained") %>% sqrt()

plot(stdev_pca_sce)

findPC(sdev = stdev_pca_sce,number = c(30,40,50),method = 'all',figure = T)

      Piecewise linear model First derivative Second derivative
30PCs                      3                6                 4
40PCs                      3                9                 4
50PCs                      4               11                 9
      Preceding residual Perpendicular line K-means clustering
30PCs                  6                  6                  2
40PCs                  7                  6                  2
50PCs                 11                  7                  2

I choose 11 PC.

TSNE

Seurat

seurat_data <- RunTSNE(seurat_data, dims = 1:11,reduction = "pca")
DimPlot(seurat_data, reduction = "tsne")

sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11)
plotReducedDim(sce_data, dimred="TSNE")

sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11,perplexity=5)
plotReducedDim(sce_data, dimred="TSNE")

sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11,perplexity=20)
plotReducedDim(sce_data, dimred="TSNE")

sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11,perplexity=80)
plotReducedDim(sce_data, dimred="TSNE")

UMAP

seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
DimPlot(seurat_data, reduction = "umap")

seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F,n.neighbors = 5)
DimPlot(seurat_data, reduction = "umap")

seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F,n.neighbors = 50)
DimPlot(seurat_data, reduction = "umap")

seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F,n.neighbors = 100)
DimPlot(seurat_data, reduction = "umap")

sce_data <- runUMAP(sce_data, dimred="PCA",n_dimred=11)
plotReducedDim(sce_data, dimred="UMAP")