if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!require("pacman", quietly = TRUE))
BiocManager::install("pacman")
p_load("SingleCellExperiment")
p_load("Seurat")
p_load("hdf5r")
p_load("DropletUtils")
p_load("dplyr")
p_load("scater")
p_load("Matrix")
p_load("scran")
p_load("ggplot2")
if (!require("devtools"))
install.packages("devtools")
if(!require("findPC"))
devtools::install_github("haotian-zhuang/findPC")SingleCell RNA-seq Workshop
Load Packages
Download Data and read it.
system("wget -q https://cf.10xgenomics.com/samples/cell-exp/6.1.0/10k_PBMC_3p_nextgem_Chromium_X/10k_PBMC_3p_nextgem_Chromium_X_filtered_feature_bc_matrix.h5")
sce_data<- DropletUtils::read10xCounts("10k_PBMC_3p_nextgem_Chromium_X_filtered_feature_bc_matrix.h5",sample.names = "pbmc")
mat<-Seurat::Read10X_h5("10k_PBMC_3p_nextgem_Chromium_X_filtered_feature_bc_matrix.h5")
seurat_data <- CreateSeuratObject(counts = mat, project = "pbmc", min.cells = 0, min.features =0)Explore datasets
sce_dataclass: SingleCellExperiment
dim: 36601 11996
metadata(1): Samples
assays(1): counts
rownames(36601): ENSG00000243485 ENSG00000237613 ... ENSG00000278817
ENSG00000277196
rowData names(3): ID Symbol Type
colnames: NULL
colData names(2): Sample Barcode
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
seurat_dataAn object of class Seurat
36601 features across 11996 samples within 1 assay
Active assay: RNA (36601 features, 0 variable features)
colData(sce_data) %>% head()DataFrame with 6 rows and 2 columns
Sample Barcode
<character> <character>
1 pbmc AAACCCAAGGCCCAAA-1
2 pbmc AAACCCAAGTAATACG-1
3 pbmc AAACCCAAGTCACACT-1
4 pbmc AAACCCACAAAGCGTG-1
5 pbmc AAACCCACAATCGAAA-1
6 pbmc AAACCCACAGATCACT-1
rownames(sce_data) %>% head()[1] "ENSG00000243485" "ENSG00000237613" "ENSG00000186092" "ENSG00000238009"
[5] "ENSG00000239945" "ENSG00000239906"
rowData(sce_data) %>% head()DataFrame with 6 rows and 3 columns
ID Symbol Type
<character> <character> <character>
ENSG00000243485 ENSG00000243485 MIR1302-2HG Gene Expression
ENSG00000237613 ENSG00000237613 FAM138A Gene Expression
ENSG00000186092 ENSG00000186092 OR4F5 Gene Expression
ENSG00000238009 ENSG00000238009 AL627309.1 Gene Expression
ENSG00000239945 ENSG00000239945 AL627309.3 Gene Expression
ENSG00000239906 ENSG00000239906 AL627309.2 Gene Expression
rownames(seurat_data) %>% head()[1] "MIR1302-2HG" "FAM138A" "OR4F5" "AL627309.1" "AL627309.3"
[6] "AL627309.2"
identical(rownames(seurat_data), rowData(sce_data)$Symbol)[1] FALSE
setdiff(rownames(seurat_data),rowData(sce_data)$Symbol) [1] "TBCE.1" "LINC01238.1" "CYB561D2.1" "MATR3.1"
[5] "LINC01505.1" "HSPA14.1" "GOLGA8M.1" "GGT1.1"
[9] "ARMCX5-GPRASP2.1" "TMSB15B.1"
grep("LINC01238", rownames(seurat_data),value=T)[1] "LINC01238" "LINC01238.1"
rowData(sce_data)[which(rowData(sce_data)$Symbol == "LINC01238"),]DataFrame with 2 rows and 3 columns
ID Symbol Type
<character> <character> <character>
ENSG00000237940 ENSG00000237940 LINC01238 Gene Expression
ENSG00000261186 ENSG00000261186 LINC01238 Gene Expression
Quality Control
Seurat
seurat_data[["Mitochondrial"]] <- PercentageFeatureSet(seurat_data, pattern = "^MT-")
VlnPlot(seurat_data, features = c( "nCount_RNA","nFeature_RNA", "Mitochondrial"), ncol = 3,pt.size = 0.01,log = T)Bioconductor
is.mito <-grepl("^MT-", rowData(sce_data)$Symbol, perl = T) %>% which
sce_data<-scater::addPerCellQC(sce_data,subsets=list("Mitochondrial"=is.mito))
gridExtra::grid.arrange(
plotColData(sce_data, y="sum",) +
scale_y_log10() + ggtitle("Total count"),
plotColData(sce_data, y="detected") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(sce_data, y="subsets_Mitochondrial_percent") + ggtitle("Mito percent")+scale_y_log10(),
ncol=3
)Identify thresholds
qc.lib2 <- scater::isOutlier(seurat_data$nCount_RNA, log=TRUE, type="lower",nmads=3) #sce_data$sum
qc.nexprs2 <- isOutlier(seurat_data$nFeature_RNA, log=TRUE,nmads=3, type="lower") #sce_data$detected
qc.mito2 <- isOutlier(seurat_data$Mitochondrial ,nmads=3,type="higher") #sce_data$subsets_Mitochondrial_percent
discard2 <- qc.lib2 | qc.nexprs2 | qc.mito2data.frame("Total count"=attr(qc.lib2, "thresholds"),
"Detected features"=attr(qc.nexprs2, "thresholds"),
"Mito percent"=attr(qc.mito2, "thresholds")) Total.count Detected.features Mito.percent
lower 1679.027 591.9659 -Inf
higher Inf Inf 13.47941
data.frame("Total count"=sum(qc.lib2), "Detected features"=sum(qc.nexprs2),
"Mito percent"=sum(qc.mito2),"Total"=sum(discard2)) Total.count Detected.features Mito.percent Total
1 307 245 525 571
sce_data$discard<-discard2
seurat_data$discard<-discard2
gridExtra::grid.arrange(
plotColData(sce_data, y="sum",colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(sce_data, y="detected",colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(sce_data, y="subsets_Mitochondrial_percent",colour_by="discard") + ggtitle("Mito percent")+scale_y_log10(),
ncol=3
)seurat_data<-subset(seurat_data,subset= discard!=TRUE)
sce_data<-sce_data[,!sce_data$discard]Normalization
Seurat
seurat_data <- NormalizeData(seurat_data, normalization.method = "LogNormalize", scale.factor = 10000, verbose=F)
seurat_data <- SCTransform(seurat_data, verbose=F) #could regress for multiple factors, including batchAssays(seurat_data)[1] "RNA" "SCT"
DefaultAssay(seurat_data)[1] "SCT"
colSums(GetAssayData(seurat_data,slot = "counts")[,1:5])AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1
8026 7459 7797 6491
AAACCCACAATCGAAA-1
7242
colSums(GetAssayData(seurat_data,slot = "data")[,1:5])AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1
2727.448 2719.696 2846.016 1608.979
AAACCCACAATCGAAA-1
2070.074
colSums(GetAssayData(seurat_data,assay = "RNA",slot = "counts")[,1:5])AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1
12143 7811 9741 5561
AAACCCACAATCGAAA-1
7399
colSums(GetAssayData(seurat_data,assay = "RNA",slot = "data")[,1:5])AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1
3026.689 3154.815 3159.359 2112.424
AAACCCACAATCGAAA-1
2459.633
Bioconductor
Warning:
Might take a little bit!
set.seed(1000)
clusters <- scran::quickCluster(sce_data)
sce_data <- scran::computeSumFactors(sce_data, cluster=clusters)
sce_data <- scater::logNormCounts(sce_data)assayNames(sce_data)[1] "counts" "logcounts"
colSums(logcounts(sce_data)[,1:5])[1] 3961.312 3783.641 3897.069 3186.119 3533.383
Feature Selection
Seurat
DefaultAssay(seurat_data)<-"RNA"
seurat_data <- FindVariableFeatures(seurat_data, assay="RNA",selection.method = "vst", nfeatures = 2000, verbose=F)
top10 <- head(VariableFeatures(seurat_data), 10)
plot1 <- VariableFeaturePlot(seurat_data)
LabelPoints(plot = plot1, points = top10, repel = TRUE)+theme(legend.position = "none")When using repel, set xnudge and ynudge to 0 for optimal results
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Removed 11053 rows containing missing values (geom_point).
Bioconductor
dec.pbmc <- modelGeneVar(sce_data)
chosen <- getTopHVGs(dec.pbmc, n=2000)
rowSubset(sce_data) <- chosen
fit.pbmc <- metadata(dec.pbmc)
plot2<-data.frame(mean=fit.pbmc$mean,var=fit.pbmc$var, color=rownames(sce_data)%in% chosen)%>%
ggplot(aes(x=mean,y=var))+geom_point(aes(color=color))+scale_color_manual(values = c("black","red"))+geom_function(color="dodgerblue",fun =fit.pbmc$trend ,size=1.5)+theme(legend.position = "none")#+scale_x_continuous(trans='log2')+scale_y_continuous(trans='log2')
t<-which(rownames(sce_data)%in% chosen[1:10])
td<-data.frame(mean=fit.pbmc$mean[t],var=fit.pbmc$var[t],symbol=rowData(sce_data)$Symbol[t])
plot2+ggrepel::geom_text_repel(data=td,aes(label=symbol))+ylab("Variance of log-expression")+xlab("Mean of log-expression")Dimension Reduction
PCA
Seurat
all.genes <- rownames(seurat_data)
seurat_data <- ScaleData(seurat_data, features = all.genes)Centering and scaling data matrix
seurat_data <- RunPCA(seurat_data,
features = VariableFeatures(object = seurat_data),
verbose = F,npcs = 50)
DimPlot(seurat_data,reduction = "pca",)ElbowPlot(seurat_data)stdev_pca_seurat<-seurat_data@reductions$pca@stdev
findPC(sdev = stdev_pca_seurat,number = c(30,40,50),method = 'all',figure = T) Piecewise linear model First derivative Second derivative
30PCs 3 6 6
40PCs 3 11 6
50PCs 4 15 11
Preceding residual Perpendicular line K-means clustering
30PCs 6 6 2
40PCs 6 6 6
50PCs 11 7 6
Bioconductor
sce_data <- runPCA(sce_data,ncomponents = 50)
plotPCA(sce_data)stdev_pca_sce <- attr(reducedDim(sce_data, "PCA"),
"varExplained") %>% sqrt()
plot(stdev_pca_sce)findPC(sdev = stdev_pca_sce,number = c(30,40,50),method = 'all',figure = T) Piecewise linear model First derivative Second derivative
30PCs 3 6 4
40PCs 3 9 4
50PCs 4 11 9
Preceding residual Perpendicular line K-means clustering
30PCs 6 6 2
40PCs 7 6 2
50PCs 11 7 2
I choose 11 PC.
TSNE
Seurat
seurat_data <- RunTSNE(seurat_data, dims = 1:11,reduction = "pca")
DimPlot(seurat_data, reduction = "tsne")sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11)
plotReducedDim(sce_data, dimred="TSNE")sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11,perplexity=5)
plotReducedDim(sce_data, dimred="TSNE")sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11,perplexity=20)
plotReducedDim(sce_data, dimred="TSNE")sce_data <- runTSNE(sce_data, dimred="PCA",n_dimred=11,perplexity=80)
plotReducedDim(sce_data, dimred="TSNE")UMAP
seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F)Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
DimPlot(seurat_data, reduction = "umap")seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F,n.neighbors = 5)
DimPlot(seurat_data, reduction = "umap")seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F,n.neighbors = 50)
DimPlot(seurat_data, reduction = "umap")seurat_data <- RunUMAP(seurat_data, dims = 1:11,reduction = "pca",verbose = F,n.neighbors = 100)
DimPlot(seurat_data, reduction = "umap")sce_data <- runUMAP(sce_data, dimred="PCA",n_dimred=11)
plotReducedDim(sce_data, dimred="UMAP")