In single cell(sc) data analysis, R is the most welcomed programming language, which provides the user with multiply choices of packages, and these packages are Seurat, Scater and Monocle, mastering one of these three packages means you can analyse any sc data with your own purpose.
In this markdown, I will represent you the usage and workflow of R package Seurat, from data loading to data visualization.
# make sure you choose the nearest mirror in your place
setwd('~/Documents/singleCell/scRNA_smart_seq2/')
options()$BioC_mirror
## NULL
options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor")
#BiocManager::install('scRNAseq', update = F, ask = F)
#BiocManager::install('Seurat', update = F, ask = F)
#install.packages('tidyverse')
#install.packages('ggpubr')
#install.packages('cowplot')
library(scRNAseq)
library(tidyverse)
library(Seurat)
library(ggpubr)
library(cowplot)
Demo data is loaded from package scRNAseq, which includes 130 samples and metadata of it.
data(fluidigm)
ct <- floor(assays(fluidigm)$rsem_counts)
sample_cnn <- colData(fluidigm) %>% as.data.frame()
dim(ct)
## [1] 26255 130
ct[1:4, 1:4]
## SRR1275356 SRR1274090 SRR1275251 SRR1275287
## A1BG 0 0 0 0
## A1BG-AS1 0 0 0 0
## A1CF 0 0 0 0
## A2M 0 0 0 33
DT::datatable(sample_cnn[, 1:6])
lapply(sample_cnn %>% select_if(is.numeric) %>% colnames(), function(x){
dat <- sample_cnn[,x,drop=F]
dat$sample <- rownames(dat)
dat$group='all cells'
p <- ggboxplot(dat, x = "group", y = x,
add = "jitter" )
return(p)
}) %>% cowplot::plot_grid(plotlist=., ncol=5)
pbmc <- CreateSeuratObject(counts = ct,
meta.data =sample_cnn,
min.cells = 3,
min.features = 200,
project = "Pollen")
pbmc
## An object of class Seurat
## 14656 features across 130 samples within 1 assay
## Active assay: RNA (14656 features, 0 variable features)
pbmc[["percent.mt"]] <- PercentageFeatureSet(object = pbmc, pattern = "^mt-")
VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA","percent.mt"),
group.by = 'Biological_Condition', ncol = 3)
plot1 <-FeatureScatter(object = pbmc, feature1 = "nCount_RNA",
feature2 = "percent.mt",group.by = 'Biological_Condition')
plot2 <-FeatureScatter(object = pbmc, feature1 = "nCount_RNA",
feature2 = "nFeature_RNA",group.by = 'Biological_Condition')
CombinePlots(plots = list(plot1, plot2))
rm(plot1, plot2)
pbmc<-NormalizeData(object = pbmc, normalization.method= "LogNormalize",
scale.factor= 10000, verbose = FALSE)
pbmc <- ScaleData(object = pbmc, verbose = F)
pbmc <- ScaleData(object = pbmc, verbose = F)
# 提取差异明显的基因
pbmc <- FindVariableFeatures(object = pbmc, nfeatures = 2000)
p1 <- VariableFeaturePlot(pbmc)
p2 <- LabelPoints(plot = p1, points = head(VariableFeatures(pbmc), 10),
repel = TRUE)
CombinePlots(plots = list(p1, p2),legend="bottom")
rm(p1, p2)
pbmc <- RunPCA(object = pbmc, verbose = F)
pbmc <- RunTSNE(object = pbmc)
p1<-PCAPlot(pbmc, group.by='Biological_Condition')
p2<-TSNEPlot(pbmc, group.by='Biological_Condition')
cowplot::plot_grid(p1,p2,ncol = 2,labels = c('A', 'B'))
PCHeatmap(pbmc, dims=1:3)
VizDimLoadings(pbmc)
print(pbmc)
## An object of class Seurat
## 14656 features across 130 samples within 1 assay
## Active assay: RNA (14656 features, 2000 variable features)
## 2 dimensional reductions calculated: pca, tsne
rm(p1, p2)
pbmc <- JackStraw(pbmc, num.replicate = 100, verbose = F)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
plot1<-JackStrawPlot(pbmc, dims = 1:15)
plot2<-ElbowPlot(pbmc, ndims = 100)
CombinePlots(plots = list(plot1, plot2),legend="bottom")
rm(plot1, plot2)
#确定PC阈值的三个标准:
#(1) 主成分累积贡献大于90%;
#(2) PC本身对方差贡献小于5%;
#(3) 两个连续PCs之间差异小于0.1%
pct <- (pbmc[['pca']]@stdev / sum(pbmc[['pca']]@stdev))*100
which(cumsum(pct) > 90 & pct < 5)[1]
## [1] 45
sort(which((pct[1:length(pct) -1] -pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
## [1] 4
pbmc <- FindNeighbors(object = pbmc, reduction = 'pca', dims = 1:45)
pbmc <- FindClusters(object = pbmc, resolution = 0.9, verbose=F)
pbmc<-BuildClusterTree(pbmc)
PlotClusterTree(pbmc)
pbmc<- RunUMAP(object = pbmc, reduction='pca',dims = 1:45)
pbmc <- RunTSNE(object = pbmc, reduction = 'pca', dims = 1:45, do.fast=T)
DimPlot(pbmc, reduction = "pca")
DimPlot(pbmc, reduction = "tsne")
DimPlot(pbmc, reduction = "umap")
marker.all <- FindAllMarkers(object = pbmc, only.pos = T,
min.pct = 0.25, thresh.use = 0.25, verbose = F)
ggplot(marker.all, aes(x=avg_logFC, y=-log10(p_val_adj), color=cluster))+
geom_point()+theme_bw()+theme(panel.grid = element_line(color=NA),
legend.position = 'none')+
ggrepel::geom_label_repel(aes(label=if_else(avg_logFC >= 2, gene , '')))
top10 <- marker.all %>% group_by(cluster) %>% top_n(10, wt=avg_logFC)
DoHeatmap(object = pbmc, features = unique(top10$gene))+NoLegend()
DotPlot(object = pbmc, features = unique(top10$gene))+RotatedAxis()
FeaturePlot(pbmc, features = top10$gene,min.cutoff = 0, max.cutoff = 4)
table(pbmc@active.ident)
##
## 0 1
## 70 60
table(marker.all$cluster)
##
## 0 1
## 506 1359
dim(pbmc@meta.data)
## [1] 130 34
table(pbmc@active.ident)
##
## 0 1
## 70 60
table(marker.all$cluster)
##
## 0 1
## 506 1359
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] cowplot_1.0.0 ggpubr_0.3.0
## [3] Seurat_3.1.5 forcats_0.5.0
## [5] stringr_1.4.0 dplyr_1.0.0
## [7] purrr_0.3.4 readr_1.3.1
## [9] tidyr_1.1.0 tibble_3.0.1
## [11] ggplot2_3.3.2 tidyverse_1.3.0
## [13] scRNAseq_1.8.0 SummarizedExperiment_1.12.0
## [15] DelayedArray_0.8.0 BiocParallel_1.16.6
## [17] matrixStats_0.56.0 Biobase_2.42.0
## [19] GenomicRanges_1.34.0 GenomeInfoDb_1.18.2
## [21] IRanges_2.16.0 S4Vectors_0.20.1
## [23] BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 plyr_1.8.6
## [4] igraph_1.2.5 lazyeval_0.2.2 splines_3.5.0
## [7] crosstalk_1.1.0.1 listenv_0.8.0 digest_0.6.25
## [10] htmltools_0.5.0 gdata_2.18.0 fansi_0.4.1
## [13] magrittr_1.5 cluster_2.1.0 ROCR_1.0-7
## [16] limma_3.38.3 openxlsx_4.1.5 globals_0.12.5
## [19] modelr_0.1.8 colorspace_1.4-1 blob_1.2.1
## [22] rvest_0.3.5 ggrepel_0.8.2 haven_2.3.1
## [25] xfun_0.15 crayon_1.3.4 RCurl_1.98-1.2
## [28] jsonlite_1.7.0 survival_3.2-3 zoo_1.8-8
## [31] ape_5.3 glue_1.4.1 gtable_0.3.0
## [34] zlibbioc_1.28.0 XVector_0.22.0 leiden_0.3.3
## [37] car_3.0-8 future.apply_1.6.0 abind_1.4-5
## [40] scales_1.1.1 DBI_1.1.0 rstatix_0.6.0
## [43] Rcpp_1.0.4.6 viridisLite_0.3.0 reticulate_1.16
## [46] foreign_0.8-74 rsvd_1.0.3 DT_0.14
## [49] tsne_0.1-3 htmlwidgets_1.5.1 httr_1.4.1
## [52] gplots_3.0.3 RColorBrewer_1.1-2 ellipsis_0.3.1
## [55] ica_1.0-2 farver_2.0.3 pkgconfig_2.0.3
## [58] uwot_0.1.8 dbplyr_1.4.4 labeling_0.3
## [61] tidyselect_1.1.0 rlang_0.4.6 reshape2_1.4.4
## [64] munsell_0.5.0 cellranger_1.1.0 tools_3.5.0
## [67] cli_2.0.2 generics_0.0.2 broom_0.5.6
## [70] ggridges_0.5.2 evaluate_0.14 yaml_2.2.1
## [73] knitr_1.29 fs_1.4.1 fitdistrplus_1.1-1
## [76] zip_2.0.4 caTools_1.17.1.3 RANN_2.6.1
## [79] pbapply_1.4-2 future_1.17.0 nlme_3.1-145
## [82] xml2_1.3.2 compiler_3.5.0 rstudioapi_0.11
## [85] plotly_4.9.2.1 curl_4.3 png_0.1-7
## [88] ggsignif_0.6.0 reprex_0.3.0 stringi_1.4.6
## [91] RSpectra_0.16-0 lattice_0.20-41 Matrix_1.2-18
## [94] vctrs_0.3.1 pillar_1.4.4 lifecycle_0.2.0
## [97] lmtest_0.9-37 RcppAnnoy_0.0.16 data.table_1.12.8
## [100] bitops_1.0-6 irlba_2.3.3 patchwork_1.0.1
## [103] R6_2.4.1 KernSmooth_2.23-16 gridExtra_2.3
## [106] rio_0.5.16 codetools_0.2-16 MASS_7.3-51.6
## [109] gtools_3.8.2 assertthat_0.2.1 withr_2.2.0
## [112] sctransform_0.2.1 GenomeInfoDbData_1.2.0 hms_0.5.3
## [115] grid_3.5.0 rmarkdown_2.3 carData_3.0-4
## [118] Rtsne_0.15 lubridate_1.7.9