Single Cell RNA-seq with R

In single cell(sc) data analysis, R is the most welcomed programming language, which provides the user with multiply choices of packages, and these packages are Seurat, Scater and Monocle, mastering one of these three packages means you can analyse any sc data with your own purpose.
In this markdown, I will represent you the usage and workflow of R package Seurat, from data loading to data visualization.

Package Installation

  # make sure you choose the nearest mirror in your place
  setwd('~/Documents/singleCell/scRNA_smart_seq2/')
  options()$BioC_mirror
## NULL
  options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor")
  #BiocManager::install('scRNAseq', update = F, ask = F)
  #BiocManager::install('Seurat', update = F, ask = F)
  #install.packages('tidyverse')
  #install.packages('ggpubr')
  #install.packages('cowplot')
  library(scRNAseq)
  library(tidyverse)
  library(Seurat)
  library(ggpubr)
  library(cowplot)

Load Single Cell Data and Have a glimpse

Demo data is loaded from package scRNAseq, which includes 130 samples and metadata of it.

  data(fluidigm)
  ct <- floor(assays(fluidigm)$rsem_counts)
  sample_cnn <- colData(fluidigm) %>% as.data.frame()
  dim(ct)
## [1] 26255   130
  ct[1:4, 1:4]
##          SRR1275356 SRR1274090 SRR1275251 SRR1275287
## A1BG              0          0          0          0
## A1BG-AS1          0          0          0          0
## A1CF              0          0          0          0
## A2M               0          0          0         33
  DT::datatable(sample_cnn[, 1:6])

Explore sample metadata

  lapply(sample_cnn %>% select_if(is.numeric) %>% colnames(), function(x){
    dat <-  sample_cnn[,x,drop=F] 
    dat$sample <- rownames(dat)
    dat$group='all cells'
    p <- ggboxplot(dat, x = "group", y = x, 
                 add = "jitter" )
    return(p)
  }) %>% cowplot::plot_grid(plotlist=., ncol=5)

Workflow of Seurat

Step1: Ceate Seurat Object

  pbmc <- CreateSeuratObject(counts  = ct,
                                 meta.data =sample_cnn,
                                 min.cells = 3,
                                 min.features = 200,
                                 project = "Pollen")
  pbmc
## An object of class Seurat 
## 14656 features across 130 samples within 1 assay 
## Active assay: RNA (14656 features, 0 variable features)

Step2: Quality Control

  pbmc[["percent.mt"]] <- PercentageFeatureSet(object = pbmc, pattern = "^mt-")
  VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA","percent.mt"),
        group.by = 'Biological_Condition', ncol = 3)

  plot1 <-FeatureScatter(object = pbmc, feature1 = "nCount_RNA", 
                       feature2 = "percent.mt",group.by = 'Biological_Condition')
  plot2 <-FeatureScatter(object = pbmc, feature1 = "nCount_RNA", 
                       feature2 = "nFeature_RNA",group.by = 'Biological_Condition')
  CombinePlots(plots = list(plot1, plot2))

  rm(plot1, plot2)

Step3: Normalization of Gene Expression

  pbmc<-NormalizeData(object = pbmc, normalization.method= "LogNormalize", 
                    scale.factor= 10000, verbose = FALSE)
  pbmc <- ScaleData(object = pbmc, verbose = F)

Step4: Eliminate Distrubing Factors

  pbmc <- ScaleData(object = pbmc, verbose = F)

# 提取差异明显的基因
  pbmc <- FindVariableFeatures(object = pbmc, nfeatures = 2000)
  p1 <- VariableFeaturePlot(pbmc)
  p2 <- LabelPoints(plot = p1, points = head(VariableFeatures(pbmc), 10), 
                    repel = TRUE)
  CombinePlots(plots = list(p1, p2),legend="bottom")

  rm(p1, p2)

Step5: Dimension Reduction

  pbmc <- RunPCA(object = pbmc, verbose = F)
  pbmc <- RunTSNE(object = pbmc)
  p1<-PCAPlot(pbmc, group.by='Biological_Condition')
  p2<-TSNEPlot(pbmc, group.by='Biological_Condition')
  cowplot::plot_grid(p1,p2,ncol = 2,labels = c('A', 'B'))

  PCHeatmap(pbmc, dims=1:3)

  VizDimLoadings(pbmc)

  print(pbmc)
## An object of class Seurat 
## 14656 features across 130 samples within 1 assay 
## Active assay: RNA (14656 features, 2000 variable features)
##  2 dimensional reductions calculated: pca, tsne
  rm(p1, p2)

Step6: Visualization of Reduction

  pbmc <- JackStraw(pbmc, num.replicate = 100, verbose = F)
  pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
  plot1<-JackStrawPlot(pbmc, dims = 1:15)
  plot2<-ElbowPlot(pbmc, ndims = 100)
  CombinePlots(plots = list(plot1, plot2),legend="bottom")

  rm(plot1, plot2)
  
  #确定PC阈值的三个标准:
  #(1) 主成分累积贡献大于90%; 
  #(2) PC本身对方差贡献小于5%; 
  #(3) 两个连续PCs之间差异小于0.1%
  pct <- (pbmc[['pca']]@stdev / sum(pbmc[['pca']]@stdev))*100
  which(cumsum(pct) > 90 & pct < 5)[1]
## [1] 45
  sort(which((pct[1:length(pct) -1] -pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
## [1] 4

Step7: Clustering

  pbmc <- FindNeighbors(object = pbmc, reduction = 'pca', dims = 1:45)
  pbmc <- FindClusters(object = pbmc, resolution = 0.9, verbose=F)

Step8: Visualization of Clustering

  pbmc<-BuildClusterTree(pbmc)
  PlotClusterTree(pbmc)

  pbmc<- RunUMAP(object = pbmc, reduction='pca',dims = 1:45)
  pbmc <- RunTSNE(object = pbmc, reduction = 'pca', dims = 1:45, do.fast=T)
  DimPlot(pbmc, reduction = "pca") 

  DimPlot(pbmc, reduction = "tsne")

  DimPlot(pbmc, reduction = "umap") 

step9: Find Markers of clusters

  marker.all <- FindAllMarkers(object = pbmc, only.pos = T, 
                             min.pct = 0.25, thresh.use = 0.25, verbose = F)
  ggplot(marker.all, aes(x=avg_logFC, y=-log10(p_val_adj), color=cluster))+
    geom_point()+theme_bw()+theme(panel.grid = element_line(color=NA),
                                legend.position = 'none')+
    ggrepel::geom_label_repel(aes(label=if_else(avg_logFC >= 2, gene , '')))

  top10 <- marker.all %>% group_by(cluster) %>% top_n(10, wt=avg_logFC)
  DoHeatmap(object = pbmc, features = unique(top10$gene))+NoLegend()

  DotPlot(object = pbmc, features = unique(top10$gene))+RotatedAxis()

  FeaturePlot(pbmc, features = top10$gene,min.cutoff = 0, max.cutoff = 4)

Step10: Calculation

table(pbmc@active.ident)
## 
##  0  1 
## 70 60
table(marker.all$cluster)
## 
##    0    1 
##  506 1359
dim(pbmc@meta.data) 
## [1] 130  34
table(pbmc@active.ident)
## 
##  0  1 
## 70 60
table(marker.all$cluster)
## 
##    0    1 
##  506 1359

Package Version & Environment

  sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] cowplot_1.0.0               ggpubr_0.3.0               
##  [3] Seurat_3.1.5                forcats_0.5.0              
##  [5] stringr_1.4.0               dplyr_1.0.0                
##  [7] purrr_0.3.4                 readr_1.3.1                
##  [9] tidyr_1.1.0                 tibble_3.0.1               
## [11] ggplot2_3.3.2               tidyverse_1.3.0            
## [13] scRNAseq_1.8.0              SummarizedExperiment_1.12.0
## [15] DelayedArray_0.8.0          BiocParallel_1.16.6        
## [17] matrixStats_0.56.0          Biobase_2.42.0             
## [19] GenomicRanges_1.34.0        GenomeInfoDb_1.18.2        
## [21] IRanges_2.16.0              S4Vectors_0.20.1           
## [23] BiocGenerics_0.28.0        
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        plyr_1.8.6            
##   [4] igraph_1.2.5           lazyeval_0.2.2         splines_3.5.0         
##   [7] crosstalk_1.1.0.1      listenv_0.8.0          digest_0.6.25         
##  [10] htmltools_0.5.0        gdata_2.18.0           fansi_0.4.1           
##  [13] magrittr_1.5           cluster_2.1.0          ROCR_1.0-7            
##  [16] limma_3.38.3           openxlsx_4.1.5         globals_0.12.5        
##  [19] modelr_0.1.8           colorspace_1.4-1       blob_1.2.1            
##  [22] rvest_0.3.5            ggrepel_0.8.2          haven_2.3.1           
##  [25] xfun_0.15              crayon_1.3.4           RCurl_1.98-1.2        
##  [28] jsonlite_1.7.0         survival_3.2-3         zoo_1.8-8             
##  [31] ape_5.3                glue_1.4.1             gtable_0.3.0          
##  [34] zlibbioc_1.28.0        XVector_0.22.0         leiden_0.3.3          
##  [37] car_3.0-8              future.apply_1.6.0     abind_1.4-5           
##  [40] scales_1.1.1           DBI_1.1.0              rstatix_0.6.0         
##  [43] Rcpp_1.0.4.6           viridisLite_0.3.0      reticulate_1.16       
##  [46] foreign_0.8-74         rsvd_1.0.3             DT_0.14               
##  [49] tsne_0.1-3             htmlwidgets_1.5.1      httr_1.4.1            
##  [52] gplots_3.0.3           RColorBrewer_1.1-2     ellipsis_0.3.1        
##  [55] ica_1.0-2              farver_2.0.3           pkgconfig_2.0.3       
##  [58] uwot_0.1.8             dbplyr_1.4.4           labeling_0.3          
##  [61] tidyselect_1.1.0       rlang_0.4.6            reshape2_1.4.4        
##  [64] munsell_0.5.0          cellranger_1.1.0       tools_3.5.0           
##  [67] cli_2.0.2              generics_0.0.2         broom_0.5.6           
##  [70] ggridges_0.5.2         evaluate_0.14          yaml_2.2.1            
##  [73] knitr_1.29             fs_1.4.1               fitdistrplus_1.1-1    
##  [76] zip_2.0.4              caTools_1.17.1.3       RANN_2.6.1            
##  [79] pbapply_1.4-2          future_1.17.0          nlme_3.1-145          
##  [82] xml2_1.3.2             compiler_3.5.0         rstudioapi_0.11       
##  [85] plotly_4.9.2.1         curl_4.3               png_0.1-7             
##  [88] ggsignif_0.6.0         reprex_0.3.0           stringi_1.4.6         
##  [91] RSpectra_0.16-0        lattice_0.20-41        Matrix_1.2-18         
##  [94] vctrs_0.3.1            pillar_1.4.4           lifecycle_0.2.0       
##  [97] lmtest_0.9-37          RcppAnnoy_0.0.16       data.table_1.12.8     
## [100] bitops_1.0-6           irlba_2.3.3            patchwork_1.0.1       
## [103] R6_2.4.1               KernSmooth_2.23-16     gridExtra_2.3         
## [106] rio_0.5.16             codetools_0.2-16       MASS_7.3-51.6         
## [109] gtools_3.8.2           assertthat_0.2.1       withr_2.2.0           
## [112] sctransform_0.2.1      GenomeInfoDbData_1.2.0 hms_0.5.3             
## [115] grid_3.5.0             rmarkdown_2.3          carData_3.0-4         
## [118] Rtsne_0.15             lubridate_1.7.9