R Markdown

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
setwd("/Users/raymondotoo/Desktop/scRNA/seuratTutorial")

##General Pipeline overview

#######################################1. Setup the Seurat Object ##########################################
#######################################2. Standard pre-processing workflow #################################
#######################################3. Normalizing the data #############################################
#######################################4. Identification of highly variable features (feature selection) ###
#######################################5. Scaling the data #################################################
#######################################6. Perform linear dimensional reduction #############################
#######################################7. Determine the ‘dimensionality’ of the dataset ####################
#######################################8. Cluster the cells ################################################
#######################################9. Run non-linear dimensional reduction (UMAP/tSNE)##################
#######################################10. Finding differentially expressed features (cluster biomarkers)###
#######################################11.Assigning cell type identity to clusters##########################
# Load the PBMC dataset
pbmc.data <- Read10X("filtered_gene_bc_matrices/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
pbmc
## An object of class Seurat 
## 13714 features across 2700 samples within 1 assay 
## Active assay: RNA (13714 features, 0 variable features)
# Lets examine a few genes in the first thirty cells
pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30]
## 3 x 30 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 30 column names 'AAACATACAACCAC-1', 'AAACATTGAGCTAC-1', 'AAACATTGATCAGC-1' ... ]]
##                                                                    
## CD3D  4 . 10 . . 1 2 3 1 . . 2 7 1 . . 1 3 . 2  3 . . . . . 3 4 1 5
## TCL1A . .  . . . . . . 1 . . . . . . . . . . .  . 1 . . . . . . . .
## MS4A1 . 6  . . . . . . 1 1 1 . . . . . . . . . 36 1 2 . . 2 . . . .
dense.size <- object.size(as.matrix(pbmc.data))
dense.size
## 709591472 bytes
sparse.size <- object.size(pbmc.data)
sparse.size
## 29905192 bytes
dense.size/sparse.size
## 23.7 bytes

##Standard pre-processing workflow

#QC and selecting cells for further analysis
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Show QC metrics for the first 5 cells
head(pbmc@meta.data, 5)
##                  orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759
## AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958
## AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363
## AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845
## AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

##Normalizing the data

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

##Identification of highly variable features (feature selection)

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing missing values (geom_point).

##Scaling the data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- ScaleData(pbmc)
## Centering and scaling data matrix
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
## Regressing out percent.mt
## Centering and scaling data matrix

##Perform linear dimensional reduction

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP 
##     FCER1G, CFD, LGALS1, LGALS2, SERPINA1, S100A8, CTSS, IFITM3, SPI1, CFP 
##     PSAP, IFI30, COTL1, SAT1, S100A11, NPC2, GRN, LGALS3, GSTP1, PYCARD 
## Negative:  MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CTSW, STK17A, CD27 
##     CD247, CCL5, GIMAP5, GZMA, AQP3, CST7, TRAF3IP3, SELL, GZMK, HOPX 
##     MAL, MYC, ITM2A, ETS1, LYAR, GIMAP7, KLRG1, NKG7, ZAP70, BEX2 
## PC_ 2 
## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74 
##     HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB 
##     BLNK, P2RX5, IGLL5, IRF8, SWAP70, ARHGAP24, FCGR2B, SMIM14, PPP1R14A, C16orf74 
## Negative:  NKG7, PRF1, CST7, GZMA, GZMB, FGFBP2, CTSW, GNLY, B2M, SPON2 
##     CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX 
##     TTC38, CTSC, APMAP, S100A4, IGFBP7, ANXA1, ID2, IL32, XCL1, RHOC 
## PC_ 3 
## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1, HLA-DPB1, CD74, MS4A1, HLA-DRB1, HLA-DRA 
##     HLA-DRB5, HLA-DQA2, TCL1A, LINC00926, HLA-DMB, HLA-DMA, CD37, HVCN1, FCRLA, IRF8 
##     PLAC8, BLNK, MALAT1, SMIM14, PLD4, IGLL5, SWAP70, P2RX5, LAT2, FCGR3A 
## Negative:  PPBP, PF4, SDPR, SPARC, GNG11, NRGN, GP9, RGS18, TUBB1, CLU 
##     HIST1H2AC, AP001189.4, ITGA2B, CD9, TMEM40, PTCRA, CA2, ACRBP, MMD, TREML1 
##     NGFRAP1, F13A1, SEPT5, RUFY1, TSC22D1, CMTM5, MPP1, MYL9, RP11-367G6.3, GP1BA 
## PC_ 4 
## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1, CD74, HLA-DPB1, HIST1H2AC, HLA-DPA1, HLA-DRB1 
##     TCL1A, PF4, HLA-DQA2, SDPR, HLA-DRA, LINC00926, PPBP, GNG11, HLA-DRB5, SPARC 
##     GP9, PTCRA, CA2, AP001189.4, CD9, NRGN, RGS18, GZMB, CLU, TUBB1 
## Negative:  VIM, IL7R, S100A6, S100A8, IL32, S100A4, GIMAP7, S100A10, S100A9, MAL 
##     AQP3, CD14, CD2, LGALS2, FYB, GIMAP4, ANXA1, RBP7, CD27, FCN1 
##     LYZ, S100A12, MS4A6A, GIMAP5, S100A11, FOLR3, TRABD2A, AIF1, IL8, TMSB4X 
## PC_ 5 
## Positive:  GZMB, FGFBP2, S100A8, NKG7, GNLY, CCL4, PRF1, CST7, SPON2, GZMA 
##     GZMH, LGALS2, S100A9, CCL3, XCL2, CD14, CLIC3, CTSW, MS4A6A, GSTP1 
##     S100A12, RBP7, IGFBP7, FOLR3, AKR1C3, TYROBP, CCL5, TTC38, XCL1, APMAP 
## Negative:  LTB, IL7R, CKB, MS4A7, RP11-290F20.3, AQP3, SIGLEC10, VIM, CYTIP, HMOX1 
##     LILRB2, PTGES3, HN1, CD2, FAM110A, CD27, ANXA5, CTD-2006K23.1, MAL, VMO1 
##     CORO1B, TUBA1B, LILRA3, GDI2, TRADD, ATP1A1, IL32, ABRACL, CCDC109B, PPA1
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  CST3, TYROBP, LST1, AIF1, FTL 
## Negative:  MALAT1, LTB, IL32, IL7R, CD2 
## PC_ 2 
## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1 
## Negative:  NKG7, PRF1, CST7, GZMA, GZMB 
## PC_ 3 
## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1 
## Negative:  PPBP, PF4, SDPR, SPARC, GNG11 
## PC_ 4 
## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1 
## Negative:  VIM, IL7R, S100A6, S100A8, IL32 
## PC_ 5 
## Positive:  GZMB, FGFBP2, S100A8, NKG7, GNLY 
## Negative:  LTB, IL7R, CKB, MS4A7, RP11-290F20.3
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

##Determine the ‘dimensionality’ of the dataset

# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
## Warning: Removed 23407 rows containing missing values (geom_point).

ElbowPlot(pbmc)

##Cluster the cells

pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2638
## Number of edges: 95893
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8735
## Number of communities: 9
## Elapsed time: 0 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1 
##                0                3                2                1 
## AAACCGTGTATGCG-1 
##                6 
## Levels: 0 1 2 3 4 5 6 7 8

##Run non-linear dimensional reduction (UMAP/tSNE)

# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 19:32:18 UMAP embedding parameters a = 0.9922 b = 1.112
## 19:32:18 Read 2638 rows and found 10 numeric columns
## 19:32:18 Using Annoy for neighbor search, n_neighbors = 30
## 19:32:18 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:32:18 Writing NN index file to temp file /var/folders/60/239m7ccs0ksbbl7n3b9s1cqh0000gn/T//RtmpVRYtrJ/file1549f519f2a4
## 19:32:18 Searching Annoy index using 1 thread, search_k = 3000
## 19:32:19 Annoy recall = 100%
## 19:32:19 Commencing smooth kNN distance calibration using 1 thread
## 19:32:19 Initializing from normalized Laplacian + noise
## 19:32:20 Commencing optimization for 500 epochs, with 106338 positive edges
## 19:32:24 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "umap")

##You can save the object at this point so that it can easily be loaded back in without having to rerun the computationally intensive steps performed above, or easily shared with collaborators.

#NB saves in your current set working directory
saveRDS(pbmc, file = "pbmc_tutorial.rds")
##############Finding differentially expressed features (cluster biomarkers)###############
# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session
head(cluster2.markers, n = 5)
##             p_val avg_log2FC pct.1 pct.2    p_val_adj
## IL32 1.616287e-80  1.1350656 0.951 0.475 2.216576e-76
## LTB  1.931941e-80  1.2727422 0.981 0.650 2.649464e-76
## LDHB 5.248695e-65  0.9390949 0.967 0.618 7.198060e-61
## CD3D 1.379097e-61  0.8797194 0.916 0.443 1.891293e-57
## IL7R 7.116417e-60  1.1636337 0.748 0.335 9.759454e-56
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
##                       p_val avg_log2FC pct.1 pct.2     p_val_adj
## FCGR3A        8.331882e-208   4.261784 0.975 0.040 1.142634e-203
## CFD           1.932644e-198   3.423863 0.938 0.036 2.650429e-194
## IFITM3        2.710023e-198   3.876058 0.975 0.049 3.716525e-194
## CD68          1.069778e-193   3.013656 0.926 0.035 1.467094e-189
## RP11-290F20.3 4.218926e-190   2.722303 0.840 0.016 5.785835e-186
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
pbmc.markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 18 × 7
## # Groups:   cluster [9]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene    
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
##  1 5.01e- 85       1.34 0.437 0.108 6.88e- 81 0       CCR7    
##  2 1.88e-117       1.08 0.913 0.588 2.57e-113 0       LDHB    
##  3 0               5.57 0.996 0.215 0         1       S100A9  
##  4 0               5.48 0.975 0.121 0         1       S100A8  
##  5 1.93e- 80       1.27 0.981 0.65  2.65e- 76 2       LTB     
##  6 2.91e- 58       1.27 0.667 0.251 3.98e- 54 2       CD2     
##  7 0               4.31 0.939 0.042 0         3       CD79A   
##  8 1.06e-269       3.59 0.623 0.022 1.45e-265 3       TCL1A   
##  9 3.60e-221       3.21 0.984 0.226 4.93e-217 4       CCL5    
## 10 4.27e-176       3.01 0.573 0.05  5.85e-172 4       GZMK    
## 11 3.51e-184       3.31 0.975 0.134 4.82e-180 5       FCGR3A  
## 12 2.03e-125       3.09 1     0.315 2.78e-121 5       LST1    
## 13 1.04e-189       5.28 0.961 0.132 1.43e-185 6       GNLY    
## 14 3.17e-267       4.83 0.961 0.068 4.35e-263 6       GZMB    
## 15 1.48e-220       3.87 0.812 0.011 2.03e-216 7       FCER1A  
## 16 1.67e- 21       2.87 1     0.513 2.28e- 17 7       HLA-DPB1
## 17 3.68e-110       8.58 1     0.024 5.05e-106 8       PPBP    
## 18 7.73e-200       7.24 1     0.01  1.06e-195 8       PF4
cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

# you can plot raw counts as well
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"))

pbmc.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A

##Assigning cell type identity to clusters

new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
    "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

saveRDS(pbmc, file = "pbmc3k_final.rds")
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] patchwork_1.1.1    SeuratObject_4.0.4 Seurat_4.1.0       dplyr_1.0.8       
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_2.0-3      deldir_1.0-6         
##   [4] ellipsis_0.3.2        ggridges_0.5.3        rstudioapi_0.13      
##   [7] spatstat.data_2.1-4   farver_2.1.0          leiden_0.3.9         
##  [10] listenv_0.8.0         ggrepel_0.9.1         RSpectra_0.16-0      
##  [13] fansi_1.0.3           codetools_0.2-18      splines_4.1.3        
##  [16] knitr_1.38            polyclip_1.10-0       jsonlite_1.8.0       
##  [19] ica_1.0-2             cluster_2.1.3         png_0.1-7            
##  [22] uwot_0.1.11           shiny_1.7.1           sctransform_0.3.3    
##  [25] spatstat.sparse_2.1-0 compiler_4.1.3        httr_1.4.2           
##  [28] Matrix_1.4-1          fastmap_1.1.0         lazyeval_0.2.2       
##  [31] cli_3.2.0             later_1.3.0           htmltools_0.5.2      
##  [34] tools_4.1.3           igraph_1.3.0          gtable_0.3.0         
##  [37] glue_1.6.2            RANN_2.6.1            reshape2_1.4.4       
##  [40] Rcpp_1.0.8.3          scattermore_0.8       jquerylib_0.1.4      
##  [43] vctrs_0.4.0           nlme_3.1-157          lmtest_0.9-40        
##  [46] spatstat.random_2.2-0 xfun_0.30             stringr_1.4.0        
##  [49] globals_0.14.0        mime_0.12             miniUI_0.1.1.1       
##  [52] lifecycle_1.0.1       irlba_2.3.5           goftest_1.2-3        
##  [55] future_1.24.0         MASS_7.3-56           zoo_1.8-9            
##  [58] scales_1.1.1          spatstat.core_2.4-2   promises_1.2.0.1     
##  [61] spatstat.utils_2.3-0  parallel_4.1.3        RColorBrewer_1.1-2   
##  [64] yaml_2.3.5            reticulate_1.24       pbapply_1.5-0        
##  [67] gridExtra_2.3         ggplot2_3.3.5         sass_0.4.1           
##  [70] rpart_4.1.16          stringi_1.7.6         highr_0.9            
##  [73] rlang_1.0.2           pkgconfig_2.0.3       matrixStats_0.61.0   
##  [76] evaluate_0.15         lattice_0.20-45       ROCR_1.0-11          
##  [79] purrr_0.3.4           tensor_1.5            labeling_0.4.2       
##  [82] htmlwidgets_1.5.4     cowplot_1.1.1         tidyselect_1.1.2     
##  [85] parallelly_1.30.0     RcppAnnoy_0.0.19      plyr_1.8.7           
##  [88] magrittr_2.0.3        R6_2.5.1              generics_0.1.2       
##  [91] DBI_1.1.2             withr_2.5.0           pillar_1.7.0         
##  [94] mgcv_1.8-40           fitdistrplus_1.1-8    survival_3.3-1       
##  [97] abind_1.4-5           tibble_3.1.6          future.apply_1.8.1   
## [100] crayon_1.5.1          KernSmooth_2.23-20    utf8_1.2.2           
## [103] spatstat.geom_2.4-0   plotly_4.10.0         rmarkdown_2.13       
## [106] grid_4.1.3            data.table_1.14.2     digest_0.6.29        
## [109] xtable_1.8-4          tidyr_1.2.0           httpuv_1.6.5         
## [112] munsell_0.5.0         viridisLite_0.4.0     bslib_0.3.1