SQ: Can RUN ALL beautifully, knit to html result in a “seurat_tutorial_v2_files” folder in WD, with all the images. Not an html file though

when Knit to html, comment the shinyCell chunk, otherwise it will not finish.

#load seurat package and dependencies

library(dplyr)
library(Seurat)
library(patchwork)
setwd("D:/BioinformaticsProjects/R Code/Seurat Tutorial")

load pbmc dataset

pbmc.data <- Read10X(data.dir = "D:/BioinformaticsProjects/R Code/Seurat Tutorial/filtered_gene_bc_matrices/hg19/")

dim(pbmc.data)
## [1] 32738  2700
pbmc.data[1:10,1:6]
## 10 x 6 sparse Matrix of class "dgCMatrix"
##               AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
## MIR1302-10                   .                .                .
## FAM138A                      .                .                .
## OR4F5                        .                .                .
## RP11-34P13.7                 .                .                .
## RP11-34P13.8                 .                .                .
## AL627309.1                   .                .                .
## RP11-34P13.14                .                .                .
## RP11-34P13.9                 .                .                .
## AP006222.2                   .                .                .
## RP4-669L17.10                .                .                .
##               AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1 AAACGCACTGGTAC-1
## MIR1302-10                   .                .                .
## FAM138A                      .                .                .
## OR4F5                        .                .                .
## RP11-34P13.7                 .                .                .
## RP11-34P13.8                 .                .                .
## AL627309.1                   .                .                .
## RP11-34P13.14                .                .                .
## RP11-34P13.9                 .                .                .
## AP006222.2                   .                .                .
## RP4-669L17.10                .                .                .

Create a Seurat object and filter data

pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
pbmc
## An object of class Seurat 
## 13714 features across 2700 samples within 1 assay 
## Active assay: RNA (13714 features, 0 variable features)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # common RBC genes for human blood sample
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)

VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

Filter and subset cells

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

normalize expression level

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

identify 2000 most variable genes/features, to be used for downstream analysis

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

top10 <- head(VariableFeatures(pbmc), 10)
top10
##  [1] "PPBP"   "LYZ"    "S100A9" "IGLL5"  "GNLY"   "FTL"    "PF4"    "FTH1"  
##  [9] "GNG11"  "S100A8"
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing missing values (geom_point).

plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Removed 1 rows containing missing values (geom_point).

# Preparation stage for PCA analyses, need to use ScaleData() to scale and center all data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt") # regress out the effects of mitochondria variation
## Regressing out percent.mt
## Centering and scaling data matrix

Linear dimention reduction (PCA), use most variable features by default

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP 
##     FCER1G, CFD, LGALS1, LGALS2, SERPINA1, S100A8, CTSS, IFITM3, SPI1, CFP 
##     PSAP, IFI30, COTL1, SAT1, S100A11, NPC2, GRN, LGALS3, GSTP1, PYCARD 
## Negative:  MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CTSW, STK17A, CD27 
##     CD247, CCL5, GIMAP5, GZMA, AQP3, CST7, TRAF3IP3, SELL, GZMK, HOPX 
##     MAL, MYC, ITM2A, ETS1, LYAR, GIMAP7, KLRG1, NKG7, ZAP70, BEX2 
## PC_ 2 
## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74 
##     HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB 
##     BLNK, P2RX5, IGLL5, IRF8, SWAP70, ARHGAP24, FCGR2B, SMIM14, PPP1R14A, C16orf74 
## Negative:  NKG7, PRF1, CST7, GZMA, GZMB, FGFBP2, CTSW, GNLY, B2M, SPON2 
##     CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX 
##     TTC38, CTSC, APMAP, S100A4, IGFBP7, ANXA1, ID2, IL32, XCL1, RHOC 
## PC_ 3 
## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1, HLA-DPB1, CD74, MS4A1, HLA-DRB1, HLA-DRA 
##     HLA-DRB5, HLA-DQA2, TCL1A, LINC00926, HLA-DMB, HLA-DMA, CD37, HVCN1, FCRLA, IRF8 
##     PLAC8, BLNK, MALAT1, SMIM14, PLD4, IGLL5, SWAP70, P2RX5, LAT2, FCGR3A 
## Negative:  PPBP, PF4, SDPR, SPARC, GNG11, NRGN, GP9, RGS18, TUBB1, CLU 
##     HIST1H2AC, AP001189.4, ITGA2B, CD9, TMEM40, PTCRA, CA2, ACRBP, MMD, TREML1 
##     NGFRAP1, F13A1, SEPT5, RUFY1, TSC22D1, CMTM5, MPP1, MYL9, RP11-367G6.3, GP1BA 
## PC_ 4 
## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1, CD74, HLA-DPB1, HIST1H2AC, HLA-DPA1, HLA-DRB1 
##     TCL1A, PF4, HLA-DQA2, SDPR, HLA-DRA, LINC00926, PPBP, GNG11, HLA-DRB5, SPARC 
##     GP9, PTCRA, CA2, AP001189.4, CD9, NRGN, RGS18, GZMB, CLU, TUBB1 
## Negative:  VIM, IL7R, S100A6, S100A8, IL32, S100A4, GIMAP7, S100A10, S100A9, MAL 
##     AQP3, CD14, CD2, LGALS2, FYB, GIMAP4, ANXA1, RBP7, CD27, FCN1 
##     LYZ, S100A12, MS4A6A, GIMAP5, S100A11, FOLR3, TRABD2A, AIF1, IL8, TMSB4X 
## PC_ 5 
## Positive:  GZMB, FGFBP2, S100A8, NKG7, GNLY, CCL4, PRF1, CST7, SPON2, GZMA 
##     GZMH, LGALS2, S100A9, CCL3, XCL2, CD14, CLIC3, CTSW, MS4A6A, GSTP1 
##     S100A12, RBP7, IGFBP7, FOLR3, AKR1C3, TYROBP, CCL5, TTC38, XCL1, APMAP 
## Negative:  LTB, IL7R, CKB, MS4A7, RP11-290F20.3, AQP3, SIGLEC10, VIM, CYTIP, HMOX1 
##     LILRB2, PTGES3, HN1, CD2, FAM110A, CD27, ANXA5, CTD-2006K23.1, MAL, VMO1 
##     CORO1B, TUBA1B, LILRA3, GDI2, TRADD, ATP1A1, IL32, ABRACL, CCDC109B, PPA1
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  CST3, TYROBP, LST1, AIF1, FTL 
## Negative:  MALAT1, LTB, IL32, IL7R, CD2 
## PC_ 2 
## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1 
## Negative:  NKG7, PRF1, CST7, GZMA, GZMB 
## PC_ 3 
## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1 
## Negative:  PPBP, PF4, SDPR, SPARC, GNG11 
## PC_ 4 
## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1 
## Negative:  VIM, IL7R, S100A6, S100A8, IL32 
## PC_ 5 
## Positive:  GZMB, FGFBP2, S100A8, NKG7, GNLY 
## Negative:  LTB, IL7R, CKB, MS4A7, RP11-290F20.3
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

make PCA dimension plots

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:5, cells = 500, balanced = TRUE)

# determine number of PCs in the dataset

pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
## Warning: Removed 23407 rows containing missing values (geom_point).

ElbowPlot(pbmc)

# cell clustering

pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2638
## Number of edges: 95893
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8735
## Number of communities: 9
## Elapsed time: 0 seconds
head(Idents(pbmc), 5)
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1 
##                0                3                2                1 
## AAACCGTGTATGCG-1 
##                6 
## Levels: 0 1 2 3 4 5 6 7 8
pbmc <- RunTSNE(pbmc, dims = 1:10)
DimPlot(pbmc,reduction = "tsne",label = TRUE,pt.size = 1.5)

pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:08:31 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:08:31 Read 2638 rows and found 10 numeric columns
## 10:08:31 Using Annoy for neighbor search, n_neighbors = 30
## 10:08:31 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:08:31 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\RtmpKoDCsM\file48bc6e614ea3
## 10:08:31 Searching Annoy index using 1 thread, search_k = 3000
## 10:08:32 Annoy recall = 100%
## 10:08:33 Commencing smooth kNN distance calibration using 1 thread
## 10:08:34 Initializing from normalized Laplacian + noise
## 10:08:34 Commencing optimization for 500 epochs, with 106338 positive edges
## 10:08:42 Optimization finished
DimPlot(pbmc,reduction = "umap",label = TRUE,pt.size = 1.5)

# score cell cycle

pbmc<- CellCycleScoring(object = pbmc, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
## Warning: The following features are not present in the object: DTL, UHRF1,
## MLF1IP, EXO1, CASP8AP2, BRIP1, E2F8, not searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, BUB1,
## HJURP, CDCA3, TTK, CDC25C, DLGAP5, CDCA2, ANLN, GAS2L3, not searching for symbol
## synonyms
head(x = pbmc@meta.data)
##                  orig.ident nCount_RNA nFeature_RNA percent.mt percent.HB
## AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759          0
## AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958          0
## AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363          0
## AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845          0
## AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898          0
## AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551          0
##                  RNA_snn_res.0.5 seurat_clusters     S.Score    G2M.Score Phase
## AAACATACAACCAC-1               0               0  0.09853841 -0.044716507     S
## AAACATTGAGCTAC-1               3               3 -0.02364305 -0.046889929    G1
## AAACATTGATCAGC-1               2               2 -0.02177266  0.074841537   G2M
## AAACCGTGCTTCCG-1               1               1  0.03794398  0.006575446     S
## AAACCGTGTATGCG-1               6               6 -0.03309970  0.027910063   G2M
## AAACGCACTGGTAC-1               2               2 -0.04814181 -0.078164839    G1
DimPlot(pbmc,reduction = "tsne",label = TRUE,group.by="Phase",pt.size = 1.5)

# inspect and save UMI, Standardize and normalize data

write.table(pbmc[["RNA"]]@counts,"./umi.xls",sep="\t") # 原始UMI Original UMI
write.table(pbmc[["RNA"]]@data,"./data.xls",sep="\t")#标准化的data,进行了LogNormalize
write.table(pbmc[["RNA"]]@scale.data,"./scale.data.xls",sep="\t")#归一化的scale.data,有正负

save results

saveRDS(pbmc, file = "./pbmc_tutorial.rds")
save(pbmc,file="./res0.5.Robj")

double finder

library(DoubletFinder)
sweep.res.list_pbmc <- paramSweep_v3(pbmc, PCs = 1:10, sct = FALSE)
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
sweep.stats_pbmc <- summarizeSweep(sweep.res.list_pbmc, GT = FALSE)
bcmvn_pbmc <- find.pK(sweep.stats_pbmc)

## NULL
opt_pK <- as.numeric(as.vector(bcmvn_pbmc$pK[which.max(bcmvn_pbmc$BCmetric)]))
print(opt_pK)
## [1] 0.01
annotations <- pbmc@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations)           
nExp_poi <- round(0.06*nrow(pbmc@meta.data))  
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

# find doublets
pbmc <- doubletFinder_v3(pbmc,PCs = 1:10,pN = 0.25,pK = opt_pK,nExp = nExp_poi,reuse.pANN = FALSE,sct = FALSE)
## [1] "Creating 879 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
DimPlot(pbmc, reduction = "tsne", group.by = "DF.classifications_0.25_0.01_158",order = T)

table(pbmc$DF.classifications_0.25_0.01_158)
## 
## Doublet Singlet 
##     158    2480

find markers for cluster 1

cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25) #SQ: marker has to be expressed in at least 25% of cluster 1 cells.
head(cluster1.markers, n = 5) # top 5 markers for cluster 1, that distinct from the other clusters
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## S100A9  0.000000e+00   5.570063 0.996 0.215  0.000000e+00
## S100A8  0.000000e+00   5.477394 0.975 0.121  0.000000e+00
## FCN1    0.000000e+00   3.394219 0.952 0.151  0.000000e+00
## LGALS2  0.000000e+00   3.800484 0.908 0.059  0.000000e+00
## CD14   2.856582e-294   2.815626 0.667 0.028 3.917516e-290

find markers for each cluster

pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
## # A tibble: 18 × 7
## # Groups:   cluster [9]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene    
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
##  1 1.88e-117       1.08 0.913 0.588 2.57e-113 0       LDHB    
##  2 5.01e- 85       1.34 0.437 0.108 6.88e- 81 0       CCR7    
##  3 0               5.57 0.996 0.215 0         1       S100A9  
##  4 0               5.48 0.975 0.121 0         1       S100A8  
##  5 1.93e- 80       1.27 0.981 0.65  2.65e- 76 2       LTB     
##  6 2.91e- 58       1.27 0.667 0.251 3.98e- 54 2       CD2     
##  7 0               4.31 0.939 0.042 0         3       CD79A   
##  8 1.06e-269       3.59 0.623 0.022 1.45e-265 3       TCL1A   
##  9 3.60e-221       3.21 0.984 0.226 4.93e-217 4       CCL5    
## 10 4.27e-176       3.01 0.573 0.05  5.85e-172 4       GZMK    
## 11 3.51e-184       3.31 0.975 0.134 4.82e-180 5       FCGR3A  
## 12 2.03e-125       3.09 1     0.315 2.78e-121 5       LST1    
## 13 3.17e-267       4.83 0.961 0.068 4.35e-263 6       GZMB    
## 14 1.04e-189       5.28 0.961 0.132 1.43e-185 6       GNLY    
## 15 1.48e-220       3.87 0.812 0.011 2.03e-216 7       FCER1A  
## 16 1.67e- 21       2.87 1     0.513 2.28e- 17 7       HLA-DPB1
## 17 7.73e-200       7.24 1     0.01  1.06e-195 8       PF4     
## 18 3.68e-110       8.58 1     0.024 5.05e-106 8       PPBP
##存储marker -save markers
write.table(pbmc.markers,file="./allmarker.txt",sep="\t")

SingleR for cell type identification

library(SingleR) 
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## 
## Attaching package: 'stats4'
## The following object is masked from 'package:spam':
## 
##     mle
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:spam':
## 
##     cbind, rbind
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:sp':
## 
##     %over%
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(celldex) 
## 
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData
refdata <- celldex::HumanPrimaryCellAtlasData()
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
refdata_ms <- celldex::MouseRNAseqData()
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
cellpred <- SingleR(test = pbmc@assays$RNA@data, ref = refdata, labels = refdata$label.fine, clusters = pbmc@active.ident, assay.type.test = "logcounts", assay.type.ref = "logcounts")

cellpred$labels
## [1] "T_cell:CD4+_central_memory" "Monocyte:CD16-"            
## [3] "T_cell:CD4+_central_memory" "B_cell:immature"           
## [5] "T_cell:CD8+"                "Monocyte:CD16+"            
## [7] "NK_cell"                    "Monocyte:CD16-"            
## [9] "Monocyte"
# output of calculated cell types
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = F)
write.table(celltype,"./celltype_singleR.xls",row.names = F,sep="\t")


#plot the calculated cell types

pbmc@meta.data$singleR=celltype[match(pbmc@active.ident,celltype$ClusterID),'celltype']
DimPlot(pbmc, reduction = "tsne", group.by = "singleR",label=T, label.size=5)

# collection of plotting codes

FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("gray", "red"))

##绘制Marker基因的小提琴图
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

##绘制分cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)#n = 10可以自行调整数字
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A

##绘制气泡图
DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))

##绘制RidgePlot
RidgePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14"), ncol = 2)

# subset a cluster for further sub-clustering

sub_pbmc<-subset(pbmc, idents = '0')
DimPlot(sub_pbmc, reduction = "tsne",label = TRUE, pt.size = 1.5)

##########对取出的cluster重新进行分群 - re-cluster the subset
sub_pbmc <- FindVariableFeatures(sub_pbmc, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(sub_pbmc)
sub_pbmc <- ScaleData(sub_pbmc, features = all.genes)
## Centering and scaling data matrix
sub_pbmc <- RunPCA(sub_pbmc, features = VariableFeatures(object = sub_pbmc))
## PC_ 1 
## Positive:  PRSS57, RAB13, CYTL1, SDPR, LAPTM4B, EGFL7, C19orf77, DEPTOR, FES, IL1B 
##     PTMS, GSN, RPL3, SOX4, RPS4X, FCER1A, KIAA0125, RPS2, AKR1C3, HLA-DRB5 
##     CNRIP1, LYL1, RPS5, CDK6, RPL10A, ACSM3, PABPC1, RP11-262H14.4, IGFBP7, CAT 
## Negative:  TMSB4X, MALAT1, B2M, IL32, CD3D, CCL5, TIGIT, CBR4, GZMK, GABPB1 
##     YTHDF1, LTB, CTC-444N24.11, MEPCE, UBR1, UNC93B1, CLASP2, F5, HLA-A, SH2D3A 
##     ATXN7L1, WDR91, CHP1, RP11-108M9.6, HIPK3, CLIC3, ABR, C5orf24, TRMT6, RNF185 
## PC_ 2 
## Positive:  RPL3, RPL19, RPS5, RPL10A, RPS3A, RPS12, RPL10, RPS2, RPS6, RPS4X 
##     RPL15, RPS8, RPS27A, RPL7, GLTSCR2, RPL13A, BTG1, EEF1D, NACA, TPT1 
##     TMEM66, PABPC1, RPL13, RPL18, RPL8, CD7, EVL, CD8B, NOSIP, S100B 
## Negative:  PRSS57, SDPR, CYTL1, S100A4, RAB13, IL1B, ANKRD28, LAPTM4B, CCL5, KLF6 
##     C19orf77, C5orf24, PTMS, CNRIP1, FES, EGFL7, HLA-DRB5, RP11-262H14.4, DEPTOR, CBR4 
##     TMSB4X, VGLL4, DDA1, BTK, AKR1C3, CXXC5, SOX4, RP1-43E13.2, FCER1A, CDK6 
## PC_ 3 
## Positive:  MALAT1, RPL13, RPL13A, RPS3A, RPS6, RPS4X, CD8B, RPS12, RPS18, RPL19 
##     RPS2, GTF2I, SDPR, CD8A, RPL10, RPS3, SPAG9, SARM1, IDH2, H1F0 
##     YBX3, PRSS57, RPL3, RP11-291B21.2, CTD-2260A17.2, RP11-620J15.3, RAB13, RC3H2, REG4, RPS27A 
## Negative:  FOS, JUNB, JUN, H3F3B, IER2, B2M, KLF6, EIF1, TMSB4X, ZFP36 
##     LTB, CXCR4, TMEM66, S100A4, HLA-A, IL7R, IL32, FTH1, RP11-1399P15.1, HLA-B 
##     IFITM2, HNRNPDL, YPEL5, RGS1, DHRS7, PLP2, PNRC1, KRT1, FXYD5, RARRES3 
## PC_ 4 
## Positive:  S100A4, RPL8, ALOX5AP, LMNA, RPS18, RPS7, TNFRSF4, HOPX, VIM, TNFRSF18 
##     RPS2, RPL13A, RPL10, SH3BGRL3, CTSH, RPS3, TRADD, CD99, FLNA, B2M 
##     KLRB1, RPS8, RPS4X, LTB, IFITM2, GSTK1, ACTB, KLRG1, FXYD5, GNB2L1 
## Negative:  MALAT1, JUNB, BTG1, AIF1, MT-CO2, RPL34, YPEL5, PCYOX1L, JUN, FOS 
##     TTC1, RPF2, SELL, DUSP22, GIMAP5, MT-ND2, EIF4A2, TRAF3IP3, FBXO11, DENND2D 
##     CXCR4, PIP4K2C, STK17A, NDFIP1, GIMAP4, LDLRAP1, MYB, TRMT2A, GPR160, AC016629.8 
## PC_ 5 
## Positive:  SLC40A1, LYRM5, ITM2A, ICOS, AQP3, MAL, ZNF264, BPNT1, GLTSCR2, SELL 
##     VAPA, IL23A, RAD54L2, ZFYVE28, RMDN2, DTWD2, KDM1B, RPS9, ABI2, CD82 
##     POLL, FAM213A, RP1-43E13.2, MICU1, ZNF444, RP11-66N11.8, USP38, ATP5SL, ZCCHC11, AEN 
## Negative:  CD8B, CD8A, RP11-291B21.2, CCL5, CARS, MT-CO2, MT-ND2, MT-CYB, SIL1, AC092580.4 
##     S100B, MT-CO3, DNAJA2, RPL34, ENO1, S100A4, DLEU1, C1orf228, ATP6V0E2, SH3BGRL3 
##     PLP2, B2M, ASB1, NFYB, ETAA1, CPNE2, RNF141, IL32, STK24, CST7
sub_pbmc <- FindNeighbors(sub_pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
sub_pbmc <- FindClusters(sub_pbmc, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 709
## Number of edges: 28401
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.4569
## Number of communities: 4
## Elapsed time: 0 seconds
sub_pbmc <- RunTSNE(sub_pbmc, dims = 1:10)
sub_pbmc <- RunUMAP(sub_pbmc, dims = 1:10)
## 10:15:19 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 10:15:19 Read 709 rows and found 10 numeric columns
## 10:15:19 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 10:15:19 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:15:19 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\RtmpKoDCsM\file48bc1e09395d
## 10:15:19 Searching Annoy index using 1 thread, search_k = 3000
## 10:15:20 Annoy recall = 100%
## 10:15:21 Commencing smooth kNN distance calibration using 1 thread
## 10:15:22 Initializing from normalized Laplacian + noise
## 10:15:23 Commencing optimization for 500 epochs, with 23594 positive edges
## 10:15:25 Optimization finished
DimPlot(sub_pbmc, reduction = "tsne",label = TRUE, pt.size = 1.5)

DimPlot(sub_pbmc, reduction = "umap",label = TRUE, pt.size = 1.5)

assign cell types according to markers for each cluster, whole cells

new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 1.5) + NoLegend()

write.table(Idents(pbmc),"./cell-type.xls",sep="\t",quote = F)

save results

saveRDS(pbmc, file = "./pbmc3k_final.rds")

additional plots

VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

##按照细胞类型绘制分cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A

##按照细胞类型绘制气泡图
DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))

##按照细胞类型绘制RidgePlot
RidgePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14"), ncol = 2)
## Picking joint bandwidth of 0.0234
## Picking joint bandwidth of 0.0831
## Picking joint bandwidth of 0.126
## Picking joint bandwidth of 0.0337

# For the next step Cellphonedb analysis

###count文件生成 - generating count file
count_raw <- pbmc[["RNA"]]@counts
count_norm <- apply(count_raw, 2, function(x) (x/sum(x))*10000)
write.table(count_norm, './cellphonedb_count.txt', sep ='\t', quote = F)

###meta文件生成 -generating meta file
write.table(Idents(pbmc),"./cellphonedb_meta.txt",sep="\t",quote = F)

visualization using shinnyCell

Do not knit this block, will not finish.

#library(ShinyCell)

#seurat_integrated = readRDS("pbmc3k_final.RDS")
#scConf = createConfig(seurat_integrated)
#makeShinyApp(seurat_integrated, scConf, gene.mapping = TRUE,
 #            shiny.title = "pbmc3k visualization") 

#setwd("./shinyApp") # verify change of dir in console using getwd()

#shiny::runApp()