#load seurat package and dependencies
library(dplyr)
library(Seurat)
library(patchwork)
setwd("D:/BioinformaticsProjects/R Code/Seurat Tutorial")
load pbmc dataset
pbmc.data <- Read10X(data.dir = "D:/BioinformaticsProjects/R Code/Seurat Tutorial/filtered_gene_bc_matrices/hg19/")
dim(pbmc.data)
## [1] 32738 2700
pbmc.data[1:10,1:6]
## 10 x 6 sparse Matrix of class "dgCMatrix"
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
## MIR1302-10 . . .
## FAM138A . . .
## OR4F5 . . .
## RP11-34P13.7 . . .
## RP11-34P13.8 . . .
## AL627309.1 . . .
## RP11-34P13.14 . . .
## RP11-34P13.9 . . .
## AP006222.2 . . .
## RP4-669L17.10 . . .
## AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1 AAACGCACTGGTAC-1
## MIR1302-10 . . .
## FAM138A . . .
## OR4F5 . . .
## RP11-34P13.7 . . .
## RP11-34P13.8 . . .
## AL627309.1 . . .
## RP11-34P13.14 . . .
## RP11-34P13.9 . . .
## AP006222.2 . . .
## RP4-669L17.10 . . .
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
pbmc
## An object of class Seurat
## 13714 features across 2700 samples within 1 assay
## Active assay: RNA (13714 features, 0 variable features)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # common RBC genes for human blood sample
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
top10
## [1] "PPBP" "LYZ" "S100A9" "IGLL5" "GNLY" "FTL" "PF4" "FTH1"
## [9] "GNG11" "S100A8"
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing missing values (geom_point).
plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Removed 1 rows containing missing values (geom_point).
# Preparation stage for PCA analyses, need to use ScaleData() to scale
and center all data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt") # regress out the effects of mitochondria variation
## Regressing out percent.mt
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP
## FCER1G, CFD, LGALS1, LGALS2, SERPINA1, S100A8, CTSS, IFITM3, SPI1, CFP
## PSAP, IFI30, COTL1, SAT1, S100A11, NPC2, GRN, LGALS3, GSTP1, PYCARD
## Negative: MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CTSW, STK17A, CD27
## CD247, CCL5, GIMAP5, GZMA, AQP3, CST7, TRAF3IP3, SELL, GZMK, HOPX
## MAL, MYC, ITM2A, ETS1, LYAR, GIMAP7, KLRG1, NKG7, ZAP70, BEX2
## PC_ 2
## Positive: CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74
## HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB
## BLNK, P2RX5, IGLL5, IRF8, SWAP70, ARHGAP24, FCGR2B, SMIM14, PPP1R14A, C16orf74
## Negative: NKG7, PRF1, CST7, GZMA, GZMB, FGFBP2, CTSW, GNLY, B2M, SPON2
## CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX
## TTC38, CTSC, APMAP, S100A4, IGFBP7, ANXA1, ID2, IL32, XCL1, RHOC
## PC_ 3
## Positive: HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1, HLA-DPB1, CD74, MS4A1, HLA-DRB1, HLA-DRA
## HLA-DRB5, HLA-DQA2, TCL1A, LINC00926, HLA-DMB, HLA-DMA, CD37, HVCN1, FCRLA, IRF8
## PLAC8, BLNK, MALAT1, SMIM14, PLD4, IGLL5, SWAP70, P2RX5, LAT2, FCGR3A
## Negative: PPBP, PF4, SDPR, SPARC, GNG11, NRGN, GP9, RGS18, TUBB1, CLU
## HIST1H2AC, AP001189.4, ITGA2B, CD9, TMEM40, PTCRA, CA2, ACRBP, MMD, TREML1
## NGFRAP1, F13A1, SEPT5, RUFY1, TSC22D1, CMTM5, MPP1, MYL9, RP11-367G6.3, GP1BA
## PC_ 4
## Positive: HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1, CD74, HLA-DPB1, HIST1H2AC, HLA-DPA1, HLA-DRB1
## TCL1A, PF4, HLA-DQA2, SDPR, HLA-DRA, LINC00926, PPBP, GNG11, HLA-DRB5, SPARC
## GP9, PTCRA, CA2, AP001189.4, CD9, NRGN, RGS18, GZMB, CLU, TUBB1
## Negative: VIM, IL7R, S100A6, S100A8, IL32, S100A4, GIMAP7, S100A10, S100A9, MAL
## AQP3, CD14, CD2, LGALS2, FYB, GIMAP4, ANXA1, RBP7, CD27, FCN1
## LYZ, S100A12, MS4A6A, GIMAP5, S100A11, FOLR3, TRABD2A, AIF1, IL8, TMSB4X
## PC_ 5
## Positive: GZMB, FGFBP2, S100A8, NKG7, GNLY, CCL4, PRF1, CST7, SPON2, GZMA
## GZMH, LGALS2, S100A9, CCL3, XCL2, CD14, CLIC3, CTSW, MS4A6A, GSTP1
## S100A12, RBP7, IGFBP7, FOLR3, AKR1C3, TYROBP, CCL5, TTC38, XCL1, APMAP
## Negative: LTB, IL7R, CKB, MS4A7, RP11-290F20.3, AQP3, SIGLEC10, VIM, CYTIP, HMOX1
## LILRB2, PTGES3, HN1, CD2, FAM110A, CD27, ANXA5, CTD-2006K23.1, MAL, VMO1
## CORO1B, TUBA1B, LILRA3, GDI2, TRADD, ATP1A1, IL32, ABRACL, CCDC109B, PPA1
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: CST3, TYROBP, LST1, AIF1, FTL
## Negative: MALAT1, LTB, IL32, IL7R, CD2
## PC_ 2
## Positive: CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1
## Negative: NKG7, PRF1, CST7, GZMA, GZMB
## PC_ 3
## Positive: HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1
## Negative: PPBP, PF4, SDPR, SPARC, GNG11
## PC_ 4
## Positive: HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1
## Negative: VIM, IL7R, S100A6, S100A8, IL32
## PC_ 5
## Positive: GZMB, FGFBP2, S100A8, NKG7, GNLY
## Negative: LTB, IL7R, CKB, MS4A7, RP11-290F20.3
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:5, cells = 500, balanced = TRUE)
# determine number of PCs in the dataset
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
## Warning: Removed 23407 rows containing missing values (geom_point).
ElbowPlot(pbmc)
# cell clustering
pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2638
## Number of edges: 95893
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8735
## Number of communities: 9
## Elapsed time: 0 seconds
head(Idents(pbmc), 5)
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1
## 0 3 2 1
## AAACCGTGTATGCG-1
## 6
## Levels: 0 1 2 3 4 5 6 7 8
pbmc <- RunTSNE(pbmc, dims = 1:10)
DimPlot(pbmc,reduction = "tsne",label = TRUE,pt.size = 1.5)
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:08:31 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:08:31 Read 2638 rows and found 10 numeric columns
## 10:08:31 Using Annoy for neighbor search, n_neighbors = 30
## 10:08:31 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:08:31 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\RtmpKoDCsM\file48bc6e614ea3
## 10:08:31 Searching Annoy index using 1 thread, search_k = 3000
## 10:08:32 Annoy recall = 100%
## 10:08:33 Commencing smooth kNN distance calibration using 1 thread
## 10:08:34 Initializing from normalized Laplacian + noise
## 10:08:34 Commencing optimization for 500 epochs, with 106338 positive edges
## 10:08:42 Optimization finished
DimPlot(pbmc,reduction = "umap",label = TRUE,pt.size = 1.5)
# score cell cycle
pbmc<- CellCycleScoring(object = pbmc, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
## Warning: The following features are not present in the object: DTL, UHRF1,
## MLF1IP, EXO1, CASP8AP2, BRIP1, E2F8, not searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, BUB1,
## HJURP, CDCA3, TTK, CDC25C, DLGAP5, CDCA2, ANLN, GAS2L3, not searching for symbol
## synonyms
head(x = pbmc@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt percent.HB
## AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 0
## AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 0
## AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 0
## AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 0
## AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898 0
## AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551 0
## RNA_snn_res.0.5 seurat_clusters S.Score G2M.Score Phase
## AAACATACAACCAC-1 0 0 0.09853841 -0.044716507 S
## AAACATTGAGCTAC-1 3 3 -0.02364305 -0.046889929 G1
## AAACATTGATCAGC-1 2 2 -0.02177266 0.074841537 G2M
## AAACCGTGCTTCCG-1 1 1 0.03794398 0.006575446 S
## AAACCGTGTATGCG-1 6 6 -0.03309970 0.027910063 G2M
## AAACGCACTGGTAC-1 2 2 -0.04814181 -0.078164839 G1
DimPlot(pbmc,reduction = "tsne",label = TRUE,group.by="Phase",pt.size = 1.5)
# inspect and save UMI, Standardize and normalize data
write.table(pbmc[["RNA"]]@counts,"./umi.xls",sep="\t") # 原始UMI Original UMI
write.table(pbmc[["RNA"]]@data,"./data.xls",sep="\t")#标准化的data,进行了LogNormalize
write.table(pbmc[["RNA"]]@scale.data,"./scale.data.xls",sep="\t")#归一化的scale.data,有正负
saveRDS(pbmc, file = "./pbmc_tutorial.rds")
save(pbmc,file="./res0.5.Robj")
library(DoubletFinder)
sweep.res.list_pbmc <- paramSweep_v3(pbmc, PCs = 1:10, sct = FALSE)
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
sweep.stats_pbmc <- summarizeSweep(sweep.res.list_pbmc, GT = FALSE)
bcmvn_pbmc <- find.pK(sweep.stats_pbmc)
## NULL
opt_pK <- as.numeric(as.vector(bcmvn_pbmc$pK[which.max(bcmvn_pbmc$BCmetric)]))
print(opt_pK)
## [1] 0.01
annotations <- pbmc@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations)
nExp_poi <- round(0.06*nrow(pbmc@meta.data))
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
# find doublets
pbmc <- doubletFinder_v3(pbmc,PCs = 1:10,pN = 0.25,pK = opt_pK,nExp = nExp_poi,reuse.pANN = FALSE,sct = FALSE)
## [1] "Creating 879 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
DimPlot(pbmc, reduction = "tsne", group.by = "DF.classifications_0.25_0.01_158",order = T)
table(pbmc$DF.classifications_0.25_0.01_158)
##
## Doublet Singlet
## 158 2480
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25) #SQ: marker has to be expressed in at least 25% of cluster 1 cells.
head(cluster1.markers, n = 5) # top 5 markers for cluster 1, that distinct from the other clusters
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## S100A9 0.000000e+00 5.570063 0.996 0.215 0.000000e+00
## S100A8 0.000000e+00 5.477394 0.975 0.121 0.000000e+00
## FCN1 0.000000e+00 3.394219 0.952 0.151 0.000000e+00
## LGALS2 0.000000e+00 3.800484 0.908 0.059 0.000000e+00
## CD14 2.856582e-294 2.815626 0.667 0.028 3.917516e-290
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
## # A tibble: 18 × 7
## # Groups: cluster [9]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 1.88e-117 1.08 0.913 0.588 2.57e-113 0 LDHB
## 2 5.01e- 85 1.34 0.437 0.108 6.88e- 81 0 CCR7
## 3 0 5.57 0.996 0.215 0 1 S100A9
## 4 0 5.48 0.975 0.121 0 1 S100A8
## 5 1.93e- 80 1.27 0.981 0.65 2.65e- 76 2 LTB
## 6 2.91e- 58 1.27 0.667 0.251 3.98e- 54 2 CD2
## 7 0 4.31 0.939 0.042 0 3 CD79A
## 8 1.06e-269 3.59 0.623 0.022 1.45e-265 3 TCL1A
## 9 3.60e-221 3.21 0.984 0.226 4.93e-217 4 CCL5
## 10 4.27e-176 3.01 0.573 0.05 5.85e-172 4 GZMK
## 11 3.51e-184 3.31 0.975 0.134 4.82e-180 5 FCGR3A
## 12 2.03e-125 3.09 1 0.315 2.78e-121 5 LST1
## 13 3.17e-267 4.83 0.961 0.068 4.35e-263 6 GZMB
## 14 1.04e-189 5.28 0.961 0.132 1.43e-185 6 GNLY
## 15 1.48e-220 3.87 0.812 0.011 2.03e-216 7 FCER1A
## 16 1.67e- 21 2.87 1 0.513 2.28e- 17 7 HLA-DPB1
## 17 7.73e-200 7.24 1 0.01 1.06e-195 8 PF4
## 18 3.68e-110 8.58 1 0.024 5.05e-106 8 PPBP
##存储marker -save markers
write.table(pbmc.markers,file="./allmarker.txt",sep="\t")
library(SingleR)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
##
## Attaching package: 'stats4'
## The following object is masked from 'package:spam':
##
## mle
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:spam':
##
## cbind, rbind
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:sp':
##
## %over%
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
library(celldex)
##
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
##
## BlueprintEncodeData, DatabaseImmuneCellExpressionData,
## HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
## MouseRNAseqData, NovershternHematopoieticData
refdata <- celldex::HumanPrimaryCellAtlasData()
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
refdata_ms <- celldex::MouseRNAseqData()
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
cellpred <- SingleR(test = pbmc@assays$RNA@data, ref = refdata, labels = refdata$label.fine, clusters = pbmc@active.ident, assay.type.test = "logcounts", assay.type.ref = "logcounts")
cellpred$labels
## [1] "T_cell:CD4+_central_memory" "Monocyte:CD16-"
## [3] "T_cell:CD4+_central_memory" "B_cell:immature"
## [5] "T_cell:CD8+" "Monocyte:CD16+"
## [7] "NK_cell" "Monocyte:CD16-"
## [9] "Monocyte"
# output of calculated cell types
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = F)
write.table(celltype,"./celltype_singleR.xls",row.names = F,sep="\t")
#plot the calculated cell types
pbmc@meta.data$singleR=celltype[match(pbmc@active.ident,celltype$ClusterID),'celltype']
DimPlot(pbmc, reduction = "tsne", group.by = "singleR",label=T, label.size=5)
# collection of plotting codes
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("gray", "red"))
##绘制Marker基因的小提琴图
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
##绘制分cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)#n = 10可以自行调整数字
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A
##绘制气泡图
DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))
##绘制RidgePlot
RidgePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14"), ncol = 2)
# subset a cluster for further sub-clustering
sub_pbmc<-subset(pbmc, idents = '0')
DimPlot(sub_pbmc, reduction = "tsne",label = TRUE, pt.size = 1.5)
##########对取出的cluster重新进行分群 - re-cluster the subset
sub_pbmc <- FindVariableFeatures(sub_pbmc, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(sub_pbmc)
sub_pbmc <- ScaleData(sub_pbmc, features = all.genes)
## Centering and scaling data matrix
sub_pbmc <- RunPCA(sub_pbmc, features = VariableFeatures(object = sub_pbmc))
## PC_ 1
## Positive: PRSS57, RAB13, CYTL1, SDPR, LAPTM4B, EGFL7, C19orf77, DEPTOR, FES, IL1B
## PTMS, GSN, RPL3, SOX4, RPS4X, FCER1A, KIAA0125, RPS2, AKR1C3, HLA-DRB5
## CNRIP1, LYL1, RPS5, CDK6, RPL10A, ACSM3, PABPC1, RP11-262H14.4, IGFBP7, CAT
## Negative: TMSB4X, MALAT1, B2M, IL32, CD3D, CCL5, TIGIT, CBR4, GZMK, GABPB1
## YTHDF1, LTB, CTC-444N24.11, MEPCE, UBR1, UNC93B1, CLASP2, F5, HLA-A, SH2D3A
## ATXN7L1, WDR91, CHP1, RP11-108M9.6, HIPK3, CLIC3, ABR, C5orf24, TRMT6, RNF185
## PC_ 2
## Positive: RPL3, RPL19, RPS5, RPL10A, RPS3A, RPS12, RPL10, RPS2, RPS6, RPS4X
## RPL15, RPS8, RPS27A, RPL7, GLTSCR2, RPL13A, BTG1, EEF1D, NACA, TPT1
## TMEM66, PABPC1, RPL13, RPL18, RPL8, CD7, EVL, CD8B, NOSIP, S100B
## Negative: PRSS57, SDPR, CYTL1, S100A4, RAB13, IL1B, ANKRD28, LAPTM4B, CCL5, KLF6
## C19orf77, C5orf24, PTMS, CNRIP1, FES, EGFL7, HLA-DRB5, RP11-262H14.4, DEPTOR, CBR4
## TMSB4X, VGLL4, DDA1, BTK, AKR1C3, CXXC5, SOX4, RP1-43E13.2, FCER1A, CDK6
## PC_ 3
## Positive: MALAT1, RPL13, RPL13A, RPS3A, RPS6, RPS4X, CD8B, RPS12, RPS18, RPL19
## RPS2, GTF2I, SDPR, CD8A, RPL10, RPS3, SPAG9, SARM1, IDH2, H1F0
## YBX3, PRSS57, RPL3, RP11-291B21.2, CTD-2260A17.2, RP11-620J15.3, RAB13, RC3H2, REG4, RPS27A
## Negative: FOS, JUNB, JUN, H3F3B, IER2, B2M, KLF6, EIF1, TMSB4X, ZFP36
## LTB, CXCR4, TMEM66, S100A4, HLA-A, IL7R, IL32, FTH1, RP11-1399P15.1, HLA-B
## IFITM2, HNRNPDL, YPEL5, RGS1, DHRS7, PLP2, PNRC1, KRT1, FXYD5, RARRES3
## PC_ 4
## Positive: S100A4, RPL8, ALOX5AP, LMNA, RPS18, RPS7, TNFRSF4, HOPX, VIM, TNFRSF18
## RPS2, RPL13A, RPL10, SH3BGRL3, CTSH, RPS3, TRADD, CD99, FLNA, B2M
## KLRB1, RPS8, RPS4X, LTB, IFITM2, GSTK1, ACTB, KLRG1, FXYD5, GNB2L1
## Negative: MALAT1, JUNB, BTG1, AIF1, MT-CO2, RPL34, YPEL5, PCYOX1L, JUN, FOS
## TTC1, RPF2, SELL, DUSP22, GIMAP5, MT-ND2, EIF4A2, TRAF3IP3, FBXO11, DENND2D
## CXCR4, PIP4K2C, STK17A, NDFIP1, GIMAP4, LDLRAP1, MYB, TRMT2A, GPR160, AC016629.8
## PC_ 5
## Positive: SLC40A1, LYRM5, ITM2A, ICOS, AQP3, MAL, ZNF264, BPNT1, GLTSCR2, SELL
## VAPA, IL23A, RAD54L2, ZFYVE28, RMDN2, DTWD2, KDM1B, RPS9, ABI2, CD82
## POLL, FAM213A, RP1-43E13.2, MICU1, ZNF444, RP11-66N11.8, USP38, ATP5SL, ZCCHC11, AEN
## Negative: CD8B, CD8A, RP11-291B21.2, CCL5, CARS, MT-CO2, MT-ND2, MT-CYB, SIL1, AC092580.4
## S100B, MT-CO3, DNAJA2, RPL34, ENO1, S100A4, DLEU1, C1orf228, ATP6V0E2, SH3BGRL3
## PLP2, B2M, ASB1, NFYB, ETAA1, CPNE2, RNF141, IL32, STK24, CST7
sub_pbmc <- FindNeighbors(sub_pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
sub_pbmc <- FindClusters(sub_pbmc, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 709
## Number of edges: 28401
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.4569
## Number of communities: 4
## Elapsed time: 0 seconds
sub_pbmc <- RunTSNE(sub_pbmc, dims = 1:10)
sub_pbmc <- RunUMAP(sub_pbmc, dims = 1:10)
## 10:15:19 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 10:15:19 Read 709 rows and found 10 numeric columns
## 10:15:19 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 10:15:19 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:15:19 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\RtmpKoDCsM\file48bc1e09395d
## 10:15:19 Searching Annoy index using 1 thread, search_k = 3000
## 10:15:20 Annoy recall = 100%
## 10:15:21 Commencing smooth kNN distance calibration using 1 thread
## 10:15:22 Initializing from normalized Laplacian + noise
## 10:15:23 Commencing optimization for 500 epochs, with 23594 positive edges
## 10:15:25 Optimization finished
DimPlot(sub_pbmc, reduction = "tsne",label = TRUE, pt.size = 1.5)
DimPlot(sub_pbmc, reduction = "umap",label = TRUE, pt.size = 1.5)
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 1.5) + NoLegend()
write.table(Idents(pbmc),"./cell-type.xls",sep="\t",quote = F)
saveRDS(pbmc, file = "./pbmc3k_final.rds")
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
##按照细胞类型绘制分cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A
##按照细胞类型绘制气泡图
DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))
##按照细胞类型绘制RidgePlot
RidgePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14"), ncol = 2)
## Picking joint bandwidth of 0.0234
## Picking joint bandwidth of 0.0831
## Picking joint bandwidth of 0.126
## Picking joint bandwidth of 0.0337
# For the next step Cellphonedb analysis
###count文件生成 - generating count file
count_raw <- pbmc[["RNA"]]@counts
count_norm <- apply(count_raw, 2, function(x) (x/sum(x))*10000)
write.table(count_norm, './cellphonedb_count.txt', sep ='\t', quote = F)
###meta文件生成 -generating meta file
write.table(Idents(pbmc),"./cellphonedb_meta.txt",sep="\t",quote = F)
#library(ShinyCell)
#seurat_integrated = readRDS("pbmc3k_final.RDS")
#scConf = createConfig(seurat_integrated)
#makeShinyApp(seurat_integrated, scConf, gene.mapping = TRUE,
# shiny.title = "pbmc3k visualization")
#setwd("./shinyApp") # verify change of dir in console using getwd()
#shiny::runApp()