#load seurat package and dependencies

library(dplyr)
library(Seurat)
library(patchwork)
library(cowplot)
library(ggplot2)
library(rstatix)
library(RColorBrewer)
setwd("D:/Bioinformatics Projects/shalini_bm_noe15")

load NoE15 dataset

# read the NoE15 sample
NoE15.data <- Read10X(data.dir = "NoE15/outs/filtered_feature_bc_matrix/")

dim(NoE15.data)
## [1] 36601  8511
NoE15.data[1:10,1:6]
## 10 x 6 sparse Matrix of class "dgCMatrix"
##             AAACCCACAGGCTACC-1 AAACCCAGTCTCTCTG-1 AAACCCAGTCTTCAAG-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .
## AL627309.5                   1                  .                  .
## AL627309.4                   .                  .                  .
## AP006222.2                   .                  .                  .
## AL732372.1                   .                  .                  .
##             AAACGAAAGACTCTTG-1 AAACGAAAGGTTTGAA-1 AAACGAAAGTCACGAG-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .
## AL627309.5                   .                  .                  .
## AL627309.4                   .                  .                  .
## AP006222.2                   .                  .                  .
## AL732372.1                   .                  .                  .

Create a Seurat object and filter data

NoE15 <- CreateSeuratObject(counts = NoE15.data, project = "ssbm.NoE15.8k", min.cells = 3, min.features = 200)
NoE15
## An object of class Seurat 
## 24570 features across 8478 samples within 1 assay 
## Active assay: RNA (24570 features, 0 variable features)
NoE15[["percent.mt"]] <- PercentageFeatureSet(NoE15, pattern = "^MT-")

HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # common RBC genes for human blood sample
HB_m <- match(HB.genes_total,rownames(NoE15@assays$RNA))
HB.genes <- rownames(NoE15@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
NoE15[["percent.HB"]]<-PercentageFeatureSet(NoE15,features=HB.genes)

VlnPlot(NoE15, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(NoE15, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(NoE15, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

Filter and subset cells

NoE15 <- subset(NoE15, subset = nFeature_RNA > 500 & nFeature_RNA < 7500 & percent.mt < 10)
dim(NoE15)
## [1] 24570  7863
# 24570 7863

normalize expression level

NoE15 <- NormalizeData(NoE15, normalization.method = "LogNormalize", scale.factor = 10000)

identify 2000 most variable genes/features, to be used for downstream analysis

NoE15 <- FindVariableFeatures(NoE15, selection.method = "vst", nfeatures = 2000)

top10 <- head(VariableFeatures(NoE15), 10)
top10
##  [1] "HBB"      "PPBP"     "HBG2"     "HBG1"     "PF4"      "CLC"     
##  [7] "S100A9"   "LTB"      "HIST3H2A" "PRTN3"
plot1 <- VariableFeaturePlot(NoE15)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 

plot2

# Preparation stage for PCA analyses, need to use ScaleData() to scale and center all data

all.genes <- rownames(NoE15)
NoE15 <- ScaleData(NoE15, features = all.genes)
## Centering and scaling data matrix
NoE15 <- ScaleData(NoE15, vars.to.regress = "percent.mt") # regress out the effects of mitochondria variation
## Regressing out percent.mt
## Centering and scaling data matrix

Linear dimention reduction (PCA), use most variable features by default

NoE15 <- RunPCA(NoE15, features = VariableFeatures(object = NoE15))
## PC_ 1 
## Positive:  ATP8B4, P2RY8, KCNQ5, FNDC3B, AUTS2, PLCB1, CD44, LSP1, ITGA4, BCL2 
##     LRMDA, ANXA1, BCL11A, PRAM1, ZEB1, PLAC8, RHEX, FLT3, CSF3R, S100Z 
##     IGLL1, MGST1, SBF2, PRKCE, MED13L, CALN1, SPINK2, MPO, AFF3, RUNX2 
## Negative:  GP1BB, RAB27B, ITGB3, GP9, CAVIN2, CMTM5, DNM3, ARHGAP6, DAAM1, PROS1 
##     PF4, SELP, TUBB1, AC004053.1, RGS6, MPIG6B, MMRN1, MYLK, KALRN, LAT 
##     F2R, PRKAR2B, THBS1, ITGA2B, PPBP, GP6, C2orf88, NRGN, GP1BA, CD36 
## PC_ 2 
## Positive:  MRC1, HLA-DQB1, CST3, SLC8A1, HLA-DQA1, SAMHD1, JAML, PKIB, CD1C, HLA-DQA2 
##     IFI30, MS4A6A, HLA-DMB, TYROBP, CLEC10A, MARCH1, FCER1G, HLA-DPB1, CD86, HLA-DPA1 
##     LY86, FGL2, CTSH, CYBB, FCGRT, LYZ, TMSB4X, CD1E, HLA-DRB1, CD74 
## Negative:  CDK6, TFRC, KCNQ5, GYPC, MIR924HG, PVT1, ZNF521, XACT, TUBB, RIF1 
##     CENPF, HMGB2, KIAA1211, ZNF385D, ST8SIA6, TPX2, PIP5K1B, ZNF804A, PDZD8, MSI2 
##     TOP2A, NUSAP1, HIST1H4C, RNF220, MYH10, TAFA2, SLC40A1, TUBA1B, CYTL1, DIAPH3 
## PC_ 3 
## Positive:  RNF220, CALN1, SPINK2, MEF2C, ERG, PRAM1, AUTS2, BAALC, FLT3, CD34 
##     GUCY1A1, DANT2, PLAC8, PROM1, S100Z, BCL11A, CSF3R, CASC15, SBF2, PTPRM 
##     ANGPT1, FHIT, CHRM3, HMGA2, EGFL7, COL24A1, TNFSF13B, C1QTNF4, SPNS3, PRDX1 
## Negative:  AKAP12, CSF2RB, HDC, CLC, ENPP3, PRG2, SRGN, CR1, GATA2, HPGD 
##     CPA3, SYTL3, SLC24A3, MS4A2, EXT1, ALOX5, IKZF2, PRNP, JAZF1, TMEM154 
##     PAG1, MS4A3, CKAP4, ALOX5AP, SLC45A3, PALM2-AKAP2, GRAP2, GCSAML, LINC00535, PTGER3 
## PC_ 4 
## Positive:  ANK1, KLF1, BLVRB, SPTA1, HBD, THRB, MYH10, DNAJC6, APOC1, HBB 
##     TFR2, CNRIP1, PLEKHG1, S100A4, XACT, SLC40A1, MICAL2, KEL, FREM1, HES6 
##     S100A6, MINPP1, SLC25A21, DLC1, HLA-DRA, PKIG, HLA-DRB1, ADD2, PLXNC1, LDLRAD3 
## Negative:  RFLNB, ATP8B4, PLCB1, FNDC3B, P2RY8, RHEX, MCTP1, SPINK2, LRMDA, DACH1 
##     CALN1, AKAP12, CLC, ZEB1, RFX8, SLC24A3, ARHGEF3, CSF3R, PAG1, RNF220 
##     RAP1GAP2, FLI1, BCL2, IKZF2, GAB2, HDC, PECAM1, BAALC, SRGN, ANXA1 
## PC_ 5 
## Positive:  MKI67, TOP2A, ASPM, CENPE, NUSAP1, GTSE1, TPX2, KNL1, HMMR, KIF14 
##     DLGAP5, PRC1, CDCA2, CENPF, HIST1H4C, NDC80, UBE2C, KPNA2, CDK1, KIF4A 
##     KIF11, CKAP2L, CDCA8, NCAPG, BUB1B, HIST1H1B, HMGB2, PRR11, ARHGAP11B, CDKN3 
## Negative:  PCOLCE, SLC25A4, COL1A1, TJP1, BAIAP2L1, PRRX1, AC116337.3, COL1A2, FN1, AC099560.1 
##     EFEMP2, ZNF90, PMEPA1, HIST3H2A, AL136454.1, CRYAB, PCDH19, AL157388.1, RASGEF1B, HOXC10 
##     TPM2, DNAH12, ENO3, EFCAB10, AP001605.1, AC010528.1, NFIX, MTRNR2L6, CDH11, ZFHX4
print(NoE15[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  ATP8B4, P2RY8, KCNQ5, FNDC3B, AUTS2 
## Negative:  GP1BB, RAB27B, ITGB3, GP9, CAVIN2 
## PC_ 2 
## Positive:  MRC1, HLA-DQB1, CST3, SLC8A1, HLA-DQA1 
## Negative:  CDK6, TFRC, KCNQ5, GYPC, MIR924HG 
## PC_ 3 
## Positive:  RNF220, CALN1, SPINK2, MEF2C, ERG 
## Negative:  AKAP12, CSF2RB, HDC, CLC, ENPP3 
## PC_ 4 
## Positive:  ANK1, KLF1, BLVRB, SPTA1, HBD 
## Negative:  RFLNB, ATP8B4, PLCB1, FNDC3B, P2RY8 
## PC_ 5 
## Positive:  MKI67, TOP2A, ASPM, CENPE, NUSAP1 
## Negative:  PCOLCE, SLC25A4, COL1A1, TJP1, BAIAP2L1
VizDimLoadings(NoE15, dims = 1:2, reduction = "pca")

make PCA dimension plots

DimPlot(NoE15, reduction = "pca")

DimHeatmap(NoE15, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(NoE15, dims = 1:5, cells = 500, balanced = TRUE)

# determine number of PCs in the dataset

NoE15 <- JackStraw(NoE15, num.replicate = 100)
NoE15 <- ScoreJackStraw(NoE15, dims = 1:20)
JackStrawPlot(NoE15, dims = 1:15)
## Warning: Removed 21000 rows containing missing values (`geom_point()`).

ElbowPlot(NoE15)

# cell clustering

NoE15 <- FindNeighbors(NoE15, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
NoE15 <- FindClusters(NoE15, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 7863
## Number of edges: 240537
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8962
## Number of communities: 13
## Elapsed time: 0 seconds
head(Idents(NoE15), 5)
## AAACCCACAGGCTACC-1 AAACCCAGTCTCTCTG-1 AAACCCAGTCTTCAAG-1 AAACGAAAGACTCTTG-1 
##                  1                  8                  3                  1 
## AAACGAAAGGTTTGAA-1 
##                  1 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12
NoE15 <- RunTSNE(NoE15, dims = 1:10)
DimPlot(NoE15,reduction = "tsne",label = TRUE,pt.size = 1.5)

NoE15 <- RunUMAP(NoE15, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 12:30:39 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:30:39 Read 7863 rows and found 10 numeric columns
## 12:30:39 Using Annoy for neighbor search, n_neighbors = 30
## 12:30:39 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:30:40 Writing NN index file to temp file C:\Users\user\AppData\Local\Temp\RtmpWS8G6S\filece84daa7cf3
## 12:30:40 Searching Annoy index using 1 thread, search_k = 3000
## 12:30:43 Annoy recall = 100%
## 12:30:43 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 12:30:44 Initializing from normalized Laplacian + noise (using irlba)
## 12:30:44 Commencing optimization for 500 epochs, with 306596 positive edges
## 12:31:04 Optimization finished
DimPlot(NoE15,reduction = "umap",label = TRUE,pt.size = 1.5)

# score cell cycle

NoE15<- CellCycleScoring(object = NoE15, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms
head(x = NoE15@meta.data)
##                       orig.ident nCount_RNA nFeature_RNA percent.mt percent.HB
## AAACCCACAGGCTACC-1 ssbm.NoE15.8k      14433         5033   4.468925 0.00000000
## AAACCCAGTCTCTCTG-1 ssbm.NoE15.8k       6410         2799   5.819033 0.18720749
## AAACCCAGTCTTCAAG-1 ssbm.NoE15.8k       2561         1754   1.366654 0.07809449
## AAACGAAAGACTCTTG-1 ssbm.NoE15.8k       9698         3734   5.939369 0.02062281
## AAACGAAAGGTTTGAA-1 ssbm.NoE15.8k      16671         5222   5.848479 0.01799532
## AAACGAAAGTCACGAG-1 ssbm.NoE15.8k      12660         3994   7.266983 0.00000000
##                    RNA_snn_res.0.5 seurat_clusters     S.Score  G2M.Score Phase
## AAACCCACAGGCTACC-1               1               1  0.05068216  0.3939077   G2M
## AAACCCAGTCTCTCTG-1               8               8 -0.08082797 -0.2600288    G1
## AAACCCAGTCTTCAAG-1               3               3  0.34361789 -0.0816220     S
## AAACGAAAGACTCTTG-1               1               1 -0.20519010  0.4907255   G2M
## AAACGAAAGGTTTGAA-1               1               1  0.18941026  0.2847622   G2M
## AAACGAAAGTCACGAG-1               2               2  0.06969817 -0.2373076     S
DimPlot(NoE15,reduction = "tsne",label = TRUE,group.by="Phase",pt.size = 1.5)

# inspect and save UMI, Standardize and normalize data

#write.table(NoE15[["RNA"]]@counts,"./umi.xls",sep="\t") # 原始UMI Original UMI ~250MB
#write.table(NoE15[["RNA"]]@data,"./data.xls",sep="\t") #标准化的data,进行了LogNormalize, ~650MB size
#write.table(NoE15[["RNA"]]@scale.data,"./scale.data.xls",sep="\t")#归一化的scale.data,有正负 ~180MB size

save results

saveRDS(NoE15, file = "./NoE15_sample.rds")
save(NoE15,file="./NoE15_res0.5.Robj")

doublet finder

library(DoubletFinder)
sweep.res.list_NoE15 <- paramSweep_v3(NoE15, PCs = 1:10, sct = FALSE)
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
sweep.stats_NoE15 <- summarizeSweep(sweep.res.list_NoE15, GT = FALSE)
bcmvn_NoE15 <- find.pK(sweep.stats_NoE15)

## NULL
opt_pK <- as.numeric(as.vector(bcmvn_NoE15$pK[which.max(bcmvn_NoE15$BCmetric)]))
print(opt_pK)
## [1] 0.005
annotations <- NoE15@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations)           
nExp_poi <- round(0.06*nrow(NoE15@meta.data))  
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

# find doublets
NoE15 <- doubletFinder_v3(NoE15,PCs = 1:10,pN = 0.25,pK = opt_pK,nExp = nExp_poi,reuse.pANN = FALSE,sct = FALSE)
## [1] "Creating 2621 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
DimPlot(NoE15, reduction = "tsne", group.by = "DF.classifications_0.25_0.005_472",order = T)

table(NoE15$DF.classifications_0.25_0.005_472)
## 
## Doublet Singlet 
##     472    7391

find markers for cluster 1

cluster1.markers <- FindMarkers(NoE15, ident.1 = 1, min.pct = 0.25) #SQ: marker has to be expressed in at least 25% of cluster 1 cells.
## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session
head(cluster1.markers, n = 15) # top 5 markers for cluster 1, that distinct from the other clusters
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## TUBA1B  0.000000e+00  1.0290026 0.998 0.872  0.000000e+00
## SPINK2 4.108513e-305  1.0974860 0.906 0.313 1.009462e-300
## TOP2A  6.673660e-272  1.2057987 0.962 0.455 1.639718e-267
## TPX2   8.858176e-262  0.9925417 0.946 0.424 2.176454e-257
## BAALC  2.018909e-259  0.7193938 0.775 0.227 4.960459e-255
## ASPM   9.154638e-258  1.0162701 0.892 0.359 2.249295e-253
## SMIM24 9.922466e-256  0.7474917 0.929 0.352 2.437950e-251
## TUBB   1.020341e-249  0.9411356 0.998 0.776 2.506978e-245
## MKI67  4.311422e-243  0.9631272 0.946 0.441 1.059316e-238
## CALN1  3.838443e-242  1.0874761 0.919 0.364 9.431054e-238
## RNF220 8.532574e-226  1.0337896 0.994 0.641 2.096453e-221
## DLGAP5 7.705318e-222  0.7234896 0.700 0.223 1.893197e-217
## GTSE1  1.281418e-220  0.6145660 0.764 0.258 3.148445e-216
## NUSAP1 3.883978e-216  0.8293817 0.930 0.461 9.542935e-212
## CENPE  6.281745e-207  0.7767565 0.758 0.277 1.543425e-202

find markers for each cluster

NoE15.markers <- FindAllMarkers(NoE15, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
NoE15.markers.top100 <- NoE15.markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)

##存储marker -save markers
#write.table(NoE15.markers,file="./NoE15_allmarker.txt",sep="\t")
write.csv(NoE15.markers,file="./NoE15_allmarker.csv")
write.csv(NoE15.markers.top100,file="./NoE15_allmarker_top100.csv")

SingleR for cell type identification

library(SingleR) 
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## 
## Attaching package: 'stats4'
## The following object is masked from 'package:spam':
## 
##     mle
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:spam':
## 
##     cbind, rbind
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:rstatix':
## 
##     desc
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(celldex) 
## 
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData
refdata <- celldex::HumanPrimaryCellAtlasData()
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
# refdata_ms <- celldex::MouseRNAseqData()

cellpred <- SingleR(test = NoE15@assays$RNA@data, ref = refdata, labels = refdata$label.fine, clusters = NoE15@active.ident, assay.type.test = "logcounts", assay.type.ref = "logcounts")

cellpred$labels
##  [1] "CMP"                     "CMP"                    
##  [3] "CMP"                     "CMP"                    
##  [5] "CMP"                     "MEP"                    
##  [7] "CMP"                     "Monocyte:leukotriene_D4"
##  [9] "CMP"                     "CMP"                    
## [11] "CMP"                     "GMP"                    
## [13] "CMP"
# output of calculated cell types
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = F)
write.table(celltype,"./NoE15_celltype_singleR.xls",row.names = F,sep="\t")


#plot the calculated cell types

NoE15@meta.data$singleR=celltype[match(NoE15@active.ident,celltype$ClusterID),'celltype']
DimPlot(NoE15, reduction = "tsne", group.by = "singleR",label=T, label.size=5)

# collection of plotting codes

FeaturePlot(NoE15, features = c("CD44", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("gray", "red"))
## Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features),
## : The following requested variables were not found: FCGR3A

##绘制Marker基因的小提琴图
VlnPlot(NoE15, features = c("CD44", "CD4"))

VlnPlot(NoE15, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

##绘制分cluster的热图
top10 <- NoE15.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)#n = 10可以自行调整数字
DoHeatmap(NoE15, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(NoE15, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay:
## EIF4G2, B4GALT5, CAT

##绘制气泡图
DotPlot(NoE15, features = c("CD44", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))
## Warning in FetchData.Seurat(object = object, vars = features, cells = cells):
## The following requested variables were not found: FCGR3A

##绘制RidgePlot
RidgePlot(NoE15, features = c("CD44", "GNLY", "CD3E", "CD14"), ncol = 2)

saveRDS(NoE15, file = "./NoE15_sample.rds")

Check the other seurat_tutorial_v2.Rmd file for sub-group and assign cell IDs # visualization using shinnyCell # Do not knit this block, will not finish.

library(ShinyCell)

NoE15obj = readRDS("NoE15_sample.rds")
scConf = createConfig(NoE15obj)
makeShinyApp(NoE15obj, scConf, gene.mapping = TRUE,
             shiny.title = "ssbm.NoE15.8k_visualization") 

setwd("D:/Bioinformatics Projects/shalini_bm_noe15/shinyApp") # verify change of dir in console using getwd()

shiny::runApp()