#load seurat package and dependencies

library(dplyr)
library(Seurat)
## Warning: package 'Seurat' was built under R version 4.2.2
library(patchwork)
library(cowplot)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.1
library(rstatix)
library(RColorBrewer)
setwd("D:/Bioinformatics Projects/shalini_bm")

load E15 dataset

# read the E15 sample
E15.data <- Read10X(data.dir = "E15/outs/filtered_feature_bc_matrix/")

dim(E15.data)
## [1] 36601  6757
E15.data[1:10,1:6]
## 10 x 6 sparse Matrix of class "dgCMatrix"
##             AAACCCACATCGCTCT-1 AAACCCAGTAGGGTAC-1 AAACCCAGTGCATCTA-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .
## AL627309.5                   .                  .                  .
## AL627309.4                   .                  .                  .
## AP006222.2                   .                  .                  .
## AL732372.1                   .                  .                  .
##             AAACCCATCATAGAGA-1 AAACCCATCGGAAACG-1 AAACCCATCTAACACG-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .
## AL627309.5                   .                  .                  .
## AL627309.4                   .                  .                  .
## AP006222.2                   .                  .                  .
## AL732372.1                   .                  .                  .

Create a Seurat object and filter data

E15 <- CreateSeuratObject(counts = E15.data, project = "ssbm.e15.8k", min.cells = 3, min.features = 200)
E15
## An object of class Seurat 
## 24124 features across 6710 samples within 1 assay 
## Active assay: RNA (24124 features, 0 variable features)
E15[["percent.mt"]] <- PercentageFeatureSet(E15, pattern = "^MT-")

HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # common RBC genes for human blood sample
HB_m <- match(HB.genes_total,rownames(E15@assays$RNA))
HB.genes <- rownames(E15@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
E15[["percent.HB"]]<-PercentageFeatureSet(E15,features=HB.genes)

VlnPlot(E15, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(E15, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(E15, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

Filter and subset cells

E15 <- subset(E15, subset = nFeature_RNA > 500 & nFeature_RNA < 7500 & percent.mt < 10)
dim(E15)
## [1] 24124  6135
# 24124  6135

normalize expression level

E15 <- NormalizeData(E15, normalization.method = "LogNormalize", scale.factor = 10000)

identify 2000 most variable genes/features, to be used for downstream analysis

E15 <- FindVariableFeatures(E15, selection.method = "vst", nfeatures = 2000)

top10 <- head(VariableFeatures(E15), 10)
top10
##  [1] "PPBP"  "CLC"   "PF4"   "LYZ"   "CST3"  "F13A1" "HPGD"  "HBG2"  "THBS1"
## [10] "NRGN"
plot1 <- VariableFeaturePlot(E15)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2 rows containing missing values (geom_point).

plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Removed 2 rows containing missing values (geom_point).

# Preparation stage for PCA analyses, need to use ScaleData() to scale and center all data

all.genes <- rownames(E15)
E15 <- ScaleData(E15, features = all.genes)
## Centering and scaling data matrix
E15 <- ScaleData(E15, vars.to.regress = "percent.mt") # regress out the effects of mitochondria variation
## Regressing out percent.mt
## Centering and scaling data matrix

Linear dimention reduction (PCA), use most variable features by default

E15 <- RunPCA(E15, features = VariableFeatures(object = E15))
## PC_ 1 
## Positive:  SLC25A21, NFIA, HBM, DNAJA4, KCNH2, APOC1, LINC01133, CD36, NETO2, DDIT4 
##     PKLR, SYNGR1, A4GALT, HIST1H1B, HIST1H4C, C16orf95, HIST1H1D, REXO2, S100A6, HIST2H2AC 
##     ZDHHC14, FAM178B, TOP2A, GLRX, DOCK9, APOE, CENPE, EPB41L4A, HMMR, AC079804.3 
## Negative:  ACTG1, CELF2, PKM, ARHGDIB, MSN, MACF1, ELMO1, ETV6, TPM4, ARPC1B 
##     GMFG, FLI1, MAML3, PRKCB, UTRN, TBXAS1, FXYD5, SH3BGRL3, TMSB4X, FNBP1 
##     WIPF1, ENO1, VAV3, LGALS1, SERPINB1, SEPTIN6, AIF1, CYBA, MEF2C, TMSB10 
## PC_ 2 
## Positive:  ITGB3, GP9, SELP, CMTM5, GP1BB, PF4, PPBP, RAB27B, ABCC3, CAVIN2 
##     KALRN, THBS1, TUBB1, LGALSL, ARHGAP6, MYLK, GP6, MPIG6B, AC004053.1, GP1BA 
##     HPSE, NRGN, ESAM, F2R, MMRN1, ITGB5, CLEC1B, TMEM40, DNM3, CD226 
## Negative:  ATP8B4, P2RY8, DIAPH2, LCP1, RCSD1, AUTS2, LSP1, CHST11, KCNQ5, BST2 
##     HSH2D, ZFP36L2, APBB1IP, FNDC3B, S100Z, PLCB1, ANKRD44, PRAM1, FLT3, CERS6 
##     CD44, DOCK2, GLIPR1, FCHSD2, SBF2, XYLT1, KCNAB2, AFF3, BCL2, PLAC8 
## PC_ 3 
## Positive:  CDK6, RNF220, CALN1, KCNQ5, ERG, NKAIN2, MSI2, CMSS1, EBPL, MYB 
##     MCTP2, ZNF521, SMIM24, CHRM3, PRSS57, CYTL1, CD34, MSRB3, NME1, FIRRE 
##     SPINK2, EGFL7, ZNF804A, NPW, FAM30A, BAALC, CASC15, NPR3, CCDC171, PVT1 
## Negative:  MRC1, HLA-DQB1, SLC8A1, CD1C, HLA-DMB, MARCH1, LY86, HLA-DQA1, JAML, PKIB 
##     FGL2, MS4A6A, CLEC10A, SAMHD1, CD86, HLA-DQA2, IFI30, CST3, TBC1D9, CD1E 
##     CTSH, IGSF6, ARL4C, IL13RA1, MS4A4E, RTN1, CYBB, TYROBP, GPRIN3, KCTD12 
## PC_ 4 
## Positive:  AKAP12, HDC, CSF2RB, CLC, ENPP3, SYTL3, MS4A2, CPA3, MS4A3, LINC00535 
##     IKZF2, CKAP4, IL18R1, SLC45A3, TMEM154, PRNP, HPGD, ALOX5, GATA2, PAG1 
##     THSD7A, SLC7A8, KLF2, ATP10D, ALOX5AP, IL3RA, LINC02458, NCF1, AC093865.1, GCSAML 
## Negative:  AFF3, FLT3, MEF2C, HLA-DRA, GLIPR1, HLA-DRB1, HDAC9, CD74, GSTP1, HLA-DPA1 
##     HLA-DPB1, IFI16, AIF1, S100Z, DISC1, CALN1, PEBP1, CIITA, JAML, CCSER1 
##     ST18, HLA-DQA2, TFEC, SPINK2, PRAM1, HLA-DMB, TNFSF13B, AHR, WDFY4, DANT2 
## PC_ 5 
## Positive:  MPO, CSF3R, PRAM1, PLCB1, RAB27A, P2RY8, TNFSF13B, DGKG, PLAC8, RFX8 
##     SPNS3, RFLNB, PDE4D, TMEM40, SELL, BAHCC1, C1QTNF4, SPARC, FAM107B, C16orf74 
##     CTSG, PXK, SORL1, RAB7B, ATP8B4, TSPOAP1, SPINK2, S100Z, SMIM24, CLEC1B 
## Negative:  THRB, XACT, FREM1, RYR3, PVT1, RNF130, PKIG, LDLRAD3, IL7, ADAMTS14 
##     PLEKHG1, FCER1A, S100A4, RGS20, CPPED1, SLC40A1, ST8SIA6, LDLRAD4, DLC1, CPQ 
##     GDF11, MTSS1, TPSAB1, SLC12A6, SOS1, PLXNC1, ABCC4, TMEM164, LDHA, ELOVL6
print(E15[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  SLC25A21, NFIA, HBM, DNAJA4, KCNH2 
## Negative:  ACTG1, CELF2, PKM, ARHGDIB, MSN 
## PC_ 2 
## Positive:  ITGB3, GP9, SELP, CMTM5, GP1BB 
## Negative:  ATP8B4, P2RY8, DIAPH2, LCP1, RCSD1 
## PC_ 3 
## Positive:  CDK6, RNF220, CALN1, KCNQ5, ERG 
## Negative:  MRC1, HLA-DQB1, SLC8A1, CD1C, HLA-DMB 
## PC_ 4 
## Positive:  AKAP12, HDC, CSF2RB, CLC, ENPP3 
## Negative:  AFF3, FLT3, MEF2C, HLA-DRA, GLIPR1 
## PC_ 5 
## Positive:  MPO, CSF3R, PRAM1, PLCB1, RAB27A 
## Negative:  THRB, XACT, FREM1, RYR3, PVT1
VizDimLoadings(E15, dims = 1:2, reduction = "pca")

make PCA dimension plots

DimPlot(E15, reduction = "pca")

DimHeatmap(E15, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(E15, dims = 1:5, cells = 500, balanced = TRUE)

# determine number of PCs in the dataset

E15 <- JackStraw(E15, num.replicate = 100)
E15 <- ScoreJackStraw(E15, dims = 1:20)
JackStrawPlot(E15, dims = 1:15)
## Warning: Removed 21000 rows containing missing values (geom_point).

ElbowPlot(E15)

# cell clustering

E15 <- FindNeighbors(E15, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
E15 <- FindClusters(E15, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6135
## Number of edges: 191453
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8980
## Number of communities: 10
## Elapsed time: 0 seconds
head(Idents(E15), 5)
## AAACCCACATCGCTCT-1 AAACCCAGTAGGGTAC-1 AAACCCAGTGCATCTA-1 AAACCCATCATAGAGA-1 
##                  3                  4                  0                  1 
## AAACCCATCGGAAACG-1 
##                  7 
## Levels: 0 1 2 3 4 5 6 7 8 9
E15 <- RunTSNE(E15, dims = 1:10)
DimPlot(E15,reduction = "tsne",label = TRUE,pt.size = 1.5)

E15 <- RunUMAP(E15, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:16:39 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:16:39 Read 6135 rows and found 10 numeric columns
## 17:16:39 Using Annoy for neighbor search, n_neighbors = 30
## 17:16:39 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:16:40 Writing NN index file to temp file C:\Users\user\AppData\Local\Temp\Rtmp4iTOPt\file780c3f815bc0
## 17:16:40 Searching Annoy index using 1 thread, search_k = 3000
## 17:16:42 Annoy recall = 100%
## 17:16:42 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:16:43 Initializing from normalized Laplacian + noise (using irlba)
## 17:16:44 Commencing optimization for 500 epochs, with 240570 positive edges
## 17:17:01 Optimization finished
DimPlot(E15,reduction = "umap",label = TRUE,pt.size = 1.5)

# score cell cycle

E15<- CellCycleScoring(object = E15, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms
head(x = E15@meta.data)
##                     orig.ident nCount_RNA nFeature_RNA percent.mt percent.HB
## AAACCCACATCGCTCT-1 ssbm.e15.8k      32469         6770   4.992454  0.1478333
## AAACCCAGTAGGGTAC-1 ssbm.e15.8k       6036         1507   1.590457 42.3956262
## AAACCCAGTGCATCTA-1 ssbm.e15.8k       6346         1735   2.600063 44.6738103
## AAACCCATCATAGAGA-1 ssbm.e15.8k      20373         4500   6.577333 17.3317626
## AAACCCATCGGAAACG-1 ssbm.e15.8k       4683         2395   6.939996  0.3203075
## AAACCCATCTAACACG-1 ssbm.e15.8k       9431         2869   3.456685 26.4553070
##                    RNA_snn_res.0.5 seurat_clusters     S.Score  G2M.Score Phase
## AAACCCACATCGCTCT-1               3               3  0.04567932  0.2927050   G2M
## AAACCCAGTAGGGTAC-1               4               4 -0.31802209 -0.3079963    G1
## AAACCCAGTGCATCTA-1               0               0 -0.04268012 -0.1170832    G1
## AAACCCATCATAGAGA-1               1               1 -0.03453748  0.4065812   G2M
## AAACCCATCGGAAACG-1               7               7 -0.30080349 -0.4493450    G1
## AAACCCATCTAACACG-1               0               0  0.17961522  0.3602452   G2M
DimPlot(E15,reduction = "tsne",label = TRUE,group.by="Phase",pt.size = 1.5)

# inspect and save UMI, Standardize and normalize data

write.table(E15[["RNA"]]@counts,"./umi.xls",sep="\t") # 原始UMI Original UMI ~250MB
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.1 GiB
write.table(E15[["RNA"]]@data,"./data.xls",sep="\t") #标准化的data,进行了LogNormalize, ~650MB size
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 1.1 GiB
write.table(E15[["RNA"]]@scale.data,"./scale.data.xls",sep="\t")#归一化的scale.data,有正负 ~180MB size

save results

saveRDS(E15, file = "./E15_sample.rds")
save(E15,file="./E15_res0.5.Robj")

doublet finder

library(DoubletFinder)
sweep.res.list_E15 <- paramSweep_v3(E15, PCs = 1:10, sct = FALSE)
## Warning: package 'fields' was built under R version 4.2.2
## Warning: package 'spam' was built under R version 4.2.2
## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
sweep.stats_E15 <- summarizeSweep(sweep.res.list_E15, GT = FALSE)
bcmvn_E15 <- find.pK(sweep.stats_E15)

## NULL
opt_pK <- as.numeric(as.vector(bcmvn_E15$pK[which.max(bcmvn_E15$BCmetric)]))
print(opt_pK)
## [1] 0.005
annotations <- E15@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations)           
nExp_poi <- round(0.06*nrow(E15@meta.data))  
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

# find doublets
E15 <- doubletFinder_v3(E15,PCs = 1:10,pN = 0.25,pK = opt_pK,nExp = nExp_poi,reuse.pANN = FALSE,sct = FALSE)
## [1] "Creating 2045 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
DimPlot(E15, reduction = "tsne", group.by = "DF.classifications_0.25_0.005_368",order = T)

table(E15$DF.classifications_0.25_0.005_368)
## 
## Doublet Singlet 
##     368    5767

find markers for cluster 1

cluster1.markers <- FindMarkers(E15, ident.1 = 1, min.pct = 0.25) #SQ: marker has to be expressed in at least 25% of cluster 1 cells.
head(cluster1.markers, n = 15) # top 5 markers for cluster 1, that distinct from the other clusters
##                   p_val avg_log2FC pct.1 pct.2     p_val_adj
## LINC01133 1.588785e-302  1.0959463 0.870 0.334 3.832786e-298
## TMEM14B   4.165758e-264  1.0454912 0.981 0.832 1.004947e-259
## MT-CO1    5.486434e-250  0.5489504 1.000 0.993 1.323547e-245
## TMEM14C   1.655414e-248  0.9668561 0.960 0.730 3.993521e-244
## HEBP1     1.220413e-242  0.9628672 0.940 0.641 2.944125e-238
## UBAC1     3.728994e-242  0.9030137 0.978 0.817 8.995825e-238
## PRDX2     3.188152e-235  1.0067714 0.987 0.919 7.691099e-231
## NETO2     1.217452e-193  0.9538187 0.892 0.485 2.936982e-189
## LGALS3    8.871083e-191  0.8668698 0.972 0.661 2.140060e-186
## TK1       4.958848e-185  0.7560403 0.918 0.595 1.196272e-180
## METAP2    1.965007e-181  0.7800846 0.965 0.818 4.740383e-177
## MGST3     6.515865e-181  0.8311863 0.969 0.849 1.571887e-176
## PCLAF     5.228378e-179  0.7481664 0.965 0.763 1.261294e-174
## TUBG1     1.442680e-176  0.7415574 0.894 0.562 3.480321e-172
## ATP5IF1   9.865081e-172  0.7355588 0.977 0.842 2.379852e-167

find markers for each cluster

E15.markers <- FindAllMarkers(E15, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
E15.markers.top100 <- E15.markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)

##存储marker -save markers
write.table(E15.markers,file="./E15_allmarker.txt",sep="\t")
write.csv(E15.markers,file="./E15_allmarker.csv")
write.csv(E15.markers.top100,file="./E15_allmarker_top100.csv")

SingleR for cell type identification

library(SingleR) 
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## 
## Attaching package: 'stats4'
## The following object is masked from 'package:spam':
## 
##     mle
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:spam':
## 
##     cbind, rbind
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:rstatix':
## 
##     desc
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(celldex) 
## 
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData
refdata <- celldex::HumanPrimaryCellAtlasData()
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
# refdata_ms <- celldex::MouseRNAseqData()

cellpred <- SingleR(test = E15@assays$RNA@data, ref = refdata, labels = refdata$label.fine, clusters = E15@active.ident, assay.type.test = "logcounts", assay.type.ref = "logcounts")

cellpred$labels
##  [1] "Erythroblast" "Erythroblast" "Erythroblast" "CMP"          "Erythroblast"
##  [6] "MEP"          "MEP"          "CMP"          "GMP"          "Erythroblast"
# output of calculated cell types
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = F)
write.table(celltype,"./E15_celltype_singleR.xls",row.names = F,sep="\t")


#plot the calculated cell types

E15@meta.data$singleR=celltype[match(E15@active.ident,celltype$ClusterID),'celltype']
DimPlot(E15, reduction = "tsne", group.by = "singleR",label=T, label.size=5)

# collection of plotting codes

FeaturePlot(E15, features = c("CD44", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("gray", "red"))
## Warning in FetchData.Seurat(object = object, vars = c(dims, "ident",
## features), : The following requested variables were not found: CD8A

##绘制Marker基因的小提琴图
VlnPlot(E15, features = c("CD44", "CD4"))

VlnPlot(E15, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

##绘制分cluster的热图
top10 <- E15.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)#n = 10可以自行调整数字
DoHeatmap(E15, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(E15, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: EIF4G2,
## RPL22L1, FAU, XPO7, CPEB4, RIOK3, SEC62, PRDX2, UBAC1, HEBP1, TMEM14C, TMEM14B,
## CR1L, HMGB2, HBA2, SOX6, NUSAP1, HBA1, H1F0, HEMGN

##绘制气泡图
DotPlot(E15, features = c("CD44", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))
## Warning in FetchData.Seurat(object = object, vars = features, cells = cells):
## The following requested variables were not found: CD8A

##绘制RidgePlot
RidgePlot(E15, features = c("CD44", "GNLY", "CD3E", "CD14"), ncol = 2)

saveRDS(E15, file = "./E15_sample.rds")

Check the other seurat_tutorial_v2.Rmd file for sub-group and assign cell IDs # visualization using shinnyCell # Do not knit this block, will not finish.

#library(ShinyCell)

#E15obj = readRDS("E15_sample.rds")
#scConf = createConfig(E15obj)
#makeShinyApp(E15obj, scConf, gene.mapping = TRUE,
             #shiny.title = "ssbm.e15.8k_visualization") 

#setwd("D:/Bioinformatics Projects/shalini_bm/shinyApp") # verify change of dir in console using getwd()

#shiny::runApp()