SQ: Can RUN ALL beautifully, knit to html result in a “seurat_tutorial_v2_files” folder in WD, with all the images. Not an html file though

when Knit to html, comment the shinyCell chunk, otherwise it will not finish.

#load seurat package and dependencies

library(dplyr)
library(Seurat)
library(patchwork)
setwd("D:/BioinformaticsProjects/R Code/Seurat Tutorial")

load pbmc dataset

pbmc.data <- Read10X(data.dir = "D:/BioinformaticsProjects/R Code/Seurat Tutorial/filtered_gene_bc_matrices/hg19/")

dim(pbmc.data)

## [1] 32738  2700

pbmc.data[1:10,1:6]

## 10 x 6 sparse Matrix of class "dgCMatrix"
##               AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1
## MIR1302-10                   .                .                .
## FAM138A                      .                .                .
## OR4F5                        .                .                .
## RP11-34P13.7                 .                .                .
## RP11-34P13.8                 .                .                .
## AL627309.1                   .                .                .
## RP11-34P13.14                .                .                .
## RP11-34P13.9                 .                .                .
## AP006222.2                   .                .                .
## RP4-669L17.10                .                .                .
##               AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1 AAACGCACTGGTAC-1
## MIR1302-10                   .                .                .
## FAM138A                      .                .                .
## OR4F5                        .                .                .
## RP11-34P13.7                 .                .                .
## RP11-34P13.8                 .                .                .
## AL627309.1                   .                .                .
## RP11-34P13.14                .                .                .
## RP11-34P13.9                 .                .                .
## AP006222.2                   .                .                .
## RP4-669L17.10                .                .                .

Create a Seurat object and filter data

pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)

## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

pbmc

## An object of class Seurat 
## 13714 features across 2700 samples within 1 assay 
## Active assay: RNA (13714 features, 0 variable features)

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # common RBC genes for human blood sample
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)

VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

Filter and subset cells

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

normalize expression level

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

identify 2000 most variable genes/features, to be used for downstream analysis

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

top10 <- head(VariableFeatures(pbmc), 10)
top10

##  [1] "PPBP"   "LYZ"    "S100A9" "IGLL5"  "GNLY"   "FTL"    "PF4"    "FTH1"  
##  [9] "GNG11"  "S100A8"

plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

plot1

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 1 rows containing missing values (geom_point).

plot2

## Warning: Transformation introduced infinite values in continuous x-axis
## Removed 1 rows containing missing values (geom_point).

# Preparation stage for PCA analyses, need to use ScaleData() to scale and center all data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)

## Centering and scaling data matrix

pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt") # regress out the effects of mitochondria variation

## Regressing out percent.mt
## Centering and scaling data matrix

Linear dimention reduction (PCA), use most variable features by default

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

## PC_ 1 
## Positive:  CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP 
##     FCER1G, CFD, LGALS1, LGALS2, SERPINA1, S100A8, CTSS, IFITM3, SPI1, CFP 
##     PSAP, IFI30, COTL1, SAT1, S100A11, NPC2, GRN, LGALS3, GSTP1, PYCARD 
## Negative:  MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CTSW, STK17A, CD27 
##     CD247, CCL5, GIMAP5, GZMA, AQP3, CST7, TRAF3IP3, SELL, GZMK, HOPX 
##     MAL, MYC, ITM2A, ETS1, LYAR, GIMAP7, KLRG1, NKG7, ZAP70, BEX2 
## PC_ 2 
## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74 
##     HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB 
##     BLNK, P2RX5, IGLL5, IRF8, SWAP70, ARHGAP24, FCGR2B, SMIM14, PPP1R14A, C16orf74 
## Negative:  NKG7, PRF1, CST7, GZMA, GZMB, FGFBP2, CTSW, GNLY, B2M, SPON2 
##     CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX 
##     TTC38, CTSC, APMAP, S100A4, IGFBP7, ANXA1, ID2, IL32, XCL1, RHOC 
## PC_ 3 
## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1, HLA-DPB1, CD74, MS4A1, HLA-DRB1, HLA-DRA 
##     HLA-DRB5, HLA-DQA2, TCL1A, LINC00926, HLA-DMB, HLA-DMA, CD37, HVCN1, FCRLA, IRF8 
##     PLAC8, BLNK, MALAT1, SMIM14, PLD4, IGLL5, SWAP70, P2RX5, LAT2, FCGR3A 
## Negative:  PPBP, PF4, SDPR, SPARC, GNG11, NRGN, GP9, RGS18, TUBB1, CLU 
##     HIST1H2AC, AP001189.4, ITGA2B, CD9, TMEM40, PTCRA, CA2, ACRBP, MMD, TREML1 
##     NGFRAP1, F13A1, SEPT5, RUFY1, TSC22D1, CMTM5, MPP1, MYL9, RP11-367G6.3, GP1BA 
## PC_ 4 
## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1, CD74, HLA-DPB1, HIST1H2AC, HLA-DPA1, HLA-DRB1 
##     TCL1A, PF4, HLA-DQA2, SDPR, HLA-DRA, LINC00926, PPBP, GNG11, HLA-DRB5, SPARC 
##     GP9, PTCRA, CA2, AP001189.4, CD9, NRGN, RGS18, GZMB, CLU, TUBB1 
## Negative:  VIM, IL7R, S100A6, S100A8, IL32, S100A4, GIMAP7, S100A10, S100A9, MAL 
##     AQP3, CD14, CD2, LGALS2, FYB, GIMAP4, ANXA1, RBP7, CD27, FCN1 
##     LYZ, S100A12, MS4A6A, GIMAP5, S100A11, FOLR3, TRABD2A, AIF1, IL8, TMSB4X 
## PC_ 5 
## Positive:  GZMB, FGFBP2, S100A8, NKG7, GNLY, CCL4, PRF1, CST7, SPON2, GZMA 
##     GZMH, LGALS2, S100A9, CCL3, XCL2, CD14, CLIC3, CTSW, MS4A6A, GSTP1 
##     S100A12, RBP7, IGFBP7, FOLR3, AKR1C3, TYROBP, CCL5, TTC38, XCL1, APMAP 
## Negative:  LTB, IL7R, CKB, MS4A7, RP11-290F20.3, AQP3, SIGLEC10, VIM, CYTIP, HMOX1 
##     LILRB2, PTGES3, HN1, CD2, FAM110A, CD27, ANXA5, CTD-2006K23.1, MAL, VMO1 
##     CORO1B, TUBA1B, LILRA3, GDI2, TRADD, ATP1A1, IL32, ABRACL, CCDC109B, PPA1

print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)

## PC_ 1 
## Positive:  CST3, TYROBP, LST1, AIF1, FTL 
## Negative:  MALAT1, LTB, IL32, IL7R, CD2 
## PC_ 2 
## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1 
## Negative:  NKG7, PRF1, CST7, GZMA, GZMB 
## PC_ 3 
## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPA1 
## Negative:  PPBP, PF4, SDPR, SPARC, GNG11 
## PC_ 4 
## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1 
## Negative:  VIM, IL7R, S100A6, S100A8, IL32 
## PC_ 5 
## Positive:  GZMB, FGFBP2, S100A8, NKG7, GNLY 
## Negative:  LTB, IL7R, CKB, MS4A7, RP11-290F20.3

VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

make PCA dimension plots

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:5, cells = 500, balanced = TRUE)

# determine number of PCs in the dataset

pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)

## Warning: Removed 23407 rows containing missing values (geom_point).

ElbowPlot(pbmc)

# cell clustering

pbmc <- FindNeighbors(pbmc, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

pbmc <- FindClusters(pbmc, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2638
## Number of edges: 95893
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8735
## Number of communities: 9
## Elapsed time: 0 seconds

head(Idents(pbmc), 5)

## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1 
##                0                3                2                1 
## AAACCGTGTATGCG-1 
##                6 
## Levels: 0 1 2 3 4 5 6 7 8

pbmc <- RunTSNE(pbmc, dims = 1:10)
DimPlot(pbmc,reduction = "tsne",label = TRUE,pt.size = 1.5)

pbmc <- RunUMAP(pbmc, dims = 1:10)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 10:08:31 UMAP embedding parameters a = 0.9922 b = 1.112

## 10:08:31 Read 2638 rows and found 10 numeric columns

## 10:08:31 Using Annoy for neighbor search, n_neighbors = 30

## 10:08:31 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 10:08:31 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\RtmpKoDCsM\file48bc6e614ea3
## 10:08:31 Searching Annoy index using 1 thread, search_k = 3000
## 10:08:32 Annoy recall = 100%
## 10:08:33 Commencing smooth kNN distance calibration using 1 thread
## 10:08:34 Initializing from normalized Laplacian + noise
## 10:08:34 Commencing optimization for 500 epochs, with 106338 positive edges
## 10:08:42 Optimization finished

DimPlot(pbmc,reduction = "umap",label = TRUE,pt.size = 1.5)

# score cell cycle

pbmc<- CellCycleScoring(object = pbmc, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)

## Warning: The following features are not present in the object: DTL, UHRF1,
## MLF1IP, EXO1, CASP8AP2, BRIP1, E2F8, not searching for symbol synonyms

## Warning: The following features are not present in the object: FAM64A, BUB1,
## HJURP, CDCA3, TTK, CDC25C, DLGAP5, CDCA2, ANLN, GAS2L3, not searching for symbol
## synonyms

head(x = pbmc@meta.data)

##                  orig.ident nCount_RNA nFeature_RNA percent.mt percent.HB
## AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759          0
## AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958          0
## AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363          0
## AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845          0
## AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898          0
## AAACGCACTGGTAC-1     pbmc3k       2163          781  1.6643551          0
##                  RNA_snn_res.0.5 seurat_clusters     S.Score    G2M.Score Phase
## AAACATACAACCAC-1               0               0  0.09853841 -0.044716507     S
## AAACATTGAGCTAC-1               3               3 -0.02364305 -0.046889929    G1
## AAACATTGATCAGC-1               2               2 -0.02177266  0.074841537   G2M
## AAACCGTGCTTCCG-1               1               1  0.03794398  0.006575446     S
## AAACCGTGTATGCG-1               6               6 -0.03309970  0.027910063   G2M
## AAACGCACTGGTAC-1               2               2 -0.04814181 -0.078164839    G1

DimPlot(pbmc,reduction = "tsne",label = TRUE,group.by="Phase",pt.size = 1.5)

# inspect and save UMI, Standardize and normalize data

write.table(pbmc[["RNA"]]@counts,"./umi.xls",sep="\t") # 原始UMI Original UMI
write.table(pbmc[["RNA"]]@data,"./data.xls",sep="\t")#标准化的data，进行了LogNormalize
write.table(pbmc[["RNA"]]@scale.data,"./scale.data.xls",sep="\t")#归一化的scale.data,有正负

save results

saveRDS(pbmc, file = "./pbmc_tutorial.rds")
save(pbmc,file="./res0.5.Robj")

double finder

library(DoubletFinder)
sweep.res.list_pbmc <- paramSweep_v3(pbmc, PCs = 1:10, sct = FALSE)

## [1] "Creating artificial doublets for pN = 5%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 10%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 15%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 20%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 25%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."
## [1] "Creating artificial doublets for pN = 30%"
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Defining neighborhoods..."
## [1] "Computing pANN across all pK..."
## [1] "pK = 0.005..."
## [1] "pK = 0.01..."
## [1] "pK = 0.02..."
## [1] "pK = 0.03..."
## [1] "pK = 0.04..."
## [1] "pK = 0.05..."
## [1] "pK = 0.06..."
## [1] "pK = 0.07..."
## [1] "pK = 0.08..."
## [1] "pK = 0.09..."
## [1] "pK = 0.1..."
## [1] "pK = 0.11..."
## [1] "pK = 0.12..."
## [1] "pK = 0.13..."
## [1] "pK = 0.14..."
## [1] "pK = 0.15..."
## [1] "pK = 0.16..."
## [1] "pK = 0.17..."
## [1] "pK = 0.18..."
## [1] "pK = 0.19..."
## [1] "pK = 0.2..."
## [1] "pK = 0.21..."
## [1] "pK = 0.22..."
## [1] "pK = 0.23..."
## [1] "pK = 0.24..."
## [1] "pK = 0.25..."
## [1] "pK = 0.26..."
## [1] "pK = 0.27..."
## [1] "pK = 0.28..."
## [1] "pK = 0.29..."
## [1] "pK = 0.3..."

sweep.stats_pbmc <- summarizeSweep(sweep.res.list_pbmc, GT = FALSE)
bcmvn_pbmc <- find.pK(sweep.stats_pbmc)

## NULL

opt_pK <- as.numeric(as.vector(bcmvn_pbmc$pK[which.max(bcmvn_pbmc$BCmetric)]))
print(opt_pK)

## [1] 0.01

annotations <- pbmc@meta.data$seurat_clusters
homotypic.prop <- modelHomotypic(annotations)           
nExp_poi <- round(0.06*nrow(pbmc@meta.data))  
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

# find doublets
pbmc <- doubletFinder_v3(pbmc,PCs = 1:10,pN = 0.25,pK = opt_pK,nExp = nExp_poi,reuse.pANN = FALSE,sct = FALSE)

## [1] "Creating 879 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."

DimPlot(pbmc, reduction = "tsne", group.by = "DF.classifications_0.25_0.01_158",order = T)

table(pbmc$DF.classifications_0.25_0.01_158)

## 
## Doublet Singlet 
##     158    2480

find markers for cluster 1

cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25) #SQ: marker has to be expressed in at least 25% of cluster 1 cells.
head(cluster1.markers, n = 5) # top 5 markers for cluster 1, that distinct from the other clusters

##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## S100A9  0.000000e+00   5.570063 0.996 0.215  0.000000e+00
## S100A8  0.000000e+00   5.477394 0.975 0.121  0.000000e+00
## FCN1    0.000000e+00   3.394219 0.952 0.151  0.000000e+00
## LGALS2  0.000000e+00   3.800484 0.908 0.059  0.000000e+00
## CD14   2.856582e-294   2.815626 0.667 0.028 3.917516e-290

find markers for each cluster

pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

## Calculating cluster 0

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

## Calculating cluster 6

## Calculating cluster 7

## Calculating cluster 8

pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)

## # A tibble: 18 × 7
## # Groups:   cluster [9]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene    
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
##  1 1.88e-117       1.08 0.913 0.588 2.57e-113 0       LDHB    
##  2 5.01e- 85       1.34 0.437 0.108 6.88e- 81 0       CCR7    
##  3 0               5.57 0.996 0.215 0         1       S100A9  
##  4 0               5.48 0.975 0.121 0         1       S100A8  
##  5 1.93e- 80       1.27 0.981 0.65  2.65e- 76 2       LTB     
##  6 2.91e- 58       1.27 0.667 0.251 3.98e- 54 2       CD2     
##  7 0               4.31 0.939 0.042 0         3       CD79A   
##  8 1.06e-269       3.59 0.623 0.022 1.45e-265 3       TCL1A   
##  9 3.60e-221       3.21 0.984 0.226 4.93e-217 4       CCL5    
## 10 4.27e-176       3.01 0.573 0.05  5.85e-172 4       GZMK    
## 11 3.51e-184       3.31 0.975 0.134 4.82e-180 5       FCGR3A  
## 12 2.03e-125       3.09 1     0.315 2.78e-121 5       LST1    
## 13 3.17e-267       4.83 0.961 0.068 4.35e-263 6       GZMB    
## 14 1.04e-189       5.28 0.961 0.132 1.43e-185 6       GNLY    
## 15 1.48e-220       3.87 0.812 0.011 2.03e-216 7       FCER1A  
## 16 1.67e- 21       2.87 1     0.513 2.28e- 17 7       HLA-DPB1
## 17 7.73e-200       7.24 1     0.01  1.06e-195 8       PF4     
## 18 3.68e-110       8.58 1     0.024 5.05e-106 8       PPBP

##存储marker -save markers
write.table(pbmc.markers,file="./allmarker.txt",sep="\t")

SingleR for cell type identification

library(SingleR)

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## 
## Attaching package: 'stats4'

## The following object is masked from 'package:spam':
## 
##     mle

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:spam':
## 
##     cbind, rbind

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:sp':
## 
##     %over%

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'SummarizedExperiment'

## The following object is masked from 'package:SeuratObject':
## 
##     Assays

## The following object is masked from 'package:Seurat':
## 
##     Assays

library(celldex)

## 
## Attaching package: 'celldex'

## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData

refdata <- celldex::HumanPrimaryCellAtlasData()

## snapshotDate(): 2022-04-26

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

refdata_ms <- celldex::MouseRNAseqData()

## snapshotDate(): 2022-04-26

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

cellpred <- SingleR(test = pbmc@assays$RNA@data, ref = refdata, labels = refdata$label.fine, clusters = pbmc@active.ident, assay.type.test = "logcounts", assay.type.ref = "logcounts")

cellpred$labels

## [1] "T_cell:CD4+_central_memory" "Monocyte:CD16-"            
## [3] "T_cell:CD4+_central_memory" "B_cell:immature"           
## [5] "T_cell:CD8+"                "Monocyte:CD16+"            
## [7] "NK_cell"                    "Monocyte:CD16-"            
## [9] "Monocyte"

# output of calculated cell types
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = F)
write.table(celltype,"./celltype_singleR.xls",row.names = F,sep="\t")


#plot the calculated cell types

pbmc@meta.data$singleR=celltype[match(pbmc@active.ident,celltype$ClusterID),'celltype']
DimPlot(pbmc, reduction = "tsne", group.by = "singleR",label=T, label.size=5)

# collection of plotting codes

FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("gray", "red"))

##绘制Marker基因的小提琴图
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

##绘制分cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)#n = 10可以自行调整数字
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A

##绘制气泡图
DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))

##绘制RidgePlot
RidgePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14"), ncol = 2)

# subset a cluster for further sub-clustering

sub_pbmc<-subset(pbmc, idents = '0')
DimPlot(sub_pbmc, reduction = "tsne",label = TRUE, pt.size = 1.5)

##########对取出的cluster重新进行分群 - re-cluster the subset
sub_pbmc <- FindVariableFeatures(sub_pbmc, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(sub_pbmc)
sub_pbmc <- ScaleData(sub_pbmc, features = all.genes)

## Centering and scaling data matrix

sub_pbmc <- RunPCA(sub_pbmc, features = VariableFeatures(object = sub_pbmc))

## PC_ 1 
## Positive:  PRSS57, RAB13, CYTL1, SDPR, LAPTM4B, EGFL7, C19orf77, DEPTOR, FES, IL1B 
##     PTMS, GSN, RPL3, SOX4, RPS4X, FCER1A, KIAA0125, RPS2, AKR1C3, HLA-DRB5 
##     CNRIP1, LYL1, RPS5, CDK6, RPL10A, ACSM3, PABPC1, RP11-262H14.4, IGFBP7, CAT 
## Negative:  TMSB4X, MALAT1, B2M, IL32, CD3D, CCL5, TIGIT, CBR4, GZMK, GABPB1 
##     YTHDF1, LTB, CTC-444N24.11, MEPCE, UBR1, UNC93B1, CLASP2, F5, HLA-A, SH2D3A 
##     ATXN7L1, WDR91, CHP1, RP11-108M9.6, HIPK3, CLIC3, ABR, C5orf24, TRMT6, RNF185 
## PC_ 2 
## Positive:  RPL3, RPL19, RPS5, RPL10A, RPS3A, RPS12, RPL10, RPS2, RPS6, RPS4X 
##     RPL15, RPS8, RPS27A, RPL7, GLTSCR2, RPL13A, BTG1, EEF1D, NACA, TPT1 
##     TMEM66, PABPC1, RPL13, RPL18, RPL8, CD7, EVL, CD8B, NOSIP, S100B 
## Negative:  PRSS57, SDPR, CYTL1, S100A4, RAB13, IL1B, ANKRD28, LAPTM4B, CCL5, KLF6 
##     C19orf77, C5orf24, PTMS, CNRIP1, FES, EGFL7, HLA-DRB5, RP11-262H14.4, DEPTOR, CBR4 
##     TMSB4X, VGLL4, DDA1, BTK, AKR1C3, CXXC5, SOX4, RP1-43E13.2, FCER1A, CDK6 
## PC_ 3 
## Positive:  MALAT1, RPL13, RPL13A, RPS3A, RPS6, RPS4X, CD8B, RPS12, RPS18, RPL19 
##     RPS2, GTF2I, SDPR, CD8A, RPL10, RPS3, SPAG9, SARM1, IDH2, H1F0 
##     YBX3, PRSS57, RPL3, RP11-291B21.2, CTD-2260A17.2, RP11-620J15.3, RAB13, RC3H2, REG4, RPS27A 
## Negative:  FOS, JUNB, JUN, H3F3B, IER2, B2M, KLF6, EIF1, TMSB4X, ZFP36 
##     LTB, CXCR4, TMEM66, S100A4, HLA-A, IL7R, IL32, FTH1, RP11-1399P15.1, HLA-B 
##     IFITM2, HNRNPDL, YPEL5, RGS1, DHRS7, PLP2, PNRC1, KRT1, FXYD5, RARRES3 
## PC_ 4 
## Positive:  S100A4, RPL8, ALOX5AP, LMNA, RPS18, RPS7, TNFRSF4, HOPX, VIM, TNFRSF18 
##     RPS2, RPL13A, RPL10, SH3BGRL3, CTSH, RPS3, TRADD, CD99, FLNA, B2M 
##     KLRB1, RPS8, RPS4X, LTB, IFITM2, GSTK1, ACTB, KLRG1, FXYD5, GNB2L1 
## Negative:  MALAT1, JUNB, BTG1, AIF1, MT-CO2, RPL34, YPEL5, PCYOX1L, JUN, FOS 
##     TTC1, RPF2, SELL, DUSP22, GIMAP5, MT-ND2, EIF4A2, TRAF3IP3, FBXO11, DENND2D 
##     CXCR4, PIP4K2C, STK17A, NDFIP1, GIMAP4, LDLRAP1, MYB, TRMT2A, GPR160, AC016629.8 
## PC_ 5 
## Positive:  SLC40A1, LYRM5, ITM2A, ICOS, AQP3, MAL, ZNF264, BPNT1, GLTSCR2, SELL 
##     VAPA, IL23A, RAD54L2, ZFYVE28, RMDN2, DTWD2, KDM1B, RPS9, ABI2, CD82 
##     POLL, FAM213A, RP1-43E13.2, MICU1, ZNF444, RP11-66N11.8, USP38, ATP5SL, ZCCHC11, AEN 
## Negative:  CD8B, CD8A, RP11-291B21.2, CCL5, CARS, MT-CO2, MT-ND2, MT-CYB, SIL1, AC092580.4 
##     S100B, MT-CO3, DNAJA2, RPL34, ENO1, S100A4, DLEU1, C1orf228, ATP6V0E2, SH3BGRL3 
##     PLP2, B2M, ASB1, NFYB, ETAA1, CPNE2, RNF141, IL32, STK24, CST7

sub_pbmc <- FindNeighbors(sub_pbmc, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

sub_pbmc <- FindClusters(sub_pbmc, resolution = 0.8)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 709
## Number of edges: 28401
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.4569
## Number of communities: 4
## Elapsed time: 0 seconds

sub_pbmc <- RunTSNE(sub_pbmc, dims = 1:10)
sub_pbmc <- RunUMAP(sub_pbmc, dims = 1:10)

## 10:15:19 UMAP embedding parameters a = 0.9922 b = 1.112

## Found more than one class "dist" in cache; using the first, from namespace 'spam'

## Also defined by 'BiocGenerics'

## 10:15:19 Read 709 rows and found 10 numeric columns

## 10:15:19 Using Annoy for neighbor search, n_neighbors = 30

## Found more than one class "dist" in cache; using the first, from namespace 'spam'

## Also defined by 'BiocGenerics'

## 10:15:19 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 10:15:19 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\RtmpKoDCsM\file48bc1e09395d
## 10:15:19 Searching Annoy index using 1 thread, search_k = 3000
## 10:15:20 Annoy recall = 100%
## 10:15:21 Commencing smooth kNN distance calibration using 1 thread
## 10:15:22 Initializing from normalized Laplacian + noise
## 10:15:23 Commencing optimization for 500 epochs, with 23594 positive edges
## 10:15:25 Optimization finished

DimPlot(sub_pbmc, reduction = "tsne",label = TRUE, pt.size = 1.5)

DimPlot(sub_pbmc, reduction = "umap",label = TRUE, pt.size = 1.5)

assign cell types according to markers for each cluster, whole cells

new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 1.5) + NoLegend()

write.table(Idents(pbmc),"./cell-type.xls",sep="\t",quote = F)

save results

saveRDS(pbmc, file = "./pbmc3k_final.rds")

additional plots

VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

##按照细胞类型绘制分cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: CD8A,
## VPREB3, CD40LG, PIK3IP1, PRKCQ-AS1, NOSIP, LEF1, CD3E, CD3D, CCR7, LDHB, RPS3A

##按照细胞类型绘制气泡图
DotPlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"),cols = c("blue", "red"))

##按照细胞类型绘制RidgePlot
RidgePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14"), ncol = 2)

## Picking joint bandwidth of 0.0234

## Picking joint bandwidth of 0.0831

## Picking joint bandwidth of 0.126

## Picking joint bandwidth of 0.0337

# For the next step Cellphonedb analysis

###count文件生成 - generating count file
count_raw <- pbmc[["RNA"]]@counts
count_norm <- apply(count_raw, 2, function(x) (x/sum(x))*10000)
write.table(count_norm, './cellphonedb_count.txt', sep ='\t', quote = F)

###meta文件生成 -generating meta file
write.table(Idents(pbmc),"./cellphonedb_meta.txt",sep="\t",quote = F)

visualization using shinnyCell

Do not knit this block, will not finish.

#library(ShinyCell)

#seurat_integrated = readRDS("pbmc3k_final.RDS")
#scConf = createConfig(seurat_integrated)
#makeShinyApp(seurat_integrated, scConf, gene.mapping = TRUE,
 #            shiny.title = "pbmc3k visualization") 

#setwd("./shinyApp") # verify change of dir in console using getwd()

#shiny::runApp()

workshop seurat_tutorial_v2

Shenfeng Qiu

2022-07-23

SQ: Can RUN ALL beautifully, knit to html result in a “seurat_tutorial_v2_files” folder in WD, with all the images. Not an html file though

when Knit to html, comment the shinyCell chunk, otherwise it will not finish.

Create a Seurat object and filter data

Filter and subset cells

normalize expression level

identify 2000 most variable genes/features, to be used for downstream analysis

Linear dimention reduction (PCA), use most variable features by default

make PCA dimension plots

save results

double finder

find markers for cluster 1

find markers for each cluster

SingleR for cell type identification

assign cell types according to markers for each cluster, whole cells

save results

additional plots

visualization using shinnyCell

Do not knit this block, will not finish.