pbmc.data <- Read10X(data.dir = "/mnt/nectar_volume/home/eraz0001/KELLY 2020/E11.5/")

The most important stage is preparing Seurat object.

pbmc.data[c("Lyz1", "Hoxd13", "Cdh5"), 1:30]

## 3 x 30 sparse Matrix of class "dgCMatrix"

##    [[ suppressing 30 column names 'AAACCTGAGACGCACA-1', 'AAACCTGCACAGACAG-1', 'AAACCTGCAGGCTCAC-1' ... ]]

##                                                                      
## Lyz1   . . . . . . . . .  . . . . . . .  . . . . . . . . .  . . . . .
## Hoxd13 . . . . . . . . .  . . . . . . .  . 4 . . . . . 1 2  . . . 8 .
## Cdh5   . . . . 9 . . . . 17 . . 1 . . . 37 . . . . 1 . 1 . 17 . . . .

pbmc <- CreateSeuratObject(counts = pbmc.data, project = "E11.5", min.cells = 3, min.features = 200)
pbmc

## An object of class Seurat 
## 16968 features across 2683 samples within 1 assay 
## Active assay: RNA (16968 features, 0 variable features)

VlnPlot(object = pbmc, pt.size = 0.5,features = c("Lyz1", "Hoxd13", "Cdh5"))

## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: Lyz1

#QC metrics in Seurat. The number of genes/percent of genes are depicted in bar using the following command.

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^mt-")

VlnPlot(pbmc, features = c("nFeature_RNA","nCount_RNA","percent.mt", "percent_ribo"),ncol = 4, pt.size = 0.8)

## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: percent_ribo

RidgePlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent_ribo"), ncol =1)

## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: percent_ribo

## Picking joint bandwidth of 178

## Picking joint bandwidth of 896

## Picking joint bandwidth of 0.176

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

If you wish to use GGplot2 to draw your favorite bars, you can use the following command:

ggplot(pbmc@meta.data, aes(x=nFeature_RNA, y = nCount_RNA)) + geom_point()

The percent of MT genes as well as their density can be shown using the following command:

ggplot(pbmc@meta.data, aes(x=percent.mt)) + geom_density() + geom_vline(xintercept = 5, color = 'red', linetype = 2)

We can determine that the cutoff line (e.g., of 5 percent) for this.

plot3 <- FeatureScatter(pbmc, feature1 = "nFeature_RNA", feature2 = "percent.mt")
plot3

If you’re the thresholds are appropriately determined, it can be applied using the following command.

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 5)
pbmc

## An object of class Seurat 
## 16968 features across 2333 samples within 1 assay 
## Active assay: RNA (16968 features, 0 variable features)

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(pbmc)
length(all.genes)

## [1] 16968

pbmc <- ScaleData(pbmc, features = all.genes)

## Centering and scaling data matrix

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

## PC_ 1 
## Positive:  Cd24a, Gpc3, Nnat, Ldhb, Mfap4, Crabp2, H1fx, Hoxa10, Hmga2, Sdc1 
##     Gas1, Capn6, Sox11, Pitx1, Phgdh, Lpar1, Prrx1, Sox9, Pdlim4, Ccnd2 
##     Id2, Fbln1, Cdh11, Cdkn1c, Prrx2, Hoxc10, Bcl11a, Igsf3, Peg3, Frzb 
## Negative:  Cd34, Cdh5, Emcn, Esam, Plvap, Pecam1, Icam2, Cldn5, Ecscr, Eng 
##     Gimap6, Rasip1, Crip2, Tie1, Tmem255a, Flt1, Prkcdbp, Aplnr, Anxa3, Plxnd1 
##     Gng11, Col15a1, S100a16, Icam1, Tagln2, Tfpi, Mfng, Egfl7, Kdr, Cav1 
## PC_ 2 
## Positive:  Sfn, Cldn4, Perp, Cldn6, Slc2a3, Pdlim1, Krt5, Cldn7, Epcam, Krt7 
##     Cdh1, Spint1, Krt14, Wnt6, Emp1, Cldn3, Tacstd2, Krt8, Bcam, Esrp1 
##     Krt18, Ovol1, Grhl3, Pvrl4, Gm16136, Spint2, Rab25, Dsp, Hspb1, Slc9a3r1 
## Negative:  Igfbp4, Fscn1, Hoxa10, Mest, Meg3, Asb4, Crabp2, Cdkn1c, Pitx1, Mfap4 
##     Capn6, Prrx1, Lpar1, Cdh11, Sox11, Nnat, Lgals1, Frzb, Maged2, Nrp1 
##     Col26a1, Clmp, Tmem119, Prrx2, Col9a1, Col3a1, Dusp6, Id3, Sfrp2, Mab21l2 
## PC_ 3 
## Positive:  Rac2, Spi1, Tyrobp, Fermt3, Cd53, Evi2a, Myo1g, Psmb8, Bin2, Srgn 
##     Cd52, Coro1a, Myo1f, Cytip, Fmnl1, Ptpn6, Plek, Laptm5, Fcer1g, Alox5ap 
##     Plac8, Vav1, Plcg2, Glipr1, Pld4, Was, Parvg, Ikzf1, Itgb2, Apbb1ip 
## Negative:  Serpinh1, Mest, Tuba1a, Fscn1, H19, Col18a1, Sparc, Dstn, Maged2, S100a16 
##     Mpzl1, Pvrl2, Tspan6, Gja1, Cald1, Efna1, Igf2, Nrarp, Cd151, Crip2 
##     Igfbp2, Cldn5, Tmem255a, Plvap, Col4a1, Cd81, Sox11, Prkcdbp, Bgn, Cdh5 
## PC_ 4 
## Positive:  Ran, Tuba1b, Birc5, H2afz, Top2a, Cdca8, Ccna2, Ccnb1, Tubb5, Ube2c 
##     Hspa8, Eif5a, Cdca3, Hmgb2, Tubb4b, Cenpa, 2810417H13Rik, Cdc20, Tpx2, Smc4 
##     Ube2s, Tacc3, Pbk, Ddx39, Racgap1, Anp32b, Hn1, Cdk1, H2afx, Aurkb 
## Negative:  Col9a2, Col2a1, Sfrp2, Col9a3, H19, Itm2a, Fibin, Vcan, Cdkn1c, Wwp2 
##     Osr2, Igf2, Sfrp1, Cthrc1, Matn4, Tox, Cpe, Tgfb2, Enpp2, Lect1 
##     Capn6, Malat1, Creb5, Cxcl12, Pmp22, Runx1t1, Meg3, Col9a1, Rhoc, Tbx18 
## PC_ 5 
## Positive:  Kremen2, Cxcl14, Sost, Pdgfa, Wnt3, Trp63, Lypd6b, Gjb6, Pltp, Gjb2 
##     Htra1, Igfbp5, Itga3, Hapln1, Bcam, Lgals7, Gpx2, Fermt1, Wnt7b, Sp8 
##     Frem2, Epha1, C1qtnf3, Npnt, Wnt6, Fmod, Dlx2, Cbr2, Pdpn, Galnt18 
## Negative:  Pvrl4, Tacstd2, Rhov, Lypd3, Grhl3, Aldh3b2, Cldn3, Anxa1, Arg1, Gabrp 
##     Tgm1, Nebl, Upk1a, Smtnl2, Zfp750, Ppl, Tnfaip2, Penk, Paqr5, Tuba1a 
##     Krt8, 3830417A13Rik, Gm16136, Bspry, Mall, Ocln, Ovol1, Cldn23, Heyl, Elf5

names(pbmc)

## [1] "RNA" "pca"

print(pbmc[["pca"]], dims = 1:5, nfeatures = 10)

## PC_ 1 
## Positive:  Cd24a, Gpc3, Nnat, Ldhb, Mfap4, Crabp2, H1fx, Hoxa10, Hmga2, Sdc1 
## Negative:  Cd34, Cdh5, Emcn, Esam, Plvap, Pecam1, Icam2, Cldn5, Ecscr, Eng 
## PC_ 2 
## Positive:  Sfn, Cldn4, Perp, Cldn6, Slc2a3, Pdlim1, Krt5, Cldn7, Epcam, Krt7 
## Negative:  Igfbp4, Fscn1, Hoxa10, Mest, Meg3, Asb4, Crabp2, Cdkn1c, Pitx1, Mfap4 
## PC_ 3 
## Positive:  Rac2, Spi1, Tyrobp, Fermt3, Cd53, Evi2a, Myo1g, Psmb8, Bin2, Srgn 
## Negative:  Serpinh1, Mest, Tuba1a, Fscn1, H19, Col18a1, Sparc, Dstn, Maged2, S100a16 
## PC_ 4 
## Positive:  Ran, Tuba1b, Birc5, H2afz, Top2a, Cdca8, Ccna2, Ccnb1, Tubb5, Ube2c 
## Negative:  Col9a2, Col2a1, Sfrp2, Col9a3, H19, Itm2a, Fibin, Vcan, Cdkn1c, Wwp2 
## PC_ 5 
## Positive:  Kremen2, Cxcl14, Sost, Pdgfa, Wnt3, Trp63, Lypd6b, Gjb6, Pltp, Gjb2 
## Negative:  Pvrl4, Tacstd2, Rhov, Lypd3, Grhl3, Aldh3b2, Cldn3, Anxa1, Arg1, Gabrp

VizDimLoadings(pbmc, dims = 1:5, reduction = "pca",nfeatures = 30, col = "red") & theme(axis.text=element_text(size=5), axis.title=element_text(size=8,face="bold"))

DimPlot(pbmc, reduction = "pca", pt.size = 1) & NoLegend()

DimHeatmap(pbmc, dims = 1, nfeatures = 20, cells = 500, balanced = TRUE)

for(i in 1:15){
  print(DimHeatmap(pbmc, dims = i, cells = 500, nfeatures = 20, balanced = TRUE))
}

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

DimHeatmap(pbmc, dims = 1:15, cells = 500,nfeatures = 30, balanced = TRUE)

Determine the ‘Dimensionality’ of the data-set

pbmc <- JackStraw(pbmc, num.replicate = 10, prop.freq = 0.1)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:20)

## Warning: Removed 28000 rows containing missing values (geom_point).

JackStrawPlot(pbmc, dims = 1:15) + coord_cartesian() + geom_abline(intercept = 0, slope = 0.05)

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

## Warning: Removed 21000 rows containing missing values (geom_point).

What if we determine a threshold to cut down on some PCs. To this end, we just included significant genes with FDR 0.05.

ElbowPlot(pbmc)

Clustering the data

pbmc <- FindNeighbors(pbmc, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

names(pbmc)

## [1] "RNA"     "RNA_nn"  "RNA_snn" "pca"

pbmc <- FindClusters(pbmc, resolution = 1.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2333
## Number of edges: 67099
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8007
## Number of communities: 16
## Elapsed time: 0 seconds

pbmc

## An object of class Seurat 
## 16968 features across 2333 samples within 1 assay 
## Active assay: RNA (16968 features, 2000 variable features)
##  1 dimensional reduction calculated: pca

#if you wanna check the cluster Id in 5 first data-set, use this command.

head(Idents(pbmc), 5)

## AAACCTGAGACGCACA-1 AAACCTGCACAGACAG-1 AAACCTGCAGGCTCAC-1 AAACCTGTCAAGAAGT-1 
##                  2                  0                  0                  5 
## AAACGGGAGGATATAC-1 
##                  6 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

#Run non-linear dimensional reduction (UMAP/tSNE)

pbmc <- RunUMAP(pbmc, dims = 1:10)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 15:44:45 UMAP embedding parameters a = 0.9922 b = 1.112

## 15:44:45 Read 2333 rows and found 10 numeric columns

## 15:44:45 Using Annoy for neighbor search, n_neighbors = 30

## 15:44:45 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 15:44:45 Writing NN index file to temp file /tmp/RtmppzZqod/file26ccbb8b6c8c3
## 15:44:45 Searching Annoy index using 1 thread, search_k = 3000
## 15:44:46 Annoy recall = 100%
## 15:44:46 Commencing smooth kNN distance calibration using 1 thread
## 15:44:48 Found 2 connected components, falling back to 'spca' initialization with init_sdev = 1
## 15:44:48 Initializing from PCA
## 15:44:48 Using 'irlba' for PCA
## 15:44:48 PCA: 2 components explained 55.15% variance
## 15:44:48 Commencing optimization for 500 epochs, with 85398 positive edges
## 15:44:51 Optimization finished

pbmc

## An object of class Seurat 
## 16968 features across 2333 samples within 1 assay 
## Active assay: RNA (16968 features, 2000 variable features)
##  2 dimensional reductions calculated: pca, umap

T1 <- DimPlot(pbmc, reduction = "umap" , label = TRUE, pt.size = 2) & NoLegend()
T1

pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

## Calculating cluster 0

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

## Calculating cluster 6

## Calculating cluster 7

## Calculating cluster 8

## Calculating cluster 9

## Calculating cluster 10

## Calculating cluster 11

## Calculating cluster 12

## Calculating cluster 13

## Calculating cluster 14

## Calculating cluster 15

if you want to show the first 3 biomarkers of each cluster, use this one:

pbmc.markers %>%
    group_by(cluster) %>%
    slice_max(n = 3, order_by = avg_log2FC)

## # A tibble: 48 × 7
## # Groups:   cluster [16]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene    
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
##  1 5.01e- 70      1.22  0.864 0.366 8.50e- 66 0       Ebf1    
##  2 2.57e- 46      1.13  0.909 0.553 4.35e- 42 0       Ube2c   
##  3 2.38e- 47      1.12  0.922 0.565 4.04e- 43 0       Cdc20   
##  4 2.94e-138      4.28  0.915 0.247 4.99e-134 1       Cxcl14  
##  5 3.94e-227      2.89  0.759 0.045 6.69e-223 1       Gjb2    
##  6 3.86e-271      2.79  0.99  0.087 6.54e-267 1       Bcam    
##  7 2.82e- 34      0.975 0.814 0.412 4.79e- 30 2       Hoxa11os
##  8 6.13e- 11      0.724 0.515 0.308 1.04e-  6 2       Pmaip1  
##  9 2.68e- 13      0.704 0.361 0.166 4.55e-  9 2       Hoxd12  
## 10 4.47e- 84      1.64  0.995 0.499 7.59e- 80 3       Prrx2   
## # … with 38 more rows

VlnPlot(pbmc, features = c("Runx2", "Col1a1", "Sox9", "Mki67", "Cdk1", "Stmn1", "Top2a", "Cenpa", "Hoxd13", "Krt14", "Cdh5", "Lyz2", "Col2a1", "Col10a1", "Myog", "Sp7", "Clec11a","Rgs5", "Myh11", "Mcam", "Grem1", "Pdgfra", "Cxcl12", "Pdgfrb"))

FeaturePlot(pbmc, features = c("Runx2", "Col1a1", "Sox9", "Mki67", "Cdk1", "Stmn1", "Top2a", "Cenpa", "Hoxd13", "Tbx13", "Krt14", "Cdh5", "Lyz2", "Col2a1", "Col1a1", "Col10a1", "Myog", "Sp7", "Clec11a", "Col1a1","Rgs5", "Myh11", "Mcam", "Grem1", "Pdgfra", "Cxcl12", "Pdgfrb"))

## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: Tbx13

DotPlot(pbmc, features = c("Runx2", "Col1a1", "Sox9", "Mki67", "Cdk1", "Stmn1", "Top2a", "Cenpa", "Hoxd13", "Tbx13", "Krt14", "Cdh5", "Lyz2", "Col2a1", "Col10a1", "Myog", "Sp7", "Clec11a", "Rgs5", "Myh11", "Mcam", "Grem1", "Pdgfra", "Cxcl12", "Pdgfrb"))

## Warning in FetchData(object = object, vars = features, cells = cells): The
## following requested variables were not found: Tbx13

pbmc.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10

DoHeatmap(pbmc, features = top10$gene) + scale_fill_viridis_b() + theme(text = element_text(size = 9))

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

new.cluster.ids <- c("DCh", "Osteocytes", "Osteoblast", "HtCh", "Osteoblasts", "DCh", "Endothelial Cells", "Hoxd13+", "Endothelial Cells", "DCh", "Chondrocytes", "Terminal HTCh", "Osteoblasts", "Fibroblasts", "Osteoblasts", "Immune cells")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)

T2 <- DimPlot(pbmc, reduction = "umap",label.size = 4, label = TRUE, pt.size = 2) + NoLegend()
T2

Compare T1 and T2

T1 +T2

library(SingleR)
library(Seurat)
library(celldex)

## 
## Attaching package: 'celldex'

## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData

library(SingleCellExperiment)

pbmc <- readRDS("/mnt/nectar_volume/home/eraz0001/KELLY 2020/E11.5/Final_15_clusters.RDS")
cell.type.A <- celldex::MouseRNAseqData()

## snapshotDate(): 2020-10-27

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

## see ?celldex and browseVignettes('celldex') for documentation

## loading from cache

sce <- as.SingleCellExperiment(DietSeurat(pbmc))
sce

## class: SingleCellExperiment 
## dim: 16968 2333 
## metadata(0):
## assays(2): counts logcounts
## rownames(16968): Rp1 Sox17 ... mt-Cytb Cd99
## rowData names(0):
## colnames(2333): AAACCTGAGACGCACA-1 AAACCTGCACAGACAG-1 ...
##   TTTGTCATCTAGAGTC-1 TTTGTCATCTTTAGGG-1
## colData names(19): orig.ident nCount_RNA ... RNA_snn_res.2 ident
## reducedDimNames(0):
## altExpNames(0):

main.group <- SingleR(test = sce,assay.type.test = 1,ref = cell.type.A,labels = cell.type.A$label.main)
fine.group <- SingleR(test = sce,assay.type.test = 1,ref = cell.type.A,labels = cell.type.A$label.fine)
table(main.group$pruned.labels)

## 
##           B cells    Cardiomyocytes Endothelial cells  Epithelial cells 
##                 2                 1               318                 1 
##      Erythrocytes       Fibroblasts       Hepatocytes       Macrophages 
##                 8              1930                27                 3 
##         Monocytes           Neurons          NK cells           T cells 
##                18                 3                 1                 7

table(fine.group$pruned.labels)

## 
##                 aNSCs               B cells        Cardiomyocytes 
##                    11                     3                     1 
##     Endothelial cells          Erythrocytes           Fibroblasts 
##                   319                     3                    23 
## Fibroblasts senescent          Granulocytes           Hepatocytes 
##                   594                     1                     2 
##           Macrophages             Monocytes              NK cells 
##                     2                    19                     2 
##                  NPCs                  OPCs               T cells 
##                  1332                     8                     5

pbmc@meta.data$main.group <- main.group$pruned.labels
pbmc@meta.data$fine.group <- fine.group$pruned.labels
pbmc <- SetIdent(pbmc, value = "fine.group")
T3 <- DimPlot(pbmc, label = T , repel = T,pt.size = 3, label.size = 4) + NoLegend()
T3

## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

T1 + T2 + T3

## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Seurat

Ehsan Razmara Monash Uni

08/03/2022

Clustering the data