pbmc.data <- Read10X(data.dir = "/mnt/nectar_volume/home/eraz0001/KELLY 2020/E11.5/")
The most important stage is preparing Seurat object.
pbmc.data[c("Lyz1", "Hoxd13", "Cdh5"), 1:30]
## 3 x 30 sparse Matrix of class "dgCMatrix"
## [[ suppressing 30 column names 'AAACCTGAGACGCACA-1', 'AAACCTGCACAGACAG-1', 'AAACCTGCAGGCTCAC-1' ... ]]
##
## Lyz1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## Hoxd13 . . . . . . . . . . . . . . . . . 4 . . . . . 1 2 . . . 8 .
## Cdh5 . . . . 9 . . . . 17 . . 1 . . . 37 . . . . 1 . 1 . 17 . . . .
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "E11.5", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat
## 16968 features across 2683 samples within 1 assay
## Active assay: RNA (16968 features, 0 variable features)
VlnPlot(object = pbmc, pt.size = 0.5,features = c("Lyz1", "Hoxd13", "Cdh5"))
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: Lyz1
#QC metrics in Seurat. The number of genes/percent of genes are depicted in bar using the following command.
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^mt-")
VlnPlot(pbmc, features = c("nFeature_RNA","nCount_RNA","percent.mt", "percent_ribo"),ncol = 4, pt.size = 0.8)
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: percent_ribo
RidgePlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent_ribo"), ncol =1)
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: percent_ribo
## Picking joint bandwidth of 178
## Picking joint bandwidth of 896
## Picking joint bandwidth of 0.176
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
If you wish to use GGplot2 to draw your favorite bars, you can use the following command:
ggplot(pbmc@meta.data, aes(x=nFeature_RNA, y = nCount_RNA)) + geom_point()
The percent of MT genes as well as their density can be shown using the following command:
ggplot(pbmc@meta.data, aes(x=percent.mt)) + geom_density() + geom_vline(xintercept = 5, color = 'red', linetype = 2)
We can determine that the cutoff line (e.g., of 5 percent) for this.
plot3 <- FeatureScatter(pbmc, feature1 = "nFeature_RNA", feature2 = "percent.mt")
plot3
If you’re the thresholds are appropriately determined, it can be applied using the following command.
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 5)
pbmc
## An object of class Seurat
## 16968 features across 2333 samples within 1 assay
## Active assay: RNA (16968 features, 0 variable features)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(pbmc)
length(all.genes)
## [1] 16968
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: Cd24a, Gpc3, Nnat, Ldhb, Mfap4, Crabp2, H1fx, Hoxa10, Hmga2, Sdc1
## Gas1, Capn6, Sox11, Pitx1, Phgdh, Lpar1, Prrx1, Sox9, Pdlim4, Ccnd2
## Id2, Fbln1, Cdh11, Cdkn1c, Prrx2, Hoxc10, Bcl11a, Igsf3, Peg3, Frzb
## Negative: Cd34, Cdh5, Emcn, Esam, Plvap, Pecam1, Icam2, Cldn5, Ecscr, Eng
## Gimap6, Rasip1, Crip2, Tie1, Tmem255a, Flt1, Prkcdbp, Aplnr, Anxa3, Plxnd1
## Gng11, Col15a1, S100a16, Icam1, Tagln2, Tfpi, Mfng, Egfl7, Kdr, Cav1
## PC_ 2
## Positive: Sfn, Cldn4, Perp, Cldn6, Slc2a3, Pdlim1, Krt5, Cldn7, Epcam, Krt7
## Cdh1, Spint1, Krt14, Wnt6, Emp1, Cldn3, Tacstd2, Krt8, Bcam, Esrp1
## Krt18, Ovol1, Grhl3, Pvrl4, Gm16136, Spint2, Rab25, Dsp, Hspb1, Slc9a3r1
## Negative: Igfbp4, Fscn1, Hoxa10, Mest, Meg3, Asb4, Crabp2, Cdkn1c, Pitx1, Mfap4
## Capn6, Prrx1, Lpar1, Cdh11, Sox11, Nnat, Lgals1, Frzb, Maged2, Nrp1
## Col26a1, Clmp, Tmem119, Prrx2, Col9a1, Col3a1, Dusp6, Id3, Sfrp2, Mab21l2
## PC_ 3
## Positive: Rac2, Spi1, Tyrobp, Fermt3, Cd53, Evi2a, Myo1g, Psmb8, Bin2, Srgn
## Cd52, Coro1a, Myo1f, Cytip, Fmnl1, Ptpn6, Plek, Laptm5, Fcer1g, Alox5ap
## Plac8, Vav1, Plcg2, Glipr1, Pld4, Was, Parvg, Ikzf1, Itgb2, Apbb1ip
## Negative: Serpinh1, Mest, Tuba1a, Fscn1, H19, Col18a1, Sparc, Dstn, Maged2, S100a16
## Mpzl1, Pvrl2, Tspan6, Gja1, Cald1, Efna1, Igf2, Nrarp, Cd151, Crip2
## Igfbp2, Cldn5, Tmem255a, Plvap, Col4a1, Cd81, Sox11, Prkcdbp, Bgn, Cdh5
## PC_ 4
## Positive: Ran, Tuba1b, Birc5, H2afz, Top2a, Cdca8, Ccna2, Ccnb1, Tubb5, Ube2c
## Hspa8, Eif5a, Cdca3, Hmgb2, Tubb4b, Cenpa, 2810417H13Rik, Cdc20, Tpx2, Smc4
## Ube2s, Tacc3, Pbk, Ddx39, Racgap1, Anp32b, Hn1, Cdk1, H2afx, Aurkb
## Negative: Col9a2, Col2a1, Sfrp2, Col9a3, H19, Itm2a, Fibin, Vcan, Cdkn1c, Wwp2
## Osr2, Igf2, Sfrp1, Cthrc1, Matn4, Tox, Cpe, Tgfb2, Enpp2, Lect1
## Capn6, Malat1, Creb5, Cxcl12, Pmp22, Runx1t1, Meg3, Col9a1, Rhoc, Tbx18
## PC_ 5
## Positive: Kremen2, Cxcl14, Sost, Pdgfa, Wnt3, Trp63, Lypd6b, Gjb6, Pltp, Gjb2
## Htra1, Igfbp5, Itga3, Hapln1, Bcam, Lgals7, Gpx2, Fermt1, Wnt7b, Sp8
## Frem2, Epha1, C1qtnf3, Npnt, Wnt6, Fmod, Dlx2, Cbr2, Pdpn, Galnt18
## Negative: Pvrl4, Tacstd2, Rhov, Lypd3, Grhl3, Aldh3b2, Cldn3, Anxa1, Arg1, Gabrp
## Tgm1, Nebl, Upk1a, Smtnl2, Zfp750, Ppl, Tnfaip2, Penk, Paqr5, Tuba1a
## Krt8, 3830417A13Rik, Gm16136, Bspry, Mall, Ocln, Ovol1, Cldn23, Heyl, Elf5
names(pbmc)
## [1] "RNA" "pca"
print(pbmc[["pca"]], dims = 1:5, nfeatures = 10)
## PC_ 1
## Positive: Cd24a, Gpc3, Nnat, Ldhb, Mfap4, Crabp2, H1fx, Hoxa10, Hmga2, Sdc1
## Negative: Cd34, Cdh5, Emcn, Esam, Plvap, Pecam1, Icam2, Cldn5, Ecscr, Eng
## PC_ 2
## Positive: Sfn, Cldn4, Perp, Cldn6, Slc2a3, Pdlim1, Krt5, Cldn7, Epcam, Krt7
## Negative: Igfbp4, Fscn1, Hoxa10, Mest, Meg3, Asb4, Crabp2, Cdkn1c, Pitx1, Mfap4
## PC_ 3
## Positive: Rac2, Spi1, Tyrobp, Fermt3, Cd53, Evi2a, Myo1g, Psmb8, Bin2, Srgn
## Negative: Serpinh1, Mest, Tuba1a, Fscn1, H19, Col18a1, Sparc, Dstn, Maged2, S100a16
## PC_ 4
## Positive: Ran, Tuba1b, Birc5, H2afz, Top2a, Cdca8, Ccna2, Ccnb1, Tubb5, Ube2c
## Negative: Col9a2, Col2a1, Sfrp2, Col9a3, H19, Itm2a, Fibin, Vcan, Cdkn1c, Wwp2
## PC_ 5
## Positive: Kremen2, Cxcl14, Sost, Pdgfa, Wnt3, Trp63, Lypd6b, Gjb6, Pltp, Gjb2
## Negative: Pvrl4, Tacstd2, Rhov, Lypd3, Grhl3, Aldh3b2, Cldn3, Anxa1, Arg1, Gabrp
VizDimLoadings(pbmc, dims = 1:5, reduction = "pca",nfeatures = 30, col = "red") & theme(axis.text=element_text(size=5), axis.title=element_text(size=8,face="bold"))
DimPlot(pbmc, reduction = "pca", pt.size = 1) & NoLegend()
DimHeatmap(pbmc, dims = 1, nfeatures = 20, cells = 500, balanced = TRUE)
for(i in 1:15){
print(DimHeatmap(pbmc, dims = i, cells = 500, nfeatures = 20, balanced = TRUE))
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
DimHeatmap(pbmc, dims = 1:15, cells = 500,nfeatures = 30, balanced = TRUE)
Determine the ‘Dimensionality’ of the data-set
pbmc <- JackStraw(pbmc, num.replicate = 10, prop.freq = 0.1)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:20)
## Warning: Removed 28000 rows containing missing values (geom_point).
JackStrawPlot(pbmc, dims = 1:15) + coord_cartesian() + geom_abline(intercept = 0, slope = 0.05)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 21000 rows containing missing values (geom_point).
What if we determine a threshold to cut down on some PCs. To this end, we just included significant genes with FDR 0.05.
ElbowPlot(pbmc)
pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
names(pbmc)
## [1] "RNA" "RNA_nn" "RNA_snn" "pca"
pbmc <- FindClusters(pbmc, resolution = 1.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2333
## Number of edges: 67099
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8007
## Number of communities: 16
## Elapsed time: 0 seconds
pbmc
## An object of class Seurat
## 16968 features across 2333 samples within 1 assay
## Active assay: RNA (16968 features, 2000 variable features)
## 1 dimensional reduction calculated: pca
#if you wanna check the cluster Id in 5 first data-set, use this command.
head(Idents(pbmc), 5)
## AAACCTGAGACGCACA-1 AAACCTGCACAGACAG-1 AAACCTGCAGGCTCAC-1 AAACCTGTCAAGAAGT-1
## 2 0 0 5
## AAACGGGAGGATATAC-1
## 6
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#Run non-linear dimensional reduction (UMAP/tSNE)
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 15:44:45 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:44:45 Read 2333 rows and found 10 numeric columns
## 15:44:45 Using Annoy for neighbor search, n_neighbors = 30
## 15:44:45 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:44:45 Writing NN index file to temp file /tmp/RtmppzZqod/file26ccbb8b6c8c3
## 15:44:45 Searching Annoy index using 1 thread, search_k = 3000
## 15:44:46 Annoy recall = 100%
## 15:44:46 Commencing smooth kNN distance calibration using 1 thread
## 15:44:48 Found 2 connected components, falling back to 'spca' initialization with init_sdev = 1
## 15:44:48 Initializing from PCA
## 15:44:48 Using 'irlba' for PCA
## 15:44:48 PCA: 2 components explained 55.15% variance
## 15:44:48 Commencing optimization for 500 epochs, with 85398 positive edges
## 15:44:51 Optimization finished
pbmc
## An object of class Seurat
## 16968 features across 2333 samples within 1 assay
## Active assay: RNA (16968 features, 2000 variable features)
## 2 dimensional reductions calculated: pca, umap
T1 <- DimPlot(pbmc, reduction = "umap" , label = TRUE, pt.size = 2) & NoLegend()
T1
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
if you want to show the first 3 biomarkers of each cluster, use this one:
pbmc.markers %>%
group_by(cluster) %>%
slice_max(n = 3, order_by = avg_log2FC)
## # A tibble: 48 × 7
## # Groups: cluster [16]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 5.01e- 70 1.22 0.864 0.366 8.50e- 66 0 Ebf1
## 2 2.57e- 46 1.13 0.909 0.553 4.35e- 42 0 Ube2c
## 3 2.38e- 47 1.12 0.922 0.565 4.04e- 43 0 Cdc20
## 4 2.94e-138 4.28 0.915 0.247 4.99e-134 1 Cxcl14
## 5 3.94e-227 2.89 0.759 0.045 6.69e-223 1 Gjb2
## 6 3.86e-271 2.79 0.99 0.087 6.54e-267 1 Bcam
## 7 2.82e- 34 0.975 0.814 0.412 4.79e- 30 2 Hoxa11os
## 8 6.13e- 11 0.724 0.515 0.308 1.04e- 6 2 Pmaip1
## 9 2.68e- 13 0.704 0.361 0.166 4.55e- 9 2 Hoxd12
## 10 4.47e- 84 1.64 0.995 0.499 7.59e- 80 3 Prrx2
## # … with 38 more rows
VlnPlot(pbmc, features = c("Runx2", "Col1a1", "Sox9", "Mki67", "Cdk1", "Stmn1", "Top2a", "Cenpa", "Hoxd13", "Krt14", "Cdh5", "Lyz2", "Col2a1", "Col10a1", "Myog", "Sp7", "Clec11a","Rgs5", "Myh11", "Mcam", "Grem1", "Pdgfra", "Cxcl12", "Pdgfrb"))
FeaturePlot(pbmc, features = c("Runx2", "Col1a1", "Sox9", "Mki67", "Cdk1", "Stmn1", "Top2a", "Cenpa", "Hoxd13", "Tbx13", "Krt14", "Cdh5", "Lyz2", "Col2a1", "Col1a1", "Col10a1", "Myog", "Sp7", "Clec11a", "Col1a1","Rgs5", "Myh11", "Mcam", "Grem1", "Pdgfra", "Cxcl12", "Pdgfrb"))
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: Tbx13
DotPlot(pbmc, features = c("Runx2", "Col1a1", "Sox9", "Mki67", "Cdk1", "Stmn1", "Top2a", "Cenpa", "Hoxd13", "Tbx13", "Krt14", "Cdh5", "Lyz2", "Col2a1", "Col10a1", "Myog", "Sp7", "Clec11a", "Rgs5", "Myh11", "Mcam", "Grem1", "Pdgfra", "Cxcl12", "Pdgfrb"))
## Warning in FetchData(object = object, vars = features, cells = cells): The
## following requested variables were not found: Tbx13
pbmc.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + scale_fill_viridis_b() + theme(text = element_text(size = 9))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
new.cluster.ids <- c("DCh", "Osteocytes", "Osteoblast", "HtCh", "Osteoblasts", "DCh", "Endothelial Cells", "Hoxd13+", "Endothelial Cells", "DCh", "Chondrocytes", "Terminal HTCh", "Osteoblasts", "Fibroblasts", "Osteoblasts", "Immune cells")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
T2 <- DimPlot(pbmc, reduction = "umap",label.size = 4, label = TRUE, pt.size = 2) + NoLegend()
T2
Compare T1 and T2
T1 +T2
library(SingleR)
library(Seurat)
library(celldex)
##
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
##
## BlueprintEncodeData, DatabaseImmuneCellExpressionData,
## HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
## MouseRNAseqData, NovershternHematopoieticData
library(SingleCellExperiment)
pbmc <- readRDS("/mnt/nectar_volume/home/eraz0001/KELLY 2020/E11.5/Final_15_clusters.RDS")
cell.type.A <- celldex::MouseRNAseqData()
## snapshotDate(): 2020-10-27
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
sce <- as.SingleCellExperiment(DietSeurat(pbmc))
sce
## class: SingleCellExperiment
## dim: 16968 2333
## metadata(0):
## assays(2): counts logcounts
## rownames(16968): Rp1 Sox17 ... mt-Cytb Cd99
## rowData names(0):
## colnames(2333): AAACCTGAGACGCACA-1 AAACCTGCACAGACAG-1 ...
## TTTGTCATCTAGAGTC-1 TTTGTCATCTTTAGGG-1
## colData names(19): orig.ident nCount_RNA ... RNA_snn_res.2 ident
## reducedDimNames(0):
## altExpNames(0):
main.group <- SingleR(test = sce,assay.type.test = 1,ref = cell.type.A,labels = cell.type.A$label.main)
fine.group <- SingleR(test = sce,assay.type.test = 1,ref = cell.type.A,labels = cell.type.A$label.fine)
table(main.group$pruned.labels)
##
## B cells Cardiomyocytes Endothelial cells Epithelial cells
## 2 1 318 1
## Erythrocytes Fibroblasts Hepatocytes Macrophages
## 8 1930 27 3
## Monocytes Neurons NK cells T cells
## 18 3 1 7
table(fine.group$pruned.labels)
##
## aNSCs B cells Cardiomyocytes
## 11 3 1
## Endothelial cells Erythrocytes Fibroblasts
## 319 3 23
## Fibroblasts senescent Granulocytes Hepatocytes
## 594 1 2
## Macrophages Monocytes NK cells
## 2 19 2
## NPCs OPCs T cells
## 1332 8 5
pbmc@meta.data$main.group <- main.group$pruned.labels
pbmc@meta.data$fine.group <- fine.group$pruned.labels
pbmc <- SetIdent(pbmc, value = "fine.group")
T3 <- DimPlot(pbmc, label = T , repel = T,pt.size = 3, label.size = 4) + NoLegend()
T3
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
T1 + T2 + T3
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps