- High-level goal: Perform single-cell analysis and cell-type clustering - Medium-level goal: Understand how Seurat works and how it’s implemented in R *- Low-level goal: Understand how individual functions work to accomplish our goal.
Consider all the theoretical steps to take us from where we’re starting to our goal. - What do we start with? - What do we want to end with? - How will we accomplish this?
How do we actually perform this, in Seurat style code?
Last time, we developed a pipeline. We made it up to creating an Elbow plot, which is a plot of the most significant principal components.
library(dplyr)
library(Seurat)
library(patchwork)
getwd()
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "../data/filtered_gene_bc_matrices/hg19/")
# pbmc.data <- Seurat::Read10X_h5(filename = "../data/PBMC_Covid/Normal_PBMC_13.h5", use.names = T)
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
pbmc # View the object.
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") # Get mitochondrial genes.
# Create violin plots for quality control.
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# Create scatterplots for the distribution of cells and their data-types
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# Remove cells that have too few features (empty droplets), too many features (doublets), or express too many mitochondrial genes. These cells are the only cells 'worth' doing additional analysis on.
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) # we have control over the amount of features we want to consider "differentially expressed".
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) # Performs PCA. The second argument isn't even needed.
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
This is for slower, more precise determination of PCs
# we can comment out if we want. They're quite slow functions.
# pbmc <- JackStraw(pbmc, num.replicate = 100)
# pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
# JackStrawPlot(pbmc, dims = 1:20)
We can an elbow plot to consider the “most valuable” principal components, quickly.
ElbowPlot(pbmc)
Many methods to dimensionality reduction. Seurat implements PCA, TSNE, and UMAP, but the current most popular are PCA and UMAP.
Question: PCA before UMAP?
Answer: It’s good practice. The original paper suggests that noise gets
suppressed by removing “useless” variables.
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) # We can adjust the dimensions, and the cells, and whether we want an equal of +/- features.
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2638
## Number of edges: 95840
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8726
## Number of communities: 9
## Elapsed time: 0 seconds
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
DimPlot(pbmc, reduction = "umap", label = TRUE)
We can use FindMarkers and FindAllMarkers to identify which genes best separate clusters of cells. - FindMarkers works on specific clusters compared to specific others
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## IL32 7.705178e-91 1.2155919 0.947 0.466 1.056688e-86
## LTB 2.582056e-85 1.2648768 0.981 0.643 3.541032e-81
## CD3D 2.722666e-70 0.9354316 0.922 0.432 3.733865e-66
## IL7R 9.563577e-68 1.1848845 0.751 0.327 1.311549e-63
## LDHB 2.746137e-66 0.9185706 0.953 0.614 3.766053e-62
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## FCGR3A 6.074132e-208 4.251260 0.970 0.039 8.330065e-204
## IFITM3 1.269986e-199 3.889100 0.976 0.048 1.741659e-195
## CFD 1.779978e-198 3.412065 0.939 0.037 2.441062e-194
## CD68 5.000885e-195 3.018954 0.927 0.035 6.858213e-191
## RP11-290F20.3 5.290916e-188 2.707301 0.829 0.016 7.255962e-184
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC) # What's happening here?
## # A tibble: 18 × 7
## # Groups: cluster [9]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 2.87e- 84 1.34 0.436 0.108 3.94e- 80 0 CCR7
## 2 5.95e- 49 1.05 0.333 0.104 8.17e- 45 0 LEF1
## 3 0 5.56 0.996 0.216 0 1 S100A9
## 4 0 5.48 0.975 0.122 0 1 S100A8
## 5 2.58e- 85 1.26 0.981 0.643 3.54e- 81 2 LTB
## 6 4.69e- 59 1.24 0.423 0.111 6.44e- 55 2 AQP3
## 7 0 4.31 0.936 0.041 0 3 CD79A
## 8 9.48e-271 3.59 0.622 0.022 1.30e-266 3 TCL1A
## 9 1.71e-186 3.05 0.981 0.241 2.35e-182 4 CCL5
## 10 1.39e-156 2.94 0.585 0.059 1.91e-152 4 GZMK
## 11 2.74e-183 3.30 0.97 0.134 3.76e-179 5 FCGR3A
## 12 8.88e-127 3.08 1 0.314 1.22e-122 5 LST1
## 13 1.04e-189 5.28 0.961 0.132 1.43e-185 6 GNLY
## 14 3.17e-267 4.83 0.961 0.068 4.35e-263 6 GZMB
## 15 1.48e-220 3.87 0.812 0.011 2.03e-216 7 FCER1A
## 16 1.67e- 21 2.87 1 0.513 2.28e- 17 7 HLA-DPB1
## 17 3.68e-110 8.58 1 0.024 5.05e-106 8 PPBP
## 18 7.73e-200 7.24 1 0.01 1.06e-195 8 PF4
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP","CCR7"))
pbmc.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
Consider this pbmc data from a patient with covid.
library(hdf5r)
getwd()
## [1] "/Users/iancoccimiglio/Library/CloudStorage/OneDrive-Personal/RstudioCode/CodingClub/Eight"
cov.15.data <- Seurat::Read10X_h5(filename = "../data/PBMC_Covid/nCoV_PBMC_15.h5")
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
cov15 <- CreateSeuratObject(counts = cov.15.data, project = "pbmc_Covid")
cov15 <- PercentageFeatureSet(cov15, "^MT-", col.name = "percent_mito")
cov15 <- PercentageFeatureSet(cov15, "^RP[SL]", col.name = "percent_ribo") # Ribosomal genes
cov15 <- PercentageFeatureSet(cov15, "^HB[^(P)]", col.name = "percent_hb") # Hemoglobin genes
cov15 <- PercentageFeatureSet(cov15, "PECAM1|PF4", col.name = "percent_plat") # Platelet genes?
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
VlnPlot(cov15, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 3) +
NoLegend()
FeatureScatter(cov15, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5)
FeatureScatter(cov15, "nFeature_RNA", "percent_mito", group.by = "orig.ident", pt.size = 0.5)