single-cell RNA-seq analysis

Read in packages

library(Seurat)
library(tidyverse)

Read in data

# read in dataset taken from the lung cancer paper
seurat_lung_mouse <- readRDS(file = "/Users/cathal.king/Desktop/SAGC_workshop/seurat_lung_mouse.rds")
# observe data
seurat_lung_mouse
## An object of class Seurat 
## 28205 features across 17549 samples within 1 assay 
## Active assay: originalexp (28205 features, 0 variable features)
##  1 dimensional reduction calculated: SPRING
# check dimensions of dataset
dim(seurat_lung_mouse)
## [1] 28205 17549
# Plot the UMAP plot as shown in Figure 1D of the paper.
DimPlot(object = seurat_lung_mouse, reduction = "SPRING", group.by = "Major.cell.type", label = T)

Re-run Seurat pipeline on this data.

Normalise data

Normalisation of gene counts.

Normalisation of gene counts.

# normalize with the log normalise method
seurat_lung_mouse <- Seurat::NormalizeData(seurat_lung_mouse, normalization.method = "LogNormalize")
# alternate methods for normalisation are possible and relevant to certain use cases

Clustering

Cell Clustering.

Cell Clustering.

# standard Seurat clustering work-flow
seurat_lung_mouse <- FindVariableFeatures(seurat_lung_mouse, nfeatures = 2000)
seurat_lung_mouse <- ScaleData(seurat_lung_mouse)
## Centering and scaling data matrix
seurat_lung_mouse <- RunPCA(seurat_lung_mouse, verbose = F)
ElbowPlot(seurat_lung_mouse)

seurat_lung_mouse <- FindNeighbors(seurat_lung_mouse, dims = 1:30, reduction = "pca")
## Computing nearest neighbor graph
## Computing SNN
seurat_lung_mouse <- FindClusters(seurat_lung_mouse, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 17549
## Number of edges: 693646
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9423
## Number of communities: 21
## Elapsed time: 1 seconds

Dimensionality Reduction

Dimensionality reduction.

Dimensionality reduction.

# Calculate UMAP embedding
seurat_lung_mouse <- RunUMAP(object = seurat_lung_mouse, 
                             reduction = "pca",
                             dims = 1:30,
#                             n.neighbors = 10,
                             reduction.name = "New_UMAP", 
                             verbose = F)

# plot UMAP from what we calculated above
p <- DimPlot(object = seurat_lung_mouse, 
             reduction = "New_UMAP", 
             group.by = "Major.cell.type", 
             label = T, 
             pt.size = 0.5) + 
  NoLegend() +
  ggtitle("Seurat UMAP")

# UMAP from paper
q <- DimPlot(object = seurat_lung_mouse, 
             reduction = "SPRING", 
             group.by = "Major.cell.type", 
             label = T) + 
  NoLegend() +
  ggtitle("Figure 1D UMAP")

# plot 2 UMAP's side-by-side for comparison
p|q

## plot cell clusters on the UMAP
DimPlot(object = seurat_lung_mouse, 
        reduction = "New_UMAP", 
        group.by = "seurat_clusters", 
        label = T, 
        pt.size = 0.5) + 
  NoLegend()

Some other UMAP considerations

A UMAP representation of cell clusters.

A UMAP representation of cell clusters.

## opacity
DimPlot(object = seurat_lung_mouse, 
        reduction = "SPRING", 
        group.by = "Major.cell.type", 
        label = T, 
        pt.size = 0.5)
## Warning: Removed 1610 rows containing missing values (`geom_point()`).

## order and shuffle parameter
j <- DimPlot(object = seurat_lung_mouse,
        reduction = "New_UMAP",
        group.by = "Major.cell.type",
        label = T,
        pt.size = 0.5,
#        order = NULL,
        shuffle = T)
j

## CVD
library(dittoSeq)
#
dittoDimPlot(object = seurat_lung_mouse, 
             reduction.use = "SPRING", 
             var = "Major.cell.type",
             opacity = 0.5,
             do.label = T,
             labels.size = 3) + ggtitle("My UMAP Representation")
## Warning: Removed 1610 rows containing missing values (`geom_point()`).

#
dittoDimPlot(object = seurat_lung_mouse, 
             reduction.use = "SPRING", 
             var = "Major.cell.type",
             opacity = 0.5,
             do.hover = T)

Cell cluster analysis

Differentially expressed genes per cluster

Differentially expressed genes per cluster

# find all markers of cluster 2
cluster2.markers <- FindMarkers(seurat_lung_mouse, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
##               p_val avg_log2FC pct.1 pct.2 p_val_adj
## 4930402H24Rik     0   2.486358 0.583 0.099         0
## Actg1             0  -1.680312 0.583 0.862         0
## Alox5ap           0  -3.312858 0.043 0.513         0
## Anxa2             0  -2.620054 0.037 0.481         0
## Bank1             0   2.074459 0.266 0.025         0
# 
# # find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(seurat_lung_mouse, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## Cd74    4.635817e-232   4.395628 0.340 0.039 1.307532e-227
## S100a11 1.002689e-182  -1.903234 0.150 0.747 2.828083e-178
## S100a6  3.157390e-170  -1.893960 0.140 0.716 8.905418e-166
## Selplg  6.151238e-169  -2.004332 0.089 0.673 1.734957e-164
## Fos     7.315773e-166  -2.075319 0.178 0.724 2.063414e-161
# # find markers for every cluster compared to all remaining cells, report only the positive ones
seurat_lung_mouse.markers <- FindAllMarkers(seurat_lung_mouse, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
seurat_lung_mouse.markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 42 × 7
## # Groups:   cluster [21]
##    p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene  
##    <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr> 
##  1     0       3.41 0.787 0.111         0 0       Retnlg
##  2     0       2.80 0.549 0.082         0 0       Mmp8  
##  3     0       3.14 0.74  0.098         0 1       Clec4n
##  4     0       2.98 0.745 0.138         0 1       Ier3  
##  5     0       3.73 0.947 0.063         0 2       Cd79a 
##  6     0       3.29 0.587 0.035         0 2       Fcmr  
##  7     0       3.19 0.547 0.03          0 3       Thy1  
##  8     0       2.75 0.337 0.025         0 3       Lef1  
##  9     0       3.99 0.825 0.101         0 4       Plac8 
## 10     0       2.98 0.411 0.024         0 4       Fn1   
## # ℹ 32 more rows
## Export that table to a file and open in Excel for example.
#write.csv(x = seurat_lung_mouse.markers, file = "/PATH/seurat_lung_mouse.markers.csv")

## visualise DE
VlnPlot(seurat_lung_mouse, features = c("Cd74", "Trp53"))

# featureplot DE 
FeaturePlot(seurat_lung_mouse, features = c("Cd74", "Trp53"), reduction = "SPRING")

Spatial Transcriptomics

10x Visium Spatial Spots

10x Visium Spatial Spots

Spatial Transcriptomics Image

Spatial Transcriptomics Image

# read in
spatial_seurat <- readRDS("/Users/cathal.king/Documents/Projects/DL/Visium_Mouse/Seurat/Visium_mouse_Re_seurat_A1.rds")

# add name in idents
spatial_seurat@meta.data$orig.ident <- "A1"

# plot spatial image
SpatialPlot(spatial_seurat,
            pt.size.factor = 2.5,
            alpha = 1) + 
  NoLegend() + 
  ggtitle('Spatial Transcriptomics mouse colon') + 
  theme(plot.title = element_text(hjust = 0.5))

QC

# QC metrics
spatial_seurat <- PercentageFeatureSet(spatial_seurat, "^mt-", 
                                       col.name = "percent_mito")
# plot
SpatialFeaturePlot(spatial_seurat, 
                   features = c("nFeature_Spatial", "nCount_Spatial", "percent_mito"), 
                   pt.size.factor = 2.8)

Clustering and Dimensionality Reduction

# Normalise and process data
spatial_seurat <- SCTransform(spatial_seurat, assay = "Spatial", verbose = FALSE)
spatial_seurat <- RunPCA(spatial_seurat, assay = "SCT", verbose = FALSE)
spatial_seurat <- FindNeighbors(spatial_seurat, reduction = "pca", dims = 1:30)
spatial_seurat <- FindClusters(spatial_seurat, verbose = FALSE, algorithm = 1)
spatial_seurat <- RunUMAP(spatial_seurat, reduction = "pca", dims = 1:30)

## UMAP plot
p1 <- DimPlot(spatial_seurat, reduction = "umap", label = TRUE, pt.size = 3, label.size = 7) + ggtitle("UMAP representation of Spot Clusters for A1")+
  theme(plot.title = element_text(hjust = 0.5))
p2 <- SpatialDimPlot(spatial_seurat, label = TRUE, label.size = 3.6, pt.size.factor = 3) + ggtitle("Seurat Clusters A1")+
  theme(plot.title = element_text(hjust = 0.5))
p1 + p2

# Plot chosen Genes on spatial image
SpatialFeaturePlot(spatial_seurat, 
                   features = c("Cd8a", "Cd81", "Cd14", "Lag3"), 
                   pt.size.factor = 2.8, 
                   max.cutoff = 2) # min.cutoff = 0, max.cutoff = 20

# A better colour scheme
color.panel = c("#73CA6F", 
#                             "#AF75CE", 
                             "#C5916F", 
#                             "#ABC0D4", 
                             "#ECEAC5", 
                             "#C49CA4", 
                             "#C66FA0", 
#                             "#706EC4", 
                             "#DDCDE3", 
#                             "#7395C3", 
#                             "#ACA1DC", 
                             "#E5C8BB", 
#                             "#C96C69", 
                             "#FFFF66", 
                             "#99ff99", 
                             "#C1AE76", 
                             "#C4E6E3", 
                             "#C3D37B", 
                             "#BCE9BD", 
                             "#7ABB9B")

# observe clusters clearer
SpatialDimPlot(spatial_seurat, label = TRUE, label.size = 3.6, pt.size.factor = 3, image.alpha = 0.6) + 
  ggtitle("Seurat Clusters overlayed on Spatial image") + 
  theme(plot.title = element_text(hjust = 0.5))

DEG’s

# findmarkers will find markers between spots in ident_1 and all other spots in the dataset (not in that cluster).
Idents(spatial_seurat) <- "seurat_clusters"

# DEG's between cluster 1 and 4
#spatial_seurat <- FindMarkers(object = spatial_seurat, ident.1 = 1, ident.2 = 4)

# This loop runs the FindMarkers function on all of the clusters.
lapply(
  levels(spatial_seurat[["seurat_clusters"]][[1]]),
  function(x)FindMarkers(spatial_seurat,ident.1 = x,min.pct = 0.25)
) -> cluster.markers.A1
#
sapply(0:(length(cluster.markers.A1)-1),function(x) {
  cluster.markers.A1[[x+1]]$gene <<- rownames(cluster.markers.A1[[x+1]])
  cluster.markers.A1[[x+1]]$cluster <<- x
})
## [1] 0 1 2 3 4 5 6
#
as_tibble(do.call(rbind,cluster.markers.A1)) %>% arrange(p_val_adj) -> cluster.markers.A1
cluster.markers.A1 # export
## # A tibble: 17,504 × 7
##       p_val avg_log2FC pct.1 pct.2 p_val_adj gene   cluster
##       <dbl>      <dbl> <dbl> <dbl>     <dbl> <chr>    <int>
##  1 8.16e-59      1.75      1 1      1.36e-54 Acta2        0
##  2 8.99e-59      2.43      1 1      1.50e-54 Tagln        0
##  3 4.04e-58      2.25      1 1      6.75e-54 Actg2        0
##  4 8.03e-58      1.75      1 1      1.34e-53 Tpm2         0
##  5 2.13e-57      1.93      1 1      3.57e-53 Cnn1         0
##  6 3.30e-57      1.73      1 1      5.52e-53 Myl9         0
##  7 3.78e-57      1.66      1 1      6.32e-53 Csrp1        0
##  8 4.14e-57      1.32      1 1      6.92e-53 Flna         0
##  9 4.69e-56      1.71      1 0.994  7.84e-52 Ckb          0
## 10 8.41e-55      0.713     1 1      1.41e-50 mt-Nd4       0
## # ℹ 17,494 more rows
#
# write.csv(x = cluster.markers.A1, file = "/PATH/spatial.cluster.markers.csv", row.names = F)

#
spatial_seurat <- FindSpatiallyVariableFeatures(spatial_seurat, 
                                             assay = "SCT", 
                                             features = VariableFeatures(spatial_seurat)[1:1000],
                                             selection.method = "moransi")
## Computing Moran's I
#
top.features <- head(SpatiallyVariableFeatures(spatial_seurat, selection.method = "moransi"), 6)
#
SpatialFeaturePlot(spatial_seurat, 
                   features = top.features, 
                   ncol = 3, 
                   alpha = c(0.1, 1), 
                   pt.size.factor = 2.5)

Interactive plotting with Seurat

#SpatialDimPlot(spatial_seurat, interactive = TRUE)

Integrate data

  • To integrate two or more datasets together, a integration algorithm is required such as Harmony, Scanorama etc.
  • This process will globally shift the whole transcriptional space in some direction.
  • Not integrating or correcting for batch effects will present clusters that are made up entirely of one sample, patient, sex etc. These clusters can usually be detected prior to integration.
  • Integration algorithms for single-cell RNA-seq and ST are used interchangeably.
  • Recent benchmark papers have been done that compare these algorithms.
  • Today, integrate using the RPCA Algorithm implemented through Seurat - fast and efficient.
  • https://satijalab.org/seurat/articles/integration_rpca
  • 2 samples.
Remove uninteresting sources of variation with integration.

Remove uninteresting sources of variation with integration.

#read in
A1_seu <- readRDS("/Users/cathal.king/Documents/Projects/DL/Visium_Mouse/Seurat/Visium_mouse_Re_seurat_A1.rds")
C1_seu <- readRDS("/Users/cathal.king/Documents/Projects/DL/Visium_Mouse/Seurat/Visium_mouse_Re_seurat_C1.rds")
# rename orig.ident
A1_seu@meta.data$orig.ident <- "A1"
C1_seu@meta.data$orig.ident <- "C1"

# merge
spat.list.merged <- merge(A1_seu, C1_seu)

# plot un-integrated first
# Run the standard workflow for visualization and clustering
spat.list.merged <- FindVariableFeatures(object = spat.list.merged, nfeatures = 2000)
spat.list.merged <- ScaleData(spat.list.merged, verbose = FALSE)
spat.list.merged <- RunPCA(spat.list.merged, npcs = 30, verbose = FALSE)
spat.list.merged <- RunUMAP(spat.list.merged, reduction = "pca", dims = 1:30)
spat.list.merged <- FindNeighbors(spat.list.merged, reduction = "pca", dims = 1:30)
spat.list.merged <- FindClusters(spat.list.merged, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 859
## Number of edges: 24686
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8464
## Number of communities: 8
## Elapsed time: 0 seconds
# plot
DimPlot(spat.list.merged, reduction = "umap", group.by = "orig.ident", label = T,
        pt.size = 1.5, label.size = 6)

# now this time integrate samples
spat.list.merged <- merge(A1_seu, C1_seu)

# split the dataset into a list of two seurat objects (stim and CTRL)
spat.list.merged <- SplitObject(spat.list.merged, split.by = "orig.ident")

# normalize and identify variable features for each dataset independently
spat.list.merged <- lapply(X = spat.list.merged, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = spat.list.merged)

#
immune.anchors <- FindIntegrationAnchors(object.list = spat.list.merged, anchor.features = features)

# this command creates an 'integrated' data assay
immune.combined <- IntegrateData(anchorset = immune.anchors)

# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
DefaultAssay(immune.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
immune.combined <- ScaleData(immune.combined, verbose = FALSE)
immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
immune.combined <- FindClusters(immune.combined, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 859
## Number of edges: 29313
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8443
## Number of communities: 7
## Elapsed time: 0 seconds
# Visualization
DimPlot(immune.combined, reduction = "umap", group.by = "orig.ident", pt.size = 1.5, label.size = 6)

DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)