Jason Newbern two libraries. P25 frontal cortex tissue m10 = D1; m11 = D1sirt1
library(Seurat)
library(dplyr)
library(cowplot)
library(ggplot2)
library(rstatix)
library(SingleCellExperiment)
library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)
library(patchwork)
m10 <- AddMetaData(object = m10, metadata = "m10", col.name = "sample.id")
m11 <- AddMetaData(object = m11, metadata = "m11", col.name = "sample.id")
m10 <- AddMetaData(object = m10, metadata = "CTL", col.name = "group.id")
m11 <- AddMetaData(object = m11, metadata = "MUT", col.name = "group.id")
gallitano <- merge(m10, y = m11, add.cell.ids = c("m10","m11"), project = "two_samples")
gallitano[["percent.mt"]] <- PercentageFeatureSet(gallitano, pattern = "^mt-")
# 32885 genes,32391 cells
rm(m10)
rm(m11)
rm(seurat_obj, seurat_data)
head(gallitano)
VlnPlot(gallitano,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
group.by = "sample.id",
ncol = 3) + ggtitle("Quality assessment before filtering")
plot1 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "percent.mt")
line.data <- data.frame(yintercept = c(1500, 7500), Lines = c("lower", "upper"))
line2.data <- data.frame(xintercept = c(500), Lines = c("lower"))
plot2 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data) + geom_vline(aes(xintercept = xintercept, linetype = Lines), line2.data)
plot1 + plot2 + plot_layout(guides = 'collect') + plot_annotation(title = "Quality assessment before filtering")
Joint filtering effects: visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs
I follow the original script to filter based on these criteria: nFeature_RNA < 7500& nFeature_RNA>1500 & nCount_RNA > 500
and then, I added the filtering criteria of “percent.mt < 20”
After subseting, I also remove the genes with zero count in more than 100 cells.
dim(gallitano)
## [1] 16504 13551
VlnPlot(gallitano,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3) + ggtitle("Quality assessment after filtering")
plot1 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "percent.mt")
line.data <- data.frame(yintercept = c(1500, 7500), Lines = c("lower", "upper"))
line2.data <- data.frame(xintercept = c(500), Lines = c("lower"))
plot2 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data) + geom_vline(aes(xintercept = xintercept, linetype = Lines), line2.data)
plot1 + plot2 + plot_layout(guides = 'collect') + plot_annotation(title = "Quality assessment after filtering")
Apply sctransform normalization: These steps can replace
NormalizeData(), ScaleData(), and
FindVariableFeatures(). The results of sctransform are
stored in the “SCT” assay. It is assumed to reveals sharper biological
distinctions compared to the standard Seurat workflow.
Since it is a normalization step, we have to do
separately for two different samples (that’s why split seurat objects
here), then only integrate the sample expression data in the next step,
integration analysis, to remove batch effect.
# Run the standard workflow for visualization and clustering
seurat_integrated <- RunPCA(object = seurat_integrated, npcs = 40, verbose = F)
ElbowPlot(seurat_integrated, ndims = 40)
PCAPlot(seurat_integrated)
seurat_integrated <- RunUMAP(seurat_integrated,
dims = 1:40,
reduction = "pca")
seurat_integrated <- RunTSNE(seurat_integrated,
reduction = "pca")
# Determine the K-nearest neighbor graph
seurat_integrated <- FindNeighbors(object = seurat_integrated,
dims = 1:40)
# Determine the clusters for various resolutions
seurat_integrated <- FindClusters(object = seurat_integrated,
resolution = c(0.05, 0.5))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 13551
## Number of edges: 491943
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9693
## Number of communities: 9
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 13551
## Number of edges: 491943
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9076
## Number of communities: 24
## Elapsed time: 1 seconds
This step gives us some idea about how is the distribution of the number of genes, number of UMIs, and percentage of mitochondrial genes in each cluster. Normally, we expect to see similar distribution of no. of genes (nFeature_RNA) and no. of UMIs (nCount_RNA).
As for the percent.mt (percentage of mitochondrial genes per cell), it can be a reference to check if those high intensity clusters might be having poor quality cells (if so, we can try to remove in the next step or adjust the metrics in the previous filtering step) or it might be due to the differences biologically.
DefaultAssay(seurat_integrated) <- "integrated"
metrics <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
FeaturePlot(seurat_integrated,
reduction = "umap",
features = metrics,
split.by = "sample.id",
pt.size = 0.4,
min.cutoff = 'q10',
label = TRUE)
Find all markers in two samples for cell type identification
select_genes = c('Trem2','Snap25','Epas1', 'Egr1', 'Npas4', 'Calcrl', 'Map2k1', 'Mapk1', 'Mapk3')
select_genes2 <- c('Snap25','Gad1','Gad2', 'Slc32a1', 'Slc17a7', 'Lamp5', 'Aqp4', 'Gfap', "Ndnf", 'Sncg', 'Vip', 'Trem2','Sst','Pvalb', 'Cux2', 'Rorb', 'Fezf2', 'Foxp2', 'Npas4', 'Mbp', 'Cldn5', 'Ctss', 'C1qa','Map2k1', 'Mapk1', 'Mapk3')
Idents(seurat_integrated) <- "seurat_clusters"
cellRanks <- seurat_integrated@meta.data[order(seurat_integrated@meta.data$seurat_clusters),]
PartialMatrix <- seurat_integrated@assays$integrated@scale.data[which(rownames(seurat_integrated@assays$integrated@scale.data) %in% select_genes2), rownames(cellRanks)]
cellRanks$col <- rainbow(max(as.numeric(cellRanks$seurat_clusters))+1)[as.numeric(cellRanks$seurat_clusters)+1]
ha_column <- HeatmapAnnotation(
df = data.frame(
ClusterID = as.numeric(cellRanks$seurat_clusters)
),
col = list(
ClusterID = colorRamp2(unique(as.numeric(cellRanks$seurat_clusters)),
rainbow(max(as.numeric(cellRanks$seurat_clusters))))
)
)
ht1 = Heatmap(PartialMatrix, name = "Scaled\nUMI", column_title = "Allen SMARTseq dataset genes",
top_annotation = ha_column, show_column_names=FALSE, cluster_columns = FALSE,
row_names_gp = gpar(fontsize = 16))
ht1
DoHeatmap(seurat_integrated, features = select_genes2, group.by = 'seurat_clusters' ) + NoLegend()
mapal <- colorRampPalette(RColorBrewer::brewer.pal(11,"RdBu"))(256)
DoHeatmap(seurat_integrated, features = select_genes2, angle = 90, size = 3) + scale_fill_gradientn(colours = rev(mapal))
DoHeatmap(subset(seurat_integrated, downsample = 200), features = select_genes2, angle = 90, size = 3) + scale_fill_gradientn(colours = rev(mapal))
plot <- VlnPlot(seurat_integrated, assay = "RNA", features = c("Map2k1","Trem2"), combine = FALSE, fill.by = c("feature","ident"))
plot <- lapply(
X = plot,
FUN = function(p) p + aes(color= seurat_integrated$integrated_snn_res.0.05)
)
CombinePlots(plots = plot, legend = 'none')
Idents(seurat_integrated)<- "seurat_clusters"
cluster1.markers <- FindMarkers(seurat_integrated, ident.1 = 1, min.pct = 0.25, only.pos = TRUE)
head(cluster1.markers, n = 5)
v1 <- VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster1.markers)[1:15], stack = TRUE, flip = TRUE)
v1+stat_summary(fun=median, geom="point", size=1, color="black")
v1 + scale_fill_manual(values = getPalette(15))+stat_summary(fun=median, geom="point", size=0.5, color="black")
VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster1.markers)[1:5], stack = TRUE, flip = TRUE, split.by = "group.id", split.plot = TRUE)
cluster2.markers <- FindMarkers(seurat_integrated, ident.1 = 2, ident.2 = c(3,4), min.pct = 0.25, only.pos = TRUE)
v1 <- VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster2.markers)[1:15], stack = TRUE, flip = TRUE)
rm(v1)
Astrocyte markers
VlnPlot(object = seurat_integrated, assay = "RNA", features=c("Aqp4","Prdx6","Pla2g7"))
Microglia Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("C1qc","Ly86", "Tmem119"))
Endothelial Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Ly6a", "Ly6c1", "Pltp"))
Oligodendrocyte Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Mag", "Mbp", "Cldn11"))
Glutamatergic Neuron Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Nrgn", "Sv2b", "Snap25"))
GABAergic Neuron Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Gad1", "Gad2", "Rab3b" ))
Oligodendrocyte Precursor Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Pdgfra", "Cspg4", "Gm2a"))
I use a collection of mouse bulk RNA-seq data sets obtained from celldex package (Benayoun et al. 2019). A variety of cell types are available, mostly from blood but also covering several other tissues. This identifies marker genes from the reference and uses them to compute assignment scores (based on the Spearman correlation across markers) for each cell in the test dataset against each label in the reference. The label with the highest score is the assigned to the test cell, possibly with further fine-tuning to resolve closely related labels.
This reference consists of a collection of mouse bulk RNA-seq data sets downloaded from the gene expression omnibus (Benayoun et al. 2019). A variety of cell types are available, again mostly from blood but also covering several other tissues.
library(SingleR)
library(celldex)
ref <- MouseRNAseqData()
table(Idents(object = seurat_integrated))
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2437 1651 969 922 754 744 657 574 555 542 512 390 378 373 338 335
## 16 17 18 19 20 21 22 23
## 281 274 230 158 145 134 103 95
hpca.se <- SingleR(test = seurat_integrated@assays$RNA@data, ref = ref,
labels = ref$label.main)
p1 <- plotScoreHeatmap(hpca.se)
plotDeltaDistribution(hpca.se, ncol = 3)
tab <- table(cluster= seurat_integrated$integrated_snn_res.0.5, label= hpca.se$labels)
p3 <- pheatmap::pheatmap(log10(tab+10))
p3
Astrocyte Markers
VlnPlot(object = seurat_integrated, features =c("Aqp4","Prdx6","Pla2g7"))
Microglia Markers
VlnPlot(object = seurat_integrated, features =c("C1qc","Ly86", "Tmem119", "Trem2"))
Endothelial Markers
VlnPlot(object = seurat_integrated, features =c("Ly6a", "Ly6c1", "Pltp"))
Oligodendrocyte Markers
VlnPlot(object = seurat_integrated, features =c("Mag", "Mbp", "Cldn11"))
Glutamatergic Neuron Markers
VlnPlot(object = seurat_integrated, features =c("Nrgn", "Sv2b", "Snap25"))
GABAergic Neuron Markers
VlnPlot(object = seurat_integrated, features =c("Gad1", "Gad2", "Rab3b" ))
Oligodendrocyte Precursor Markers
VlnPlot(object = seurat_integrated, features =c("Pdgfra", "Cspg4", "Gm2a"))
FeaturePlot(seurat_integrated,
reduction = "umap",
features = c("Tnnc1","Nrgn"),
split.by = "sample.id",
pt.size = 0.4,
min.cutoff = 'q10',
label = TRUE)
DimPlot(seurat_integrated,
reduction = "umap",
group.by = "seurat_clusters",
label = TRUE)
seurat_integrated$celltype_LC <- seurat_integrated@active.ident
DimPlot(seurat_integrated,
reduction = "umap",
group.by = "integrated_snn_res.0.5",
split.by = "sample.id",
label = TRUE)
Idents(object = seurat_integrated) <- "celltype_LC"
plot <- DimPlot(seurat_integrated,
reduction = "umap",
group.by = "celltype_LC",
label = FALSE)
plot$data$seurat_clusters <- seurat_integrated@meta.data$integrated_snn_res.0.5
all(rownames(plot$data) == rownames(seurat_integrated@meta.data))
## [1] TRUE
LabelClusters(plot, id = "celltype_LC")
Idents(object = seurat_integrated) <- "celltype_LC"
plot <- DimPlot(seurat_integrated,
reduction = "umap",
group.by = "celltype_LC",
split.by = "sample.id",
label = FALSE)
plot$data$seurat_clusters <- seurat_integrated@meta.data$integrated_snn_res.0.5
all(rownames(plot$data) == rownames(seurat_integrated@meta.data))
## [1] TRUE
LabelClusters(plot, id = "seurat_clusters")
Refer to Tasic et al, Nature 2018, marker list https://github.com/AllenInstitute/tasic2018analysis/blob/master/RNA-seq%20Analysis/markers.R
Now we do a DEG analysis for all genes between the two groups. And then we will do DEG for each cluster between the two groups.
ab_markers<- subset(full_results, subset = comparison == "CTL_MUT")
ab_markers<- subset(ab_markers, subset = p_val_adj < 0.005)
ab_markers<- ab_markers[order(-abs(ab_markers$avg_log2FC)),]
num_markers<- as.data.frame(table(ab_markers$Cluster))
num_markers<- num_markers[order(num_markers$Freq),]
p<-ggplot(data=num_markers, aes(x=Var1, y=Freq, fill=Var1)) +
geom_bar(stat="identity")+
theme_minimal()+
coord_flip()+
labs(title="Number of Differentially Expressed Genes by Cluster in AB", y="Number of Differentially Expressed Genes", x="Cell Type")+
theme(legend.position="none")
p
features<- unique(ab_markers$genes[1:11])
Idents(seurat_integrated) <- "celltype_LC"
DotPlot(seurat_integrated, features = select_genes, split.by = 'group.id') + RotatedAxis()
v3 <- VlnPlot(seurat_integrated, assay = "RNA", features = "Map2k1") +NoLegend()
v3$layers[[2]]$aes_params$alpha <- 0.1
v3_2 <- VlnPlot(seurat_integrated, assay = "RNA", idents = c("Glut_N", "GABA_N", "Astro", "Micro", "Oligo"), features = "Mapk1", split.by = "group.id") #split.plot = TRUE
v3_2$layers[[2]]$aes_params$alpha <- 0.2
v1 <- VlnPlot(seurat_integrated, assay = "RNA", idents = c("Glut_N", "GABA_N", "Astro", "Micro", "Oligo"), features = "Map2k1", split.by = "group.id")
v2 <- VlnPlot(seurat_integrated, assay = "RNA", idents = c("Glut_N", "GABA_N", "Astro", "Micro", "Oligo"), features = "Mapk1", split.by = "group.id")
v3 <- VlnPlot(seurat_integrated, assay = "RNA", idents = c("Glut_N", "GABA_N", "Astro", "Micro", "Oligo"), features = "Mapk3", split.by = "group.id")
v4 <- VlnPlot(seurat_integrated, assay = "RNA", idents = "Glut_N", features = c("Map2k1", "Mapk1", "Mapk3"), split.by = "group.id")
DefaultAssay(seurat_integrated) <- "RNA"
FeaturePlot(seurat_integrated,
features = "Map2k1",
pt.size = 1,
min.cutoff = "q10",
label = TRUE)
DotPlot(seurat_integrated, features = select_genes2, split.by = "group.id") + RotatedAxis()