1 Loading Packages and Importing Samples

m10 = D1; m11 = D1sirt1

setwd("D:/BioinformaticsProjects/10x-cloud-analysis/NovogeneData1/01.RawData/FASTQs")
library(Seurat)
library(dplyr)
library(cowplot)
library(ggplot2)
library(rstatix)
library(SingleCellExperiment)

for (file in c("m10",
               "m11")){
  seurat_data <- Read10X(data.dir = paste0("D:/BioinformaticsProjects/10x-cloud-analysis/NovogeneData1/01.RawData/FASTQs/slim_counts/", file))
  seurat_obj <- CreateSeuratObject(counts = seurat_data, 
                                   min.features = 100, 
                                   project = file)
  assign(file, seurat_obj)
  
               }

m10 <- AddMetaData(object = m10, metadata = "m10", col.name = "sample.id")
m11 <- AddMetaData(object = m11, metadata = "m11", col.name = "sample.id")
m10 <- AddMetaData(object = m10, metadata = "WT", col.name = "group.id") 
m11 <- AddMetaData(object = m11, metadata = "KO", col.name = "group.id")

gallitano <- merge(m10, y = m11, add.cell.ids = c("m10","m11"), project = "two_samples")
gallitano[["percent.mt"]] <- PercentageFeatureSet(gallitano, pattern = "^mt-") 

rm(m10)
rm(m11)
rm(seurat_obj, seurat_data)

2 Data Analysis

VlnPlot(gallitano, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        group.by = "sample.id",
        ncol = 3) + ggtitle("Quality assessment before filtering")

library(patchwork)
plot1 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "percent.mt") 
line.data <- data.frame(yintercept = c(1500, 7500), Lines = c("lower", "upper"))
line2.data <- data.frame(xintercept = c(500), Lines = c("lower"))

plot2 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data) + geom_vline(aes(xintercept = xintercept, linetype = Lines), line2.data)

plot1 + plot2 +  plot_layout(guides = 'collect') + plot_annotation(title = "Quality assessment before filtering")

2.1 Joint filtering effects:

Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs.

metadata <- gallitano@meta.data
metadata %>% 
    ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) + 
    geom_point() + 
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() + 
    scale_y_log10() + 
    theme_classic() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = c(1500, 7500))

2.2 Selecting Data Subset

Filtering is based on the following criteria: nFeature_RNA < 7500; nFeature_RNA>1500; nCount_RNA > 500; percent.mt < 20 After subseting, only keep those genes expressed in more than 100 cells.

gallitano <- subset(gallitano, nFeature_RNA < 7500 & 
                      nFeature_RNA>1200 & nCount_RNA > 2500 & 
                      percent.mt < 15)

# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = gallitano, slot = "counts")
# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero <- counts > 0
# Sums all TRUE values and returns TRUE if more than 100 TRUE values per gene
keep_genes <- Matrix::rowSums(nonzero) >= 100
# Only keeping those genes expressed in more than 100 cells
filtered_counts <- counts[keep_genes, ]
# Reassign to filtered Seurat object
gallitano <- CreateSeuratObject(filtered_counts, meta.data = gallitano@meta.data)

rm(filtered_counts,nonzero, counts, keep_genes, file)

2.3 Reaccess quality metrics

dim(gallitano)

## [1] 12938  6380

# 13552 genes x 7529 cells (or UMIs)

VlnPlot(gallitano, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3) + ggtitle("Quality assessment after filtering")

plot1 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "percent.mt") 
line.data <- data.frame(yintercept = c(1500, 7500), Lines = c("lower", "upper"))
line2.data <- data.frame(xintercept = c(500), Lines = c("lower"))

plot2 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data) + geom_vline(aes(xintercept = xintercept, linetype = Lines), line2.data)

plot1 + plot2 +  plot_layout(guides = 'collect') + plot_annotation(title = "Quality assessment after filtering")

metadata <- gallitano@meta.data
metadata %>% 
    ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) + 
    geom_point() + 
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() + 
    scale_y_log10() + 
    theme_classic() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = c(1500, 7500))

2.4 Apply sctransform normalization

This steps can replace NormalizeData(), ScaleData(), and FindVariableFeatures(). The results of sctransform are stored in the “SCT” assay. It is assumed to reveals sharper biological distinctions compared to the standard Seurat workflow.

Since it is a normalization step, we have to do separately for two different samples (that’s why split seurat objects here), then only integrate the sample expression data in the next step, integration analysis, to remove batch effect.

options(future.globals.maxSize = 9000 * 1024^2) #100 gb

# Split seurat object by condition to perform cell cycle scoring and SCT on all samples
split_seurat <- SplitObject(gallitano, split.by = "sample.id")

split_seurat <- split_seurat[c("m10", "m11")]

for (i in 1:length(split_seurat)) {
  split_seurat[[i]] <- NormalizeData(split_seurat[[i]], verbose = TRUE)
  split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("percent.mt"))
}

rm(gallitano)

2.5 Integration

# Select the most variable features to use for integration
integ_features <- SelectIntegrationFeatures(object.list = split_seurat, 
                                            nfeatures = 3000) 
# Prepare the SCT list object for integration
split_seurat <- PrepSCTIntegration(object.list = split_seurat, 
                                   anchor.features = integ_features)
# Find best buddies - can take a while to run
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat, 
                                        normalization.method = "SCT", 
                                        anchor.features = integ_features)
# Integrate across conditions
seurat_integrated <- IntegrateData(anchorset = integ_anchors, 
                                   normalization.method = "SCT")

rm(integ_anchors)
rm(integ_features)
rm(split_seurat)

2.6 Clustering

# Run the standard workflow for visualization and clustering
seurat_integrated <- RunPCA(object = seurat_integrated, npcs = 40, verbose = F)
ElbowPlot(seurat_integrated, ndims = 40)

PCAPlot(seurat_integrated)

seurat_integrated <- RunUMAP(seurat_integrated, 
                             dims = 1:40,
                             reduction = "pca")
seurat_integrated <- RunTSNE(seurat_integrated, 
                             reduction = "pca")
# Determine the K-nearest neighbor graph
seurat_integrated <- FindNeighbors(object = seurat_integrated, 
                                   dims = 1:40)

# Determine the clusters for various resolutions                                
seurat_integrated <- FindClusters(object = seurat_integrated,
                                  resolution = c(0.05, 0.5))

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6380
## Number of edges: 224038
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9778
## Number of communities: 9
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6380
## Number of edges: 224038
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9257
## Number of communities: 20
## Elapsed time: 0 seconds

saveRDS(seurat_integrated, "Seurat_integrated_twobrainsample.RDS")

2.7 Clustering quality control

This step gives us some idea about how is the distribution of the number of genes, number of UMIs, and percentage of mitochondrial genes in each cluster. Normally, we expect to see similar distribution of no. of genes (nFeature_RNA) and no. of UMIs (nCount_RNA).

As for the percent.mt (percentage of mitochondrial genes per cell), it can be a reference to check if those high intensity clusters might be having poor quality cells (if so, we can try to remove in the next step or adjust the metrics in the previous filtering step) or it might be due to the differences biologically.

# Determine metrics to plot present in seurat_integrated@meta.data
metrics <-  c("nFeature_RNA", "nCount_RNA", "percent.mt")

FeaturePlot(seurat_integrated, 
            reduction = "umap", 
            features = metrics,
            split.by = "sample.id",
            pt.size = 0.4,
            min.cutoff = 'q10',
            label = TRUE)

3 Find all markers in two samples for cell type identification

Idents(seurat_integrated)<- "seurat_clusters"
gallitano.markers <-  FindAllMarkers(seurat_integrated,
                                    only.pos=TRUE, min.pct=0.25, logfc.threshold=0.25)
write.csv(gallitano.markers, "plots/D1_D1sirt1_markers_20220226.csv")

3.1 Identifying cell type

3.1.1 Option 1: SingleR package with built-in reference

I use a collection of mouse bulk RNA-seq data sets obtained from celldex package (Benayoun et al. 2019). A variety of cell types are available, mostly from blood but also covering several other tissues. This identifies marker genes from the reference and uses them to compute assignment scores (based on the Spearman correlation across markers) for each cell in the test dataset against each label in the reference. The label with the highest score is the assigned to the test cell, possibly with further fine-tuning to resolve closely related labels.

This reference consists of a collection of mouse bulk RNA-seq data sets downloaded from the gene expression omnibus (Benayoun et al. 2019). A variety of cell types are available, again mostly from blood but also covering several other tissues.

#BiocManager::install("SingleR")
#BiocManager::install("celldex")
library(SingleR)
library(celldex)

ref <- MouseRNAseqData()
table(Idents(object = seurat_integrated))

## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 893 881 658 642 410 391 334 319 240 231 210 195 194 173 150 140  99  90  73  57

hpca.se <- SingleR(test = seurat_integrated@assays$RNA@data, ref = ref,
                    labels = ref$label.main)

p1 <- plotScoreHeatmap(hpca.se)

plotDeltaDistribution(hpca.se, ncol = 3)

all.markers <- metadata(hpca.se)$de.genes
seurat_integrated$labels_hpca <- hpca.se$labels
# Extract identity and sample information from seurat object to determine the number of cells per cluster per sample
n_cells <- FetchData(seurat_integrated, 
                     vars = c("integrated_snn_res.0.5",
                              "labels_hpca")) %>%
  dplyr::count(integrated_snn_res.0.5,
               labels_hpca) %>%
  tidyr::spread(integrated_snn_res.0.5, n)

View(n_cells)
write.csv(n_cells, "plots/n_cells_mouseRNAseqData_onesample.csv")

## comparing with the unsupervised clustering
tab <- table(cluster= seurat_integrated$integrated_snn_res.0.5, label= hpca.se$labels) 

p3 <- pheatmap::pheatmap(log10(tab+10))
p3

3.1.2 Option 2: manual annotation

Refer to the seurat_integrated original script

Plotting Astrocyte Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Aqp4","Prdx6","Pla2g7"))

Plotting Microglia Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("C1qc","Ly86", "Tmem119", "Trem2"))

Plotting Endothelial Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Ly6a",  "Ly6c1",  "Pltp"))

Plotting Oligodendrocyte Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Mag",  "Mbp",  "Cldn11"))

Plotting Glutamatergic Neuron Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Nrgn", "Sv2b",  "Snap25"))

Plotting Gabaergic Neuron Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Gad1",  "Gad2",  "Rab3b" ))

Plotting Oligodendrocyte Precursor Markers

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Pdgfra",  "Cspg4",  "Gm2a"))

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Drd1",  "Ppp1r1b"))

D2 neuron marker

VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Drd2",  "Ppp1r1b"))

Sirt1 gene in D1 cluster

VlnPlot(object = seurat_integrated, assay = "RNA", features ="Sirt1", idents = c("0","4","6"), split.by = "group.id") #group.by = "group.id"

Sirt1 gene in D2 cluster

VlnPlot(object = seurat_integrated, assay = "RNA", features ="Sirt1", idents = c("2","7","14"), split.by = "group.id") #group.by = "group.id"

Sirt2 gene in D1 cluster

VlnPlot(object = seurat_integrated, assay = "RNA", features ="Sirt2", idents = c("0","4","6"), split.by = "group.id") #group.by = "group.id"

Continue with this analysis here next time. Overall very good data, need to further filter mito cutoff, using 15%, nFeature with 1200 genes.

readRDS(“Seurat_integrated_twobrainsample.RDS”)

3.2 Rename cluster based on the SingleR results and manual annotation

seurat_integrated <- RenameIdents(object = seurat_integrated, 
                         "0" = "Neurons(D1)",
                         "1" = "Oligodendrocytes",
                         "2" = "Neurons(D2)",
                         "3" = "Astrocytes",
                         "4" = "Neurons(D1)",
                         "5" = "Unidentified",
                         "6" = "Neurons(D1)",
                         "7" = "Neurons(D2)",
                         "8" = "Microglia",
                         "9" = "Neurons(Gabaergic)",
                         "10" = "Neurons(D1)",
                         "11" = "Endothelial cells",
                         "12" = "ODC_precursor",
                         "13" = "Neurons(D2)",
                         "14" = "Unidentified",
                         "15" = "Neurons(D1)",
                         "16" = "Unidentified",
                         "17" = "Neurons(D1)",
                         "18" = "Unidentified",
                         "19" = "Neurons(Cholinergic)",
                         "20"= "Neurons(Cholinergic)",
                         "21"= "Unidentified",
                         "22"= "ODC_precursor")
seurat_integrated$celltype_LC <- seurat_integrated@active.ident

DimPlot(seurat_integrated,
                reduction = "umap",
                group.by = "integrated_snn_res.0.5",
                label = TRUE,
)

DimPlot(seurat_integrated,
                reduction = "umap",
                group.by = "integrated_snn_res.0.5",
                split.by = "sample.id",
                label = TRUE,
)

Idents(object = seurat_integrated) <- "celltype_LC"
plot <- DimPlot(seurat_integrated,
                reduction = "umap",
                group.by = "celltype_LC",
                label = FALSE,
)
plot$data$seurat_clusters <- seurat_integrated@meta.data$integrated_snn_res.0.5
all(rownames(plot$data) == rownames(seurat_integrated@meta.data))

## [1] TRUE

LabelClusters(plot, id = "celltype_LC") & NoLegend()

Idents(object = seurat_integrated) <- "celltype_LC"
plot <- DimPlot(seurat_integrated,
                reduction = "umap",
                group.by = "celltype_LC",
                split.by = "sample.id",
                label = FALSE,
)
plot$data$seurat_clusters <- seurat_integrated@meta.data$integrated_snn_res.0.5
all(rownames(plot$data) == rownames(seurat_integrated@meta.data))

## [1] TRUE

LabelClusters(plot, id = "seurat_clusters")

3.2.1 Option 3: manual annotation

Refer to Tasic et al, Nature 2018, marker list https://github.com/AllenInstitute/tasic2018analysis/blob/master/RNA-seq%20Analysis/markers.R

Inh.markers <- list(c("Gad2","Slc32a1","Prox1","Adarb2","Nfix","Nfib","Cacna2d1",
                  "Cxcl14","Tnfaip8l3","Cplx3","Lamp5","Cd34","Pax6","Krt73",
                  "Scrg1","Egln3","Ndnf","Tmem182","Ntn1","Pde11a","Pdlim5",
                  "Lsp1","Slc35d3","Nkx2-1","Serpinf1","Col14a1","Vip","Sncg",
                  "Crabp1","Slc10a4","Cldn10","Bhlhe22","Crispld2","Slc17a8",
                  "Cyb5r2","Nr1h4","Wnt7b","Prss12","Igfbp6","Calb2","Grpr",
                  "Pthlh","Elfn1","Rspo1","Slc18a3","Lmo1","Rspo4","Sostdc1",
                  "Chat","Cbln4","Gsx2","Gpc3","Mab21l1","C1ql1","Itih5","Mybpc1",
                  "Myl1","Lhx6","Sox6","Sst","Chodl","Calb1","Cbln4","Etv1","Edn1",
                  "Kl","Il1rapl2","Myh8","Ptprk","Chrna2","Myh13","Ptgdr","Crhr2",
                  "Hpse","Igsf9","C1ql3","Tacstd2","Th","Col6a1","Nts","Tac1","Pvalb",
                  "Gabrg1","Il7","Bche","Prdm8","Syt2","Ostn","Pdlim3","C1ql1",
                  "Gpr149","Vipr2","Meis2","Adamts19","Cpa6","Lgr6"))
Inh.comb.markers <- c("Reln","Cnr1","Nr2f2","Cck","Npy","Crh","Tac2")

Ex.markers <- list(unique(c("Slc17a7","Rtn4rl2","Slc30a3","Cux2","Stard8","Otof","Rrad",
                            "Penk","Agmat",
                      "Emx2","S100a3","Macc1","Rorb","Scnn1a","Whrn","Endou","Col26a1",
                      "Rspo1","Fezf2","Hsd11b1","Batf3","Arhgap25","Colq","Pld5","Olfr78",
                      "Tcap","Fgf17","Wfdc18","Wfdc17","Aldh1a7","Tgfb1","Ctsc","Rxfp2",
                      "Prss35","Rgs12","Osr1","Oprk1","Cd52","Col23a1","Col18a1","Car1",
                      "Car3","Fam84b","Chrna6","Chrnb3","Fn1","Tac1","Lce3c","Erg",
                      "Cdc42ep5","Bmp5","Pvalb","Depdc7","Stac","C1ql2","Ptgfr","Slco2a1",
                      "Pappa2","Dppa1","Npsr1","Htr2c","Hpgd","Nxph3","Sla2","Tshz2",
                      "Rapgef3","Slc17a8","Trh","Nxph2","Foxp2","Col12a1","Syt6","Col5a1",
                      "Gpr139","Ly6d","Sla","Cpa6","Ppp1r18","Faim3","Ctxn3","Nxph4",
                      "Cplx3","Ctgf","Col8a1","Mup5","Ngf","Fam150a","F2r","Serpinb11","Fbxl7",
                      "P2ry12","Crh","Kynu","Hsd17b2","Mup3","Tlcd1","Lhx5","Trp73","Cpa6",
                      "Gkn1","Col18a1","Lce3c","Erg","Bmp5","Stac","C1ql2","Slco2a1","Lrrc9",
                      "Trhr","Myzap","Krt80","H60b","Fam150a","Clic5","Kcnj5","Olfr110",
                      "Olfr111")))
Ex.comb.markers <- c("Reln","Cdh13","Cpne7","Alcam","Rprm","Marcksl1")
                            

Global.markers <- list(c("Fez1","Phyhipl","Aplp1","Gnao1","Caly","Snap25","Atp1a3","Camk2b",
                    "Syt1","Gabrg2","Fabp3","Stmn2","Kif5c","Slc32a1","Gad2","Dlx1","Dlx5",
                    "Dlx2","Dlx6os1","Slc6a1","Sox2","Slc17a7","Nrn1","Neurod2","Sv2b","Satb2",
                    "Tbr1","Vsig2","Cmtm5","Kcnj10","S100a16","S100a13","S1pr1","Gja1","Gjb6",
                    "Aqp4","Lcat","Acsbg1","Olig1","Sox10","Neu4","Sapcd2","Gpr17","Plp1",
                    "Cldn11","Mag","Mog","Nkx6-2","Enpp6","9630013A20Rik","Brca1",
                    "Mog","Opalin","Gjb1","Hapln2","Cyba","Ctsh","Ifitm3","Sparc",
                    "S100a11","Dcn","Col1a1","Pltp","Vtn","Slc6a13","Spp1","Slc13a3",
                    "Col15a1","Slc47a1","Tgtp2","Ifi47","Esam","Slco1a4","Slc38a5",
                    "Cldn5","H2-Q7","Slc38a11","Art3","Ace2","Acta2","Myh11","Pln",
                    "Gja5","Kcnj8","Atp13a5","Aoc3","Ctss","C1qb","C1qc","C1qa","Cbr2",
                    "F13a1","Pf4","Mrc1","Siglech","Selplg"))

seurat_integrated <- AddModuleScore(object = seurat_integrated, 
                           features = Ex.markers, 
                           name = "Excitatory_marker_score")

FeaturePlot(object = seurat_integrated, features = "Excitatory_marker_score1", label = T) + 
  ggtitle("Excitatory marker score (refer to Tasic et al., 2018, Nature)")

seurat_integrated <- AddModuleScore(object = seurat_integrated, 
                           features = Inh.markers, 
                           name = "Inhibitory_marker_score")

FeaturePlot(object = seurat_integrated, features = "Inhibitory_marker_score1", label = T) + 
  ggtitle("Inhibitory marker score (refer to Tasic et al., 2018, Nature)")

seurat_integrated <- AddModuleScore(object = seurat_integrated, 
                           features = Global.markers, 
                           name = "Global_marker_score")

FeaturePlot(object = seurat_integrated, features = "Global_marker_score1", label = T) + 
  ggtitle("Global marker score (refer to Tasic et al., 2018, Nature)")

saveRDS(seurat_integrated, "plots/seurat_integrated_twobrainsamples.RDS")

D1 & D1sirt1 Two Brain Sample

Foong Lian Chee

2/26/2022