Analysis

Microglia cd45 dataset analysis

Steady state only

Kin Simon

27-12-2020

Loading library

load("C:/Users/Simon/Documents/VW_lab/cd45_steady/image.RData")
library(dplyr)
library(Seurat)
library(scales)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
library(gplots)
library(sctransform)
library(scDblFinder)
library(SingleCellExperiment)
library(cluster)
library(factoextra)
library(intrinsicDimension)
library(xlsx)
library(tibble)

1) Loading data (4719 cells with 27692 captured genes)

#data.raw <- Read10X(data.dir = "cd45_steady/outs/filtered_feature_bc_matrix")

#data <- CreateSeuratObject(counts = data.raw, names.field = 2, names.delim = "-")

#rm(data.raw)

#Compute the % of MT and Ribosomal genes for each cell
data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^mt-")
data[["percent.ribo"]] <- PercentageFeatureSet(data, pattern = "^rp[sl][[:digit:]]")

2) QC plots before filtering

VlnPlot(data, features = c("nCount_RNA", "percent.mt", "percent.ribo" ),
        ncol = 3, pt.size = 0.001)

FeatureScatter(data, feature1 = "percent.mt", feature2 = "percent.ribo")

#Initially 4719 cells with 27692 captured genes
data

## An object of class Seurat 
## 27692 features across 4719 samples within 1 assay 
## Active assay: RNA (27692 features, 0 variable features)

The percentage of MT expressed genes is not correlated with the percentage of Ribosomal expressed genes, so filtering by MT will not especially affect the other

3) QC filtering

3.1 low RNA counts and %MT < 10 (around 54% of removed cells)

data1 <- subset(data, subset = nCount_RNA > 1500 & percent.mt < 10)
VlnPlot(data1, features = c("nCount_RNA", "percent.mt", "percent.ribo" ),
        ncol = 3, pt.size = 0.001)

data1

## An object of class Seurat 
## 27692 features across 2208 samples within 1 assay 
## Active assay: RNA (27692 features, 0 variable features)

3.2 Filtering doublets/multiplets with scDblFinder package (42 potential doublets found)

#Isolate Seurat raw count matrix
counts <- as.matrix(data1@assays$RNA@data)
#Convert into sce object
sce <- SingleCellExperiment(assays = list(counts = counts))
#Run scDblFinder package on it
sce <- scDblFinder(sce)

#49 DOUBLETS (2.2% of cells) FOUND!


#Transpose the matrix to work easily on it
counts_t <- t(counts)
#Then turn it into datafrme
counts_t <- as.data.frame(counts_t)

#Convert as matrix the scDblFinder results (singlet >< doublet)
matrix <- as.data.frame(colData(sce)$scDblFinder.class)

#Add a state variable for each cell (singlet >< doublet)
counts_t$state <- matrix$`colData(sce)$scDblFinder.class`
#Filter to only keep singlets
counts_t <- filter(counts_t, state == 'singlet')
#Remove the state column
counts_t$state <- NULL

#Create a new df of the count matrix for further analysis
counts_try <- t(counts_t)
counts_try <- as.data.frame(counts_try)

3.3 Re-Organize the data for some plots

#Create a df which sums the count for each gene and the number of cells who express it
genes <- tibble(
  gene = rownames(counts_try),
  count = Matrix::rowSums(counts_try),
  cells = Matrix::rowSums(counts_try != 0)
)

head(genes, n = 10)

###Try as best as possible to make a plot as in the scanpy tutorial, preproccessing part chunk 8

#Order genes by their count
genes_top <- genes[order(-genes$count),]
#Select the top 20
top20genes <- genes_top[1:20,]
#Compute the total count
total_count <- sum(genes$count) 

#Add a variable which computes the percentage of total count for the top 20 genes
top20genes$per_tot_count <- with(top20genes, count/total_count * 100)

#Necessary for the incoming plot
top20genes$gene <- factor(top20genes$gene, levels = top20genes$gene)

#Plot the result with wonderful colors
ggplot(data=top20genes, aes(x=per_tot_count, y=gene, fill = gene)) + geom_bar(stat="identity") + theme(legend.position = "none")

3.4 Filter genes which are expressed in less than 5 cells (14346 genes kept over 27692)

#List of kept genes
genes_to_keep <- genes %>% dplyr::filter(cells >= 5) %>% pull(gene)
#Length of the list
length(genes_to_keep)

## [1] 14346

#Remove filtered genes in the final matrix
liste <- as.vector(genes_to_keep)
counts <- counts[rownames(counts_try) %in% liste ,]

3.5 Restructure the count matrix and convert it into Seurat object for final QC filtered violin plots

#Restructure the matrix
counts <- as.data.frame(counts_try)

#Compute nCount_rna for incoming violin plots
nCount_rna <- as.numeric(colSums(counts_try))

#Convert the matrix into sce object then into Seurat
counts <- as.matrix(counts_try)
sce <- SingleCellExperiment(assays = list(counts = counts))
data_QC <- as.Seurat(sce, data = NULL)

#Replace the nCount_rna inh the same "directory" than non- previously converted Seurat objects
data_QC[["nCount_RNA"]] <- nCount_rna

#Compute the % of MT and Ribosomal genes for each cell
data_QC[["percent.mt"]] <- PercentageFeatureSet(data_QC, pattern = "^mt-")
data_QC[["percent.ribo"]] <- PercentageFeatureSet(data_QC, pattern = "^rp[sl][[:digit:]]")
VlnPlot(data_QC, features = c("nCount_RNA","percent.mt","percent.ribo"),
        ncol = 3) + theme(legend.position = "none")

data_QC

## An object of class Seurat 
## 27692 features across 2159 samples within 1 assay 
## Active assay: RNA (27692 features, 0 variable features)

4) Normalization using SCTransform package + plots

It returns the Pearson residuals from regularized negative binomial regression, where cellular sequencing depth is utilized as a covariate in a generalized linear model. It aims to remove the influence of technical characteristics from downstream analyses while preserving biological heterogeneity.

#SCT normalisation with top 1000 variable features
data_QC <- SCTransform(data_QC, variable.features.n = 1000, verbose = F)
#Variable feature plot
VariableFeaturePlot(data_QC, selection.method = "sct")

#isolate the residual variance from the Seurat object
residual_variance <- as.data.frame(data_QC@assays[["SCT"]]@meta.features[["sct.residual_variance"]])
colnames(residual_variance) <- "residual_variance"

#Plot the ECDF of the residual variance
ggplot(residual_variance, aes(residual_variance)) + stat_ecdf(geom = "point")

5) Dimensional reductions

I tested the dimensionality of the dataset using intrinsicDimension package. The result was 9 (I had manually set it to 11 because it was giving good clustering score instead of 15, 20, 25, 50,… for which there had important outliers in the clustering) and by selecting the first 9 PCs, it gave a silhouette score (see later) of 0.63, which is really good. We finally find 11 clusters at a resolution of 0.5 (actually the one who gives the best silouette score)

#Run PCA with variable features
data_QC <- RunPCA(data_QC, assay = "SCT",features = VariableFeatures(object = data_QC))

#Test the dimensionality
intrinsicDimension::maxLikGlobalDimEst(data_QC@reductions$pca@cell.embeddings,
                                       k = 10)

## Dimension estimate: 9.143848

#Elbow plot
ElbowPlot(data_QC, ndims = 50)

#Umap
data_QC <- RunUMAP(data_QC, dims = 1:9, verbose = F)
#FindNeighbors
data_QC <- FindNeighbors(data_QC, dims = 1:9, verbose = F)
#Findclusters
data_QC <- FindClusters(data_QC, resolution = 0.5, verbose = F)

#Final Umap
DimPlot(data_QC)

6) Clustering quality using silhouette plot

Some metrics such as silhouette coefficient can be used in order to evaluate the goodness of clustering algorithm results. It processes as follows:

Calculation of the average dissimilarity ai between each observation i and all points belonging to i
Calculation of the average dissimilarity d(i,C) for all clusters where i does not belong to.
Define the silhouette width (Si)

The silhouette width varies between -1 and 1, for which a negative score means that observations may probably be in the wrong cluster, while positive score means that observations are well clustered. (From my master thesis)

#Compute distance matrix to UMAP coordinates
distance_matrix <- dist(Embeddings(data_QC[['umap']])[, 1:2])

#Isolate cluster nameq
clusters <- data_QC@active.ident

#Compute silhouette score for each cluster
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
data_QC@meta.data$silhouette_score <- silhouette[,3]

#Plot
fviz_silhouette(silhouette)

##    cluster size ave.sil.width
## 1        1  547          0.63
## 2        2  316          0.69
## 3        3  283          0.77
## 4        4  237          0.50
## 5        5  207          0.89
## 6        6  154          0.65
## 7        7  122          0.65
## 8        8  106          0.63
## 9        9   78         -0.27
## 10      10   74          0.23
## 11      11   35          0.95