Analysis

Mpeg dataset analysis

Steady state only

Kin Simon

27-12-2020

load("C:/Users/Simon/Documents/VW_lab/mpeg_steady/image.RData")
library(dplyr)
library(Seurat)
library(scales)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
library(gplots)
library(sctransform)
library(scDblFinder)
library(SingleCellExperiment)
library(cluster)
library(factoextra)
library(intrinsicDimension)
library(xlsx)
library(tibble)

data <- readRDS("steady_mpeg_Seurat.rds")
#Compute the % of MT and Ribosomal genes for each cell
data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^mt-")
data[["percent.ribo"]] <- PercentageFeatureSet(data, pattern = "^rp[sl][[:digit:]]")

2) QC plots before filtering

VlnPlot(data, features = c("nCount_RNA", "percent.mt", "percent.ribo" ),
        ncol = 3, pt.size = 0.001)

FeatureScatter(data, feature1 = "percent.mt", feature2 = "percent.ribo")

#Initially 1831 cells with 25108 captured genes
data

## An object of class Seurat 
## 25108 features across 1831 samples within 1 assay 
## Active assay: RNA (25108 features, 0 variable features)

The percentage of MT expressed genes is not fully correlated (-0.48) with the percentage of Ribosomal expressed genes, so filtering by MT will not especially affect the other

3) QC filtering

3.1 low RNA counts and %MT < 10 (around 41% of removed cells)

data1 <- subset(data, subset = nCount_RNA > 1500 & percent.mt < 10)
VlnPlot(data1, features = c("nCount_RNA", "percent.mt", "percent.ribo" ),
        ncol = 3, pt.size = 0.001)

data1

## An object of class Seurat 
## 25108 features across 1080 samples within 1 assay 
## Active assay: RNA (25108 features, 0 variable features)

3.2 Filtering doublets/multiplets with scDblFinder package (16 potential doublets found)

#Isolate Seurat raw count matrix
counts <- as.matrix(data1@assays$RNA@data)
#Convert into sce object
sce <- SingleCellExperiment(assays = list(counts = counts))
#Run scDblFinder package on it
sce <- scDblFinder(sce)

#14 DOUBLETS (1.3% of cells) FOUND!


#Transpose the matrix to work easily on it
counts_t <- t(counts)
#Then turn it into datafrme
counts_t <- as.data.frame(counts_t)

#Convert as matrix the scDblFinder results (singlet >< doublet)
matrix <- as.data.frame(colData(sce)$scDblFinder.class)

#Add a state variable for each cell (singlet >< doublet)
counts_t$state <- matrix$`colData(sce)$scDblFinder.class`
#Filter to only keep singlets
counts_t <- filter(counts_t, state == 'singlet')
#Remove the state column
counts_t$state <- NULL

#Create a new df of the count matrix for further analysis
counts_try <- t(counts_t)
counts_try <- as.data.frame(counts_try)

3.3 Re-Organize the data for some plots

#Create a df which sums the count for each gene and the number of cells who express it
genes <- tibble(
  gene = rownames(counts_try),
  count = Matrix::rowSums(counts_try),
  cells = Matrix::rowSums(counts_try != 0)
)

head(genes, n = 10)

## # A tibble: 10 x 3
##    gene            count cells
##    <chr>           <dbl> <dbl>
##  1 ptpn12              6     5
##  2 phtf2               0     0
##  3 phtf2.1             3     3
##  4 CU856344.1          6     6
##  5 si:zfos-932h1.3     8     8
##  6 mansc1              0     0
##  7 lrp6                1     1
##  8 dusp16             71    63
##  9 crebl2             66    64
## 10 gpr19               0     0

###Try as best as possible to make a plot as in the scanpy tutorial, preproccessing part chunk 8

#Order genes by their count
genes_top <- genes[order(-genes$count),]
#Select the top 20
top20genes <- genes_top[1:20,]
#Compute the total count
total_count <- sum(genes$count) 

#Add a variable which computes the percentage of total count for the top 20 genes
top20genes$per_tot_count <- with(top20genes, count/total_count * 100)

#Necessary for the incoming plot
top20genes$gene <- factor(top20genes$gene, levels = top20genes$gene)

#Plot the result with wonderful colors
ggplot(data=top20genes, aes(x=per_tot_count, y=gene, fill = gene)) + geom_bar(stat="identity") + theme(legend.position = "none")

3.4 Filter genes which are expressed in less than 5 cells (12552 genes kept over 25108)

#List of kept genes
genes_to_keep <- genes %>% dplyr::filter(cells >= 5) %>% pull(gene)
#Length of the list
length(genes_to_keep)

## [1] 12541

#Remove filtered genes in the final matrix
liste <- as.vector(genes_to_keep)
counts <- counts[rownames(counts_try) %in% liste ,]

3.5 Restructure the count matrix and convert it into Seurat object for final QC filtered violin plots

#Restructure the matrix
counts <- as.data.frame(counts_try)

#Compute nCount_rna for incoming violin plots
nCount_rna <- as.numeric(colSums(counts_try))

#Convert the matrix into sce object then into Seurat
counts <- as.matrix(counts_try)
sce <- SingleCellExperiment(assays = list(counts = counts))
data_QC <- as.Seurat(sce, data = NULL)

#Replace the nCount_rna inh the same "directory" than non- previously converted Seurat objects
data_QC[["nCount_RNA"]] <- nCount_rna

#Compute the % of MT and Ribosomal genes for each cell
data_QC[["percent.mt"]] <- PercentageFeatureSet(data_QC, pattern = "^mt-")
data_QC[["percent.ribo"]] <- PercentageFeatureSet(data_QC, pattern = "^rp[sl][[:digit:]]")
VlnPlot(data_QC, features = c("nCount_RNA","percent.mt","percent.ribo"),
        ncol = 3) + theme(legend.position = "none")

data_QC

## An object of class Seurat 
## 25108 features across 1064 samples within 1 assay 
## Active assay: RNA (25108 features, 0 variable features)

4) Normalization using SCTransform package + plots

It returns the Pearson residuals from regularized negative binomial regression, where cellular sequencing depth is utilized as a covariate in a generalized linear model. It aims to remove the influence of technical characteristics from downstream analyses while preserving biological heterogeneity.

#SCT normalisation with top 1000 variable features
data_QC <- SCTransform(data_QC, variable.features.n = 1000, verbose = F)
#Variable feature plot
VariableFeaturePlot(data_QC, selection.method = "sct")

#isolate the residual variance from the Seurat object
residual_variance <- as.data.frame(data_QC@assays[["SCT"]]@meta.features[["sct.residual_variance"]])
colnames(residual_variance) <- "residual_variance"

#Plot the ECDF of the residual variance
ggplot(residual_variance, aes(residual_variance)) + stat_ecdf(geom = "point")

5) Dimensional reductions

I tested the dimensionality of the dataset using intrinsicDimension package. The result was 10.5 but it gave the best silhouette score (0.54, see later) with the first 25 PC’s, with the five last clusters being perfectly clustered. We finally find 11 clusters at a resolution of 0.4.

#Run PCA with variable features
data_QC <- RunPCA(data_QC, assay = "SCT",features = VariableFeatures(object = data_QC))

#Test the dimensionality
intrinsicDimension::maxLikGlobalDimEst(data_QC@reductions$pca@cell.embeddings,
                                       k = 10)

## Dimension estimate: 10.64756

#Elbow plot
ElbowPlot(data_QC, ndims = 50)

#Umap
data_QC <- RunUMAP(data_QC, dims = 1:25, verbose = F)
#FindNeighbors
data_QC <- FindNeighbors(data_QC, dims = 1:25, verbose = F)
#Findclusters
data_QC <- FindClusters(data_QC, resolution = 0.4, verbose = F)

#Final Umap
DimPlot(data_QC)

6) Clustering quality using silhouette plot

Some metrics such as silhouette coefficient can be used in order to evaluate the goodness of clustering algorithm results. It processes as follows:

Calculation of the average dissimilarity ai between each observation i and all points belonging to i
Calculation of the average dissimilarity d(i,C) for all clusters where i does not belong to.
Define the silhouette width (Si)

The silhouette width varies between -1 and 1, for which a negative score means that observations may probably be in the wrong cluster, while positive score means that observations are well clustered. (From my master thesis)

#Compute distance matrix to UMAP coordinates
distance_matrix <- dist(Embeddings(data_QC[['umap']])[, 1:2])

#Isolate cluster nameq
clusters <- data_QC@active.ident

#Compute silhouette score for each cluster
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
data_QC@meta.data$silhouette_score <- silhouette[,3]

#Plot
fviz_silhouette(silhouette)

##    cluster size ave.sil.width
## 1        1  223          0.27
## 2        2  199          0.43
## 3        3  138          0.60
## 4        4  134          0.50
## 5        5   97          0.77
## 6        6   68          0.17
## 7        7   52          0.93
## 8        8   52          0.95
## 9        9   49          0.94
## 10      10   34          0.85
## 11      11   18          0.94