Loading libraries

load("C:/Users/Simon/Documents/VW_lab/cd45_steady/image1.RData")
library(dplyr)
library(Seurat)
library(scales)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
library(gplots)
library(sctransform)
library(scDblFinder)
library(SingleCellExperiment)
library(cluster)
library(factoextra)
library(intrinsicDimension)
library(xlsx)
library(tibble)
library(GenomicTools)
library(snpStats)

1. Loading Data

1.1 Loading GTF data in order to further remove lincRNA genes

#Load GTF file
gtf <- importGTF("GTF_zb.gtf")
#Convert to a data frame
gtf_df <- as.data.frame(gtf)
#Keep genes that are not lincRNA (31027 of 32520 kept)
gtf_filt <- gtf_df %>% filter(gene_biotype != "lincRNA")

#Vector of non-lincRNA gene names, set aside for later use
#(note: despite its name, lincrna holds the genes to KEEP)
lincrna <- as.vector(gtf_filt$gene_name)
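Before filtering, a quick look at the biotype distribution shows what the filter removes (a small sanity-check sketch on the gtf_df built above):

#Number of genes per biotype in the GTF
table(gtf_df$gene_biotype)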

1.2 Loading data into Seurat (4719 cells with 27692 captured genes)

data.raw <- Read10X(data.dir = "outs/filtered_feature_bc_matrix")
data <- CreateSeuratObject(counts = data.raw, names.field = 2, names.delim = "-")

2. Gene filtering

2.1 Extracting and creating optimal data for filtering step

#Extract the count matrix from the Seurat object as a data frame
#(BF = Before Filtering; @data still holds the raw counts here, since no
#normalization has been run yet)
counts_BF <- as.data.frame(as.matrix(data@assays$RNA@data))

#Create a tibble with, for each gene, its total count and the number of cells that express it
genes <- tibble(
  gene = rownames(counts_BF),
  count = Matrix::rowSums(counts_BF),
  cells = Matrix::rowSums(counts_BF != 0)
)  

#Create a sum column (count + cells); genes with a sum of 0 or 2 are removed below
genes$sum <- with(genes, count + cells)
head(genes, n = 10)
## # A tibble: 10 x 4
##    gene       count cells   sum
##    <chr>      <dbl> <dbl> <dbl>
##  1 ptpn12         4     4     8
##  2 phtf2          1     1     2
##  3 phtf2.1       10    10    20
##  4 CU856344.2     2     2     4
##  5 CU856344.1     7     7    14
##  6 mansc1         0     0     0
##  7 lrp6           0     0     0
##  8 dusp16        45    44    89
##  9 crebl2       180   172   352
## 10 gpr19          0     0     0

2.2 Mireia/Giuliano/Dani’s feedback: remove the genes with 0 counts in 0 cells (a single count is fine if the gene is expressed in several cells). Genes not expressed at all have a sum of 0, and genes with only 1 count in 1 cell have a sum of 2, so we directly filter with sum > 2. 18270 of 27692 genes kept.

#Filter by Sum > 2
genes_zero <- genes %>% dplyr::filter(sum > 2)
genes_zero <- as.vector(genes_zero$gene)

#We now have removed those genes!
counts_BF1 <- counts_BF[rownames(counts_BF) %in% genes_zero ,]
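As a quick sanity check (a sketch), we can count how many genes fall into each removed category:

#Genes never detected (sum == 0) and 1-count/1-cell singletons (sum == 2)
sum(genes$sum == 0)
sum(genes$sum == 2)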

2.3 Filter lincRNA genes from the count matrix. 17428 of 18270 genes kept.

counts_BF2 <- counts_BF1[rownames(counts_BF1) %in% lincrna ,]

2.4 Filter mitochondrial (MT) genes from the count matrix. 17415 of 17428 genes kept.

#Keep genes whose name does not start with "mt-"
genes_MT <- genes[-grep("^mt-", genes$gene),]
genes_MT <- as.vector(genes_MT$gene)
#We now have removed MT genes!
counts_BF3 <- counts_BF2[rownames(counts_BF2) %in% genes_MT ,]

2.5 Filter ribosomal (RB) genes from the count matrix. 17324 of 17415 genes kept.

#Keep genes whose name does not start with "rps" or "rpl"
genes_RB <- genes[-grep("^rp[sl]", genes$gene),]
genes_RB <- as.vector(genes_RB$gene)
#We now have removed RB genes!
counts_BF4 <- counts_BF3[rownames(counts_BF3) %in% genes_RB ,]
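To double-check the numbers quoted above, a small sketch counting the genes that survive each filtering step:

#Number of genes remaining after each step
sapply(list(raw = counts_BF, sum_gt2 = counts_BF1, no_lincRNA = counts_BF2,
            no_MT = counts_BF3, no_RB = counts_BF4), nrow)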

3. Recompute nFeature_RNA and nCount_RNA for the violin plots below

3.1 nCount_RNA

nCount_rna <- as.numeric(colSums(counts_BF4))
head(nCount_rna, n = 10)
##  [1] 3679  524  400 3247  354  958 3037 1070  243  283

3.2 nFeature_RNA. Counts greater than 1 are replaced by 1, so that summing over genes gives the number of expressed features for each cell.

#Threshold and replacement values
thresh <- 1
repl <- 1

#Re-organize the data: one row per cell, one column per gene
counts_BF5 <- as.data.frame(t(counts_BF4))
#If a value is > 1, replace it by 1 (across() replaces the deprecated funs())
counts_feature <- counts_BF5 %>%
  mutate(across(where(is.numeric), ~ if_else(. > thresh, repl, .)))
#Compute the number of expressed features for each cell
nFeature_rna <- as.numeric(rowSums(counts_feature))
counts_BF6 <- as.data.frame(t(counts_BF5))
head(nFeature_rna, n = 10)
##  [1] 1305  286  277 1035  244  450 1226  501  164  197
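The same vector can be obtained in one line, without the transpose/mutate detour (a sketch, operating directly on the genes x cells data frame from above):

#Count, for each cell (column), the genes with a non-zero count
nFeature_rna_alt <- as.numeric(colSums(counts_BF4 > 0))
#Should be TRUE
all.equal(nFeature_rna, nFeature_rna_alt)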

4. New top expressed genes following M/G/D’s feedback.

Here, the data is organized with one row per cell and one column per gene. As you can see, I use 2 nested loops, for a total of 81 751 956 iterations (4719 cells x 17324 genes). When i = 1 and j = 1, we look at the first row (cell i) of the first column (gene j) of the matrix, and compute the percentage this gene represents within its cell, following this formula: value at position i,j / total RNA count of the cell (taken from the previously calculated nCount_rna vector) x 100. The nCount_rna value therefore stays the same across the 17324 gene iterations (j) of a given cell, then changes when we move to the next row (i = 2, a new cell with a new nCount_rna value), and so on until the last cell (i = 4719). As a tip, always convert the data frame into a matrix before running computationally heavy loops: I realized too late that this step could run in less than 1 minute instead of… 8 HOURS! A vectorized equivalent is sketched right after the loop.

#Convert to a matrix (cells x genes)
count_mean_matrix <- as.matrix(t(counts_BF6))
#Create an NA-filled output matrix of the same size as our count matrix
output <- matrix(nrow = 4719, ncol = 17324)
#Let's go for looping!
for (i in 1:4719) {
  for (j in 1:17324) {
    output[i, j] <- count_mean_matrix[i, j] / nCount_rna[i] * 100
  }
}
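For reference, here is the vectorized equivalent mentioned above (a sketch): since count_mean_matrix has one row per cell, sweeping nCount_rna along the rows reproduces the whole computation in seconds.

#Divide each row (cell) by its total count, then scale to percent
output_fast <- sweep(count_mean_matrix, 1, nCount_rna, "/") * 100
#Should be TRUE (the loop version carries no dimnames)
all.equal(output, output_fast, check.attributes = FALSE)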

#Convert resulting matrix into dataframe
output1 <- as.data.frame(output)

#Re-attach cell names (rows) and gene names (columns)
counts_BF7 <- as.data.frame(t(counts_BF6))
rownames(output1) <- rownames(counts_BF7)
colnames(output1) <- colnames(counts_BF7)
output2 <- as.data.frame(t(output1))
#Compute the overall mean expression of each gene, corrected for the
#sequencing-depth variation between cells computed above
output2$mean <- rowMeans(output2)
#Keep only the mean column (the last one) and add a gene column from the
#row names (done before head() so both columns appear in the output below)
output3 <- subset(output2, select = ncol(output2))
output3$gene <- rownames(output3)
head(output3, n = 10)
##                          mean             gene
## ptpn12           3.561958e-05           ptpn12
## CU856344.2       1.784625e-05       CU856344.2
## CU856344.1       1.012789e-04       CU856344.1
## dusp16           7.093048e-04           dusp16
## crebl2           2.289927e-03           crebl2
## tbk1             3.942349e-03             tbk1
## eps8a            1.100152e-05            eps8a
## ptpro            2.368138e-05            ptpro
## si:ch73-252i11.1 9.318202e-04 si:ch73-252i11.1
## CABZ01085275.1   1.389237e-04   CABZ01085275.1
#Order by decreasing mean expression and select the top 20
top20genes <- output3[order(-output3$mean),]
top20genes1 <- top20genes[1:20,]

#Plot the result with wonderful colors
top20genes1$gene <- factor(top20genes1$gene, levels = top20genes1$gene)
ggplot(data=top20genes1, aes(x=mean, y=gene, fill = gene)) + geom_bar(stat="identity") + theme(legend.position = "none") 

5. Load the final matrix into Seurat and see violin plots.

counts <- as.matrix(t(counts_BF7))
sce <- SingleCellExperiment(assays = list(counts = counts))
data_QC <- as.Seurat(sce, data = NULL)

#Compute nCount_RNA and nFeature_RNA into Seurat
data_QC[["nCount_RNA"]] <- nCount_rna
data_QC[["nFeature_RNA"]] <- nFeature_rna
VlnPlot(data_QC, features = c("nCount_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10()

VlnPlot(data_QC, features = c("nFeature_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10()

FeatureScatter(data_QC, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

As expected, the number of detected features correlates almost perfectly with the total counts.
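To put a number on that correlation (a quick sketch):

#Pearson correlation between library size and detected features
cor(data_QC$nCount_RNA, data_QC$nFeature_RNA)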

6. Normalization + plots

SCTransform is a 3-in-1 step: it normalizes, scales and finds variable features, which is why we already have to choose a number of variable features here. I chose 750 because of the downstream code: the 95th percentile of the ECDF plot corresponds to a residual variance (rv) of 2.55, which is reached at approximately the 750th gene. I was not able to add a vertical line showing where the first top x genes fall, so I checked it this way instead (a possible fix is sketched after the quantile below).

#SCT normalization with the top 750 variable features
data_QC <- SCTransform(data_QC, variable.features.n = 750)
#Variable feature plot
VariableFeaturePlot(data_QC, selection.method = "sct")

#Isolate the residual variance from the Seurat object
residual_variance <- as.data.frame(data_QC@assays[["SCT"]]@meta.features[["sct.residual_variance"]])
colnames(residual_variance) <- "residual_variance"
#Plot the ECDF of the residual variance
ggplot(residual_variance, aes(residual_variance)) + stat_ecdf(geom = "point")

#Check the rv of the 95th percentile (2.55)
quantile(residual_variance$residual_variance, c(0.95))
##      95% 
## 2.554012
#Rank genes by decreasing residual variance (used to locate the cutoff)
residual_variance1 <- residual_variance[order(-residual_variance$residual_variance),]
residual_variance1 <- as.data.frame(residual_variance1)
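Here is the possible fix for the missing vertical line (a sketch): take the residual variance of the 750th-ranked gene as the cutoff and draw it on the ECDF. Note that as.data.frame() named the column residual_variance1.

#Residual variance of the 750th most variable gene
rv_cutoff <- residual_variance1$residual_variance1[750]
ggplot(residual_variance, aes(residual_variance)) +
  stat_ecdf(geom = "point") +
  geom_vline(xintercept = rv_cutoff, linetype = "dashed")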

7. Dimensional reductions

Following recommendations from members of the teaching team at the Harvard Chan Bioinformatics Core (HBC), we compute 2 quantities that indicate where the principal components start to “elbow”:

1: The first PC that individually contributes less than 5% of the standard deviation while the PCs cumulatively contribute more than 95% of it.

2: The point where the percent change in variation between consecutive PCs falls below 0.1%.

We are then advised to select the minimum of the two numbers for the downstream steps.

#Run PCA with variable features
data_QC <- RunPCA(data_QC, assay = "SCT",features = VariableFeatures(object = data_QC))

#Percent of total standard deviation explained by each PC
pct <- data_QC[["pca"]]@stdev / sum(data_QC[["pca"]]@stdev) * 100

#Calculate cumulative percents for each PC
cumu <- cumsum(pct)

#First PC with cumulative percent above 95% and individual contribution below 5% (47)
co1 <- which(cumu > 95 & pct < 5)[1]
co1

#Last PC where the drop in variation to the next PC is still above 0.1% (15)
co2 <- sort(which((pct[1:(length(pct) - 1)] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
co2
#Minimum of the two calculations (15)
pcs <- min(co1, co2)

#Create a dataframe with values
plot_df <- data.frame(pct = pct, 
                      cumu = cumu, 
                      rank = 1:length(pct))
#Elbow plot
ElbowPlot(data_QC, ndims = 50)

#Custom elbow plot to visualize the two cutoffs
ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
  geom_text() + 
  geom_vline(xintercept = 95, color = "grey") + 
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()

In our case, criterion 2 gave the minimum, which is 15 PCs. But we will also try with 47 PCs!

#UMAP with the first 15 PCs taken into account
data_QC_15 <- RunUMAP(data_QC, dims = 1:15, verbose = F)
#FindNeighbors
data_QC_15 <- FindNeighbors(data_QC_15, dims = 1:15, verbose = F)
#Findclusters
data_QC_15 <- FindClusters(data_QC_15, resolution = 0.5, verbose = F)
DimPlot(data_QC_15, label = T)

8. Clustering quality

8.1 Using 15 PCs

#Compute the distance matrix from the 2D UMAP coordinates
distance_matrix <- dist(Embeddings(data_QC_15[['umap']])[, 1:2])

#Isolate cluster names
clusters <- data_QC_15@active.ident

#Compute silhouette score for each cluster
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
data_QC_15@meta.data$silhouette_score <- silhouette[,3]
#Plot
fviz_silhouette(silhouette)
##    cluster size ave.sil.width
## 1        1  703         -0.38
## 2        2  692          0.42
## 3        3  612          0.58
## 4        4  591          0.32
## 5        5  375          0.72
## 6        6  334          0.33
## 7        7  306          0.70
## 8        8  215          0.92
## 9        9  189          0.68
## 10      10  154          0.74
## 11      11  146          0.68
## 12      12  138         -0.16
## 13      13  127          0.62
## 14      14  110          0.33
## 15      15   27          0.96
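Since the same silhouette computation is repeated below for the 47-PC run, a small helper would avoid the duplication (a sketch; cluster_silhouette is a hypothetical name, not part of any package):

#Compute per-cell silhouette scores from a Seurat object's UMAP embedding
cluster_silhouette <- function(seu) {
  #Distance matrix from the 2D UMAP coordinates
  d <- dist(Embeddings(seu[["umap"]])[, 1:2])
  #Silhouette per cell, using the active cluster identities
  sil <- silhouette(as.numeric(seu@active.ident), dist = d)
  seu$silhouette_score <- sil[, 3]
  list(object = seu, sil = sil)
}
#Usage: res <- cluster_silhouette(data_QC_15); fviz_silhouette(res$sil)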

8.2 Using 47 PCs

#UMAP with the first 47 PCs taken into account
data_QC_47 <- RunUMAP(data_QC, dims = 1:47, verbose = F)
#FindNeighbors
data_QC_47 <- FindNeighbors(data_QC_47, dims = 1:47, verbose = F)
#Findclusters
data_QC_47 <- FindClusters(data_QC_47, resolution = 0.5, verbose = F)
DimPlot(data_QC_47, label = T)

#Compute the distance matrix from the 2D UMAP coordinates
distance_matrix_47 <- dist(Embeddings(data_QC_47[['umap']])[, 1:2])

#Isolate cluster names
clusters_47 <- data_QC_47@active.ident

#Compute silhouette score for each cluster
silhouette_47 <- silhouette(as.numeric(clusters_47), dist = distance_matrix_47)
data_QC_47@meta.data$silhouette_score <- silhouette_47[,3]
#Plot
fviz_silhouette(silhouette_47)
##    cluster size ave.sil.width
## 1        1  798          0.40
## 2        2  636         -0.53
## 3        3  604          0.55
## 4        4  586          0.59
## 5        5  386          0.74
## 6        6  337          0.39
## 7        7  314          0.55
## 8        8  216          0.89
## 9        9  150          0.72
## 10      10  147          0.22
## 11      11  133          0.19
## 12      12   93          0.85
## 13      13   83          0.04
## 14      14   77          0.73
## 15      15   72         -0.25
## 16      16   31          0.85
## 17      17   28          0.99
## 18      18   17          0.97
## 19      19   11          0.99
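For an overall comparison of the two runs (a quick sketch), the mean silhouette width across all cells:

#Average silhouette width, 15 PCs vs 47 PCs
mean(silhouette[, 3])
mean(silhouette_47[, 3])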

Neither choice of PCs gave, in my opinion, very good results. Although I have not yet filtered on nFeature_RNA and nCount_RNA (because we have to decide the thresholds together), nor on potential doublets, I don’t think we will be able to obtain all-positive scoring clusters. I tried with more variable features (> 750, not shown here because the laptop ran out of RAM to create more Seurat objects) and with a higher resolution: both gave worse scores (< 0.38). One reason may be that we removed MT and RB genes: even if we don’t really need them downstream, they may be important to separate different cell types. The next step will be to keep ribosomal genes in the count matrix. To be continued!
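For the doublet check mentioned above, scDblFinder (already loaded) could be run on the filtered counts later on; a minimal sketch, not yet applied to this data:

#Flag putative doublets on the SingleCellExperiment built in step 5
sce_dbl <- scDblFinder(sce)
table(sce_dbl$scDblFinder.class)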