Loading libraries

load("C:/Users/Simon/Documents/VW_lab/cd45_steady/cd45_no_filter.RData")
library(dplyr)
library(Seurat)
library(scales)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
library(gplots)
library(sctransform)
library(scDblFinder)
library(SingleCellExperiment)
library(cluster)
library(factoextra)
library(intrinsicDimension)
library(xlsx)
library(tibble)
library(GenomicTools)
library(snpStats)

1. Loading data into Seurat (4719 cells with 27692 captured genes)

data.raw <- Read10X(data.dir = "outs/filtered_feature_bc_matrix")
data <- CreateSeuratObject(counts = data.raw, names.field = 2, names.delim = "-")
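As a quick sanity check (dim() on a Seurat object returns features x cells), the dimensions should match the numbers in the heading above:

#Sanity check: expect 27692 genes and 4719 cells
dim(data)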

2. Gene filtering

2.1 Extracting and creating optimal data for filtering step

#Extract the count matrix from the Seurat object as a dataframe (BF = Before Filtering, used for the violin plots)
counts_BF <- as.data.frame(as.matrix(data@assays$RNA@data)) 
#Create a df with the total count for each gene and the number of cells that express it
genes <- tibble(
  gene = rownames(counts_BF),
  count = Matrix::rowSums(counts_BF),
  cells = Matrix::rowSums(counts_BF != 0)
)  

#Create a sum column to identify the genes to remove in the next step
genes$sum <- with(genes, count + cells)
head(genes, n = 10)
## # A tibble: 10 x 4
##    gene       count cells   sum
##    <chr>      <dbl> <dbl> <dbl>
##  1 ptpn12         4     4     8
##  2 phtf2          1     1     2
##  3 phtf2.1       10    10    20
##  4 CU856344.2     2     2     4
##  5 CU856344.1     7     7    14
##  6 mansc1         0     0     0
##  7 lrp6           0     0     0
##  8 dusp16        45    44    89
##  9 crebl2       180   172   352
## 10 gpr19          0     0     0

2.2 Mireia/Giuliano/Dani’s feedback: remove the genes with 0 counts in 0 cells (a single count is acceptable if the gene is expressed in several cells). Genes not expressed at all have a sum of 0, and genes with only 1 count in 1 cell have a sum of 2, so we directly filter by sum > 2. 18270 genes are kept out of 27692.

#Filter by sum > 2
genes_zero <- genes %>% dplyr::filter(sum > 2)
genes_zero <- as.vector(genes_zero$gene)

#We have now removed those genes!
counts_BF1 <- counts_BF[rownames(counts_BF) %in% genes_zero, ]
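As a sanity check (using the objects defined just above), the number of retained genes should be 18270:

#Both should return 18270
length(genes_zero)
nrow(counts_BF1)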

3. Recompute nCount_rna and nFeature_rna for the violin plots below

3.1 nCount_rna

nCount_rna <- as.numeric(colSums(counts_BF1))
head(nCount_rna, n = 10)
##  [1] 5115  681  686 4338  866 1456 3706 1636  552  636

3.2 nFeature_rna. Counts greater than 1 are replaced by 1 (binarizing the matrix so we can sum the number of expressed features for each cell)

#Threshold value and replacement value
thresh <- 1
repl <- 1

#Transpose to cells x genes
counts_BF2 <- as.data.frame(t(counts_BF1))
#If the value is > 1, replace it by 1 (funs() is deprecated in dplyr, so use a lambda)
counts_feature <- counts_BF2 %>% mutate_if(is.numeric, ~ if_else(. > thresh, repl, .))
#Compute the number of expressed features for each cell
nFeature_rna <- as.numeric(rowSums(counts_feature))
#Transpose back to genes x cells
counts_BF3 <- as.data.frame(t(counts_BF2))
head(nFeature_rna, n = 10)
##  [1] 1422  360  350 1145  300  514 1330  601  212  271
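For reference, the same vector can be obtained in one vectorized line by counting the non-zero entries per cell, which avoids the mutate_if pass entirely (a sketch assuming counts_BF1 is still the filtered genes x cells dataframe):

#Equivalent one-liner: count non-zero entries per cell (the columns of counts_BF1)
nFeature_rna_alt <- as.numeric(colSums(counts_BF1 > 0))
all.equal(nFeature_rna, nFeature_rna_alt)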

4. New top expressed genes following M/G/D’s feedback.

Here, the data is organized with one row per cell and one column per gene. As you can see, I’ll use 2 nested loops for a total of 86 216 130 iterations (4719 cells x 18270 genes). Basically, when i = 1 and j = 1, we are at the first row (i, a cell) of the first column (j, a gene) of the dataframe. We then compute the percentage this gene contributes within its cell, following this formula: value at position i,j / total RNA count of the cell (taken from the previously calculated nCount_rna vector) x 100. The nCount_rna value therefore stays the same over the first 18270 gene iterations (j), then changes at the second row (i = 2, a new cell with a new nCount_rna value), and so on until the last cell (i = 4719).

#Convert to a matrix (cells x genes)
count_mean_matrix <- as.matrix(t(counts_BF3))
#Create an NA-filled output matrix of the same size as our count matrix
output <- matrix(nrow = 4719, ncol = 18270)
#Let's go for looping!
for (i in 1:4719) {
  for (j in 1:18270) {
    output[i, j] <- count_mean_matrix[i, j] / nCount_rna[i] * 100
  }
}
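The double loop works but is slow in R; the same per-cell percentages can be computed in a single vectorized call. A sketch, assuming count_mean_matrix and nCount_rna as defined above:

#Vectorized equivalent: divide each row (cell) by its total count, then scale to %
output_alt <- sweep(count_mean_matrix, 1, nCount_rna, "/") * 100
all.equal(output, output_alt, check.attributes = FALSE)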

#Convert resulting matrix into dataframe
output1 <- as.data.frame(output)

#Re-attribute cell name + gene name
counts_BF4 <- as.data.frame(t(counts_BF3))
rownames(output1) <- rownames(counts_BF4)
colnames(output1) <- colnames(counts_BF4)
output2 <- as.data.frame(t(output1))
#Compute the overall mean expression of each gene, accounting for the sequencing-depth variation across cells computed above
output2$mean <- rowMeans(output2)
#Only keep the mean column (the last one, column 4720)
output3 <- subset(output2, select = c(4720))
#Create the gene column from the rownames
output3$gene <- rownames(output3)
head(output3, n = 10)
##                    mean       gene
## ptpn12     2.436378e-05     ptpn12
## phtf2.1    8.740261e-05    phtf2.1
## CU856344.2 1.051784e-05 CU856344.2
## CU856344.1 5.968103e-05 CU856344.1
## dusp16     4.283852e-04     dusp16
## crebl2     1.438432e-03     crebl2
## tbk1       2.268429e-03       tbk1
## eps8a      7.226237e-06      eps8a
## ptpro      1.900552e-05      ptpro
## ptpro.1    4.805840e-05    ptpro.1
#Order by mean expression
top20genes <- output3[order(-output3$mean),]
#Select the top 20
top20genes1 <- top20genes[1:20,]

#Plot the result with wonderful colors
top20genes1$gene <- factor(top20genes1$gene, levels = top20genes1$gene)
ggplot(data = top20genes1, aes(x = mean, y = gene, fill = gene)) + geom_col() + theme(legend.position = "none") 

5. Load the final matrix into Seurat and see violin plots.

counts <- as.matrix(t(counts_BF4))
sce <- SingleCellExperiment(assays = list(counts = counts))
data_QC <- as.Seurat(sce, data = NULL)

#Add the recomputed nCount_RNA and nFeature_RNA to the Seurat object
data_QC[["nCount_RNA"]] <- nCount_rna
data_QC[["nFeature_RNA"]] <- nFeature_rna
#Compute the % of MT and Ribosomal genes for each cell
data_QC[["percent.mt"]] <- PercentageFeatureSet(data_QC, pattern = "^mt-")
data_QC[["percent.ribo"]] <- PercentageFeatureSet(data_QC, pattern = "^rp[sl][[:digit:]]")
VlnPlot(data_QC, features = c("nCount_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10()

VlnPlot(data_QC, features = c("nFeature_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10()

VlnPlot(data_QC, features = c("percent.mt"),
        ncol = 1) + theme(legend.position = "none") 

VlnPlot(data_QC, features = c("percent.ribo"),
        ncol = 1) + theme(legend.position = "none") 

FeatureScatter(data_QC, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

As expected, there is a near-total correlation between the number of features and the number of counts.
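To put a number on this relationship (a quick sketch using the vectors computed in section 3):

#Rank correlation between total counts and detected features per cell
cor(nCount_rna, nFeature_rna, method = "spearman")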

6. Normalization + plots

SCTransform is a 3-in-1 step: it normalizes, scales and finds variable features, which is why we already have to choose a number of features at this stage. I chose 770 because of code further down: the 95th percentile of the ECDF of the residual variance (rv) is 2.70, which is reached at approximately the 770th gene. I was not able to draw a vertical line showing where the first top x genes lie, so I determined it this way.

#SCT normalisation with the top 770 variable features
data_QC <- SCTransform(data_QC, variable.features.n = 770)
#Variable feature plot
VariableFeaturePlot(data_QC, selection.method = "sct")

#Isolate the residual variance from the Seurat object
residual_variance <- as.data.frame(data_QC@assays[["SCT"]]@meta.features[["sct.residual_variance"]])
colnames(residual_variance) <- "residual_variance"
#Plot the ECDF of the residual variance
ggplot(residual_variance, aes(residual_variance)) + stat_ecdf(geom = "point")

#Check the rv of the 95th percentile (2.70)
quantile(residual_variance$residual_variance, c(0.95))
##      95% 
## 2.704771
residual_variance1 <- residual_variance[order(-residual_variance$residual_variance),]
residual_variance1 <- as.data.frame(residual_variance1)
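Rather than eyeballing the ECDF, the rank matching any residual-variance cutoff can be read off directly (a small sketch using the residual_variance dataframe isolated above):

#Number of genes whose residual variance exceeds a given cutoff
rv_sorted <- sort(residual_variance$residual_variance, decreasing = TRUE)
sum(rv_sorted > 2.70)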

7. Dimensional reductions

Following recommendations from members of the teaching team at the Harvard Chan Bioinformatics Core (HBC), we compute 2 quantities that indicate where the principal components start to “elbow”:

1: The point where each principal component contributes less than 5% of the standard deviation and the principal components cumulatively contribute 95% of the standard deviation.

2: The point where the percent change in variation between consecutive PCs is less than 0.1%.

We are then advised to select the minimum of the two numbers for the downstream steps.

#Run PCA with variable features
data_QC <- RunPCA(data_QC, assay = "SCT",features = VariableFeatures(object = data_QC))

#Identify the optimal PCs: percent of total standard deviation per PC
pct <- data_QC[["pca"]]@stdev / sum(data_QC[["pca"]]@stdev) * 100

#Calculate cumulative percents for each PC
cumu <- cumsum(pct)

#Determine the first PC where the cumulative percent exceeds 95% and the PC itself contributes less than 5% (46)
co1 <- which(cumu > 95 & pct < 5)[1]
co1

#Determine the first point where the difference in variation between consecutive PCs is less than 0.1% (18)
co2 <- sort(which((pct[1:(length(pct) - 1)] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
co2

#Minimum of the two calculations (18)
pcs <- min(co1, co2)

#Create a dataframe with values
plot_df <- data.frame(pct = pct, 
                      cumu = cumu, 
                      rank = 1:length(pct))
#Elbow plot
ElbowPlot(data_QC, ndims = 50)

#Custom elbow plot to visualize both cutoffs
ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
  geom_text() + 
  geom_vline(xintercept = 95, color = "grey") + 
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()

In our case, criterion 2 gave the minimum, 18 PCs. But we will also try with 46 PCs!

#Umap with 18 PCs taken into account
data_QC_18 <- RunUMAP(data_QC, dims = 1:18, verbose = F)
#FindNeighbors
data_QC_18 <- FindNeighbors(data_QC_18, dims = 1:18, verbose = F)
#Findclusters
data_QC_18 <- FindClusters(data_QC_18, resolution = 0.5, verbose = F)
DimPlot(data_QC_18, label = T)

8. Clustering quality

8.1 Using 18 PCs

#Compute distance matrix to UMAP coordinates
distance_matrix <- dist(Embeddings(data_QC_18[['umap']])[, 1:2])

#Isolate cluster names
clusters <- data_QC_18@active.ident

#Compute silhouette score for each cluster
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
data_QC_18@meta.data$silhouette_score <- silhouette[,3]
#Plot
fviz_silhouette(silhouette)
##    cluster size ave.sil.width
## 1        1  613          0.49
## 2        2  561          0.55
## 3        3  527         -0.35
## 4        4  383          0.67
## 5        5  317          0.64
## 6        6  314          0.02
## 7        7  281          0.37
## 8        8  250          0.72
## 9        9  249          0.18
## 10      10  224          0.66
## 11      11  216          0.86
## 12      12  191          0.35
## 13      13  161          0.60
## 14      14  145          0.53
## 15      15  135          0.38
## 16      16  125          0.70
## 17      17   27          0.99

8.2 Using 46 PCs

#Umap with 46 PC's taken into account
data_QC_46 <- RunUMAP(data_QC, dims = 1:46, verbose = F)
#FindNeighbors
data_QC_46 <- FindNeighbors(data_QC_46, dims = 1:46, verbose = F)
#Findclusters
data_QC_46 <- FindClusters(data_QC_46, resolution = 0.5, verbose = F)
DimPlot(data_QC_46, label = T)

#Compute distance matrix to UMAP coordinates
distance_matrix_46 <- dist(Embeddings(data_QC_46[['umap']])[, 1:2])

#Isolate cluster names
clusters_46 <- data_QC_46@active.ident

#Compute silhouette score for each cluster
silhouette_46 <- silhouette(as.numeric(clusters_46), dist = distance_matrix_46)
data_QC_46@meta.data$silhouette_score <- silhouette_46[,3]
#Plot
fviz_silhouette(silhouette_46)
##    cluster size ave.sil.width
## 1        1  678         -0.38
## 2        2  633          0.52
## 3        3  590          0.46
## 4        4  447          0.34
## 5        5  384          0.69
## 6        6  357          0.45
## 7        7  304          0.36
## 8        8  284          0.33
## 9        9  217          0.90
## 10      10  203          0.68
## 11      11  141          0.77
## 12      12  140          0.65
## 13      13   87          0.11
## 14      14   85          0.73
## 15      15   84          0.12
## 16      16   40         -0.32
## 17      17   28          0.99
## 18      18   17          0.99
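To compare the two PC choices with a single number, one option is the mean silhouette width across all cells (a sketch using the silhouette objects computed above; column 3 holds the per-cell widths):

#Average silhouette width, 18 PCs vs 46 PCs
mean(silhouette[, 3])
mean(silhouette_46[, 3])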

As you may have noticed, I have not yet filtered cells according to RNA/feature counts (awaiting your feedback), nor according to the percentage of MT and ribosomal genes. The first two variables won’t affect the dimensional plot much, since we would not be filtering a huge number of cells. But this is not the case for the MT/RB filtering. So it would not be surprising if some clusters here were driven by a high percentage of MT/RB genes rather than by their biological relevance. 2 solutions are possible (sketched after this list):

1: scaling down those genes so they don’t participate in cluster formation (no cells are removed)

2: filtering cells with a threshold percentage (cells are removed, potentially a few dozen percent in our case)

What I usually do is actually both: scaling down + filtering with a rather high threshold percentage (so we don’t remove half of the dataset). But this is just one approach among others, so your advice is welcome!
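For reference, a sketch of both options, reading option 1 as regressing the MT/RB percentages out during normalisation (vars.to.regress and subset() are Seurat’s documented API; the 25% mitochondrial threshold is purely illustrative, not a recommendation):

#Option 1: regress out MT/ribosomal content during SCT normalisation (no cells removed)
data_QC <- SCTransform(data_QC, variable.features.n = 770,
                       vars.to.regress = c("percent.mt", "percent.ribo"))
#Option 2: filter cells above an illustrative threshold (cells removed)
data_QC <- subset(data_QC, subset = percent.mt < 25)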