cd45 analysis - Third version - only protein

load("C:/Users/Simon/Documents/VW_lab/cd45_steady/cd45_v3_1.RData")
library(dplyr)
library(Seurat)
library(scales)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
library(gplots)
library(sctransform)
library(scDblFinder)
library(SingleCellExperiment)
library(cluster)
library(factoextra)
library(intrinsicDimension)
library(xlsx)
library(tibble)
library(GenomicTools)
library(snpStats)

1. Loading Data

1.1 Loading GTF data in order to further remove genes that are not annnotated as protein_coding

#Load GTF file
gtf <- importGTF("GTF_zb.gtf")
#Convert as dataframe
gtf_df <- as.data.frame(gtf)
#Keep genes that are protein_coding (25432 over 32520)
gtf_filt <- gtf_df %>% filter(gene_biotype == "protein_coding")

#Vector of non-lincRNA genes that will sleep a little bit before being used
prot_cod <- as.vector(gtf_filt$gene_name)

1.2 Loading data into Seurat (4719 cells with 27692 captured genes)

data.raw <- Read10X(data.dir = "outs/filtered_feature_bc_matrix")
data <- CreateSeuratObject(counts = data.raw, names.field = 2, names.delim = "-")
nCount_rna <- data@meta.data[["nCount_RNA"]]
nFeature_rna <- data@meta.data[["nFeature_RNA"]]

data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^mt-")
percent_mt <- data@meta.data[["percent.mt"]]

data[["percent.ribo"]] <- PercentageFeatureSet(data, pattern = "^rp[sl][[:digit:]]")
percent_rb <- data@meta.data[["percent.ribo"]]

2. Gene filtering

2.1 Extracting and creating optimal data for filtering step

#Extract count matrix from Seurat class as a dataframe (BF = Before Filtering with violin plots)
counts_BF <- as.data.frame(as.matrix(data@assays$RNA@data))  

#Create a df which sums the count for each gene and the number of cells who express it
genes <- tibble(
  gene = rownames(counts_BF),
  count = Matrix::rowSums(counts_BF),
  cells = Matrix::rowSums(counts_BF != 0)
)  

#Create sum column to further select genes that have a sum of 0  
genes$sum <- with(genes, count + cells)

head(genes, n = 10)

## Registered S3 method overwritten by 'cli':
##   method     from    
##   print.boxx spatstat

## # A tibble: 10 x 4
##    gene       count cells   sum
##    <chr>      <dbl> <dbl> <dbl>
##  1 ptpn12         4     4     8
##  2 phtf2          1     1     2
##  3 phtf2.1       10    10    20
##  4 CU856344.2     2     2     4
##  5 CU856344.1     7     7    14
##  6 mansc1         0     0     0
##  7 lrp6           0     0     0
##  8 dusp16        45    44    89
##  9 crebl2       180   172   352
## 10 gpr19          0     0     0

#Filter by Sum > 2
genes_zero <- genes %>% dplyr::filter(sum > 2)
genes_zero <- as.vector(genes_zero$gene)

#We now have removed those genes!
counts_BF1 <- counts_BF[rownames(counts_BF) %in% genes_zero ,]

2.3 Keep only protein_coding genes from the count matrix. 17257 genes kept over 18270.

counts_BF2 <- counts_BF1[rownames(counts_BF1) %in% prot_cod ,]

2.4 Remove the 10 genes that had a very high residual variance in the second analysis: apoc1, si:dkey-21e2.13, si:ch211-212d10.1, ccl19a.1, ccl36.1, si:dkey-21e2.8, si:dkey-21e2.10, il13, il4 apoeb. 17248 genes kept over 17257

high_rv <- c("apoc1", "si:dkey-21e2.13", "si:ch211-212d10.1", "ccl19a.1", "ccl36.1","si:dkey-21e2.8", "si:dkey-21e2.10", "il13", "il4")
counts_BF3 <- counts_BF2[!(rownames(counts_BF2) %in% high_rv ), ]

3. New top expressed genes following M/G/D’s feedback.

#Convert as matrix
count_mean_matrix <- as.matrix(t(counts_BF3))
#Create output with NA and same size than our count matrix
output <- matrix(nrow = 4719, ncol = 17248)
#Let's go for looping!
  for (i in 1:4719) {
    for(j in 1:17248){
      output[i,j] <- count_mean_matrix[i,j]/nCount_rna[i] * 100
    }
  }

#Convert resulting matrix into dataframe
output1 <- as.data.frame(output)

#Re-attribute cell name + gene name
counts_BF4 <- as.data.frame(t(counts_BF3))
rownames(output1) <- rownames(counts_BF4)
colnames(output1) <- colnames(counts_BF4)
output2 <- as.data.frame(t(output1))
#Compute the general mean expression of each gene, according the depth sequencing variation within the cell previously computed
output2$mean <- rowMeans(output2)
#Only keep the mean column (the last one)
output3 <- subset(output2, select = c(4720))

head(output3, n = 10)

##                          mean             gene
## ptpn12           2.436324e-05           ptpn12
## CU856344.2       1.051635e-05       CU856344.2
## CU856344.1       5.967299e-05       CU856344.1
## dusp16           4.283538e-04           dusp16
## crebl2           1.437948e-03           crebl2
## tbk1             2.268033e-03             tbk1
## eps8a            7.207187e-06            eps8a
## ptpro            1.900552e-05            ptpro
## si:ch73-252i11.1 5.930777e-04 si:ch73-252i11.1
## CABZ01085275.1   8.648654e-05   CABZ01085275.1

#Create gene column
vec_gene <- as.vector(rownames(output3))
output3$gene <- vec_gene
#Order by mean expression
top20genes <- output3[order(-output3$mean),]
#Select the top 20
top20genes1 <- top20genes[1:20,]

#Plot the result with wonderful colors
top20genes1$gene <- factor(top20genes1$gene, levels = top20genes1$gene)

ggplot(data=top20genes1, aes(x=mean, y=gene, fill = gene)) + geom_bar(stat="identity") + theme(legend.position = "none")

7 MT/RB genes are present in the top.

4. Load the final matrix into Seurat and see violin plots.

counts <- as.matrix(t(counts_BF4))
sce <- SingleCellExperiment(assays = list(counts = counts))
data_QC <- as.Seurat(sce, data = NULL)

#Compute nCount_RNA, nFeature_RNA, percent.mt and percent.rb from first Seurat object
data_QC[["nCount_RNA"]] <- nCount_rna
data_QC[["nFeature_RNA"]] <- nFeature_rna
data_QC[["percent.mt"]] <- percent_mt
data_QC[["percent.ribo"]] <- percent_rb

VlnPlot(data_QC, features = c("nCount_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10() + geom_hline(yintercept=750, linetype="dashed", color = "red")

VlnPlot(data_QC, features = c("nFeature_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10() + geom_hline(yintercept=290, linetype="dashed", color = "red")

VlnPlot(data_QC, features = c("percent.mt"),
        ncol = 1) + theme(legend.position = "none") + geom_hline(yintercept=25, linetype="dashed", color = "red")

VlnPlot(data_QC, features = c("percent.ribo"),
        ncol = 1) + theme(legend.position = "none") + geom_hline(yintercept=25, linetype="dashed", color = "red")

5. Filtering violin plots. 2000 cells kept over 4719.

data_filt <- subset(data_QC, subset = nFeature_RNA > 290 & nCount_RNA > 750
                    & nCount_RNA < 10000 & percent.mt < 25 & percent.ribo < 25)

VlnPlot(data_filt, features = c("nCount_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10()

VlnPlot(data_filt, features = c("nFeature_RNA"),
        ncol = 1) + theme(legend.position = "none") + scale_y_log10()

VlnPlot(data_filt, features = c("percent.mt"),
        ncol = 1) + theme(legend.position = "none")

VlnPlot(data_filt, features = c("percent.ribo"),
        ncol = 1) + theme(legend.position = "none")

6. Normalization + plots

#SCT normalisation with top  variable features (700, because the 95th percentile of the residual variance is found at this gene number position)
data_filt <- SCTransform(data_filt, variable.features.n = 700)

In last analyses, we had some genes with a residual variance above 50 (until 150). Those genes have been removed from the count matrix. We can see now that the top rv gene is around 45, which is way lower than previously.

#Isolate the residual variance from the Seurat object
residual_variance <- as.data.frame(data_filt@assays[["SCT"]]@meta.features[["sct.residual_variance"]])

residual_genes <- as.data.frame(data_filt@assays[["SCT"]]@var.features)
colnames(residual_variance) <- "residual_variance"

#Plot the ECDF of the residual variance
ggplot(residual_variance, aes(residual_variance)) + stat_ecdf(geom = "point")

#Check the rv of the 95th percentile (2.49)
quantile(residual_variance$residual_variance, c(0.95))

##      95% 
## 2.496655

residual_variance1 <- residual_variance[order(-residual_variance$residual_variance),]
residual_variance1 <- as.data.frame(residual_variance1)

7. Dimensional reductions

To find the number of PCs to use:

1 : Compute the point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 95% of the standard deviation.

2: The point where the percent change in variation between the consecutive PCs is less than 0.1%.

We are therefore suggered to select the miminum of the two numbers for downstream steps.

#Run PCA with variable features
data_filt <- RunPCA(data_filt, assay = "SCT",features = VariableFeatures(object = data_filt))

#Identifie the optimal PCs
pct <- data_filt[["pca"]]@stdev / sum(data_filt[["pca"]]@stdev) * 100

#Calculate cumulative percents for each PC
cumu <- cumsum(pct)

#Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5 (47)
co1 <- which(cumu > 95 & pct < 5)[1]
co1

#Determine the difference between variation of PC and subsequent PC (13)
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
co2

#Minimum of the two calculations (13)
pcs <- min(co1, co2)

#Create a dataframe with values
plot_df <- data.frame(pct = pct, 
                      cumu = cumu, 
                      rank = 1:length(pct))

#Elbow plot to visualize 
ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
  geom_text() + 
  geom_vline(xintercept = 95, color = "grey") + 
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()

The point where the percent change in variation between the consecutive PCs is less than 0.1% gave the smallest PCs number. I therefore choose 13. (Tried with 47 and gave less good results)

#Umap with 13 PCs taken into account
data_filt_17 <- RunUMAP(data_filt, dims = 1:13, verbose = F)
#FindNeighbors
data_filt_17 <- FindNeighbors(data_filt_17, dims = 1:13, verbose = F)
#Findclusters
data_filt_17 <- FindClusters(data_filt_17, resolution = 0.4, verbose = F)

DimPlot(data_filt_17, label = T)

#Compute distance matrix to UMAP coordinates
distance_matrix <- dist(Embeddings(data_filt_17[['umap']])[, 1:2])

#Isolate cluster names
clusters <- data_filt_17@active.ident

#Compute silhouette score for each cluster
silhouette <- silhouette(as.numeric(clusters), dist = distance_matrix)
data_filt_17@meta.data$silhouette_score <- silhouette[,3]

#Plot
fviz_silhouette(silhouette)

##   cluster size ave.sil.width
## 1       1  502          0.53
## 2       2  433          0.37
## 3       3  370          0.59
## 4       4  203          0.34
## 5       5  166          0.64
## 6       6  144          0.70
## 7       7   91          0.61
## 8       8   71          0.87
## 9       9   20          0.98

VlnPlot(data_filt_17, features = c("percent.mt"),
        ncol = 1) + theme(legend.position = "none")

VlnPlot(data_filt_17, features = c("percent.ribo"),
        ncol = 1) + theme(legend.position = "none")

cd45 analysis - Third version - only protein_coding genes

Kin Simon

07/01/2021