Cluster analysis

Set up

# libraries
library(Seurat) # for scrnaseq analyisis

## Attaching SeuratObject

library(here) # for reproducible paths

## here() starts at C:/Users/nbestard/OneDrive/Luise_HCA

library(ggplot2) # plots
library(ggsci)  # colors

# Set up the color palette
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3<-pal_lancet("lanonc", alpha = 0.7)(9)
mypal4<-pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)

mycoloursP<- c(mypal, mypal2, mypal3, mypal4)

Clustering

## Clustering ----
# Only compute if not done before
if (!file.exists(here("processed", "srt_cluster.RDS"))) {
  # Load the seurat object
  srt_all <- readRDS(here("processed", "srt_umap.RDS"))
  # Cluster with 25 dimensions more details for this choice in srt_dim_reduction.R
  srt_all <- FindNeighbors(srt_all, dims = 1:25)
  srt_all <- FindClusters(srt_all, resolution = 0.8)
  # Save the active ident into the metadata
  srt_all$clusters_0.8 <- srt_all@active.ident
  # Save the object
  saveRDS(srt_all, here("processed", "srt_cluster.RDS"))
} else {
  srt_all <- readRDS(here("processed", "srt_cluster.RDS"))
  Idents(srt_all) <- srt_all$clusters_0.8
}

DimPlot(srt_all, label = TRUE, cols = mycoloursP)

DimPlot(srt_all, label = TRUE, cols = mycoloursP, group.by = "rough_annot")

Cluster Quality

Plot the case number with colours according to their group and to the clustering

DimPlot(srt_all, split.by = "caseNO", group.by = "clusters_0.8", cols = mycoloursP, ncol = 5)

Individuals per cluster

How many individuals contribute to each cluster?

# count how many cells there are in each group and cluster
sum_caseNO_cluster <- table(srt_all$clusters_0.8, srt_all$caseNO)
# For each cluster (on the rows) sum of individuals that do have cells on that cluster
rowSums(sum_caseNO_cluster > 0)

##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 20 20 18 20 20 20 20 20 20 20 20 20 20 20 20 20 16 11 20  3 19 19 17 20 19  5 
## 26 27 28 29 30 
## 20  3 16 15 15

Already from this first approach three clusters stand out: 19, 25 and 27 are formed by very few individuals. 17 is the next one to be concerned about.

Percentage of cells that come from each individual

Some of the clusters might be mainly from one person, even though other subjects do have some cells that cluster with it. To address this question we calculate for each cluster the proportion of cells that come from each caseNO.

# calculate the proportions, for each cluster (margin 1 for rows)
prop_caseNO_table <- prop.table(sum_caseNO_cluster, margin = 1)
# change the format to be a data.frame, this also expands to long formatting
prop_caseNO <- as.data.frame(prop_caseNO_table)
colnames(prop_caseNO) <- c("cluster", "caseNO", "proportion")

Flag the clusters where one of the individuals covers more than 40% of the cluster. The expected would be around 1/20= 5%

prop_caseNO[prop_caseNO$proportion > 0.4, ]

##     cluster   caseNO proportion
## 137      12 SD016/13  0.4533898
## 142      17 SD016/13  0.9751861
## 150      25 SD016/13  0.9628713
## 151      26 SD016/13  0.4195584
## 578      19 SD042/18  0.5973783
## 586      27 SD042/18  0.5833333

There are 4 clusters (12, 17, 25, 26) mainly from 16/13. 17 is formed by oligodendrocytes previously detected as pour quality in preliminary analysis. 17 and 25 are formed by more than 90% from one individual, on the other hand the two others are just above 40% and should not be deleted just because of theis criteria

Then two more clusters are more than a 50% from 1 individual, 19 and 27 are stromal/unidentified clusters from 42/18.

With previous analysis, it seemed that some of the clusters were mainly from a person with sepsis, but this is not the case anymore since we removed 10 problematic samples.

DimPlot(srt_all,
        reduction = "umap",
        group.by = "CauseOfDeath_category",
        cols = mycoloursP[8:18], 
        pt.size = 0.5
        )

The 16/13 that forms 4 clusters is unascertained death

Minimum threshold of 2% contribution to count individuals

Another interesting variable is the number of individuals that contribute to more than a certain threshold (2%) to each cluster

# Calcluate for each cluster the number of individuals that fulfil the condition of contributing more than a 2%
num_individuals_gt_2pt <- rowSums(prop_caseNO_table > 0.02)
# Sort the clusters by ascending order of number of individuals that contribute more than 2%
sort(num_individuals_gt_2pt)

## 17 25 19 27  2 29 12 26 30 16 22 28  4 11 18 23 15  1 13 20  5  7  8  9 14 21 
##  1  2  3  3  9  9 11 11 11 12 12 12 13 14 14 14 15 16 16 16 17 17 17 17 17 17 
## 24  3  6 10  0 
## 17 18 18 18 19

#And a general overview of the data
summary(num_individuals_gt_2pt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    11.0    14.0    13.1    17.0    19.0

# Save the ones that are formed by less than 8 individuals that fufill the condition
(clusters_bad <- rownames(prop_caseNO_table)[which(num_individuals_gt_2pt < 8)])

## [1] "17" "19" "25" "27"

“17” “19” “25” "27 are the clusters that stand out. There are not surprisingly the same clusters already identified to be formed by very low numbers of individuals with the first parameter.

Plot the proportions for caseNo

# plot a barplot
ggplot(data = prop_caseNO, aes(x = cluster, y = proportion, fill = caseNO)) +
  geom_bar(stat = "identity") + theme_classic() + scale_fill_manual(values=mycoloursP[1:20])

It is also worth keeping in mind the size of the cluster, there are in ascending order (0 is the biggest cluster and 33 the smallest).

Samples instead of individuals

The proportions are again calculated but taking into consideration the samples instead of the individuals

# count how many cells there are in each group and cluster
sum_caseNOtissue_cluster <- table(srt_all$clusters_0.8, srt_all$caseNO_tissue)
# calculate the proportions, for each cluster (margin 1 for rows)
prop_caseNOtissue_table <- prop.table(sum_caseNOtissue_cluster, margin = 1)
# change the format to be a data.frame, this also expands to long formatting
prop_caseNOtissue <- as.data.frame(prop_caseNOtissue_table)
colnames(prop_caseNOtissue) <- c("cluster", "caseNO_tissue", "proportion")
prop_caseNOtissue[prop_caseNOtissue$proportion > 0.3, ]

##      cluster caseNO_tissue proportion
## 354       12   SD016/13_CB  0.3330508
## 359       17   SD016/13_CB  0.9342432
## 367       25   SD016/13_CB  0.9628713
## 495       29  SD022/16_BA4  0.3624161
## 1429       2   SD046/16_CB  0.3517510
## 1443      16   SD046/16_CB  0.3782447
## 1455      28   SD046/16_CB  0.3743590

Most problematic samples are Cerebellum samples.

Keep an eye on 2, 16, 28, where 46/16_CB contributes significantly

UMI counts

FeaturePlot(srt_all, features = "sum", max.cutoff = "q75")

Filter clusters

if (!file.exists(here("processed", "srt_filter_bad_clusters.RDS"))) {
# Keep dimensions
dim_srt_all <-  dim(srt_all)
#Filter the cells from the "bad clusters"
srt_all <- srt_all[, !(srt_all$clusters_0.8 %in% clusters_bad)]
# Save the changes into the metadata
srt_all$clusters_0.8 <- Idents(srt_all)
# Number of cells filtered # 1844
dim_srt_all[2] - dim(srt_all)[2]

saveRDS(srt_all, here("processed", "srt_filter_bad_clusters.RDS"))
}else{
  srt_all <- readRDS(here("processed", "srt_filter_bad_clusters.RDS"))
}

DimPlot(srt_all, label = TRUE, cols = mycoloursP)