Cluster analysis

Set up

# libraries
library(Seurat) # for scrnaseq analyisis

## Attaching SeuratObject

library(here) # for reproducible paths

## here() starts at C:/Users/nbestard/OneDrive/Luise_HCA

library(ggplot2) # plots
library(ggsci)  # colors

# Set up the color palette
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3<-pal_lancet("lanonc", alpha = 0.7)(9)
mypal4<-pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)

mycoloursP<- c(mypal, mypal2, mypal3, mypal4)

Clustering

## Clustering ----
# Only compute if not done before
if (!file.exists(here("processed", "srt_cluster_02.RDS"))) {
  # Load the seurat object
  srt_all <- readRDS(here("processed", "srt_umap_02.RDS"))
  # Cluster with 25 dimensions more details for this choice in srt_dim_reduction.R
  srt_all <- FindNeighbors(srt_all, dims = 1:25)
  srt_all <- FindClusters(srt_all, resolution = 0.8)
  # Save the active ident into the metadata
  srt_all$clusters_0.8 <- srt_all@active.ident
  # Save the object
  saveRDS(srt_all, here("processed", "srt_cluster_02.RDS"))
} else {
  srt_all <- readRDS(here("processed", "srt_cluster_02.RDS"))
  Idents(srt_all) <- srt_all$clusters_0.8
}

DimPlot(srt_all, label = TRUE, cols = mycoloursP)

DimPlot(srt_all, label = TRUE, cols = mycoloursP, group.by = "rough_annot")

Cluster Quality

Plot the case number with colours according to their group and to the clustering

DimPlot(srt_all, split.by = "caseNO", group.by = "clusters_0.8", cols = mycoloursP, ncol = 5)

Individuals per cluster

How many individuals contribute to each cluster?

# count how many cells there are in each group and cluster
sum_caseNO_cluster <- table(srt_all$clusters_0.8, srt_all$caseNO)
# For each cluster (on the rows) sum of individuals that do have cells on that cluster
rowSums(sum_caseNO_cluster > 0)

##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 20 20 20 18 20 20 19 20 20 20 20 20 20 20 16 17 20 17 19 19 17 19 19 20 20 16 
## 26 
## 15

Nothing stands out, maybe 27 that we are thinking in merging either way with 7

Percentage of cells that come from each individual

Some of the clusters might be mainly from one person, even though other subjects do have some cells that cluster with it. To address this question we calculate for each cluster the proportion of cells that come from each caseNO.

# calculate the proportions, for each cluster (margin 1 for rows)
prop_caseNO_table <- prop.table(sum_caseNO_cluster, margin = 1)
# change the format to be a data.frame, this also expands to long formatting
prop_caseNO <- as.data.frame(prop_caseNO_table)
colnames(prop_caseNO) <- c("cluster", "caseNO", "proportion")

Flag the clusters where one of the individuals covers more than 40% of the cluster. The expected would be around 1/20= 5%

prop_caseNO[prop_caseNO$proportion > 0.4, ]

##     cluster   caseNO proportion
## 126      17 SD016/13  0.8566610
## 528      14 SD046/16  0.4037500
## 539      25 SD046/16  0.4020619

Minimum threshold of 2% contribution to count individuals

Another interesting variable is the number of individuals that contribute to more than a certain threshold (2%) to each cluster

# Calcluate for each cluster the number of individuals that fulfil the condition of contributing more than a 2%
num_individuals_gt_2pt <- rowSums(prop_caseNO_table > 0.02)
# Sort the clusters by ascending order of number of individuals that contribute more than 2%
sort(num_individuals_gt_2pt)

## 17  3 15 26 14 20 25 11 16 23  4 13 22  6  9 24  1  5  7  8 12 18 19 21  2 10 
##  3  9 11 11 12 12 12 14 14 14 15 15 15 16 16 16 17 17 17 17 17 17 17 17 18 18 
##  0 
## 19

#And a general overview of the data
summary(num_individuals_gt_2pt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   13.00   16.00   14.67   17.00   19.00

# Save the ones that are formed by less than 8 individuals that fufill the condition
(clusters_bad <- rownames(prop_caseNO_table)[which(num_individuals_gt_2pt < 8)])

## [1] "17"

Plot the proportions for caseNo

# plot a barplot
ggplot(data = prop_caseNO, aes(x = cluster, y = proportion, fill = caseNO)) +
  geom_bar(stat = "identity") + theme_classic() + scale_fill_manual(values=mycoloursP[1:20])

It is also worth keeping in mind the size of the cluster, there are in ascending order (0 is the biggest cluster and 33 the smallest).

Samples instead of individuals

The proportions are again calculated but taking into consideration the samples instead of the individuals

# count how many cells there are in each group and cluster
sum_caseNOtissue_cluster <- table(srt_all$clusters_0.8, srt_all$caseNO_tissue)
# calculate the proportions, for each cluster (margin 1 for rows)
prop_caseNOtissue_table <- prop.table(sum_caseNOtissue_cluster, margin = 1)
# change the format to be a data.frame, this also expands to long formatting
prop_caseNOtissue <- as.data.frame(prop_caseNOtissue_table)
colnames(prop_caseNOtissue) <- c("cluster", "caseNO_tissue", "proportion")
prop_caseNOtissue[prop_caseNOtissue$proportion > 0.3, ]

##      cluster caseNO_tissue proportion
## 315       17   SD016/13_CB  0.6458685
## 1246       3   SD046/16_CB  0.3493789
## 1257      14   SD046/16_CB  0.3887500
## 1268      25   SD046/16_CB  0.3762887

Conclusion

Redoing the dimensional reduction a new microglia cluster has appear with more than 80% of cells from only one individual and formed by less than 3 individuals that contribute to the cluster at more than 2%.

We will delete this cluster, but because it is only one small cluster we will not redo the dimensionality reduction again.

Filter clusters

if (!file.exists(here("processed", "srt_filter_bad_clusters_02.RDS"))) {
# Keep dimensions
dim_srt_all <-  dim(srt_all)
#Filter the cells from the "bad clusters"
srt_all <- srt_all[, !(srt_all$clusters_0.8 %in% clusters_bad)]
# Save the changes into the metadata
srt_all$clusters_0.8 <- Idents(srt_all)
# Number of cells filtered # 1844
dim_srt_all[2] - dim(srt_all)[2]

saveRDS(srt_all, here("processed", "srt_filter_bad_clusters_02.RDS"))
}else{
  srt_all <- readRDS(here("processed", "srt_filter_bad_clusters_02.RDS"))
}

DimPlot(srt_all, label = TRUE, cols = mycoloursP)