# libraries
library(Seurat) # for scrnaseq analyisis
## Attaching SeuratObject
library(here) # for reproducible paths
## here() starts at C:/Users/nbestard/OneDrive/Luise_HCA
library(ggplot2) # plots
library(ggsci) # colors
# Set up the color palette
mypal <- pal_npg("nrc", alpha = 0.7)(10)
mypal2 <-pal_tron("legacy", alpha = 0.7)(7)
mypal3<-pal_lancet("lanonc", alpha = 0.7)(9)
mypal4<-pal_simpsons(palette = c("springfield"), alpha = 0.7)(16)
mypal5 <- pal_rickandmorty(palette = c("schwifty"), alpha = 0.7)(6)
mypal6 <- pal_futurama(palette = c("planetexpress"), alpha = 0.7)(5)
mypal7 <- pal_startrek(palette = c("uniform"), alpha = 0.7)(5)
mycoloursP<- c(mypal, mypal2, mypal3, mypal4)
## Clustering ----
# Only compute if not done before
if (!file.exists(here("processed", "srt_cluster.RDS"))) {
# Load the seurat object
srt_all <- readRDS(here("processed", "srt_umap.RDS"))
# Cluster with 25 dimensions more details for this choice in srt_dim_reduction.R
srt_all <- FindNeighbors(srt_all, dims = 1:25)
srt_all <- FindClusters(srt_all, resolution = 0.8)
# Save the active ident into the metadata
srt_all$clusters_0.8 <- srt_all@active.ident
# Save the object
saveRDS(srt_all, here("processed", "srt_cluster.RDS"))
} else {
srt_all <- readRDS(here("processed", "srt_cluster.RDS"))
Idents(srt_all) <- srt_all$clusters_0.8
}
DimPlot(srt_all, label = TRUE, cols = mycoloursP)
DimPlot(srt_all, label = TRUE, cols = mycoloursP, group.by = "rough_annot")
Plot the case number with colours according to their group and to the clustering
DimPlot(srt_all, split.by = "caseNO", group.by = "clusters_0.8", cols = mycoloursP, ncol = 5)
How many individuals contribute to each cluster?
# count how many cells there are in each group and cluster
sum_caseNO_cluster <- table(srt_all$clusters_0.8, srt_all$caseNO)
# For each cluster (on the rows) sum of individuals that do have cells on that cluster
rowSums(sum_caseNO_cluster > 0)
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 20 20 18 20 20 20 20 20 20 20 20 20 20 20 20 20 16 11 20 3 19 19 17 20 19 5
## 26 27 28 29 30
## 20 3 16 15 15
Already from this first approach three clusters stand out: 19, 25 and 27 are formed by very few individuals. 17 is the next one to be concerned about.
Some of the clusters might be mainly from one person, even though other subjects do have some cells that cluster with it. To address this question we calculate for each cluster the proportion of cells that come from each caseNO.
# calculate the proportions, for each cluster (margin 1 for rows)
prop_caseNO_table <- prop.table(sum_caseNO_cluster, margin = 1)
# change the format to be a data.frame, this also expands to long formatting
prop_caseNO <- as.data.frame(prop_caseNO_table)
colnames(prop_caseNO) <- c("cluster", "caseNO", "proportion")
Flag the clusters where one of the individuals covers more than 40% of the cluster. The expected would be around 1/20= 5%
prop_caseNO[prop_caseNO$proportion > 0.4, ]
## cluster caseNO proportion
## 137 12 SD016/13 0.4533898
## 142 17 SD016/13 0.9751861
## 150 25 SD016/13 0.9628713
## 151 26 SD016/13 0.4195584
## 578 19 SD042/18 0.5973783
## 586 27 SD042/18 0.5833333
There are 4 clusters (12, 17, 25, 26) mainly from 16/13. 17 is formed by oligodendrocytes previously detected as pour quality in preliminary analysis. 17 and 25 are formed by more than 90% from one individual, on the other hand the two others are just above 40% and should not be deleted just because of theis criteria
Then two more clusters are more than a 50% from 1 individual, 19 and 27 are stromal/unidentified clusters from 42/18.
With previous analysis, it seemed that some of the clusters were mainly from a person with sepsis, but this is not the case anymore since we removed 10 problematic samples.
DimPlot(srt_all,
reduction = "umap",
group.by = "CauseOfDeath_category",
cols = mycoloursP[8:18],
pt.size = 0.5
)
The 16/13 that forms 4 clusters is unascertained death
Another interesting variable is the number of individuals that contribute to more than a certain threshold (2%) to each cluster
# Calcluate for each cluster the number of individuals that fulfil the condition of contributing more than a 2%
num_individuals_gt_2pt <- rowSums(prop_caseNO_table > 0.02)
# Sort the clusters by ascending order of number of individuals that contribute more than 2%
sort(num_individuals_gt_2pt)
## 17 25 19 27 2 29 12 26 30 16 22 28 4 11 18 23 15 1 13 20 5 7 8 9 14 21
## 1 2 3 3 9 9 11 11 11 12 12 12 13 14 14 14 15 16 16 16 17 17 17 17 17 17
## 24 3 6 10 0
## 17 18 18 18 19
#And a general overview of the data
summary(num_individuals_gt_2pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 11.0 14.0 13.1 17.0 19.0
# Save the ones that are formed by less than 8 individuals that fufill the condition
(clusters_bad <- rownames(prop_caseNO_table)[which(num_individuals_gt_2pt < 8)])
## [1] "17" "19" "25" "27"
“17” “19” “25” "27 are the clusters that stand out. There are not surprisingly the same clusters already identified to be formed by very low numbers of individuals with the first parameter.
Plot the proportions for caseNo
# plot a barplot
ggplot(data = prop_caseNO, aes(x = cluster, y = proportion, fill = caseNO)) +
geom_bar(stat = "identity") + theme_classic() + scale_fill_manual(values=mycoloursP[1:20])
It is also worth keeping in mind the size of the cluster, there are in ascending order (0 is the biggest cluster and 33 the smallest).
The proportions are again calculated but taking into consideration the samples instead of the individuals
# count how many cells there are in each group and cluster
sum_caseNOtissue_cluster <- table(srt_all$clusters_0.8, srt_all$caseNO_tissue)
# calculate the proportions, for each cluster (margin 1 for rows)
prop_caseNOtissue_table <- prop.table(sum_caseNOtissue_cluster, margin = 1)
# change the format to be a data.frame, this also expands to long formatting
prop_caseNOtissue <- as.data.frame(prop_caseNOtissue_table)
colnames(prop_caseNOtissue) <- c("cluster", "caseNO_tissue", "proportion")
prop_caseNOtissue[prop_caseNOtissue$proportion > 0.3, ]
## cluster caseNO_tissue proportion
## 354 12 SD016/13_CB 0.3330508
## 359 17 SD016/13_CB 0.9342432
## 367 25 SD016/13_CB 0.9628713
## 495 29 SD022/16_BA4 0.3624161
## 1429 2 SD046/16_CB 0.3517510
## 1443 16 SD046/16_CB 0.3782447
## 1455 28 SD046/16_CB 0.3743590
Most problematic samples are Cerebellum samples.
Keep an eye on 2, 16, 28, where 46/16_CB contributes significantly
FeaturePlot(srt_all, features = "sum", max.cutoff = "q75")
if (!file.exists(here("processed", "srt_filter_bad_clusters.RDS"))) {
# Keep dimensions
dim_srt_all <- dim(srt_all)
#Filter the cells from the "bad clusters"
srt_all <- srt_all[, !(srt_all$clusters_0.8 %in% clusters_bad)]
# Save the changes into the metadata
srt_all$clusters_0.8 <- Idents(srt_all)
# Number of cells filtered # 1844
dim_srt_all[2] - dim(srt_all)[2]
saveRDS(srt_all, here("processed", "srt_filter_bad_clusters.RDS"))
}else{
srt_all <- readRDS(here("processed", "srt_filter_bad_clusters.RDS"))
}
DimPlot(srt_all, label = TRUE, cols = mycoloursP)