Clustering

Set-up

The workflow and explanations bellow are from OSCA

library(SingleCellExperiment)
library(here) #reproducible paths
library(scater) # Plot dimred
library(bluster) # Clustering 
library(clustree) # show relationship clustering
#library(Seurat) # clusternig 2
library(pals) # for palettes with large n #kelly()22, #polychrome()#36, unname(alphabet())
library(Seurat) # to test other clustering

project<- "fire-mice"

if(!file.exists(here("processed", project, "sce_clusters_02.RDS"))){
sce <- readRDS(here("processed", project, "sce_corrected_02.RDS"))
} else {
  sce <- readRDS(here("processed", project, "sce_clusters_02.RDS"))
}

functions

# plot all the seurat or sce dim reduction
plot_list_func <- function(obj,
                           col_pattern="RNA_snn_res.", 
                           plot_cols, 
                           clust_lab = TRUE,
                           label_size = 1,
                           num_col = 4,
                           reduction = "umap",
                           # todo add a save FALSE
                           save_dir = getwd(),
                           width=7,
                           height=5){
  #TODO: add colours if not colours added
if(class(obj)[1] == "Seurat"){
  extr_res_col <- grep(pattern = col_pattern, names(obj@meta.data))
  res_names <- names(obj@meta.data[extr_res_col])
  # gtools function, sorts gene_names alphanumeric:
  res_names <- gtools::mixedsort(res_names) 
  for(i in 1: length(res_names)){
  #pdf(paste0(save_dir, 
    #           res_names[i], "_umap.pdf"), width=width, height=height)
  dim_plot <- DimPlot(obj, 
                          reduction = reduction, 
                          cols= plot_cols,
                          group.by = res_names[i],
                          label = clust_lab,
                          label.size = label_size) + NoLegend()
  print(dim_plot)
 # dev.off()
 # print(dim_plot)
  }
}else if (class(obj)[1] == "SingleCellExperiment"){
  names <- grep(pattern = col_pattern, names(colData(sce)), value = TRUE)
  ks <- stringr::str_remove(string = names, pattern = col_pattern)
  # gtools function, sorts gene_names alphanumeric:
  names <- gtools::mixedsort(names) 
  # plot directly from the sce_clusters_02.RDS
  idx <- c(1:length(names))
  for (i in idx) {
    k <- ks[i]
    cluster_name <- names[i]
    plotDim <-
      plotReducedDim(
        obj,
        dimred = reduction,
        colour_by = cluster_name,
        point_size = label_size,
        point_alpha = 0.3,
        text_by = cluster_name,
        text_size = 3
      )  +  scale_color_manual(values = plot_cols) + ggtitle(paste("Clustering with resolution/k =", k))
    print(plotDim)
  }
}else{
  stop("object must be seurat or singlecellexperiment")
}
  
}

Motivation

Clustering is an unsupervised learning procedure that is used in scRNA-seq data analysis to empirically define groups of cells with similar expression profiles. It is worth stressing the distinction between clusters and cell types. The former is an empirical construct while the latter is a biological truth (albeit a vaguely defined one). For this reason, questions like “what is the true number of clusters?” are usually meaningless. We can define as many clusters as we like, with whatever algorithm we like - each clustering will represent its own partitioning of the high-dimensional expression space, and is as “real” as any other clustering. It is helpful to realize that clustering, like a microscope, is simply a tool to explore the data. We can zoom in and out by changing the resolution of the clustering parameters, and we can experiment with different clustering algorithms to obtain alternative perspectives of the data.

With Bioconductor

# choose the resolutions to compute
ks <- c(5, 10, 20, 30, 40, 50, 60)
names <- paste0("cluster_k", as.character(ks))

#only run if first time
if (!file.exists(here("processed", project, "clusters_SNN_k5-k60_02.RDS"))) {
  # Compute all the cluster resolutions
  #  and save in a list with an appropriate name for later use
  clusters_list <-
    mapply(function(cluster_name, k) {
      clusterRows(reducedDim(sce, "corrected"), NNGraphParam(k = k))
    },
    names,
    ks,
    USE.NAMES = TRUE,
    SIMPLIFY = FALSE)
  saveRDS(clusters_list,
          here("processed", project, "clusters_SNN_k5-k60_02.RDS"))
} else{
  clusters_list <-
    readRDS(here("processed", project, "clusters_SNN_k5-k60_02.RDS"))
}

# only save the new sce if the object is not already here
if(!file.exists((here("processed", project, "sce_clusters_02.RDS")))){
  
  # Use the list to store the info in the sce and plot result
  idx <- c(1:length(names))
  for (i in idx) {
    k <- ks[i]
    cluster_name <- names[i]
    # remove previous versions of this clustering 
    colData(sce)[[cluster_name]] <- NULL
    colData(sce) <- cbind(colData(sce), clusters_list[cluster_name])
    # and plot the result
    plotDim <-
      plotReducedDim(
        sce,
        "TSNE",
        colour_by = cluster_name,
        point_size = 0.5,
        point_alpha = 0.3,
        text_by = cluster_name,
        text_size = 3
      )  + scale_color_hue() + ggtitle(paste("Clustering with k =", k))
    print(plotDim)
  }
  # removed from here, just save after all clustering methods
#  saveRDS(sce, here("processed", project, "sce_clusters_02.RDS"))
  
} else{
  # plot directly from the sce_clusters_02.RDS
  plot_list_func(sce, 
                 col_pattern = "cluster_k", 
                 plot_cols = c( unname(alphabet()),unname(alphabet2()), unname(polychrome()), unname(watlington()), unname(glasbey())),
                 label_size = 0.5,
                 reduction="TSNE",
                 
  )
  
}

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster tree

We use this visualisation to see the relationships between the clusters. The aim is to capture the redistribution of cells from one clustering to another at progressively higher resolutions, providing a convenient depiction of how clusters merge or split apart.

clustree(sce, prefix = "cluster_k", edge_arrow = FALSE)

## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

With Seurat

#only run if first time
if (!file.exists( here("processed", project, "srt_clusters_02.RDS"))) {
srt <- as.Seurat(sce)
# delete old clustering
srt_cluster_names <- grep("originalexp_snn_res", names(srt@meta.data), value = TRUE)
for(cluster in srt_cluster_names){
  srt[[cluster]]<-NULL
}
srt <- FindNeighbors(srt, reduction = "corrected", dims = 1:23)
srt <- FindClusters(srt, 
                    resolution = c(0.01, 0.04, 0.05, 
                                 seq(from = 0.1, to = 1, by = 0.1))
                   )
srt_cluster_names <- grep("originalexp_snn_res", names(srt@meta.data), value = TRUE)
srt_clusters_metadata <- srt[[srt_cluster_names]]

saveRDS(srt_clusters_metadata, here("processed", project, "srt_clusters_02.RDS"))

plot_list_func(srt, 
               col_pattern="originalexp_snn_res.", 
               plot_cols = c( unname(polychrome()), unname(watlington())),
               reduction = "TSNE", 
               label_size = 4 
               )

DimPlot(srt, reduction = "TSNE", group.by = "originalexp_snn_res.1", label = TRUE)

} else{
  srt_clusters_metadata <-
    readRDS( here("processed", project, "srt_clusters_02.RDS"))
}

clustree(srt_clusters_metadata, prefix = "originalexp_snn_res.", edge_arrow = FALSE)

save all clusterings in the same object

if (!file.exists( here("processed", project, "sce_clusters_02.RDS"))) {
# check the cell names are the same
identical(row.names(colData(sce)), row.names(srt_clusters_metadata))

# add them
colData(sce) <- cbind(colData(sce), srt_clusters_metadata)

## save
saveRDS(sce, here("processed", project, "sce_clusters_02.RDS"))
}

#plot seurat clustering
plot_list_func(sce, 
               col_pattern="originalexp_snn_res.", 
               plot_cols = c( unname(polychrome()), unname(watlington())),
               reduction = "TSNE", 
               label_size = 0.5 
               )

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

TODO:

things that I do not trust: - the seurat plot does not have the same clusters in the tree and plots - the sce was plotted duplicated and with the previous notation. This is corrected now.