What are these clusters?

In this Document I aim to analyse the differences between the clusters. I will focus on the clusters that have been spotted of interest from sex_age_tissue_diff.Rmd

Setup

# libraries
library(Seurat) # for scrnaseq analyisis

## Attaching SeuratObject

library(here) # for reproducible paths

## here() starts at C:/Users/nbestard/OneDrive/Luise_HCA

library(dplyr) # manipulated df (rename)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) # plots
library(patchwork) # for adding customs to grid-composed Seurat plots

# load last object from sex_age_tissue_diff_03.Rmd
srt_all <- readRDS(here("processed", "srt_anno_01.RDS"))
Idents(srt_all) <- srt_all$clusters_named

Quality

Visually check by plotting the data by colouring it for mitochondrial count or UMI/ gene count

Idents(srt_all) <- srt_all$clusters_named
FeaturePlot(srt_all, features = "detected") + labs(title="Detected gene number")

FeaturePlot(srt_all, features = "sum" ) + labs(title="Total UMI counts")

# Wich mito is true? Total_mt_pc, high_subsets_mito_percent, percent.mt, $total_percent_mito
FeaturePlot(srt_all, features = "total_percent_mito") +
  labs(title="Mitochondiral percentage content")

Top genes

To see the top genes for the clusters of interest, first calculate for each gene the expression for an ‘average’ single cell in each cluster

# Create an object only with the average expression
if(!file.exists(here("processed","srt_average_expression.RDS"))){
  # Calculate average expression
  average_expresion <- AverageExpression(srt_all, assays="RNA", group.by = "clusters_named")
  # also save to a seurat object to facilitate plotting
  srt_average_expresion <- AverageExpression(srt_all, assays="RNA", return.seurat = TRUE, group.by = "clusters_named")
  # Making sure the clusters are sorted as the srt_all
  Idents(srt_average_expresion) <- 
  factor(srt_average_expresion$clusters_named,
    levels = levels(srt_all$clusters_named)
    )
  srt_average_expresion$clusters_named <- Idents(srt_average_expresion)
  
  saveRDS(srt_average_expresion, here("processed","srt_average_expression.RDS"))
  write.csv(average_expresion, here("outs", "average_expression.csv"))
  
} else{
 srt_average_expresion <- readRDS(here("processed","srt_average_expression.RDS"))
}

Scale data for visualisation

In previous steps we only scaled the variable features, now we are interested in scaling all the genes, as some non highly variable genes may come up from a DE analysis, and for visualisation using scaled genes can help.

srt_all <- ScaleData(srt_all, features = rownames(srt_all))

## Centering and scaling data matrix

Differential expression

For all CellTypes, differences between the subgroups.

if(!file.exists(here("outs", "de_Stromal_all.csv"))) {
  # Create a separate seurato object for each cell type
  srt_celltype_list <- SplitObject(srt_all, split.by = "celltype")
  names_celltype <- names(srt_celltype_list)
  # find markers for each celltype clustering
  mapply(
    FUN = function(srt, name) {
      if (!name %in% c("OPC", "Oligo", "Immune")){
      # Find markers
      markers <- FindAllMarkers(srt, test.use = "MAST", only.pos = TRUE)
      # Select markers of interest
      top10_markers <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
      top100_markers <- markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)
  
      # Plot markers of interest
  
      top10_plot <- DoHeatmap(srt, features = top10_markers$gene, size = 4) + NoLegend() + 
                      ggtitle("Top 10 differentially expressed")
      
      # Save the objects
      write.csv(markers, here("outs", paste0("de_", name, "_all.csv")))
      write.csv(top100_markers, here("outs", paste0("de_", name, "_top100.csv")))

      pdf(here("outs", paste0("de_", name, "_top10.pdf")))
      print(top10_plot)
      dev.off()
      }
      
    },
    srt_celltype_list, #seurat objects
    names_celltype # name of the celltype
  )
}

Add some pairwise comparisons

# Function for pariwise comparisons
pairwise_mark <- function(seur_obj, 
                             int_cols,
                             clust_id_list,
                             only_pos = TRUE,
                             min_pct = 0.25,
                             logfc_threshold = 0.25,
                             fil_pct_1 = 0.25,
                             fil_pct_2 = 0.1,
                             save_dir = getwd(),
                             test_use = "MAST"){
  for(k in 1:length(int_cols)){
    Idents(seur_obj) <- int_cols[k]
    for(i in 1: length(clust_id_list)){
      clust_mark <- FindMarkers(seur_obj, 
                                ident.1 = clust_id_list[[i]][[1]], 
                                ident.2 = clust_id_list[[i]][[2]],
                                min.pct = min_pct, 
                                test.use = test_use)
      clust_mark$cluster <- clust_id_list[[i]][[1]]
      clust_mark$comp_to_clust <- clust_id_list[[i]][[2]]
      # I modify the write csv compared with luise function to match the ones already done
      write.csv(clust_mark, 
                paste(save_dir, 
                      "/", 
                      "de_pairwise_",
                     # int_cols[k],
                      clust_id_list[[i]][[1]],
                      "_",
                      clust_id_list[[i]][[2]],
                      ".csv", sep = "" ))
    }
  }
}
# if not run yet do the comparisons
if(!file.exists(here("outs", "de_pairwise_Astrocyte_1_Astrocyte_2.csv"))) {
  clust_id_list <- list(list("Astrocyte_1", "Astrocyte_2"), 
                        list("Neuron_In_1", "Neuron_In_2"), 
                        list("Neuron_In_2", "Neuron_In_3"), 
                        list("Neuron_In_1", "Neuron_In_3"),
                        list("Neuron_Ex_1", "Neuron_Ex_2"), 
                        list("Neuron_Ex_2", "Neuron_Ex_3"), 
                        list("Neuron_Ex_1", "Neuron_Ex_3"))
  pairwise_mark(srt_all, 
                int_cols = "clusters_named",
                save_dir = here("outs"),
                clust_id_list = clust_id_list)
}

Filter marker genes

With a minimum de averarage log fold change of 1.2, present in more than 25 % of cells in the cluster of interest and less than 60 % in the others. Then taking the top 10 of each group.

gen_mark_list <-function(file_dir = getwd(),
                         pattern = "*.csv",
                         avg_log = 1.2,
                         pct_1 = 0.25,
                         pct_2 = 0.6,
                         pairwise = FALSE
                         ){
  temp = list.files(path = file_dir,
                    pattern= pattern)
  myfiles = lapply(paste(file_dir, temp, sep = "/"), read.csv)
  
  for(i in 1:length(myfiles)){
    dat <- myfiles[[i]]
    av_log_fil <- subset(dat, dat$avg_log2FC > avg_log & 
                       dat$pct.1 > pct_1 & 
                       dat$pct.2 < pct_2)
    if(pairwise == TRUE){
      
      top10 <- av_log_fil %>% top_n(10, avg_log2FC)
      top10$gene <- top10$X
    }else{
    av_log_fil$cluster <- as.character(av_log_fil$cluster)
    top10 <- av_log_fil %>% group_by(cluster) %>% 
      top_n(10, avg_log2FC)
    }
    
    if(i ==1){
      fil_genes <- top10
    }else{
      fil_genes <- rbind(fil_genes, top10)
    }
    
    fil_genes <- fil_genes[!duplicated(fil_genes$gene),]
    
  }
  
  return(fil_genes)
}

fil_genes <- gen_mark_list(file_dir = here("outs"), pattern = "all.*csv")
fil_genes_pw <- gen_mark_list(file_dir = here("outs"), pattern = "de_pairwise_.*csv", pairwise = TRUE)
int_genes <- c(fil_genes$gene, fil_genes_pw$gene)
int_genes <- unique(int_genes)

Plot marker genes

This heatmap does summarise all the DE done so far (filtered): first the celltype to celltype, then in each celltype each group against the others and then, at the end, the pairwise comparisons.

hm_av <- DoHeatmap(object =   srt_average_expresion, 
          features = int_genes, 
          label = TRUE,
          group.by = "clusters_named",
         # group.colors = mycoloursP[6:40],
          draw.lines = F)

## Warning in DoHeatmap(object = srt_average_expresion, features = int_genes, : The
## following features were omitted as they were not found in the scale.data slot
## for the RNA assay: NA

hm_av

Astrocytes

Click to expand

Plot the 4 astrocyte clusters alone

srt_astro <- subset(srt_all, idents = c("Astrocyte_3", "Astrocyte_1", "Astrocyte_2", "Astrocyte_4"))
DimPlot(srt_astro)

Oligo-Astrocytes

Plot the the Astrocytes and Oligodendrocyte markers to visualize this subpopulation

FeaturePlot(srt_astro, features = c("GJA1","AQP4", "GLUL", "SOX9", "NDRG2", "GFAP", "ALDH1A1", "ALDH1L1", "VIM"))

# Oligodendrocytes
FeaturePlot(srt_astro, features = c("PLP1", "CNP", "MAG", "MOG", "MOBP", "MBP", "SOX10" ), ncol = 3) + 
  plot_annotation(title = "Oligodendrocytes")

Reactive markers

, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3480225/

We confirmed the induction of astrocyte reactive gliosis in two complementary brain injury models: focal ischemic stroke produced by transient MCAO and neuroinflammation induced by systemic LPS injection.

FeaturePlot(srt_astro, features = c("GFAP", "VIM", "NES", "SERPINA3", "LCN2", "FGL2", "TAPBP"))

## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: SERPINA3, LCN2

established markers of reactive astrocytes: GFAP“,”VIM“,”NES“, Vimentin should be very low in control”Lcn2 and Serpina3n could not be detected in healthy brain sections ", here either MCAO reactive astrocytes and LPS reactive astrocytes express differing levels of extracellular binding/adhesion/modification genes: FGL2 enes within the antigen presentation pathway and peptide processing and MHC association

These reactive markers do not seem to collocalise with any of the Astrocyte cellstates.

Picking markers

These markers are chosen from the list in the heatmap above, trying to spot genes that are only expressed in the group of interest and not in others.

Astorcyte 1

VlnPlot(srt_astro, features= c("LUZP2","TRPM3","SHISA6","SLC6A11", "RGS6"))

FeaturePlot(srt_all, features = c("LUZP2","TRPM3","SHISA6","SLC6A11", "RGS6"))

VlnPlot(srt_all,features = c("LUZP2","TRPM3","SHISA6","SLC6A11", "RGS6"))

SLC6A11 is a very clear marker for Astrocyte 1, however it is only present in a subpopulation. SHISA6 better represents the whole Asctrocyte 1 state, eventhough it is also slighly expressed in neurons.

Astrocyte 2

VlnPlot(srt_astro, features= c("ADGRV1", "AL589740.1", "EMID1", "EPHB1"), ncol = 4)

FeaturePlot(srt_all, features = c("ADGRV1", "AL589740.1", "EMID1", "EPHB1"))

VlnPlot(srt_all,features = c("ADGRV1", "EMID1", "EPHB1"), ncol = 3)

Astorcyte 3

Astroycte 3 contains a subpopulation of Astro-Oligos

VlnPlot(srt_astro, features= c("MBP","CHI3L1", "MTURN", "PAQR6", "RPL37A"))

FeaturePlot(srt_all, features = c("MBP","CHI3L1", "MTURN",  "PAQR6",  "RPL37A"))

Astrocyte 4

“DNAH9” This gene encodes a ciliary outer dynein arm protein and is a member of the dynein heavy chain family. It is a microtubule-dependent motor ATPase and has been reported to be involved in the movement of respiratory cilia. This gene encodes a member of the cilia- and flagella-associated protein family. Cilia And Flagella Associated Protein 43

VlnPlot(srt_astro, features= c("DNAH9",
"SPAG17",
"DNAH11",
"CFAP43",
"DNAH9"))

FeaturePlot(srt_all, features =c("DNAH9",
"SPAG17",
"DNAH11",
"CFAP43",
"DNAH9"))

Neurons

Click to expand

Exitatory Neurons

Plot the 3 Excitatory neurons clusters alone

srt_neu_ex <- subset(srt_all, idents = c("Neuron_Ex_1","Neuron_Ex_2","Neuron_Ex_3" ))
DimPlot(srt_neu_ex)

Neuron Excitatory 1

VlnPlot(srt_neu_ex,
        features = 
          #  DE with the other ex_neurons
          c( "HS3ST4", "ZFHX3","KIAA1217", "ZNF385D",
            # from all markers heatmap
            "SGCZ", "MGAT4C", "XKR4",
            # From pairwise with other ex_neurons
            "TLE4", "SEMA3E" 
            ))

FeaturePlot(srt_all, features = c("HS3ST4", "ZFHX3", "KIAA1217", "ZNF385D", "SGCZ", "MGAT4C", "XKR4", "TLE4", "SEMA3E"))

VlnPlot(srt_all, features = c("HS3ST4", "ZFHX3", "KIAA1217", "ZNF385D", "SGCZ", "MGAT4C", "XKR4", "TLE4", "SEMA3E"))

HS3ST4 is the best marker for this excitatory neuron state, it is also present in microglia/macrophages. Some markers such as SGCZ and MGAT4 are only present in a sub-population from Neuron_Ex_1.

Extiatory Neurons 2

VlnPlot(srt_neu_ex,
        features = 
          #  DE with the other ex_neurons
          c("TENM2", "FAM19A1", "SYN3",
            # from all markers heatmap
            "CTNNA2", "PTPRK","ADGRL2", "CNTN5")
        )

FeaturePlot(srt_all, features = c("TENM2", "FAM19A1", "SYN3", "CTNNA2", "PTPRK", "ADGRL2", "CNTN5"))

VlnPlot(srt_all, features = c("TENM2", "FAM19A1", "SYN3", "CTNNA2", "PTPRK", "ADGRL2", "CNTN5"))

FAM19A1 and SYN3 are the markers that best differentiate Neuron ex 2 with the other excitatory neurons. CNTN5 is less expressed in other cell types, but is slightly expressed by excitatory 1 too.

Extiatory Neurons 3

These neurons are more abundant in BA4 than the other tissues. The markers from this group are harder to find, it does not seem to express much genes that are not expressed by the other ex. neurons. Some of the slightly more expressed genes could be related with stress.Such as “CLU”, the protein encoded by this gene is a secreted chaperone, and mitochondrial genes.

VlnPlot(srt_neu_ex,
        features = 
          #  DE with the other ex_neurons
          c("MT-ATP8", "CLU", "EEF1A2",
            # from all markers heatmap
            "ETH1", "ALDOA"), 
        ncol = 2
        )

## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: ETH1

FeaturePlot(srt_all, features = c("MT-ATP8", "CLU", "EEF1A2", "ETH1", "ALDOA"))

## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: ETH1

VlnPlot(srt_all, 
        features = c("MT-ATP8", "CLU", "EEF1A2", "ETH1", "ALDOA"),
        ncol = 2)

## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: ETH1

RELN+ Neurons

Expression plot with granulate cells marker genes

# Searching for markers for Granulate cells
FeaturePlot(srt_all, features = c("CDH15", "CALB2", "RBFOX3", "RELN"))

Should I add Neuron_RELN+_1 (only in CB) vs other Neurons?