Analysis of LSCx_v1_20210125 for Fig.6

1. Set up
2. Input Seurat object
3. Initial check
4. Clustify

1. Set up

library(biomaRt)
library(knitr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks biomaRt::select()

library(dplyr)
library(Seurat)

## Registered S3 method overwritten by 'spatstat':
##   method     from
##   print.boxx cli

## Attaching SeuratObject

#remotes::install_github("rnabioco/clustifyrdata")
library(clustifyr)
library(RColorBrewer)
library(gprofiler2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggpubr)
library(jpeg)

# #Install monocle3
# First in terminal do: brew install --build-from-source gcc
# BiocManager::install(c('BiocGenerics', 'DelayedArray', 'DelayedMatrixStats',
#                        'limma', 'S4Vectors', 'SingleCellExperiment',
#                        'SummarizedExperiment', 'batchelor', 'Matrix.utils'))
# devtools::install_github('cole-trapnell-lab/leidenbase')
# devtools::install_github('cole-trapnell-lab/monocle3')
# remotes::install_github('satijalab/seurat-wrappers')
library(leidenbase)
library(monocle3)

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: SingleCellExperiment

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:purrr':
## 
##     reduce

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'SummarizedExperiment'

## The following object is masked from 'package:SeuratObject':
## 
##     Assays

## The following object is masked from 'package:Seurat':
## 
##     Assays

## 
## Attaching package: 'monocle3'

## The following objects are masked from 'package:Biobase':
## 
##     exprs, fData, fData<-, pData, pData<-

library(SeuratWrappers)

2. Input Seurat object

# Read in pre-processed and harmonlized object by Austin
lscx <- readRDS("~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/from_Austin/LSCx_version1_harmonized_clustered_downsampled.rds") # to make sure all of us are starting on same object.

# change the orig.ident to a cool name
lscx@meta.data$orig.ident <- "lscx"

# Read in csv file storing new meta.data features needed for fig.6
meta.suppl <- read.csv("~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/ssp_analysis/lscx_meta_clinic_supple_v1.csv")

# Add in new features into the meta.data file
cell.id <- rownames(lscx@meta.data) #store cell id
lscx@meta.data <- left_join(lscx@meta.data, meta.suppl, by = "sample") # left_join new features into meta.data table
rownames(lscx@meta.data) <- cell.id # add back cell id

# Get some useful variables
type.lsc <- unique(lscx$type.lsc)
type.pheno <- unique(lscx$type.pheno)
type.tissue <- unique(lscx$type.tissue)
type.disease <- unique(lscx$type.disease)
sample <- unique(lscx$sample)

# get adt
adt <- purrr::map_chr(rownames(lscx@assays$ADT@counts), ~ paste0("adt_", .)) # add "adt_" in front of all antigens. 

# Check new features added
type.lsc

## [1] "unknown"  "Type-A"   "Type-B"   "Type-B-R" "Normal"

type.pheno

## [1] "Prim"    "Mono"    "unknown" "MMP"     "Normal"

type.tissue

## [1] "AML" "HTB" "NBM"

type.disease

## [1] "Rl-chemo" "Dx"       "unknown"  "Rl-ven"   "Normal"

#saveRDS(lscx, "~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/ssp_analysis/LSCx_version1_downsampled_20210125_harmonized_clustered_ssp.Rds")

3. Initial check

# According to Austin, $umap is the reduction to use and it is in fact the harmony-based umap
# despite the "goofy" name, he said...

# Plot all adt markers to get an intial sense of the clusters
lscx %>%
  SetIdent(., value = "type.tissue") %>%
  subset(., idents = c("AML", "HTB")) %>%
  FeaturePlot(., 
              adt,
              reduction = "umap",
              min.cutoff = "q5", 
              max.cutoff = "q95",
              pt.size = 0.1)

## Warning in FeaturePlot(., adt, reduction = "umap", min.cutoff = "q5", max.cutoff
## = "q95", : All cells have the same value (0) of adt_CD70.

## Warning in FeaturePlot(., adt, reduction = "umap", min.cutoff = "q5", max.cutoff
## = "q95", : All cells have the same value (0) of adt_CD27.

# DimPlot(lscx, reduction = "umap", split.by = "type.lsc") # Austin said always use reduction = "umap" to project the harmonized umap
# DimPlot(lscx, reduction = "umap", split.by = "type.pheno")
# DimPlot(lscx, reduction = "umap", split.by = "type.tissue")
# DimPlot(lscx, reduction = "umap", split.by = "type.disease")

4. Clustify

4.1. Trying different compute methods

# Download hema_microarray data from the DMAP study (Novershtern et al.Cell 2011)
dmap <- clustifyrdatahub::ref_hema_microarray()

## snapshotDate(): 2020-10-27
## snapshotDate(): 2020-10-27

## see ?clustifyrdatahub and browseVignettes('clustifyrdatahub') for documentation

## loading from cache

# clustify using cosine
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0, #let clustify assign threshold 
                rename_prefix = "hema_microarray_cosine",
                compute_method = "cosine", #spearman, pearson, kendall, cosine
                verbose = T)

## using # of genes: 1223

## using threshold of 0.5

unique(res$hema_microarray_cosine_type)

## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Monocyte"                              
## [3] "Erythroid_CD34+ CD71+ GlyA-"           
## [4] "Naïve B-cells"                         
## [5] "CD8+ Effector Memory"                  
## [6] "Erythroid_CD34- CD71+ GlyA+"           
## [7] "unassigned"

# clustify using spearman
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0, #let clustify assign threshold 
                rename_prefix = "hema_microarray_spearman",
                compute_method = "spearman", #spearman, pearson, kendall, cosine
                verbose = T)

## using # of genes: 1223

## using threshold of 0.54

unique(res$hema_microarray_spearman_type)

## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Monocyte"                              
## [3] "Naïve B-cells"                         
## [4] "CD8+ Effector Memory RA"               
## [5] "Erythroid_CD34- CD71+ GlyA-"           
## [6] "unassigned"                            
## [7] "CD4+ Central Memory"

# clustify using kendall
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0, #let clustify assign threshold 
                rename_prefix = "hema_microarray_kendall",
                compute_method = "kendall", #spearman, pearson, kendall, cosine
                verbose = T)

## using # of genes: 1223

## using threshold of 0.39

unique(res$hema_microarray_kendall_type)

## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Monocyte"                              
## [3] "Naïve B-cells"                         
## [4] "CD8+ Effector Memory RA"               
## [5] "Erythroid_CD34- CD71+ GlyA-"           
## [6] "unassigned"                            
## [7] "CD4+ Central Memory"

# clustify using pearson
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0.2, #let clustify assign threshold 
                rename_prefix = "hema_microarray_pearson",
                compute_method = "pearson", #spearman, pearson, kendall, cosine
                verbose = T)

## using # of genes: 1223

## using threshold of 0.52

unique(res$hema_microarray_pearson_type)

## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Myeloid Dendritic Cell"                
## [3] "unassigned"                            
## [4] "Naïve B-cells"                         
## [5] "Monocyte"                              
## [6] "CD8+ Effector Memory"                  
## [7] "Erythroid_CD34- CD71lo GlyA+"          
## [8] "Mature NK cell_CD56- CD16+ CD3-"       
## [9] "Naive CD4+ T-cell"

# pearson seems to give us best assignments at auto-assigned threshold

4.2. Trying different threshold settings using the pearson method

# Set threshold to many values and test their influence on assignments
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                threshold = 0.1, # threshold settings 0.1-0.4 gave same best results
                rename_prefix = "hema_microarray_pearson_0.1",
                compute_method = "pearson", #spearman, pearson, kendall, cosine
                verbose = T)

## using # of genes: 1223

# Check the assignments
res@meta.data %>%
  as_tibble() %>% # drop the rown names
  dplyr::select(c("sct_pca_snn_res.0.2", "seurat_clusters", "hema_microarray_pearson_0.1_type")) %>%
  unique() %>%
  arrange(seurat_clusters)

## # A tibble: 11 x 3
##    sct_pca_snn_res.0.2 seurat_clusters hema_microarray_pearson_0.1_type        
##    <fct>               <chr>           <chr>                                   
##  1 0                   0               "Monocyte"                              
##  2 1                   1               "Hematopoietic stem cell_CD133+ CD34dim"
##  3 10                  10              "Naive CD4+ T-cell"                     
##  4 2                   2               "Colony Forming Unit-Monocyte "         
##  5 3                   3               "CD8+ Effector Memory"                  
##  6 4                   4               "Myeloid Dendritic Cell"                
##  7 5                   5               "Erythroid_CD34+ CD71+ GlyA-"           
##  8 6                   6               "Naïve B-cells"                         
##  9 7                   7               "Erythroid_CD34- CD71lo GlyA+"          
## 10 8                   8               "Mature NK cell_CD56- CD16+ CD3-"       
## 11 9                   9               "Hematopoietic stem cell_CD133+ CD34dim"

# Display clusters 
DimPlot(res, reduction = "umap", 
        group.by = "hema_microarray_pearson_0.1_type")

4.3. Simplify cluster names

current.cluster.names <- unique(res$hema_microarray_pearson_0.1_type)

# Full names
new.cluster.names <- c("Primitive", #"Hematopoietic stem cell_CD133+ CD34dim",
                       "Myeloid Dendritic Cell",# "Myeloid Dendritic Cell",
                       "Erythroid (CD34+,CD71+,GlyA-)", # "Erythroid_CD34+ CD71+ GlyA-",
                       "CFU-Mono",#"Colony Forming Unit-Monocyte ",
                       "B-cells", # "Naïve B-cells",
                       "Monocytic", #"Monocyte"
                       "CD8+ T-cells",# "CD8+ Effector Memory",
                       "Erythroid (CD34-,CD71lo,GlyA+)", # "Erythroid_CD34- CD71lo GlyA+",
                       "NK-cells",# "Mature NK cell_CD56- CD16+ CD3-"
                        "CD4+ T-cells")# "Naive CD4+ T-cell",
# Short names
short.cluster.names <- c("Prim (CD34-dim)", #"Hematopoietic stem cell_CD133+ CD34dim",
                       "MDC",# "Myeloid Dendritic Cell",
                       "Prim (CD34-high)", # "Erythroid_CD34+ CD71+ GlyA-",
                       "CFU-Mono",#"Colony Forming Unit-Monocyte ",
                       "B", # "Naïve B-cells",
                       "Mono", #"Monocyte"
                       "CD8-T",# "CD8+ Effector Memory",
                       "Ery", # "Erythroid_CD34- CD71lo GlyA+",
                       "NK?",# "Mature NK cell_CD56- CD16+ CD3-"
                        "CD4-T")# "Naive CD4+ T-cell",

res$Clusters <- plyr::mapvalues(x=res$hema_microarray_pearson_0.1_type,
                                                        from = current.cluster.names,
                                                        to = new.cluster.names)# Check the assignments
res$Subpopulations <- plyr::mapvalues(x=res$hema_microarray_pearson_0.1_type,
                                                        from = current.cluster.names,
                                                        to = short.cluster.names)# Check the assignments

res@meta.data %>%
  as_tibble() %>% # drop the rown names
  dplyr::select(c("seurat_clusters", "hema_microarray_pearson_0.1_type", "Clusters", "Subpopulations")) %>%
  unique() %>%
  arrange(seurat_clusters)

## # A tibble: 11 x 4
##    seurat_clusters hema_microarray_pearson_0.… Clusters           Subpopulations
##    <chr>           <chr>                       <chr>              <chr>         
##  1 0               "Monocyte"                  Monocytic          Mono          
##  2 1               "Hematopoietic stem cell_C… Primitive          Prim (CD34-di…
##  3 10              "Naive CD4+ T-cell"         CD4+ T-cells       CD4-T         
##  4 2               "Colony Forming Unit-Monoc… CFU-Mono           CFU-Mono      
##  5 3               "CD8+ Effector Memory"      CD8+ T-cells       CD8-T         
##  6 4               "Myeloid Dendritic Cell"    Myeloid Dendritic… MDC           
##  7 5               "Erythroid_CD34+ CD71+ Gly… Erythroid (CD34+,… Prim (CD34-hi…
##  8 6               "Naïve B-cells"             B-cells            B             
##  9 7               "Erythroid_CD34- CD71lo Gl… Erythroid (CD34-,… Ery           
## 10 8               "Mature NK cell_CD56- CD16… NK-cells           NK?           
## 11 9               "Hematopoietic stem cell_C… Primitive          Prim (CD34-di…

# Display clusters 
DimPlot(res, reduction = "umap", 
        group.by = "Subpopulations")

4.4. Assgin new color schemes for the clusters

# Full names
level.clusters <- c("Monocytic",
                "CFU-Mono",
                "Myeloid Dendritic Cell",
                "Primitive",
                "Erythroid (CD34+,CD71+,GlyA-)",
                "Erythroid (CD34-,CD71lo,GlyA+)",
                "B-cells",
                "CD4+ T-cells",
                "CD8+ T-cells",
                "NK-cells"
                )

# short names
level.Subpopulations <- c("Mono",
                "CFU-Mono",
                "MDC",
                "Prim (CD34-dim)",
                "Prim (CD34-high)",
                "Ery",
                "B",
                "CD4-T",
                "CD8-T",
                "NK?"
                )

res$Clusters <- factor(res$Clusters, levels = level.clusters)
res$Subpopulations <- factor(res$Subpopulations, levels = level.Subpopulations)

palette.lscx <- c( "#b30024", #"Monocyte",
                  "#fa1e20",#"Colony Forming Unit-Monocyte ",
                   "#ff7f00", #"Myeloid Dendritic Cell",
                   "#0b47d4",#"Hematopoietic stem cell_CD133+ CD34dim",
                   "#5a95d1", #"Erythroid_CD34+ CD71+ GlyA-"
                  "#f28ac6", #"Erythroid_CD34- CD71lo GlyA+ "
                   "#20b2aa", #"Naïve B-cells",
                   "#1cad15", # "Naive CD4+ T-cell",
                   "#b2df8a", #"CD8+ Effector Memory",
                   "#d8db02") #"Mature NK cell_CD56- CD16+ CD3-"

show_col(palette.lscx)

DimPlot(res, reduction = "umap", 
        group.by = "Subpopulations", 
        #split.by = "type.lsc",
        cols = palette.lscx,
        label = T)

VlnPlot(res, "CD34", group.by = "Subpopulations", cols = palette.lscx) # Primitive marker

VlnPlot(res, "KIT", group.by = "Subpopulations", cols = palette.lscx) # Primitive marker

VlnPlot(res, "GYPA", group.by = "Subpopulations", cols = palette.lscx) # Erythroid marker Glycophorin A

VlnPlot(res, "adt_CD4", group.by = "Subpopulations", cols = palette.lscx) # T cell marker CD4

VlnPlot(res, "CD8A", group.by = "Subpopulations", cols = palette.lscx) # T cell marker CD8

VlnPlot(res, "adt_CD19", group.by = "Subpopulations", cols = palette.lscx) # B cell marker CD19

VlnPlot(res, "NLRP3", group.by = "Subpopulations", cols = palette.lscx)

res@meta.data %>%
  as_tibble() %>%
  dplyr::select(htb, type.lsc, type.pheno, type.tissue, type.disease) %>%
  unique() %>%
  DT::datatable()

saveRDS(res, "~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/ssp_analysis/SSP_LSCx_version1_downsampled_20210125_harmonized_clustered_res.Rds")

Analysis of LSCx_v1_20210125 for Fig.6

09 February, 2021

1. Set up

2. Input Seurat object

3. Initial check

4. Clustify

4.1. Trying different compute methods

4.2. Trying different threshold settings using the pearson method

4.3. Simplify cluster names

4.4. Assgin new color schemes for the clusters