1. Set up

library(biomaRt)
library(knitr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks biomaRt::select()
library(dplyr)
library(Seurat)
## Registered S3 method overwritten by 'spatstat':
##   method     from
##   print.boxx cli
## Attaching SeuratObject
#remotes::install_github("rnabioco/clustifyrdata")
library(clustifyr)
library(RColorBrewer)
library(gprofiler2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggpubr)
library(jpeg)

# #Install monocle3
# First in terminal do: brew install --build-from-source gcc
# BiocManager::install(c('BiocGenerics', 'DelayedArray', 'DelayedMatrixStats',
#                        'limma', 'S4Vectors', 'SingleCellExperiment',
#                        'SummarizedExperiment', 'batchelor', 'Matrix.utils'))
# devtools::install_github('cole-trapnell-lab/leidenbase')
# devtools::install_github('cole-trapnell-lab/monocle3')
# remotes::install_github('satijalab/seurat-wrappers')
library(leidenbase)
library(monocle3)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
## 
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
## 
##     exprs, fData, fData<-, pData, pData<-
library(SeuratWrappers)

2. Input Seurat object

# Read in pre-processed and harmonlized object by Austin
lscx <- readRDS("~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/from_Austin/LSCx_version1_harmonized_clustered_downsampled.rds") # to make sure all of us are starting on same object.

# change the orig.ident to a cool name
lscx@meta.data$orig.ident <- "lscx"

# Read in csv file storing new meta.data features needed for fig.6
meta.suppl <- read.csv("~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/ssp_analysis/lscx_meta_clinic_supple_v1.csv")

# Add in new features into the meta.data file
cell.id <- rownames(lscx@meta.data) #store cell id
lscx@meta.data <- left_join(lscx@meta.data, meta.suppl, by = "sample") # left_join new features into meta.data table
rownames(lscx@meta.data) <- cell.id # add back cell id

# Get some useful variables
type.lsc <- unique(lscx$type.lsc)
type.pheno <- unique(lscx$type.pheno)
type.tissue <- unique(lscx$type.tissue)
type.disease <- unique(lscx$type.disease)
sample <- unique(lscx$sample)

# get adt
adt <- purrr::map_chr(rownames(lscx@assays$ADT@counts), ~ paste0("adt_", .)) # add "adt_" in front of all antigens. 

# Check new features added
type.lsc
## [1] "unknown"  "Type-A"   "Type-B"   "Type-B-R" "Normal"
type.pheno
## [1] "Prim"    "Mono"    "unknown" "MMP"     "Normal"
type.tissue
## [1] "AML" "HTB" "NBM"
type.disease
## [1] "Rl-chemo" "Dx"       "unknown"  "Rl-ven"   "Normal"
#saveRDS(lscx, "~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/ssp_analysis/LSCx_version1_downsampled_20210125_harmonized_clustered_ssp.Rds")

3. Initial check

# According to Austin, $umap is the reduction to use and it is in fact the harmony-based umap
# despite the "goofy" name, he said...

# Plot all adt markers to get an intial sense of the clusters
lscx %>%
  SetIdent(., value = "type.tissue") %>%
  subset(., idents = c("AML", "HTB")) %>%
  FeaturePlot(., 
              adt,
              reduction = "umap",
              min.cutoff = "q5", 
              max.cutoff = "q95",
              pt.size = 0.1)
## Warning in FeaturePlot(., adt, reduction = "umap", min.cutoff = "q5", max.cutoff
## = "q95", : All cells have the same value (0) of adt_CD70.
## Warning in FeaturePlot(., adt, reduction = "umap", min.cutoff = "q5", max.cutoff
## = "q95", : All cells have the same value (0) of adt_CD27.

# DimPlot(lscx, reduction = "umap", split.by = "type.lsc") # Austin said always use reduction = "umap" to project the harmonized umap
# DimPlot(lscx, reduction = "umap", split.by = "type.pheno")
# DimPlot(lscx, reduction = "umap", split.by = "type.tissue")
# DimPlot(lscx, reduction = "umap", split.by = "type.disease")

4. Clustify

4.1. Trying different compute methods

# Download hema_microarray data from the DMAP study (Novershtern et al.Cell 2011)
dmap <- clustifyrdatahub::ref_hema_microarray() 
## snapshotDate(): 2020-10-27
## snapshotDate(): 2020-10-27
## see ?clustifyrdatahub and browseVignettes('clustifyrdatahub') for documentation
## loading from cache
# clustify using cosine
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0, #let clustify assign threshold 
                rename_prefix = "hema_microarray_cosine",
                compute_method = "cosine", #spearman, pearson, kendall, cosine
                verbose = T)
## using # of genes: 1223
## using threshold of 0.5
unique(res$hema_microarray_cosine_type)
## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Monocyte"                              
## [3] "Erythroid_CD34+ CD71+ GlyA-"           
## [4] "Naïve B-cells"                         
## [5] "CD8+ Effector Memory"                  
## [6] "Erythroid_CD34- CD71+ GlyA+"           
## [7] "unassigned"
# clustify using spearman
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0, #let clustify assign threshold 
                rename_prefix = "hema_microarray_spearman",
                compute_method = "spearman", #spearman, pearson, kendall, cosine
                verbose = T)
## using # of genes: 1223
## using threshold of 0.54
unique(res$hema_microarray_spearman_type)
## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Monocyte"                              
## [3] "Naïve B-cells"                         
## [4] "CD8+ Effector Memory RA"               
## [5] "Erythroid_CD34- CD71+ GlyA-"           
## [6] "unassigned"                            
## [7] "CD4+ Central Memory"
# clustify using kendall
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0, #let clustify assign threshold 
                rename_prefix = "hema_microarray_kendall",
                compute_method = "kendall", #spearman, pearson, kendall, cosine
                verbose = T)
## using # of genes: 1223
## using threshold of 0.39
unique(res$hema_microarray_kendall_type)
## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Monocyte"                              
## [3] "Naïve B-cells"                         
## [4] "CD8+ Effector Memory RA"               
## [5] "Erythroid_CD34- CD71+ GlyA-"           
## [6] "unassigned"                            
## [7] "CD4+ Central Memory"
# clustify using pearson
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                #threshold = 0.2, #let clustify assign threshold 
                rename_prefix = "hema_microarray_pearson",
                compute_method = "pearson", #spearman, pearson, kendall, cosine
                verbose = T)
## using # of genes: 1223
## using threshold of 0.52
unique(res$hema_microarray_pearson_type) 
## [1] "Hematopoietic stem cell_CD133+ CD34dim"
## [2] "Myeloid Dendritic Cell"                
## [3] "unassigned"                            
## [4] "Naïve B-cells"                         
## [5] "Monocyte"                              
## [6] "CD8+ Effector Memory"                  
## [7] "Erythroid_CD34- CD71lo GlyA+"          
## [8] "Mature NK cell_CD56- CD16+ CD3-"       
## [9] "Naive CD4+ T-cell"
# pearson seems to give us best assignments at auto-assigned threshold

4.2. Trying different threshold settings using the pearson method

# Set threshold to many values and test their influence on assignments
res <- clustify(lscx, 
                ref_mat = dmap, #can also use "cbmc_ref"
                cluster_col = "seurat_clusters",
                threshold = 0.1, # threshold settings 0.1-0.4 gave same best results
                rename_prefix = "hema_microarray_pearson_0.1",
                compute_method = "pearson", #spearman, pearson, kendall, cosine
                verbose = T)
## using # of genes: 1223
# Check the assignments
res@meta.data %>%
  as_tibble() %>% # drop the rown names
  dplyr::select(c("sct_pca_snn_res.0.2", "seurat_clusters", "hema_microarray_pearson_0.1_type")) %>%
  unique() %>%
  arrange(seurat_clusters)
## # A tibble: 11 x 3
##    sct_pca_snn_res.0.2 seurat_clusters hema_microarray_pearson_0.1_type        
##    <fct>               <chr>           <chr>                                   
##  1 0                   0               "Monocyte"                              
##  2 1                   1               "Hematopoietic stem cell_CD133+ CD34dim"
##  3 10                  10              "Naive CD4+ T-cell"                     
##  4 2                   2               "Colony Forming Unit-Monocyte "         
##  5 3                   3               "CD8+ Effector Memory"                  
##  6 4                   4               "Myeloid Dendritic Cell"                
##  7 5                   5               "Erythroid_CD34+ CD71+ GlyA-"           
##  8 6                   6               "Naïve B-cells"                         
##  9 7                   7               "Erythroid_CD34- CD71lo GlyA+"          
## 10 8                   8               "Mature NK cell_CD56- CD16+ CD3-"       
## 11 9                   9               "Hematopoietic stem cell_CD133+ CD34dim"
# Display clusters 
DimPlot(res, reduction = "umap", 
        group.by = "hema_microarray_pearson_0.1_type")

4.3. Simplify cluster names

current.cluster.names <- unique(res$hema_microarray_pearson_0.1_type)

# Full names
new.cluster.names <- c("Primitive", #"Hematopoietic stem cell_CD133+ CD34dim",
                       "Myeloid Dendritic Cell",# "Myeloid Dendritic Cell",
                       "Erythroid (CD34+,CD71+,GlyA-)", # "Erythroid_CD34+ CD71+ GlyA-",
                       "CFU-Mono",#"Colony Forming Unit-Monocyte ",
                       "B-cells", # "Naïve B-cells",
                       "Monocytic", #"Monocyte"
                       "CD8+ T-cells",# "CD8+ Effector Memory",
                       "Erythroid (CD34-,CD71lo,GlyA+)", # "Erythroid_CD34- CD71lo GlyA+",
                       "NK-cells",# "Mature NK cell_CD56- CD16+ CD3-"
                        "CD4+ T-cells")# "Naive CD4+ T-cell",
# Short names
short.cluster.names <- c("Prim (CD34-dim)", #"Hematopoietic stem cell_CD133+ CD34dim",
                       "MDC",# "Myeloid Dendritic Cell",
                       "Prim (CD34-high)", # "Erythroid_CD34+ CD71+ GlyA-",
                       "CFU-Mono",#"Colony Forming Unit-Monocyte ",
                       "B", # "Naïve B-cells",
                       "Mono", #"Monocyte"
                       "CD8-T",# "CD8+ Effector Memory",
                       "Ery", # "Erythroid_CD34- CD71lo GlyA+",
                       "NK?",# "Mature NK cell_CD56- CD16+ CD3-"
                        "CD4-T")# "Naive CD4+ T-cell",

res$Clusters <- plyr::mapvalues(x=res$hema_microarray_pearson_0.1_type,
                                                        from = current.cluster.names,
                                                        to = new.cluster.names)# Check the assignments
res$Subpopulations <- plyr::mapvalues(x=res$hema_microarray_pearson_0.1_type,
                                                        from = current.cluster.names,
                                                        to = short.cluster.names)# Check the assignments

res@meta.data %>%
  as_tibble() %>% # drop the rown names
  dplyr::select(c("seurat_clusters", "hema_microarray_pearson_0.1_type", "Clusters", "Subpopulations")) %>%
  unique() %>%
  arrange(seurat_clusters)
## # A tibble: 11 x 4
##    seurat_clusters hema_microarray_pearson_0.… Clusters           Subpopulations
##    <chr>           <chr>                       <chr>              <chr>         
##  1 0               "Monocyte"                  Monocytic          Mono          
##  2 1               "Hematopoietic stem cell_C… Primitive          Prim (CD34-di…
##  3 10              "Naive CD4+ T-cell"         CD4+ T-cells       CD4-T         
##  4 2               "Colony Forming Unit-Monoc… CFU-Mono           CFU-Mono      
##  5 3               "CD8+ Effector Memory"      CD8+ T-cells       CD8-T         
##  6 4               "Myeloid Dendritic Cell"    Myeloid Dendritic… MDC           
##  7 5               "Erythroid_CD34+ CD71+ Gly… Erythroid (CD34+,… Prim (CD34-hi…
##  8 6               "Naïve B-cells"             B-cells            B             
##  9 7               "Erythroid_CD34- CD71lo Gl… Erythroid (CD34-,… Ery           
## 10 8               "Mature NK cell_CD56- CD16… NK-cells           NK?           
## 11 9               "Hematopoietic stem cell_C… Primitive          Prim (CD34-di…
# Display clusters 
DimPlot(res, reduction = "umap", 
        group.by = "Subpopulations")

4.4. Assgin new color schemes for the clusters

# Full names
level.clusters <- c("Monocytic",
                "CFU-Mono",
                "Myeloid Dendritic Cell",
                "Primitive",
                "Erythroid (CD34+,CD71+,GlyA-)",
                "Erythroid (CD34-,CD71lo,GlyA+)",
                "B-cells",
                "CD4+ T-cells",
                "CD8+ T-cells",
                "NK-cells"
                )

# short names
level.Subpopulations <- c("Mono",
                "CFU-Mono",
                "MDC",
                "Prim (CD34-dim)",
                "Prim (CD34-high)",
                "Ery",
                "B",
                "CD4-T",
                "CD8-T",
                "NK?"
                )

res$Clusters <- factor(res$Clusters, levels = level.clusters)
res$Subpopulations <- factor(res$Subpopulations, levels = level.Subpopulations)

palette.lscx <- c( "#b30024", #"Monocyte",
                  "#fa1e20",#"Colony Forming Unit-Monocyte ",
                   "#ff7f00", #"Myeloid Dendritic Cell",
                   "#0b47d4",#"Hematopoietic stem cell_CD133+ CD34dim",
                   "#5a95d1", #"Erythroid_CD34+ CD71+ GlyA-"
                  "#f28ac6", #"Erythroid_CD34- CD71lo GlyA+ "
                   "#20b2aa", #"Naïve B-cells",
                   "#1cad15", # "Naive CD4+ T-cell",
                   "#b2df8a", #"CD8+ Effector Memory",
                   "#d8db02") #"Mature NK cell_CD56- CD16+ CD3-"

show_col(palette.lscx)

DimPlot(res, reduction = "umap", 
        group.by = "Subpopulations", 
        #split.by = "type.lsc",
        cols = palette.lscx,
        label = T)

VlnPlot(res, "CD34", group.by = "Subpopulations", cols = palette.lscx) # Primitive marker

VlnPlot(res, "KIT", group.by = "Subpopulations", cols = palette.lscx) # Primitive marker

VlnPlot(res, "GYPA", group.by = "Subpopulations", cols = palette.lscx) # Erythroid marker Glycophorin A

VlnPlot(res, "adt_CD4", group.by = "Subpopulations", cols = palette.lscx) # T cell marker CD4

VlnPlot(res, "CD8A", group.by = "Subpopulations", cols = palette.lscx) # T cell marker CD8

VlnPlot(res, "adt_CD19", group.by = "Subpopulations", cols = palette.lscx) # B cell marker CD19

VlnPlot(res, "NLRP3", group.by = "Subpopulations", cols = palette.lscx)

res@meta.data %>%
  as_tibble() %>%
  dplyr::select(htb, type.lsc, type.pheno, type.tissue, type.disease) %>%
  unique() %>%
  DT::datatable()
saveRDS(res, "~/Documents/Lab_data/P_LSCx/LSCx-CITEseq/ssp_analysis/SSP_LSCx_version1_downsampled_20210125_harmonized_clustered_res.Rds")