Harmonisation of single cell datasets

integrated_sample_annotated <- readRDS("PPCG_deconvolution_signatures_single_cell_PROCESSED/integrated_sample_annotated.rds")

Number of cell types in the intergated datasets

integrated_sample_annotated |> 
  count(dataset)
## tidyseurat says: A data frame is returned for independent data analysis.
## # A tibble: 5 × 2
##   dataset             n
##   <chr>           <int>
## 1 EGAS00001005115  7575
## 2 EGAS00001005787 13007
## 3 GSE137829       19050
## 4 GSE141445       35071
## 5 GSE176031       19339

Number of samples in each dataset

integrated_sample_annotated |> 
  distinct(dataset, sample) |> 
  count(dataset)
## tidyseurat says: A data frame is returned for independent data analysis.
## # A tibble: 5 × 2
##   dataset             n
##   <chr>           <int>
## 1 EGAS00001005115     1
## 2 EGAS00001005787    16
## 3 GSE137829           6
## 4 GSE141445          13
## 5 GSE176031          15

Integrated UMAP coloursed by dataset

integrated_sample_annotated |> 
  DimPlot(group.by = "dataset", shuffle = TRUE) +
  theme(legend.position = "bottom")

Integrated UMAP coloursed by sample

integrated_sample_annotated |> 
  DimPlot(group.by = "sample", shuffle = TRUE) +
  theme(legend.position = "bottom")

integrated_sample_annotated |> 
  DimPlot(group.by = "macrocluster", shuffle = TRUE) +
  theme(legend.position = "bottom")

integrated_sample_annotated |> 
FeaturePlot( 
  features = c("CD14", "FCGR3A", "CD79A", "CD3G", "EPCAM", "VIM", "PLVAP","CD68", "COL14A1", "KRT17"), 
  min.cutoff = 'q9', 
  order = TRUE
)

Integrated UMAP of tumour and benign samples

integrated_sample_annotated |> 
DimPlot( group.by = "if_tumor", shuffle = TRUE ) + 
  scale_color_brewer(palette="Set1")

Deconvolution

composition_b1000 <- readRDS("/stornext/Bioinf/data/bioinf-data/Papenfuss_lab/projects/mangiola.s/PostDoc/PPCG_tumour_microenvironment/composition_b1000.rds")

composition_pc_pseudobulk <- readRDS("/stornext/Bioinf/data/bioinf-data/Papenfuss_lab/projects/mangiola.s/PostDoc/PPCG_tumour_microenvironment/composition_pc_pseudobulk.rds")


counts <- 
  readRDS("PPCG_RNA_count/PPCG_RNAseq_RawCounts_Normalized_SampleAnnot_GenesDetail_1307222.rds") |> 
  extract(.feature, "symbol", ".+\\|\\|(.+)", remove = FALSE) 

Bar plot of bulk-based deconvolution

composition_b1000 |>
  left_join(
    counts |> pivot_sample()
  ) |> 
  pivot_longer(
    names_to= c("method", "cell_type"),
    values_to = "proportion",
    cols=contains("__"),
    names_sep ="__"
  ) %>%
  ggplot(aes(x = .sample, y = proportion, fill = cell_type)) +
  geom_bar(stat = "identity") +
  facet_wrap(method~Center, scale="free", nrow=1) +
  guides(fill=guide_legend(ncol=10)) +
  theme_multipanel +
  theme(legend.position = "bottom", axis.text.x = element_blank())
## Joining, by = ".sample"

Bar plot of single-cell-based deconvolution

composition_pc_pseudobulk |>
  left_join(
    counts |> pivot_sample()
  ) |> 
  pivot_longer(
    names_to= c("method", "cell_type"),
    values_to = "proportion",
    cols=contains("__"),
    names_sep ="__"
  ) %>%
  ggplot(aes(x = .sample, y = proportion, fill = cell_type)) +
  geom_bar(stat = "identity") +
  facet_wrap(method~Center, scale="free", nrow=1) +
  guides(fill=guide_legend(ncol=10)) +
  theme_multipanel +
  theme(legend.position = "bottom", axis.text.x = element_blank())
## Joining, by = ".sample"

dropLeadingZero <- function(l){  stringr::str_replace(l, '0(?=.)', '') }
S_sqrt <- function(x){sign(x)*sqrt(abs(x))}
IS_sqrt <- function(x){x^2*sign(x)}
S_sqrt_trans <- function() scales::trans_new("S_sqrt",S_sqrt,IS_sqrt)

left_join(
  composition_pc_pseudobulk |> 
    pivot_longer(-.sample, values_to = "pc_pseudobulk") |> 
    mutate(name = name |> str_remove("pc_pseudobulk__")) ,
  composition_b1000 |> 
    pivot_longer(-.sample, values_to = "b1000") |> 
    mutate(name = name |> str_remove("b1000__")) 
) |> 
  nest(data = -name) |> 
  mutate(plot = map2(
    data, name,
    ~ .x |> 
    ggplot(aes(pc_pseudobulk, b1000)) +
      geom_point(size=0.2) +
      geom_abline(intercept = 0, slope = 1, color="red") +
      ggtitle(.y) +
      scale_y_continuous(trans=S_sqrt_trans(), labels = dropLeadingZero) +
      scale_x_continuous(trans=S_sqrt_trans(), labels = dropLeadingZero) +
      theme_multipanel 
  )) |> 
  pull(plot) |> 
  wrap_plots() 
## Joining, by = c(".sample", "name")
## Warning: Removed 1757 rows containing missing values (`geom_point()`).

Epithelial by Center

ggMarginal(
  composition_b1000 |> 
    left_join(
      counts |> pivot_sample()
    ) |> 
    ggplot(aes(purity.scores, b1000__epithelial, color = Center)) +
    geom_point(size=0.2) +
    geom_smooth(method="lm")+
    geom_abline(linetype="dashed", alpha=0.3) +
    theme_multipanel, 
  groupFill = TRUE, 
  type = "boxplot"
)
## Joining, by = ".sample"
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Deconvolution (cibersort) also depends on the resolution. Ideally this should not be the case. Here we compare the bulk-based deconvolution (B1000 reference) with specific cell types, and sum their proportions, with generic cell types.

composition_b1000_high_resolution = 
  readRDS("composition_b1000_high_resolution.rds") |> 
  rowwise() |> 
   mutate(
    b1000_high_resolution__T = 
      b1000_high_resolution__t_CD4_memory_central    + b1000_high_resolution__t_CD4_memory_effector  + 
      b1000_high_resolution__t_CD8_memory_central +  b1000_high_resolution__t_CD8_memory_effector  + 
      b1000_high_resolution__t_CD8_naive  + b1000_high_resolution__t_gamma_delta  +      
      b1000_high_resolution__t_helper_h1    +  
      b1000_high_resolution__t_helper_h17  +   b1000_high_resolution__t_helper_h2+
      b1000_high_resolution__t_helper_naive  +   b1000_high_resolution__t_reg,
    b1000_high_resolution__B = b1000_high_resolution__b_memory + b1000_high_resolution__b_naive,
    b1000_high_resolution__monocyte = b1000_high_resolution__dendritic_myeloid_immature + b1000_high_resolution__dendritic_myeloid_mature + b1000_high_resolution__macrophage_M1 + b1000_high_resolution__macrophage_M2 +   b1000_high_resolution__monocyte
  ) |> 
  ungroup() |> 
  select(.sample, b1000_high_resolution__T, b1000_high_resolution__B, b1000_high_resolution__monocyte, b1000_high_resolution__fibroblast, b1000_high_resolution__endothelial, b1000_high_resolution__epithelial) 

left_join(
  composition_b1000_high_resolution |> 
    pivot_longer(-.sample, values_to = "b1000_high_resolution") |> 
    mutate(name = name |> str_remove("b1000_high_resolution__")) ,
  composition_b1000 |> 
    pivot_longer(-.sample, values_to = "b1000") |> 
    mutate(name = name |> str_remove("b1000__")) 
) |> 
  nest(data = -name) |> 
  mutate(plot = map2(
    data, name,
    ~ .x |> 
    ggplot(aes(b1000_high_resolution, b1000)) +
      geom_point(size=0.2) +
      geom_abline(intercept = 0, slope = 1, color="red") +
      ggtitle(.y) +
      scale_y_continuous(trans=S_sqrt_trans(), labels = dropLeadingZero) +
      scale_x_continuous(trans=S_sqrt_trans(), labels = dropLeadingZero) +
      theme_multipanel 
  )) |> 
  pull(plot) |> 
  wrap_plots() 
## Joining, by = c(".sample", "name")