Libraries

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6.9000     ✔ purrr   0.3.4     
## ✔ tibble  3.1.7          ✔ dplyr   1.0.9     
## ✔ tidyr   1.2.0          ✔ stringr 1.4.0     
## ✔ readr   2.1.2          ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'tidySummarizedExperiment'
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:dplyr':
## 
##     bind_cols, bind_rows, count
## The following object is masked from 'package:stats':
## 
##     filter
## ========================================
## tidybulk version 1.9.2
## If you use TIDYBULK in published research, please cite:
## 
## Mangiola et al. tidybulk: an R tidy framework for modular 
## transcriptomic data analysis. Genome Biology 2021.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(tidybulk))
## ========================================
## 
## Attaching package: 'tidybulk'
## The following objects are masked from 'package:tidySummarizedExperiment':
## 
##     bind_cols, bind_rows, se
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following objects are masked from 'package:dplyr':
## 
##     bind_cols, bind_rows
## The following object is masked from 'package:stats':
## 
##     filter
## Loading required package: DelayedArray
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'DelayedArray'
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum, scale, sweep
## Loading required package: rhdf5
## 
## Attaching package: 'HDF5Array'
## The following object is masked from 'package:rhdf5':
## 
##     h5ls
counts = readRDS("PPCG_RNA_count/counts_deconvolution.rds")

(
  counts |> 
  pivot_sample() |> 
  pivot_longer(
    names_to= c("method", "cell_type"), 
    values_to = "proportion", 
    cols=contains("__"),
    names_sep ="__"
  ) %>%
  ggplot(aes(x = .sample, y = proportion, fill = cell_type)) +
  geom_bar(stat = "identity") +
  facet_wrap(method~Center, scale="free", ncol=6) +
  guides(fill=guide_legend(ncol=10)) +
  theme_multipanel +
  theme(legend.position = "bottom", axis.text.x = element_blank()) 
) |> plotly::ggplotly()

Tumor purity vs epithelial abundance

ggMarginal(
  counts |> 
    pivot_sample() |> 
    ggplot(aes(purity.scores, deconvolution__epithelial, color = Center)) +
    geom_point(size=0.2) +
    geom_smooth(method="lm")+
    geom_abline(linetype="dashed", alpha=0.3) +
    theme_multipanel, 
  groupFill = TRUE, 
  type = "boxplot"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Comparison of the distribution of subtypes using variable 500 genes or tissue composition

# RNA
counts |> 
  select(-TMM) |> 
  identify_abundant() |> 
  scale_abundance()|> 
  reduce_dimensions(.abundance=raw.counts_scaled, method="PCA") |> 
  pivot_sample() |> 
  ggplot(aes(PC1, PC2, color=subtype)) +
  geom_point() +
  facet_wrap(~ Center, scale="free") +
  theme_multipanel
## No group or design set. Assuming all samples belong to one group.
## tidybulk says: the sample with largest library size PPCG2179a_RNAseq was chosen as reference for scaling
## Getting the 500 most variable genes
## Fraction of variance explained by the selected principal components
## # A tibble: 2 × 2
##   `Fraction of variance`    PC
##                    <dbl> <int>
## 1                 0.196      1
## 2                 0.0967     2
## tidybulk says: to access the raw results do `attr(..., "internals")$PCA`
## This message is displayed once per session.

# Tissue composition
counts |> 
  pivot_sample() |> 
  pivot_longer(
    names_to= c("method", "cell_type"), 
    values_to = "proportion", 
    cols=contains("__"),
    names_sep ="__"
  ) |> 
  
  # Filter abundant cell types
  nest(data = -cell_type) |> 
  mutate(median_proportion = map_dbl(data, ~ .x |> pull(proportion) |> median())) |>
  arrange(desc(median_proportion)) |>
  slice(1:15) |> 
  unnest(data) |> 
  
  # Transform
  reduce_dimensions(
    .sample, cell_type,  proportion,
    transform = identity, center=TRUE, 
    method="PCA"
  ) |> 
  ggplot(aes(PC1, PC2, color=subtype)) +
  geom_point() +
  facet_wrap(~ Center, scale="free") +
  theme_multipanel
## Warning in eval(dots[[i]][[action]], env, env): tidybulk says: highly abundant
## transcripts were not identified (i.e. identify_abundant()) or filtered (i.e.,
## keep_abundant), therefore this operation will be performed on unfiltered
## data. In rare occasions this could be wanted. In standard whole-transcriptome
## workflows is generally unwanted.
## Getting the 15 most variable genes
## Warning in as_mapper(.f2)(.x): 
##                      tidybulk says: In PCA correlation there is < 100 genes that have non NA values is all samples.
## The correlation calculation would not be reliable,
## we suggest to partition the dataset for sample clusters.
##                      
## Fraction of variance explained by the selected principal components
## # A tibble: 2 × 2
##   `Fraction of variance`    PC
##                    <dbl> <int>
## 1                  0.263     1
## 2                  0.127     2