Load data

base_data_dir = "/home/rnili/repo/rCodes/Rasteh_data_check/"
suppressPackageStartupMessages({
library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)
library(impute)
library(pvca)
library(Biobase)
library(lme4)})

Subject and specimen data

subject_2020 <- read_tsv(paste0(base_data_dir, "2020LD_subject.tsv"), show_col_types = FALSE)
subject_2021 <- read_tsv(paste0(base_data_dir, "2021LD_subject.tsv"), show_col_types = FALSE)

specimen_2020 <- read_tsv(paste0(base_data_dir, "2020LD_specimen.tsv"), show_col_types = FALSE)
specimen_2021 <- read_tsv(paste0(base_data_dir, "2021LD_specimen.tsv"), show_col_types = FALSE)


metadata_2020 <- specimen_2020 %>%
                  left_join(subject_2020) %>%
                  mutate(timepoint = planned_day_relative_to_boost)

## Joining with `by = join_by(subject_id)`

metadata_2021 <- left_join(specimen_2021, subject_2021)

## Joining with `by = join_by(subject_id)`

abtiter_2021 <- read_tsv(paste0(base_data_dir, "2021LD_plasma_ab_titer.tsv")) %>%
  filter(!isotype %in% "IgE") %>%
  mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
  select(specimen_id, isotype_antigen, MFI_normalised) %>%
  pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
  column_to_rownames("specimen_id")

## Rows: 8085 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): isotype, antigen, unit
## dbl (4): specimen_id, MFI, MFI_normalised, lower_limit_of_detection
## lgl (1): is_antigen_specific
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

abtiter_2020 <- read_tsv(paste0(base_data_dir, "2020LD_plasma_ab_titer.tsv")) %>%
  filter(!isotype %in% "IgE") %>%
  mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
  filter(isotype_antigen %in% colnames(abtiter_2021)) %>%
  select(specimen_id, isotype_antigen, MFI_normalised) %>%
  pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
  column_to_rownames("specimen_id")

## Rows: 31520 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): isotype, antigen, unit
## dbl (4): specimen_id, MFI, MFI_normalised, lower_limit_of_detection
## lgl (1): is_antigen_specific
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

abtiter_2021 = abtiter_2021 %>%
  select(colnames(abtiter_2021))

colnames(abtiter_2020) == colnames(abtiter_2021)

## Warning in colnames(abtiter_2020) == colnames(abtiter_2021): longer object
## length is not a multiple of shorter object length

##  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE

batch.factors = c("timepoint","infancy_vac","biological_sex")

## PVCA analysis
mat_data = abtiter_2020 %>%
  t()

# PVCA_now(mat_data, metadata_2020, batch.factors)

cytof_2021 <- read_tsv(paste0(base_data_dir, "2021LD_pbmc_cell_frequency.tsv")) %>%
  select(specimen_id, cell_type_name, percent_live_cell) %>%
  pivot_wider(names_from = cell_type_name, values_from = percent_live_cell) %>%
  column_to_rownames("specimen_id")

## Rows: 8250 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): cell_type_name
## dbl (2): specimen_id, percent_live_cell
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

cytof_2020 <- read_tsv(paste0(base_data_dir, "2020LD_pbmc_cell_frequency.tsv")) %>%
  select(specimen_id, cell_type_name, percent_live_cell) %>%
  filter(cell_type_name %in% colnames(cytof_2021)) %>%
  pivot_wider(names_from = cell_type_name, values_from = percent_live_cell) %>%
  column_to_rownames("specimen_id")

## Rows: 2483 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): cell_type_name
## dbl (2): specimen_id, percent_live_cell
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

colnames(cytof_2020) == colnames(cytof_2021)

## Warning in colnames(cytof_2020) == colnames(cytof_2021): longer object length
## is not a multiple of shorter object length

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE

batch.factors = c("timepoint","infancy_vac","biological_sex")

## PVCA analysis
mat_data = cytof_2020 %>%
  t()

# PVCA_now(mat_data, metadata_2020, batch.factors)

olink_2021 <- read_tsv(paste0(base_data_dir, "2021LD_plasma_cytokine_concentration.tsv")) %>%
  select(specimen_id, protein_id, protein_expression) %>%
  pivot_wider(names_from = protein_id, values_from = protein_expression) %>%
  column_to_rownames("specimen_id")

## Rows: 8100 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): protein_id, quality_control, unit
## dbl (4): specimen_id, lower_limit_of_quantitation, protein_expression, upper...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

olink_2020 <- read_tsv(paste0(base_data_dir, "2020LD_plasma_cytokine_concentration.tsv")) %>%
  select(specimen_id, protein_id, protein_expression) %>%
  filter(protein_id %in% colnames(olink_2021)) %>%
  pivot_wider(names_from = protein_id, values_from = protein_expression) %>%
  column_to_rownames("specimen_id")

## Rows: 23670 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): protein_id, quality_control, unit
## dbl (4): specimen_id, lower_limit_of_quantitation, protein_expression, upper...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

colnames(olink_2020) == colnames(olink_2021)

## Warning in colnames(olink_2020) == colnames(olink_2021): longer object length
## is not a multiple of shorter object length

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

batch.factors = c("timepoint","infancy_vac","biological_sex")

## PVCA analysis
mat_data = olink_2020 %>%
  t()

# PVCA_now(mat_data, metadata_2020, batch.factors)

rnaseq_2021 <- read_tsv(paste0(base_data_dir, "2021LD_pbmc_gene_expression.tsv")) %>%
  select(specimen_id, versioned_ensembl_gene_id, tpm) #%>%

## Rows: 10494360 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): versioned_ensembl_gene_id
## dbl (3): specimen_id, raw_count, tpm
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

  #pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  #column_to_rownames("specimen_id")

rnaseq_2020 <- read_tsv(paste0(base_data_dir, "2020LD_pbmc_gene_expression.tsv")) %>%
  select(specimen_id, versioned_ensembl_gene_id, tpm) #%>%

## Rows: 10494360 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): versioned_ensembl_gene_id
## dbl (3): specimen_id, raw_count, tpm
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

  #pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  #column_to_rownames("specimen_id")

#' Create a shortlist of genes (tpm >= 1) in at least 30% of the specimens.

tpm_shortlist <- rnaseq_2020 %>%
  #rownames_to_column("versioned_ensembl_gene_id") %>%
  #filter(versioned_ensembl_gene_id %in% gene_90_38_shortlist$versioned_ensembl_gene_id) %>%
  #pivot_longer(!versioned_ensembl_gene_id, values_to = "tpm", names_to = "specimen_id") %>%
  mutate(specimen_id = as.integer(specimen_id)) %>%
  #left_join(subject_specimen) %>%
  group_by(versioned_ensembl_gene_id) %>%
  #group_by(versioned_ensembl_gene_id, infancy_vac) %>%
  summarise(proportion = mean(tpm >= 1))  %>%
  filter(proportion >= 0.8)

rnaseq_2020_v1 <- rnaseq_2020 %>%
  filter(versioned_ensembl_gene_id %in% tpm_shortlist$versioned_ensembl_gene_id) %>%
  pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  column_to_rownames("specimen_id")
  

tpm_shortlist_2021 <- rnaseq_2021 %>%
  #rownames_to_column("versioned_ensembl_gene_id") %>%
  #filter(versioned_ensembl_gene_id %in% gene_90_38_shortlist$versioned_ensembl_gene_id) %>%
  #pivot_longer(!versioned_ensembl_gene_id, values_to = "tpm", names_to = "specimen_id") %>%
  mutate(specimen_id = as.integer(specimen_id)) %>%
  #left_join(subject_specimen) %>%
  group_by(versioned_ensembl_gene_id) %>%
  #group_by(versioned_ensembl_gene_id, infancy_vac) %>%
  summarise(proportion = mean(tpm >= 1))  %>%
  filter(proportion >= 0.8)

rnaseq_2021_v1 <- rnaseq_2021 %>%
  filter(versioned_ensembl_gene_id %in% tpm_shortlist_2021$versioned_ensembl_gene_id) %>%
  pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  column_to_rownames("specimen_id")

batch.factors = c("timepoint","infancy_vac","biological_sex")

## PVCA analysis
mat_data = rnaseq_2020_v1 %>%
  t()

# PVCA_now(mat_data, metadata_2020, batch.factors)

PVCA_now <- function(mat_data, subject_specimen, batch.factors, plot_title = "PVCA analysis"){
  
  ## test
  # mat_data = olink_data_processed$wide
  # subject_specimen = metadata_2020
  
  counts_data <- mat_data[rowMeans(is.na(mat_data)) < 1, ] %>%
    as.matrix() %>%
    impute.knn() %>%
    .$data
  
  subject_specimen <- subject_specimen %>%
    dplyr::filter(specimen_id %in% colnames(counts_data)) %>%
    dplyr::select(specimen_id, timepoint, infancy_vac, biological_sex, dataset) %>%
    dplyr::mutate(specimen_id1 = specimen_id) %>%
    column_to_rownames("specimen_id1") 
  
  subject_specimen <- subject_specimen[colnames(counts_data), ]
  
  phenoData = AnnotatedDataFrame(data = subject_specimen)
  
  exprset = ExpressionSet(as.matrix(counts_data), 
                          phenoData = phenoData)
  
  pvcaObj_after <- pvca_run(exprset, batch.factors = batch.factors, threshold =0.6) 
  
  print(plot_pvca(pvcaObj_after, paste0(plot_title)))
  
  
  #calculate principle components cell components
  #pca_plots = plot_pca(counts_data, subject_specimen, paste0(plot_title))
  #plot(pca_plots$PC12)
  #plot(pca_plots$PC13)
  #plot(pca_plots$PC23)
  
}

MOFA

# library(MOFA2)
# library(MOFAdata)
# library(data.table)
# library(ggplot2)
# library(tidyverse)

multiome_2020_backbone = rnaseq_2020_v1 %>%
  rownames_to_column("specimen_id") %>%
  select(specimen_id)

abtiter_2020_multi <- multiome_2020_backbone %>%
  left_join(abtiter_2020 %>% rownames_to_column("specimen_id")) %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

cytof_2020_multi <- multiome_2020_backbone %>%
  left_join(cytof_2020 %>% rownames_to_column("specimen_id"))  %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

olink_2020_multi <- multiome_2020_backbone %>%
  left_join(olink_2020 %>% rownames_to_column("specimen_id"))  %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

df_make_mofa = list(
  
  rnaseq = t(rnaseq_2020_v1),
  cytof = t(cytof_2020_multi),
  abtiter = t(abtiter_2020_multi),
  olink = t(olink_2020_multi)
  
)

# lapply(df_make_mofa)

MOFAobject <- create_mofa(df_make_mofa)

## Creating MOFA object from a list of matrices (features as rows, sample as columns)...

plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- 15

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

## Warning in prepare_mofa(MOFAobject, data_options = data_opts, model_options =
## model_opts, : Some view(s) have a lot of features, it is recommended to perform
## a more stringent feature selection before creating the MOFA object....

## Checking data options...

## Checking training options...

## Checking model options...

MOFAobject <- run_mofa(MOFAobject, use_basilisk = TRUE)

## Warning in run_mofa(MOFAobject, use_basilisk = TRUE): No output filename provided. Using /tmp/Rtmpgh5ASc/mofa_20231016-155831.hdf5 to store the trained model.

## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

plot_factor_cor(MOFAobject)

plot_variance_explained(MOFAobject, max_r2=15)

plot_variance_explained(MOFAobject, plot_total = T)[[2]]

multiome_2021_backbone = rnaseq_2021_v1 %>%
  rownames_to_column("specimen_id") %>%
  select(specimen_id)

abtiter_2021_multi <- multiome_2021_backbone %>%
  left_join(abtiter_2021 %>% rownames_to_column("specimen_id")) %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

cytof_2021_multi <- multiome_2021_backbone %>%
  left_join(cytof_2021 %>% rownames_to_column("specimen_id"))  %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

olink_2021_multi <- multiome_2021_backbone %>%
  left_join(olink_2021 %>% rownames_to_column("specimen_id"))  %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

df_make_mofa = list(
  
  rnaseq = t(rnaseq_2021_v1),
  cytof = t(cytof_2021_multi),
  abtiter = t(abtiter_2021_multi),
  olink = t(olink_2021_multi)
  
)

#lapply(df_make_mofa)

MOFAobject_2021 <- create_mofa(df_make_mofa)

## Creating MOFA object from a list of matrices (features as rows, sample as columns)...

plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject_2021)
model_opts <- get_default_model_options(MOFAobject_2021)
model_opts$num_factors <- 15

train_opts <- get_default_training_options(MOFAobject_2021)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject_2021 <- prepare_mofa(MOFAobject_2021,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

## Warning in prepare_mofa(MOFAobject_2021, data_options = data_opts,
## model_options = model_opts, : Some view(s) have a lot of features, it is
## recommended to perform a more stringent feature selection before creating the
## MOFA object....

## Checking data options...

## Checking training options...

## Checking model options...

MOFAobject_2021 <- run_mofa(MOFAobject_2021, use_basilisk = TRUE)

## Warning in run_mofa(MOFAobject_2021, use_basilisk = TRUE): No output filename provided. Using /tmp/Rtmpgh5ASc/mofa_20231016-155848.hdf5 to store the trained model.

## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

plot_factor_cor(MOFAobject_2021)

plot_variance_explained(MOFAobject_2021, max_r2=15)

plot_variance_explained(MOFAobject_2021, plot_total = T)[[2]]

Process datasets | Inspect 2021 dataset

cytokine_2021_median_D0 <- cytof_2021_multi %>%
    rownames_to_column("specimen_id") %>%
    mutate(specimen_id = as.numeric(specimen_id)) %>%
    pivot_longer(!specimen_id, names_to = "cell_type_name", values_to = "count") %>%
    #left_join(metadata_2020[c("specimen_id")]) %>%
    filter(specimen_id %in% unique(metadata_2021[metadata_2021$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
    group_by( cell_type_name)  %>%
    summarise(median = median(count, na.rm = T))
  
cytof_2021_normalized_pre <-  cytof_2021_multi  %>%
    rownames_to_column("specimen_id") %>%
    mutate(specimen_id = as.numeric(specimen_id)) %>%
    pivot_longer(!specimen_id, names_to = "cell_type_name", values_to = "percent_live_cell") %>%
    left_join(metadata_2021[c("specimen_id", "dataset")]) %>%
    left_join(cytokine_2021_median_D0) %>%
    mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, NA, percent_live_cell/median))

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(cell_type_name)`

cytof_2021_normalized <- cytof_2021_normalized_pre %>%
  select(specimen_id, cell_type_name, percent_live_cell_normalized) %>%
  pivot_wider(names_from = cell_type_name, values_from = percent_live_cell_normalized) %>%
  column_to_rownames("specimen_id") 

  
cytof_2021_normalized_multi <- multiome_2021_backbone %>%
  left_join(cytof_2021_normalized %>% rownames_to_column("specimen_id"))  %>%
  column_to_rownames("specimen_id")

## Joining with `by = join_by(specimen_id)`

cytof_2021_normalized_multi_imputed = cytof_2021_normalized_multi %>%
      as.matrix() %>%
      impute.knn() %>%
      .$data

## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 15 rows with more than 50 % entries missing;
##  mean imputation used for these rows

cytof_2021_normalized_multi_imputed_log = log(cytof_2021_normalized_multi_imputed + 1)

df_make_mofa = list(
  
  rnaseq = t(rnaseq_2021_v1),
  cytof = t(cytof_2021_normalized_multi_imputed_log),
  abtiter = t(abtiter_2021_multi),
  olink = t(olink_2021_multi)
  
)

#lapply(df_make_mofa)

MOFAobject_2021 <- create_mofa(df_make_mofa)

## Creating MOFA object from a list of matrices (features as rows, sample as columns)...

plot_data_overview(MOFAobject_2021)

data_opts <- get_default_data_options(MOFAobject_2021)
model_opts <- get_default_model_options(MOFAobject_2021)
model_opts$num_factors <- 15

train_opts <- get_default_training_options(MOFAobject_2021)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject_2021 <- prepare_mofa(MOFAobject_2021,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

## Warning in prepare_mofa(MOFAobject_2021, data_options = data_opts,
## model_options = model_opts, : Some view(s) have a lot of features, it is
## recommended to perform a more stringent feature selection before creating the
## MOFA object....

## Checking data options...

## Checking training options...

## Checking model options...

MOFAobject_2021 <- run_mofa(MOFAobject_2021, use_basilisk = TRUE)

## Warning in run_mofa(MOFAobject_2021, use_basilisk = TRUE): No output filename provided. Using /tmp/Rtmpgh5ASc/mofa_20231016-155857.hdf5 to store the trained model.

## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

plot_factor_cor(MOFAobject_2021)

plot_variance_explained(MOFAobject_2021, max_r2=15)

plot_variance_explained(MOFAobject_2021, plot_total = T)[[2]]

training data recheck

Load data

Subject and specimen data

MOFA

Process datasets | Inspect 2021 dataset