Load libraries

suppressPackageStartupMessages({library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)
library(impute)
library(BatchQC)
library(sva)})

Find features in other experiments

df_source<- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

rnaDf_batchC <- as.data.frame(df_source[["pbmc_gene_expression_wide"]])
genes <- colnames(rnaDf_batchC)

abtite_batchC <- as.data.frame(df_source[["plasma_antibody_levels_wide"]])
antigens <- colnames(abtite_batchC)

cytof_batchC <- as.data.frame(df_source[["pbmc_cell_frequency_wide"]])
cells <- colnames(cytof_batchC)

olink_batchC <- as.data.frame(df_source[["plasma_cytokine_concentrations_wide"]])
proteins <- colnames(olink_batchC)

Load Meta Data

metaDf <- read_tsv(here("./data/2021LD_subject.tsv"), show_col_types = FALSE)
specimen <- read_tsv(here("./data/2021LD_specimen.tsv"), show_col_types = FALSE)
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units="weeks")/52, 2))
metaDf <- merge(metaDf, specimen, by="subject_id")
metaDf["timepoint"] <- metaDf["planned_day_relative_to_boost"]

metaDf <- metaDf[metaDf$timepoint %in% c(0, 1, 3, 7, 14), ]

samples <- levels(factor(metaDf$specimen_id))
subjects <- levels(factor(metaDf$subject_id))
print(paste("Number of samples:", length(samples)))
## [1] "Number of samples: 180"
print(paste("Number of subjects:", length(subjects)))
## [1] "Number of subjects: 36"
dist_histogram <- function(df, col){
    ggplot(df, aes_string(x=col)) +
    geom_histogram(aes(y=..density..),      
                   binwidth=.5,
                   colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666")
}

histogram <- function(df, col){
  ggplot(df, aes_string(x=col)) +
    geom_histogram(binwidth=.5,
                   colour="black",
                   fill="paleturquoise1")
}

Load Data

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

cellDf1 <- read_tsv(here("./data/2021LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)%>%
  select(specimen_id, cell_type_name, percent_live_cell) %>%
  filter(cell_type_name %in% cells)


cellDf <- cellDf1%>%
  pivot_wider(names_from = cell_type_name, values_from = percent_live_cell) %>%
  column_to_rownames("specimen_id")
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
rnaDf <- read_tsv(here("./data/2021LD_pbmc_gene_expression.tsv"), show_col_types = FALSE) %>%
  select(specimen_id, versioned_ensembl_gene_id, tpm)
  # filter(versioned_ensembl_gene_id %in% genes)

# tpm_shortlist <- rnaDf %>%
#   mutate(specimen_id = as.integer(specimen_id)) %>%
#   group_by(versioned_ensembl_gene_id) %>%
#   summarise(proportion = mean(tpm >= 1))  %>%
#   filter(proportion >= 0.8)

rnaDf <- rnaDf %>%
  # filter(versioned_ensembl_gene_id %in% tpm_shortlistrnaDf $versioned_ensembl_gene_id) %>%
  filter(versioned_ensembl_gene_id %in% genes) %>%
  mutate(versioned_ensembl_gene_id=gsub("\\..*", "", versioned_ensembl_gene_id))


par(mar = c(4, 4, .1, .1))
histogram(rnaDf, "tpm")
dist_histogram(rnaDf, "tpm")


# removed outlier RNA
print("Remove Outliers")
## [1] "Remove Outliers"
outliers <- boxplot(rnaDf$tpm, plot=FALSE)$out

par(mar = c(4, 4, .1, .1))
histogram(rnaDf[-which(rnaDf$tpm %in% outliers),], "tpm")
dist_histogram(rnaDf[-which(rnaDf$tpm %in% outliers),], "tpm")

rnaDf <- rnaDf %>%
  pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  column_to_rownames("specimen_id") 

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
abtiterDf <- read_tsv(here("./data/2021LD_plasma_ab_titer.tsv"), show_col_types = FALSE)%>%
  filter(!isotype %in% "IgE") %>%
  mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
  select(specimen_id, isotype_antigen, MFI_normalised) %>%
  filter(isotype_antigen %in% antigens) %>%
  pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
  column_to_rownames("specimen_id") 
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
olinkDf <- read_tsv(here("./data/2021LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE) %>%
  select(specimen_id, protein_id, protein_expression) %>%
  filter(specimen_id %in% samples & protein_id %in% proteins) %>%
  pivot_wider(names_from = protein_id, values_from = protein_expression) %>%
  column_to_rownames("specimen_id")
dataList <- list()
dataList[["original"]] <- list("rnaseq"= t(rnaDf),
                "abtiter"= t(abtiterDf),
                 "cytof"= t(cellDf), 
                 "olink"= t(olinkDf)
                 )
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
all_samples <- rnaDf %>%
  rownames_to_column("specimen_id") %>%
  select(specimen_id)

mofa_rnaseq <- rnaDf %>% t()

mofa_abtiter <- all_samples %>% 
  left_join(abtiterDf %>% rownames_to_column("specimen_id")) %>%
  column_to_rownames("specimen_id")%>%
  t()

mofa_cytof <- all_samples %>% 
  left_join(cellDf %>% rownames_to_column("specimen_id")) %>%
  column_to_rownames("specimen_id")%>%
  t()

mofa_olink <- all_samples %>% 
  left_join(olinkDf %>% rownames_to_column("specimen_id")) %>%
  column_to_rownames("specimen_id") %>%
  t()
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

dataList[["mofa"]] <- list(
  "rnaseq"= mofa_rnaseq,
  "abtiter"= mofa_abtiter,
   "cytof"= mofa_cytof,
   "olink"= mofa_olink)

dataList$mofa <- lapply(dataList$mofa, as.matrix)

MOFAobject <- create_mofa(dataList$mofa)
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- 15

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject <- run_mofa(MOFAobject, use_basilisk = TRUE)

plot_factor_cor(MOFAobject)

plot_variance_explained(MOFAobject, max_r2=15)

plot_variance_explained(MOFAobject, plot_total = T)[[2]]

na2median <- function(df) {
  df <- data.frame(df)
  df <- t(data.frame(t(df)) %>%
                mutate(across(1:dim(df)[1], ~replace_na(., median(., na.rm=TRUE)))))
  colnames(df) <- gsub("X", "", colnames(df))
  return(as.matrix(df))
}

cellDf_2020 <- read_tsv(here("./data/2020LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)%>%
  select(specimen_id, cell_type_name, percent_live_cell) %>%
  filter(cell_type_name %in% cells) %>%
  pivot_wider(names_from = cell_type_name, values_from = percent_live_cell) %>%
  column_to_rownames("specimen_id") %>%
  t()

fullcellDf <- t(cbind(cellDf_2020, t(cellDf)))

batch <- c(rep('2020', 165), rep('2021', 100))


# impCellDf <- na_mean((fullcellDf))

impCellDf <- na2median((fullcellDf))
cellDf_all <- ComBat(dat=t(impCellDf), batch=batch, par.prior=FALSE)

cellDf_pLonger <-data.frame(cellDf_all) %>% 
    pivot_longer(everything(), names_to = "specimen_id") %>%
    mutate(specimen_id=gsub("X", "", specimen_id))

cellDf_2021 <- cellDf_pLonger %>% filter(specimen_id %in% rownames(cellDf))

print("Cell Freq before Batch Correction")
## [1] "Cell Freq before Batch Correction"
par(mar = c(4, 4, .1, .1))
histogram(cellDf1, "percent_live_cell")
dist_histogram(cellDf1, "percent_live_cell")

print("Cell Freq after Batch Correction")
## [1] "Cell Freq after Batch Correction"
par(mar = c(4, 4, .1, .1))
histogram(cellDf_2021, "value")
dist_histogram(cellDf_2021, "value")

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
colnames(cellDf_all) <- gsub("X", "", colnames(cellDf_all))
cell_BC <- data.frame(t(cellDf_all[, rownames(cellDf)]))
cell_BC$specimen_id <- rownames(cell_BC)

cellDf_BC <- full_join(all_samples, cell_BC, by="specimen_id") %>%
  column_to_rownames("specimen_id")%>%
  t()
dataList[["batchCorrected"]] <- list(
  "rnaseq"= mofa_rnaseq,
  "abtiter"= mofa_abtiter,
   "cytof"= cellDf_BC,
   "olink"= mofa_olink)

dataList$batchCorrected <- lapply(dataList$batchCorrected, as.matrix)

MOFAobject <- create_mofa(dataList$batchCorrected)
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- 15

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject <- run_mofa(MOFAobject, use_basilisk = TRUE)

plot_factor_cor(MOFAobject)

plot_variance_explained(MOFAobject, max_r2=15)

plot_variance_explained(MOFAobject, plot_total = T)[[2]]