Load libraries

suppressPackageStartupMessages({library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
# library('ComplexUpset')
# library("ComplexHeatmap")
library(mice)
# library(VIM)
library(impute)
library(BatchQC)
library(sva)
source("./src/codebase.R")})
dist_histogram <- function(df, col, title){
    ggplot(df, aes_string(x=col)) +
    geom_histogram(aes(y=..density..),      
                   binwidth=.5,
                   colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666", adjust = 3)+
    ggtitle(title)
}

histogram <- function(df, col, title){
  ggplot(df, aes_string(x=col)) +
    geom_histogram(binwidth=.5,
                   colour="black",
                   fill="paleturquoise1")+
    ggtitle(title)
}

1. Loading and filtering Data

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
subject_2020 <- read_tsv(here("./data/2020LD_subject.tsv"), show_col_types = FALSE)
subject_2021 <- read_tsv(here("./data/2021LD_subject.tsv"), show_col_types = FALSE)

specimen_2020 <- read_tsv(here("./data/2020LD_specimen.tsv"), show_col_types = FALSE)
specimen_2021 <- read_tsv(here("./data/2021LD_specimen.tsv"), show_col_types = FALSE)

metaDf_2020 <- specimen_2020 %>% 
              left_join(subject_2020) %>% 
              mutate(timepoint= planned_day_relative_to_boost)
## Joining with `by = join_by(subject_id)`
metaDf_2021 <-  specimen_2021 %>% 
              left_join(subject_2021) %>% 
              mutate(timepoint= planned_day_relative_to_boost)
## Joining with `by = join_by(subject_id)`
metaDf_2021 <- metaDf_2021[metaDf_2021$timepoint %in% c(0, 1, 3, 7, 14),]

batch.factors <- c("timepoint","infancy_vac","biological_sex")
aMeta <- rbind(metaDf_2020, metaDf_2021)
aMeta[batch.factors] <- lapply(aMeta[batch.factors], factor) 

Mitocondria genes

mGenes <- read_tsv(here("./data/raw.githubusercontent.com_CMI-PB_second-challenge-train-dataset-preparation_main_data_gene_90_38_export.tsv"), show_col_types = FALSE) %>%
  # filter(substr(display_label, 1,3) == "MT-")
  data.frame() %>%
  filter(grepl('MT-', display_label)) %>% 
  select(versioned_ensembl_gene_id)

mGenes <- as.vector(mGenes["versioned_ensembl_gene_id"])

RNA seq

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
rna_2020 <- read_tsv(here("./data/2020LD_pbmc_gene_expression.tsv"), 
                     show_col_types = FALSE) %>%
          select(specimen_id, versioned_ensembl_gene_id, tpm) %>%
          filter(!(versioned_ensembl_gene_id %in% mGenes)) %>%
          mutate(versioned_ensembl_gene_id=gsub("\\..*", "", versioned_ensembl_gene_id))
  
rna_2021 <- read_tsv(here("./data/2021LD_pbmc_gene_expression.tsv"), 
                     show_col_types = FALSE) %>%
          select(specimen_id, versioned_ensembl_gene_id, tpm) %>% 
          filter(!(versioned_ensembl_gene_id %in% mGenes)) %>%
          mutate(versioned_ensembl_gene_id=gsub("\\..*", "", versioned_ensembl_gene_id))

tpm_shortlist <- rna_2020 %>%
  mutate(specimen_id = as.integer(specimen_id)) %>%
  group_by(versioned_ensembl_gene_id) %>%
  summarise(proportion = mean(tpm >= 1))  %>%
  filter(proportion >= 0.8)

rna_2020 <- rna_2020 %>%
  filter(versioned_ensembl_gene_id %in% tpm_shortlist$versioned_ensembl_gene_id) %>%
  pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  column_to_rownames("specimen_id") %>%
  select(where(~n_distinct(.) > 1)) 

tpm_shortlist <- rna_2021 %>%
  mutate(specimen_id = as.integer(specimen_id)) %>%
  group_by(versioned_ensembl_gene_id) %>%
  summarise(proportion = mean(tpm >= 1))  %>%
  filter(proportion >= 0.8) 

rna_2021 <- rna_2021 %>%
  filter(versioned_ensembl_gene_id %in% tpm_shortlist$versioned_ensembl_gene_id) %>%
  pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
  column_to_rownames("specimen_id") %>%
  select(where(~n_distinct(.) > 1)) 

rna_features <- intersect(colnames(rna_2020), colnames(rna_2021))

rna_2021 <- rna_2021 %>% 
  as.data.frame() %>%
              select(rna_features) 

rna_2020 <- rna_2020 %>%
  as.data.frame() %>%
              select(rna_features) 

# rownames(rna_2020) <-  gsub("X", "", rownames(rna_2020))
# rownames(rna_2021) <-  gsub("X", "", rownames(rna_2021))

rna_2020_logTrans <- log2(rna_2020+1)
rna_2021_logTrans <- log2(rna_2021+1)

mRna <- melt(rna_2020)
dist_histogram(mRna, "value", "RNA seq Datasets 2020")

outliers <- boxplot(mRna$value, plot=FALSE)$out
dist_histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2020")

mRna <- melt(rna_2020_logTrans)
dist_histogram(mRna, "value", "log2 transformed RNA seq Datasets 2020")

mRna <- melt(rna_2021)
dist_histogram(mRna, "value", "RNA seq Datasets 2021")

outliers <- boxplot(mRna$value, plot=FALSE)$out
dist_histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2021")

mRna <- melt(rna_2021_logTrans)
dist_histogram(mRna, "value", "log2 transformed RNA seq Datasets 2021")

mRna <- melt(rna_2020)
histogram(mRna, "value", "RNA seq Datasets 2020")

outliers <- boxplot(mRna$value, plot=FALSE)$out
histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2020")

mRna <- melt(rna_2020_logTrans)
histogram(mRna, "value", "log2 transformed RNA seq Datasets 2020")

mRna <- melt(rna_2021)
histogram(mRna, "value", "RNA seq Datasets 2021")

outliers <- boxplot(mRna$value, plot=FALSE)$out
histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2021")

mRna <- melt(rna_2021_logTrans)
histogram(mRna, "value", "log2 transformed RNA seq Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rna <- rbind(rna_2020, rna_2021)
aData <- na.omit(rna) %>%
  t()


metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run_rnaseq(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "RNA seq Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "RNA seq Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rna <- rbind(rna_2020_logTrans, rna_2021_logTrans)
aData <- na.omit(rna) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run_rnaseq(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Log transformed RNA seq Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Log transformed RNA seq Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Ab-titer

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
abtiter_2020 <- read_tsv(here("./data/2020LD_plasma_ab_titer.tsv"), 
                         show_col_types = FALSE) %>%
      filter(!isotype %in% "IgE") %>%
      mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
      select(specimen_id, isotype_antigen, MFI_normalised) %>%
      pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
      column_to_rownames("specimen_id") %>%
      select(where(~n_distinct(.) > 1)) 

abtiter_2021 <- read_tsv(here("./data/2021LD_plasma_ab_titer.tsv"), 
                         show_col_types = FALSE) %>%
     filter(!isotype %in% "IgE") %>%
    mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
    select(specimen_id, isotype_antigen, MFI_normalised) %>%
    pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
    column_to_rownames("specimen_id") %>%
    select(where(~n_distinct(.) > 1)) 

abtiter_features <- intersect(colnames(abtiter_2020), colnames(abtiter_2021))
abtiter_2021 <- abtiter_2021 %>%
    select(abtiter_features)

abtiter_2020 <- abtiter_2020 %>%
    select(abtiter_features)

abtiter_2020_logTrans <- log2(abtiter_2020+1)
abtiter_2021_logTrans <- log2(abtiter_2021+1)

mAbtiter <- melt(abtiter_2020)
dist_histogram(mAbtiter, "value", "Abtiter Datasets 2020")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
dist_histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2020")

mAbtiter <- melt(abtiter_2020_logTrans)
dist_histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2020")

mAbtiter <- melt(abtiter_2021)
dist_histogram(mAbtiter, "value", "Abtiter Datasets 2021")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
dist_histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2021")

mAbtiter <- melt(abtiter_2021_logTrans)
dist_histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2021")

mAbtiter <- melt(abtiter_2020)
histogram(mAbtiter, "value", "Abtiter Datasets 2020")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2020")

mAbtiter <- melt(abtiter_2020_logTrans)
histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2020")

mAbtiter <- melt(abtiter_2021)
histogram(mAbtiter, "value", "Abtiter Datasets 2021")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2021")

mAbtiter <- melt(abtiter_2021_logTrans)
histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter <- rbind(abtiter_2020, abtiter_2021)
aData <- na.omit(abtiter) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Ab-titer Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Ab-titer Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter <- rbind(abtiter_2020, abtiter_2021)
aData <- na.omit(abtiter) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Log transformed Ab-titer Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Log transformed Ab-titer Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Cytof

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
cytof_2021 <- read_tsv(here("./data/2021LD_pbmc_cell_frequency.tsv"), 
                       show_col_types = FALSE) %>%
            select(specimen_id, cell_type_name, percent_live_cell) %>%
            pivot_wider(names_from = cell_type_name, 
                        values_from = percent_live_cell) %>%
            column_to_rownames("specimen_id") %>%
            select(where(~n_distinct(.) > 1))%>%
            as.matrix() %>%
            impute.knn() %>%
            .$data

cytof_2020 <- read_tsv(here("./data/2020LD_pbmc_cell_frequency.tsv"), 
                       show_col_types = FALSE) %>%
            select(specimen_id, cell_type_name, percent_live_cell) %>%
            pivot_wider(names_from = cell_type_name, 
                        values_from = percent_live_cell) %>%
            column_to_rownames("specimen_id") %>%
            select(where(~n_distinct(.) > 1))%>%
            as.matrix() %>%
            impute.knn() %>%
            .$data

cell_features <- intersect(colnames(cytof_2020), colnames(cytof_2021))

cytof_2021 <- as.data.frame(cytof_2021) %>%
              select(cell_features)

cytof_2020 <- as.data.frame(cytof_2020) %>%
              select(cell_features)

cytof_medianD0_2021 <- cytof_2021 %>%
  rownames_to_column("specimen_id") %>%
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  pivot_longer(!specimen_id, names_to = "cell_type_name", values_to = "count") %>%
  filter(specimen_id %in% unique(metaDf_2021[metaDf_2021$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
  group_by(cell_type_name)  %>%
  summarise(median = median(count, na.rm = T))

cytof_2021_normalized <- cytof_2021 %>%
    rownames_to_column("specimen_id") %>%
    mutate(specimen_id = as.numeric(specimen_id)) %>%
    pivot_longer(!specimen_id, names_to = "cell_type_name", 
                 values_to = "percent_live_cell") %>%
    left_join(cytof_medianD0_2021) %>%
    mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, 
                                                  NA, percent_live_cell/median))%>%
    select(specimen_id, cell_type_name, percent_live_cell_normalized) %>%
    pivot_wider(names_from = cell_type_name, 
                values_from = percent_live_cell_normalized) %>%
    column_to_rownames("specimen_id")

cytof_medianD0_2020 <- cytof_2020 %>%
  rownames_to_column("specimen_id") %>%
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  pivot_longer(!specimen_id, names_to = "cell_type_name", 
               values_to = "count") %>%
  filter(specimen_id %in% unique(metaDf_2020[metaDf_2020$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
  group_by(cell_type_name)  %>%
  summarise(median = median(count, na.rm = T))

cytof_2020_normalized <- cytof_2020 %>%
    rownames_to_column("specimen_id") %>%
    mutate(specimen_id = as.numeric(specimen_id)) %>%
    pivot_longer(!specimen_id, names_to = "cell_type_name", 
                 values_to = "percent_live_cell") %>%
    left_join(cytof_medianD0_2020) %>%
    mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, 
                                                  NA, percent_live_cell/median))%>%
    select(specimen_id, cell_type_name, percent_live_cell_normalized) %>%
    pivot_wider(names_from = cell_type_name, 
                values_from = percent_live_cell_normalized) %>%
    column_to_rownames("specimen_id")

cytof_2020_sqrtTrans <- sqrt(cytof_2020_normalized)
cytof_2021_sqrtTrans <- sqrt(cytof_2021_normalized)

mCytof <- melt(cytof_2020)
dist_histogram(mCytof, "value", "Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2020_normalized)
dist_histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2020")

mCytof <- melt(cytof_2020_sqrtTrans)
dist_histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2021)
dist_histogram(mCytof, "value", "Cell Frequency Datasets 2021")

mCytof <- melt(cytof_2021_normalized)
dist_histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2021")

mCytof <- melt(cytof_2021_sqrtTrans)
dist_histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2021")


mCytof <- melt(cytof_2020)
histogram(mCytof, "value", "Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2020_normalized)
histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2020")

mCytof <- melt(cytof_2020_sqrtTrans)
histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2021)
histogram(mCytof, "value", "Cell Frequency Datasets 2021")

mCytof <- melt(cytof_2021_normalized)
histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2021")

mCytof <- melt(cytof_2021_sqrtTrans)
histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- rbind(cytof_2020, cytof_2021)
aData <- na.omit(cytof) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- rbind(cytof_2020_normalized, cytof_2021_normalized)
aData <- na.omit(cytof) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Normalized Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Normalized Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- rbind(cytof_2020_sqrtTrans, cytof_2021_sqrtTrans)
aData <- na.omit(cytof) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Square root Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Square root Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Mofa Analysis for Normalized and Transformed Data 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
factor_number <- 15

transformedData <- lapply(dataList_2020[['transformed']], as.matrix)
MOFAobject_2020 <- create_mofa(transformedData)
plot_data_overview(MOFAobject_2020)

data_opts <- get_default_data_options(MOFAobject_2020)
model_opts <- get_default_model_options(MOFAobject_2020)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject_2020)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject_2020 <- prepare_mofa(MOFAobject_2020,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject_2020 <- run_mofa(MOFAobject_2020, use_basilisk=TRUE)

plot_factor_cor(MOFAobject_2020)
plot_variance_explained(MOFAobject_2020, max_r2=factor_number)
plot_variance_explained(MOFAobject_2020, plot_total = T)[[2]]

Mofa Analysis for Normalized and Transformed Data 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
factor_number <- 15

transformedData <- lapply(dataList_2021[['transformed']], as.matrix)
MOFAobject_2021 <- create_mofa(transformedData)
plot_data_overview(MOFAobject_2021)

data_opts <- get_default_data_options(MOFAobject_2021)
model_opts <- get_default_model_options(MOFAobject_2021)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject_2021)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject_2021<- prepare_mofa(MOFAobject_2021,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject_2021 <- run_mofa(MOFAobject_2021, use_basilisk = TRUE)

plot_factor_cor(MOFAobject_2021)
plot_variance_explained(MOFAobject_2021, max_r2=factor_number)
plot_variance_explained(MOFAobject_2021, plot_total = T)[[2]]

Mofa Imputation Dataset 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# MOFAobject_mofaImputed <- impute(MOFAobject_2020, views = "all")
# # samples_metadata(MOFAobject_mofaImputed) <- metaDf_2020_multi
# MOFAobject_mofaImputed
# 
# results <- MOFAobject_mofaImputed@imputed_data

MOFAobject_mofaImputed <- MOFA2::impute(MOFAobject_2020, views = "all")
samples_metadata(MOFAobject_mofaImputed) <- metaDf_2020_multi
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14608 31 22 29 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
results <- MOFAobject_mofaImputed@imputed_data

dataList_2020$mofaImputed <- list()
dataList_2020$mofaImputed[["rnaseq"]] <- results$rnaseq[["group1"]]
dataList_2020$mofaImputed[["abtiter"]] <- results$abtiter[["group1"]]
dataList_2020$mofaImputed[["cytof"]] <- results$cytof[["group1"]]
dataList_2020$mofaImputed[["olink"]] <- results$olink[["group1"]]

factor_number <- 15

MOFAobject <- create_mofa(dataList_2020$mofaImputed)
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject <- run_mofa(MOFAobject, use_basilisk = TRUE)

plot_factor_cor(MOFAobject)
plot_variance_explained(MOFAobject, max_r2=factor_number)
plot_variance_explained(MOFAobject, plot_total = T)[[2]]

Mofa Imputation Dataset 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
MOFAobject_mofaImputed <- MOFA2::impute(MOFAobject_2021, views = "all")
samples_metadata(MOFAobject_mofaImputed) <- metaDf_2021_multi
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14608 31 22 29 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
results <- MOFAobject_mofaImputed@imputed_data

dataList_2021$mofaImputed <- list()
dataList_2021$mofaImputed[["rnaseq"]] <- results$rnaseq[["group1"]]
dataList_2021$mofaImputed[["abtiter"]] <- results$abtiter[["group1"]]
dataList_2021$mofaImputed[["cytof"]] <- results$cytof[["group1"]]
dataList_2021$mofaImputed[["olink"]] <- results$olink[["group1"]]

factor_number <- 15

MOFAobject <- create_mofa(dataList_2020$mofaImputed)
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject <- run_mofa(MOFAobject, use_basilisk = TRUE)

plot_factor_cor(MOFAobject)
plot_variance_explained(MOFAobject, max_r2=factor_number)
plot_variance_explained(MOFAobject, plot_total = T)[[2]]

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
mData <- melt(dataList_2020$mofaImputed[["abtiter"]])
dist_histogram(mData, "value", "Imputed Ab-titer Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["abtiter"]])
dist_histogram(mData, "value", "Imputed Ab-titer Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["cytof"]])
dist_histogram(mData, "value", "Imputed Cell Frequency Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["cytof"]])
dist_histogram(mData, "value", "Imputed Cell Frequency Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["olink"]])
dist_histogram(mData, "value", "Imputed Olink Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["olink"]])
dist_histogram(mData, "value", "Imputed Olink Datasets 2021")


mData <- melt(dataList_2020$mofaImputed[["abtiter"]])
histogram(mData, "value", "Imputed Ab-titer Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["abtiter"]])
histogram(mData, "value", "Imputed Ab-titer Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["cytof"]])
histogram(mData, "value", "Imputed Cell Frequency Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["cytof"]])
histogram(mData, "value", "Imputed Cell Frequency Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["olink"]])
histogram(mData, "value", "Imputed Olink Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["olink"]])
histogram(mData, "value", "Imputed Olink Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
barPlot <- function(df, title){
  ggplot(data=df, aes(x=names, y=values, fill=variables)) +
  geom_bar(stat="identity", position=position_dodge()) +
  # geom_text(aes(label=values), vjust=1.6, color="black",
  #           position = position_dodge(0.9), size=2) +
    theme(axis.text.x = element_text(angle = 45,
                                     vjust = 1,
                                     hjust = 1,
                                     size  = 8)) +
    ggtitle(title)
  
  }

Feature variance chaneg by imputation

Ab-titer 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter_2020_vars <- data.frame()
abtiter_2020_vars <- abtiter_2020_logTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

abtiter_2020_vars <- dataList_2020$mofaImputed[["abtiter"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(abtiter_2020_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(abtiter_2020_vars, "Feature variance chaneg by imputation Ab-titer 2020")

Ab-titer 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter_2021_vars <- data.frame()
abtiter_2021_vars <- abtiter_2021_logTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

abtiter_2021_vars <- dataList_2021$mofaImputed[["abtiter"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(abtiter_2021_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(abtiter_2021_vars, "Feature variance chaneg by imputation Ab-titer 2021")

cytof 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof_2020_vars <- data.frame()
cytof_2020_vars <- cytof_2020_sqrtTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

cytof_2020_vars <- dataList_2020$mofaImputed[["cytof"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(cytof_2020_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(cytof_2020_vars, "Feature variance chaneg by imputation Cell Frequency 2020")

cytof 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof_2021_vars <- data.frame()
cytof_2021_vars <- cytof_2021_sqrtTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

cytof_2021_vars <- dataList_2021$mofaImputed[["cytof"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(cytof_2021_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(cytof_2021_vars, "Feature variance chaneg by imputation Cell Frequency 2021")

#### olink 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
olink_2020_vars <- data.frame()
olink_2020_vars <- olink_2020_logTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

olink_2020_vars <- dataList_2020$mofaImputed[["olink"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(olink_2020_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(olink_2020_vars, "Feature variance chaneg by imputation Olink 2020")

RNAseq Batch Correction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

bcMeta <- rbind(metaDf_2020_multi, metaDf_2021_multi) %>%
  column_to_rownames("sample")

batchCorrected_list <- list()

rna <- cbind(dataList_2020$mofaImputed[["rnaseq"]], dataList_2021$mofaImputed[["rnaseq"]])
batch <- c(rep('2020', dim(multiome_2020_backbone)[1]), rep('2021', dim(multiome_2021_backbone)[1]))
aData <- ComBat(rna, batch = batch)

batchCorrected_list[["rnaseq"]] <- aData

phenoData = AnnotatedDataFrame(data = bcMeta)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run_rnaseq(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Batch Corrected RNA Seq Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Batch Corrected RNA Seq Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Ab-titer Batch Correction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

abtiter <- cbind(dataList_2020$mofaImputed[["abtiter"]], dataList_2021$mofaImputed[["abtiter"]])
batch <- c(rep('2020', dim(multiome_2020_backbone)[1]), rep('2021', dim(multiome_2021_backbone)[1]))
aData <- ComBat(abtiter, batch = batch)

batchCorrected_list[["abtiter"]] <- aData

phenoData = AnnotatedDataFrame(data = bcMeta)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Batch Corrected Ab-titer Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Batch Corrected Ab-titer Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Cytof Batch Correction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- cbind(dataList_2020$mofaImputed[["cytof"]], dataList_2021$mofaImputed[["cytof"]])
batch <- c(rep('2020', dim(multiome_2020_backbone)[1]), rep('2021', dim(multiome_2021_backbone)[1]))
aData <- ComBat(cytof, batch = batch)

batchCorrected_list[["cytof"]] <- aData

phenoData = AnnotatedDataFrame(data = bcMeta)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Batch Corrected Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Batch Corrected Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)