Load libraries

suppressPackageStartupMessages({library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
# library('ComplexUpset')
# library("ComplexHeatmap")
library(mice)
# library(VIM)
library(impute)
library(BatchQC)
library(sva)
source(here("./src/codebase.R"))})
dist_histogram <- function(df, col, title){
    ggplot(df, aes_string(x=col)) +
    geom_histogram(aes(y=..density..),      
                   binwidth=.5,
                   colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666", adjust = 3)+
    ggtitle(title)
}

histogram <- function(df, col, title){
  ggplot(df, aes_string(x=col)) +
    geom_histogram(binwidth=.5,
                   colour="black",
                   fill="paleturquoise1")+
    ggtitle(title)
}

1. Loading and filtering Data

master_data <- readRDS(here("./data/master_processed_training_data_CMI_PB.RDS"))

# master_data <- readRDS(here("./data/master_processed_training_data.RDS"))
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

metaDf_2020 <- master_data$subject_specimen %>%
  filter(dataset=="2020_dataset") 
# %>%
#   mutate(as.character(specimen_id))

metaDf_2021 <-  master_data$subject_specimen %>%
  filter(dataset=="2021_dataset") 
# %>%
#   mutate(as.character(specimen_id))

metaDf_2020 <- metaDf_2020[metaDf_2020$timepoint %in% c(0, 1, 3, 7, 14),]
metaDf_2021 <- metaDf_2021[metaDf_2021$timepoint %in% c(0, 1, 3, 7, 14),]

batch.factors <- c("timepoint","infancy_vac","biological_sex")
aMeta <- rbind(metaDf_2020, metaDf_2021) %>%
    mutate(specimen_id = as.numeric(specimen_id))
aMeta[batch.factors] <- lapply(aMeta[batch.factors], factor) 

RNA seq

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

geneExpDf <- master_data$pbmc_gene_expression$raw_data %>% 
  t() %>%
  data.frame() %>% 
  rownames_to_column("specimen_id")  %>%
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  column_to_rownames("specimen_id") 


rna_2020 <- geneExpDf[rownames(geneExpDf) %in% (metaDf_2020$specimen_id), ]
colnames(rna_2020) <- gsub("\\..*", "", colnames(rna_2020))

rna_2021 <- geneExpDf[rownames(geneExpDf) %in% (metaDf_2021$specimen_id), ]
colnames(rna_2021) <- gsub("\\..*", "", colnames(rna_2021))


rna_2020_logTrans <- log2(rna_2020+1)
rna_2021_logTrans <- log2(rna_2021+1)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
mRna <- melt(rna_2020)
dist_histogram(mRna, "value", "RNA seq Datasets 2020")

outliers <- boxplot(mRna$value, plot=FALSE)$out
dist_histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2020")

mRna <- melt(rna_2020_logTrans)
dist_histogram(mRna, "value", "log2 transformed RNA seq Datasets 2020")

mRna <- melt(rna_2021)
dist_histogram(mRna, "value", "RNA seq Datasets 2021")

outliers <- boxplot(mRna$value, plot=FALSE)$out
dist_histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2021")

mRna <- melt(rna_2021_logTrans)
dist_histogram(mRna, "value", "log2 transformed RNA seq Datasets 2021")

mRna <- melt(rna_2020)
histogram(mRna, "value", "RNA seq Datasets 2020")

outliers <- boxplot(mRna$value, plot=FALSE)$out
histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2020")

mRna <- melt(rna_2020_logTrans)
histogram(mRna, "value", "log2 transformed RNA seq Datasets 2020")

mRna <- melt(rna_2021)
histogram(mRna, "value", "RNA seq Datasets 2021")

outliers <- boxplot(mRna$value, plot=FALSE)$out
histogram(mRna[-which(mRna$value %in% outliers),], "value", 
               "RNA seq Outlier Removed Datasets 2021")

mRna <- melt(rna_2021_logTrans)
histogram(mRna, "value", "log2 transformed RNA seq Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rna <- rbind(rna_2020, rna_2021)
aData <- na.omit(rna) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData), c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run_rnaseq(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
source(here("./src/codebase.R"))
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "RNA seq Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "RNA seq Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rna <- rbind(rna_2020_logTrans, rna_2021_logTrans)
aData <- na.omit(rna) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData), c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run_rnaseq(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Log transformed RNA seq Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Log transformed RNA seq Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Ab-titer

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

abtiterDf <- master_data$abtiter_wide$raw_data %>% 
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  column_to_rownames("specimen_id")

abtiter_2020 <- abtiterDf[rownames(abtiterDf) %in% (metaDf_2020$specimen_id), ] %>%
            select(where(~n_distinct(.) > 1))%>%
            as.matrix() %>%
            impute.knn() %>%
            .$data %>%
            data.frame()

abtiter_2021 <- abtiterDf[rownames(abtiterDf) %in% (metaDf_2021$specimen_id), ] %>%
            select(where(~n_distinct(.) > 1))%>%
            as.matrix() %>%
            impute.knn() %>%
            .$data %>%
            data.frame()


abtiter_2020_logTrans <- log2(abtiter_2020+1)
abtiter_2021_logTrans <- log2(abtiter_2021+1)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

mAbtiter <- melt(abtiter_2020)
dist_histogram(mAbtiter, "value", "Abtiter Datasets 2020")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
dist_histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2020")

mAbtiter <- melt(abtiter_2020_logTrans)
dist_histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2020")

mAbtiter <- melt(abtiter_2021)
dist_histogram(mAbtiter, "value", "Abtiter Datasets 2021")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
dist_histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2021")

mAbtiter <- melt(abtiter_2021_logTrans)
dist_histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2021")

mAbtiter <- melt(abtiter_2020)
histogram(mAbtiter, "value", "Abtiter Datasets 2020")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2020")

mAbtiter <- melt(abtiter_2020_logTrans)
histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2020")

mAbtiter <- melt(abtiter_2021)
histogram(mAbtiter, "value", "Abtiter Datasets 2021")

outliers <- boxplot(mAbtiter$value, plot=FALSE)$out
histogram(mAbtiter[-which(mAbtiter$value %in% outliers),], "value", 
               "Abtiter Outlier Removed Datasets 2021")

mAbtiter <- melt(abtiter_2021_logTrans)
histogram(mAbtiter, "value", "log2 transformed Abtiter Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter <- rbind(abtiter_2020, abtiter_2021)
aData <- na.omit(abtiter) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Ab-titer Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Ab-titer Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter <- rbind(abtiter_2020_logTrans, abtiter_2021_logTrans)
aData <- na.omit(abtiter) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Log transformed Ab-titer Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Log transformed Ab-titer Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Cytof

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

cellFreqDf <- master_data$pbmc_cell_frequency$raw_data %>% 
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  column_to_rownames("specimen_id")

cytof_2021 <- cellFreqDf[rownames(cellFreqDf) %in% (metaDf_2021$specimen_id), ] %>%
            select(where(~n_distinct(.) > 1))%>%
            as.matrix() %>%
            impute.knn() %>%
            .$data %>%
            data.frame()

cytof_2020 <- cellFreqDf[rownames(cellFreqDf) %in% (metaDf_2020$specimen_id), ] %>%
            select(where(~n_distinct(.) > 1))%>%
            as.matrix() %>%
            impute.knn() %>%
            .$data %>%
            data.frame()


cytof_medianD0_2021 <- cytof_2021 %>%
  rownames_to_column("specimen_id") %>%
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  pivot_longer(!specimen_id, names_to = "cell_type_name", values_to = "count") %>%
  filter(specimen_id %in% unique(metaDf_2021[metaDf_2021$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
  group_by(cell_type_name)  %>%
  summarise(median = median(count, na.rm = T))

cytof_2021_normalized <- cytof_2021 %>%
    rownames_to_column("specimen_id") %>%
    mutate(specimen_id = as.numeric(specimen_id)) %>%
    pivot_longer(!specimen_id, names_to = "cell_type_name", 
                 values_to = "percent_live_cell") %>%
    left_join(cytof_medianD0_2021) %>%
    mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, 
                                                  NA, percent_live_cell/median))%>%
    select(specimen_id, cell_type_name, percent_live_cell_normalized) %>%
    pivot_wider(names_from = cell_type_name, 
                values_from = percent_live_cell_normalized) %>%
    column_to_rownames("specimen_id")

cytof_medianD0_2020 <- cytof_2020 %>%
  rownames_to_column("specimen_id") %>%
  mutate(specimen_id = as.numeric(specimen_id)) %>%
  pivot_longer(!specimen_id, names_to = "cell_type_name", 
               values_to = "count") %>%
  filter(specimen_id %in% unique(metaDf_2020[metaDf_2020$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
  group_by(cell_type_name)  %>%
  summarise(median = median(count, na.rm = T))

cytof_2020_normalized <- cytof_2020 %>%
    rownames_to_column("specimen_id") %>%
    mutate(specimen_id = as.numeric(specimen_id)) %>%
    pivot_longer(!specimen_id, names_to = "cell_type_name", 
                 values_to = "percent_live_cell") %>%
    left_join(cytof_medianD0_2020) %>%
    mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, 
                                                  NA, percent_live_cell/median))%>%
    select(specimen_id, cell_type_name, percent_live_cell_normalized) %>%
    pivot_wider(names_from = cell_type_name, 
                values_from = percent_live_cell_normalized) %>%
    column_to_rownames("specimen_id")

cytof_2020_sqrtTrans <- sqrt(cytof_2020_normalized)
cytof_2021_sqrtTrans <- sqrt(cytof_2021_normalized)
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
mCytof <- melt(cytof_2020)
dist_histogram(mCytof, "value", "Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2020_normalized)
dist_histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2020")

mCytof <- melt(cytof_2020_sqrtTrans)
dist_histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2021)
dist_histogram(mCytof, "value", "Cell Frequency Datasets 2021")

mCytof <- melt(cytof_2021_normalized)
dist_histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2021")

mCytof <- melt(cytof_2021_sqrtTrans)
dist_histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2021")


mCytof <- melt(cytof_2020)
histogram(mCytof, "value", "Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2020_normalized)
histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2020")

mCytof <- melt(cytof_2020_sqrtTrans)
histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2020")

mCytof <- melt(cytof_2021)
histogram(mCytof, "value", "Cell Frequency Datasets 2021")

mCytof <- melt(cytof_2021_normalized)
histogram(mCytof, "value", 
               "Cell Frequency Normalized Datasets 2021")

mCytof <- melt(cytof_2021_sqrtTrans)
histogram(mCytof, "value", "Square root transformed Cell Frequency Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- rbind(cytof_2020, cytof_2021)
aData <- na.omit(cytof) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- rbind(cytof_2020_normalized, cytof_2021_normalized)
aData <- na.omit(cytof) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Normalized Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Normalized Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- rbind(cytof_2020_sqrtTrans, cytof_2021_sqrtTrans)
aData <- na.omit(cytof) %>%
  t()

metaData <- aMeta[aMeta$specimen_id %in% colnames(aData),c("dataset", "specimen_id", batch.factors)]
rownames(metaData) <- metaData$specimen_id

phenoData = AnnotatedDataFrame(data = metaData)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Square root Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Square root Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Mofa Analysis for Normalized and Transformed Data 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
factor_number <- 15

transformedData <- lapply(dataList_2020[['transformed']], as.matrix)
MOFAobject_2020 <- create_mofa(transformedData)
plot_data_overview(MOFAobject_2020)

data_opts <- get_default_data_options(MOFAobject_2020)
model_opts <- get_default_model_options(MOFAobject_2020)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject_2020)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject_2020 <- prepare_mofa(MOFAobject_2020,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject_2020 <- run_mofa(MOFAobject_2020, outfile=here("./results/MOFA2_2ndChallenge_2020.hdf5"), use_basilisk=TRUE)

plot_factor_cor(MOFAobject_2020)
plot_variance_explained(MOFAobject_2020, max_r2=factor_number)
plot_variance_explained(MOFAobject_2020, plot_total = T)[[2]]

mofaVariance <- data.frame(MOFAobject_2020@cache$variance_explained$r2_total$group1)%>%
  rename_all(~c("values")) %>%
  rownames_to_column("experiment") %>%
  mutate(dataset=rep("2020", 4)) %>%
  mutate(variables=rep('original', 4))

Mofa Analysis for Normalized and Transformed Data 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
factor_number <- 15

transformedData <- lapply(dataList_2021[['transformed']], as.matrix)
MOFAobject_2021 <- create_mofa(transformedData)
plot_data_overview(MOFAobject_2021)

data_opts <- get_default_data_options(MOFAobject_2021)
model_opts <- get_default_model_options(MOFAobject_2021)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject_2021)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject_2021<- prepare_mofa(MOFAobject_2021,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject_2021 <- run_mofa(MOFAobject_2021, outfile=here("./results/MOFA2_2ndChallenge_2021.hdf5"), use_basilisk = TRUE)

plot_factor_cor(MOFAobject_2021)
plot_variance_explained(MOFAobject_2021, max_r2=factor_number)
plot_variance_explained(MOFAobject_2021, plot_total = T)[[2]]

varDf <- data.frame(MOFAobject_2021@cache$variance_explained$r2_total$group1)%>%
  rename_all(~c("values")) %>%
  rownames_to_column("experiment") %>%
  mutate(dataset=rep("2021", 4)) %>%
  mutate(variables=rep('original', 4))

mofaVariance <- rbind(mofaVariance, varDf)

Mofa Imputation Dataset 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# MOFAobject_mofaImputed <- impute(MOFAobject_2020, views = "all")
# # samples_metadata(MOFAobject_mofaImputed) <- metaDf_2020_multi
# MOFAobject_mofaImputed
# 
# results <- MOFAobject_mofaImputed@imputed_data

MOFAobject_mofaImputed <- MOFA2::impute(MOFAobject_2020, views = "all")
samples_metadata(MOFAobject_mofaImputed) <- metaDf_2020_multi
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 8242 27 22 29 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
results <- MOFAobject_mofaImputed@imputed_data

dataList_2020$mofaImputed <- list()
dataList_2020$mofaImputed[["rnaseq"]] <- results$rnaseq[["group1"]]
dataList_2020$mofaImputed[["abtiter"]] <- results$abtiter[["group1"]]
dataList_2020$mofaImputed[["cytof"]] <- results$cytof[["group1"]]
dataList_2020$mofaImputed[["olink"]] <- results$olink[["group1"]]

factor_number <- 15

MOFAobject <- create_mofa(dataList_2020$mofaImputed)
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject <- run_mofa(MOFAobject, outfile=here("./results/MOFA2_2ndChallenge_2020_imputed.hdf5"), use_basilisk = TRUE)

plot_factor_cor(MOFAobject)
plot_variance_explained(MOFAobject, max_r2=factor_number)
plot_variance_explained(MOFAobject, plot_total = T)[[2]]

varDf <- data.frame(MOFAobject_2021@cache$variance_explained$r2_total$group1)%>%
  rename_all(~c("values")) %>%
  rownames_to_column("experiment") %>%
  mutate(dataset=rep("2020", 4)) %>%
  mutate(variables=rep('imputed', 4))

mofaVariance <- rbind(mofaVariance, varDf)

Mofa Imputation Dataset 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
MOFAobject_mofaImputed <- MOFA2::impute(MOFAobject_2021, views = "all")
samples_metadata(MOFAobject_mofaImputed) <- metaDf_2021_multi
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 8242 27 22 29 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
results <- MOFAobject_mofaImputed@imputed_data

dataList_2021$mofaImputed <- list()
dataList_2021$mofaImputed[["rnaseq"]] <- results$rnaseq[["group1"]]
dataList_2021$mofaImputed[["abtiter"]] <- results$abtiter[["group1"]]
dataList_2021$mofaImputed[["cytof"]] <- results$cytof[["group1"]]
dataList_2021$mofaImputed[["olink"]] <- results$olink[["group1"]]

factor_number <- 15

MOFAobject <- create_mofa(dataList_2020$mofaImputed)
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- factor_number

train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42

MOFAobject <- prepare_mofa(MOFAobject,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

MOFAobject <- run_mofa(MOFAobject, outfile=here("./results/MOFA2_2ndChallenge_2021_imputed.hdf5"), use_basilisk = TRUE)

plot_factor_cor(MOFAobject)
plot_variance_explained(MOFAobject, max_r2=factor_number)
plot_variance_explained(MOFAobject, plot_total = T)[[2]]

varDf <- data.frame(MOFAobject_2021@cache$variance_explained$r2_total$group1)%>%
  rename_all(~c("values")) %>%
  rownames_to_column("experiment") %>%
  mutate(dataset=rep("2021", 4)) %>%
  mutate(variables=rep('imputed', 4))

mofaVariance <- rbind(mofaVariance, varDf)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
mData <- melt(dataList_2020$mofaImputed[["abtiter"]])
dist_histogram(mData, "value", "Imputed Ab-titer Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["abtiter"]])
dist_histogram(mData, "value", "Imputed Ab-titer Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["cytof"]])
dist_histogram(mData, "value", "Imputed Cell Frequency Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["cytof"]])
dist_histogram(mData, "value", "Imputed Cell Frequency Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["olink"]])
dist_histogram(mData, "value", "Imputed Olink Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["olink"]])
dist_histogram(mData, "value", "Imputed Olink Datasets 2021")


mData <- melt(dataList_2020$mofaImputed[["abtiter"]])
histogram(mData, "value", "Imputed Ab-titer Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["abtiter"]])
histogram(mData, "value", "Imputed Ab-titer Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["cytof"]])
histogram(mData, "value", "Imputed Cell Frequency Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["cytof"]])
histogram(mData, "value", "Imputed Cell Frequency Datasets 2021")

mData <- melt(dataList_2020$mofaImputed[["olink"]])
histogram(mData, "value", "Imputed Olink Datasets 2020")

mData <- melt(dataList_2021$mofaImputed[["olink"]])
histogram(mData, "value", "Imputed Olink Datasets 2021")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
barPlot <- function(df, title){
  ggplot(data=df, aes(x=names, y=values, fill=variables)) +
  geom_bar(stat="identity", position=position_dodge()) +
  # geom_text(aes(label=values), vjust=1.6, color="black",
  #           position = position_dodge(0.9), size=2) +
    theme(axis.text.x = element_text(angle = 45,
                                     vjust = 1,
                                     hjust = 1,
                                     size  = 8)) +
    ggtitle(title)
  
  }

Feature variance chaneg by imputation

Ab-titer 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter_2020_vars <- data.frame()
abtiter_2020_vars <- abtiter_2020_logTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

abtiter_2020_vars <- dataList_2020$mofaImputed[["abtiter"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(abtiter_2020_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(abtiter_2020_vars, "Feature variance chaneg by imputation Ab-titer 2020")

Ab-titer 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
abtiter_2021_vars <- data.frame()
abtiter_2021_vars <- abtiter_2021_logTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

abtiter_2021_vars <- dataList_2021$mofaImputed[["abtiter"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(abtiter_2021_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(abtiter_2021_vars, "Feature variance chaneg by imputation Ab-titer 2021")

cytof 2020

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof_2020_vars <- data.frame()
cytof_2020_vars <- cytof_2020_sqrtTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

cytof_2020_vars <- dataList_2020$mofaImputed[["cytof"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(cytof_2020_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(cytof_2020_vars, "Feature variance chaneg by imputation Cell Frequency 2020")

cytof 2021

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof_2021_vars <- data.frame()
cytof_2021_vars <- cytof_2021_sqrtTrans %>% 
  t() %>%
  data.frame() %>%
  mutate(original= rowVars(.)) %>% 
  select("original") %>% 
  # mutate_all(funs(round(.,0))) %>%
  rownames_to_column("names")

cytof_2021_vars <- dataList_2021$mofaImputed[["cytof"]] %>%
  data.frame() %>%
  mutate(imputed= rowVars(.)) %>%
  select("imputed") %>%
  rownames_to_column("names") %>%
  left_join(cytof_2021_vars) %>%
  pivot_longer(!names, names_to = "variables", values_to = "values") %>%
  data.frame()

barPlot(cytof_2021_vars, "Feature variance chaneg by imputation Cell Frequency 2021")

variance of all values

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
ab2020_org <- abtiter_2020_logTrans %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

ab2021_org <- abtiter_2021_logTrans %>% 
              # data.frame() %>%
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

ab2020_imp <- dataList_2020$mofaImputed[["abtiter"]] %>% 
              data.frame() %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

ab2021_imp <- dataList_2021$mofaImputed[["abtiter"]] %>% 
              data.frame() %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

cytof2020_org <- cytof_2020_sqrtTrans %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

cytof2021_org <- cytof_2021_sqrtTrans %>% 
              # data.frame() %>%
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

cytof2020_imp <- dataList_2020$mofaImputed[["cytof"]] %>% 
              data.frame() %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

cytof2021_imp <- dataList_2021$mofaImputed[["cytof"]] %>% 
              data.frame() %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

olink2020_org <- olink_2020_logTrans %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

olink2021_org <- olink_2021_logTrans %>% 
              # data.frame() %>%
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

olink2020_imp <- dataList_2020$mofaImputed[["olink"]] %>% 
              data.frame() %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

olink2021_imp <- dataList_2021$mofaImputed[["olink"]] %>% 
              data.frame() %>% 
              pivot_longer(cols = everything(), 
              names_to = 'samples', 
              values_to = "values") %>% 
              summarise(var(values, na.rm = TRUE)) %>%
              as.vector()

varDf <- data.frame(experiment=c(rep("abtiter", 4), rep("cytof", 4), rep("olink", 4)),
                    variables=c(rep(c(rep('original', 2), rep('imputed', 2)), 3)),
                    dataset=c(rep(c("2020", "2021"), 6)),
                    values=c(ab2020_org[[1]], ab2021_org[[1]], ab2020_imp[[1]], ab2021_imp[[1]],
                             cytof2020_org[[1]], cytof2021_org[[1]], cytof2020_imp[[1]], cytof2021_imp[[1]],
                             olink2020_org[[1]], olink2021_org[[1]], olink2020_imp[[1]], olink2021_imp[[1]]))

barPlot <- function(df, title){
  ggplot(data=df, aes(x=names, y=values, fill=variables)) +
  geom_bar(stat="identity", position=position_dodge()) +
  geom_text(aes(label=values), vjust=0, color="black",
            position = position_dodge(0.9), size=4) +
    theme(axis.text.x = element_text(angle = 45,
                                     vjust = 1,
                                     hjust = 1,
                                     size  = 8)) +
    ggtitle(title)}

varDf <- varDf %>%
  mutate(across(c('values'),round, 2)) %>%
  unite(names, c("experiment", "dataset"))

barPlot(varDf, "Experiments variance chaneg by imputation")

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
varDf <- mofaVariance %>% 
  mutate(across(c('values'),round, 1)) %>%
  unite(names, c("experiment", "dataset"))

barPlot(varDf, "MOFA variance Explained chaneg by imputation ")

RNAseq Batch Correction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

bcMeta <- rbind(metaDf_2020_multi, metaDf_2021_multi) %>%
  column_to_rownames("sample")

batchCorrected_list <- list()

rna <- cbind(dataList_2020$mofaImputed[["rnaseq"]], dataList_2021$mofaImputed[["rnaseq"]])
batch <- c(rep('2020', dim(multiome_2020_backbone)[1]), rep('2021', dim(multiome_2021_backbone)[1]))
aData <- ComBat(rna, batch = batch)

batchCorrected_list[["rnaseq"]] <- aData

phenoData = AnnotatedDataFrame(data = bcMeta)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run_rnaseq(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
bcMeta$specimen_id <- rownames(bcMeta) 
metaData <- bcMeta[bcMeta$specimen_id %in% colnames(aData), c("dataset", "specimen_id", batch.factors)] %>%
  mutate(specimen_id = as.numeric(specimen_id))


plot_pvca(pvcaObj_after, "Batch Corrected RNA Seq Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Batch Corrected RNA Seq Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Ab-titer Batch Correction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

abtiter <- cbind(dataList_2020$mofaImputed[["abtiter"]], dataList_2021$mofaImputed[["abtiter"]])
batch <- c(rep('2020', dim(multiome_2020_backbone)[1]), rep('2021', dim(multiome_2021_backbone)[1]))
aData <- ComBat(abtiter, batch = batch)

batchCorrected_list[["abtiter"]] <- aData

phenoData = AnnotatedDataFrame(data = bcMeta)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Batch Corrected Ab-titer Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Batch Corrected Ab-titer Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)

Cytof Batch Correction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
cytof <- cbind(dataList_2020$mofaImputed[["cytof"]], dataList_2021$mofaImputed[["cytof"]])
batch <- c(rep('2020', dim(multiome_2020_backbone)[1]), rep('2021', dim(multiome_2021_backbone)[1]))
aData <- ComBat(cytof, batch = batch)

batchCorrected_list[["cytof"]] <- aData

phenoData = AnnotatedDataFrame(data = bcMeta)
exprset = ExpressionSet(as.matrix(aData[, sampleNames(phenoData)]),
                          phenoData = phenoData)
exprs(exprset)

pvcaObj_after <- pvca_run(exprset, batch.factors = c("dataset", batch.factors), threshold =0.6)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
plot_pvca(pvcaObj_after, "Batch Corrected Cell Frequency Datasets 2020 and 2021")

pca_plots = plot_pca(aData, metaData, "Batch Corrected Cell Frequency Datasets 2020 and 2021")
  plot(pca_plots$PC12)
  plot(pca_plots$PC13)
  plot(pca_plots$PC23)