Load libraries

suppressPackageStartupMessages({library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)
library(impute)
library(pvca)
library(Biobase)
library(lme4)})

Find features in other datasets

cells <- read_tsv(here("./data/2021LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)
cells <- levels(factor(cells$cell_type_name)) 

antigens <- read_tsv(here("./data/2021LD_plasma_ab_titer.tsv"), show_col_types = FALSE)
antigens <- levels(factor(paste(antigens$isotype, antigens$antigen, sep = "_")))

genes <- read_tsv(here("./data/2021LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
genes <- levels(factor(gsub("\\..*", "", genes$versioned_ensembl_gene_id)))

proteins <- read_tsv(here("./data/2021LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
proteins <- levels(factor(proteins$protein_id))

Load Meta Data

metaDf <- read_tsv(here("./data/2020LD_subject.tsv"), show_col_types = FALSE)
specimen <- read_tsv(here("./data/2020LD_specimen.tsv"), show_col_types = FALSE)
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units="weeks")/52, 2))
metaDf <- merge(metaDf, specimen, by="subject_id")
metaDf["timepoint"] <- metaDf["planned_day_relative_to_boost"]

metaDf <- metaDf[metaDf$timepoint %in% c(0, 1, 3, 7, 14), ]

samples <- levels(factor(metaDf$specimen_id))
subjects <- levels(factor(metaDf$subject_id))
print(paste("Number of samples:", length(samples)))
## [1] "Number of samples: 296"
print(paste("Number of subjects:", length(subjects)))
## [1] "Number of subjects: 60"

Load Data

cellDf <- read_tsv(here("./data/2020LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)

cellDf <- cellDf[cellDf$specimen_id %in% samples,]

cellDf <- cellDf[cellDf$cell_type_name %in% cells, ]
cellDf <- as.data.frame(pivot_wider(cellDf, names_from = "cell_type_name", 
                  values_from=c("percent_live_cell")))
row.names(cellDf) <- cellDf$specimen_id
subject_num <- length(levels(factor(metaDf[metaDf$specimen_id, "subject_id"])))

cellDf <- t(cellDf[,names(cellDf)!="specimen_id"])
# cellDf <- na.omit(cellDf)
cellDf <- cellDf[rowVars(cellDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(cellDf)) > 0))
compSamples <- names(which(colSums(is.na(cellDf)) == 0))
print(paste("Cell Frequency Incomplete Sample Cases:", length(incompSamples)))
## [1] "Cell Frequency Incomplete Sample Cases: 17"
print(paste("Cell Frequency Complete Sample Cases:", length(compSamples)))
## [1] "Cell Frequency Complete Sample Cases: 83"
print(paste("Cell Frequency Feature Number:", dim(cellDf)[1]))
## [1] "Cell Frequency Feature Number: 22"
incompFeature <- names(which(rowSums(is.na(cellDf)) > 0))
print(paste("Cell Frequency Incomplete Feature Numbers:", length(incompFeature)))
## [1] "Cell Frequency Incomplete Feature Numbers: 1"
print(c("Cell Frequency Incomplete Feature:", incompFeature))
## [1] "Cell Frequency Incomplete Feature:" "ASCs (Plasmablasts)"
for (f in incompFeature){
  subDf <- data.frame(cellDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}
## [1] "ASCs (Plasmablasts) Number of Incomplete Samples:"
## [1] 17
## [1] "ASCs (Plasmablasts) Incomplete Samples:"
##  [1] "281" "349" "350" "351" "352" "353" "376" "377" "378" "379" "380" "419"
## [13] "420" "421" "422" "423" "131"
## [1] "ASCs (Plasmablasts) Number of Incomplete Subjects:"
## [1] 5
## [1] "ASCs (Plasmablasts) Incomplete Subjects:"
## [1] "17" "36" "45" "49" "55"
print("Cell Frequency Incomplete Samples:") 
## [1] "Cell Frequency Incomplete Samples:"
print(incompSamples)
##  [1] "281" "349" "350" "351" "352" "353" "376" "377" "378" "379" "380" "419"
## [13] "420" "421" "422" "423" "131"
print("Cell Frequency Incomplete Subjects:")
## [1] "Cell Frequency Incomplete Subjects:"
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
## [1] "17" "36" "45" "49" "55"
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
## [1] "0"  "1"  "3"  "7"  "14"
incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Cell Frequency", s, "Timepoints of Incomplete Subjects:"))
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}
## [1] "Cell Frequency 17 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Frequency 36 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Frequency 45 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 49 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 55 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
print(paste("Cell Frequency Number of Incomplete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
## [1] "Cell Frequency Number of Incomplete Subjects: 5"
print(paste("Cell Frequency Number of Complete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))
## [1] "Cell Frequency Number of Complete Subjects: 17"
print(paste("Cell Frequency Number of All Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(cellDf), "subject_id"])))))
## [1] "Cell Frequency Number of All Subjects: 20"
abtiterDf <- read_tsv(here("./data/2020LD_plasma_ab_titer.tsv"), show_col_types = FALSE)
abtiterDf <- abtiterDf[abtiterDf$specimen_id %in% samples, ]

abtiterDf["antigen"] <- paste(abtiterDf$isotype, abtiterDf$antigen, sep = "_")

abtiterDf <- abtiterDf[abtiterDf$antigen %in% antigens, c("specimen_id", "antigen", "MFI_normalised")]
abtiterDf <- as.data.frame(pivot_wider(abtiterDf, names_from = "antigen",
                  values_from=c("MFI_normalised")))
row.names(abtiterDf) <- abtiterDf$specimen_id
abtiterDf <- t(abtiterDf[, names(abtiterDf)!="specimen_id"])

abtiterDf <- abtiterDf[rowVars(abtiterDf, na.rm = TRUE)>0, ]
incompSamples <- names(which(colSums(is.na(abtiterDf)) > 0))
compSamples <- names(which(colSums(is.na(abtiterDf)) == 0))

print(paste("Ab Titer Incomplete Sample Cases:", length(incompSamples)))
## [1] "Ab Titer Incomplete Sample Cases: 0"
print(paste("Ab Titer Complete Sample Cases:", length(compSamples)))
## [1] "Ab Titer Complete Sample Cases: 274"
print(paste("Ab Titer Feature Number:", dim(abtiterDf)[1]))
## [1] "Ab Titer Feature Number: 31"
incompFeature <- names(which(rowSums(is.na(abtiterDf)) > 0))
print(paste("Ab Titer Incomplete Feature Numbers:", length(incompFeature)))
## [1] "Ab Titer Incomplete Feature Numbers: 0"
print(c("Ab Titer Incomplete Feature:", incompFeature))
## [1] "Ab Titer Incomplete Feature:"
for (f in incompFeature){
  subDf <- data.frame(abtiterDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Ab Titer Incomplete Samples:") 
## [1] "Ab Titer Incomplete Samples:"
print(incompSamples)
## character(0)
print("Ab Titer Incomplete Subjects:")
## [1] "Ab Titer Incomplete Subjects:"
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
## character(0)
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
## character(0)
incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Ab Titer", s, "Timepoints of Incomplete Subjects:"))
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}


print(paste("Ab Titer Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
## [1] "Ab Titer Number of Incomplete Subjects: 0"
print(paste("Ab Titer Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))
## [1] "Ab Titer Number of Complete Subjects: 58"
print(paste("Ab Titer Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(abtiterDf), "subject_id"])))))
## [1] "Ab Titer Number of All Subjects: 58"
rnaDf <- read_tsv(here("./data/2020LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
rnaDf <- rnaDf[rnaDf$specimen_id %in% samples, ]

rnaDf$versioned_ensembl_gene_id <- gsub("\\..*", "", rnaDf$versioned_ensembl_gene_id)
# print(length(levels(factor(rnaDf$versioned_ensembl_gene_id))))

rnaDf <- rnaDf[rnaDf$versioned_ensembl_gene_id %in% genes, c("specimen_id", "versioned_ensembl_gene_id", "tpm")]
rnaDf <- as.data.frame(pivot_wider(rnaDf, names_from = "versioned_ensembl_gene_id",
                  values_from=c("tpm")))
row.names(rnaDf) <- rnaDf$specimen_id
rnaDf <- t(rnaDf[, names(rnaDf)!="specimen_id"])
# print(dim(rnaDf))

rnaDf <- rnaDf[(rowSums(rnaDf>1)/dim(rnaDf)[2])*100 >70,]
rnaDf <- rnaDf[rowVars(rnaDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(rnaDf)) > 0))
compSamples <- names(which(colSums(is.na(rnaDf)) == 0))

print(paste("RNA seq Incomplete Sample Cases:", length(incompSamples)))
## [1] "RNA seq Incomplete Sample Cases: 0"
print(paste("RNA seq Complete Sample Cases:", length(compSamples)))
## [1] "RNA seq Complete Sample Cases: 180"
print(paste("RNA seq Feature Number:", dim(rnaDf)[1]))
## [1] "RNA seq Feature Number: 14565"
# print(dim(rnaDf))

incompFeature <- names(which(rowSums(is.na(rnaDf)) > 0))
print(paste("RNA seq Incomplete Feature Numbers:", length(incompFeature)))
## [1] "RNA seq Incomplete Feature Numbers: 0"
print(c("RNA seq Incomplete Feature:", incompFeature))
## [1] "RNA seq Incomplete Feature:"
for (f in incompFeature){
  subDf <- data.frame(rnaDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("RNA seq Incomplete Samples:") 
## [1] "RNA seq Incomplete Samples:"
print(incompSamples)
## character(0)
print("RNA seq Incomplete Subjects:")
## [1] "RNA seq Incomplete Subjects:"
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
## character(0)
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
## character(0)
incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("RNA seq", s, "Timepoints of Incomplete Subjects:"))
  # samp <- 
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}

print(paste("RNA seq Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
## [1] "RNA seq Number of Incomplete Subjects: 0"
print(paste("RNA seq Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))
## [1] "RNA seq Number of Complete Subjects: 36"
print(paste("RNA seq Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(rnaDf), "subject_id"])))))
## [1] "RNA seq Number of All Subjects: 36"
olinkDf <- read_tsv(here("./data/2020LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
olinkDf <- olinkDf[olinkDf$specimen_id %in% samples, ]

olinkDf <- olinkDf[olinkDf$protein_id %in% proteins, c("specimen_id", "protein_id", "protein_expression")]

olinkDf <- as.data.frame(pivot_wider(olinkDf, names_from = "protein_id",
                  values_from=c("protein_expression")))
row.names(olinkDf) <- olinkDf$specimen_id
olinkDf <- t(olinkDf[, names(olinkDf)!="specimen_id"])

olinkDf <- olinkDf[rowVars(olinkDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(olinkDf)) > 0))
compSamples <- names(which(colSums(is.na(olinkDf)) == 0))

print(paste("Olink Incomplete Sample Cases:", length(incompSamples)))
## [1] "Olink Incomplete Sample Cases: 0"
print(paste("Olink Complete Sample Cases:", length(compSamples)))
## [1] "Olink Complete Sample Cases: 90"
print(paste("Olink Feature Number:", dim(olinkDf)[1]))
## [1] "Olink Feature Number: 30"
incompFeature <- names(which(rowSums(is.na(olinkDf)) > 0))
print(paste("Olink Incomplete Feature Numbers:", length(incompFeature)))
## [1] "Olink Incomplete Feature Numbers: 0"
print(c("Olink Incomplete Feature:", incompFeature))
## [1] "Olink Incomplete Feature:"
for (f in incompFeature){
  subDf <- data.frame(olinkDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Olink Incomplete Samples:") 
## [1] "Olink Incomplete Samples:"
print(incompSamples)
## character(0)
print("Olink Incomplete Subjects:")
## [1] "Olink Incomplete Subjects:"
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
## character(0)
print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
## character(0)
incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Olink", s, "Timepoints of Incomplete Subjects:"))
  # samp <- 
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}


print(paste("Olink Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
## [1] "Olink Number of Incomplete Subjects: 0"
print(paste("Olink Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))
## [1] "Olink Number of Complete Subjects: 18"
print(paste("Olink Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(olinkDf), "subject_id"])))))
## [1] "Olink Number of All Subjects: 18"
dataList <- list()
dataList[["original"]] <- list("abtiter"= abtiterDf,
                 "cytof"= cellDf, 
                 "olink"= olinkDf,
                 "rnaseq"=rnaDf)
K = 20
# int_cols <- Reduce(intersect, lapply(dataList$original[c("abtiter", "cytof", "olink", "rnaseq")], colnames))
# cols <- unique(c(int_cols, colnames(dataList$original[["rnaseq"]])))

cols <- colnames(dataList$original[["rnaseq"]])

add_cols <- function(df, cols, exp) {
  print(paste("************************", exp, "********************************"))
  df <- df[, colnames(df) %in% cols]
  add <- setdiff(cols, colnames(df))
  
  print(paste(exp, "All the Subjects:"))
  print(levels(factor(metaDf[metaDf$specimen_id %in% colnames(df), "subject_id"])))
  
  print(paste(exp, "Number of All the Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(df), "subject_id"])))))
  # print(df[, !colnames(df) %in% add])
  incompSamples <- setdiff(names(which(colSums(is.na(df)) > 0)), add)
  compSamples <- setdiff(names(which(colSums(is.na(df)) == 0)), add)
  # print(colnames(df))
  # print(add)
  # print(incompSamples)
  
  print(paste(exp, "Number of selected Samples:", dim(df)[2]))
  print(paste(exp, "Number of Features:", dim(df)[1]))
  
  incompFeature <- setdiff(names(which(rowSums(is.na(df)) > 0)), add)
  print(paste(exp, "Number of Incomplete Features:", length(incompFeature)))
  print(paste(exp, "Incomplete Features:", incompFeature))
  
  for (f in incompFeature){
    subDf <- data.frame(df[f,])
    
    incompSamp <- setdiff(names(which(rowSums(is.na(subDf)) > 0)), add)
    incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))
  
    print(paste(f, "Number of Incomplete Samples:"))
    print(length(incompSamp))
    print(paste(f, "Incomplete Samples:"))
    print(incompSamp)
    
    print(paste(f, "Number of Incomplete Subjects:"))
    print(length(incompSubj))
    print(paste(f, "Incomplete Subjects:"))
    print(incompSubj)
  }
  
  print(paste(exp, "Incomplete Samples:") )
  print(incompSamples)
  print(paste(exp, "Incomplete Subjects:"))
  print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
  print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
  
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
  for (s in incompSubj){
    print(paste(exp, s, "Timepoints of Incomplete Subjects:"))
    print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
  }
  subjMissingTimepoints <- levels(factor(metaDf[metaDf$specimen_id %in% add, "subject_id"]))
  print(paste(exp, "Number of Missing Subjects:", 
              length(subjMissingTimepoints)))
  
  print(paste(exp, "Missing Subjects:"))
  print(subjMissingTimepoints)
  
  print(paste(exp, "Number of Incomplete Subjects:", 
              length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
  print(paste(exp, "Number of Complete Subjects:", 
              length(setdiff(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])),
                             subjMissingTimepoints))))
  

  print(paste(exp, "Number of Missing Samples:", length(add)))
  
  if(length(incompFeature)>0){set.seed(1)
    print(paste(exp, "Impute Missing Features for:", incompFeature))
    print("====================================================================")
    df <- t(impute.knn(t(df), k=K)$data)}

  dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
  colnames(dumyDf) <- add
  
  if(length(add) != 0) df <- cbind(df, dumyDf)
  print(paste(exp, "Number of all Samples:", dim(df)[2]))
  
  print("********************************************************************")
  return(as.matrix(df[, sort(cols)]))
}

# dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf[, int_cols], cols, "RNA seq")
dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf, cols, "RNA seq")
## [1] "************************ RNA seq ********************************"
## [1] "RNA seq All the Subjects:"
##  [1] "1"  "3"  "4"  "5"  "6"  "9"  "10" "11" "13" "15" "17" "18" "19" "20" "21"
## [16] "22" "23" "24" "25" "26" "27" "29" "31" "32" "33" "35" "36" "38" "42" "43"
## [31] "44" "47" "48" "50" "52" "53"
## [1] "RNA seq Number of All the Subjects: 36"
## [1] "RNA seq Number of selected Samples: 180"
## [1] "RNA seq Number of Features: 14565"
## [1] "RNA seq Number of Incomplete Features: 0"
## [1] "RNA seq Incomplete Features: "
## [1] "RNA seq Incomplete Samples:"
## character(0)
## [1] "RNA seq Incomplete Subjects:"
## character(0)
## character(0)
## [1] "RNA seq Number of Missing Subjects: 0"
## [1] "RNA seq Missing Subjects:"
## character(0)
## [1] "RNA seq Number of Incomplete Subjects: 0"
## [1] "RNA seq Number of Complete Subjects: 36"
## [1] "RNA seq Number of Missing Samples: 0"
## [1] "RNA seq Number of all Samples: 180"
## [1] "********************************************************************"
dataList$addedMissingVals[["abtiter"]] <- add_cols(abtiterDf, cols, "Ab-titer")
## [1] "************************ Ab-titer ********************************"
## [1] "Ab-titer All the Subjects:"
##  [1] "1"  "3"  "4"  "5"  "6"  "9"  "10" "11" "13" "15" "17" "18" "19" "20" "21"
## [16] "22" "23" "24" "25" "26" "27" "29" "31" "32" "33" "35" "36" "38" "42" "43"
## [31] "44" "47" "48" "50" "52" "53"
## [1] "Ab-titer Number of All the Subjects: 36"
## [1] "Ab-titer Number of selected Samples: 171"
## [1] "Ab-titer Number of Features: 31"
## [1] "Ab-titer Number of Incomplete Features: 0"
## [1] "Ab-titer Incomplete Features: "
## [1] "Ab-titer Incomplete Samples:"
## character(0)
## [1] "Ab-titer Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Ab-titer Number of Missing Subjects: 8"
## [1] "Ab-titer Missing Subjects:"
## [1] "1"  "4"  "6"  "10" "24" "25" "33" "43"
## [1] "Ab-titer Number of Incomplete Subjects: 0"
## [1] "Ab-titer Number of Complete Subjects: 28"
## [1] "Ab-titer Number of Missing Samples: 9"
## [1] "Ab-titer Number of all Samples: 180"
## [1] "********************************************************************"
dataList$addedMissingVals[["cytof"]] <- add_cols(cellDf, cols, "Cell Freq")
## [1] "************************ Cell Freq ********************************"
## [1] "Cell Freq All the Subjects:"
##  [1] "4"  "6"  "11" "15" "17" "20" "21" "26" "29" "31" "33" "36" "44" "47" "48"
## [16] "52"
## [1] "Cell Freq Number of All the Subjects: 16"
## [1] "Cell Freq Number of selected Samples: 80"
## [1] "Cell Freq Number of Features: 22"
## [1] "Cell Freq Number of Incomplete Features: 1"
## [1] "Cell Freq Incomplete Features: ASCs (Plasmablasts)"
## [1] "ASCs (Plasmablasts) Number of Incomplete Samples:"
## [1] 2
## [1] "ASCs (Plasmablasts) Incomplete Samples:"
## [1] "281" "131"
## [1] "ASCs (Plasmablasts) Number of Incomplete Subjects:"
## [1] 2
## [1] "ASCs (Plasmablasts) Incomplete Subjects:"
## [1] "17" "36"
## [1] "Cell Freq Incomplete Samples:"
## [1] "281" "131"
## [1] "Cell Freq Incomplete Subjects:"
## [1] "17" "36"
## [1] "0"
## [1] "Cell Freq 17 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Freq 36 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Freq Number of Missing Subjects: 20"
## [1] "Cell Freq Missing Subjects:"
##  [1] "1"  "3"  "5"  "9"  "10" "13" "18" "19" "22" "23" "24" "25" "27" "32" "35"
## [16] "38" "42" "43" "50" "53"
## [1] "Cell Freq Number of Incomplete Subjects: 2"
## [1] "Cell Freq Number of Complete Subjects: 16"
## [1] "Cell Freq Number of Missing Samples: 100"
## [1] "Cell Freq Impute Missing Features for: ASCs (Plasmablasts)"
## [1] "===================================================================="
## [1] "Cell Freq Number of all Samples: 180"
## [1] "********************************************************************"
dataList$addedMissingVals[["olink"]] <- add_cols(olinkDf, cols, "Olink")
## [1] "************************ Olink ********************************"
## [1] "Olink All the Subjects:"
##  [1] "4"  "6"  "11" "15" "17" "20" "21" "26" "29" "31" "33" "36" "44" "47" "48"
## [16] "52"
## [1] "Olink Number of All the Subjects: 16"
## [1] "Olink Number of selected Samples: 80"
## [1] "Olink Number of Features: 30"
## [1] "Olink Number of Incomplete Features: 0"
## [1] "Olink Incomplete Features: "
## [1] "Olink Incomplete Samples:"
## character(0)
## [1] "Olink Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Olink Number of Missing Subjects: 20"
## [1] "Olink Missing Subjects:"
##  [1] "1"  "3"  "5"  "9"  "10" "13" "18" "19" "22" "23" "24" "25" "27" "32" "35"
## [16] "38" "42" "43" "50" "53"
## [1] "Olink Number of Incomplete Subjects: 0"
## [1] "Olink Number of Complete Subjects: 16"
## [1] "Olink Number of Missing Samples: 100"
## [1] "Olink Number of all Samples: 180"
## [1] "********************************************************************"
dataList$featureImputed[["rnaseq"]] <- data.frame(dataList$addedMissingVals[["rnaseq"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["rnaseq"]]) <- gsub("X", "", colnames(dataList$featureImputed[["rnaseq"]]))

dataList$featureImputed[["abtiter"]] <- data.frame(dataList$addedMissingVals[["abtiter"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["abtiter"]]) <- gsub("X", "", colnames(dataList$featureImputed[["abtiter"]]))

dataList$featureImputed[["cytof"]] <- data.frame(dataList$addedMissingVals[["cytof"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["cytof"]]) <- gsub("X", "", colnames(dataList$featureImputed[["cytof"]]))

dataList$featureImputed[["olink"]] <- data.frame(dataList$addedMissingVals[["olink"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["olink"]]) <- gsub("X", "", colnames(dataList$featureImputed[["olink"]]))

Modify Metadata

metaDf <- data.frame(metaDf[metaDf$specimen_id %in% cols, ])
colnames(metaDf)[colnames(metaDf)=="specimen_id"] <- "sample"
rownames(metaDf) <- metaDf$sample
metaDf$sample <- as.character(metaDf$sample)
metaDf <- metaDf[cols,]
aMeta <- metaDf[, c("sample", "dataset", "timepoint", "infancy_vac", "biological_sex", "age_at_boost")]

rownames(aMeta) <- aMeta$sample

aMeta$infancy_vac <- as.factor(aMeta$infancy_vac)
aMeta$biological_sex <- as.factor(aMeta$biological_sex)
aMeta$dataset <- as.factor(aMeta$dataset)
aMeta$age_at_boost <- as.factor(aMeta$age_at_boost)

aMeta$sample <- paste("0", aMeta$sample, sep="")

PVCA old version

# # aData <- as.matrix(t(dataList$addedMissingVals[["cytof"]]))
# aData <- as.matrix(rnaDf[, colnames(rnaDf) %in%cols])
# pheno<- AnnotatedDataFrame(aMeta[colnames(aData), ])
# # expSet <- ExpressionSet(assayData=aData, phenoData=annotatedDataFrameFrom(as.matrix(t(aMeta)), byrow=FALSE))
# expSet <- ExpressionSet(assayData=aData, phenoData=pheno)
# exprs(expSet)
# 
# pvcaObj <- pvcaBatchAssess(expSet, batch.factors=c("timepoint", "infancy_vac", "biological_sex"),  threshold=0.4)
# par(mar=c(9,4,4,4))
# bp <- barplot(pvcaObj$dat,
#        ylab = "Proportion Variance Explained Actual Data",
#        ylim= c(0,0.5),
#        col = c("blue"), main="PVCA estimation Cell Frequency")
# box(col = "black")
# # grid(nx = NULL, ny = NULL,
# #      lty = 1,      # Grid line type
# #      col = "gray", # Grid line color
# #      lwd = 2)      # Grid line width
# # axis(1, at = bp, labels = pvcaObj$label, srt=45, las=2, cex.axis = 0.7)
# 
# # , las=2, angle=45, cex.axis = 0.9, tck=.01)
#      # font.lab=12, cex.lab=2, cex.names = 1)
# # axis(1, at=bp, labels = FALSE)
# # text(x = bp, labels = pvcaObj$label, srt = 45)
# text(bp, par("usr")[3], labels = pvcaObj$label, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.9)
# axis(2)
# 
# title(xlab = "Effects", line = 8, cex.lab=1.5)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)

PVCA

PVCA <- function(counts, meta, threshold, inter){

  counts.center <- t(apply(counts, 1, scale, center=TRUE, scale=FALSE))
  cor.counts <- cor(counts.center)
  dim(cor.counts)
  eigen.counts <- eigen(cor.counts)
  eigen.mat <- eigen.counts$vectors
  eigen.val <- eigen.counts$values
  n.eigen <- length(eigen.val)
  eigen.val.sum <- sum(eigen.val)
  percents.pcs <- eigen.val/eigen.val.sum
  meta <- as.data.frame(meta)

  all <- 0
  npc.in <- 0
  for(i in 1:n.eigen){
    all <- all + percents.pcs[i]
    npc.in <- npc.in + 1
    if(all > threshold){break}
  }
  if (npc.in < 3) {npc <- 3}

  pred.list <- colnames(meta)
  meta <- droplevels(meta)

  n.preds <- ncol(meta) + 1
  if(inter) {n.preds <- n.preds + choose(ncol(meta),2)}

  ran.pred.list <- c()
  for(i in 1:ncol(meta)){
    ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i],")"))
  }
  ##interactions
  if(inter){
    for(i in 1:(ncol(meta)-1)){
      for(j in (i+1):ncol(meta)){
        ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i], ":", pred.list[j], ")"))
        pred.list <- c(pred.list, paste0(pred.list[i], ":", pred.list[j]))
      }
    }
  }
  formula <- paste(ran.pred.list, collapse = " + ")
  formula <- paste("pc", formula, sep=" ~ ")
  ran.var.mat <- NULL
  for(i in 1:npc.in){
    dat <- cbind(eigen.mat[,i],meta)
    colnames(dat) <- c("pc",colnames(meta))
    Rm1ML <- lme4::lmer(formula, dat, REML = TRUE, verbose = FALSE, na.action = na.omit,
                        # control=lmerControl(check.nobs.vs.nlev = "ignore",
                        #                    check.nobs.vs.rankZ = "ignore",
                        #                    check.nobs.vs.nRE="ignore")
                        )
    var.vec <- unlist(VarCorr(Rm1ML))
    ran.var.mat <- rbind(ran.var.mat, c(var.vec[pred.list], resid = sigma(Rm1ML)^2))
  }
  ran.var.mat.std <- ran.var.mat/rowSums(ran.var.mat)
  wgt.vec <- eigen.val/eigen.val.sum
  prop.var <- colSums(ran.var.mat.std*wgt.vec[1:npc.in])
  std.prop.var <- prop.var/sum(prop.var)
  std.prop.var
}

PlotPVCA <- function(pvca.res, title){
  plot.dat <- data.frame(eff=names(pvca.res), prop=pvca.res)
  p <- ggplot2::ggplot(plot.dat, aes(x=eff, y=prop))
  p <- p + ggplot2::ggtitle(title)
  p <- p + ggplot2::geom_bar(stat="identity", fill="steelblue", colour="steelblue")
  p <- p + ggplot2::geom_text(aes(label=round(prop,3), y=prop+0.04), size=4)
  p <- p + ggplot2::scale_x_discrete(limits=names(pvca.res))
  p <- p + ggplot2::scale_y_continuous(limits = c(0,1))
  p <- p + ggplot2::labs(x= "Effects", y= "Weighted average proportion variance")
  p <- p + ggplot2::theme_bw()
  p <- p + ggplot2::theme(plot.background = element_blank() ,panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank() ,panel.border = element_blank(), panel.background = element_blank())
  p <- p + ggplot2::theme(axis.line = element_line(color = 'black'))
  p <- p + ggplot2::theme(axis.title.x = element_text(size = 15, vjust=-0.5), 
                          axis.text.x = element_text(angle = 45, vjust= 1, hjust=1,  margin=margin(r=0)))
  p <- p + ggplot2::theme(axis.title.y = element_text(size = 15, vjust= 1.0))
  p <- p + ggplot2::theme(axis.text = element_text(size = 12))
  p
}
aData <- as.matrix(dataList$featureImputed[["cytof"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
## boundary (singular) fit: see help('isSingular')
PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

aData <- as.matrix(dataList$featureImputed[["rnaseq"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
PlotPVCA(pvcaObj, "PVCA estimation RNA seq")

aData <- as.matrix(dataList$featureImputed[["abtiter"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
## boundary (singular) fit: see help('isSingular')
PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

aData <- as.matrix(dataList$featureImputed[["olink"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
PlotPVCA(pvcaObj, "PVCA estimation Olink")

# newDf <- do.call("rbind", dataList[["addedMissingVals"]])
# # newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
# colnames(newDf) <- gsub("X", "", colnames(newDf))
# 
# aData <- as.matrix(na.omit(newDf[,colnames(newDf) %in%cols]))
# pheno<- AnnotatedDataFrame(aMeta[colnames(aData), ])
# 
# expSet <- ExpressionSet(assayData=aData, phenoData=pheno)
# # exprs(expSet)
# pvcaObj <- pvcaBatchAssess(expSet, batch.factors=c("timepoint", "infancy_vac", "biological_sex"),  threshold=0.4)
# 
# par(mar=c(9,4,4,4))
# bp <- barplot(pvcaObj$dat, 
#        ylab = "Proportion Variance Explained", 
#        ylim= c(0,0.5), 
#        col = c("blue"), main="PVCA estimation All Experiments Actual Data")
# box(col = "black")
# text(bp, par("usr")[3], labels = pvcaObj$label, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.9)
# axis(2)
# 
# title(xlab = "Effects", line = 8, cex.lab=1.5)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)
newDf <- do.call("rbind", dataList[["addedMissingVals"]])
newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))

aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

Data Analysis before imputation

MOFAobject_missingVals <- create_mofa(dataList[["addedMissingVals"]])
## Creating MOFA object from a list of matrices (features as rows, sample as columns)...
MOFAobject_missingVals 
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
plot_data_overview(MOFAobject_missingVals)

Training Model

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_missingVals) <- metaDf
MOFAobject_missingVals
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
data_opts <- get_default_data_options(MOFAobject_missingVals)
data_opts
## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"
model_opts <- get_default_model_options(MOFAobject_missingVals)
model_opts$num_factors <- 15

model_opts
## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_missingVals)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts
## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE
MOFAobject_missingVals <- prepare_mofa(MOFAobject_missingVals,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
## Warning in prepare_mofa(MOFAobject_missingVals, data_options = data_opts, :
## Some view(s) have a lot of features, it is recommended to perform a more
## stringent feature selection before creating the MOFA object....
## Checking data options...
## Checking training options...
## Checking model options...
MOFAobject_missingVals <- run_mofa(MOFAobject_missingVals, outfile=".../MOFA2_2ndChallenge_2020.hdf5", use_basilisk = TRUE)
## Warning: Output file .../MOFA2_2ndChallenge_2020.hdf5 already exists, it will be replaced
## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.
MOFAobject_missingVals
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
plot_object = MOFAobject_missingVals
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Median Baseline Correction

add_NA_cols <- function(df, cols, exp) {
  df <- df[, colnames(df) %in% cols]
  add <- setdiff(cols, colnames(df))
  
  dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
  colnames(dumyDf) <- add
  
  if(length(add) != 0) df <- cbind(df, dumyDf)

  return(as.matrix(df[, sort(cols)]))
}

dataList$medianNormalized[["rnaseq"]] <- dataList$addedMissingVals[["rnaseq"]] 
dataList$medianNormalized[["abtiter"]] <- dataList$addedMissingVals[["abtiter"]]

baselineSamples <- metaDf[metaDf$timepoint=="0", "sample"]
### Cell Frequency

cellFreq_featureImp <- dataList$featureImputed[["cytof"]]

cellFreqMedian <- data.frame(apply(cellFreq_featureImp[, colnames(cellFreq_featureImp) %in% baselineSamples], 1, median))
colnames(cellFreqMedian) <- "median"

cellFreqMerge <- cbind(cellFreq_featureImp, cellFreqMedian)

cellFreq_medianCorrect <- cellFreqMerge[, colnames(cellFreq_featureImp)]/cellFreqMerge[, "median"]

cellFreqMean <- data.frame(apply(cellFreq_medianCorrect, 1, mean))
colnames(cellFreqMean) <- "mean"
cellFreqMerge <- cbind(cellFreq_medianCorrect, cellFreqMean)
cellFreq_medianCorrect <- log2(cellFreqMerge[, colnames(cellFreq_featureImp)] + (cellFreqMerge[, "mean"]/2))

dataList$medianNormalized[["cytof"]] <- add_NA_cols(cellFreq_medianCorrect, cols, "Cell Freq")

### Olink

olink_featureImp <- dataList$featureImputed[["olink"]]

olinkMedian <- data.frame(apply(olink_featureImp[, colnames(olink_featureImp) %in% baselineSamples], 1, median))
colnames(olinkMedian) <- "median"

olinkMerge <- cbind(olink_featureImp, olinkMedian)
olink_medianCorrect <- olinkMerge[, colnames(olink_featureImp)]/olinkMerge[, "median"]

olinkMean <- data.frame(apply(olink_medianCorrect, 1, mean))
colnames(olinkMean) <- "mean"
olinkMerge <- cbind(olink_medianCorrect, olinkMean)
olink_medianCorrect <- log2(olinkMerge[, colnames(olink_featureImp)] + (olinkMerge[, "mean"]/2))

dataList$medianNormalized[["olink"]] <- add_NA_cols(olink_medianCorrect, cols, "Olink")
newDf <- data.frame(dataList$medianNormalized[["cytof"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

newDf <- data.frame(dataList$medianNormalized[["rnaseq"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation RNA Seq")

newDf <- data.frame(dataList$medianNormalized[["abtiter"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

newDf <- data.frame(dataList$medianNormalized[["olink"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Olink")

newDf <- do.call("rbind", dataList[["medianNormalized"]])
newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

# newDf <- do.call("rbind", dataList[["medianNormalized"]])
# newDf <- data.frame(newDf)%>% select_if(~any(!is.na(.)))
# colnames(newDf) <- gsub("X", "", colnames(newDf))
# 
# aData <- as.matrix(na.omit(newDf[,colnames(newDf) %in%cols]))
# pheno<- AnnotatedDataFrame(aMeta[colnames(aData), ])
# 
# expSet <- ExpressionSet(assayData=aData, phenoData=pheno)
# # exprs(expSet)
# pvcaObj <- pvcaBatchAssess(expSet, batch.factors=c("timepoint", "infancy_vac", "biological_sex"),  threshold=0.4)
# 
# par(mar=c(9,4,4,4))
# bp <- barplot(pvcaObj$dat, 
#        ylab = "Proportion Variance Explained", 
#        ylim= c(0,0.5), 
#        col = c("blue"), main="PVCA estimation All Experiments Normalized by Median Baseline")
# box(col = "black")
# text(bp, par("usr")[3], labels = pvcaObj$label, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.9)
# axis(2)
# 
# title(xlab = "Effects", line = 8, cex.lab=1.5)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)

Data Analysis before imputation

MOFAobject_medianNormalized <- create_mofa(dataList[["medianNormalized"]])
MOFAobject_medianNormalized 
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
plot_data_overview(MOFAobject_medianNormalized)

Training Model

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_medianNormalized) <- metaDf
MOFAobject_medianNormalized
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
data_opts <- get_default_data_options(MOFAobject_medianNormalized)
data_opts
## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"
model_opts <- get_default_model_options(MOFAobject_medianNormalized)
model_opts$num_factors <- 15

model_opts
## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_medianNormalized)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts
## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE
MOFAobject_medianNormalized <- prepare_mofa(MOFAobject_medianNormalized,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_medianNormalized <- run_mofa(MOFAobject_medianNormalized, outfile=".../MOFA2_2ndChallenge_2020.hdf5", use_basilisk = TRUE)

MOFAobject_medianNormalized
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
plot_object = MOFAobject_medianNormalized
plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

MOFAobject_mofaImputed <- impute(MOFAobject_medianNormalized, views = "all")
samples_metadata(MOFAobject_medianNormalized) <- metaDf
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
results <- MOFAobject_mofaImputed@imputed_data
dataList$mofaImputed[["rnaseq"]] <- results$rnaseq$group1
dataList$mofaImputed[["abtiter"]] <- results$abtiter$group1
dataList$mofaImputed[["cytof"]] <- results$cytof$group1
dataList$mofaImputed[["olink"]] <- results$olink$group1
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_mofaImputed <- create_mofa(dataList[["mofaImputed"]])
MOFAobject_mofaImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
plot_data_overview(MOFAobject_mofaImputed)

samples_metadata(MOFAobject_mofaImputed) <- metaDf
MOFAobject_mofaImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
data_opts <- get_default_data_options(MOFAobject_mofaImputed)
MOFAobject_mofaImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
model_opts <- get_default_model_options(MOFAobject_mofaImputed)
model_opts$num_factors <- 15

model_opts
## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_mofaImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts
## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE
MOFAobject_mofaImputed <- prepare_mofa(MOFAobject_mofaImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_mofaImputed <- run_mofa(MOFAobject_mofaImputed, outfile=".../MofaIpmputation_2ndChallenge_2020.hdf5", use_basilisk = TRUE)

MOFAobject_mofaImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
plot_object = MOFAobject_mofaImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Mean imputation

dataList$meanImputed[["abtiter"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["abtiter"]]))))
dataList$meanImputed[["cytof"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["cytof"]]))))
dataList$meanImputed[["rnaseq"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["rnaseq"]]))))
dataList$meanImputed[["olink"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["olink"]]))))
# a <-  lapply(dataList$addedMissingVals[["olink"]][, 1:10], NA2mean)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_meanImputed <- create_mofa(dataList[["meanImputed"]])
MOFAobject_meanImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
plot_data_overview(MOFAobject_meanImputed)

samples_metadata(MOFAobject_meanImputed) <- metaDf
MOFAobject_meanImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
data_opts <- get_default_data_options(MOFAobject_meanImputed)
MOFAobject_meanImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
model_opts <- get_default_model_options(MOFAobject_meanImputed)
model_opts$num_factors <- 15

model_opts
## $likelihoods
##    abtiter      cytof     rnaseq      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_meanImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts
## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE
MOFAobject_meanImputed <- prepare_mofa(MOFAobject_meanImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_meanImputed <- run_mofa(MOFAobject_meanImputed, outfile=".../MeanIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_meanImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
plot_object = MOFAobject_meanImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Median imputation

na2median <- function(df) {
  df <- data.frame(df)
  df <- t(data.frame(t(df)) %>%
                mutate(across(1:dim(df)[1], ~replace_na(., median(., na.rm=TRUE)))))
  colnames(df) <- gsub("X", "", colnames(df))
  return(as.matrix(df))
}

# na_replace from imputeTS can be used to impute median in these values too.

dataList$medianImputed[["abtiter"]] <- na2median(dataList$medianNormalized[["abtiter"]])
dataList$medianImputed[["cytof"]] <- na2median(dataList$medianNormalized[["cytof"]])
dataList$medianImputed[["rnaseq"]] <- na2median(dataList$medianNormalized[["rnaseq"]])
dataList$medianImputed[["olink"]] <- na2median(dataList$medianNormalized[["olink"]])
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_medianImputed <- create_mofa(dataList[["medianImputed"]])
MOFAobject_medianImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
plot_data_overview(MOFAobject_medianImputed)

samples_metadata(MOFAobject_medianImputed) <- metaDf
MOFAobject_medianImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
data_opts <- get_default_data_options(MOFAobject_medianImputed)
MOFAobject_medianImputed
## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
## 
model_opts <- get_default_model_options(MOFAobject_medianImputed)
model_opts$num_factors <- 15

model_opts
## $likelihoods
##    abtiter      cytof     rnaseq      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_medianImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts
## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE
MOFAobject_medianImputed <- prepare_mofa(MOFAobject_medianImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_medianImputed <- run_mofa(MOFAobject_medianImputed, outfile=".../MedianIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_medianImputed
## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15
plot_object = MOFAobject_medianImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

saveRDS(dataList, file = here("./results/allData_2020.RDS"))