Testing Different data Manipulation for dataset 2020

Load libraries

suppressPackageStartupMessages({library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)
library(impute)
library(pvca)
library(Biobase)
library(lme4)})

Find features in other datasets

cells <- read_tsv(here("./data/2021LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)
cells <- levels(factor(cells$cell_type_name)) 

antigens <- read_tsv(here("./data/2021LD_plasma_ab_titer.tsv"), show_col_types = FALSE)
antigens <- levels(factor(paste(antigens$isotype, antigens$antigen, sep = "_")))

genes <- read_tsv(here("./data/2021LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
genes <- levels(factor(gsub("\\..*", "", genes$versioned_ensembl_gene_id)))

proteins <- read_tsv(here("./data/2021LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
proteins <- levels(factor(proteins$protein_id))

Load Meta Data

metaDf <- read_tsv(here("./data/2020LD_subject.tsv"), show_col_types = FALSE)
specimen <- read_tsv(here("./data/2020LD_specimen.tsv"), show_col_types = FALSE)
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units="weeks")/52, 2))
metaDf <- merge(metaDf, specimen, by="subject_id")
metaDf["timepoint"] <- metaDf["planned_day_relative_to_boost"]

metaDf <- metaDf[metaDf$timepoint %in% c(0, 1, 3, 7, 14), ]

samples <- levels(factor(metaDf$specimen_id))
subjects <- levels(factor(metaDf$subject_id))
print(paste("Number of samples:", length(samples)))

## [1] "Number of samples: 296"

print(paste("Number of subjects:", length(subjects)))

## [1] "Number of subjects: 60"

Load Data

cellDf <- read_tsv(here("./data/2020LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)

cellDf <- cellDf[cellDf$specimen_id %in% samples,]

cellDf <- cellDf[cellDf$cell_type_name %in% cells, ]
cellDf <- as.data.frame(pivot_wider(cellDf, names_from = "cell_type_name", 
                  values_from=c("percent_live_cell")))
row.names(cellDf) <- cellDf$specimen_id
subject_num <- length(levels(factor(metaDf[metaDf$specimen_id, "subject_id"])))

cellDf <- t(cellDf[,names(cellDf)!="specimen_id"])
# cellDf <- na.omit(cellDf)
cellDf <- cellDf[rowVars(cellDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(cellDf)) > 0))
compSamples <- names(which(colSums(is.na(cellDf)) == 0))
print(paste("Cell Frequency Incomplete Sample Cases:", length(incompSamples)))

## [1] "Cell Frequency Incomplete Sample Cases: 17"

print(paste("Cell Frequency Complete Sample Cases:", length(compSamples)))

## [1] "Cell Frequency Complete Sample Cases: 83"

print(paste("Cell Frequency Feature Number:", dim(cellDf)[1]))

## [1] "Cell Frequency Feature Number: 22"

incompFeature <- names(which(rowSums(is.na(cellDf)) > 0))
print(paste("Cell Frequency Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Cell Frequency Incomplete Feature Numbers: 1"

print(c("Cell Frequency Incomplete Feature:", incompFeature))

## [1] "Cell Frequency Incomplete Feature:" "ASCs (Plasmablasts)"

for (f in incompFeature){
  subDf <- data.frame(cellDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

## [1] "ASCs (Plasmablasts) Number of Incomplete Samples:"
## [1] 17
## [1] "ASCs (Plasmablasts) Incomplete Samples:"
##  [1] "281" "349" "350" "351" "352" "353" "376" "377" "378" "379" "380" "419"
## [13] "420" "421" "422" "423" "131"
## [1] "ASCs (Plasmablasts) Number of Incomplete Subjects:"
## [1] 5
## [1] "ASCs (Plasmablasts) Incomplete Subjects:"
## [1] "17" "36" "45" "49" "55"

print("Cell Frequency Incomplete Samples:")

## [1] "Cell Frequency Incomplete Samples:"

print(incompSamples)

##  [1] "281" "349" "350" "351" "352" "353" "376" "377" "378" "379" "380" "419"
## [13] "420" "421" "422" "423" "131"

print("Cell Frequency Incomplete Subjects:")

## [1] "Cell Frequency Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## [1] "17" "36" "45" "49" "55"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## [1] "0"  "1"  "3"  "7"  "14"

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Cell Frequency", s, "Timepoints of Incomplete Subjects:"))
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}

## [1] "Cell Frequency 17 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Frequency 36 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Frequency 45 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 49 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 55 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"

print(paste("Cell Frequency Number of Incomplete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "Cell Frequency Number of Incomplete Subjects: 5"

print(paste("Cell Frequency Number of Complete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Cell Frequency Number of Complete Subjects: 17"

print(paste("Cell Frequency Number of All Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(cellDf), "subject_id"])))))

## [1] "Cell Frequency Number of All Subjects: 20"

abtiterDf <- read_tsv(here("./data/2020LD_plasma_ab_titer.tsv"), show_col_types = FALSE)
abtiterDf <- abtiterDf[abtiterDf$specimen_id %in% samples, ]

abtiterDf["antigen"] <- paste(abtiterDf$isotype, abtiterDf$antigen, sep = "_")

abtiterDf <- abtiterDf[abtiterDf$antigen %in% antigens, c("specimen_id", "antigen", "MFI_normalised")]
abtiterDf <- as.data.frame(pivot_wider(abtiterDf, names_from = "antigen",
                  values_from=c("MFI_normalised")))
row.names(abtiterDf) <- abtiterDf$specimen_id
abtiterDf <- t(abtiterDf[, names(abtiterDf)!="specimen_id"])

abtiterDf <- abtiterDf[rowVars(abtiterDf, na.rm = TRUE)>0, ]
incompSamples <- names(which(colSums(is.na(abtiterDf)) > 0))
compSamples <- names(which(colSums(is.na(abtiterDf)) == 0))

print(paste("Ab Titer Incomplete Sample Cases:", length(incompSamples)))

## [1] "Ab Titer Incomplete Sample Cases: 0"

print(paste("Ab Titer Complete Sample Cases:", length(compSamples)))

## [1] "Ab Titer Complete Sample Cases: 274"

print(paste("Ab Titer Feature Number:", dim(abtiterDf)[1]))

## [1] "Ab Titer Feature Number: 31"

incompFeature <- names(which(rowSums(is.na(abtiterDf)) > 0))
print(paste("Ab Titer Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Ab Titer Incomplete Feature Numbers: 0"

print(c("Ab Titer Incomplete Feature:", incompFeature))

## [1] "Ab Titer Incomplete Feature:"

for (f in incompFeature){
  subDf <- data.frame(abtiterDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Ab Titer Incomplete Samples:")

## [1] "Ab Titer Incomplete Samples:"

print(incompSamples)

## character(0)

print("Ab Titer Incomplete Subjects:")

## [1] "Ab Titer Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Ab Titer", s, "Timepoints of Incomplete Subjects:"))
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}


print(paste("Ab Titer Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "Ab Titer Number of Incomplete Subjects: 0"

print(paste("Ab Titer Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Ab Titer Number of Complete Subjects: 58"

print(paste("Ab Titer Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(abtiterDf), "subject_id"])))))

## [1] "Ab Titer Number of All Subjects: 58"

rnaDf <- read_tsv(here("./data/2020LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
rnaDf <- rnaDf[rnaDf$specimen_id %in% samples, ]

rnaDf$versioned_ensembl_gene_id <- gsub("\\..*", "", rnaDf$versioned_ensembl_gene_id)
# print(length(levels(factor(rnaDf$versioned_ensembl_gene_id))))

rnaDf <- rnaDf[rnaDf$versioned_ensembl_gene_id %in% genes, c("specimen_id", "versioned_ensembl_gene_id", "tpm")]
rnaDf <- as.data.frame(pivot_wider(rnaDf, names_from = "versioned_ensembl_gene_id",
                  values_from=c("tpm")))
row.names(rnaDf) <- rnaDf$specimen_id
rnaDf <- t(rnaDf[, names(rnaDf)!="specimen_id"])
# print(dim(rnaDf))

rnaDf <- rnaDf[(rowSums(rnaDf>1)/dim(rnaDf)[2])*100 >70,]
rnaDf <- rnaDf[rowVars(rnaDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(rnaDf)) > 0))
compSamples <- names(which(colSums(is.na(rnaDf)) == 0))

print(paste("RNA seq Incomplete Sample Cases:", length(incompSamples)))

## [1] "RNA seq Incomplete Sample Cases: 0"

print(paste("RNA seq Complete Sample Cases:", length(compSamples)))

## [1] "RNA seq Complete Sample Cases: 180"

print(paste("RNA seq Feature Number:", dim(rnaDf)[1]))

## [1] "RNA seq Feature Number: 14565"

# print(dim(rnaDf))

incompFeature <- names(which(rowSums(is.na(rnaDf)) > 0))
print(paste("RNA seq Incomplete Feature Numbers:", length(incompFeature)))

## [1] "RNA seq Incomplete Feature Numbers: 0"

print(c("RNA seq Incomplete Feature:", incompFeature))

## [1] "RNA seq Incomplete Feature:"

for (f in incompFeature){
  subDf <- data.frame(rnaDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("RNA seq Incomplete Samples:")

## [1] "RNA seq Incomplete Samples:"

print(incompSamples)

## character(0)

print("RNA seq Incomplete Subjects:")

## [1] "RNA seq Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("RNA seq", s, "Timepoints of Incomplete Subjects:"))
  # samp <- 
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}

print(paste("RNA seq Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "RNA seq Number of Incomplete Subjects: 0"

print(paste("RNA seq Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "RNA seq Number of Complete Subjects: 36"

print(paste("RNA seq Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(rnaDf), "subject_id"])))))

## [1] "RNA seq Number of All Subjects: 36"

olinkDf <- read_tsv(here("./data/2020LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
olinkDf <- olinkDf[olinkDf$specimen_id %in% samples, ]

olinkDf <- olinkDf[olinkDf$protein_id %in% proteins, c("specimen_id", "protein_id", "protein_expression")]

olinkDf <- as.data.frame(pivot_wider(olinkDf, names_from = "protein_id",
                  values_from=c("protein_expression")))
row.names(olinkDf) <- olinkDf$specimen_id
olinkDf <- t(olinkDf[, names(olinkDf)!="specimen_id"])

olinkDf <- olinkDf[rowVars(olinkDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(olinkDf)) > 0))
compSamples <- names(which(colSums(is.na(olinkDf)) == 0))

print(paste("Olink Incomplete Sample Cases:", length(incompSamples)))

## [1] "Olink Incomplete Sample Cases: 0"

print(paste("Olink Complete Sample Cases:", length(compSamples)))

## [1] "Olink Complete Sample Cases: 90"

print(paste("Olink Feature Number:", dim(olinkDf)[1]))

## [1] "Olink Feature Number: 30"

incompFeature <- names(which(rowSums(is.na(olinkDf)) > 0))
print(paste("Olink Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Olink Incomplete Feature Numbers: 0"

print(c("Olink Incomplete Feature:", incompFeature))

## [1] "Olink Incomplete Feature:"

for (f in incompFeature){
  subDf <- data.frame(olinkDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Olink Incomplete Samples:")

## [1] "Olink Incomplete Samples:"

print(incompSamples)

## character(0)

print("Olink Incomplete Subjects:")

## [1] "Olink Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Olink", s, "Timepoints of Incomplete Subjects:"))
  # samp <- 
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}


print(paste("Olink Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "Olink Number of Incomplete Subjects: 0"

print(paste("Olink Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Olink Number of Complete Subjects: 18"

print(paste("Olink Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(olinkDf), "subject_id"])))))

## [1] "Olink Number of All Subjects: 18"

dataList <- list()
dataList[["original"]] <- list("abtiter"= abtiterDf,
                 "cytof"= cellDf, 
                 "olink"= olinkDf,
                 "rnaseq"=rnaDf)
K = 20
# int_cols <- Reduce(intersect, lapply(dataList$original[c("abtiter", "cytof", "olink", "rnaseq")], colnames))
# cols <- unique(c(int_cols, colnames(dataList$original[["rnaseq"]])))

cols <- colnames(dataList$original[["rnaseq"]])

add_cols <- function(df, cols, exp) {
  print(paste("************************", exp, "********************************"))
  df <- df[, colnames(df) %in% cols]
  add <- setdiff(cols, colnames(df))
  
  print(paste(exp, "All the Subjects:"))
  print(levels(factor(metaDf[metaDf$specimen_id %in% colnames(df), "subject_id"])))
  
  print(paste(exp, "Number of All the Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(df), "subject_id"])))))
  # print(df[, !colnames(df) %in% add])
  incompSamples <- setdiff(names(which(colSums(is.na(df)) > 0)), add)
  compSamples <- setdiff(names(which(colSums(is.na(df)) == 0)), add)
  # print(colnames(df))
  # print(add)
  # print(incompSamples)
  
  print(paste(exp, "Number of selected Samples:", dim(df)[2]))
  print(paste(exp, "Number of Features:", dim(df)[1]))
  
  incompFeature <- setdiff(names(which(rowSums(is.na(df)) > 0)), add)
  print(paste(exp, "Number of Incomplete Features:", length(incompFeature)))
  print(paste(exp, "Incomplete Features:", incompFeature))
  
  for (f in incompFeature){
    subDf <- data.frame(df[f,])
    
    incompSamp <- setdiff(names(which(rowSums(is.na(subDf)) > 0)), add)
    incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))
  
    print(paste(f, "Number of Incomplete Samples:"))
    print(length(incompSamp))
    print(paste(f, "Incomplete Samples:"))
    print(incompSamp)
    
    print(paste(f, "Number of Incomplete Subjects:"))
    print(length(incompSubj))
    print(paste(f, "Incomplete Subjects:"))
    print(incompSubj)
  }
  
  print(paste(exp, "Incomplete Samples:") )
  print(incompSamples)
  print(paste(exp, "Incomplete Subjects:"))
  print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
  print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
  
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
  for (s in incompSubj){
    print(paste(exp, s, "Timepoints of Incomplete Subjects:"))
    print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
  }
  subjMissingTimepoints <- levels(factor(metaDf[metaDf$specimen_id %in% add, "subject_id"]))
  print(paste(exp, "Number of Missing Subjects:", 
              length(subjMissingTimepoints)))
  
  print(paste(exp, "Missing Subjects:"))
  print(subjMissingTimepoints)
  
  print(paste(exp, "Number of Incomplete Subjects:", 
              length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
  print(paste(exp, "Number of Complete Subjects:", 
              length(setdiff(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])),
                             subjMissingTimepoints))))
  

  print(paste(exp, "Number of Missing Samples:", length(add)))
  
  if(length(incompFeature)>0){set.seed(1)
    print(paste(exp, "Impute Missing Features for:", incompFeature))
    print("====================================================================")
    df <- t(impute.knn(t(df), k=K)$data)}

  dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
  colnames(dumyDf) <- add
  
  if(length(add) != 0) df <- cbind(df, dumyDf)
  print(paste(exp, "Number of all Samples:", dim(df)[2]))
  
  print("********************************************************************")
  return(as.matrix(df[, sort(cols)]))
}

# dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf[, int_cols], cols, "RNA seq")
dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf, cols, "RNA seq")

## [1] "************************ RNA seq ********************************"
## [1] "RNA seq All the Subjects:"
##  [1] "1"  "3"  "4"  "5"  "6"  "9"  "10" "11" "13" "15" "17" "18" "19" "20" "21"
## [16] "22" "23" "24" "25" "26" "27" "29" "31" "32" "33" "35" "36" "38" "42" "43"
## [31] "44" "47" "48" "50" "52" "53"
## [1] "RNA seq Number of All the Subjects: 36"
## [1] "RNA seq Number of selected Samples: 180"
## [1] "RNA seq Number of Features: 14565"
## [1] "RNA seq Number of Incomplete Features: 0"
## [1] "RNA seq Incomplete Features: "
## [1] "RNA seq Incomplete Samples:"
## character(0)
## [1] "RNA seq Incomplete Subjects:"
## character(0)
## character(0)
## [1] "RNA seq Number of Missing Subjects: 0"
## [1] "RNA seq Missing Subjects:"
## character(0)
## [1] "RNA seq Number of Incomplete Subjects: 0"
## [1] "RNA seq Number of Complete Subjects: 36"
## [1] "RNA seq Number of Missing Samples: 0"
## [1] "RNA seq Number of all Samples: 180"
## [1] "********************************************************************"

dataList$addedMissingVals[["abtiter"]] <- add_cols(abtiterDf, cols, "Ab-titer")

## [1] "************************ Ab-titer ********************************"
## [1] "Ab-titer All the Subjects:"
##  [1] "1"  "3"  "4"  "5"  "6"  "9"  "10" "11" "13" "15" "17" "18" "19" "20" "21"
## [16] "22" "23" "24" "25" "26" "27" "29" "31" "32" "33" "35" "36" "38" "42" "43"
## [31] "44" "47" "48" "50" "52" "53"
## [1] "Ab-titer Number of All the Subjects: 36"
## [1] "Ab-titer Number of selected Samples: 171"
## [1] "Ab-titer Number of Features: 31"
## [1] "Ab-titer Number of Incomplete Features: 0"
## [1] "Ab-titer Incomplete Features: "
## [1] "Ab-titer Incomplete Samples:"
## character(0)
## [1] "Ab-titer Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Ab-titer Number of Missing Subjects: 8"
## [1] "Ab-titer Missing Subjects:"
## [1] "1"  "4"  "6"  "10" "24" "25" "33" "43"
## [1] "Ab-titer Number of Incomplete Subjects: 0"
## [1] "Ab-titer Number of Complete Subjects: 28"
## [1] "Ab-titer Number of Missing Samples: 9"
## [1] "Ab-titer Number of all Samples: 180"
## [1] "********************************************************************"

dataList$addedMissingVals[["cytof"]] <- add_cols(cellDf, cols, "Cell Freq")

## [1] "************************ Cell Freq ********************************"
## [1] "Cell Freq All the Subjects:"
##  [1] "4"  "6"  "11" "15" "17" "20" "21" "26" "29" "31" "33" "36" "44" "47" "48"
## [16] "52"
## [1] "Cell Freq Number of All the Subjects: 16"
## [1] "Cell Freq Number of selected Samples: 80"
## [1] "Cell Freq Number of Features: 22"
## [1] "Cell Freq Number of Incomplete Features: 1"
## [1] "Cell Freq Incomplete Features: ASCs (Plasmablasts)"
## [1] "ASCs (Plasmablasts) Number of Incomplete Samples:"
## [1] 2
## [1] "ASCs (Plasmablasts) Incomplete Samples:"
## [1] "281" "131"
## [1] "ASCs (Plasmablasts) Number of Incomplete Subjects:"
## [1] 2
## [1] "ASCs (Plasmablasts) Incomplete Subjects:"
## [1] "17" "36"
## [1] "Cell Freq Incomplete Samples:"
## [1] "281" "131"
## [1] "Cell Freq Incomplete Subjects:"
## [1] "17" "36"
## [1] "0"
## [1] "Cell Freq 17 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Freq 36 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Freq Number of Missing Subjects: 20"
## [1] "Cell Freq Missing Subjects:"
##  [1] "1"  "3"  "5"  "9"  "10" "13" "18" "19" "22" "23" "24" "25" "27" "32" "35"
## [16] "38" "42" "43" "50" "53"
## [1] "Cell Freq Number of Incomplete Subjects: 2"
## [1] "Cell Freq Number of Complete Subjects: 16"
## [1] "Cell Freq Number of Missing Samples: 100"
## [1] "Cell Freq Impute Missing Features for: ASCs (Plasmablasts)"
## [1] "===================================================================="
## [1] "Cell Freq Number of all Samples: 180"
## [1] "********************************************************************"

dataList$addedMissingVals[["olink"]] <- add_cols(olinkDf, cols, "Olink")

## [1] "************************ Olink ********************************"
## [1] "Olink All the Subjects:"
##  [1] "4"  "6"  "11" "15" "17" "20" "21" "26" "29" "31" "33" "36" "44" "47" "48"
## [16] "52"
## [1] "Olink Number of All the Subjects: 16"
## [1] "Olink Number of selected Samples: 80"
## [1] "Olink Number of Features: 30"
## [1] "Olink Number of Incomplete Features: 0"
## [1] "Olink Incomplete Features: "
## [1] "Olink Incomplete Samples:"
## character(0)
## [1] "Olink Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Olink Number of Missing Subjects: 20"
## [1] "Olink Missing Subjects:"
##  [1] "1"  "3"  "5"  "9"  "10" "13" "18" "19" "22" "23" "24" "25" "27" "32" "35"
## [16] "38" "42" "43" "50" "53"
## [1] "Olink Number of Incomplete Subjects: 0"
## [1] "Olink Number of Complete Subjects: 16"
## [1] "Olink Number of Missing Samples: 100"
## [1] "Olink Number of all Samples: 180"
## [1] "********************************************************************"

dataList$featureImputed[["rnaseq"]] <- data.frame(dataList$addedMissingVals[["rnaseq"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["rnaseq"]]) <- gsub("X", "", colnames(dataList$featureImputed[["rnaseq"]]))

dataList$featureImputed[["abtiter"]] <- data.frame(dataList$addedMissingVals[["abtiter"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["abtiter"]]) <- gsub("X", "", colnames(dataList$featureImputed[["abtiter"]]))

dataList$featureImputed[["cytof"]] <- data.frame(dataList$addedMissingVals[["cytof"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["cytof"]]) <- gsub("X", "", colnames(dataList$featureImputed[["cytof"]]))

dataList$featureImputed[["olink"]] <- data.frame(dataList$addedMissingVals[["olink"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["olink"]]) <- gsub("X", "", colnames(dataList$featureImputed[["olink"]]))

Modify Metadata

metaDf <- data.frame(metaDf[metaDf$specimen_id %in% cols, ])
colnames(metaDf)[colnames(metaDf)=="specimen_id"] <- "sample"
rownames(metaDf) <- metaDf$sample
metaDf$sample <- as.character(metaDf$sample)
metaDf <- metaDf[cols,]

aMeta <- metaDf[, c("sample", "dataset", "timepoint", "infancy_vac", "biological_sex", "age_at_boost")]

rownames(aMeta) <- aMeta$sample

aMeta$infancy_vac <- as.factor(aMeta$infancy_vac)
aMeta$biological_sex <- as.factor(aMeta$biological_sex)
aMeta$dataset <- as.factor(aMeta$dataset)
aMeta$age_at_boost <- as.factor(aMeta$age_at_boost)

aMeta$sample <- paste("0", aMeta$sample, sep="")

PVCA old version

# # aData <- as.matrix(t(dataList$addedMissingVals[["cytof"]]))
# aData <- as.matrix(rnaDf[, colnames(rnaDf) %in%cols])
# pheno<- AnnotatedDataFrame(aMeta[colnames(aData), ])
# # expSet <- ExpressionSet(assayData=aData, phenoData=annotatedDataFrameFrom(as.matrix(t(aMeta)), byrow=FALSE))
# expSet <- ExpressionSet(assayData=aData, phenoData=pheno)
# exprs(expSet)
# 
# pvcaObj <- pvcaBatchAssess(expSet, batch.factors=c("timepoint", "infancy_vac", "biological_sex"),  threshold=0.4)
# par(mar=c(9,4,4,4))
# bp <- barplot(pvcaObj$dat,
#        ylab = "Proportion Variance Explained Actual Data",
#        ylim= c(0,0.5),
#        col = c("blue"), main="PVCA estimation Cell Frequency")
# box(col = "black")
# # grid(nx = NULL, ny = NULL,
# #      lty = 1,      # Grid line type
# #      col = "gray", # Grid line color
# #      lwd = 2)      # Grid line width
# # axis(1, at = bp, labels = pvcaObj$label, srt=45, las=2, cex.axis = 0.7)
# 
# # , las=2, angle=45, cex.axis = 0.9, tck=.01)
#      # font.lab=12, cex.lab=2, cex.names = 1)
# # axis(1, at=bp, labels = FALSE)
# # text(x = bp, labels = pvcaObj$label, srt = 45)
# text(bp, par("usr")[3], labels = pvcaObj$label, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.9)
# axis(2)
# 
# title(xlab = "Effects", line = 8, cex.lab=1.5)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)

PVCA

PVCA <- function(counts, meta, threshold, inter){

  counts.center <- t(apply(counts, 1, scale, center=TRUE, scale=FALSE))
  cor.counts <- cor(counts.center)
  dim(cor.counts)
  eigen.counts <- eigen(cor.counts)
  eigen.mat <- eigen.counts$vectors
  eigen.val <- eigen.counts$values
  n.eigen <- length(eigen.val)
  eigen.val.sum <- sum(eigen.val)
  percents.pcs <- eigen.val/eigen.val.sum
  meta <- as.data.frame(meta)

  all <- 0
  npc.in <- 0
  for(i in 1:n.eigen){
    all <- all + percents.pcs[i]
    npc.in <- npc.in + 1
    if(all > threshold){break}
  }
  if (npc.in < 3) {npc <- 3}

  pred.list <- colnames(meta)
  meta <- droplevels(meta)

  n.preds <- ncol(meta) + 1
  if(inter) {n.preds <- n.preds + choose(ncol(meta),2)}

  ran.pred.list <- c()
  for(i in 1:ncol(meta)){
    ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i],")"))
  }
  ##interactions
  if(inter){
    for(i in 1:(ncol(meta)-1)){
      for(j in (i+1):ncol(meta)){
        ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i], ":", pred.list[j], ")"))
        pred.list <- c(pred.list, paste0(pred.list[i], ":", pred.list[j]))
      }
    }
  }
  formula <- paste(ran.pred.list, collapse = " + ")
  formula <- paste("pc", formula, sep=" ~ ")
  ran.var.mat <- NULL
  for(i in 1:npc.in){
    dat <- cbind(eigen.mat[,i],meta)
    colnames(dat) <- c("pc",colnames(meta))
    Rm1ML <- lme4::lmer(formula, dat, REML = TRUE, verbose = FALSE, na.action = na.omit,
                        # control=lmerControl(check.nobs.vs.nlev = "ignore",
                        #                    check.nobs.vs.rankZ = "ignore",
                        #                    check.nobs.vs.nRE="ignore")
                        )
    var.vec <- unlist(VarCorr(Rm1ML))
    ran.var.mat <- rbind(ran.var.mat, c(var.vec[pred.list], resid = sigma(Rm1ML)^2))
  }
  ran.var.mat.std <- ran.var.mat/rowSums(ran.var.mat)
  wgt.vec <- eigen.val/eigen.val.sum
  prop.var <- colSums(ran.var.mat.std*wgt.vec[1:npc.in])
  std.prop.var <- prop.var/sum(prop.var)
  std.prop.var
}

PlotPVCA <- function(pvca.res, title){
  plot.dat <- data.frame(eff=names(pvca.res), prop=pvca.res)
  p <- ggplot2::ggplot(plot.dat, aes(x=eff, y=prop))
  p <- p + ggplot2::ggtitle(title)
  p <- p + ggplot2::geom_bar(stat="identity", fill="steelblue", colour="steelblue")
  p <- p + ggplot2::geom_text(aes(label=round(prop,3), y=prop+0.04), size=4)
  p <- p + ggplot2::scale_x_discrete(limits=names(pvca.res))
  p <- p + ggplot2::scale_y_continuous(limits = c(0,1))
  p <- p + ggplot2::labs(x= "Effects", y= "Weighted average proportion variance")
  p <- p + ggplot2::theme_bw()
  p <- p + ggplot2::theme(plot.background = element_blank() ,panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank() ,panel.border = element_blank(), panel.background = element_blank())
  p <- p + ggplot2::theme(axis.line = element_line(color = 'black'))
  p <- p + ggplot2::theme(axis.title.x = element_text(size = 15, vjust=-0.5), 
                          axis.text.x = element_text(angle = 45, vjust= 1, hjust=1,  margin=margin(r=0)))
  p <- p + ggplot2::theme(axis.title.y = element_text(size = 15, vjust= 1.0))
  p <- p + ggplot2::theme(axis.text = element_text(size = 12))
  p
}

aData <- as.matrix(dataList$featureImputed[["cytof"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

aData <- as.matrix(dataList$featureImputed[["rnaseq"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation RNA seq")

aData <- as.matrix(dataList$featureImputed[["abtiter"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

aData <- as.matrix(dataList$featureImputed[["olink"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Olink")

# newDf <- do.call("rbind", dataList[["addedMissingVals"]])
# # newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
# colnames(newDf) <- gsub("X", "", colnames(newDf))
# 
# aData <- as.matrix(na.omit(newDf[,colnames(newDf) %in%cols]))
# pheno<- AnnotatedDataFrame(aMeta[colnames(aData), ])
# 
# expSet <- ExpressionSet(assayData=aData, phenoData=pheno)
# # exprs(expSet)
# pvcaObj <- pvcaBatchAssess(expSet, batch.factors=c("timepoint", "infancy_vac", "biological_sex"),  threshold=0.4)
# 
# par(mar=c(9,4,4,4))
# bp <- barplot(pvcaObj$dat, 
#        ylab = "Proportion Variance Explained", 
#        ylim= c(0,0.5), 
#        col = c("blue"), main="PVCA estimation All Experiments Actual Data")
# box(col = "black")
# text(bp, par("usr")[3], labels = pvcaObj$label, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.9)
# axis(2)
# 
# title(xlab = "Effects", line = 8, cex.lab=1.5)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)

newDf <- do.call("rbind", dataList[["addedMissingVals"]])
newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))

aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

Data Analysis before imputation

MOFAobject_missingVals <- create_mofa(dataList[["addedMissingVals"]])

## Creating MOFA object from a list of matrices (features as rows, sample as columns)...

MOFAobject_missingVals

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

plot_data_overview(MOFAobject_missingVals)

Training Model

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_missingVals) <- metaDf
MOFAobject_missingVals

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

data_opts <- get_default_data_options(MOFAobject_missingVals)
data_opts

## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"

model_opts <- get_default_model_options(MOFAobject_missingVals)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_missingVals)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_missingVals <- prepare_mofa(MOFAobject_missingVals,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

## Warning in prepare_mofa(MOFAobject_missingVals, data_options = data_opts, :
## Some view(s) have a lot of features, it is recommended to perform a more
## stringent feature selection before creating the MOFA object....

## Checking data options...

## Checking training options...

## Checking model options...

MOFAobject_missingVals <- run_mofa(MOFAobject_missingVals, outfile=".../MOFA2_2ndChallenge_2020.hdf5", use_basilisk = TRUE)

## Warning: Output file .../MOFA2_2ndChallenge_2020.hdf5 already exists, it will be replaced

## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

MOFAobject_missingVals

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

plot_object = MOFAobject_missingVals
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Median Baseline Correction

add_NA_cols <- function(df, cols, exp) {
  df <- df[, colnames(df) %in% cols]
  add <- setdiff(cols, colnames(df))
  
  dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
  colnames(dumyDf) <- add
  
  if(length(add) != 0) df <- cbind(df, dumyDf)

  return(as.matrix(df[, sort(cols)]))
}

dataList$medianNormalized[["rnaseq"]] <- dataList$addedMissingVals[["rnaseq"]] 
dataList$medianNormalized[["abtiter"]] <- dataList$addedMissingVals[["abtiter"]]

baselineSamples <- metaDf[metaDf$timepoint=="0", "sample"]
### Cell Frequency

cellFreq_featureImp <- dataList$featureImputed[["cytof"]]

cellFreqMedian <- data.frame(apply(cellFreq_featureImp[, colnames(cellFreq_featureImp) %in% baselineSamples], 1, median))
colnames(cellFreqMedian) <- "median"

cellFreqMerge <- cbind(cellFreq_featureImp, cellFreqMedian)

cellFreq_medianCorrect <- cellFreqMerge[, colnames(cellFreq_featureImp)]/cellFreqMerge[, "median"]

cellFreqMean <- data.frame(apply(cellFreq_medianCorrect, 1, mean))
colnames(cellFreqMean) <- "mean"
cellFreqMerge <- cbind(cellFreq_medianCorrect, cellFreqMean)
cellFreq_medianCorrect <- log2(cellFreqMerge[, colnames(cellFreq_featureImp)] + (cellFreqMerge[, "mean"]/2))

dataList$medianNormalized[["cytof"]] <- add_NA_cols(cellFreq_medianCorrect, cols, "Cell Freq")

### Olink

olink_featureImp <- dataList$featureImputed[["olink"]]

olinkMedian <- data.frame(apply(olink_featureImp[, colnames(olink_featureImp) %in% baselineSamples], 1, median))
colnames(olinkMedian) <- "median"

olinkMerge <- cbind(olink_featureImp, olinkMedian)
olink_medianCorrect <- olinkMerge[, colnames(olink_featureImp)]/olinkMerge[, "median"]

olinkMean <- data.frame(apply(olink_medianCorrect, 1, mean))
colnames(olinkMean) <- "mean"
olinkMerge <- cbind(olink_medianCorrect, olinkMean)
olink_medianCorrect <- log2(olinkMerge[, colnames(olink_featureImp)] + (olinkMerge[, "mean"]/2))

dataList$medianNormalized[["olink"]] <- add_NA_cols(olink_medianCorrect, cols, "Olink")

newDf <- data.frame(dataList$medianNormalized[["cytof"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

newDf <- data.frame(dataList$medianNormalized[["rnaseq"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation RNA Seq")

newDf <- data.frame(dataList$medianNormalized[["abtiter"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

newDf <- data.frame(dataList$medianNormalized[["olink"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Olink")

newDf <- do.call("rbind", dataList[["medianNormalized"]])
newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

# newDf <- do.call("rbind", dataList[["medianNormalized"]])
# newDf <- data.frame(newDf)%>% select_if(~any(!is.na(.)))
# colnames(newDf) <- gsub("X", "", colnames(newDf))
# 
# aData <- as.matrix(na.omit(newDf[,colnames(newDf) %in%cols]))
# pheno<- AnnotatedDataFrame(aMeta[colnames(aData), ])
# 
# expSet <- ExpressionSet(assayData=aData, phenoData=pheno)
# # exprs(expSet)
# pvcaObj <- pvcaBatchAssess(expSet, batch.factors=c("timepoint", "infancy_vac", "biological_sex"),  threshold=0.4)
# 
# par(mar=c(9,4,4,4))
# bp <- barplot(pvcaObj$dat, 
#        ylab = "Proportion Variance Explained", 
#        ylim= c(0,0.5), 
#        col = c("blue"), main="PVCA estimation All Experiments Normalized by Median Baseline")
# box(col = "black")
# text(bp, par("usr")[3], labels = pvcaObj$label, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.9)
# axis(2)
# 
# title(xlab = "Effects", line = 8, cex.lab=1.5)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)

Data Analysis before imputation

MOFAobject_medianNormalized <- create_mofa(dataList[["medianNormalized"]])
MOFAobject_medianNormalized

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

plot_data_overview(MOFAobject_medianNormalized)

Training Model

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_medianNormalized) <- metaDf
MOFAobject_medianNormalized

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

data_opts <- get_default_data_options(MOFAobject_medianNormalized)
data_opts

## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"

model_opts <- get_default_model_options(MOFAobject_medianNormalized)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_medianNormalized)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_medianNormalized <- prepare_mofa(MOFAobject_medianNormalized,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_medianNormalized <- run_mofa(MOFAobject_medianNormalized, outfile=".../MOFA2_2ndChallenge_2020.hdf5", use_basilisk = TRUE)

MOFAobject_medianNormalized

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

plot_object = MOFAobject_medianNormalized
plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

MOFAobject_mofaImputed <- impute(MOFAobject_medianNormalized, views = "all")
samples_metadata(MOFAobject_medianNormalized) <- metaDf
MOFAobject_mofaImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

results <- MOFAobject_mofaImputed@imputed_data
dataList$mofaImputed[["rnaseq"]] <- results$rnaseq$group1
dataList$mofaImputed[["abtiter"]] <- results$abtiter$group1
dataList$mofaImputed[["cytof"]] <- results$cytof$group1
dataList$mofaImputed[["olink"]] <- results$olink$group1

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_mofaImputed <- create_mofa(dataList[["mofaImputed"]])
MOFAobject_mofaImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

plot_data_overview(MOFAobject_mofaImputed)

samples_metadata(MOFAobject_mofaImputed) <- metaDf
MOFAobject_mofaImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

data_opts <- get_default_data_options(MOFAobject_mofaImputed)
MOFAobject_mofaImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

model_opts <- get_default_model_options(MOFAobject_mofaImputed)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_mofaImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_mofaImputed <- prepare_mofa(MOFAobject_mofaImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_mofaImputed <- run_mofa(MOFAobject_mofaImputed, outfile=".../MofaIpmputation_2ndChallenge_2020.hdf5", use_basilisk = TRUE)

MOFAobject_mofaImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 14565 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

plot_object = MOFAobject_mofaImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Mean imputation

dataList$meanImputed[["abtiter"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["abtiter"]]))))
dataList$meanImputed[["cytof"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["cytof"]]))))
dataList$meanImputed[["rnaseq"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["rnaseq"]]))))
dataList$meanImputed[["olink"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["olink"]]))))
# a <-  lapply(dataList$addedMissingVals[["olink"]][, 1:10], NA2mean)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_meanImputed <- create_mofa(dataList[["meanImputed"]])
MOFAobject_meanImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

plot_data_overview(MOFAobject_meanImputed)

samples_metadata(MOFAobject_meanImputed) <- metaDf
MOFAobject_meanImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

data_opts <- get_default_data_options(MOFAobject_meanImputed)
MOFAobject_meanImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

model_opts <- get_default_model_options(MOFAobject_meanImputed)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##    abtiter      cytof     rnaseq      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_meanImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_meanImputed <- prepare_mofa(MOFAobject_meanImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_meanImputed <- run_mofa(MOFAobject_meanImputed, outfile=".../MeanIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_meanImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

plot_object = MOFAobject_meanImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Median imputation

na2median <- function(df) {
  df <- data.frame(df)
  df <- t(data.frame(t(df)) %>%
                mutate(across(1:dim(df)[1], ~replace_na(., median(., na.rm=TRUE)))))
  colnames(df) <- gsub("X", "", colnames(df))
  return(as.matrix(df))
}

# na_replace from imputeTS can be used to impute median in these values too.

dataList$medianImputed[["abtiter"]] <- na2median(dataList$medianNormalized[["abtiter"]])
dataList$medianImputed[["cytof"]] <- na2median(dataList$medianNormalized[["cytof"]])
dataList$medianImputed[["rnaseq"]] <- na2median(dataList$medianNormalized[["rnaseq"]])
dataList$medianImputed[["olink"]] <- na2median(dataList$medianNormalized[["olink"]])

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_medianImputed <- create_mofa(dataList[["medianImputed"]])
MOFAobject_medianImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

plot_data_overview(MOFAobject_medianImputed)

samples_metadata(MOFAobject_medianImputed) <- metaDf
MOFAobject_medianImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

data_opts <- get_default_data_options(MOFAobject_medianImputed)
MOFAobject_medianImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

model_opts <- get_default_model_options(MOFAobject_medianImputed)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##    abtiter      cytof     rnaseq      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_medianImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_medianImputed <- prepare_mofa(MOFAobject_medianImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_medianImputed <- run_mofa(MOFAobject_medianImputed, outfile=".../MedianIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_medianImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 14565 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

plot_object = MOFAobject_medianImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

saveRDS(dataList, file = here("./results/allData_2020.RDS"))

Testing Different data Manipulation for dataset 2020

2023-10-03

Load libraries

Find features in other datasets

Load Meta Data

Load Data

Modify Metadata

PVCA old version

PVCA

Data Analysis before imputation

Training Model

Median Baseline Correction

Data Analysis before imputation

Training Model

Mean imputation

Median imputation