Testing Different data Manipulation for dataset 2020 & 2021

Load libraries

suppressPackageStartupMessages({
library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)
library(impute)
library(pvca)
library(Biobase)
library(lme4)})

Load Meta Data

metaDf_2020 <- read_tsv(here("./data/2020LD_subject.tsv"), show_col_types = FALSE)
specimen_2020 <- read_tsv(here("./data/2020LD_specimen.tsv"), show_col_types = FALSE)

metaDf_2021 <- read_tsv(here("./data/2021LD_subject.tsv"), show_col_types = FALSE)
specimen_2021 <- read_tsv(here("./data/2021LD_specimen.tsv"), show_col_types = FALSE)

metaDf <- rbind(metaDf_2020 , metaDf_2021)
specimen <- rbind(specimen_2020, specimen_2021)

metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units="weeks")/52, 2))
metaDf <- merge(metaDf, specimen, by="subject_id")
metaDf["timepoint"] <- metaDf["planned_day_relative_to_boost"]

metaDf <- metaDf[metaDf$timepoint %in% c(0, 1, 3, 7, 14), ]

samples <- levels(factor(metaDf$specimen_id))
subjects <- levels(factor(metaDf$subject_id))
print(paste("Number of samples:", length(samples)))

## [1] "Number of samples: 476"

print(paste("Number of subjects:", length(subjects)))

## [1] "Number of subjects: 96"

Load Data

cellDf_2020 <- read_tsv(here("./data/2020LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)
cellDf_2021 <- read_tsv(here("./data/2021LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)

cells <- intersect(levels(factor(cellDf_2020$cell_type_name)), levels(factor(cellDf_2021$cell_type_name)))

cellDf <- rbind(cellDf_2020[cellDf_2020$cell_type_name %in% cells,], cellDf_2021[cellDf_2021$cell_type_name %in% cells,])

cellDf <- cellDf[cellDf$specimen_id %in% samples,]

cellDf <- as.data.frame(pivot_wider(cellDf, names_from = "cell_type_name", 
                  values_from=c("percent_live_cell")))
row.names(cellDf) <- cellDf$specimen_id
subject_num <- length(levels(factor(metaDf[metaDf$specimen_id, "subject_id"])))

cellDf <- t(cellDf[,names(cellDf)!="specimen_id"])
# cellDf <- na.omit(cellDf)
cellDf <- cellDf[rowVars(cellDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(cellDf)) > 0))
compSamples <- names(which(colSums(is.na(cellDf)) == 0))
print(paste("Cell Frequency Incomplete Sample Cases:", length(incompSamples)))

## [1] "Cell Frequency Incomplete Sample Cases: 22"

print(paste("Cell Frequency Complete Sample Cases:", length(compSamples)))

## [1] "Cell Frequency Complete Sample Cases: 243"

print(paste("Cell Frequency Feature Number:", dim(cellDf)[1]))

## [1] "Cell Frequency Feature Number: 22"

incompFeature <- names(which(rowSums(is.na(cellDf)) > 0))
print(paste("Cell Frequency Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Cell Frequency Incomplete Feature Numbers: 9"

print(c("Cell Frequency Incomplete Feature:", incompFeature))

##  [1] "Cell Frequency Incomplete Feature:" "TemraCD4"                          
##  [3] "NaiveCD4"                           "TemCD4"                            
##  [5] "TcmCD4"                             "TemraCD8"                          
##  [7] "NaiveCD8"                           "TemCD8"                            
##  [9] "TcmCD8"                             "ASCs (Plasmablasts)"

for (f in incompFeature){
  subDf <- data.frame(cellDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

## [1] "TemraCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "TemraCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemraCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemraCD4 Incomplete Subjects:"
## [1] "70"
## [1] "NaiveCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "NaiveCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "NaiveCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "NaiveCD4 Incomplete Subjects:"
## [1] "70"
## [1] "TemCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "TemCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemCD4 Incomplete Subjects:"
## [1] "70"
## [1] "TcmCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "TcmCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TcmCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "TcmCD4 Incomplete Subjects:"
## [1] "70"
## [1] "TemraCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "TemraCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemraCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemraCD8 Incomplete Subjects:"
## [1] "70"
## [1] "NaiveCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "NaiveCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "NaiveCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "NaiveCD8 Incomplete Subjects:"
## [1] "70"
## [1] "TemCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "TemCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemCD8 Incomplete Subjects:"
## [1] "70"
## [1] "TcmCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "TcmCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TcmCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "TcmCD8 Incomplete Subjects:"
## [1] "70"
## [1] "ASCs (Plasmablasts) Number of Incomplete Samples:"
## [1] 17
## [1] "ASCs (Plasmablasts) Incomplete Samples:"
##  [1] "281" "349" "350" "351" "352" "353" "376" "377" "378" "379" "380" "419"
## [13] "420" "421" "422" "423" "131"
## [1] "ASCs (Plasmablasts) Number of Incomplete Subjects:"
## [1] 5
## [1] "ASCs (Plasmablasts) Incomplete Subjects:"
## [1] "17" "36" "45" "49" "55"

print("Cell Frequency Incomplete Samples:")

## [1] "Cell Frequency Incomplete Samples:"

print(incompSamples)

##  [1] "281" "349" "350" "351" "352" "353" "376" "377" "378" "379" "380" "419"
## [13] "420" "421" "422" "423" "131" "537" "538" "539" "540" "541"

print("Cell Frequency Incomplete Subjects:")

## [1] "Cell Frequency Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## [1] "17" "36" "45" "49" "55" "70"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## [1] "0"  "1"  "3"  "7"  "14"

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Cell Frequency", s, "Timepoints of Incomplete Subjects:"))
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}

## [1] "Cell Frequency 17 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Frequency 36 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Frequency 45 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 49 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 55 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Frequency 70 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"

print(paste("Cell Frequency Number of Incomplete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "Cell Frequency Number of Incomplete Subjects: 6"

print(paste("Cell Frequency Number of Complete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Cell Frequency Number of Complete Subjects: 49"

print(paste("Cell Frequency Number of All Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(cellDf), "subject_id"])))))

## [1] "Cell Frequency Number of All Subjects: 53"

abtiterDf_2020 <- data.frame(read_tsv(here("./data/2020LD_plasma_ab_titer.tsv"), show_col_types = FALSE))
abtiterDf_2021 <- data.frame(read_tsv(here("./data/2021LD_plasma_ab_titer.tsv"), show_col_types = FALSE))
abtiterDf_2020["antigen"] <- paste(abtiterDf_2020$isotype, abtiterDf_2020$antigen, sep = "_")
abtiterDf_2021["antigen"] <- paste(abtiterDf_2021$isotype, abtiterDf_2021$antigen, sep = "_")

antigens <- intersect(levels(factor(abtiterDf_2020$antigen)),
                      levels(factor(abtiterDf_2021$antigen)))

abtiterDf <- rbind(abtiterDf_2020, 
                   abtiterDf_2021)

abtiterDf <- abtiterDf[abtiterDf$antigen %in% antigens, c("specimen_id", "antigen", "MFI_normalised")]
abtiterDf <- as.data.frame(pivot_wider(abtiterDf, names_from = "antigen",
                  values_from=c("MFI_normalised")))
row.names(abtiterDf) <- abtiterDf$specimen_id
abtiterDf <- t(abtiterDf[, names(abtiterDf)!="specimen_id"])

abtiterDf <- abtiterDf[rowVars(abtiterDf, na.rm = TRUE)>0, ]
incompSamples <- names(which(colSums(is.na(abtiterDf)) > 0))
compSamples <- names(which(colSums(is.na(abtiterDf)) == 0))

print(paste("Ab Titer Incomplete Sample Cases:", length(incompSamples)))

## [1] "Ab Titer Incomplete Sample Cases: 0"

print(paste("Ab Titer Complete Sample Cases:", length(compSamples)))

## [1] "Ab Titer Complete Sample Cases: 625"

print(paste("Ab Titer Feature Number:", dim(abtiterDf)[1]))

## [1] "Ab Titer Feature Number: 31"

incompFeature <- names(which(rowSums(is.na(abtiterDf)) > 0))
print(paste("Ab Titer Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Ab Titer Incomplete Feature Numbers: 0"

print(c("Ab Titer Incomplete Feature:", incompFeature))

## [1] "Ab Titer Incomplete Feature:"

for (f in incompFeature){
  subDf <- data.frame(abtiterDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Ab Titer Incomplete Samples:")

## [1] "Ab Titer Incomplete Samples:"

print(incompSamples)

## character(0)

print("Ab Titer Incomplete Subjects:")

## [1] "Ab Titer Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Ab Titer", s, "Timepoints of Incomplete Subjects:"))
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}


print(paste("Ab Titer Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "Ab Titer Number of Incomplete Subjects: 0"

print(paste("Ab Titer Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Ab Titer Number of Complete Subjects: 91"

print(paste("Ab Titer Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(abtiterDf), "subject_id"])))))

## [1] "Ab Titer Number of All Subjects: 91"

# abtiterDf <- abtiterDf[abtiterDf$is_antigen_specific==TRUE,]
# abtiterDf <- abtiterDf[abtiterDf$specimen_id %in% samples, ]
# abtiterDf <- abtiterDf[abtiterDf$antigen %in% antigens, ]
# 
# abtiterDf <- abtiterDf[, c("specimen_id", "antigen", "MFI_normalised")]
# abtiterDf <- as.data.frame(pivot_wider(abtiterDf, names_from = "antigen",
#                   values_from=c("MFI_normalised")))
# row.names(abtiterDf) <- abtiterDf$specimen_id
# abtiterDf <- t(abtiterDf[, names(abtiterDf)!="specimen_id"])
# 
# abtiterDf <- abtiterDf[rowVars(abtiterDf, na.rm = TRUE)>0, ]
# incompSamples <- names(which(colSums(is.na(abtiterDf)) > 0))
# compSamples <- names(which(colSums(is.na(abtiterDf)) == 0))
# 
# print(paste("Ab Titer Incomplete Sample Cases:", length(incompSamples)))
# print(paste("Ab Titer Complete Sample Cases:", length(compSamples)))
# print(paste("Ab Titer Feature Number:", dim(abtiterDf)[1]))
# 
# incompFeature <- names(which(rowSums(is.na(abtiterDf)) > 0))
# print(paste("Ab Titer Incomplete Feature Numbers:", length(incompFeature)))
# print(c("Ab Titer Incomplete Feature:", incompFeature))
# 
# for (f in incompFeature){
#   subDf <- data.frame(abtiterDf[f,])
#   incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
#   incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))
# 
#   print(paste(f, "Number of Incomplete Samples:"))
#   print(length(incompSamp))
#   print(paste(f, "Incomplete Samples:"))
#   print(incompSamp)
#   
#   print(paste(f, "Number of Incomplete Subjects:"))
#   print(length(incompSubj))
#   print(paste(f, "Incomplete Subjects:"))
#   print(incompSubj)
# }
# 
# print("Ab Titer Incomplete Samples:") 
# print(incompSamples)
# print("Ab Titer Incomplete Subjects:")
# print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
# print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
# 
# incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
# for (s in incompSubj){
#   print(paste("Ab Titer", s, "Timepoints of Incomplete Subjects:"))
#   print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
# }
# 
# 
# print(paste("Ab Titer Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
# print(paste("Ab Titer Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))
# print(paste("Ab Titer Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(abtiterDf), "subject_id"])))))

rnaDf_2020 <- read_tsv(here("./data/2020LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
rnaDf_2021 <- read_tsv(here("./data/2021LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
rnaDf_2020$versioned_ensembl_gene_id <- gsub("\\..*", "", rnaDf_2020$versioned_ensembl_gene_id)
rnaDf_2021$versioned_ensembl_gene_id <- gsub("\\..*", "", rnaDf_2021$versioned_ensembl_gene_id)

genes <- intersect(levels(factor(rnaDf_2020$versioned_ensembl_gene_id)), 
                   levels(factor(rnaDf_2021$versioned_ensembl_gene_id)))

rnaDf <- rbind(rnaDf_2020[rnaDf_2020$versioned_ensembl_gene_id %in% genes, ],
               rnaDf_2021[rnaDf_2021$versioned_ensembl_gene_id %in% genes, ])

rnaDf <- rnaDf[rnaDf$specimen_id %in% samples, ]


rnaDf <- rnaDf[, c("specimen_id", "versioned_ensembl_gene_id", "tpm")]
rnaDf <- as.data.frame(pivot_wider(rnaDf, names_from = "versioned_ensembl_gene_id",
                  values_from=c("tpm")))
row.names(rnaDf) <- rnaDf$specimen_id
rnaDf <- t(rnaDf[, names(rnaDf)!="specimen_id"])

rnaDf <- rnaDf[(rowSums(rnaDf>1)/dim(rnaDf)[2])*100 >70,]
rnaDf <- rnaDf[rowVars(rnaDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(rnaDf)) > 0))
compSamples <- names(which(colSums(is.na(rnaDf)) == 0))

print(paste("RNA seq Incomplete Sample Cases:", length(incompSamples)))

## [1] "RNA seq Incomplete Sample Cases: 0"

print(paste("RNA seq Complete Sample Cases:", length(compSamples)))

## [1] "RNA seq Complete Sample Cases: 360"

print(paste("RNA seq Feature Number:", dim(rnaDf)[1]))

## [1] "RNA seq Feature Number: 13092"

incompFeature <- names(which(rowSums(is.na(rnaDf)) > 0))
print(paste("RNA seq Incomplete Feature Numbers:", length(incompFeature)))

## [1] "RNA seq Incomplete Feature Numbers: 0"

print(c("RNA seq Incomplete Feature:", incompFeature))

## [1] "RNA seq Incomplete Feature:"

for (f in incompFeature){
  subDf <- data.frame(rnaDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("RNA seq Incomplete Samples:")

## [1] "RNA seq Incomplete Samples:"

print(incompSamples)

## character(0)

print("RNA seq Incomplete Subjects:")

## [1] "RNA seq Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("RNA seq", s, "Timepoints of Incomplete Subjects:"))
  # samp <- 
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}

print(paste("RNA seq Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "RNA seq Number of Incomplete Subjects: 0"

print(paste("RNA seq Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "RNA seq Number of Complete Subjects: 72"

print(paste("RNA seq Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(rnaDf), "subject_id"])))))

## [1] "RNA seq Number of All Subjects: 72"

olinkDf_2020 <- read_tsv(here("./data/2020LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
olinkDf_2021 <- read_tsv(here("./data/2021LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)

proteins <- intersect(levels(factor(olinkDf_2020$protein_id)),
                      levels(factor(olinkDf_2021$protein_id)))

olinkDf <- rbind(olinkDf_2020[olinkDf_2020$protein_id %in% proteins, ], 
                 olinkDf_2021[olinkDf_2021$protein_id %in% proteins, ])

olinkDf <- olinkDf[olinkDf$specimen_id %in% samples, ]

olinkDf <- olinkDf[, c("specimen_id", "protein_id", "protein_expression")]

olinkDf <- as.data.frame(pivot_wider(olinkDf, names_from = "protein_id",
                  values_from=c("protein_expression")))
row.names(olinkDf) <- olinkDf$specimen_id
olinkDf <- t(olinkDf[, names(olinkDf)!="specimen_id"])

olinkDf <- olinkDf[rowVars(olinkDf, na.rm = TRUE)>0, ]

incompSamples <- names(which(colSums(is.na(olinkDf)) > 0))
compSamples <- names(which(colSums(is.na(olinkDf)) == 0))

print(paste("Olink Incomplete Sample Cases:", length(incompSamples)))

## [1] "Olink Incomplete Sample Cases: 23"

print(paste("Olink Complete Sample Cases:", length(compSamples)))

## [1] "Olink Complete Sample Cases: 247"

print(paste("Olink Feature Number:", dim(olinkDf)[1]))

## [1] "Olink Feature Number: 30"

incompFeature <- names(which(rowSums(is.na(olinkDf)) > 0))
print(paste("Olink Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Olink Incomplete Feature Numbers: 3"

print(c("Olink Incomplete Feature:", incompFeature))

## [1] "Olink Incomplete Feature:" "P60568"                   
## [3] "O95760"                    "P35225"

for (f in incompFeature){
  subDf <- data.frame(olinkDf[f,])
  incompSamp <- names(which(rowSums(is.na(subDf)) > 0))
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))

  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)
  
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

## [1] "P60568 Number of Incomplete Samples:"
## [1] 16
## [1] "P60568 Incomplete Samples:"
##  [1] "472" "643" "483" "486" "586" "589" "569" "664" "517" "668" "513" "516"
## [13] "611" "698" "699" "688"
## [1] "P60568 Number of Incomplete Subjects:"
## [1] 10
## [1] "P60568 Incomplete Subjects:"
##  [1] "61" "63" "67" "74" "76" "79" "84" "87" "91" "92"
## [1] "O95760 Number of Incomplete Samples:"
## [1] 2
## [1] "O95760 Incomplete Samples:"
## [1] "696" "619"
## [1] "O95760 Number of Incomplete Subjects:"
## [1] 2
## [1] "O95760 Incomplete Subjects:"
## [1] "80" "92"
## [1] "P35225 Number of Incomplete Samples:"
## [1] 6
## [1] "P35225 Incomplete Samples:"
## [1] "469" "547" "548" "569" "563" "709"
## [1] "P35225 Number of Incomplete Subjects:"
## [1] 5
## [1] "P35225 Incomplete Subjects:"
## [1] "61" "71" "73" "74" "94"

print("Olink Incomplete Samples:")

## [1] "Olink Incomplete Samples:"

print(incompSamples)

##  [1] "472" "469" "547" "548" "643" "483" "486" "586" "589" "569" "664" "563"
## [13] "517" "668" "513" "516" "696" "611" "698" "699" "688" "619" "709"

print("Olink Incomplete Subjects:")

## [1] "Olink Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

##  [1] "61" "63" "67" "71" "73" "74" "76" "79" "80" "84" "87" "91" "92" "94"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## [1] "0"  "1"  "3"  "7"  "14"

incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
for (s in incompSubj){
  print(paste("Olink", s, "Timepoints of Incomplete Subjects:"))
  # samp <- 
  print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
}

## [1] "Olink 61 Timepoints of Incomplete Subjects:"
## [1] "1"  "14"
## [1] "Olink 63 Timepoints of Incomplete Subjects:"
## [1] "0" "7"
## [1] "Olink 67 Timepoints of Incomplete Subjects:"
## [1] "0"  "7"  "14"
## [1] "Olink 71 Timepoints of Incomplete Subjects:"
## [1] "1" "3"
## [1] "Olink 73 Timepoints of Incomplete Subjects:"
## [1] "1"
## [1] "Olink 74 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink 76 Timepoints of Incomplete Subjects:"
## [1] "1"  "14"
## [1] "Olink 79 Timepoints of Incomplete Subjects:"
## [1] "7"
## [1] "Olink 80 Timepoints of Incomplete Subjects:"
## [1] "7"
## [1] "Olink 84 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink 87 Timepoints of Incomplete Subjects:"
## [1] "0"  "14"
## [1] "Olink 91 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink 92 Timepoints of Incomplete Subjects:"
## [1] "1"  "7"  "14"
## [1] "Olink 94 Timepoints of Incomplete Subjects:"
## [1] "0"

print(paste("Olink Number of Incomplete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))

## [1] "Olink Number of Incomplete Subjects: 14"

print(paste("Olink Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Olink Number of Complete Subjects: 54"

print(paste("Olink Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(olinkDf), "subject_id"])))))

## [1] "Olink Number of All Subjects: 54"

dataList <- list()
dataList[["original"]] <- list("abtiter"= abtiterDf,
                 "cytof"= cellDf, 
                 "olink"= olinkDf,
                 "rnaseq"=rnaDf)
K = 20

cols <- colnames(dataList$original[["rnaseq"]])

add_cols <- function(df, cols, exp) {
  print(paste("************************", exp, "********************************"))
  df <- df[, colnames(df) %in% cols]
  add <- setdiff(cols, colnames(df))
  
  print(paste(exp, "All the Subjects:"))
  print(levels(factor(metaDf[metaDf$specimen_id %in% colnames(df), "subject_id"])))
  
  print(paste(exp, "Number of All the Subjects:", 
              length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(df), "subject_id"])))))
  # print(df[, !colnames(df) %in% add])
  incompSamples <- setdiff(names(which(colSums(is.na(df)) > 0)), add)
  compSamples <- setdiff(names(which(colSums(is.na(df)) == 0)), add)
  # print(colnames(df))
  # print(add)
  # print(incompSamples)
  
  print(paste(exp, "Number of selected Samples:", dim(df)[2]))
  print(paste(exp, "Number of Features:", dim(df)[1]))
  
  incompFeature <- setdiff(names(which(rowSums(is.na(df)) > 0)), add)
  print(paste(exp, "Number of Incomplete Features:", length(incompFeature)))
  print(paste(exp, "Incomplete Features:", incompFeature))
  
  for (f in incompFeature){
    subDf <- data.frame(df[f,])
    
    incompSamp <- setdiff(names(which(rowSums(is.na(subDf)) > 0)), add)
    incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamp, "subject_id"]))
  
    print(paste(f, "Number of Incomplete Samples:"))
    print(length(incompSamp))
    print(paste(f, "Incomplete Samples:"))
    print(incompSamp)
    
    print(paste(f, "Number of Incomplete Subjects:"))
    print(length(incompSubj))
    print(paste(f, "Incomplete Subjects:"))
    print(incompSubj)
  }
  
  print(paste(exp, "Incomplete Samples:") )
  print(incompSamples)
  print(paste(exp, "Incomplete Subjects:"))
  print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))
  print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))
  
  incompSubj <- levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"]))
  for (s in incompSubj){
    print(paste(exp, s, "Timepoints of Incomplete Subjects:"))
    print(levels(factor(metaDf[which(metaDf$specimen_id %in% incompSamples & metaDf$subject_id==s), "timepoint"])))
  }
  subjMissingTimepoints <- levels(factor(metaDf[metaDf$specimen_id %in% add, "subject_id"]))
  print(paste(exp, "Number of Missing Subjects:", 
              length(subjMissingTimepoints)))
  
  print(paste(exp, "Missing Subjects:"))
  print(subjMissingTimepoints)
  
  print(paste(exp, "Number of Incomplete Subjects:", 
              length(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))))
  print(paste(exp, "Number of Complete Subjects:", 
              length(setdiff(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])),
                             subjMissingTimepoints))))
  

  print(paste(exp, "Number of Missing Samples:", length(add)))
  
  if(length(incompFeature)>0){set.seed(1)
    print(paste(exp, "Impute Missing Features for:", incompFeature))
    print("====================================================================")
    df <- t(impute.knn(t(df), k=K)$data)}

  dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
  colnames(dumyDf) <- add
  
  if(length(add) != 0) df <- cbind(df, dumyDf)
  print(paste(exp, "Number of all Samples:", dim(df)[2]))
  
  print("********************************************************************")
  return(as.matrix(df[, sort(cols)]))
}

# dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf[, int_cols], cols, "RNA seq")
dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf, cols, "RNA seq")

## [1] "************************ RNA seq ********************************"
## [1] "RNA seq All the Subjects:"
##  [1] "1"  "3"  "4"  "5"  "6"  "9"  "10" "11" "13" "15" "17" "18" "19" "20" "21"
## [16] "22" "23" "24" "25" "26" "27" "29" "31" "32" "33" "35" "36" "38" "42" "43"
## [31] "44" "47" "48" "50" "52" "53" "61" "62" "63" "64" "65" "66" "67" "68" "69"
## [46] "70" "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [61] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [1] "RNA seq Number of All the Subjects: 72"
## [1] "RNA seq Number of selected Samples: 360"
## [1] "RNA seq Number of Features: 13092"
## [1] "RNA seq Number of Incomplete Features: 0"
## [1] "RNA seq Incomplete Features: "
## [1] "RNA seq Incomplete Samples:"
## character(0)
## [1] "RNA seq Incomplete Subjects:"
## character(0)
## character(0)
## [1] "RNA seq Number of Missing Subjects: 0"
## [1] "RNA seq Missing Subjects:"
## character(0)
## [1] "RNA seq Number of Incomplete Subjects: 0"
## [1] "RNA seq Number of Complete Subjects: 72"
## [1] "RNA seq Number of Missing Samples: 0"
## [1] "RNA seq Number of all Samples: 360"
## [1] "********************************************************************"

dataList$addedMissingVals[["abtiter"]] <- add_cols(abtiterDf, cols, "Ab-titer")

## [1] "************************ Ab-titer ********************************"
## [1] "Ab-titer All the Subjects:"
##  [1] "1"  "3"  "4"  "5"  "6"  "9"  "10" "11" "13" "15" "17" "18" "19" "20" "21"
## [16] "22" "23" "24" "25" "26" "27" "29" "31" "32" "33" "35" "36" "38" "42" "43"
## [31] "44" "47" "48" "50" "52" "53" "61" "62" "63" "64" "65" "66" "67" "68" "69"
## [46] "70" "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "83" "84" "85"
## [61] "86" "89" "90" "91" "92" "93" "94" "95" "96"
## [1] "Ab-titer Number of All the Subjects: 69"
## [1] "Ab-titer Number of selected Samples: 336"
## [1] "Ab-titer Number of Features: 31"
## [1] "Ab-titer Number of Incomplete Features: 0"
## [1] "Ab-titer Incomplete Features: "
## [1] "Ab-titer Incomplete Samples:"
## character(0)
## [1] "Ab-titer Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Ab-titer Number of Missing Subjects: 11"
## [1] "Ab-titer Missing Subjects:"
##  [1] "1"  "4"  "6"  "10" "24" "25" "33" "43" "82" "87" "88"
## [1] "Ab-titer Number of Incomplete Subjects: 0"
## [1] "Ab-titer Number of Complete Subjects: 61"
## [1] "Ab-titer Number of Missing Samples: 24"
## [1] "Ab-titer Number of all Samples: 360"
## [1] "********************************************************************"

dataList$addedMissingVals[["cytof"]] <- add_cols(cellDf, cols, "Cell Freq")

## [1] "************************ Cell Freq ********************************"
## [1] "Cell Freq All the Subjects:"
##  [1] "4"  "6"  "11" "15" "17" "20" "21" "26" "29" "31" "33" "36" "44" "47" "48"
## [16] "52" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "76" "77"
## [31] "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90" "91" "92"
## [46] "93" "94" "95" "96"
## [1] "Cell Freq Number of All the Subjects: 49"
## [1] "Cell Freq Number of selected Samples: 245"
## [1] "Cell Freq Number of Features: 22"
## [1] "Cell Freq Number of Incomplete Features: 9"
## [1] "Cell Freq Incomplete Features: TemraCD4"           
## [2] "Cell Freq Incomplete Features: NaiveCD4"           
## [3] "Cell Freq Incomplete Features: TemCD4"             
## [4] "Cell Freq Incomplete Features: TcmCD4"             
## [5] "Cell Freq Incomplete Features: TemraCD8"           
## [6] "Cell Freq Incomplete Features: NaiveCD8"           
## [7] "Cell Freq Incomplete Features: TemCD8"             
## [8] "Cell Freq Incomplete Features: TcmCD8"             
## [9] "Cell Freq Incomplete Features: ASCs (Plasmablasts)"
## [1] "TemraCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "TemraCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemraCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemraCD4 Incomplete Subjects:"
## [1] "70"
## [1] "NaiveCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "NaiveCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "NaiveCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "NaiveCD4 Incomplete Subjects:"
## [1] "70"
## [1] "TemCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "TemCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemCD4 Incomplete Subjects:"
## [1] "70"
## [1] "TcmCD4 Number of Incomplete Samples:"
## [1] 5
## [1] "TcmCD4 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TcmCD4 Number of Incomplete Subjects:"
## [1] 1
## [1] "TcmCD4 Incomplete Subjects:"
## [1] "70"
## [1] "TemraCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "TemraCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemraCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemraCD8 Incomplete Subjects:"
## [1] "70"
## [1] "NaiveCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "NaiveCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "NaiveCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "NaiveCD8 Incomplete Subjects:"
## [1] "70"
## [1] "TemCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "TemCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TemCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "TemCD8 Incomplete Subjects:"
## [1] "70"
## [1] "TcmCD8 Number of Incomplete Samples:"
## [1] 5
## [1] "TcmCD8 Incomplete Samples:"
## [1] "537" "538" "539" "540" "541"
## [1] "TcmCD8 Number of Incomplete Subjects:"
## [1] 1
## [1] "TcmCD8 Incomplete Subjects:"
## [1] "70"
## [1] "ASCs (Plasmablasts) Number of Incomplete Samples:"
## [1] 2
## [1] "ASCs (Plasmablasts) Incomplete Samples:"
## [1] "281" "131"
## [1] "ASCs (Plasmablasts) Number of Incomplete Subjects:"
## [1] 2
## [1] "ASCs (Plasmablasts) Incomplete Subjects:"
## [1] "17" "36"
## [1] "Cell Freq Incomplete Samples:"
## [1] "281" "131" "537" "538" "539" "540" "541"
## [1] "Cell Freq Incomplete Subjects:"
## [1] "17" "36" "70"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Freq 17 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Freq 36 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Cell Freq 70 Timepoints of Incomplete Subjects:"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Cell Freq Number of Missing Subjects: 23"
## [1] "Cell Freq Missing Subjects:"
##  [1] "1"  "3"  "5"  "9"  "10" "13" "18" "19" "22" "23" "24" "25" "27" "32" "35"
## [16] "38" "42" "43" "50" "53" "61" "62" "75"
## [1] "Cell Freq Number of Incomplete Subjects: 3"
## [1] "Cell Freq Number of Complete Subjects: 48"
## [1] "Cell Freq Number of Missing Samples: 115"
## [1] "Cell Freq Impute Missing Features for: TemraCD4"           
## [2] "Cell Freq Impute Missing Features for: NaiveCD4"           
## [3] "Cell Freq Impute Missing Features for: TemCD4"             
## [4] "Cell Freq Impute Missing Features for: TcmCD4"             
## [5] "Cell Freq Impute Missing Features for: TemraCD8"           
## [6] "Cell Freq Impute Missing Features for: NaiveCD8"           
## [7] "Cell Freq Impute Missing Features for: TemCD8"             
## [8] "Cell Freq Impute Missing Features for: TcmCD8"             
## [9] "Cell Freq Impute Missing Features for: ASCs (Plasmablasts)"
## [1] "===================================================================="
## [1] "Cell Freq Number of all Samples: 360"
## [1] "********************************************************************"

dataList$addedMissingVals[["olink"]] <- add_cols(olinkDf, cols, "Olink")

## [1] "************************ Olink ********************************"
## [1] "Olink All the Subjects:"
##  [1] "4"  "6"  "11" "15" "17" "20" "21" "26" "29" "31" "33" "36" "44" "47" "48"
## [16] "52" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74"
## [31] "75" "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89"
## [46] "90" "91" "92" "93" "94" "95" "96"
## [1] "Olink Number of All the Subjects: 52"
## [1] "Olink Number of selected Samples: 260"
## [1] "Olink Number of Features: 30"
## [1] "Olink Number of Incomplete Features: 3"
## [1] "Olink Incomplete Features: P60568" "Olink Incomplete Features: O95760"
## [3] "Olink Incomplete Features: P35225"
## [1] "P60568 Number of Incomplete Samples:"
## [1] 16
## [1] "P60568 Incomplete Samples:"
##  [1] "472" "643" "483" "486" "586" "589" "569" "664" "517" "668" "513" "516"
## [13] "611" "698" "699" "688"
## [1] "P60568 Number of Incomplete Subjects:"
## [1] 10
## [1] "P60568 Incomplete Subjects:"
##  [1] "61" "63" "67" "74" "76" "79" "84" "87" "91" "92"
## [1] "O95760 Number of Incomplete Samples:"
## [1] 2
## [1] "O95760 Incomplete Samples:"
## [1] "696" "619"
## [1] "O95760 Number of Incomplete Subjects:"
## [1] 2
## [1] "O95760 Incomplete Subjects:"
## [1] "80" "92"
## [1] "P35225 Number of Incomplete Samples:"
## [1] 6
## [1] "P35225 Incomplete Samples:"
## [1] "469" "547" "548" "569" "563" "709"
## [1] "P35225 Number of Incomplete Subjects:"
## [1] 5
## [1] "P35225 Incomplete Subjects:"
## [1] "61" "71" "73" "74" "94"
## [1] "Olink Incomplete Samples:"
##  [1] "472" "469" "547" "548" "643" "483" "486" "586" "589" "569" "664" "563"
## [13] "517" "668" "513" "516" "696" "611" "698" "699" "688" "619" "709"
## [1] "Olink Incomplete Subjects:"
##  [1] "61" "63" "67" "71" "73" "74" "76" "79" "80" "84" "87" "91" "92" "94"
## [1] "0"  "1"  "3"  "7"  "14"
## [1] "Olink 61 Timepoints of Incomplete Subjects:"
## [1] "1"  "14"
## [1] "Olink 63 Timepoints of Incomplete Subjects:"
## [1] "0" "7"
## [1] "Olink 67 Timepoints of Incomplete Subjects:"
## [1] "0"  "7"  "14"
## [1] "Olink 71 Timepoints of Incomplete Subjects:"
## [1] "1" "3"
## [1] "Olink 73 Timepoints of Incomplete Subjects:"
## [1] "1"
## [1] "Olink 74 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink 76 Timepoints of Incomplete Subjects:"
## [1] "1"  "14"
## [1] "Olink 79 Timepoints of Incomplete Subjects:"
## [1] "7"
## [1] "Olink 80 Timepoints of Incomplete Subjects:"
## [1] "7"
## [1] "Olink 84 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink 87 Timepoints of Incomplete Subjects:"
## [1] "0"  "14"
## [1] "Olink 91 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink 92 Timepoints of Incomplete Subjects:"
## [1] "1"  "7"  "14"
## [1] "Olink 94 Timepoints of Incomplete Subjects:"
## [1] "0"
## [1] "Olink Number of Missing Subjects: 20"
## [1] "Olink Missing Subjects:"
##  [1] "1"  "3"  "5"  "9"  "10" "13" "18" "19" "22" "23" "24" "25" "27" "32" "35"
## [16] "38" "42" "43" "50" "53"
## [1] "Olink Number of Incomplete Subjects: 14"
## [1] "Olink Number of Complete Subjects: 52"
## [1] "Olink Number of Missing Samples: 100"
## [1] "Olink Impute Missing Features for: P60568"
## [2] "Olink Impute Missing Features for: O95760"
## [3] "Olink Impute Missing Features for: P35225"
## [1] "===================================================================="
## [1] "Olink Number of all Samples: 360"
## [1] "********************************************************************"

dataList$featureImputed[["rnaseq"]] <- data.frame(dataList$addedMissingVals[["rnaseq"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["rnaseq"]]) <- gsub("X", "", colnames(dataList$featureImputed[["rnaseq"]]))

dataList$featureImputed[["abtiter"]] <- data.frame(dataList$addedMissingVals[["abtiter"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["abtiter"]]) <- gsub("X", "", colnames(dataList$featureImputed[["abtiter"]]))

dataList$featureImputed[["cytof"]] <- data.frame(dataList$addedMissingVals[["cytof"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["cytof"]]) <- gsub("X", "", colnames(dataList$featureImputed[["cytof"]]))

dataList$featureImputed[["olink"]] <- data.frame(dataList$addedMissingVals[["olink"]])%>% 
                                  select(where(~!all(is.na(.))))
colnames(dataList$featureImputed[["olink"]]) <- gsub("X", "", colnames(dataList$featureImputed[["olink"]]))

Modify Metadata

metaDf <- data.frame(metaDf[metaDf$specimen_id %in% cols, ])
colnames(metaDf)[colnames(metaDf)=="specimen_id"] <- "sample"
rownames(metaDf) <- metaDf$sample
metaDf$sample <- as.character(metaDf$sample)
metaDf <- metaDf[cols,]

aMeta <- metaDf[, c("sample", "dataset", "timepoint", "infancy_vac", "biological_sex", "age_at_boost")]

rownames(aMeta) <- aMeta$sample

aMeta$infancy_vac <- as.factor(aMeta$infancy_vac)
aMeta$biological_sex <- as.factor(aMeta$biological_sex)
aMeta$dataset <- as.factor(aMeta$dataset)
aMeta$age_at_boost <- as.factor(aMeta$age_at_boost)

aMeta$sample <- paste("0", aMeta$sample, sep="")

PVCA

PVCA <- function(counts, meta, threshold, inter){

  counts.center <- t(apply(counts, 1, scale, center=TRUE, scale=FALSE))
  cor.counts <- cor(counts.center)
  dim(cor.counts)
  eigen.counts <- eigen(cor.counts)
  eigen.mat <- eigen.counts$vectors
  eigen.val <- eigen.counts$values
  n.eigen <- length(eigen.val)
  eigen.val.sum <- sum(eigen.val)
  percents.pcs <- eigen.val/eigen.val.sum
  meta <- as.data.frame(meta)

  all <- 0
  npc.in <- 0
  for(i in 1:n.eigen){
    all <- all + percents.pcs[i]
    npc.in <- npc.in + 1
    if(all > threshold){break}
  }
  if (npc.in < 3) {npc <- 3}

  pred.list <- colnames(meta)
  meta <- droplevels(meta)

  n.preds <- ncol(meta) + 1
  if(inter) {n.preds <- n.preds + choose(ncol(meta),2)}

  ran.pred.list <- c()
  for(i in 1:ncol(meta)){
    ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i],")"))
  }
  ##interactions
  if(inter){
    for(i in 1:(ncol(meta)-1)){
      for(j in (i+1):ncol(meta)){
        ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i], ":", pred.list[j], ")"))
        pred.list <- c(pred.list, paste0(pred.list[i], ":", pred.list[j]))
      }
    }
  }
  formula <- paste(ran.pred.list, collapse = " + ")
  formula <- paste("pc", formula, sep=" ~ ")
  ran.var.mat <- NULL
  for(i in 1:npc.in){
    dat <- cbind(eigen.mat[,i],meta)
    colnames(dat) <- c("pc",colnames(meta))
    Rm1ML <- lme4::lmer(formula, dat, REML = TRUE, verbose = FALSE, na.action = na.omit,
                        # control=lmerControl(check.nobs.vs.nlev = "ignore",
                        #                    check.nobs.vs.rankZ = "ignore",
                        #                    check.nobs.vs.nRE="ignore")
                        )
    var.vec <- unlist(VarCorr(Rm1ML))
    ran.var.mat <- rbind(ran.var.mat, c(var.vec[pred.list], resid = sigma(Rm1ML)^2))
  }
  ran.var.mat.std <- ran.var.mat/rowSums(ran.var.mat)
  wgt.vec <- eigen.val/eigen.val.sum
  prop.var <- colSums(ran.var.mat.std*wgt.vec[1:npc.in])
  std.prop.var <- prop.var/sum(prop.var)
  std.prop.var
}

PlotPVCA <- function(pvca.res, title){
  plot.dat <- data.frame(eff=names(pvca.res), prop=pvca.res)
  p <- ggplot2::ggplot(plot.dat, aes(x=eff, y=prop))
  p <- p + ggplot2::ggtitle(title)
  p <- p + ggplot2::geom_bar(stat="identity", fill="steelblue", colour="steelblue")
  p <- p + ggplot2::geom_text(aes(label=round(prop,3), y=prop+0.04), size=4)
  p <- p + ggplot2::scale_x_discrete(limits=names(pvca.res))
  p <- p + ggplot2::scale_y_continuous(limits = c(0,1))
  p <- p + ggplot2::labs(x= "Effects", y= "Weighted average proportion variance")
  p <- p + ggplot2::theme_bw()
  p <- p + ggplot2::theme(plot.background = element_blank() ,panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank() ,panel.border = element_blank(), panel.background = element_blank())
  p <- p + ggplot2::theme(axis.line = element_line(color = 'black'))
  p <- p + ggplot2::theme(axis.title.x = element_text(size = 15, vjust=-0.5), 
                          axis.text.x = element_text(angle = 45, vjust= 1, hjust=1,  margin=margin(r=0)))
  p <- p + ggplot2::theme(axis.title.y = element_text(size = 15, vjust= 1.0))
  p <- p + ggplot2::theme(axis.text = element_text(size = 12))
  p
}

aData <- as.matrix(dataList$featureImputed[["cytof"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

aData <- as.matrix(dataList$featureImputed[["rnaseq"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation RNA seq")

## Warning: Removed 1 rows containing missing values (`geom_text()`).

aData <- as.matrix(dataList$featureImputed[["abtiter"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

aData <- as.matrix(dataList$featureImputed[["olink"]])

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Olink")

newDf <- do.call("rbind", dataList[["addedMissingVals"]])
newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))

aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

## Warning: Removed 1 rows containing missing values (`geom_text()`).

Data Analysis before imputation

MOFAobject_missingVals <- create_mofa(dataList[["addedMissingVals"]])

## Creating MOFA object from a list of matrices (features as rows, sample as columns)...

MOFAobject_missingVals

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

plot_data_overview(MOFAobject_missingVals)

Training Model

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_missingVals) <- metaDf
MOFAobject_missingVals

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

data_opts <- get_default_data_options(MOFAobject_missingVals)
data_opts

## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"

model_opts <- get_default_model_options(MOFAobject_missingVals)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_missingVals)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_missingVals <- prepare_mofa(MOFAobject_missingVals,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

## Warning in prepare_mofa(MOFAobject_missingVals, data_options = data_opts, :
## Some view(s) have a lot of features, it is recommended to perform a more
## stringent feature selection before creating the MOFA object....

## Checking data options...

## Checking training options...

## Checking model options...

MOFAobject_missingVals <- run_mofa(MOFAobject_missingVals, outfile=".../MOFA2_2ndChallenge_2021.hdf5", use_basilisk = TRUE)

## Warning: Output file .../MOFA2_2ndChallenge_2021.hdf5 already exists, it will be replaced

## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

## Warning in .quality_control(object, verbose = verbose): Factor(s) 1 are strongly correlated with the total number of expressed features for at least one of your omics. Such factors appear when there are differences in the total 'levels' between your samples, *sometimes* because of poor normalisation in the preprocessing steps.

MOFAobject_missingVals

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##  Number of factors: 15

plot_object = MOFAobject_missingVals
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Median Baseline Correction

add_NA_cols <- function(df, cols, exp) {
  df <- df[, colnames(df) %in% cols]
  add <- setdiff(cols, colnames(df))
  
  dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
  colnames(dumyDf) <- add
  
  if(length(add) != 0) df <- cbind(df, dumyDf)

  return(as.matrix(df[, sort(cols)]))
}

dataList$medianNormalized[["rnaseq"]] <- dataList$addedMissingVals[["rnaseq"]] 
dataList$medianNormalized[["abtiter"]] <- dataList$addedMissingVals[["abtiter"]]

baselineSamples <- metaDf[metaDf$timepoint=="0", "sample"]
### Cell Frequency

cellFreq_featureImp <- dataList$featureImputed[["cytof"]]

cellFreqMedian <- data.frame(apply(cellFreq_featureImp[, colnames(cellFreq_featureImp) %in% baselineSamples], 1, median))
colnames(cellFreqMedian) <- "median"

cellFreqMerge <- cbind(cellFreq_featureImp, cellFreqMedian)

cellFreq_medianCorrect <- cellFreqMerge[, colnames(cellFreq_featureImp)]/cellFreqMerge[, "median"]

cellFreqMean <- data.frame(apply(cellFreq_medianCorrect, 1, mean))
colnames(cellFreqMean) <- "mean"
cellFreqMerge <- cbind(cellFreq_medianCorrect, cellFreqMean)
cellFreq_medianCorrect <- log2(cellFreqMerge[, colnames(cellFreq_featureImp)] + (cellFreqMerge[, "mean"]/2))

dataList$medianNormalized[["cytof"]] <- add_NA_cols(cellFreq_medianCorrect, cols, "Cell Freq")

### Olink

olink_featureImp <- dataList$featureImputed[["olink"]]

olinkMedian <- data.frame(apply(olink_featureImp[, colnames(olink_featureImp) %in% baselineSamples], 1, median))
colnames(olinkMedian) <- "median"

olinkMerge <- cbind(olink_featureImp, olinkMedian)
olink_medianCorrect <- olinkMerge[, colnames(olink_featureImp)]/olinkMerge[, "median"]

olinkMean <- data.frame(apply(olink_medianCorrect, 1, mean))
colnames(olinkMean) <- "mean"
olinkMerge <- cbind(olink_medianCorrect, olinkMean)
olink_medianCorrect <- log2(olinkMerge[, colnames(olink_featureImp)] + (olinkMerge[, "mean"]/2))

dataList$medianNormalized[["olink"]] <- add_NA_cols(olink_medianCorrect, cols, "Olink")

newDf <- data.frame(dataList$medianNormalized[["cytof"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

newDf <- data.frame(dataList$medianNormalized[["rnaseq"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation RNA Seq")

newDf <- data.frame(dataList$medianNormalized[["abtiter"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

newDf <- data.frame(dataList$medianNormalized[["olink"]])%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation Olink")

newDf <- do.call("rbind", dataList[["medianNormalized"]])
newDf <- data.frame(newDf)%>% select_if(~all(!is.na(.)))
colnames(newDf) <- gsub("X", "", colnames(newDf))
aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("dataset", "timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)
PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

Data Analysis before imputation

MOFAobject_medianNormalized <- create_mofa(dataList[["medianNormalized"]])
MOFAobject_medianNormalized

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

plot_data_overview(MOFAobject_medianNormalized)

Training Model

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_medianNormalized) <- metaDf
MOFAobject_medianNormalized

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

data_opts <- get_default_data_options(MOFAobject_medianNormalized)
data_opts

## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"

model_opts <- get_default_model_options(MOFAobject_medianNormalized)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_medianNormalized)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_medianNormalized <- prepare_mofa(MOFAobject_medianNormalized,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_medianNormalized <- run_mofa(MOFAobject_medianNormalized, outfile=".../MOFA2_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_medianNormalized

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##  Number of factors: 15

plot_object = MOFAobject_medianNormalized
plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

MOFAobject_mofaImputed <- impute(MOFAobject_medianNormalized, views = "all")
samples_metadata(MOFAobject_medianNormalized) <- metaDf
MOFAobject_mofaImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##  Number of factors: 15

results <- MOFAobject_mofaImputed@imputed_data
dataList$mofaImputed[["rnaseq"]] <- results$rnaseq$group1
dataList$mofaImputed[["abtiter"]] <- results$abtiter$group1
dataList$mofaImputed[["cytof"]] <- results$cytof$group1
dataList$mofaImputed[["olink"]] <- results$olink$group1

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_mofaImputed <- create_mofa(dataList[["mofaImputed"]])
MOFAobject_mofaImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

plot_data_overview(MOFAobject_mofaImputed)

samples_metadata(MOFAobject_mofaImputed) <- metaDf
MOFAobject_mofaImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

data_opts <- get_default_data_options(MOFAobject_mofaImputed)
MOFAobject_mofaImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

model_opts <- get_default_model_options(MOFAobject_mofaImputed)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_mofaImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_mofaImputed <- prepare_mofa(MOFAobject_mofaImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_mofaImputed <- run_mofa(MOFAobject_mofaImputed, outfile=".../MofaIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_mofaImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 13092 31 22 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##  Number of factors: 15

plot_object = MOFAobject_mofaImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Mean imputation

dataList$meanImputed[["abtiter"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["abtiter"]]))))
dataList$meanImputed[["cytof"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["cytof"]]))))
dataList$meanImputed[["rnaseq"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["rnaseq"]]))))
dataList$meanImputed[["olink"]] <- as.matrix(t(na_mean(t(dataList$medianNormalized[["olink"]]))))
# a <-  lapply(dataList$addedMissingVals[["olink"]][, 1:10], NA2mean)

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_meanImputed <- create_mofa(dataList[["meanImputed"]])
MOFAobject_meanImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

plot_data_overview(MOFAobject_meanImputed)

samples_metadata(MOFAobject_meanImputed) <- metaDf
MOFAobject_meanImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

data_opts <- get_default_data_options(MOFAobject_meanImputed)
MOFAobject_meanImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

model_opts <- get_default_model_options(MOFAobject_meanImputed)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##    abtiter      cytof     rnaseq      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_meanImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_meanImputed <- prepare_mofa(MOFAobject_meanImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_meanImputed <- run_mofa(MOFAobject_meanImputed, outfile=".../MeanIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_meanImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##  Number of factors: 15

plot_object = MOFAobject_meanImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Median imputation

na2median <- function(df) {
  df <- data.frame(df)
  df <- t(data.frame(t(df)) %>%
                mutate(across(1:dim(df)[1], ~replace_na(., median(., na.rm=TRUE)))))
  colnames(df) <- gsub("X", "", colnames(df))
  return(as.matrix(df))
}

# na_replace from imputeTS can be used to impute median in these values too.

dataList$medianImputed[["abtiter"]] <- na2median(dataList$medianNormalized[["abtiter"]])
dataList$medianImputed[["cytof"]] <- na2median(dataList$medianNormalized[["cytof"]])
dataList$medianImputed[["rnaseq"]] <- na2median(dataList$medianNormalized[["rnaseq"]])
dataList$medianImputed[["olink"]] <- na2median(dataList$medianNormalized[["olink"]])

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

MOFAobject_medianImputed <- create_mofa(dataList[["medianImputed"]])
MOFAobject_medianImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

plot_data_overview(MOFAobject_medianImputed)

samples_metadata(MOFAobject_medianImputed) <- metaDf
MOFAobject_medianImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

data_opts <- get_default_data_options(MOFAobject_medianImputed)
MOFAobject_medianImputed

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##

model_opts <- get_default_model_options(MOFAobject_medianImputed)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##    abtiter      cytof     rnaseq      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

train_opts <- get_default_training_options(MOFAobject_medianImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

MOFAobject_medianImputed <- prepare_mofa(MOFAobject_medianImputed,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)
MOFAobject_medianImputed <- run_mofa(MOFAobject_medianImputed, outfile=".../MedianIpmputation_2ndChallenge_2020_2021.hdf5", use_basilisk = TRUE)

MOFAobject_medianImputed

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: abtiter cytof rnaseq olink 
##  Number of features (per view): 31 22 13092 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 360 
##  Number of factors: 15

plot_object = MOFAobject_medianImputed
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

plot_variance_explained(plot_object, plot_total = T)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

saveRDS(dataList, file = here("./results/allData_2020_2021.RDS"))

Testing Different data Manipulation for dataset 2020 & 2021

2023-10-03

Load libraries

Load Meta Data

Load Data

Modify Metadata

PVCA

Data Analysis before imputation

Training Model

Median Baseline Correction

Data Analysis before imputation

Training Model

Mean imputation

Median imputation