Testing Different Data Manipulations for the 2021 Dataset

Load libraries

suppressPackageStartupMessages({
library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)
library(impute)
library(pvca)
library(Biobase)
library(lme4)})

Find features in other datasets

# Feature catalogues taken from the 2020 data files. Each result is the
# character vector of unique values (factor levels) of the relevant id column.
read_2020 <- function(rel_path) {
  read_tsv(here(rel_path), show_col_types = FALSE)
}

cells <- levels(factor(read_2020("./data/2020LD_pbmc_cell_frequency.tsv")$cell_type_name))

# Antibody features are labelled "<isotype>_<antigen>".
antigens <- with(
  read_2020("./data/2020LD_plasma_ab_titer.tsv"),
  levels(factor(paste(isotype, antigen, sep = "_")))
)

# Gene ids have everything from the first "." onwards removed.
versioned_ids <- read_2020("./data/2020LD_pbmc_gene_expression.tsv")$versioned_ensembl_gene_id
genes <- levels(factor(gsub("\\..*", "", versioned_ids)))

proteins <- levels(factor(read_2020("./data/2020LD_plasma_cytokine_concentration.tsv")$protein_id))

Load

# Load the list of normalized tables and pull out the specimen metadata.
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

metaDf <- df_source[["subject_specimen"]]
# Age at boost in years, rounded to two decimals. The interval is measured in
# days and divided by 365.25; the previous weeks / 52 conversion treated a
# year as 364 days and therefore inflated every age by roughly 0.3%.
metaDf["age_at_boost"] <- round(
  as.numeric(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units = "days")) / 365.25,
  2
)
# Keep only rows belonging to the 2021 dataset.
metaDf <- metaDf[metaDf$dataset=="2021_dataset", ]
specimen <- metaDf[metaDf$dataset=="2021_dataset", "specimen_id"]

Load Meta Data

# Restrict the metadata to the timepoints used in this analysis.
kept_timepoints <- c(0, 1, 3, 7, 14)
metaDf <- metaDf[metaDf$timepoint %in% kept_timepoints, ]

# Unique specimen and subject ids, as character vectors (factor levels).
samples <- levels(factor(metaDf$specimen_id))
subjects <- levels(factor(metaDf$subject_id))
print(paste("Number of samples:", length(samples)))

## [1] "Number of samples: 180"

print(paste("Number of subjects:", length(subjects)))

## [1] "Number of subjects: 36"

Load Data

# Cell-frequency view: features in rows, specimens in columns, restricted to
# the selected specimens and to the cell types listed in `cells`.
cellDf <- as.data.frame(t(df_source[["pbmc_cell_frequency_wide"]]))
cellDf <- cellDf[, colnames(cellDf) %in% samples]
cellDf <- as.matrix(cellDf[rownames(cellDf) %in% cells, ])
# Number of subjects that have cell-frequency data. Specimen ids have to be
# matched with %in%; using the id values themselves as row indices (as this
# line did before) selects unrelated rows of the metadata.
subject_num <- length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(cellDf), "subject_id"])))

# Drop features whose variance across specimens is zero.
cellDf <- cellDf[rowVars(cellDf, na.rm = TRUE)>0, ]

# Specimens with at least one missing value vs. specimens with none.
incompSamples <- names(which(colSums(is.na(cellDf)) > 0))
compSamples <- names(which(colSums(is.na(cellDf)) == 0))
print(paste("Cell Frequency Incomplete Sample Cases:", length(incompSamples)))

## [1] "Cell Frequency Incomplete Sample Cases: 0"

print(paste("Cell Frequency Complete Sample Cases:", length(compSamples)))

## [1] "Cell Frequency Complete Sample Cases: 165"

print(paste("Cell Frequency Feature Number:", dim(cellDf)[1]))

## [1] "Cell Frequency Feature Number: 20"

incompFeature <- names(which(rowSums(is.na(cellDf)) > 0))
print(paste("Cell Frequency Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Cell Frequency Incomplete Feature Numbers: 0"

print(c("Cell Frequency Incomplete Feature:", incompFeature))

## [1] "Cell Frequency Incomplete Feature:"

# For every cell-frequency feature with missing values, report the affected
# specimens and the subjects those specimens belong to.
for (f in incompFeature) {
  subDf <- data.frame(cellDf[f, ])
  has_na <- rowSums(is.na(subDf)) > 0
  incompSamp <- names(which(has_na))
  from_incomplete <- metaDf$specimen_id %in% incompSamp
  incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))

  # Specimens first ...
  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)

  # ... then the subjects they come from.
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Cell Frequency Incomplete Samples:")

## [1] "Cell Frequency Incomplete Samples:"

print(incompSamples)

## character(0)

print("Cell Frequency Incomplete Subjects:")

## [1] "Cell Frequency Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

# Subjects owning at least one incomplete cell-frequency specimen, and the
# timepoints of those specimens for each such subject.
from_incomplete <- metaDf$specimen_id %in% incompSamples
incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))
for (s in incompSubj) {
  print(paste("Cell Frequency", s, "Timepoints of Incomplete Subjects:"))
  subject_rows <- which(from_incomplete & metaDf$subject_id == s)
  print(levels(factor(metaDf[subject_rows, "timepoint"])))
}

print(paste("Cell Frequency Number of Incomplete Subjects:",
            length(incompSubj)))

## [1] "Cell Frequency Number of Incomplete Subjects: 0"

print(paste("Cell Frequency Number of Complete Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Cell Frequency Number of Complete Subjects: 33"

print(paste("Cell Frequency Number of All Subjects:", 
            length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(cellDf), "subject_id"])))))

## [1] "Cell Frequency Number of All Subjects: 33"

# Antibody-titer view: features in rows, specimens in columns, restricted to
# the selected specimens and to the features listed in `antigens`.
abtiterDf <- as.data.frame(t(df_source[["plasma_antibody_levels_wide"]]))
in_cohort <- colnames(abtiterDf) %in% samples
abtiterDf <- abtiterDf[, in_cohort]
known_feature <- rownames(abtiterDf) %in% antigens
abtiterDf <- as.matrix(abtiterDf[known_feature, ])

# Drop zero-variance features, then split specimens by completeness.
has_variance <- rowVars(abtiterDf, na.rm = TRUE) > 0
abtiterDf <- abtiterDf[has_variance, ]
na_per_sample <- colSums(is.na(abtiterDf))
incompSamples <- names(which(na_per_sample > 0))
compSamples <- names(which(na_per_sample == 0))

print(paste("Ab Titer Incomplete Sample Cases:", length(incompSamples)))

## [1] "Ab Titer Incomplete Sample Cases: 0"

print(paste("Ab Titer Complete Sample Cases:", length(compSamples)))

## [1] "Ab Titer Complete Sample Cases: 165"

print(paste("Ab Titer Feature Number:", dim(abtiterDf)[1]))

## [1] "Ab Titer Feature Number: 27"

incompFeature <- names(which(rowSums(is.na(abtiterDf)) > 0))
print(paste("Ab Titer Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Ab Titer Incomplete Feature Numbers: 0"

print(c("Ab Titer Incomplete Feature:", incompFeature))

## [1] "Ab Titer Incomplete Feature:"

# For every antibody-titer feature with missing values, report the affected
# specimens and the subjects those specimens belong to.
for (f in incompFeature) {
  subDf <- data.frame(abtiterDf[f, ])
  has_na <- rowSums(is.na(subDf)) > 0
  incompSamp <- names(which(has_na))
  from_incomplete <- metaDf$specimen_id %in% incompSamp
  incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))

  # Specimens first ...
  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)

  # ... then the subjects they come from.
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Ab Titer Incomplete Samples:")

## [1] "Ab Titer Incomplete Samples:"

print(incompSamples)

## character(0)

print("Ab Titer Incomplete Subjects:")

## [1] "Ab Titer Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

# Subjects owning at least one incomplete antibody-titer specimen, and the
# timepoints of those specimens for each such subject.
from_incomplete <- metaDf$specimen_id %in% incompSamples
incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))
for (s in incompSubj) {
  print(paste("Ab Titer", s, "Timepoints of Incomplete Subjects:"))
  subject_rows <- which(from_incomplete & metaDf$subject_id == s)
  print(levels(factor(metaDf[subject_rows, "timepoint"])))
}

print(paste("Ab Titer Number of Incomplete Subjects:", length(incompSubj)))

## [1] "Ab Titer Number of Incomplete Subjects: 0"

print(paste("Ab Titer Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Ab Titer Number of Complete Subjects: 33"

print(paste("Ab Titer Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(abtiterDf), "subject_id"])))))

## [1] "Ab Titer Number of All Subjects: 33"

# RNA-seq view: features in rows, specimens in columns, restricted to the
# selected specimens.
rnaDf <- as.data.frame(t(df_source[["pbmc_gene_expression_wide"]]))
in_cohort <- colnames(rnaDf) %in% samples
rnaDf <- rnaDf[, in_cohort]

# Remove everything from the first "." onwards in the gene ids, then keep
# only genes listed in `genes`.
rownames(rnaDf) <- gsub("\\..*", "", rownames(rnaDf))
known_gene <- rownames(rnaDf) %in% genes
rnaDf <- as.matrix(rnaDf[known_gene, ])

# Keep genes whose value is above 1 in more than 70% of the specimens.
pct_above_one <- (rowSums(rnaDf > 1) / ncol(rnaDf)) * 100
rnaDf <- rnaDf[pct_above_one > 70, ]

# Drop zero-variance genes, then split specimens by completeness.
has_variance <- rowVars(rnaDf, na.rm = TRUE) > 0
rnaDf <- rnaDf[has_variance, ]

na_per_sample <- colSums(is.na(rnaDf))
incompSamples <- names(which(na_per_sample > 0))
compSamples <- names(which(na_per_sample == 0))

print(paste("RNA seq Incomplete Sample Cases:", length(incompSamples)))

## [1] "RNA seq Incomplete Sample Cases: 0"

print(paste("RNA seq Complete Sample Cases:", length(compSamples)))

## [1] "RNA seq Complete Sample Cases: 180"

print(paste("RNA seq Feature Number:", dim(rnaDf)[1]))

## [1] "RNA seq Feature Number: 10492"

# print(dim(rnaDf))

incompFeature <- names(which(rowSums(is.na(rnaDf)) > 0))
print(paste("RNA seq Incomplete Feature Numbers:", length(incompFeature)))

## [1] "RNA seq Incomplete Feature Numbers: 0"

print(c("RNA seq Incomplete Feature:", incompFeature))

## [1] "RNA seq Incomplete Feature:"

# For every RNA-seq feature with missing values, report the affected
# specimens and the subjects those specimens belong to.
for (f in incompFeature) {
  subDf <- data.frame(rnaDf[f, ])
  has_na <- rowSums(is.na(subDf)) > 0
  incompSamp <- names(which(has_na))
  from_incomplete <- metaDf$specimen_id %in% incompSamp
  incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))

  # Specimens first ...
  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)

  # ... then the subjects they come from.
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("RNA seq Incomplete Samples:")

## [1] "RNA seq Incomplete Samples:"

print(incompSamples)

## character(0)

print("RNA seq Incomplete Subjects:")

## [1] "RNA seq Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

# Subjects owning at least one incomplete RNA-seq specimen, and the
# timepoints of those specimens for each such subject.
from_incomplete <- metaDf$specimen_id %in% incompSamples
incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))
for (s in incompSubj) {
  print(paste("RNA seq", s, "Timepoints of Incomplete Subjects:"))
  subject_rows <- which(from_incomplete & metaDf$subject_id == s)
  print(levels(factor(metaDf[subject_rows, "timepoint"])))
}

print(paste("RNA seq Number of Incomplete Subjects:", length(incompSubj)))

## [1] "RNA seq Number of Incomplete Subjects: 0"

print(paste("RNA seq Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "RNA seq Number of Complete Subjects: 36"

print(paste("RNA seq Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(rnaDf), "subject_id"])))))

## [1] "RNA seq Number of All Subjects: 36"

# Olink (plasma cytokine) view: features in rows, specimens in columns,
# restricted to the selected specimens and to the proteins in `proteins`.
olinkDf <- as.data.frame(t(df_source[["plasma_cytokine_concentrations_wide"]]))
in_cohort <- colnames(olinkDf) %in% samples
olinkDf <- olinkDf[, in_cohort]
known_protein <- rownames(olinkDf) %in% proteins
olinkDf <- as.matrix(olinkDf[known_protein, ])

# Disabled alternative that built the wide table from long-format columns:
# olinkDf <- olinkDf[olinkDf$protein_id %in% proteins, c("specimen_id", "protein_id", "protein_expression")]
#
# olinkDf <- as.data.frame(pivot_wider(olinkDf, names_from = "protein_id",
#                   values_from=c("protein_expression")))
# row.names(olinkDf) <- olinkDf$specimen_id
# olinkDf <- t(olinkDf[, names(olinkDf)!="specimen_id"])

# Drop zero-variance features, then split specimens by completeness.
has_variance <- rowVars(olinkDf, na.rm = TRUE) > 0
olinkDf <- olinkDf[has_variance, ]

na_per_sample <- colSums(is.na(olinkDf))
incompSamples <- names(which(na_per_sample > 0))
compSamples <- names(which(na_per_sample == 0))

print(paste("Olink Incomplete Sample Cases:", length(incompSamples)))

## [1] "Olink Incomplete Sample Cases: 0"

print(paste("Olink Complete Sample Cases:", length(compSamples)))

## [1] "Olink Complete Sample Cases: 180"

print(paste("Olink Feature Number:", dim(olinkDf)[1]))

## [1] "Olink Feature Number: 30"

incompFeature <- names(which(rowSums(is.na(olinkDf)) > 0))
print(paste("Olink Incomplete Feature Numbers:", length(incompFeature)))

## [1] "Olink Incomplete Feature Numbers: 0"

print(c("Olink Incomplete Feature:", incompFeature))

## [1] "Olink Incomplete Feature:"

# For every Olink feature with missing values, report the affected specimens
# and the subjects those specimens belong to.
for (f in incompFeature) {
  subDf <- data.frame(olinkDf[f, ])
  has_na <- rowSums(is.na(subDf)) > 0
  incompSamp <- names(which(has_na))
  from_incomplete <- metaDf$specimen_id %in% incompSamp
  incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))

  # Specimens first ...
  print(paste(f, "Number of Incomplete Samples:"))
  print(length(incompSamp))
  print(paste(f, "Incomplete Samples:"))
  print(incompSamp)

  # ... then the subjects they come from.
  print(paste(f, "Number of Incomplete Subjects:"))
  print(length(incompSubj))
  print(paste(f, "Incomplete Subjects:"))
  print(incompSubj)
}

print("Olink Incomplete Samples:")

## [1] "Olink Incomplete Samples:"

print(incompSamples)

## character(0)

print("Olink Incomplete Subjects:")

## [1] "Olink Incomplete Subjects:"

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "subject_id"])))

## character(0)

print(levels(factor(metaDf[metaDf$specimen_id %in%  incompSamples, "timepoint"])))

## character(0)

# Subjects owning at least one incomplete Olink specimen, and the timepoints
# of those specimens for each such subject.
from_incomplete <- metaDf$specimen_id %in% incompSamples
incompSubj <- levels(factor(metaDf[from_incomplete, "subject_id"]))
for (s in incompSubj) {
  print(paste("Olink", s, "Timepoints of Incomplete Subjects:"))
  subject_rows <- which(from_incomplete & metaDf$subject_id == s)
  print(levels(factor(metaDf[subject_rows, "timepoint"])))
}

print(paste("Olink Number of Incomplete Subjects:", length(incompSubj)))

## [1] "Olink Number of Incomplete Subjects: 0"

print(paste("Olink Number of Complete Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in%  compSamples, "subject_id"])))))

## [1] "Olink Number of Complete Subjects: 36"

print(paste("Olink Number of All Subjects:", length(levels(factor(metaDf[metaDf$specimen_id %in% colnames(olinkDf), "subject_id"])))))

## [1] "Olink Number of All Subjects: 36"

# Collect the four filtered views (features in rows, specimens in columns).
dataList <- list()
dataList[["original"]] <- list("abtiter"= abtiterDf,
                 "cytof"= cellDf, 
                 "olink"= olinkDf,
                 "rnaseq"=rnaDf)
# Neighbour count read by add_cols() when it calls impute.knn().
K <- 20
# int_cols <- Reduce(intersect, lapply(dataList$original[c("abtiter", "cytof", "olink", "rnaseq")], colnames))
# cols <- unique(c(int_cols, colnames(dataList$original[["rnaseq"]])))

# The RNA-seq specimens define the common sample set for all views.
cols <- colnames(dataList$original[["rnaseq"]])

#' Report missingness for one view and pad it to a common set of samples
#'
#' Prints a diagnostic summary (subjects, incomplete samples/features, missing
#' samples), imputes with `impute.knn()` when any feature has missing values,
#' and appends an all-NA column for every id in `cols` that the view lacks.
#'
#' @param df Matrix with features in rows and specimen ids as column names.
#' @param cols Character vector of specimen ids the result must contain.
#' @param exp Label used as a prefix in every printed line.
#' @param k Number of neighbours handed to `impute.knn()`. Defaults to the
#'   global `K`, which is what the function read implicitly before.
#' @param meta Data frame with `specimen_id`, `subject_id` and `timepoint`
#'   columns. Defaults to the global `metaDf`, as before.
#' @return A matrix with the rows of `df` and one column per element of
#'   `cols`, columns ordered as `sort(cols)`.
add_cols <- function(df, cols, exp, k = K, meta = metaDf) {
  # Unique values (as factor levels) of one metadata column for given specimens.
  meta_levels <- function(ids, column) {
    levels(factor(meta[meta$specimen_id %in% ids, column]))
  }

  print(paste("************************", exp, "********************************"))
  # drop = FALSE keeps a matrix even when only one column matches; without it
  # the subset collapses to a vector and every colnames()/colSums() call fails.
  df <- df[, colnames(df) %in% cols, drop = FALSE]
  # Requested specimens that this view does not have. None of these ids is a
  # column of `df`, so the sample-level summaries below need no further
  # filtering against `add`.
  add <- setdiff(cols, colnames(df))

  allSubj <- meta_levels(colnames(df), "subject_id")
  print(paste(exp, "All the Subjects:"))
  print(allSubj)
  print(paste(exp, "Number of All the Subjects:", length(allSubj)))

  naPerSample <- colSums(is.na(df))
  incompSamples <- names(which(naPerSample > 0))
  compSamples <- names(which(naPerSample == 0))

  print(paste(exp, "Number of selected Samples:", dim(df)[2]))
  print(paste(exp, "Number of Features:", dim(df)[1]))

  incompFeature <- names(which(rowSums(is.na(df)) > 0))
  print(paste(exp, "Number of Incomplete Features:", length(incompFeature)))
  print(paste(exp, "Incomplete Features:", incompFeature))

  # Per-feature breakdown of the specimens/subjects with missing values.
  for (f in incompFeature) {
    incompSamp <- colnames(df)[is.na(df[f, ])]
    incompSubj <- meta_levels(incompSamp, "subject_id")

    print(paste(f, "Number of Incomplete Samples:"))
    print(length(incompSamp))
    print(paste(f, "Incomplete Samples:"))
    print(incompSamp)

    print(paste(f, "Number of Incomplete Subjects:"))
    print(length(incompSubj))
    print(paste(f, "Incomplete Subjects:"))
    print(incompSubj)
  }

  incompSubj <- meta_levels(incompSamples, "subject_id")
  print(paste(exp, "Incomplete Samples:"))
  print(incompSamples)
  print(paste(exp, "Incomplete Subjects:"))
  print(incompSubj)
  print(meta_levels(incompSamples, "timepoint"))

  for (s in incompSubj) {
    print(paste(exp, s, "Timepoints of Incomplete Subjects:"))
    subjectRows <- which(meta$specimen_id %in% incompSamples & meta$subject_id == s)
    print(levels(factor(meta[subjectRows, "timepoint"])))
  }

  # Subjects with at least one specimen absent from this view.
  subjMissingTimepoints <- meta_levels(add, "subject_id")
  print(paste(exp, "Number of Missing Subjects:",
              length(subjMissingTimepoints)))

  print(paste(exp, "Missing Subjects:"))
  print(subjMissingTimepoints)

  print(paste(exp, "Number of Incomplete Subjects:", length(incompSubj)))
  print(paste(exp, "Number of Complete Subjects:",
              length(setdiff(meta_levels(compSamples, "subject_id"),
                             subjMissingTimepoints))))

  print(paste(exp, "Number of Missing Samples:", length(add)))

  if (length(incompFeature) > 0) {
    set.seed(1)
    print(paste(exp, "Impute Missing Features for:", incompFeature))
    print("====================================================================")
    # impute.knn() receives the transposed matrix (specimens in rows).
    df <- t(impute.knn(t(df), k = k)$data)
  }

  # Pad with one all-NA column per missing specimen. A logical NA matrix is
  # used so that cbind() keeps the storage type of `df`.
  if (length(add) != 0) {
    padding <- matrix(NA, nrow = nrow(df), ncol = length(add),
                      dimnames = list(rownames(df), add))
    df <- cbind(df, padding)
  }
  print(paste(exp, "Number of all Samples:", dim(df)[2]))

  print("********************************************************************")
  as.matrix(df[, sort(cols), drop = FALSE])
}

# dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf[, int_cols], cols, "RNA seq")
dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf, cols, "RNA seq")

## [1] "************************ RNA seq ********************************"
## [1] "RNA seq All the Subjects:"
##  [1] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [16] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [31] "91" "92" "93" "94" "95" "96"
## [1] "RNA seq Number of All the Subjects: 36"
## [1] "RNA seq Number of selected Samples: 180"
## [1] "RNA seq Number of Features: 10492"
## [1] "RNA seq Number of Incomplete Features: 0"
## [1] "RNA seq Incomplete Features: "
## [1] "RNA seq Incomplete Samples:"
## character(0)
## [1] "RNA seq Incomplete Subjects:"
## character(0)
## character(0)
## [1] "RNA seq Number of Missing Subjects: 0"
## [1] "RNA seq Missing Subjects:"
## character(0)
## [1] "RNA seq Number of Incomplete Subjects: 0"
## [1] "RNA seq Number of Complete Subjects: 36"
## [1] "RNA seq Number of Missing Samples: 0"
## [1] "RNA seq Number of all Samples: 180"
## [1] "********************************************************************"

dataList$addedMissingVals[["abtiter"]] <- add_cols(abtiterDf, cols, "Ab-titer")

## [1] "************************ Ab-titer ********************************"
## [1] "Ab-titer All the Subjects:"
##  [1] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [16] "76" "77" "78" "79" "80" "81" "83" "84" "85" "86" "89" "90" "91" "92" "93"
## [31] "94" "95" "96"
## [1] "Ab-titer Number of All the Subjects: 33"
## [1] "Ab-titer Number of selected Samples: 165"
## [1] "Ab-titer Number of Features: 27"
## [1] "Ab-titer Number of Incomplete Features: 0"
## [1] "Ab-titer Incomplete Features: "
## [1] "Ab-titer Incomplete Samples:"
## character(0)
## [1] "Ab-titer Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Ab-titer Number of Missing Subjects: 3"
## [1] "Ab-titer Missing Subjects:"
## [1] "82" "87" "88"
## [1] "Ab-titer Number of Incomplete Subjects: 0"
## [1] "Ab-titer Number of Complete Subjects: 33"
## [1] "Ab-titer Number of Missing Samples: 15"
## [1] "Ab-titer Number of all Samples: 180"
## [1] "********************************************************************"

dataList$addedMissingVals[["cytof"]] <- add_cols(cellDf, cols, "Cell Freq")

## [1] "************************ Cell Freq ********************************"
## [1] "Cell Freq All the Subjects:"
##  [1] "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "76" "77" "78"
## [16] "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90" "91" "92" "93"
## [31] "94" "95" "96"
## [1] "Cell Freq Number of All the Subjects: 33"
## [1] "Cell Freq Number of selected Samples: 165"
## [1] "Cell Freq Number of Features: 20"
## [1] "Cell Freq Number of Incomplete Features: 0"
## [1] "Cell Freq Incomplete Features: "
## [1] "Cell Freq Incomplete Samples:"
## character(0)
## [1] "Cell Freq Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Cell Freq Number of Missing Subjects: 3"
## [1] "Cell Freq Missing Subjects:"
## [1] "61" "62" "75"
## [1] "Cell Freq Number of Incomplete Subjects: 0"
## [1] "Cell Freq Number of Complete Subjects: 33"
## [1] "Cell Freq Number of Missing Samples: 15"
## [1] "Cell Freq Number of all Samples: 180"
## [1] "********************************************************************"

dataList$addedMissingVals[["olink"]] <- add_cols(olinkDf, cols, "Olink")

## [1] "************************ Olink ********************************"
## [1] "Olink All the Subjects:"
##  [1] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" "73" "74" "75"
## [16] "76" "77" "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88" "89" "90"
## [31] "91" "92" "93" "94" "95" "96"
## [1] "Olink Number of All the Subjects: 36"
## [1] "Olink Number of selected Samples: 180"
## [1] "Olink Number of Features: 30"
## [1] "Olink Number of Incomplete Features: 0"
## [1] "Olink Incomplete Features: "
## [1] "Olink Incomplete Samples:"
## character(0)
## [1] "Olink Incomplete Subjects:"
## character(0)
## character(0)
## [1] "Olink Number of Missing Subjects: 0"
## [1] "Olink Missing Subjects:"
## character(0)
## [1] "Olink Number of Incomplete Subjects: 0"
## [1] "Olink Number of Complete Subjects: 36"
## [1] "Olink Number of Missing Samples: 0"
## [1] "Olink Number of all Samples: 180"
## [1] "********************************************************************"

# Per-view data frames without the padded (all-NA) specimen columns.
# check.names = FALSE keeps the specimen ids as column names, so no "X"
# prefix is added and nothing has to be stripped afterwards. The previous
# gsub("X", "", ...) removed every "X" in a name, not only that prefix.
dataList$featureImputed <- lapply(
  dataList$addedMissingVals[c("rnaseq", "abtiter", "cytof", "olink")],
  function(view) {
    view_df <- data.frame(view, check.names = FALSE)
    # Keep columns that contain at least one observed value.
    view_df[, colSums(!is.na(view_df)) > 0, drop = FALSE]
  }
)

Modify Metadata

# Keep metadata for the specimens in `cols`, key the rows by specimen id
# (renamed to "sample") and put them in the same order as `cols`.
in_common_set <- metaDf$specimen_id %in% cols
metaDf <- data.frame(metaDf[in_common_set, ])
names(metaDf)[names(metaDf) == "specimen_id"] <- "sample"
rownames(metaDf) <- metaDf$sample
metaDf$sample <- as.character(metaDf$sample)
metaDf <- metaDf[cols, ]

# Covariate table used by the PVCA calls.
meta_fields <- c("sample", "dataset", "timepoint", "infancy_vac", "biological_sex", "age_at_boost")
aMeta <- metaDf[, meta_fields]

rownames(aMeta) <- aMeta$sample

# Treat these covariates as categorical.
factor_fields <- c("infancy_vac", "biological_sex", "dataset", "age_at_boost")
aMeta[factor_fields] <- lapply(aMeta[factor_fields], as.factor)

# The sample label gets a leading "0"; the row names keep the plain id.
aMeta$sample <- paste0("0", aMeta$sample)

PVCA

#' Principal variance component analysis
#'
#' Eigen-decomposes the sample-by-sample correlation matrix of the row-centred
#' data, fits one random-intercept mixed model per retained principal
#' component, and returns the eigenvalue-weighted share of variance attributed
#' to each covariate (and, optionally, each pairwise interaction) plus the
#' residual.
#'
#' @param counts Numeric matrix, features in rows and samples in columns.
#' @param meta Covariates, one row per sample in the same order as the columns
#'   of `counts` (each eigenvector is column-bound to `meta`).
#' @param threshold Cumulative proportion of variance the retained principal
#'   components must exceed.
#' @param inter Logical; if TRUE, add a random effect for every pairwise
#'   interaction of the covariates.
#' @return Named numeric vector (covariates, interactions, then `resid`) that
#'   sums to 1.
PVCA <- function(counts, meta, threshold, inter){

  # Centre every feature, then decompose the sample correlation matrix.
  counts.center <- t(apply(counts, 1, scale, center=TRUE, scale=FALSE))
  cor.counts <- cor(counts.center)
  eigen.counts <- eigen(cor.counts)
  eigen.mat <- eigen.counts$vectors
  eigen.val <- eigen.counts$values
  n.eigen <- length(eigen.val)
  percents.pcs <- eigen.val / sum(eigen.val)
  meta <- droplevels(as.data.frame(meta))

  # Smallest number of leading PCs whose cumulative share exceeds `threshold`
  # (all of them if the threshold is never exceeded).
  npc.in <- which(cumsum(percents.pcs) > threshold)[1]
  if (is.na(npc.in)) {
    npc.in <- n.eigen
  }
  # Use at least three PCs, but never more than exist. The earlier code
  # assigned the minimum to an unused variable (`npc`), so it never applied;
  # results for runs that previously used fewer than three PCs will change.
  npc.in <- min(max(npc.in, 3), n.eigen)

  # One random intercept per covariate ...
  pred.list <- colnames(meta)
  ran.pred.list <- paste0("(1|", pred.list, ")")
  # ... and, if requested, per pairwise interaction (needs >= 2 covariates;
  # the pairs come out in the order (1,2), (1,3), ..., (2,3), ...).
  if (inter && ncol(meta) > 1) {
    pair.list <- utils::combn(pred.list, 2, paste, collapse = ":")
    ran.pred.list <- c(ran.pred.list, paste0("(1|", pair.list, ")"))
    pred.list <- c(pred.list, pair.list)
  }
  formula <- paste("pc", paste(ran.pred.list, collapse = " + "), sep = " ~ ")

  # Variance components for each retained PC (one row per PC).
  ran.var.mat <- NULL
  for (i in seq_len(npc.in)) {
    dat <- cbind(eigen.mat[, i], meta)
    colnames(dat) <- c("pc", colnames(meta))
    Rm1ML <- lme4::lmer(formula, dat, REML = TRUE, verbose = FALSE, na.action = na.omit,
                        # control=lmerControl(check.nobs.vs.nlev = "ignore",
                        #                    check.nobs.vs.rankZ = "ignore",
                        #                    check.nobs.vs.nRE="ignore")
                        )
    var.vec <- unlist(lme4::VarCorr(Rm1ML))
    ran.var.mat <- rbind(ran.var.mat, c(var.vec[pred.list], resid = sigma(Rm1ML)^2))
  }

  # Standardise within each PC, weight each PC (row) by its share of the
  # total variance, and renormalise so the result sums to 1.
  ran.var.mat.std <- ran.var.mat / rowSums(ran.var.mat)
  prop.var <- colSums(ran.var.mat.std * percents.pcs[seq_len(npc.in)])
  prop.var / sum(prop.var)
}

#' Bar chart of PVCA variance proportions
#'
#' @param pvca.res Named numeric vector as returned by `PVCA()`; the names
#'   become the x-axis categories, in the order given.
#' @param title Plot title.
#' @return A ggplot object.
PlotPVCA <- function(pvca.res, title){
  effect_names <- names(pvca.res)
  plot.dat <- data.frame(eff=effect_names, prop=pvca.res)

  # Strip the panel decoration left by theme_bw() and style the axes.
  axis_styling <- ggplot2::theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(color = 'black'),
    axis.title.x = element_text(size = 15, vjust=-0.5),
    axis.text.x = element_text(angle = 45, vjust= 1, hjust=1,  margin=margin(r=0)),
    axis.title.y = element_text(size = 15, vjust= 1.0),
    axis.text = element_text(size = 12)
  )

  ggplot2::ggplot(plot.dat, aes(x=eff, y=prop)) +
    ggplot2::ggtitle(title) +
    ggplot2::geom_bar(stat="identity", fill="steelblue", colour="steelblue") +
    # Value labels sit slightly above each bar.
    ggplot2::geom_text(aes(label=round(prop,3), y=prop+0.04), size=4) +
    ggplot2::scale_x_discrete(limits=effect_names) +
    ggplot2::scale_y_continuous(limits = c(0,1)) +
    ggplot2::labs(x= "Effects", y= "Weighted average proportion variance") +
    ggplot2::theme_bw() +
    axis_styling
}

# PVCA on the cell-frequency view.
aData <- as.matrix(dataList$featureImputed[["cytof"]])

pvca_covariates <- c("timepoint", "infancy_vac", "biological_sex")
pvcaObj <- PVCA(
  counts = aData,
  meta = aMeta[colnames(aData), pvca_covariates],
  threshold = 0.4,
  inter = TRUE
)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Cell Frequency")

# PVCA on the RNA-seq view.
aData <- as.matrix(dataList$featureImputed[["rnaseq"]])

pvca_covariates <- c("timepoint", "infancy_vac", "biological_sex")
pvcaObj <- PVCA(
  counts = aData,
  meta = aMeta[colnames(aData), pvca_covariates],
  threshold = 0.4,
  inter = TRUE
)

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation RNA seq")

# PVCA on the antibody-titer view.
aData <- as.matrix(dataList$featureImputed[["abtiter"]])

pvca_covariates <- c("timepoint", "infancy_vac", "biological_sex")
pvcaObj <- PVCA(
  counts = aData,
  meta = aMeta[colnames(aData), pvca_covariates],
  threshold = 0.4,
  inter = TRUE
)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Ab-Titer")

# PVCA on the Olink view.
aData <- as.matrix(dataList$featureImputed[["olink"]])

pvca_covariates <- c("timepoint", "infancy_vac", "biological_sex")
pvcaObj <- PVCA(
  counts = aData,
  meta = aMeta[colnames(aData), pvca_covariates],
  threshold = 0.4,
  inter = TRUE
)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation Olink")

# Stack all views (features of every view in rows) and keep only specimens
# with no missing value in any view.
newDf <- do.call("rbind", dataList[["addedMissingVals"]])
# check.names = FALSE keeps the specimen ids as column names, so there is no
# "X" prefix to strip (the old gsub("X", "", ...) removed every "X").
# Base subsetting replaces the superseded dplyr::select_if().
newDf <- data.frame(newDf, check.names = FALSE)
newDf <- newDf[, colSums(is.na(newDf)) == 0, drop = FALSE]

aData <- as.matrix(newDf)

pvcaObj <- PVCA(aData, meta=aMeta[colnames(aData), c("timepoint", "infancy_vac", "biological_sex")],  threshold=0.4, inter = TRUE)

## boundary (singular) fit: see help('isSingular')

PlotPVCA(pvcaObj, "PVCA estimation All Experiments")

Data Analysis before imputation

MOFAobject_missingVals <- create_mofa(dataList[["addedMissingVals"]])

## Creating MOFA object from a list of matrices (features as rows, sample as columns)...

MOFAobject_missingVals

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 10492 27 20 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

plot_data_overview(MOFAobject_missingVals)

Training Model

# Set the knitr chunk options `warning` and `message` to FALSE.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# Attach the sample metadata table to the MOFA object, then print the object.
samples_metadata(MOFAobject_missingVals) <- metaDf
MOFAobject_missingVals

## Untrained MOFA model with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 10492 27 20 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##

data_opts <- get_default_data_options(MOFAobject_missingVals)
data_opts

## $scale_views
## [1] FALSE
## 
## $scale_groups
## [1] FALSE
## 
## $center_groups
## [1] TRUE
## 
## $use_float32
## [1] TRUE
## 
## $views
## [1] "rnaseq"  "abtiter" "cytof"   "olink"  
## 
## $groups
## [1] "group1"

# Start from the default model options and request 15 factors.
model_opts <- get_default_model_options(MOFAobject_missingVals)
model_opts$num_factors <- 15

model_opts

## $likelihoods
##     rnaseq    abtiter      cytof      olink 
## "gaussian" "gaussian" "gaussian" "gaussian" 
## 
## $num_factors
## [1] 15
## 
## $spikeslab_factors
## [1] FALSE
## 
## $spikeslab_weights
## [1] FALSE
## 
## $ard_factors
## [1] FALSE
## 
## $ard_weights
## [1] TRUE

# Start from the default training options; use the "medium" convergence mode
# and a fixed seed.
train_opts <- get_default_training_options(MOFAobject_missingVals)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42

train_opts

## $maxiter
## [1] 1000
## 
## $convergence_mode
## [1] "medium"
## 
## $drop_factor_threshold
## [1] -1
## 
## $verbose
## [1] FALSE
## 
## $startELBO
## [1] 1
## 
## $freqELBO
## [1] 5
## 
## $stochastic
## [1] FALSE
## 
## $gpu_mode
## [1] FALSE
## 
## $seed
## [1] 42
## 
## $outfile
## NULL
## 
## $weight_views
## [1] FALSE
## 
## $save_interrupted
## [1] FALSE

# Attach the data, model and training options to the untrained MOFA object.
MOFAobject_missingVals <- prepare_mofa(MOFAobject_missingVals,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts
)

## Warning in prepare_mofa(MOFAobject_missingVals, data_options = data_opts, :
## Some view(s) have a lot of features, it is recommended to perform a more
## stringent feature selection before creating the MOFA object....

## Checking data options...

## Checking training options...

## Checking model options...

# Train the model (Python backend reached through basilisk) and write it to `outfile`.
# NOTE(review): the ".../" prefix of the output path looks like a placeholder — confirm the intended directory.
MOFAobject_missingVals <- run_mofa(MOFAobject_missingVals, outfile=".../MOFA2_2ndChallenge_2021.hdf5", use_basilisk = TRUE)

## Warning: Output file .../MOFA2_2ndChallenge_2021.hdf5 already exists, it will be replaced

## Connecting to the mofapy2 package using basilisk. 
##     Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.

## Warning in .quality_control(object, verbose = verbose): Factor(s) 3 are strongly correlated with the total number of expressed features for at least one of your omics. Such factors appear when there are differences in the total 'levels' between your samples, *sometimes* because of poor normalisation in the preprocessing steps.

MOFAobject_missingVals

## Trained MOFA with the following characteristics: 
##  Number of views: 4 
##  Views names: rnaseq abtiter cytof olink 
##  Number of features (per view): 10492 27 20 30 
##  Number of groups: 1 
##  Groups names: group1 
##  Number of samples (per group): 180 
##  Number of factors: 15

# Diagnostic plots for the trained model.
plot_object <- MOFAobject_missingVals
plot_factor_cor(plot_object)

plot_factor(plot_object,
  factors = 1,
  color_by = "Factor1"
)

plot_variance_explained(plot_object, max_r2=1)

# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
# [[2]] picks the second element of the returned list.
plot_variance_explained(plot_object, plot_total = TRUE)[[2]]

correlate_factors_with_covariates(plot_object,
  covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
  plot="log_pval"
)

Testing Different Data Manipulations for the 2021 Dataset

2023-10-03

Load libraries

Find features in other datasets

Load

Load Meta Data

Load Data

Modify Metadata

PVCA

Data Analysis before imputation

Training Model