suppressPackageStartupMessages({library(MOFA2)
library(kableExtra)
library(data.table)
library(ggplot2)
library(tidyverse)
library(readr)
library(here)
library(matrixStats)
library(imputeTS)
library('ComplexUpset')
library("ComplexHeatmap")
library(mice)
library(VIM)})
cells <- read_tsv(here("./data/2021LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)
cells <- levels(factor(cells$cell_type_name))
antigens <- read_tsv(here("./data/2021LD_plasma_ab_titer.tsv"), show_col_types = FALSE)
antigens <- levels(factor(paste(antigens$isotype, antigens$antigen, sep = "_")))
genes <- read_tsv(here("./data/2021LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
genes <- levels(factor(gsub("\\..*", "", genes$versioned_ensembl_gene_id)))
proteins <- read_tsv(here("./data/2021LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
proteins <- levels(factor(proteins$protein_id))
metaDf <- read_tsv(here("./data/2020LD_subject.tsv"), show_col_types = FALSE)
specimen <- read_tsv(here("./data/2020LD_specimen.tsv"), show_col_types = FALSE)
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units="weeks")/52, 2))
metaDf <- merge(metaDf, specimen, by="subject_id")
metaDf["timepoint"] <- metaDf["planned_day_relative_to_boost"]
metaDf <- metaDf[metaDf$timepoint %in% c(0, 1, 3, 7, 14), ]
samples <- metaDf$specimen_id
cellDf <- read_tsv(here("./data/2020LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE)
cellDf <- cellDf[cellDf$specimen_id %in% samples,]
cellDf <- cellDf[cellDf$cell_type_name %in% cells, ]
cellDf <- as.data.frame(pivot_wider(cellDf, names_from = "cell_type_name",
values_from=c("percent_live_cell")))
row.names(cellDf) <- cellDf$specimen_id
cellDf <- t(cellDf[,names(cellDf)!="specimen_id"])
cellDf <- na.omit(cellDf)
# print(names(which(colSums(is.na(cellDf)) > 0)))
incompSamples <- names(which(colSums(is.na(cellDf)) > 0))
cellDf <- cellDf[rowVars(cellDf, na.rm = TRUE)>0, ]
print(paste("Cell Frequency Incomplete Cases:", length(names(which(colSums(is.na(cellDf)) > 0)))))
## [1] "Cell Frequency Incomplete Cases: 0"
print(paste("Cell Frequency Complete Cases:", length(names(which(colSums(is.na(cellDf)) == 0)))))
## [1] "Cell Frequency Complete Cases: 100"
print(paste("Cell Frequency Feature Number:", dim(cellDf)[1]))
## [1] "Cell Frequency Feature Number: 21"
abtiterDf <- read_tsv(here("./data/2020LD_plasma_ab_titer.tsv"), show_col_types = FALSE)
abtiterDf <- abtiterDf[abtiterDf$specimen_id %in% samples, ]
abtiterDf["antigen"] <- paste(abtiterDf$isotype, abtiterDf$antigen, sep = "_")
abtiterDf <- abtiterDf[abtiterDf$antigen %in% antigens, c("specimen_id", "antigen", "MFI_normalised")]
abtiterDf <- as.data.frame(pivot_wider(abtiterDf, names_from = "antigen",
values_from=c("MFI_normalised")))
row.names(abtiterDf) <- abtiterDf$specimen_id
abtiterDf <- t(abtiterDf[, names(abtiterDf)!="specimen_id"])
abtiterDf <- abtiterDf[rowVars(abtiterDf, na.rm = TRUE)>0, ]
print(paste("Ab Titer Incomplete Cases:", length(names(which(colSums(is.na(abtiterDf)) > 0)))))
## [1] "Ab Titer Incomplete Cases: 0"
print(paste("Ab Titer Complete Cases:", length(names(which(colSums(is.na(abtiterDf)) == 0)))))
## [1] "Ab Titer Complete Cases: 274"
print(paste("Ab Titer Feature Number:", dim(abtiterDf)[1]))
## [1] "Ab Titer Feature Number: 31"
rnaDf <- read_tsv(here("./data/2020LD_pbmc_gene_expression.tsv"), show_col_types = FALSE)
rnaDf <- rnaDf[rnaDf$specimen_id %in% samples, ]
rnaDf$versioned_ensembl_gene_id <- gsub("\\..*", "", rnaDf$versioned_ensembl_gene_id)
# print(length(levels(factor(rnaDf$versioned_ensembl_gene_id))))
rnaDf <- rnaDf[rnaDf$versioned_ensembl_gene_id %in% genes, c("specimen_id", "versioned_ensembl_gene_id", "tpm")]
rnaDf <- as.data.frame(pivot_wider(rnaDf, names_from = "versioned_ensembl_gene_id",
values_from=c("tpm")))
row.names(rnaDf) <- rnaDf$specimen_id
rnaDf <- t(rnaDf[, names(rnaDf)!="specimen_id"])
# print(dim(rnaDf))
rnaDf <- rnaDf[(rowSums(rnaDf>1)/dim(rnaDf)[2])*100 >70,]
rnaDf <- rnaDf[rowVars(rnaDf, na.rm = TRUE)>0, ]
print(paste("RNA seq Incomplete Cases:", length(names(which(colSums(is.na(rnaDf)) > 0)))))
## [1] "RNA seq Incomplete Cases: 0"
print(paste("RNA seq Complete Cases:", length(names(which(colSums(is.na(rnaDf)) == 0)))))
## [1] "RNA seq Complete Cases: 180"
print(paste("RNA seq Feature Number:", dim(rnaDf)[1]))
## [1] "RNA seq Feature Number: 14565"
# print(dim(rnaDf))
olinkDf <- read_tsv(here("./data/2020LD_plasma_cytokine_concentration.tsv"), show_col_types = FALSE)
olinkDf <- olinkDf[olinkDf$specimen_id %in% samples, ]
olinkDf <- olinkDf[olinkDf$protein_id %in% proteins, c("specimen_id", "protein_id", "protein_expression")]
olinkDf <- as.data.frame(pivot_wider(olinkDf, names_from = "protein_id",
values_from=c("protein_expression")))
row.names(olinkDf) <- olinkDf$specimen_id
olinkDf <- t(olinkDf[, names(olinkDf)!="specimen_id"])
olinkDf <- olinkDf[rowVars(olinkDf, na.rm = TRUE)>0, ]
print(paste("Olink Incomplete Cases:", length(names(which(colSums(is.na(olinkDf)) > 0)))))
## [1] "Olink Incomplete Cases: 0"
print(paste("Olink Complete Cases:", length(names(which(colSums(is.na(olinkDf)) == 0)))))
## [1] "Olink Complete Cases: 90"
print(paste("Olink Feature Number:", dim(olinkDf)[1]))
## [1] "Olink Feature Number: 30"
dataList <- list()
dataList[["original"]] <- list("abtiter"= abtiterDf,
"cytof"= cellDf,
"olink"= olinkDf,
"rnaseq"=rnaDf)
# int_cols <- Reduce(intersect, lapply(dataList$original[c("abtiter", "cytof", "olink", "rnaseq")], colnames))
# cols <- unique(c(int_cols, colnames(dataList$original[["rnaseq"]])))
cols <- colnames(dataList$original[["rnaseq"]])
add_cols <- function(df, cols, exp) {
df <- df[, colnames(df) %in% cols]
print(paste(exp, "Number of selected Samples:", dim(df)[2]))
add <- setdiff(cols, colnames(df))
print(paste(exp, "Number of Missing Samples:", length(add)))
dumyDf <- data.frame(matrix(ncol = length(add), nrow = nrow(df)), row.names = row.names(df))
colnames(dumyDf) <- add
if(length(add) != 0) df <- cbind(df, dumyDf)
print(paste(exp, "Number of all Samples:", dim(df)[2]))
return(as.matrix(df[, sort(cols)]))
}
# dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf[, int_cols], cols, "RNA seq")
dataList$addedMissingVals[["rnaseq"]] <- add_cols(rnaDf, cols, "RNA seq")
## [1] "RNA seq Number of selected Samples: 180"
## [1] "RNA seq Number of Missing Samples: 0"
## [1] "RNA seq Number of all Samples: 180"
dataList$addedMissingVals[["abtiter"]] <- add_cols(abtiterDf, cols, "Ab-titer")
## [1] "Ab-titer Number of selected Samples: 171"
## [1] "Ab-titer Number of Missing Samples: 9"
## [1] "Ab-titer Number of all Samples: 180"
dataList$addedMissingVals[["cytof"]] <- add_cols(cellDf, cols, "Cell Freq")
## [1] "Cell Freq Number of selected Samples: 80"
## [1] "Cell Freq Number of Missing Samples: 100"
## [1] "Cell Freq Number of all Samples: 180"
dataList$addedMissingVals[["olink"]] <- add_cols(olinkDf, cols, "Olink")
## [1] "Olink Number of selected Samples: 80"
## [1] "Olink Number of Missing Samples: 100"
## [1] "Olink Number of all Samples: 180"
# metaDf <- read_tsv(here("./data/2020LD_subject.tsv"), show_col_types = FALSE)
# specimen <- read_tsv(here("./data/2020LD_specimen.tsv"), show_col_types = FALSE)
# metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth, units="weeks")/52, 2))
# metaDf <- merge(metaDf, specimen, by="subject_id")
# metaDf["timepoint"] <- metaDf["planned_day_relative_to_boost"]
metaDf <- data.frame(metaDf[metaDf$specimen_id %in% cols, ])
colnames(metaDf)[colnames(metaDf)=="specimen_id"] <- "sample"
rownames(metaDf) <- metaDf$sample
metaDf$sample <- as.character(metaDf$sample)
metaDf <- metaDf[cols,]
samplesList <- list(rnaseq=colnames(data.frame(dataList$addedMissingVals[["rnaseq"]])%>%
select(where(~!all(is.na(.))))),
abtiter=colnames(data.frame(dataList$addedMissingVals[["abtiter"]])%>%
select(where(~!all(is.na(.))))),
cytof=colnames(data.frame(dataList$addedMissingVals[["cytof"]])%>%
select(where(~!all(is.na(.))))),
olink=colnames(data.frame(dataList$addedMissingVals[["olink"]])%>%
select(where(~!all(is.na(.))))))
UpSet(make_comb_mat(list_to_matrix(samplesList)))
### find Intersects
noNaRNA <- rownames(na.omit(t(data.frame(dataList$addedMissingVals[["rnaseq"]]))))
noNaCell <- rownames(na.omit(t(data.frame(dataList$addedMissingVals[["cytof"]]))))
noNaAbtiter <- rownames(na.omit(t(data.frame(dataList$addedMissingVals[["abtiter"]]))))
noNaOlink <- rownames(na.omit(t(data.frame(dataList$addedMissingVals[["olink"]]))))
print(paste("Common between all datasets: ", length(intersect(intersect(intersect(noNaCell, noNaOlink), noNaRNA), noNaAbtiter))))
## [1] "Common between all datasets: 76"
print(paste("Only Common between RNAseq, Cytof and Olink: ", length(setdiff(intersect(intersect(noNaCell, noNaOlink ), noNaRNA), noNaAbtiter))))
## [1] "Only Common between RNAseq, Cytof and Olink: 4"
print(paste("Only Common between RNAseq and Abtiter: ", length(setdiff(setdiff(intersect(noNaRNA, noNaAbtiter), noNaCell), noNaOlink))))
## [1] "Only Common between RNAseq and Abtiter: 95"
print(paste("Only Exist in RNAseq: ", length(setdiff(noNaRNA, levels(factor(c(noNaCell, noNaAbtiter, noNaOlink)))))))
## [1] "Only Exist in RNAseq: 5"
MOFAobject_missingVals <- create_mofa(dataList[["addedMissingVals"]])
## Creating MOFA object from a list of matrices (features as rows, sample as columns)...
MOFAobject_missingVals
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
plot_data_overview(MOFAobject_missingVals)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
samples_metadata(MOFAobject_missingVals) <- metaDf
MOFAobject_missingVals
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
data_opts <- get_default_data_options(MOFAobject_missingVals)
data_opts
## $scale_views
## [1] FALSE
##
## $scale_groups
## [1] FALSE
##
## $center_groups
## [1] TRUE
##
## $use_float32
## [1] TRUE
##
## $views
## [1] "rnaseq" "abtiter" "cytof" "olink"
##
## $groups
## [1] "group1"
model_opts <- get_default_model_options(MOFAobject_missingVals)
model_opts$num_factors <- 15
model_opts
## $likelihoods
## rnaseq abtiter cytof olink
## "gaussian" "gaussian" "gaussian" "gaussian"
##
## $num_factors
## [1] 15
##
## $spikeslab_factors
## [1] FALSE
##
## $spikeslab_weights
## [1] FALSE
##
## $ard_factors
## [1] FALSE
##
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_missingVals)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42
train_opts
## $maxiter
## [1] 1000
##
## $convergence_mode
## [1] "medium"
##
## $drop_factor_threshold
## [1] -1
##
## $verbose
## [1] FALSE
##
## $startELBO
## [1] 1
##
## $freqELBO
## [1] 5
##
## $stochastic
## [1] FALSE
##
## $gpu_mode
## [1] FALSE
##
## $seed
## [1] 42
##
## $outfile
## NULL
##
## $weight_views
## [1] FALSE
##
## $save_interrupted
## [1] FALSE
MOFAobject_missingVals <- prepare_mofa(MOFAobject_missingVals,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
## Warning in prepare_mofa(MOFAobject_missingVals, data_options = data_opts, :
## Some view(s) have a lot of features, it is recommended to perform a more
## stringent feature selection before creating the MOFA object....
## Checking data options...
## Checking training options...
## Checking model options...
MOFAobject_missingVals <- run_mofa(MOFAobject_missingVals, outfile=".../MOFA2_2ndChallenge_2020.hdf5", use_basilisk = TRUE)
## Warning: Output file .../MOFA2_2ndChallenge_2020.hdf5 already exists, it will be replaced
## Connecting to the mofapy2 package using basilisk.
## Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.
MOFAobject_missingVals
## Trained MOFA with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
## Number of factors: 15
plot_object = MOFAobject_missingVals
plot_factor_cor(plot_object)
plot_factor(plot_object,
factors = 1,
color_by = "Factor1"
)
plot_variance_explained(plot_object, max_r2=1)
plot_variance_explained(plot_object, plot_total = T)[[2]]
correlate_factors_with_covariates(plot_object,
covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
plot="log_pval"
)
dataList$meanImputed[["abtiter"]] <- as.matrix(t(na_mean(t(dataList$addedMissingVals[["abtiter"]]))))
dataList$meanImputed[["cytof"]] <- as.matrix(t(na_mean(t(dataList$addedMissingVals[["cytof"]]))))
dataList$meanImputed[["rnaseq"]] <- as.matrix(t(na_mean(t(dataList$addedMissingVals[["rnaseq"]]))))
dataList$meanImputed[["olink"]] <- as.matrix(t(na_mean(t(dataList$addedMissingVals[["olink"]]))))
# a <- lapply(dataList$addedMissingVals[["olink"]][, 1:10], NA2mean)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
MOFAobject_meanImputed <- create_mofa(dataList[["meanImputed"]])
MOFAobject_meanImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
plot_data_overview(MOFAobject_meanImputed)
samples_metadata(MOFAobject_meanImputed) <- metaDf
MOFAobject_meanImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
data_opts <- get_default_data_options(MOFAobject_meanImputed)
MOFAobject_meanImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
model_opts <- get_default_model_options(MOFAobject_meanImputed)
model_opts$num_factors <- 15
model_opts
## $likelihoods
## abtiter cytof rnaseq olink
## "gaussian" "gaussian" "gaussian" "gaussian"
##
## $num_factors
## [1] 15
##
## $spikeslab_factors
## [1] FALSE
##
## $spikeslab_weights
## [1] FALSE
##
## $ard_factors
## [1] FALSE
##
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_meanImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42
train_opts
## $maxiter
## [1] 1000
##
## $convergence_mode
## [1] "medium"
##
## $drop_factor_threshold
## [1] -1
##
## $verbose
## [1] FALSE
##
## $startELBO
## [1] 1
##
## $freqELBO
## [1] 5
##
## $stochastic
## [1] FALSE
##
## $gpu_mode
## [1] FALSE
##
## $seed
## [1] 42
##
## $outfile
## NULL
##
## $weight_views
## [1] FALSE
##
## $save_interrupted
## [1] FALSE
MOFAobject_meanImputed <- prepare_mofa(MOFAobject_meanImputed,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
MOFAobject_meanImputed <- run_mofa(MOFAobject_meanImputed, outfile=".../MeanIpmputation_2ndChallenge_2020.hdf5", use_basilisk = TRUE)
MOFAobject_meanImputed
## Trained MOFA with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
## Number of factors: 15
plot_object = MOFAobject_meanImputed
plot_factor_cor(plot_object)
plot_factor(plot_object,
factors = 1,
color_by = "Factor1"
)
plot_variance_explained(plot_object, max_r2=1)
plot_variance_explained(plot_object, plot_total = T)[[2]]
correlate_factors_with_covariates(plot_object,
covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
plot="log_pval"
)
na2median <- function(df) {
df <- data.frame(df)
df <- t(data.frame(t(df)) %>%
mutate(across(1:dim(df)[1], ~replace_na(., median(., na.rm=TRUE)))))
colnames(df) <- gsub("X", "", colnames(df))
return(as.matrix(df))
}
# na_replace from imputeTS can be used to impute median in these values too.
dataList$medianImputed[["abtiter"]] <- na2median(dataList$addedMissingVals[["abtiter"]])
dataList$medianImputed[["cytof"]] <- na2median(dataList$addedMissingVals[["cytof"]])
dataList$medianImputed[["rnaseq"]] <- na2median(dataList$addedMissingVals[["rnaseq"]])
dataList$medianImputed[["olink"]] <- na2median(dataList$addedMissingVals[["olink"]])
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
MOFAobject_medianImputed <- create_mofa(dataList[["medianImputed"]])
MOFAobject_medianImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
plot_data_overview(MOFAobject_medianImputed)
samples_metadata(MOFAobject_medianImputed) <- metaDf
MOFAobject_medianImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
data_opts <- get_default_data_options(MOFAobject_medianImputed)
MOFAobject_medianImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
model_opts <- get_default_model_options(MOFAobject_medianImputed)
model_opts$num_factors <- 15
model_opts
## $likelihoods
## abtiter cytof rnaseq olink
## "gaussian" "gaussian" "gaussian" "gaussian"
##
## $num_factors
## [1] 15
##
## $spikeslab_factors
## [1] FALSE
##
## $spikeslab_weights
## [1] FALSE
##
## $ard_factors
## [1] FALSE
##
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_medianImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42
train_opts
## $maxiter
## [1] 1000
##
## $convergence_mode
## [1] "medium"
##
## $drop_factor_threshold
## [1] -1
##
## $verbose
## [1] FALSE
##
## $startELBO
## [1] 1
##
## $freqELBO
## [1] 5
##
## $stochastic
## [1] FALSE
##
## $gpu_mode
## [1] FALSE
##
## $seed
## [1] 42
##
## $outfile
## NULL
##
## $weight_views
## [1] FALSE
##
## $save_interrupted
## [1] FALSE
MOFAobject_medianImputed <- prepare_mofa(MOFAobject_medianImputed,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
MOFAobject_medianImputed <- run_mofa(MOFAobject_medianImputed, outfile=".../MedianIpmputation_2ndChallenge_2020.hdf5", use_basilisk = TRUE)
MOFAobject_medianImputed
## Trained MOFA with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
## Number of factors: 15
plot_object = MOFAobject_medianImputed
plot_factor_cor(plot_object)
plot_factor(plot_object,
factors = 1,
color_by = "Factor1"
)
plot_variance_explained(plot_object, max_r2=1)
plot_variance_explained(plot_object, plot_total = T)[[2]]
correlate_factors_with_covariates(plot_object,
covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
plot="log_pval"
)
MOFAobject_mofaImputed <- impute(MOFAobject_missingVals, views = "all")
samples_metadata(MOFAobject_mofaImputed) <- metaDf
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
## Number of factors: 15
results <- MOFAobject_mofaImputed@imputed_data
dataList$mofaImputed[["rnaseq"]] <- results$rnaseq$group1
dataList$mofaImputed[["abtiter"]] <- results$abtiter$group1
dataList$mofaImputed[["cytof"]] <- results$cytof$group1
dataList$mofaImputed[["olink"]] <- results$olink$group1
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
MOFAobject_mofaImputed <- create_mofa(dataList[["mofaImputed"]])
MOFAobject_mofaImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
plot_data_overview(MOFAobject_mofaImputed)
samples_metadata(MOFAobject_mofaImputed) <- metaDf
MOFAobject_mofaImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
data_opts <- get_default_data_options(MOFAobject_mofaImputed)
MOFAobject_mofaImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
model_opts <- get_default_model_options(MOFAobject_mofaImputed)
model_opts$num_factors <- 15
model_opts
## $likelihoods
## rnaseq abtiter cytof olink
## "gaussian" "gaussian" "gaussian" "gaussian"
##
## $num_factors
## [1] 15
##
## $spikeslab_factors
## [1] FALSE
##
## $spikeslab_weights
## [1] FALSE
##
## $ard_factors
## [1] FALSE
##
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_mofaImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42
train_opts
## $maxiter
## [1] 1000
##
## $convergence_mode
## [1] "medium"
##
## $drop_factor_threshold
## [1] -1
##
## $verbose
## [1] FALSE
##
## $startELBO
## [1] 1
##
## $freqELBO
## [1] 5
##
## $stochastic
## [1] FALSE
##
## $gpu_mode
## [1] FALSE
##
## $seed
## [1] 42
##
## $outfile
## NULL
##
## $weight_views
## [1] FALSE
##
## $save_interrupted
## [1] FALSE
MOFAobject_mofaImputed <- prepare_mofa(MOFAobject_mofaImputed,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
MOFAobject_mofaImputed <- run_mofa(MOFAobject_mofaImputed, outfile=".../MofaIpmputation_2ndChallenge_2020.hdf5", use_basilisk = TRUE)
MOFAobject_mofaImputed
## Trained MOFA with the following characteristics:
## Number of views: 4
## Views names: rnaseq abtiter cytof olink
## Number of features (per view): 14565 31 21 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
## Number of factors: 15
plot_object = MOFAobject_mofaImputed
plot_object = MOFAobject_medianImputed
plot_factor_cor(plot_object)
plot_factor(plot_object,
factors = 1,
color_by = "Factor1"
)
plot_variance_explained(plot_object, max_r2=1)
plot_variance_explained(plot_object, plot_total = T)[[2]]
correlate_factors_with_covariates(plot_object,
covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
plot="log_pval"
)
pMiss <- function(x){sum(is.na(x))/length(x)*100}
# pMiss <- function(x){is.na(x)}
data <- data.frame(row.names = cols)
data[,"cytof"] <- data.frame(apply(dataList[["addedMissingVals"]]$cytof,2,pMiss))
data[,"rnaseq"] <- data.frame(apply(dataList[["addedMissingVals"]]$rnaseq,2,pMiss))
data[,"abtiter"] <- data.frame(apply(dataList[["addedMissingVals"]]$abtiter,2,pMiss))
data[,"olink"] <- data.frame(apply(dataList[["addedMissingVals"]]$olink,2,pMiss))
data[data == 0] <- NA
md.pattern(data)
## cytof olink abtiter rnaseq
## 5 1 1 1 0 1
## 95 1 1 0 0 2
## 4 0 0 1 0 3
## 76 0 0 0 0 4
## 80 80 171 180 511
aggr_plot <- aggr(data, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## rnaseq 1.0000000
## abtiter 0.9500000
## cytof 0.4444444
## olink 0.4444444
aggr_plot
##
## Missings in variables:
## Variable Count
## cytof 80
## rnaseq 180
## abtiter 171
## olink 80
marginplot(data[c("cytof", "abtiter")])
# methods(mice)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
mice_data <- do.call("rbind", dataList[["addedMissingVals"]])
tempData <- mice(data.frame(mice_data), m=4, maxit=5, meth='pmm', seed=500)
summary(tempData)
completedData <- complete(tempData,1)
knitr::kable(completedData[1:10,], "html", align = "lccrr", booktabs=TRUE, border_left = T,
border_right = T, caption = "MICE Imputation") %>%
kable_styling("striped", full_width = T) %>%
scroll_box(width = "100%", height = "400px")
colnames(completedData) <- gsub("X", "", colnames(completedData))
dataList$miceImputed[["abtiter"]] <- as.matrix(completedData[rownames(abtiterDf),])
dataList$miceImputed[["cytof"]] <- as.matrix(completedData[rownames(cellDf), ])
dataList$miceImputed[["rnaseq"]] <- as.matrix(completedData[rownames(rnaDf), ])
dataList$miceImputed[["olink"]] <- as.matrix(completedData[rownames(olinkDf), ])
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
MOFAobject_miceImputed <- create_mofa(dataList[["miceImputed"]])
MOFAobject_miceImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
plot_data_overview(MOFAobject_miceImputed)
samples_metadata(MOFAobject_miceImputed) <- metaDf
MOFAobject_miceImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
data_opts <- get_default_data_options(MOFAobject_miceImputed)
MOFAobject_miceImputed
## Untrained MOFA model with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
##
model_opts <- get_default_model_options(MOFAobject_miceImputed)
model_opts$num_factors <- 15
model_opts
## $likelihoods
## abtiter cytof rnaseq olink
## "gaussian" "gaussian" "gaussian" "gaussian"
##
## $num_factors
## [1] 15
##
## $spikeslab_factors
## [1] FALSE
##
## $spikeslab_weights
## [1] FALSE
##
## $ard_factors
## [1] FALSE
##
## $ard_weights
## [1] TRUE
train_opts <- get_default_training_options(MOFAobject_miceImputed)
train_opts$convergence_mode <- "medium"
train_opts$seed <- 42
train_opts
## $maxiter
## [1] 1000
##
## $convergence_mode
## [1] "medium"
##
## $drop_factor_threshold
## [1] -1
##
## $verbose
## [1] FALSE
##
## $startELBO
## [1] 1
##
## $freqELBO
## [1] 5
##
## $stochastic
## [1] FALSE
##
## $gpu_mode
## [1] FALSE
##
## $seed
## [1] 42
##
## $outfile
## NULL
##
## $weight_views
## [1] FALSE
##
## $save_interrupted
## [1] FALSE
MOFAobject_miceImputed <- prepare_mofa(MOFAobject_miceImputed,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
MOFAobject_miceImputed <- run_mofa(MOFAobject_miceImputed, outfile=".../MiceIpmputation_2ndChallenge_2020.hdf5", use_basilisk = TRUE)
MOFAobject_miceImputed
## Trained MOFA with the following characteristics:
## Number of views: 4
## Views names: abtiter cytof rnaseq olink
## Number of features (per view): 31 21 14565 30
## Number of groups: 1
## Groups names: group1
## Number of samples (per group): 180
## Number of factors: 15
plot_object = MOFAobject_miceImputed
plot_object = MOFAobject_medianImputed
plot_factor_cor(plot_object)
plot_factor(plot_object,
factors = 1,
color_by = "Factor1"
)
plot_variance_explained(plot_object, max_r2=1)
plot_variance_explained(plot_object, plot_total = T)[[2]]
correlate_factors_with_covariates(plot_object,
covariates = c("timepoint", "infancy_vac", "biological_sex", "ethnicity", "race"),
plot="log_pval"
)
saveRDS(dataList, file = here("./results/allData_2020.RDS"))