2023-08-21
Setup: Load libraries and helper functions
suppressPackageStartupMessages({
library(verification)
library(pROC)
library(discretefit)
library(dplyr)
library(ggplot2)
library(tidyr)
library(tidyverse)
library(corrplot)
library(data.table)
library(here)
library(readr)
library(kableExtra)
library(glmnet)
library(here)
})
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'Setup: Variables
DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"
META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT",'IgG_FHA','IgG_PRN','IgG1_PT','IgG1_FHA','IgG4_PT', 'IgG4_FHA')
RNA_COLS <- c("CCL3", "IL6", "NFKBIA")
CELL_COLS <- c("Monocytes", "CD4Tcells")
DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")
TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "CD4Tcells_D0", "IgG_PT_D0", 'IgG_FHA_D0',
'IgG_PRN_D0','IgG1_PT_D0', 'IgG1_FHA_D0', 'IgG4_PT_D0',
'IgG4_FHA_D0', "CCL3_D0", "IL6_D0", "NFKBIA_D0")
LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0","IgG_FHA_D0",'IgG_PRN_D0','IgG1_PT_D0','IgG1_FHA_D0',
'IgG4_PT_D0','IgG4_FHA_D0', "IL6_D0", "NFKBIA_D0", "CCL3_D3", "IgG_PT_D14")Setup: Load dataset, combine different experiments
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))
metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS]
abtiterDf <- data.frame(df_source[["plasma_antibody_levels_wide"]])
abtiterDf <- data.frame(abtiterDf[, ABTITER_COLS])
colnames(abtiterDf) <- ABTITER_COLS
abtiterDf["specimen_id"] <- as.numeric(row.names(abtiterDf))
rnaDf <- data.frame(df_source[["pbmc_gene_expression_wide"]])
tasks_seq <- c('ENSG00000277632','ENSG00000136244','ENSG00000100906')
for (i in 1:length(tasks_seq)){
rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, RNA_COLS])
colnames(rnaDf) <- RNA_COLS
rnaDf["specimen_id"] <- as.numeric(row.names(rnaDf))
cellDf <- data.frame(df_source[["pbmc_cell_frequency_wide"]])
cellDf <- data.frame(cellDf[, CELL_COLS])
colnames(cellDf) <- CELL_COLS
cellDf["specimen_id"] <- as.numeric(row.names(cellDf))
list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]
df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>%
pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex",
"infancy_vac", "age_at_boost"),
names_from = timepoint,
values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)),
names_sep = "_D")
df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]Setup: Dataset log transform , Ranking and fold change
Fold change for actual values been generated by dividing the actual
value of each task by it’s baseline Fold change for normalised values
been generated by dividing the log transformed value of each task by
it’s log transformed baseline
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")
fc_cols <- paste(targetY, "FC", sep="_")
ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")
df <- df_pivot[, c("subject_id", DEMOGRAPHY_COLS, targetX, targetY)]
rankingFunction <- function(x) {
as.numeric(rank(-x, ties.method = "average", na.last = "keep"))
}
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")
df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]
df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)], 2, rankingFunction)
df <- data.frame(df)Table for the rank of Age based Models
expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", "2.1) Monocytes-D1-Rank",
"2.2) Monocytes-D1-FC-Rank", "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS,
"IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank",
"Monocytes_D1_Rank", "Monocytes_D1_FC_Rank",
"CCL3_D3_Rank", "CCL3_D3_FC_Rank")]
ageModel_df$Monocytes_D1_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df$age_at_boost_Rank
ageModel_df$Monocytes_D1_FC_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df$age_at_boost_Rank
colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age"
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status"
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"
knitr::kable(head(ageModel_df), "pipe", align = "lccrr", booktabs=TRUE) %>% kable_styling(full_width = F)| Subject ID | Age | Biological Sex at Birth | Vaccine Priming Status | 1.1) IgG_PT_D14_titer_Rank | 1.2) IgG_PT_D14_FC_Rank | 2.1) Monocytes_D1_Rank | 2.2) Monocytes_D1_FC_Rank | 3.1) CCL3_D3_Rank | 3.2) CCL3_D3_FC_Rank |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 30.80 | Female | wP | 22 | 22 | 22 | 22 | 22 | 22 |
| 2 | 51.25 | Female | wP | 1 | 1 | 1 | 1 | 1 | 1 |
| 3 | 33.89 | Female | wP | 13 | 13 | 13 | 13 | 13 | 13 |
| 4 | 28.76 | Male | wP | 30 | 30 | 30 | 30 | 30 | 30 |
| 5 | 25.75 | Male | wP | 42 | 42 | 42 | 42 | 42 | 42 |
| 6 | 28.87 | Female | wP | 27 | 27 | 27 | 27 | 27 | 27 |
Table for the rank of Baseline based Models
expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", "2.1) Monocytes-D1-Rank",
"2.2) Monocytes-D1-FC-Rank", "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS,
"IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank",
"Monocytes_D1_Rank", "Monocytes_D1_FC_Rank",
"CCL3_D3_Rank", "CCL3_D3_FC_Rank")]
baselineModel_df$Monocytes_D1_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df$IgG_PT_D14_Rank
baselineModel_df$Monocytes_D1_FC_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df$IgG_PT_D14_Rank
colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age"
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status"
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"
knitr::kable(head(baselineModel_df), "pipe", align = "lccrr", booktabs=TRUE) %>% kable_styling(full_width = F)| Subject ID | Age | Biological Sex at Birth | Vaccine Priming Status | 1.1) IgG_PT_D14_titer_Rank | 1.2) IgG_PT_D14_FC_Rank | 2.1) Monocytes_D1_Rank | 2.2) Monocytes_D1_FC_Rank | 3.1) CCL3_D3_Rank | 3.2) CCL3_D3_FC_Rank |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 30.80 | Female | wP | 20 | 20 | NA | NA | 21 | 21 |
| 2 | 51.25 | Female | wP | NA | NA | NA | NA | NA | NA |
| 3 | 33.89 | Female | wP | 40 | 40 | NA | NA | 36 | 36 |
| 4 | 28.76 | Male | wP | 35 | 35 | 53 | 53 | 64 | 64 |
| 5 | 25.75 | Male | wP | 48 | 48 | NA | NA | 42 | 42 |
| 6 | 28.87 | Female | wP | 29 | 29 | 1 | 1 | 45 | 45 |
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]
corDf <- data.frame()
corP_df <- data.frame()
corrTest <- suppressPackageStartupMessages({t(cor(df[,x], df[,y], method="spearman", use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df[,ranked_cols]), method="spearman", conf.level = 0.95, na.rm = TRUE)})$p
corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]
corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]
corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black',
addCoef.col = 1,
mar=c(0,0,0,2),
number.cex = 1.2, tl.srt = 45,
p.mat = as.matrix(corP_df), sig.level = 0.05,
col.lim = c(-maxCorr, maxCorr),
cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))knitr::opts_chunk$set(warning = FALSE, message = FALSE)
set.seed(0)
source(here("./src/codebase.R"))
df1 <- data.frame(df_pivot)
df1$infancy_vac <- as.numeric(factor(df1$infancy_vac, labels = c(1,2), levels = c("wP", "aP"), exclude = NA))
df1$biological_sex <- as.numeric(factor(df1$biological_sex, labels = c(1,2), exclude = NA))
for (ltCol in LOG_TRANS_COLS){
df1[, paste("normalized", ltCol, sep = "_")] <- log2(df1[, ltCol]+1)
}
corrDf <- data.frame()
pvalDf <- data.frame()
x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "3Baseline", y, x)
corrDf <- data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])
x <- c("age_at_boost", "infancy_vac", "biological_sex",
"Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "Demography_3Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])
x <- c("Monocytes_D0", "CD4Tcells_D0", "normalized_IgG_PT_D0", 'normalized_IgG1_FHA_D0',
'normalized_IgG_FHA_D0','normalized_IgG_PRN_D0', 'normalized_IgG1_PT_D0', 'normalized_IgG4_PT_D0',
'normalized_IgG4_FHA_D0', "normalized_CCL3_D0", "normalized_IL6_D0", "normalized_NFKBIA_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "14Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])
x <- c("age_at_boost", "infancy_vac", "biological_sex", "Monocytes_D0", "CD4Tcells_D0", "normalized_IgG_PT_D0",
'normalized_IgG_FHA_D0','normalized_IgG_PRN_D0', 'normalized_IgG1_PT_D0','normalized_IgG1_FHA_D0',
'normalized_IgG4_PT_D0', 'normalized_IgG4_FHA_D0', "normalized_CCL3_D0", "normalized_IL6_D0", "normalized_NFKBIA_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "Demography_14Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])knitr::opts_chunk$set(warning = FALSE, message = FALSE)
colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
"CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")
colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"
colnames(corrDf)[colnames(corrDf)=="X14Baseline"] <- "14 Baseline"
colnames(corrDf)[colnames(corrDf)=="Demography_14Baseline"] <- "Demography + 14 Baseline"
#
colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"
colnames(pvalDf)[colnames(pvalDf)=="X14Baseline"] <- "14 Baseline"
colnames(pvalDf)[colnames(pvalDf)=="Demography_14 Baseline"] <- "Demography + 14 Baseline"
corrDf <- t(corrDf)[, colOrder]
pvalDf <- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes_D1_Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes_D1_FC_Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3_D3_Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3_D3_FC_Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG_PT_D14_titer_Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG_PT_D14_FC_Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes_D1_Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes_D1_FC_Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3_D3_Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3_D3_FC_Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG_PT_D14_titer_Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG_PT_D14_FC_Rank"
maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf),
tl.col = 'black',
addCoef.col = 1,
# mar=c(0,0,0,3),
number.cex = 1.2, tl.srt = 60,
p.mat = as.matrix(pvalDf),
sig.level = 0.05,
col.lim = c(-maxCorr, maxCorr),
cl.ratio = 0.2,
col = colorRampPalette(c("blue", "white","red"))(100))