Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
  library(kableExtra)
  library(formattable)
  library(colorRamp2)
  library(ComplexHeatmap)
  library(GetoptLong)
  library(ellipse)
  library("ggplot2")                     
  library("GGally")
  source(here("./src/codebase.R"))
})

Setup: Variables used through out the script

Tasks:
1. Monocytes at day 1

2. CCL3 at day 3

3. IgG-PT at day 14
DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", 
               "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT")
RNA_COLS <- c("CCL3")
CELL_COLS <- c("Monocytes")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "IgG_PT_D0", "CCL3_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0",
                    "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS]

abtiterDf <- data.frame(df_source[["plasma_antibody_levels_wide"]])
abtiterDf$specimen_id <- as.numeric(rownames(abtiterDf))
abtiterDf <- data.frame(abtiterDf[, c("specimen_id", ABTITER_COLS)])


rnaDf <- data.frame(df_source[["pbmc_gene_expression_wide"]])
rnaDf$specimen_id <- as.numeric(rownames(rnaDf))
tasks_seq <- c('ENSG00000277632')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, c("specimen_id", RNA_COLS)])


cellDf <- data.frame(df_source[["pbmc_cell_frequency_wide"]])
cellDf$specimen_id <- as.numeric(rownames(cellDf))
cellDf <- data.frame(cellDf[, c("specimen_id", CELL_COLS)])


list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>% 
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex", 
                        "infancy_vac", "age_at_boost"), 
              names_from = timepoint, 
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)), 
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]
df_pivot <- data.frame(df_pivot %>%
   mutate(across(everything(),  ~ case_when(.x >=0 ~ .x))))

Setup: Dataset log transform , Ranking and fold change

  1. Fold change for actual values been generated by dividing the actual value of each task to it’s baseline

  2. Fold change for normalized values been generated by dividing the log transformed value of each task to it’s log transformed baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", "dataset", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "min", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)],
                           2, rankingFunction)
df <- data.frame(df)

df_2020 <- df[df$dataset=="2020_dataset", ]
df_2021 <- df[df$dataset=="2021_dataset", ]

Model construction

No actual model been developed. We use baseline values of tasks provided and demographic features such as age of participants as model. Two types of models has been developed:

A. Age of the participants

B. Baseline values of task variables

A. Modeling Approach: Age of the participants

Age of participants taken as it is has been used as predictor for a given task.

Modeling steps:

1.  Extract age of the participants
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on age at boost.

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", 
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df_2021[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

ageModel_df$Monocytes_D1_Rank <- df_2021$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df_2021$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df_2021$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df_2021$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df_2021$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df_2021$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(ageModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Age Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")
Age Based Models
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
61 61 32.38 Female wP 16 16 16 16 16 16
62 62 25.99 Female wP 38 38 38 38 38 38
63 63 23.98 Female wP 46 46 46 46 46 46
64 64 25.99 Male wP 38 38 38 38 38 38
65 65 29.02 Male wP 26 26 26 26 26 26
66 66 43.07 Female wP 4 4 4 4 4 4
67 67 47.24 Female wP 2 2 2 2 2 2
68 68 47.24 Male wP 2 2 2 2 2 2
69 69 29.17 Female wP 25 25 25 25 25 25
70 70 21.15 Male aP 59 59 59 59 59 59
71 71 21.15 Female aP 59 59 59 59 59 59
72 72 28.25 Female wP 31 31 31 31 31 31
73 73 24.23 Female wP 44 44 44 44 44 44
74 74 24.23 Female wP 44 44 44 44 44 44
75 75 21.22 Female aP 57 57 57 57 57 57
76 76 21.22 Female aP 57 57 57 57 57 57
77 77 31.32 Male wP 21 21 21 21 21 21
78 78 26.30 Female wP 36 36 36 36 36 36
79 79 32.32 Male wP 17 17 17 17 17 17
80 80 27.30 Female wP 34 34 34 34 34 34
81 81 26.30 Male wP 36 36 36 36 36 36
82 82 21.28 Female aP 56 56 56 56 56 56
83 83 20.34 Female aP 66 66 66 66 66 66
84 84 22.34 Female aP 52 52 52 52 52 52
85 85 19.39 Female aP 87 87 87 87 87 87
86 86 21.40 Female aP 55 55 55 55 55 55
87 87 19.39 Male aP 87 87 87 87 87 87
88 88 19.39 Male aP 87 87 87 87 87 87
89 89 22.49 Female aP 51 51 51 51 51 51
90 90 20.49 Female aP 65 65 65 65 65 65
91 91 21.49 Male aP 54 54 54 54 54 54
92 92 19.54 Female aP 85 85 85 85 85 85
93 93 23.56 Female aP 48 48 48 48 48 48
94 94 20.55 Male aP 64 64 64 64 64 64
95 95 21.55 Female aP 53 53 53 53 53 53
96 96 19.54 Male aP 85 85 85 85 85 85

B. Modeling approach: Baseline values of task variables

Baseline value of a given task has been used as predictor for a given task.

For Example:
  For task 1.1) IgG-PT-D14-titer-Rank we use baseline value of IgG-PT as predicted values.
  Similarly for 1.2) IgG-PT-D14-FC-Rank we used baseline of IgG-PT as predictor values.

Modeling steps:

1.  Extract baseline of each task
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on the baseline of that task.

expCols <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank",  
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df_2021[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

baselineModel_df$Monocytes_D1_Rank <- df_2021$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df_2021$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df_2021$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df_2021$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df_2021$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df_2021$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(baselineModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Baseline Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")
Baseline Based Models
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
61 61 32.38 Female wP 70 70 NA NA 39 39
62 62 25.99 Female wP 13 13 NA NA 13 13
63 63 23.98 Female wP 5 5 40 40 31 31
64 64 25.99 Male wP 51 51 49 49 22 22
65 65 29.02 Male wP 6 6 36 36 10 10
66 66 43.07 Female wP 4 4 48 48 28 28
67 67 47.24 Female wP 3 3 17 17 40 40
68 68 47.24 Male wP 12 12 44 44 40 40
69 69 29.17 Female wP 22 22 39 39 24 24
70 70 21.15 Male aP 79 79 6 6 52 52
71 71 21.15 Female aP 33 33 42 42 20 20
72 72 28.25 Female wP 8 8 20 20 51 51
73 73 24.23 Female wP 15 15 28 28 43 43
74 74 24.23 Female wP 65 65 3 3 59 59
75 75 21.22 Female aP 84 84 NA NA 53 53
76 76 21.22 Female aP 17 17 32 32 60 60
77 77 31.32 Male wP 56 56 18 18 11 11
78 78 26.30 Female wP 37 37 11 11 56 56
79 79 32.32 Male wP 30 30 2 2 9 9
80 80 27.30 Female wP 74 74 21 21 8 8
81 81 26.30 Male wP 54 54 12 12 12 12
82 82 21.28 Female aP NA NA 35 35 5 5
83 83 20.34 Female aP 63 63 19 19 70 70
84 84 22.34 Female aP 59 59 24 24 34 34
85 85 19.39 Female aP 80 80 40 40 35 35
86 86 21.40 Female aP 47 47 23 23 32 32
87 87 19.39 Male aP NA NA 31 31 15 15
88 88 19.39 Male aP NA NA 37 37 7 7
89 89 22.49 Female aP 85 85 43 43 27 27
90 90 20.49 Female aP 76 76 34 34 47 47
91 91 21.49 Male aP 57 57 8 8 66 66
92 92 19.54 Female aP 67 67 16 16 60 60
93 93 23.56 Female aP 45 45 45 45 53 53
94 94 20.55 Male aP 32 32 9 9 70 70
95 95 21.55 Female aP 24 24 27 27 60 60
96 96 19.54 Male aP 64 64 14 14 68 68

Model Training and Evaluation

Models been developed and trained using different predictors or combination of predictors. Two types of models been trained for each task:

1) Trivial Models

2) Multi-predictor Models

Spearman correlation coefficient has been used as evaluation matrices.

1) Training Approach : Trivial Model

Two models has been constructed for each task and the fold change of each task using Spearman Correlation Coefficient.

1. Age of Participants

2. Baseline values of task variables
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]

corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df_2020[,x], df_2020[,y], 
                                                  method="spearman", 
                                                  use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df_2020[,ranked_cols]), 
                                                      method="spearman", 
                                                      conf.level = 0.95, 
                                                      na.rm = TRUE)})$p

corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Model Evaluation using Spearman Rank Correlation Coefficient

Heatmap of Spearman correlation coefficient for each task against age at boost or the baseline of the task. Columns of the Heatmap indicating the task and the task fold change, and rows are indicating the model type; age base or baseline base models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))

2) Training Approach : Multi-predictors LASSO Models

  • For each task 4 models have been developed using combination of predictors as follow:

    1. 3 Baseline: Baseline of all 3 tasks combined are being used to train the models for each of the tasks.

    2. 3 Baseline + Demographic: Baseline of all three tasks combined with demographic data including; Age, Biological Sex and Vaccine type are being used to train models for each of the tasks.

source(here("./src/codebase.R"))
set.seed(0)

gs_2020 <- df_2020
# gs_2020 <- data.frame(df_pivot)

# 
gs_2020$infancy_vac <- as.numeric(factor(gs_2020$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2020$biological_sex <- as.numeric(factor(gs_2020$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2020[,"Monocytes_D1_FC"] <- gs_2020[, "Monocytes_D1"] / gs_2020[, "Monocytes_D0"]
gs_2020[,"CCL3_D3_FC"] <- gs_2020[, "CCL3_D3"] / gs_2020[, "CCL3_D0"]
gs_2020[,"IgG_PT_D14_FC"] <- gs_2020[, "IgG_PT_D14"] / gs_2020[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   gs_2020[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2020[, ltCol]+1)
}

gs_2020[,"normalized_CCL3_D3_FC"] <- gs_2020[, "normalized_CCL3_D3"] / gs_2020[, "normalized_CCL3_D0"]
gs_2020[,"normalized_IgG_PT_D14_FC"] <- gs_2020[, "normalized_IgG_PT_D14"] / gs_2020[, "normalized_IgG_PT_D0"]



gs_2021 <- df_2021
# gs_2020 <- data.frame(df_pivot)

# 
gs_2021$infancy_vac <- as.numeric(factor(gs_2021$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2021$biological_sex <- as.numeric(factor(gs_2021$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2021[,"Monocytes_D1_FC"] <- gs_2021[, "Monocytes_D1"] / gs_2021[, "Monocytes_D0"]
gs_2021[,"CCL3_D3_FC"] <- gs_2021[, "CCL3_D3"] / gs_2021[, "CCL3_D0"]
gs_2021[,"IgG_PT_D14_FC"] <- gs_2021[, "IgG_PT_D14"] / gs_2021[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   gs_2021[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2021[, ltCol]+1)
}

gs_2021[,"normalized_CCL3_D3_FC"] <- gs_2021[, "normalized_CCL3_D3"] / gs_2021[, "normalized_CCL3_D0"]
gs_2021[,"normalized_IgG_PT_D14_FC"] <- gs_2021[, "normalized_IgG_PT_D14"] / gs_2021[, "normalized_IgG_PT_D0"]

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Feature Selection

For each model, we assessed which features contribute to non-zero coefficients (slopes). For this purpose the minimum lambda, that gives minimum mean cross-validated error, been used for prediction.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
set.seed(1)

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2020, y, x)
res_base3 <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3) <- c("Variable", y)
res_base3$Models <- "3 Baselines"

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2020, y, x)
res_base3D <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3D) <- c("Variable", y)
res_base3D$Models <- "Demography + 3 Baselines"

df_list <- list(res_base3, res_base3D)
coeffDf <- df_list %>% reduce(full_join, by= c("Variable", "Models",y))
coeffDf$Variable <- gsub("normalized_", "", coeffDf$Variable)

# knitr::opts_chunk$set(fig.width=16, fig.height=18)
coeffDf2 <- coeffDf

rownames(coeffDf2) <- paste(coeffDf2$Models, coeffDf2$Variable, sep="_")
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3"] <- "3.1) CCL3-D3-Rank"

colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"

coeffDf2 <- coeffDf2[coeffDf2$Variable!="(Intercept)",]
# coeffDf2["Demography + 3 Baselines_age_at_boost", ] <- 0
# coeffDf2["Demography + 3 Baselines_age_at_boost", "Variable"] <- "age_at_boost"

tcoeffDf2 <- coeffDf2[, !names(coeffDf2) %in% c("Variable", "Models")]

Feature Importance using Coefficients (Slope)

Heatmap of slopes between each task and each of the predictors used to train LASSO model. The far left column of the heatmap represents model type; 3 Baseline models or 3 Baseline + Demographic models. The second column indicates features (predictors) being used to train models. Slopes between each task and a particular predictor, from each model type, is shown in the body of the heatmap. Columns of the heatmap body represent tasks. Slopes above zero are shown in red and those bellow zero are shown in blue. Slope of zero indicates that there is no relationship between coresponding task and predictor in that particular model.

modelAnno = rowAnnotation(
  Models = anno_block(gp = gpar(fill = 2:4),
        labels = c("3 Baseline", "Demography + 3 Baseline"),
        labels_gp = gpar(col = "white", fontsize = 10, fontface = "bold")),
        annotation_name_side = "top")

baseAnno = rowAnnotation(labels = anno_text(coeffDf2$Variable, which = "row",
                                            gp = gpar(fontsize = 12, fontface = "bold")),
    width = unit(30, "mm"))

tcoeffDf2 <- tcoeffDf2 %>% replace(is.na(.), 0)

rowOrder <- c("3 Baselines_IgG_PT_D0", "3 Baselines_Monocytes_D0", "3 Baselines_CCL3_D0", 
             # "Demography + 3 Baselines_age_at_boost",
             "Demography + 3 Baselines_biological_sex",
             "Demography + 3 Baselines_infancy_vac", "Demography + 3 Baselines_IgG_PT_D0", 
             "Demography + 3 Baselines_Monocytes_D0", "Demography + 3 Baselines_CCL3_D0")

rowNames <- c("IgG_PT_D0", "Monocytes_D0", "CCL3_D0", 
             # "age_at_boost",
             "biological_sex",
             "infancy_vac", "IgG_PT_D0", 
             "Monocytes_D0", "CCL3_D0")

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")

maxCorr <- max(abs(as.matrix(tcoeffDf2)), na.rm = T)
col_rnorm = colorRamp2(c(-maxCorr, 0, maxCorr), c("blue", "white","red"))
h1 <- Heatmap(tcoeffDf2, name = "Coefficient", column_names_rot = 45,
              cell_fun = function(j, i, x, y, width, height, fill) {
                  grid.text(sprintf("%.2f", tcoeffDf2[i, j]), x, y, gp = gpar(fontsize = 14, fontface = "bold"))},
              column_names_side = "top",
              na_col = "white",
              show_row_names=F,
              row_order = rowOrder,
              row_labels = rowNames,
              column_order = colOrders,
              height = nrow(tcoeffDf2)*unit(10, "mm"),
              width = ncol(tcoeffDf2)*unit(30, "mm"),
              col = col_rnorm, show_row_dend = FALSE,
              show_column_dend = FALSE,
              row_split = factor(coeffDf2$Model,
                                 levels = c("3 Baselines",
                                            "Demography + 3 Baselines",
                                            "12 Baselines",
                                            "Demography + 12 Baselines")),
              row_title = NULL,
              column_names_gp = gpar(fontsize = 12, fontface = "bold"),
               heatmap_legend_param = list( title_gp=gpar(fontsize = 12, fontface = "bold"),
                                          labels_gp=gpar(fontsize = 12)))

png(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", sep = "")),
    width = 10, height = 6, units = 'in', res = 300 )
modelAnno + baseAnno+ h1
dev.off()
knitr::include_graphics(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", 
                                   sep = "")), dpi = 100)

Generate predictions for 2021 tasks

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         na.label=" ", 
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

#### Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GGally_2.1.2          ellipse_0.4.3         GetoptLong_1.0.5     
##  [4] ComplexHeatmap_2.14.0 colorRamp2_0.1.0      formattable_0.2.1    
##  [7] here_1.0.1            glmnet_4.1-6          Matrix_1.5-1         
## [10] kableExtra_1.3.4      data.table_1.14.8     corrplot_0.92        
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] purrr_1.0.1           readr_2.1.4           tibble_3.2.0         
## [19] tidyverse_2.0.0       tidyr_1.3.0           ggplot2_3.4.1        
## [22] dplyr_1.1.0           discretefit_0.1.2     pROC_1.18.0          
## [25] verification_1.42     dtw_1.23-1            proxy_0.4-27         
## [28] CircStats_0.2-6       MASS_7.3-58.1         boot_1.3-28          
## [31] fields_14.1           viridis_0.6.2         viridisLite_0.4.1    
## [34] spam_2.9-1           
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.63.0  RColorBrewer_1.1-3  doParallel_1.0.17  
##  [4] webshot_0.5.4       httr_1.4.5          rprojroot_2.0.3    
##  [7] tools_4.2.2         bslib_0.4.2         utf8_1.2.3         
## [10] R6_2.5.1            BiocGenerics_0.44.0 colorspace_2.1-0   
## [13] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [16] compiler_4.2.2      cli_3.6.0           rvest_1.0.3        
## [19] xml2_1.3.3          sass_0.4.5          scales_1.2.1       
## [22] systemfonts_1.0.4   digest_0.6.31       rmarkdown_2.20     
## [25] svglite_2.1.1       pkgconfig_2.0.3     htmltools_0.5.4    
## [28] highr_0.10          fastmap_1.1.1       maps_3.4.1         
## [31] GlobalOptions_0.1.2 htmlwidgets_1.6.1   rlang_1.0.6        
## [34] rstudioapi_0.14     shape_1.4.6         jquerylib_0.1.4    
## [37] generics_0.1.3      jsonlite_1.8.4      magrittr_2.0.3     
## [40] dotCall64_1.0-2     S4Vectors_0.36.2    Rcpp_1.0.10        
## [43] munsell_0.5.0       fansi_1.0.4         lifecycle_1.0.3    
## [46] stringi_1.7.12      yaml_2.3.7          plyr_1.8.8         
## [49] parallel_4.2.2      crayon_1.5.2        lattice_0.20-45    
## [52] splines_4.2.2       circlize_0.4.15     hms_1.1.2          
## [55] knitr_1.42          pillar_1.8.1        rjson_0.2.21       
## [58] stats4_4.2.2        codetools_0.2-18    glue_1.6.2         
## [61] evaluate_0.20       vctrs_0.5.2         png_0.1-8          
## [64] tzdb_0.3.0          foreach_1.5.2       gtable_0.3.1       
## [67] reshape_0.8.9       clue_0.3-64         cachem_1.0.7       
## [70] xfun_0.37           survival_3.4-0      iterators_1.0.14   
## [73] IRanges_2.32.0      cluster_2.1.4       timechange_0.2.0   
## [76] ellipsis_0.3.2