Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
  library(kableExtra)
  library(formattable)
  library(colorRamp2)
  library(ComplexHeatmap)
  library(GetoptLong)
  library(ellipse)
  library("ggplot2")                     
  library("GGally")
  source(here("./src/codebase.R"))
})

Setup: Variables used through out the script

Tasks:
1. Monocytes at day 1

2. CCL3 at day 3

3. IgG-PT at day 14
DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", 
               "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT")
RNA_COLS <- c("CCL3")
CELL_COLS <- c("Monocytes")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "IgG_PT_D0", "CCL3_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0",
                    "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS]

abtiterDf <- data.frame(df_source[["plasma_antibody_levels_wide"]])
abtiterDf$specimen_id <- as.numeric(rownames(abtiterDf))
abtiterDf <- data.frame(abtiterDf[, c("specimen_id", ABTITER_COLS)])


rnaDf <- data.frame(df_source[["pbmc_gene_expression_wide"]])
rnaDf$specimen_id <- as.numeric(rownames(rnaDf))
tasks_seq <- c('ENSG00000277632')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, c("specimen_id", RNA_COLS)])


cellDf <- data.frame(df_source[["pbmc_cell_frequency_wide"]])
cellDf$specimen_id <- as.numeric(rownames(cellDf))
cellDf <- data.frame(cellDf[, c("specimen_id", CELL_COLS)])


list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>% 
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex", 
                        "infancy_vac", "age_at_boost"), 
              names_from = timepoint, 
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)), 
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]
df_pivot <- data.frame(df_pivot %>%
   mutate(across(everything(),  ~ case_when(.x >=0 ~ .x))))

Setup: Dataset log transform , Ranking and fold change

  1. Fold change for actual values been generated by dividing the actual value of each task to it’s baseline

  2. Fold change for normalized values been generated by dividing the log transformed value of each task to it’s log transformed baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", "dataset", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "min", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)],
                           2, rankingFunction)
df <- data.frame(df)

df_2020 <- df[df$dataset=="2020_dataset", ]
df_2021 <- df[df$dataset=="2021_dataset", ]

Model construction

No actual model been developed. We use baseline values of tasks provided and demographic features such as age of participants as model. Two types of models has been developed:

A. Age of the participants

B. Baseline values of task variables

A. Modeling Approach: Age of the participants

Age of participants taken as it is has been used as predictor for a given task.

Modeling steps:

1.  Extract age of the participants
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on age at boost.

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", 
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df_2020[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

ageModel_df$Monocytes_D1_Rank <- df_2020$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df_2020$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df_2020$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df_2020$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df_2020$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df_2020$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(ageModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Age Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")
Age Based Models
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
1 30.80 Female wP 22 22 22 22 22 22
2 51.25 Female wP 1 1 1 1 1 1
3 33.89 Female wP 13 13 13 13 13 13
4 28.76 Male wP 30 30 30 30 30 30
5 25.75 Male wP 42 42 42 42 42 42
6 28.87 Female wP 27 27 27 27 27 27
7 35.97 Female wP 6 6 6 6 6 6
8 34.27 Female wP 11 11 11 11 11 11
9 20.63 Male aP 63 63 63 63 63 63
10 34.68 Female wP 9 9 9 9 9 9
11 30.76 Female wP 23 23 23 23 23 23
12 34.68 Male wP 9 9 9 9 9 9
13 19.63 Male aP 84 84 84 84 84 84
14 23.70 Male wP 47 47 47 47 47 47
15 27.71 Male wP 33 33 33 33 33 33
16 29.66 Female wP 24 24 24 24 24 24
17 36.82 Female wP 5 5 5 5 5 5
18 19.73 Female aP 83 83 83 83 83 83
19 22.81 Male wP 50 50 50 50 50 50
20 35.78 Female wP 7 7 7 7 7 7
21 33.77 Male wP 15 15 15 15 15 15
22 31.77 Female wP 20 20 20 20 20 20
23 25.82 Female wP 41 41 41 41 41 41
24 24.79 Female wP 43 43 43 43 43 43
25 28.80 Female wP 29 29 29 29 29 29
26 33.85 Female wP 14 14 14 14 14 14
27 19.80 Female aP 81 81 81 81 81 81
28 34.85 Male wP 8 8 8 8 8 8
29 19.80 Male aP 81 81 81 81 81 81
30 28.84 Female wP 28 28 28 28 28 28
31 27.83 Female wP 32 32 32 32 32 32
32 19.88 Male aP 78 78 78 78 78 78
33 26.87 Male wP 35 35 35 35 35 35
34 33.93 Female wP 12 12 12 12 12 12
35 25.86 Male wP 40 40 40 40 40 40
36 19.88 Female aP 78 78 78 78 78 78
37 18.91 Female aP 93 93 93 93 93 93
38 19.88 Female aP 78 78 78 78 78 78
39 31.92 Female wP 19 19 19 19 19 19
40 22.89 Female wP 49 49 49 49 49 49
41 31.96 Male wP 18 18 18 18 18 18
42 19.92 Female aP 77 77 77 77 77 77
43 18.91 Female aP 93 93 93 93 93 93
44 18.91 Female aP 93 93 93 93 93 93
45 19.98 Female aP 74 74 74 74 74 74
46 18.91 Female aP 93 93 93 93 93 93
47 20.98 Female aP 62 62 62 62 62 62
48 19.11 Female aP 90 90 90 90 90 90
49 20.11 Female aP 71 71 71 71 71 71
50 19.98 Female aP 74 74 74 74 74 74
51 19.98 Male aP 74 74 74 74 74 74
52 19.07 Male aP 91 91 91 91 91 91
53 19.07 Female aP 91 91 91 91 91 91
54 20.11 Female aP 71 71 71 71 71 71
55 20.11 Female aP 71 71 71 71 71 71
56 20.15 Female aP 67 67 67 67 67 67
57 21.15 Female aP 59 59 59 59 59 59
58 20.15 Female aP 67 67 67 67 67 67
59 20.15 Female aP 67 67 67 67 67 67
60 20.15 Male aP 67 67 67 67 67 67

B. Modeling approach: Baseline values of task variables

Baseline value of a given task has been used as predictor for a given task.

For Example:
  For task 1.1) IgG-PT-D14-titer-Rank we use baseline value of IgG-PT as predicted values.
  Similarly for 1.2) IgG-PT-D14-FC-Rank we used baseline of IgG-PT as predictor values.

Modeling steps:

1.  Extract baseline of each task
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on the baseline of that task.

expCols <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank",  
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df_2020[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

baselineModel_df$Monocytes_D1_Rank <- df_2020$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df_2020$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df_2020$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df_2020$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df_2020$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df_2020$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(baselineModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Baseline Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")
Baseline Based Models
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
1 30.80 Female wP 20 20 NA NA 21 21
2 51.25 Female wP NA NA NA NA NA NA
3 33.89 Female wP 40 40 NA NA 36 36
4 28.76 Male wP 35 35 53 53 63 63
5 25.75 Male wP 48 48 NA NA 42 42
6 28.87 Female wP 29 29 1 1 44 44
7 35.97 Female wP 50 50 NA NA NA NA
8 34.27 Female wP NA NA NA NA NA NA
9 20.63 Male aP 87 87 NA NA 25 25
10 34.68 Female wP 89 89 NA NA 48 48
11 30.76 Female wP 1 1 52 52 50 50
12 34.68 Male wP 49 49 NA NA NA NA
13 19.63 Male aP 68 68 NA NA 14 14
14 23.70 Male wP 11 11 NA NA NA NA
15 27.71 Male wP 46 46 46 46 30 30
16 29.66 Female wP 53 53 NA NA NA NA
17 36.82 Female wP 28 28 29 29 26 26
18 19.73 Female aP 23 23 NA NA 67 67
19 22.81 Male wP 27 27 NA NA 63 63
20 35.78 Female wP 38 38 10 10 23 23
21 33.77 Male wP 77 77 5 5 37 37
22 31.77 Female wP 52 52 NA NA 69 69
23 25.82 Female wP 39 39 NA NA 57 57
24 24.79 Female wP 82 82 NA NA 55 55
25 28.80 Female wP 83 83 NA NA 44 44
26 33.85 Female wP 71 71 30 30 63 63
27 19.80 Female aP 72 72 NA NA 44 44
28 34.85 Male wP 43 43 NA NA NA NA
29 19.80 Male aP 60 60 13 13 33 33
30 28.84 Female wP 66 66 NA NA NA NA
31 27.83 Female wP 78 78 51 51 19 19
32 19.88 Male aP 10 10 NA NA 18 18
33 26.87 Male wP 88 88 25 25 16 16
34 33.93 Female wP 26 26 NA NA NA NA
35 25.86 Male wP 58 58 NA NA 37 37
36 19.88 Female aP 73 73 26 26 48 48
37 18.91 Female aP NA NA NA NA NA NA
38 19.88 Female aP 41 41 NA NA 72 72
39 31.92 Female wP 62 62 NA NA NA NA
40 22.89 Female wP 34 34 NA NA NA NA
41 31.96 Male wP 21 21 NA NA NA NA
42 19.92 Female aP 31 31 NA NA 17 17
43 18.91 Female aP 81 81 NA NA 29 29
44 18.91 Female aP 7 7 4 4 57 57
45 19.98 Female aP 75 75 38 38 NA NA
46 18.91 Female aP 19 19 7 7 NA NA
47 20.98 Female aP 9 9 50 50 3 3
48 19.11 Female aP 90 90 22 22 6 6
49 20.11 Female aP 36 36 47 47 NA NA
50 19.98 Female aP 69 69 NA NA 1 1
51 19.98 Male aP 61 61 NA NA NA NA
52 19.07 Male aP 44 44 15 15 2 2
53 19.07 Female aP 16 16 NA NA 4 4
54 20.11 Female aP 25 25 NA NA NA NA
55 20.11 Female aP 42 42 33 33 NA NA
56 20.15 Female aP 18 18 NA NA NA NA
57 21.15 Female aP 55 55 NA NA NA NA
58 20.15 Female aP 14 14 NA NA NA NA
59 20.15 Female aP 2 2 NA NA NA NA
60 20.15 Male aP 86 86 NA NA NA NA

Model Training and Evaluation

Models been developed and trained using different predictors or combination of predictors. Two types of models been trained for each task:

1) Trivial Models

2) Multi-predictor Models

Spearman correlation coefficient has been used as evaluation matrices.

1) Training Approach : Trivial Model

Two models has been constructed for each task and the fold change of each task using Spearman Correlation Coefficient.

1. Age of Participants

2. Baseline values of task variables
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]

corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df_2021[,x], df_2021[,y], 
                                                  method="spearman", 
                                                  use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df_2021[,ranked_cols]), 
                                                      method="spearman", 
                                                      conf.level = 0.95, 
                                                      na.rm = TRUE)})$p

corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Model Evaluation using Spearman Rank Correlation Coefficient

Heatmap of Spearman correlation coefficient for each task against age at boost or the baseline of the task. Columns of the Heatmap indicating the task and the task fold change, and rows are indicating the model type; age base or baseline base models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))

2) Training Approach : Multi-predictors LASSO Models

  • For each task 4 models have been developed using combination of predictors as follow:

    1. 3 Baseline: Baseline of all 3 tasks combined are being used to train the models for each of the tasks.

    2. 3 Baseline + Demographic: Baseline of all three tasks combined with demographic data including; Age, Biological Sex and Vaccine type are being used to train models for each of the tasks.

source(here("./src/codebase.R"))
set.seed(1)

gs_2020 <- df_2020
# 
gs_2020$infancy_vac <- as.numeric(factor(gs_2020$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2020$biological_sex <- as.numeric(factor(gs_2020$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2020[,"Monocytes_D1_FC"] <- gs_2020[, "Monocytes_D1"] / gs_2020[, "Monocytes_D0"]
gs_2020[,"CCL3_D3_FC"] <- gs_2020[, "CCL3_D3"] / gs_2020[, "CCL3_D0"]
gs_2020[,"IgG_PT_D14_FC"] <- gs_2020[, "IgG_PT_D14"] / gs_2020[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   gs_2020[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2020[, ltCol]+1)
}

gs_2020[,"normalized_CCL3_D3_FC"] <- gs_2020[, "normalized_CCL3_D3"] / gs_2020[, "normalized_CCL3_D0"]
gs_2020[,"normalized_IgG_PT_D14_FC"] <- gs_2020[, "normalized_IgG_PT_D14"] / gs_2020[, "normalized_IgG_PT_D0"]


gs_2021 <- df_2021

gs_2021$infancy_vac <- as.numeric(factor(gs_2021$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2021$biological_sex <- as.numeric(factor(gs_2021$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2021[,"Monocytes_D1_FC"] <- gs_2021[, "Monocytes_D1"] / gs_2021[, "Monocytes_D0"]
gs_2021[,"CCL3_D3_FC"] <- gs_2021[, "CCL3_D3"] / gs_2021[, "CCL3_D0"]
gs_2021[,"IgG_PT_D14_FC"] <- gs_2021[, "IgG_PT_D14"] / gs_2021[, "IgG_PT_D0"]

for (ltCol in LOG_TRANS_COLS){
   gs_2021[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2021[, ltCol]+1)
}

gs_2021[,"normalized_CCL3_D3_FC"] <- gs_2021[, "normalized_CCL3_D3"] / gs_2021[, "normalized_CCL3_D0"]
gs_2021[,"normalized_IgG_PT_D14_FC"] <- gs_2021[, "normalized_IgG_PT_D14"] / gs_2021[, "normalized_IgG_PT_D0"]

Test model quality

Analyze correlation between predicted and true values in leave-one-out cross-validation

source(here("./src/codebase.R"))
corrDf <- data.frame()
pvalDf <- data.frame()

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "Monocytes_D1_FC", "normalized_CCL3_D3_FC", "normalized_IgG_PT_D14_FC")

res <- lassoFunction(gs_2021, corrDf, pvalDf, "3Baseline", y, x)
corrDf <- data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])


x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "Monocytes_D1_FC", "normalized_CCL3_D3_FC", "normalized_IgG_PT_D14_FC")

res <- lassoFunction(gs_2021, corrDf, pvalDf, "Demography_3Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Feature Selection

For each model, we assessed which features contribute to non-zero coefficients (slopes). For this purpose the minimum lambda, that gives minimum mean cross-validated error, been used for prediction.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
source(here("./src/codebase.R"))
set.seed(1)
coefList <- list()
x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2021, y, x)
res_base3 <- coefList %>% reduce(full_join, by="Variable")
a <- res_base3
# colnames(res_base3) <- c("Variable", y)
res_base3$Models <- "3 Baselines"

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2020, y, x)
res_base3D <- coefList %>% reduce(full_join, by="Variable")
# colnames(res_base3D) <- c("Variable", y)
res_base3D$Models <- "Demography + 3 Baselines"

df_list <- list(res_base3, res_base3D)
coeffDf <- df_list %>% reduce(full_join, by= c("Variable", "Models",y))
coeffDf$Variable <- gsub("normalized_", "", coeffDf$Variable)

coeffDf2 <- coeffDf

rownames(coeffDf2) <- paste(coeffDf2$Models, coeffDf2$Variable, sep="_")
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3"] <- "3.1) CCL3-D3-Rank"

colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"

coeffDf2 <- coeffDf2[coeffDf2$Variable!="(Intercept)",]

tcoeffDf2 <- coeffDf2[, !names(coeffDf2) %in% c("Variable", "Models")]

Feature Importance using Coefficients (Slope)

Heatmap of slopes between each task and each of the predictors used to train LASSO model. The far left column of the heatmap represents model type; 3 Baseline models or 3 Baseline + Demographic models. The second column indicates features (predictors) being used to train models. Slopes between each task and a particular predictor, from each model type, is shown in the body of the heatmap. Columns of the heatmap body represent tasks. Slopes above zero are shown in red and those bellow zero are shown in blue. Slope of zero indicates that there is no relationship between coresponding task and predictor in that particular model.

modelAnno = rowAnnotation(
  Models = anno_block(gp = gpar(fill = 2:4),
        labels = c("3 Baseline", "Demography + 3 Baseline"),
        labels_gp = gpar(col = "white", fontsize = 10, fontface = "bold")),
        annotation_name_side = "top")

baseAnno = rowAnnotation(labels = anno_text(coeffDf2$Variable, which = "row",
                                            gp = gpar(fontsize = 12, fontface = "bold")),
                         width = unit(30, "mm"))

tcoeffDf2 <- tcoeffDf2 %>% replace(is.na(.), 0)

rowOrder <- c("3 Baselines_IgG_PT_D0", "3 Baselines_Monocytes_D0", "3 Baselines_CCL3_D0", 
             "Demography + 3 Baselines_biological_sex",
             "Demography + 3 Baselines_infancy_vac", "Demography + 3 Baselines_IgG_PT_D0", 
             "Demography + 3 Baselines_Monocytes_D0", "Demography + 3 Baselines_CCL3_D0")

rowNames <- c("IgG_PT_D0", "Monocytes_D0", "CCL3_D0", 
             "biological_sex",
             "infancy_vac", "IgG_PT_D0", 
             "Monocytes_D0", "CCL3_D0")

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")

maxCorr <- max(abs(as.matrix(tcoeffDf2)), na.rm = T)
col_rnorm = colorRamp2(c(-maxCorr, 0, maxCorr), c("blue", "white","red"))
h1 <- Heatmap(tcoeffDf2, name = "Coefficient", column_names_rot = 45,
              cell_fun = function(j, i, x, y, width, height, fill) {
                  grid.text(sprintf("%.2f", tcoeffDf2[i, j]), x, y, gp = gpar(fontsize = 14, fontface = "bold"))},
              column_names_side = "top",
              na_col = "white",
              show_row_names=F,
              row_order = rowOrder,
              row_labels = rowNames,
              column_order = colOrders,
              height = nrow(tcoeffDf2)*unit(10, "mm"),
              width = ncol(tcoeffDf2)*unit(30, "mm"),
              col = col_rnorm, show_row_dend = FALSE,
              show_column_dend = FALSE,
              row_split = factor(coeffDf2$Model,
                                 levels = c("3 Baselines",
                                            "Demography + 3 Baselines",
                                            "12 Baselines",
                                            "Demography + 12 Baselines")),
              row_title = NULL,
              column_names_gp = gpar(fontsize = 12, fontface = "bold"),
               heatmap_legend_param = list( title_gp=gpar(fontsize = 12, fontface = "bold"),
                                          labels_gp=gpar(fontsize = 12)))

png(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", sep = "")),
    width = 10, height = 6, units = 'in', res = 300 )
modelAnno + baseAnno+ h1
dev.off()
knitr::include_graphics(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", 
                                   sep = "")), dpi = 100)

Generate predictions for 2020 tasks

source(here("./src/codebase.R"))
corrDf <- data.frame()
pvalDf <- data.frame()

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "normalized_IgG_PT_D14_FC")

res <- lassoFunction_test(gs_2021, gs_2020, corrDf, pvalDf, "3Baseline", y, x)
corrDf <- data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "normalized_IgG_PT_D14_FC")

res <- lassoFunction_test(gs_2021, gs_2020, corrDf, pvalDf, "Demography_3Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "CCL3_D3",
              "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
# colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
# colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
# colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
# colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "3.1) CCL3-D3-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         na.label=" ", 
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GGally_2.1.2          ellipse_0.4.3         GetoptLong_1.0.5     
##  [4] ComplexHeatmap_2.14.0 colorRamp2_0.1.0      formattable_0.2.1    
##  [7] here_1.0.1            glmnet_4.1-6          Matrix_1.5-1         
## [10] kableExtra_1.3.4      data.table_1.14.8     corrplot_0.92        
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] purrr_1.0.1           readr_2.1.4           tibble_3.2.0         
## [19] tidyverse_2.0.0       tidyr_1.3.0           ggplot2_3.4.1        
## [22] dplyr_1.1.0           discretefit_0.1.2     pROC_1.18.0          
## [25] verification_1.42     dtw_1.23-1            proxy_0.4-27         
## [28] CircStats_0.2-6       MASS_7.3-58.1         boot_1.3-28          
## [31] fields_14.1           viridis_0.6.2         viridisLite_0.4.1    
## [34] spam_2.9-1           
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.63.0  RColorBrewer_1.1-3  doParallel_1.0.17  
##  [4] webshot_0.5.4       httr_1.4.5          rprojroot_2.0.3    
##  [7] tools_4.2.2         bslib_0.4.2         utf8_1.2.3         
## [10] R6_2.5.1            BiocGenerics_0.44.0 colorspace_2.1-0   
## [13] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [16] compiler_4.2.2      cli_3.6.0           rvest_1.0.3        
## [19] xml2_1.3.3          sass_0.4.5          scales_1.2.1       
## [22] systemfonts_1.0.4   digest_0.6.31       rmarkdown_2.20     
## [25] svglite_2.1.1       pkgconfig_2.0.3     htmltools_0.5.4    
## [28] highr_0.10          fastmap_1.1.1       maps_3.4.1         
## [31] GlobalOptions_0.1.2 htmlwidgets_1.6.1   rlang_1.0.6        
## [34] rstudioapi_0.14     shape_1.4.6         jquerylib_0.1.4    
## [37] generics_0.1.3      jsonlite_1.8.4      magrittr_2.0.3     
## [40] dotCall64_1.0-2     S4Vectors_0.36.2    Rcpp_1.0.10        
## [43] munsell_0.5.0       fansi_1.0.4         lifecycle_1.0.3    
## [46] stringi_1.7.12      yaml_2.3.7          plyr_1.8.8         
## [49] parallel_4.2.2      crayon_1.5.2        lattice_0.20-45    
## [52] splines_4.2.2       circlize_0.4.15     hms_1.1.2          
## [55] knitr_1.42          pillar_1.8.1        rjson_0.2.21       
## [58] stats4_4.2.2        codetools_0.2-18    glue_1.6.2         
## [61] evaluate_0.20       vctrs_0.5.2         png_0.1-8          
## [64] tzdb_0.3.0          foreach_1.5.2       gtable_0.3.1       
## [67] reshape_0.8.9       clue_0.3-64         cachem_1.0.7       
## [70] xfun_0.37           survival_3.4-0      iterators_1.0.14   
## [73] IRanges_2.32.0      cluster_2.1.4       timechange_0.2.0   
## [76] ellipsis_0.3.2