Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
  library(kableExtra)
  library(formattable)
  library(colorRamp2)
  library(ComplexHeatmap)
  library(GetoptLong)
  library(ellipse)
  library("ggplot2")                     
  library("GGally")
  source(here("./src/codebase.R"))
})

Setup: Variables used throughout the script

Tasks:
1. Monocytes at day 1

2. CCL3 at day 3

3. IgG-PT at day 14
DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", 
               "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT")
RNA_COLS <- c("CCL3")
CELL_COLS <- c("Monocytes")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "IgG_PT_D0", "CCL3_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0",
                    "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

df_source <- readRDS(here("./data/master_processed_training_data.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS] %>%
  data.frame()

abtiterDf <- df_source[["abtiter_wide"]]$batchCorrected_data %>%
  t() %>%
  data.frame() 

abtiterDf$specimen_id <- as.numeric(rownames(abtiterDf))
abtiterDf <- data.frame(abtiterDf[, c("specimen_id", ABTITER_COLS)])


rnaDf <- df_source[["pbmc_gene_expression"]]$batchCorrected_data %>%
  t() %>%
  data.frame() 

rnaDf$specimen_id <- as.numeric(rownames(rnaDf))
tasks_seq <- c('ENSG00000277632')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, c("specimen_id", RNA_COLS)])


cellDf <- df_source[["pbmc_cell_frequency"]]$batchCorrected_data %>%
  t() %>%
  data.frame()  

cellDf$specimen_id <- as.numeric(rownames(cellDf))
cellDf <- data.frame(cellDf[, c("specimen_id", CELL_COLS)])

list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>%
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex",
                        "infancy_vac", "age_at_boost"),
              names_from = timepoint,
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)),
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]
df_pivot <- data.frame(df_pivot %>%
   mutate(across(everything(),  ~ case_when(.x >=0 ~ .x))))

Setup: Dataset fold change and Ranking

Fold change for actual values have been generated by dividing the actual value of each task to it’s baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", "dataset", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "min", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)],
                           2, rankingFunction)
df <- data.frame(df)

Model construction

No actual model has been developed. We use baseline values of tasks provided and demographic features such as age of participants as model. Two types of models have been developed:

A. Age of the participants

B. Baseline values of task variables

A. Modeling Approach: Age of the participants

Age of participants taken, as it is, has been used as predictor for a given task.

Modeling steps:

1.  Extract age of the participants
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follows:

Possible rank of predicted values for each task when model is trained based on age at boost.

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", 
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

ageModel_df$Monocytes_D1_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(ageModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Age Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")
Age Based Models
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
1 30.80 Female wP 22 22 22 22 22 22
2 51.25 Female wP 1 1 1 1 1 1
3 33.89 Female wP 13 13 13 13 13 13
4 28.76 Male wP 30 30 30 30 30 30
5 25.75 Male wP 42 42 42 42 42 42
6 28.87 Female wP 27 27 27 27 27 27
7 35.97 Female wP 6 6 6 6 6 6
8 34.27 Female wP 11 11 11 11 11 11
9 20.63 Male aP 63 63 63 63 63 63
10 34.68 Female wP 9 9 9 9 9 9
11 30.76 Female wP 23 23 23 23 23 23
12 34.68 Male wP 9 9 9 9 9 9
13 19.63 Male aP 84 84 84 84 84 84
14 23.70 Male wP 47 47 47 47 47 47
15 27.71 Male wP 33 33 33 33 33 33
16 29.66 Female wP 24 24 24 24 24 24
17 36.82 Female wP 5 5 5 5 5 5
18 19.73 Female aP 83 83 83 83 83 83
19 22.81 Male wP 50 50 50 50 50 50
20 35.78 Female wP 7 7 7 7 7 7
21 33.77 Male wP 15 15 15 15 15 15
22 31.77 Female wP 20 20 20 20 20 20
23 25.82 Female wP 41 41 41 41 41 41
24 24.79 Female wP 43 43 43 43 43 43
25 28.80 Female wP 29 29 29 29 29 29
26 33.85 Female wP 14 14 14 14 14 14
27 19.80 Female aP 81 81 81 81 81 81
28 34.85 Male wP 8 8 8 8 8 8
29 19.80 Male aP 81 81 81 81 81 81
30 28.84 Female wP 28 28 28 28 28 28
31 27.83 Female wP 32 32 32 32 32 32
32 19.88 Male aP 78 78 78 78 78 78
33 26.87 Male wP 35 35 35 35 35 35
34 33.93 Female wP 12 12 12 12 12 12
35 25.86 Male wP 40 40 40 40 40 40
36 19.88 Female aP 78 78 78 78 78 78
37 18.91 Female aP 93 93 93 93 93 93
38 19.88 Female aP 78 78 78 78 78 78
39 31.92 Female wP 19 19 19 19 19 19
40 22.89 Female wP 49 49 49 49 49 49
41 31.96 Male wP 18 18 18 18 18 18
42 19.92 Female aP 77 77 77 77 77 77
43 18.91 Female aP 93 93 93 93 93 93
44 18.91 Female aP 93 93 93 93 93 93
45 19.98 Female aP 74 74 74 74 74 74
46 18.91 Female aP 93 93 93 93 93 93
47 20.98 Female aP 62 62 62 62 62 62
48 19.11 Female aP 90 90 90 90 90 90
49 20.11 Female aP 71 71 71 71 71 71
50 19.98 Female aP 74 74 74 74 74 74
51 19.98 Male aP 74 74 74 74 74 74
52 19.07 Male aP 91 91 91 91 91 91
53 19.07 Female aP 91 91 91 91 91 91
54 20.11 Female aP 71 71 71 71 71 71
55 20.11 Female aP 71 71 71 71 71 71
56 20.15 Female aP 67 67 67 67 67 67
57 21.15 Female aP 59 59 59 59 59 59
58 20.15 Female aP 67 67 67 67 67 67
59 20.15 Female aP 67 67 67 67 67 67
60 20.15 Male aP 67 67 67 67 67 67
61 32.38 Female wP 16 16 16 16 16 16
62 25.99 Female wP 38 38 38 38 38 38
63 23.98 Female wP 46 46 46 46 46 46
64 25.99 Male wP 38 38 38 38 38 38
65 29.02 Male wP 26 26 26 26 26 26
66 43.07 Female wP 4 4 4 4 4 4
67 47.24 Female wP 2 2 2 2 2 2
68 47.24 Male wP 2 2 2 2 2 2
69 29.17 Female wP 25 25 25 25 25 25
70 21.15 Male aP 59 59 59 59 59 59
71 21.15 Female aP 59 59 59 59 59 59
72 28.25 Female wP 31 31 31 31 31 31
73 24.23 Female wP 44 44 44 44 44 44
74 24.23 Female wP 44 44 44 44 44 44
75 21.22 Female aP 57 57 57 57 57 57
76 21.22 Female aP 57 57 57 57 57 57
77 31.32 Male wP 21 21 21 21 21 21
78 26.30 Female wP 36 36 36 36 36 36
79 32.32 Male wP 17 17 17 17 17 17
80 27.30 Female wP 34 34 34 34 34 34
81 26.30 Male wP 36 36 36 36 36 36
82 21.28 Female aP 56 56 56 56 56 56
83 20.34 Female aP 66 66 66 66 66 66
84 22.34 Female aP 52 52 52 52 52 52
85 19.39 Female aP 87 87 87 87 87 87
86 21.40 Female aP 55 55 55 55 55 55
87 19.39 Male aP 87 87 87 87 87 87
88 19.39 Male aP 87 87 87 87 87 87
89 22.49 Female aP 51 51 51 51 51 51
90 20.49 Female aP 65 65 65 65 65 65
91 21.49 Male aP 54 54 54 54 54 54
92 19.54 Female aP 85 85 85 85 85 85
93 23.56 Female aP 48 48 48 48 48 48
94 20.55 Male aP 64 64 64 64 64 64
95 21.55 Female aP 53 53 53 53 53 53
96 19.54 Male aP 85 85 85 85 85 85

B. Modeling approach: Baseline values of task variables

Baseline value of a given task has been used as predictor for a given task.

For Example:
  For task 1.1) IgG-PT-D14-titer-Rank we use baseline value of IgG-PT as predicted values.
  Similarly for 1.2) IgG-PT-D14-FC-Rank we used baseline of IgG-PT as predictor values.

Modeling steps:

1.  Extract baseline of each task
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on the baseline of that task.

expCols <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank",  
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

baselineModel_df$Monocytes_D1_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(baselineModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Baseline Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")
Baseline Based Models
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
1 30.80 Female wP 20 20 NA NA 23 23
2 51.25 Female wP NA NA NA NA NA NA
3 33.89 Female wP 40 40 NA NA 40 40
4 28.76 Male wP 35 35 53 53 63 63
5 25.75 Male wP 48 48 NA NA 48 48
6 28.87 Female wP 31 31 1 1 52 52
7 35.97 Female wP 50 50 NA NA NA NA
8 34.27 Female wP NA NA NA NA NA NA
9 20.63 Male aP 85 85 NA NA 28 28
10 34.68 Female wP 89 89 NA NA 52 52
11 30.76 Female wP 1 1 52 52 57 57
12 34.68 Male wP 49 49 NA NA NA NA
13 19.63 Male aP 68 68 NA NA 15 15
14 23.70 Male wP 11 11 NA NA NA NA
15 27.71 Male wP 47 47 43 43 34 34
16 29.66 Female wP 53 53 NA NA NA NA
17 36.82 Female wP 29 29 27 27 28 28
18 19.73 Female aP 23 23 NA NA 67 67
19 22.81 Male wP 28 28 NA NA 67 67
20 35.78 Female wP 38 38 10 10 24 24
21 33.77 Male wP 76 76 6 6 42 42
22 31.77 Female wP 52 52 NA NA 71 71
23 25.82 Female wP 39 39 NA NA 63 63
24 24.79 Female wP 81 81 NA NA 60 60
25 28.80 Female wP 82 82 NA NA 48 48
26 33.85 Female wP 71 71 28 28 67 67
27 19.80 Female aP 72 72 NA NA 48 48
28 34.85 Male wP 43 43 NA NA NA NA
29 19.80 Male aP 60 60 14 14 35 35
30 28.84 Female wP 64 64 NA NA NA NA
31 27.83 Female wP 77 77 49 49 21 21
32 19.88 Male aP 10 10 NA NA 18 18
33 26.87 Male wP 88 88 23 23 16 16
34 33.93 Female wP 26 26 NA NA NA NA
35 25.86 Male wP 58 58 NA NA 42 42
36 19.88 Female aP 73 73 24 24 54 54
37 18.91 Female aP NA NA NA NA NA NA
38 19.88 Female aP 41 41 NA NA 72 72
39 31.92 Female wP 62 62 NA NA NA NA
40 22.89 Female wP 34 34 NA NA NA NA
41 31.96 Male wP 21 21 NA NA NA NA
42 19.92 Female aP 32 32 NA NA 17 17
43 18.91 Female aP 80 80 NA NA 33 33
44 18.91 Female aP 8 8 4 4 63 63
45 19.98 Female aP 74 74 35 35 NA NA
46 18.91 Female aP 19 19 8 8 NA NA
47 20.98 Female aP 9 9 48 48 3 3
48 19.11 Female aP 90 90 22 22 6 6
49 20.11 Female aP 36 36 44 44 NA NA
50 19.98 Female aP 69 69 NA NA 1 1
51 19.98 Male aP 61 61 NA NA NA NA
52 19.07 Male aP 44 44 16 16 2 2
53 19.07 Female aP 16 16 NA NA 4 4
54 20.11 Female aP 25 25 NA NA NA NA
55 20.11 Female aP 42 42 31 31 NA NA
56 20.15 Female aP 18 18 NA NA NA NA
57 21.15 Female aP 55 55 NA NA NA NA
58 20.15 Female aP 15 15 NA NA NA NA
59 20.15 Female aP 4 4 NA NA NA NA
60 20.15 Male aP 84 84 NA NA NA NA
61 32.38 Female wP 70 70 NA NA 37 37
62 25.99 Female wP 13 13 NA NA 13 13
63 23.98 Female wP 5 5 40 40 27 27
64 25.99 Male wP 51 51 51 51 20 20
65 29.02 Male wP 6 6 37 37 10 10
66 43.07 Female wP 3 3 50 50 26 26
67 47.24 Female wP 2 2 17 17 37 37
68 47.24 Male wP 12 12 46 46 35 35
69 29.17 Female wP 22 22 39 39 22 22
70 21.15 Male aP 79 79 5 5 45 45
71 21.15 Female aP 33 33 42 42 19 19
72 28.25 Female wP 7 7 20 20 42 42
73 24.23 Female wP 14 14 30 30 39 39
74 24.23 Female wP 66 66 3 3 54 54
75 21.22 Female aP 86 86 NA NA 47 47
76 21.22 Female aP 17 17 33 33 57 57
77 31.32 Male wP 56 56 18 18 11 11
78 26.30 Female wP 37 37 11 11 51 51
79 32.32 Male wP 27 27 2 2 9 9
80 27.30 Female wP 75 75 21 21 8 8
81 26.30 Male wP 54 54 12 12 12 12
82 21.28 Female aP NA NA 36 36 5 5
83 20.34 Female aP 63 63 19 19 67 67
84 22.34 Female aP 59 59 26 26 31 31
85 19.39 Female aP 83 83 40 40 32 32
86 21.40 Female aP 46 46 25 25 28 28
87 19.39 Male aP NA NA 32 32 14 14
88 19.39 Male aP NA NA 38 38 7 7
89 22.49 Female aP 87 87 45 45 24 24
90 20.49 Female aP 78 78 34 34 40 40
91 21.49 Male aP 57 57 7 7 60 60
92 19.54 Female aP 67 67 15 15 54 54
93 23.56 Female aP 45 45 47 47 46 46
94 20.55 Male aP 30 30 9 9 63 63
95 21.55 Female aP 24 24 29 29 57 57
96 19.54 Male aP 65 65 13 13 62 62

Model Training and Evaluation

Models have been developed and trained using different predictors or combination of predictors. Two types of models have been trained for each task:

1) Trivial Models

2) Multi-predictor Models

Spearman correlation coefficient has been used as evaluation matrics.

1) Training Approach : Trivial Model

Two models have been constructed for each task and the fold change of each task using Spearman Correlation Coefficient.

1. Age of Participants

2. Baseline values of task variables
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]

corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df[,x], df[,y], 
                                                  method="spearman", 
                                                  use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df[,ranked_cols]), 
                                                      method="spearman", 
                                                      conf.level = 0.95, 
                                                      na.rm = TRUE)})$p

corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Model Evaluation using Spearman Rank Correlation Coefficient

Heatmap of Spearman correlation coefficient for each task against age at boost or the baseline of the task. Columns of the Heatmap indicating the task and the task fold change, and rows are indicating the model type; age base or baseline base models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))

2) Training Approach : Multi-predictors LASSO Models

  • For each task two models have been developed using combination of predictors as follow:

    1. 3 Baseline: Baseline of all three tasks combined are being used to train the models for each of the tasks.

    2. 3 Baseline + Demographic: Baseline of all three tasks combined with demographic data including; Age, Biological Sex and Vaccine type are being used to train models for each of the tasks.

source(here("./src/codebase.R"))
set.seed(1)

df1 <- df

df1$infancy_vac <- as.numeric(factor(df1$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
df1$biological_sex <- as.numeric(factor(df1$biological_sex,
                                        labels = c(1,2), exclude = NA))

df1[,"Monocytes_D1_FC"] <- df1[, "Monocytes_D1"] / df1[, "Monocytes_D0"]
df1[,"CCL3_D3_FC"] <- df1[, "CCL3_D3"] / df1[, "CCL3_D0"]
df1[,"IgG_PT_D14_FC"] <- df1[, "IgG_PT_D14"] / df1[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   df1[, paste("normalized", ltCol, sep = "_")] <- log2(df1[, ltCol]+1)
}

df1[,"normalized_CCL3_D3_FC"] <- df1[, "normalized_CCL3_D3"] / df1[, "normalized_CCL3_D0"]
df1[,"normalized_IgG_PT_D14_FC"] <- df1[, "normalized_IgG_PT_D14"] / df1[, "normalized_IgG_PT_D0"]

Validate model quality

Analyze Spearman Correlation Coefficient between predicted and true values in leave-one-out cross-validation.

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicate the task and the task fold change, and rows indicate the model type; 3 Baseline or Demographic + 3 Baseline models. Crosses indicate insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)


# pvalDf[corrDf < 0] <- 1

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Feature Selection

For each model, we assessed which features contribute to non-zero coefficients (slopes). For this purpose the minimum lambda, that gives minimum mean cross-validated error, has been used.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
set.seed(1)

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(df1, y, x)
res_base3 <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3) <- c("Variable", y)
res_base3$Models <- "3 Baselines"

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(df1, y, x)
res_base3D <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3D) <- c("Variable", y)
res_base3D$Models <- "Demography + 3 Baselines"

df_list <- list(res_base3, res_base3D)
coeffDf <- df_list %>% reduce(full_join, by= c("Variable", "Models",y))
coeffDf$Variable <- gsub("normalized_", "", coeffDf$Variable)

# knitr::opts_chunk$set(fig.width=16, fig.height=18)
coeffDf2 <- coeffDf

rownames(coeffDf2) <- paste(coeffDf2$Models, coeffDf2$Variable, sep="_")
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3"] <- "3.1) CCL3-D3-Rank"

colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"

coeffDf2 <- coeffDf2[coeffDf2$Variable!="(Intercept)",]
tcoeffDf2 <- coeffDf2[, !names(coeffDf2) %in% c("Variable", "Models")]

Feature Importance using Coefficients (Slope)

Heatmap of slopes between each task and each of the predictors used to train different type of LASSO models. The far left column of the heatmap represents model type; 3 Baseline models or 3 Baseline + Demographic models. The second column indicates features (predictors) being used to train models. Slopes between each task and a particular predictor, from each model type, is shown in the body of the heatmap. Columns of the heatmap body represent tasks. Slopes above zero are shown in red and those bellow zero are shown in blue. Slope of zero indicates that there is no relationship between corresponding task and predictor in that particular model.

modelAnno = rowAnnotation(
  Models = anno_block(gp = gpar(fill = 2:4),
        labels = c("3 Baseline", "Demography + 3 Baseline"),
        labels_gp = gpar(col = "white", fontsize = 12, fontface = "bold")),
        annotation_name_side = "top")

baseAnno = rowAnnotation(labels = anno_text(coeffDf2$Variable, which = "row",
                                            gp = gpar(fontsize = 12, fontface = "bold")),
    width = unit(30, "mm"))

tcoeffDf2 <- tcoeffDf2 %>% replace(is.na(.), 0)
rowOrder <- c("3 Baselines_IgG_PT_D0", "3 Baselines_Monocytes_D0", "3 Baselines_CCL3_D0", 
             "Demography + 3 Baselines_age_at_boost", "Demography + 3 Baselines_biological_sex", 
             "Demography + 3 Baselines_infancy_vac", "Demography + 3 Baselines_IgG_PT_D0", 
             "Demography + 3 Baselines_Monocytes_D0", "Demography + 3 Baselines_CCL3_D0")

rowNames <- c("IgG_PT_D0", "Monocytes_D0", "CCL3_D0", 
             "age_at_boost", "biological_sex", 
             "infancy_vac", "IgG_PT_D0", 
             "Monocytes_D0", "CCL3_D0")

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")

maxCorr <- max(abs(as.matrix(tcoeffDf2)), na.rm = T)
col_rnorm = colorRamp2(c(-maxCorr, 0, maxCorr), c("blue", "white","red"))
h1 <- Heatmap(tcoeffDf2, name = "Slope", column_names_rot = 45,
              cell_fun = function(j, i, x, y, width, height, fill) {
                  grid.text(sprintf("%.2f", tcoeffDf2[i, j]), x, y, gp = gpar(fontsize = 14, fontface = "bold"))},
              column_names_side = "top",
              na_col = "white",
              show_row_names=F,
              row_order = rowOrder,
              row_labels = rowNames,
              column_order = colOrders,
              height = nrow(tcoeffDf2)*unit(10, "mm"),
              width = ncol(tcoeffDf2)*unit(30, "mm"),
              col = col_rnorm, show_row_dend = FALSE,
              show_column_dend = FALSE,
              row_split = factor(coeffDf2$Model,
                                 levels = c("3 Baselines",
                                            "Demography + 3 Baselines",
                                            "12 Baselines",
                                            "Demography + 12 Baselines")),
              row_title = NULL,
              column_names_gp = gpar(fontsize = 12, fontface = "bold"),
               heatmap_legend_param = list( title_gp=gpar(fontsize = 12, fontface = "bold"),
                                          labels_gp=gpar(fontsize = 12)))

png(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", sep = "")),
    width = 10, height = 6, units = 'in', res = 300 )
modelAnno + baseAnno+ h1
dev.off()
knitr::include_graphics(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", 
                                   sep = "")), dpi = 100)

Generate predictions for 2022 tasks

This would be exciting if we can Test the validity of this result using dataset 2022.

Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GGally_2.1.2          ellipse_0.4.3         GetoptLong_1.0.5     
##  [4] ComplexHeatmap_2.14.0 colorRamp2_0.1.0      formattable_0.2.1    
##  [7] here_1.0.1            glmnet_4.1-6          Matrix_1.5-1         
## [10] kableExtra_1.3.4      data.table_1.14.8     corrplot_0.92        
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] purrr_1.0.1           readr_2.1.4           tibble_3.2.0         
## [19] tidyverse_2.0.0       tidyr_1.3.0           ggplot2_3.4.1        
## [22] dplyr_1.1.0           discretefit_0.1.2     pROC_1.18.0          
## [25] verification_1.42     dtw_1.23-1            proxy_0.4-27         
## [28] CircStats_0.2-6       MASS_7.3-58.1         boot_1.3-28          
## [31] fields_14.1           viridis_0.6.2         viridisLite_0.4.1    
## [34] spam_2.9-1           
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.63.0  RColorBrewer_1.1-3  doParallel_1.0.17  
##  [4] webshot_0.5.4       httr_1.4.5          rprojroot_2.0.3    
##  [7] tools_4.2.2         bslib_0.4.2         utf8_1.2.3         
## [10] R6_2.5.1            BiocGenerics_0.44.0 colorspace_2.1-0   
## [13] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [16] compiler_4.2.2      cli_3.6.0           rvest_1.0.3        
## [19] xml2_1.3.3          sass_0.4.5          scales_1.2.1       
## [22] systemfonts_1.0.4   digest_0.6.31       rmarkdown_2.20     
## [25] svglite_2.1.1       pkgconfig_2.0.3     htmltools_0.5.4    
## [28] highr_0.10          fastmap_1.1.1       maps_3.4.1         
## [31] GlobalOptions_0.1.2 htmlwidgets_1.6.1   rlang_1.0.6        
## [34] rstudioapi_0.14     shape_1.4.6         jquerylib_0.1.4    
## [37] generics_0.1.3      jsonlite_1.8.4      magrittr_2.0.3     
## [40] dotCall64_1.0-2     S4Vectors_0.36.2    Rcpp_1.0.10        
## [43] munsell_0.5.0       fansi_1.0.4         lifecycle_1.0.3    
## [46] stringi_1.7.12      yaml_2.3.7          plyr_1.8.8         
## [49] parallel_4.2.2      crayon_1.5.2        lattice_0.20-45    
## [52] splines_4.2.2       circlize_0.4.15     hms_1.1.2          
## [55] knitr_1.42          pillar_1.8.1        rjson_0.2.21       
## [58] stats4_4.2.2        codetools_0.2-18    glue_1.6.2         
## [61] evaluate_0.20       vctrs_0.5.2         png_0.1-8          
## [64] tzdb_0.3.0          foreach_1.5.2       gtable_0.3.1       
## [67] reshape_0.8.9       clue_0.3-64         cachem_1.0.7       
## [70] xfun_0.37           survival_3.4-0      iterators_1.0.14   
## [73] IRanges_2.32.0      cluster_2.1.4       timechange_0.2.0   
## [76] ellipsis_0.3.2