Simple Computational Models for Predicting Pertussis Boost Response 2021 dataset been used to train

Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
  library(kableExtra)
  library(formattable)
  library(colorRamp2)
  library(ComplexHeatmap)
  library(GetoptLong)
  library(ellipse)
  library("ggplot2")                     
  library("GGally")
  source(here("./src/codebase.R"))
})

Setup: Variables used through out the script

Tasks:

1. Monocytes at day 1

2. CCL3 at day 3

3. IgG-PT at day 14

DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", 
               "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT")
RNA_COLS <- c("CCL3")
CELL_COLS <- c("Monocytes")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "IgG_PT_D0", "CCL3_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0",
                    "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS]

abtiterDf <- data.frame(df_source[["plasma_antibody_levels_wide"]])
abtiterDf$specimen_id <- as.numeric(rownames(abtiterDf))
abtiterDf <- data.frame(abtiterDf[, c("specimen_id", ABTITER_COLS)])


rnaDf <- data.frame(df_source[["pbmc_gene_expression_wide"]])
rnaDf$specimen_id <- as.numeric(rownames(rnaDf))
tasks_seq <- c('ENSG00000277632')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, c("specimen_id", RNA_COLS)])


cellDf <- data.frame(df_source[["pbmc_cell_frequency_wide"]])
cellDf$specimen_id <- as.numeric(rownames(cellDf))
cellDf <- data.frame(cellDf[, c("specimen_id", CELL_COLS)])


list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>% 
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex", 
                        "infancy_vac", "age_at_boost"), 
              names_from = timepoint, 
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)), 
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]
df_pivot <- data.frame(df_pivot %>%
   mutate(across(everything(),  ~ case_when(.x >=0 ~ .x))))

Setup: Dataset log transform , Ranking and fold change

Fold change for actual values been generated by dividing the actual value of each task to it’s baseline
Fold change for normalized values been generated by dividing the log transformed value of each task to it’s log transformed baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", "dataset", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "min", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)],
                           2, rankingFunction)
df <- data.frame(df)

df_2020 <- df[df$dataset=="2020_dataset", ]
df_2021 <- df[df$dataset=="2021_dataset", ]

Model construction

No actual model been developed. We use baseline values of tasks provided and demographic features such as age of participants as model. Two types of models has been developed:

A. Age of the participants

B. Baseline values of task variables

A. Modeling Approach: Age of the participants

Age of participants taken as it is has been used as predictor for a given task.

Modeling steps:

1.  Extract age of the participants
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on age at boost.

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", 
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df_2020[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

ageModel_df$Monocytes_D1_Rank <- df_2020$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df_2020$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df_2020$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df_2020$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df_2020$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df_2020$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(ageModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Age Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")

Age Based Models
Subject ID	Age	Biological Sex at Birth	Vaccine Priming Status	1.1) IgG_PT_D14_titer_Rank	1.2) IgG_PT_D14_FC_Rank	2.1) Monocytes_D1_Rank	2.2) Monocytes_D1_FC_Rank	3.1) CCL3_D3_Rank	3.2) CCL3_D3_FC_Rank
1	30.80	Female	wP	22	22	22	22	22	22
2	51.25	Female	wP	1	1	1	1	1	1
3	33.89	Female	wP	13	13	13	13	13	13
4	28.76	Male	wP	30	30	30	30	30	30
5	25.75	Male	wP	42	42	42	42	42	42
6	28.87	Female	wP	27	27	27	27	27	27
7	35.97	Female	wP	6	6	6	6	6	6
8	34.27	Female	wP	11	11	11	11	11	11
9	20.63	Male	aP	63	63	63	63	63	63
10	34.68	Female	wP	9	9	9	9	9	9
11	30.76	Female	wP	23	23	23	23	23	23
12	34.68	Male	wP	9	9	9	9	9	9
13	19.63	Male	aP	84	84	84	84	84	84
14	23.70	Male	wP	47	47	47	47	47	47
15	27.71	Male	wP	33	33	33	33	33	33
16	29.66	Female	wP	24	24	24	24	24	24
17	36.82	Female	wP	5	5	5	5	5	5
18	19.73	Female	aP	83	83	83	83	83	83
19	22.81	Male	wP	50	50	50	50	50	50
20	35.78	Female	wP	7	7	7	7	7	7
21	33.77	Male	wP	15	15	15	15	15	15
22	31.77	Female	wP	20	20	20	20	20	20
23	25.82	Female	wP	41	41	41	41	41	41
24	24.79	Female	wP	43	43	43	43	43	43
25	28.80	Female	wP	29	29	29	29	29	29
26	33.85	Female	wP	14	14	14	14	14	14
27	19.80	Female	aP	81	81	81	81	81	81
28	34.85	Male	wP	8	8	8	8	8	8
29	19.80	Male	aP	81	81	81	81	81	81
30	28.84	Female	wP	28	28	28	28	28	28
31	27.83	Female	wP	32	32	32	32	32	32
32	19.88	Male	aP	78	78	78	78	78	78
33	26.87	Male	wP	35	35	35	35	35	35
34	33.93	Female	wP	12	12	12	12	12	12
35	25.86	Male	wP	40	40	40	40	40	40
36	19.88	Female	aP	78	78	78	78	78	78
37	18.91	Female	aP	93	93	93	93	93	93
38	19.88	Female	aP	78	78	78	78	78	78
39	31.92	Female	wP	19	19	19	19	19	19
40	22.89	Female	wP	49	49	49	49	49	49
41	31.96	Male	wP	18	18	18	18	18	18
42	19.92	Female	aP	77	77	77	77	77	77
43	18.91	Female	aP	93	93	93	93	93	93
44	18.91	Female	aP	93	93	93	93	93	93
45	19.98	Female	aP	74	74	74	74	74	74
46	18.91	Female	aP	93	93	93	93	93	93
47	20.98	Female	aP	62	62	62	62	62	62
48	19.11	Female	aP	90	90	90	90	90	90
49	20.11	Female	aP	71	71	71	71	71	71
50	19.98	Female	aP	74	74	74	74	74	74
51	19.98	Male	aP	74	74	74	74	74	74
52	19.07	Male	aP	91	91	91	91	91	91
53	19.07	Female	aP	91	91	91	91	91	91
54	20.11	Female	aP	71	71	71	71	71	71
55	20.11	Female	aP	71	71	71	71	71	71
56	20.15	Female	aP	67	67	67	67	67	67
57	21.15	Female	aP	59	59	59	59	59	59
58	20.15	Female	aP	67	67	67	67	67	67
59	20.15	Female	aP	67	67	67	67	67	67
60	20.15	Male	aP	67	67	67	67	67	67

B. Modeling approach: Baseline values of task variables

Baseline value of a given task has been used as predictor for a given task.

For Example:
  For task 1.1) IgG-PT-D14-titer-Rank we use baseline value of IgG-PT as predicted values.
  Similarly for 1.2) IgG-PT-D14-FC-Rank we used baseline of IgG-PT as predictor values.

Modeling steps:

1.  Extract baseline of each task
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on the baseline of that task.

expCols <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank",  
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df_2020[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

baselineModel_df$Monocytes_D1_Rank <- df_2020$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df_2020$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df_2020$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df_2020$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df_2020$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df_2020$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(baselineModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Baseline Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")

Baseline Based Models
Subject ID	Age	Biological Sex at Birth	Vaccine Priming Status	1.1) IgG_PT_D14_titer_Rank	1.2) IgG_PT_D14_FC_Rank	2.1) Monocytes_D1_Rank	2.2) Monocytes_D1_FC_Rank	3.1) CCL3_D3_Rank	3.2) CCL3_D3_FC_Rank
1	30.80	Female	wP	20	20	NA	NA	21	21
2	51.25	Female	wP	NA	NA	NA	NA	NA	NA
3	33.89	Female	wP	40	40	NA	NA	36	36
4	28.76	Male	wP	35	35	53	53	63	63
5	25.75	Male	wP	48	48	NA	NA	42	42
6	28.87	Female	wP	29	29	1	1	44	44
7	35.97	Female	wP	50	50	NA	NA	NA	NA
8	34.27	Female	wP	NA	NA	NA	NA	NA	NA
9	20.63	Male	aP	87	87	NA	NA	25	25
10	34.68	Female	wP	89	89	NA	NA	48	48
11	30.76	Female	wP	1	1	52	52	50	50
12	34.68	Male	wP	49	49	NA	NA	NA	NA
13	19.63	Male	aP	68	68	NA	NA	14	14
14	23.70	Male	wP	11	11	NA	NA	NA	NA
15	27.71	Male	wP	46	46	46	46	30	30
16	29.66	Female	wP	53	53	NA	NA	NA	NA
17	36.82	Female	wP	28	28	29	29	26	26
18	19.73	Female	aP	23	23	NA	NA	67	67
19	22.81	Male	wP	27	27	NA	NA	63	63
20	35.78	Female	wP	38	38	10	10	23	23
21	33.77	Male	wP	77	77	5	5	37	37
22	31.77	Female	wP	52	52	NA	NA	69	69
23	25.82	Female	wP	39	39	NA	NA	57	57
24	24.79	Female	wP	82	82	NA	NA	55	55
25	28.80	Female	wP	83	83	NA	NA	44	44
26	33.85	Female	wP	71	71	30	30	63	63
27	19.80	Female	aP	72	72	NA	NA	44	44
28	34.85	Male	wP	43	43	NA	NA	NA	NA
29	19.80	Male	aP	60	60	13	13	33	33
30	28.84	Female	wP	66	66	NA	NA	NA	NA
31	27.83	Female	wP	78	78	51	51	19	19
32	19.88	Male	aP	10	10	NA	NA	18	18
33	26.87	Male	wP	88	88	25	25	16	16
34	33.93	Female	wP	26	26	NA	NA	NA	NA
35	25.86	Male	wP	58	58	NA	NA	37	37
36	19.88	Female	aP	73	73	26	26	48	48
37	18.91	Female	aP	NA	NA	NA	NA	NA	NA
38	19.88	Female	aP	41	41	NA	NA	72	72
39	31.92	Female	wP	62	62	NA	NA	NA	NA
40	22.89	Female	wP	34	34	NA	NA	NA	NA
41	31.96	Male	wP	21	21	NA	NA	NA	NA
42	19.92	Female	aP	31	31	NA	NA	17	17
43	18.91	Female	aP	81	81	NA	NA	29	29
44	18.91	Female	aP	7	7	4	4	57	57
45	19.98	Female	aP	75	75	38	38	NA	NA
46	18.91	Female	aP	19	19	7	7	NA	NA
47	20.98	Female	aP	9	9	50	50	3	3
48	19.11	Female	aP	90	90	22	22	6	6
49	20.11	Female	aP	36	36	47	47	NA	NA
50	19.98	Female	aP	69	69	NA	NA	1	1
51	19.98	Male	aP	61	61	NA	NA	NA	NA
52	19.07	Male	aP	44	44	15	15	2	2
53	19.07	Female	aP	16	16	NA	NA	4	4
54	20.11	Female	aP	25	25	NA	NA	NA	NA
55	20.11	Female	aP	42	42	33	33	NA	NA
56	20.15	Female	aP	18	18	NA	NA	NA	NA
57	21.15	Female	aP	55	55	NA	NA	NA	NA
58	20.15	Female	aP	14	14	NA	NA	NA	NA
59	20.15	Female	aP	2	2	NA	NA	NA	NA
60	20.15	Male	aP	86	86	NA	NA	NA	NA

Model Training and Evaluation

Models been developed and trained using different predictors or combination of predictors. Two types of models been trained for each task:

1) Trivial Models

2) Multi-predictor Models

Spearman correlation coefficient has been used as evaluation matrices.

1) Training Approach : Trivial Model

Two models has been constructed for each task and the fold change of each task using Spearman Correlation Coefficient.

1. Age of Participants

2. Baseline values of task variables

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]

corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df_2021[,x], df_2021[,y], 
                                                  method="spearman", 
                                                  use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df_2021[,ranked_cols]), 
                                                      method="spearman", 
                                                      conf.level = 0.95, 
                                                      na.rm = TRUE)})$p

corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Model Evaluation using Spearman Rank Correlation Coefficient

Heatmap of Spearman correlation coefficient for each task against age at boost or the baseline of the task. Columns of the Heatmap indicating the task and the task fold change, and rows are indicating the model type; age base or baseline base models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))

2) Training Approach : Multi-predictors LASSO Models

For each task 4 models have been developed using combination of predictors as follow:
1. 3 Baseline: Baseline of all 3 tasks combined are being used to train the models for each of the tasks.
2. 3 Baseline + Demographic: Baseline of all three tasks combined with demographic data including; Age, Biological Sex and Vaccine type are being used to train models for each of the tasks.

source(here("./src/codebase.R"))
set.seed(1)

gs_2020 <- df_2020
# 
gs_2020$infancy_vac <- as.numeric(factor(gs_2020$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2020$biological_sex <- as.numeric(factor(gs_2020$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2020[,"Monocytes_D1_FC"] <- gs_2020[, "Monocytes_D1"] / gs_2020[, "Monocytes_D0"]
gs_2020[,"CCL3_D3_FC"] <- gs_2020[, "CCL3_D3"] / gs_2020[, "CCL3_D0"]
gs_2020[,"IgG_PT_D14_FC"] <- gs_2020[, "IgG_PT_D14"] / gs_2020[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   gs_2020[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2020[, ltCol]+1)
}

gs_2020[,"normalized_CCL3_D3_FC"] <- gs_2020[, "normalized_CCL3_D3"] / gs_2020[, "normalized_CCL3_D0"]
gs_2020[,"normalized_IgG_PT_D14_FC"] <- gs_2020[, "normalized_IgG_PT_D14"] / gs_2020[, "normalized_IgG_PT_D0"]


gs_2021 <- df_2021

gs_2021$infancy_vac <- as.numeric(factor(gs_2021$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2021$biological_sex <- as.numeric(factor(gs_2021$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2021[,"Monocytes_D1_FC"] <- gs_2021[, "Monocytes_D1"] / gs_2021[, "Monocytes_D0"]
gs_2021[,"CCL3_D3_FC"] <- gs_2021[, "CCL3_D3"] / gs_2021[, "CCL3_D0"]
gs_2021[,"IgG_PT_D14_FC"] <- gs_2021[, "IgG_PT_D14"] / gs_2021[, "IgG_PT_D0"]

for (ltCol in LOG_TRANS_COLS){
   gs_2021[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2021[, ltCol]+1)
}

gs_2021[,"normalized_CCL3_D3_FC"] <- gs_2021[, "normalized_CCL3_D3"] / gs_2021[, "normalized_CCL3_D0"]
gs_2021[,"normalized_IgG_PT_D14_FC"] <- gs_2021[, "normalized_IgG_PT_D14"] / gs_2021[, "normalized_IgG_PT_D0"]

Test model quality

Analyze correlation between predicted and true values in leave-one-out cross-validation

source(here("./src/codebase.R"))
corrDf <- data.frame()
pvalDf <- data.frame()

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "Monocytes_D1_FC", "normalized_CCL3_D3_FC", "normalized_IgG_PT_D14_FC")

res <- lassoFunction(gs_2021, corrDf, pvalDf, "3Baseline", y, x)
corrDf <- data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])


x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "Monocytes_D1_FC", "normalized_CCL3_D3_FC", "normalized_IgG_PT_D14_FC")

res <- lassoFunction(gs_2021, corrDf, pvalDf, "Demography_3Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Feature Selection

For each model, we assessed which features contribute to non-zero coefficients (slopes). For this purpose the minimum lambda, that gives minimum mean cross-validated error, been used for prediction.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
source(here("./src/codebase.R"))
set.seed(1)
coefList <- list()
x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2021, y, x)
res_base3 <- coefList %>% reduce(full_join, by="Variable")
a <- res_base3
# colnames(res_base3) <- c("Variable", y)
res_base3$Models <- "3 Baselines"

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2020, y, x)
res_base3D <- coefList %>% reduce(full_join, by="Variable")
# colnames(res_base3D) <- c("Variable", y)
res_base3D$Models <- "Demography + 3 Baselines"

df_list <- list(res_base3, res_base3D)
coeffDf <- df_list %>% reduce(full_join, by= c("Variable", "Models",y))
coeffDf$Variable <- gsub("normalized_", "", coeffDf$Variable)

coeffDf2 <- coeffDf

rownames(coeffDf2) <- paste(coeffDf2$Models, coeffDf2$Variable, sep="_")
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3"] <- "3.1) CCL3-D3-Rank"

colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"

coeffDf2 <- coeffDf2[coeffDf2$Variable!="(Intercept)",]

tcoeffDf2 <- coeffDf2[, !names(coeffDf2) %in% c("Variable", "Models")]

Feature Importance using Coefficients (Slope)

Heatmap of slopes between each task and each of the predictors used to train LASSO model. The far left column of the heatmap represents model type; 3 Baseline models or 3 Baseline + Demographic models. The second column indicates features (predictors) being used to train models. Slopes between each task and a particular predictor, from each model type, is shown in the body of the heatmap. Columns of the heatmap body represent tasks. Slopes above zero are shown in red and those bellow zero are shown in blue. Slope of zero indicates that there is no relationship between coresponding task and predictor in that particular model.

modelAnno = rowAnnotation(
  Models = anno_block(gp = gpar(fill = 2:4),
        labels = c("3 Baseline", "Demography + 3 Baseline"),
        labels_gp = gpar(col = "white", fontsize = 10, fontface = "bold")),
        annotation_name_side = "top")

baseAnno = rowAnnotation(labels = anno_text(coeffDf2$Variable, which = "row",
                                            gp = gpar(fontsize = 12, fontface = "bold")),
                         width = unit(30, "mm"))

tcoeffDf2 <- tcoeffDf2 %>% replace(is.na(.), 0)

rowOrder <- c("3 Baselines_IgG_PT_D0", "3 Baselines_Monocytes_D0", "3 Baselines_CCL3_D0", 
             "Demography + 3 Baselines_biological_sex",
             "Demography + 3 Baselines_infancy_vac", "Demography + 3 Baselines_IgG_PT_D0", 
             "Demography + 3 Baselines_Monocytes_D0", "Demography + 3 Baselines_CCL3_D0")

rowNames <- c("IgG_PT_D0", "Monocytes_D0", "CCL3_D0", 
             "biological_sex",
             "infancy_vac", "IgG_PT_D0", 
             "Monocytes_D0", "CCL3_D0")

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")

maxCorr <- max(abs(as.matrix(tcoeffDf2)), na.rm = T)
col_rnorm = colorRamp2(c(-maxCorr, 0, maxCorr), c("blue", "white","red"))
h1 <- Heatmap(tcoeffDf2, name = "Coefficient", column_names_rot = 45,
              cell_fun = function(j, i, x, y, width, height, fill) {
                  grid.text(sprintf("%.2f", tcoeffDf2[i, j]), x, y, gp = gpar(fontsize = 14, fontface = "bold"))},
              column_names_side = "top",
              na_col = "white",
              show_row_names=F,
              row_order = rowOrder,
              row_labels = rowNames,
              column_order = colOrders,
              height = nrow(tcoeffDf2)*unit(10, "mm"),
              width = ncol(tcoeffDf2)*unit(30, "mm"),
              col = col_rnorm, show_row_dend = FALSE,
              show_column_dend = FALSE,
              row_split = factor(coeffDf2$Model,
                                 levels = c("3 Baselines",
                                            "Demography + 3 Baselines",
                                            "12 Baselines",
                                            "Demography + 12 Baselines")),
              row_title = NULL,
              column_names_gp = gpar(fontsize = 12, fontface = "bold"),
               heatmap_legend_param = list( title_gp=gpar(fontsize = 12, fontface = "bold"),
                                          labels_gp=gpar(fontsize = 12)))

png(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", sep = "")),
    width = 10, height = 6, units = 'in', res = 300 )
modelAnno + baseAnno+ h1
dev.off()

knitr::include_graphics(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", 
                                   sep = "")), dpi = 100)

Generate predictions for 2020 tasks

source(here("./src/codebase.R"))
corrDf <- data.frame()
pvalDf <- data.frame()

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "normalized_IgG_PT_D14_FC")

res <- lassoFunction_test(gs_2021, gs_2020, corrDf, pvalDf, "3Baseline", y, x)
corrDf <- data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14",
       "normalized_IgG_PT_D14_FC")

res <- lassoFunction_test(gs_2021, gs_2020, corrDf, pvalDf, "Demography_3Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "CCL3_D3",
              "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
# colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
# colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
# colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
# colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "3.1) CCL3-D3-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         na.label=" ", 
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Session Info

sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GGally_2.1.2          ellipse_0.4.3         GetoptLong_1.0.5     
##  [4] ComplexHeatmap_2.14.0 colorRamp2_0.1.0      formattable_0.2.1    
##  [7] here_1.0.1            glmnet_4.1-6          Matrix_1.5-1         
## [10] kableExtra_1.3.4      data.table_1.14.8     corrplot_0.92        
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] purrr_1.0.1           readr_2.1.4           tibble_3.2.0         
## [19] tidyverse_2.0.0       tidyr_1.3.0           ggplot2_3.4.1        
## [22] dplyr_1.1.0           discretefit_0.1.2     pROC_1.18.0          
## [25] verification_1.42     dtw_1.23-1            proxy_0.4-27         
## [28] CircStats_0.2-6       MASS_7.3-58.1         boot_1.3-28          
## [31] fields_14.1           viridis_0.6.2         viridisLite_0.4.1    
## [34] spam_2.9-1           
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.63.0  RColorBrewer_1.1-3  doParallel_1.0.17  
##  [4] webshot_0.5.4       httr_1.4.5          rprojroot_2.0.3    
##  [7] tools_4.2.2         bslib_0.4.2         utf8_1.2.3         
## [10] R6_2.5.1            BiocGenerics_0.44.0 colorspace_2.1-0   
## [13] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [16] compiler_4.2.2      cli_3.6.0           rvest_1.0.3        
## [19] xml2_1.3.3          sass_0.4.5          scales_1.2.1       
## [22] systemfonts_1.0.4   digest_0.6.31       rmarkdown_2.20     
## [25] svglite_2.1.1       pkgconfig_2.0.3     htmltools_0.5.4    
## [28] highr_0.10          fastmap_1.1.1       maps_3.4.1         
## [31] GlobalOptions_0.1.2 htmlwidgets_1.6.1   rlang_1.0.6        
## [34] rstudioapi_0.14     shape_1.4.6         jquerylib_0.1.4    
## [37] generics_0.1.3      jsonlite_1.8.4      magrittr_2.0.3     
## [40] dotCall64_1.0-2     S4Vectors_0.36.2    Rcpp_1.0.10        
## [43] munsell_0.5.0       fansi_1.0.4         lifecycle_1.0.3    
## [46] stringi_1.7.12      yaml_2.3.7          plyr_1.8.8         
## [49] parallel_4.2.2      crayon_1.5.2        lattice_0.20-45    
## [52] splines_4.2.2       circlize_0.4.15     hms_1.1.2          
## [55] knitr_1.42          pillar_1.8.1        rjson_0.2.21       
## [58] stats4_4.2.2        codetools_0.2-18    glue_1.6.2         
## [61] evaluate_0.20       vctrs_0.5.2         png_0.1-8          
## [64] tzdb_0.3.0          foreach_1.5.2       gtable_0.3.1       
## [67] reshape_0.8.9       clue_0.3-64         cachem_1.0.7       
## [70] xfun_0.37           survival_3.4-0      iterators_1.0.14   
## [73] IRanges_2.32.0      cluster_2.1.4       timechange_0.2.0   
## [76] ellipsis_0.3.2

Simple Computational Models for Predicting Pertussis Boost Response 2021 dataset been used to train

2023-09-05

Setup: Load libraries and helper functions

Setup: Variables used through out the script

Tasks:

Setup: Load dataset, combine different experiments

Setup: Dataset log transform , Ranking and fold change

Model construction

A. Modeling Approach: Age of the participants

Modeling steps:

B. Modeling approach: Baseline values of task variables

Modeling steps:

Model Training and Evaluation

1) Training Approach : Trivial Model

Model Evaluation using Spearman Rank Correlation Coefficient

2) Training Approach : Multi-predictors LASSO Models

Test model quality

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Feature Selection

Feature Importance using Coefficients (Slope)

Generate predictions for 2020 tasks

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Session Info