Simple Computational Models for Predicting Pertussis Boost Response

Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
  library(kableExtra)
  library(formattable)
  library(colorRamp2)
  library(ComplexHeatmap)
  library(GetoptLong)
  library(ellipse)
  library("ggplot2")                     
  library("GGally")
  source(here("./src/codebase.R"))
})

Setup: Variables used throughout the script

Tasks:

1. Monocytes at day 1

2. CCL3 at day 3

3. IgG-PT at day 14

DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", 
               "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT")
RNA_COLS <- c("CCL3")
CELL_COLS <- c("Monocytes")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "IgG_PT_D0", "CCL3_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0",
                    "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

df_source <- readRDS(here("./data/master_processed_training_data.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS] %>%
  data.frame()

abtiterDf <- df_source[["abtiter_wide"]]$batchCorrected_data %>%
  t() %>%
  data.frame() 

abtiterDf$specimen_id <- as.numeric(rownames(abtiterDf))
abtiterDf <- data.frame(abtiterDf[, c("specimen_id", ABTITER_COLS)])


rnaDf <- df_source[["pbmc_gene_expression"]]$batchCorrected_data %>%
  t() %>%
  data.frame() 

rnaDf$specimen_id <- as.numeric(rownames(rnaDf))
tasks_seq <- c('ENSG00000277632')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, c("specimen_id", RNA_COLS)])


cellDf <- df_source[["pbmc_cell_frequency"]]$batchCorrected_data %>%
  t() %>%
  data.frame()  

cellDf$specimen_id <- as.numeric(rownames(cellDf))
cellDf <- data.frame(cellDf[, c("specimen_id", CELL_COLS)])

list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>%
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex",
                        "infancy_vac", "age_at_boost"),
              names_from = timepoint,
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)),
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]
df_pivot <- data.frame(df_pivot %>%
   mutate(across(everything(),  ~ case_when(.x >=0 ~ .x))))

Setup: Dataset fold change and Ranking

Fold change for actual values have been generated by dividing the actual value of each task to it’s baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", "dataset", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "min", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)],
                           2, rankingFunction)
df <- data.frame(df)

Model construction

No actual model has been developed. We use baseline values of tasks provided and demographic features such as age of participants as model. Two types of models have been developed:

A. Age of the participants

B. Baseline values of task variables

A. Modeling Approach: Age of the participants

Age of participants taken, as it is, has been used as predictor for a given task.

Modeling steps:

1.  Extract age of the participants
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follows:

Possible rank of predicted values for each task when model is trained based on age at boost.

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", 
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

ageModel_df$Monocytes_D1_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(ageModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Age Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")

Age Based Models
Subject ID	Age	Biological Sex at Birth	Vaccine Priming Status	1.1) IgG_PT_D14_titer_Rank	1.2) IgG_PT_D14_FC_Rank	2.1) Monocytes_D1_Rank	2.2) Monocytes_D1_FC_Rank	3.1) CCL3_D3_Rank	3.2) CCL3_D3_FC_Rank
1	30.80	Female	wP	22	22	22	22	22	22
2	51.25	Female	wP	1	1	1	1	1	1
3	33.89	Female	wP	13	13	13	13	13	13
4	28.76	Male	wP	30	30	30	30	30	30
5	25.75	Male	wP	42	42	42	42	42	42
6	28.87	Female	wP	27	27	27	27	27	27
7	35.97	Female	wP	6	6	6	6	6	6
8	34.27	Female	wP	11	11	11	11	11	11
9	20.63	Male	aP	63	63	63	63	63	63
10	34.68	Female	wP	9	9	9	9	9	9
11	30.76	Female	wP	23	23	23	23	23	23
12	34.68	Male	wP	9	9	9	9	9	9
13	19.63	Male	aP	84	84	84	84	84	84
14	23.70	Male	wP	47	47	47	47	47	47
15	27.71	Male	wP	33	33	33	33	33	33
16	29.66	Female	wP	24	24	24	24	24	24
17	36.82	Female	wP	5	5	5	5	5	5
18	19.73	Female	aP	83	83	83	83	83	83
19	22.81	Male	wP	50	50	50	50	50	50
20	35.78	Female	wP	7	7	7	7	7	7
21	33.77	Male	wP	15	15	15	15	15	15
22	31.77	Female	wP	20	20	20	20	20	20
23	25.82	Female	wP	41	41	41	41	41	41
24	24.79	Female	wP	43	43	43	43	43	43
25	28.80	Female	wP	29	29	29	29	29	29
26	33.85	Female	wP	14	14	14	14	14	14
27	19.80	Female	aP	81	81	81	81	81	81
28	34.85	Male	wP	8	8	8	8	8	8
29	19.80	Male	aP	81	81	81	81	81	81
30	28.84	Female	wP	28	28	28	28	28	28
31	27.83	Female	wP	32	32	32	32	32	32
32	19.88	Male	aP	78	78	78	78	78	78
33	26.87	Male	wP	35	35	35	35	35	35
34	33.93	Female	wP	12	12	12	12	12	12
35	25.86	Male	wP	40	40	40	40	40	40
36	19.88	Female	aP	78	78	78	78	78	78
37	18.91	Female	aP	93	93	93	93	93	93
38	19.88	Female	aP	78	78	78	78	78	78
39	31.92	Female	wP	19	19	19	19	19	19
40	22.89	Female	wP	49	49	49	49	49	49
41	31.96	Male	wP	18	18	18	18	18	18
42	19.92	Female	aP	77	77	77	77	77	77
43	18.91	Female	aP	93	93	93	93	93	93
44	18.91	Female	aP	93	93	93	93	93	93
45	19.98	Female	aP	74	74	74	74	74	74
46	18.91	Female	aP	93	93	93	93	93	93
47	20.98	Female	aP	62	62	62	62	62	62
48	19.11	Female	aP	90	90	90	90	90	90
49	20.11	Female	aP	71	71	71	71	71	71
50	19.98	Female	aP	74	74	74	74	74	74
51	19.98	Male	aP	74	74	74	74	74	74
52	19.07	Male	aP	91	91	91	91	91	91
53	19.07	Female	aP	91	91	91	91	91	91
54	20.11	Female	aP	71	71	71	71	71	71
55	20.11	Female	aP	71	71	71	71	71	71
56	20.15	Female	aP	67	67	67	67	67	67
57	21.15	Female	aP	59	59	59	59	59	59
58	20.15	Female	aP	67	67	67	67	67	67
59	20.15	Female	aP	67	67	67	67	67	67
60	20.15	Male	aP	67	67	67	67	67	67
61	32.38	Female	wP	16	16	16	16	16	16
62	25.99	Female	wP	38	38	38	38	38	38
63	23.98	Female	wP	46	46	46	46	46	46
64	25.99	Male	wP	38	38	38	38	38	38
65	29.02	Male	wP	26	26	26	26	26	26
66	43.07	Female	wP	4	4	4	4	4	4
67	47.24	Female	wP	2	2	2	2	2	2
68	47.24	Male	wP	2	2	2	2	2	2
69	29.17	Female	wP	25	25	25	25	25	25
70	21.15	Male	aP	59	59	59	59	59	59
71	21.15	Female	aP	59	59	59	59	59	59
72	28.25	Female	wP	31	31	31	31	31	31
73	24.23	Female	wP	44	44	44	44	44	44
74	24.23	Female	wP	44	44	44	44	44	44
75	21.22	Female	aP	57	57	57	57	57	57
76	21.22	Female	aP	57	57	57	57	57	57
77	31.32	Male	wP	21	21	21	21	21	21
78	26.30	Female	wP	36	36	36	36	36	36
79	32.32	Male	wP	17	17	17	17	17	17
80	27.30	Female	wP	34	34	34	34	34	34
81	26.30	Male	wP	36	36	36	36	36	36
82	21.28	Female	aP	56	56	56	56	56	56
83	20.34	Female	aP	66	66	66	66	66	66
84	22.34	Female	aP	52	52	52	52	52	52
85	19.39	Female	aP	87	87	87	87	87	87
86	21.40	Female	aP	55	55	55	55	55	55
87	19.39	Male	aP	87	87	87	87	87	87
88	19.39	Male	aP	87	87	87	87	87	87
89	22.49	Female	aP	51	51	51	51	51	51
90	20.49	Female	aP	65	65	65	65	65	65
91	21.49	Male	aP	54	54	54	54	54	54
92	19.54	Female	aP	85	85	85	85	85	85
93	23.56	Female	aP	48	48	48	48	48	48
94	20.55	Male	aP	64	64	64	64	64	64
95	21.55	Female	aP	53	53	53	53	53	53
96	19.54	Male	aP	85	85	85	85	85	85

B. Modeling approach: Baseline values of task variables

Baseline value of a given task has been used as predictor for a given task.

For Example:
  For task 1.1) IgG-PT-D14-titer-Rank we use baseline value of IgG-PT as predicted values.
  Similarly for 1.2) IgG-PT-D14-FC-Rank we used baseline of IgG-PT as predictor values.

Modeling steps:

1.  Extract baseline of each task
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on the baseline of that task.

expCols <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank",  
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

baselineModel_df$Monocytes_D1_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(baselineModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Baseline Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")

Baseline Based Models
Subject ID	Age	Biological Sex at Birth	Vaccine Priming Status	1.1) IgG_PT_D14_titer_Rank	1.2) IgG_PT_D14_FC_Rank	2.1) Monocytes_D1_Rank	2.2) Monocytes_D1_FC_Rank	3.1) CCL3_D3_Rank	3.2) CCL3_D3_FC_Rank
1	30.80	Female	wP	20	20	NA	NA	23	23
2	51.25	Female	wP	NA	NA	NA	NA	NA	NA
3	33.89	Female	wP	40	40	NA	NA	40	40
4	28.76	Male	wP	35	35	53	53	63	63
5	25.75	Male	wP	48	48	NA	NA	48	48
6	28.87	Female	wP	31	31	1	1	52	52
7	35.97	Female	wP	50	50	NA	NA	NA	NA
8	34.27	Female	wP	NA	NA	NA	NA	NA	NA
9	20.63	Male	aP	85	85	NA	NA	28	28
10	34.68	Female	wP	89	89	NA	NA	52	52
11	30.76	Female	wP	1	1	52	52	57	57
12	34.68	Male	wP	49	49	NA	NA	NA	NA
13	19.63	Male	aP	68	68	NA	NA	15	15
14	23.70	Male	wP	11	11	NA	NA	NA	NA
15	27.71	Male	wP	47	47	43	43	34	34
16	29.66	Female	wP	53	53	NA	NA	NA	NA
17	36.82	Female	wP	29	29	27	27	28	28
18	19.73	Female	aP	23	23	NA	NA	67	67
19	22.81	Male	wP	28	28	NA	NA	67	67
20	35.78	Female	wP	38	38	10	10	24	24
21	33.77	Male	wP	76	76	6	6	42	42
22	31.77	Female	wP	52	52	NA	NA	71	71
23	25.82	Female	wP	39	39	NA	NA	63	63
24	24.79	Female	wP	81	81	NA	NA	60	60
25	28.80	Female	wP	82	82	NA	NA	48	48
26	33.85	Female	wP	71	71	28	28	67	67
27	19.80	Female	aP	72	72	NA	NA	48	48
28	34.85	Male	wP	43	43	NA	NA	NA	NA
29	19.80	Male	aP	60	60	14	14	35	35
30	28.84	Female	wP	64	64	NA	NA	NA	NA
31	27.83	Female	wP	77	77	49	49	21	21
32	19.88	Male	aP	10	10	NA	NA	18	18
33	26.87	Male	wP	88	88	23	23	16	16
34	33.93	Female	wP	26	26	NA	NA	NA	NA
35	25.86	Male	wP	58	58	NA	NA	42	42
36	19.88	Female	aP	73	73	24	24	54	54
37	18.91	Female	aP	NA	NA	NA	NA	NA	NA
38	19.88	Female	aP	41	41	NA	NA	72	72
39	31.92	Female	wP	62	62	NA	NA	NA	NA
40	22.89	Female	wP	34	34	NA	NA	NA	NA
41	31.96	Male	wP	21	21	NA	NA	NA	NA
42	19.92	Female	aP	32	32	NA	NA	17	17
43	18.91	Female	aP	80	80	NA	NA	33	33
44	18.91	Female	aP	8	8	4	4	63	63
45	19.98	Female	aP	74	74	35	35	NA	NA
46	18.91	Female	aP	19	19	8	8	NA	NA
47	20.98	Female	aP	9	9	48	48	3	3
48	19.11	Female	aP	90	90	22	22	6	6
49	20.11	Female	aP	36	36	44	44	NA	NA
50	19.98	Female	aP	69	69	NA	NA	1	1
51	19.98	Male	aP	61	61	NA	NA	NA	NA
52	19.07	Male	aP	44	44	16	16	2	2
53	19.07	Female	aP	16	16	NA	NA	4	4
54	20.11	Female	aP	25	25	NA	NA	NA	NA
55	20.11	Female	aP	42	42	31	31	NA	NA
56	20.15	Female	aP	18	18	NA	NA	NA	NA
57	21.15	Female	aP	55	55	NA	NA	NA	NA
58	20.15	Female	aP	15	15	NA	NA	NA	NA
59	20.15	Female	aP	4	4	NA	NA	NA	NA
60	20.15	Male	aP	84	84	NA	NA	NA	NA
61	32.38	Female	wP	70	70	NA	NA	37	37
62	25.99	Female	wP	13	13	NA	NA	13	13
63	23.98	Female	wP	5	5	40	40	27	27
64	25.99	Male	wP	51	51	51	51	20	20
65	29.02	Male	wP	6	6	37	37	10	10
66	43.07	Female	wP	3	3	50	50	26	26
67	47.24	Female	wP	2	2	17	17	37	37
68	47.24	Male	wP	12	12	46	46	35	35
69	29.17	Female	wP	22	22	39	39	22	22
70	21.15	Male	aP	79	79	5	5	45	45
71	21.15	Female	aP	33	33	42	42	19	19
72	28.25	Female	wP	7	7	20	20	42	42
73	24.23	Female	wP	14	14	30	30	39	39
74	24.23	Female	wP	66	66	3	3	54	54
75	21.22	Female	aP	86	86	NA	NA	47	47
76	21.22	Female	aP	17	17	33	33	57	57
77	31.32	Male	wP	56	56	18	18	11	11
78	26.30	Female	wP	37	37	11	11	51	51
79	32.32	Male	wP	27	27	2	2	9	9
80	27.30	Female	wP	75	75	21	21	8	8
81	26.30	Male	wP	54	54	12	12	12	12
82	21.28	Female	aP	NA	NA	36	36	5	5
83	20.34	Female	aP	63	63	19	19	67	67
84	22.34	Female	aP	59	59	26	26	31	31
85	19.39	Female	aP	83	83	40	40	32	32
86	21.40	Female	aP	46	46	25	25	28	28
87	19.39	Male	aP	NA	NA	32	32	14	14
88	19.39	Male	aP	NA	NA	38	38	7	7
89	22.49	Female	aP	87	87	45	45	24	24
90	20.49	Female	aP	78	78	34	34	40	40
91	21.49	Male	aP	57	57	7	7	60	60
92	19.54	Female	aP	67	67	15	15	54	54
93	23.56	Female	aP	45	45	47	47	46	46
94	20.55	Male	aP	30	30	9	9	63	63
95	21.55	Female	aP	24	24	29	29	57	57
96	19.54	Male	aP	65	65	13	13	62	62

Model Training and Evaluation

Models have been developed and trained using different predictors or combination of predictors. Two types of models have been trained for each task:

1) Trivial Models

2) Multi-predictor Models

Spearman correlation coefficient has been used as evaluation matrics.

1) Training Approach : Trivial Model

Two models have been constructed for each task and the fold change of each task using Spearman Correlation Coefficient.

1. Age of Participants

2. Baseline values of task variables

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]

corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df[,x], df[,y], 
                                                  method="spearman", 
                                                  use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df[,ranked_cols]), 
                                                      method="spearman", 
                                                      conf.level = 0.95, 
                                                      na.rm = TRUE)})$p

corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Model Evaluation using Spearman Rank Correlation Coefficient

Heatmap of Spearman correlation coefficient for each task against age at boost or the baseline of the task. Columns of the Heatmap indicating the task and the task fold change, and rows are indicating the model type; age base or baseline base models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))

2) Training Approach : Multi-predictors LASSO Models

For each task two models have been developed using combination of predictors as follow:
1. 3 Baseline: Baseline of all three tasks combined are being used to train the models for each of the tasks.
2. 3 Baseline + Demographic: Baseline of all three tasks combined with demographic data including; Age, Biological Sex and Vaccine type are being used to train models for each of the tasks.

source(here("./src/codebase.R"))
set.seed(1)

df1 <- df

df1$infancy_vac <- as.numeric(factor(df1$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
df1$biological_sex <- as.numeric(factor(df1$biological_sex,
                                        labels = c(1,2), exclude = NA))

df1[,"Monocytes_D1_FC"] <- df1[, "Monocytes_D1"] / df1[, "Monocytes_D0"]
df1[,"CCL3_D3_FC"] <- df1[, "CCL3_D3"] / df1[, "CCL3_D0"]
df1[,"IgG_PT_D14_FC"] <- df1[, "IgG_PT_D14"] / df1[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   df1[, paste("normalized", ltCol, sep = "_")] <- log2(df1[, ltCol]+1)
}

df1[,"normalized_CCL3_D3_FC"] <- df1[, "normalized_CCL3_D3"] / df1[, "normalized_CCL3_D0"]
df1[,"normalized_IgG_PT_D14_FC"] <- df1[, "normalized_IgG_PT_D14"] / df1[, "normalized_IgG_PT_D0"]

Validate model quality

Analyze Spearman Correlation Coefficient between predicted and true values in leave-one-out cross-validation.

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicate the task and the task fold change, and rows indicate the model type; 3 Baseline or Demographic + 3 Baseline models. Crosses indicate insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)


# pvalDf[corrDf < 0] <- 1

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Feature Selection

For each model, we assessed which features contribute to non-zero coefficients (slopes). For this purpose the minimum lambda, that gives minimum mean cross-validated error, has been used.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
set.seed(1)

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(df1, y, x)
res_base3 <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3) <- c("Variable", y)
res_base3$Models <- "3 Baselines"

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(df1, y, x)
res_base3D <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3D) <- c("Variable", y)
res_base3D$Models <- "Demography + 3 Baselines"

df_list <- list(res_base3, res_base3D)
coeffDf <- df_list %>% reduce(full_join, by= c("Variable", "Models",y))
coeffDf$Variable <- gsub("normalized_", "", coeffDf$Variable)

# knitr::opts_chunk$set(fig.width=16, fig.height=18)
coeffDf2 <- coeffDf

rownames(coeffDf2) <- paste(coeffDf2$Models, coeffDf2$Variable, sep="_")
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3"] <- "3.1) CCL3-D3-Rank"

colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"

coeffDf2 <- coeffDf2[coeffDf2$Variable!="(Intercept)",]
tcoeffDf2 <- coeffDf2[, !names(coeffDf2) %in% c("Variable", "Models")]

Feature Importance using Coefficients (Slope)

Heatmap of slopes between each task and each of the predictors used to train different type of LASSO models. The far left column of the heatmap represents model type; 3 Baseline models or 3 Baseline + Demographic models. The second column indicates features (predictors) being used to train models. Slopes between each task and a particular predictor, from each model type, is shown in the body of the heatmap. Columns of the heatmap body represent tasks. Slopes above zero are shown in red and those bellow zero are shown in blue. Slope of zero indicates that there is no relationship between corresponding task and predictor in that particular model.

modelAnno = rowAnnotation(
  Models = anno_block(gp = gpar(fill = 2:4),
        labels = c("3 Baseline", "Demography + 3 Baseline"),
        labels_gp = gpar(col = "white", fontsize = 12, fontface = "bold")),
        annotation_name_side = "top")

baseAnno = rowAnnotation(labels = anno_text(coeffDf2$Variable, which = "row",
                                            gp = gpar(fontsize = 12, fontface = "bold")),
    width = unit(30, "mm"))

tcoeffDf2 <- tcoeffDf2 %>% replace(is.na(.), 0)
rowOrder <- c("3 Baselines_IgG_PT_D0", "3 Baselines_Monocytes_D0", "3 Baselines_CCL3_D0", 
             "Demography + 3 Baselines_age_at_boost", "Demography + 3 Baselines_biological_sex", 
             "Demography + 3 Baselines_infancy_vac", "Demography + 3 Baselines_IgG_PT_D0", 
             "Demography + 3 Baselines_Monocytes_D0", "Demography + 3 Baselines_CCL3_D0")

rowNames <- c("IgG_PT_D0", "Monocytes_D0", "CCL3_D0", 
             "age_at_boost", "biological_sex", 
             "infancy_vac", "IgG_PT_D0", 
             "Monocytes_D0", "CCL3_D0")

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")

maxCorr <- max(abs(as.matrix(tcoeffDf2)), na.rm = T)
col_rnorm = colorRamp2(c(-maxCorr, 0, maxCorr), c("blue", "white","red"))
h1 <- Heatmap(tcoeffDf2, name = "Slope", column_names_rot = 45,
              cell_fun = function(j, i, x, y, width, height, fill) {
                  grid.text(sprintf("%.2f", tcoeffDf2[i, j]), x, y, gp = gpar(fontsize = 14, fontface = "bold"))},
              column_names_side = "top",
              na_col = "white",
              show_row_names=F,
              row_order = rowOrder,
              row_labels = rowNames,
              column_order = colOrders,
              height = nrow(tcoeffDf2)*unit(10, "mm"),
              width = ncol(tcoeffDf2)*unit(30, "mm"),
              col = col_rnorm, show_row_dend = FALSE,
              show_column_dend = FALSE,
              row_split = factor(coeffDf2$Model,
                                 levels = c("3 Baselines",
                                            "Demography + 3 Baselines",
                                            "12 Baselines",
                                            "Demography + 12 Baselines")),
              row_title = NULL,
              column_names_gp = gpar(fontsize = 12, fontface = "bold"),
               heatmap_legend_param = list( title_gp=gpar(fontsize = 12, fontface = "bold"),
                                          labels_gp=gpar(fontsize = 12)))

png(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", sep = "")),
    width = 10, height = 6, units = 'in', res = 300 )
modelAnno + baseAnno+ h1
dev.off()

knitr::include_graphics(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", 
                                   sep = "")), dpi = 100)

Generate predictions for 2022 tasks

This would be exciting if we can Test the validity of this result using dataset 2022.

Session Info

sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GGally_2.1.2          ellipse_0.4.3         GetoptLong_1.0.5     
##  [4] ComplexHeatmap_2.14.0 colorRamp2_0.1.0      formattable_0.2.1    
##  [7] here_1.0.1            glmnet_4.1-6          Matrix_1.5-1         
## [10] kableExtra_1.3.4      data.table_1.14.8     corrplot_0.92        
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] purrr_1.0.1           readr_2.1.4           tibble_3.2.0         
## [19] tidyverse_2.0.0       tidyr_1.3.0           ggplot2_3.4.1        
## [22] dplyr_1.1.0           discretefit_0.1.2     pROC_1.18.0          
## [25] verification_1.42     dtw_1.23-1            proxy_0.4-27         
## [28] CircStats_0.2-6       MASS_7.3-58.1         boot_1.3-28          
## [31] fields_14.1           viridis_0.6.2         viridisLite_0.4.1    
## [34] spam_2.9-1           
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.63.0  RColorBrewer_1.1-3  doParallel_1.0.17  
##  [4] webshot_0.5.4       httr_1.4.5          rprojroot_2.0.3    
##  [7] tools_4.2.2         bslib_0.4.2         utf8_1.2.3         
## [10] R6_2.5.1            BiocGenerics_0.44.0 colorspace_2.1-0   
## [13] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [16] compiler_4.2.2      cli_3.6.0           rvest_1.0.3        
## [19] xml2_1.3.3          sass_0.4.5          scales_1.2.1       
## [22] systemfonts_1.0.4   digest_0.6.31       rmarkdown_2.20     
## [25] svglite_2.1.1       pkgconfig_2.0.3     htmltools_0.5.4    
## [28] highr_0.10          fastmap_1.1.1       maps_3.4.1         
## [31] GlobalOptions_0.1.2 htmlwidgets_1.6.1   rlang_1.0.6        
## [34] rstudioapi_0.14     shape_1.4.6         jquerylib_0.1.4    
## [37] generics_0.1.3      jsonlite_1.8.4      magrittr_2.0.3     
## [40] dotCall64_1.0-2     S4Vectors_0.36.2    Rcpp_1.0.10        
## [43] munsell_0.5.0       fansi_1.0.4         lifecycle_1.0.3    
## [46] stringi_1.7.12      yaml_2.3.7          plyr_1.8.8         
## [49] parallel_4.2.2      crayon_1.5.2        lattice_0.20-45    
## [52] splines_4.2.2       circlize_0.4.15     hms_1.1.2          
## [55] knitr_1.42          pillar_1.8.1        rjson_0.2.21       
## [58] stats4_4.2.2        codetools_0.2-18    glue_1.6.2         
## [61] evaluate_0.20       vctrs_0.5.2         png_0.1-8          
## [64] tzdb_0.3.0          foreach_1.5.2       gtable_0.3.1       
## [67] reshape_0.8.9       clue_0.3-64         cachem_1.0.7       
## [70] xfun_0.37           survival_3.4-0      iterators_1.0.14   
## [73] IRanges_2.32.0      cluster_2.1.4       timechange_0.2.0   
## [76] ellipsis_0.3.2

Simple Computational Models for Predicting Pertussis Boost Response

2023-09-11

Setup: Load libraries and helper functions

Setup: Variables used throughout the script

Tasks:

Setup: Load dataset, combine different experiments

Setup: Dataset fold change and Ranking

Model construction

A. Modeling Approach: Age of the participants

Modeling steps:

B. Modeling approach: Baseline values of task variables

Modeling steps:

Model Training and Evaluation

1) Training Approach : Trivial Model

Model Evaluation using Spearman Rank Correlation Coefficient

2) Training Approach : Multi-predictors LASSO Models

Validate model quality

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Feature Selection

Feature Importance using Coefficients (Slope)

Generate predictions for 2022 tasks

Session Info