Simple Computational Models for Predicting Pertussis Boost Response

Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
  library(kableExtra)
  library(formattable)
  library(colorRamp2)
  library(ComplexHeatmap)
  library(GetoptLong)
  library(ellipse)
  library("ggplot2")                     
  library("GGally")
  source(here("./src/codebase.R"))
})

Setup: Variables used through out the script

Tasks:

1. Monocytes at day 1

2. CCL3 at day 3

3. IgG-PT at day 14

DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", 
               "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT")
RNA_COLS <- c("CCL3")
CELL_COLS <- c("Monocytes")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "IgG_PT_D0", "CCL3_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0",
                    "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS]

abtiterDf <- data.frame(df_source[["plasma_antibody_levels_wide"]])
abtiterDf$specimen_id <- as.numeric(rownames(abtiterDf))
abtiterDf <- data.frame(abtiterDf[, c("specimen_id", ABTITER_COLS)])


rnaDf <- data.frame(df_source[["pbmc_gene_expression_wide"]])
rnaDf$specimen_id <- as.numeric(rownames(rnaDf))
tasks_seq <- c('ENSG00000277632')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, c("specimen_id", RNA_COLS)])


cellDf <- data.frame(df_source[["pbmc_cell_frequency_wide"]])
cellDf$specimen_id <- as.numeric(rownames(cellDf))
cellDf <- data.frame(cellDf[, c("specimen_id", CELL_COLS)])


list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>% 
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex", 
                        "infancy_vac", "age_at_boost"), 
              names_from = timepoint, 
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)), 
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]
df_pivot <- data.frame(df_pivot %>%
   mutate(across(everything(),  ~ case_when(.x >=0 ~ .x))))

Setup: Dataset log transform , Ranking and fold change

Fold change for actual values been generated by dividing the actual value of each task to it’s baseline
Fold change for normalized values been generated by dividing the log transformed value of each task to it’s log transformed baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", "dataset", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "min", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)],
                           2, rankingFunction)
df <- data.frame(df)

df_2020 <- df[df$dataset=="2020_dataset", ]
df_2021 <- df[df$dataset=="2021_dataset", ]

Model construction

No actual model been developed. We use baseline values of tasks provided and demographic features such as age of participants as model. Two types of models has been developed:

A. Age of the participants

B. Baseline values of task variables

A. Modeling Approach: Age of the participants

Age of participants taken as it is has been used as predictor for a given task.

Modeling steps:

1.  Extract age of the participants
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on age at boost.

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", 
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df_2021[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

ageModel_df$Monocytes_D1_Rank <- df_2021$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df_2021$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df_2021$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df_2021$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df_2021$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df_2021$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(ageModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Age Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")

Age Based Models
	Subject ID	Age	Biological Sex at Birth	Vaccine Priming Status	1.1) IgG_PT_D14_titer_Rank	1.2) IgG_PT_D14_FC_Rank	2.1) Monocytes_D1_Rank	2.2) Monocytes_D1_FC_Rank	3.1) CCL3_D3_Rank	3.2) CCL3_D3_FC_Rank
61	61	32.38	Female	wP	16	16	16	16	16	16
62	62	25.99	Female	wP	38	38	38	38	38	38
63	63	23.98	Female	wP	46	46	46	46	46	46
64	64	25.99	Male	wP	38	38	38	38	38	38
65	65	29.02	Male	wP	26	26	26	26	26	26
66	66	43.07	Female	wP	4	4	4	4	4	4
67	67	47.24	Female	wP	2	2	2	2	2	2
68	68	47.24	Male	wP	2	2	2	2	2	2
69	69	29.17	Female	wP	25	25	25	25	25	25
70	70	21.15	Male	aP	59	59	59	59	59	59
71	71	21.15	Female	aP	59	59	59	59	59	59
72	72	28.25	Female	wP	31	31	31	31	31	31
73	73	24.23	Female	wP	44	44	44	44	44	44
74	74	24.23	Female	wP	44	44	44	44	44	44
75	75	21.22	Female	aP	57	57	57	57	57	57
76	76	21.22	Female	aP	57	57	57	57	57	57
77	77	31.32	Male	wP	21	21	21	21	21	21
78	78	26.30	Female	wP	36	36	36	36	36	36
79	79	32.32	Male	wP	17	17	17	17	17	17
80	80	27.30	Female	wP	34	34	34	34	34	34
81	81	26.30	Male	wP	36	36	36	36	36	36
82	82	21.28	Female	aP	56	56	56	56	56	56
83	83	20.34	Female	aP	66	66	66	66	66	66
84	84	22.34	Female	aP	52	52	52	52	52	52
85	85	19.39	Female	aP	87	87	87	87	87	87
86	86	21.40	Female	aP	55	55	55	55	55	55
87	87	19.39	Male	aP	87	87	87	87	87	87
88	88	19.39	Male	aP	87	87	87	87	87	87
89	89	22.49	Female	aP	51	51	51	51	51	51
90	90	20.49	Female	aP	65	65	65	65	65	65
91	91	21.49	Male	aP	54	54	54	54	54	54
92	92	19.54	Female	aP	85	85	85	85	85	85
93	93	23.56	Female	aP	48	48	48	48	48	48
94	94	20.55	Male	aP	64	64	64	64	64	64
95	95	21.55	Female	aP	53	53	53	53	53	53
96	96	19.54	Male	aP	85	85	85	85	85	85

B. Modeling approach: Baseline values of task variables

Baseline value of a given task has been used as predictor for a given task.

For Example:
  For task 1.1) IgG-PT-D14-titer-Rank we use baseline value of IgG-PT as predicted values.
  Similarly for 1.2) IgG-PT-D14-FC-Rank we used baseline of IgG-PT as predictor values.

Modeling steps:

1.  Extract baseline of each task
2.  Convert actual values into ranks
3.  Prepare submission file

The submission file looks as follow:

Possible rank of predicted values for each task when model is trained based on the baseline of that task.

expCols <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank",  
             "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank", 
             "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df_2021[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]

baselineModel_df$Monocytes_D1_Rank <- df_2021$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df_2021$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df_2021$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df_2021$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df_2021$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df_2021$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(baselineModel_df, "html", align = "lccrr", booktabs=TRUE, border_left = T, 
             border_right = T, caption = "Baseline Based Models")  %>% 
  kable_styling("striped", full_width = T) %>% 
  scroll_box(width = "100%", height = "400px")

Baseline Based Models
	Subject ID	Age	Biological Sex at Birth	Vaccine Priming Status	1.1) IgG_PT_D14_titer_Rank	1.2) IgG_PT_D14_FC_Rank	2.1) Monocytes_D1_Rank	2.2) Monocytes_D1_FC_Rank	3.1) CCL3_D3_Rank	3.2) CCL3_D3_FC_Rank
61	61	32.38	Female	wP	70	70	NA	NA	39	39
62	62	25.99	Female	wP	13	13	NA	NA	13	13
63	63	23.98	Female	wP	5	5	40	40	31	31
64	64	25.99	Male	wP	51	51	49	49	22	22
65	65	29.02	Male	wP	6	6	36	36	10	10
66	66	43.07	Female	wP	4	4	48	48	28	28
67	67	47.24	Female	wP	3	3	17	17	40	40
68	68	47.24	Male	wP	12	12	44	44	40	40
69	69	29.17	Female	wP	22	22	39	39	24	24
70	70	21.15	Male	aP	79	79	6	6	52	52
71	71	21.15	Female	aP	33	33	42	42	20	20
72	72	28.25	Female	wP	8	8	20	20	51	51
73	73	24.23	Female	wP	15	15	28	28	43	43
74	74	24.23	Female	wP	65	65	3	3	59	59
75	75	21.22	Female	aP	84	84	NA	NA	53	53
76	76	21.22	Female	aP	17	17	32	32	60	60
77	77	31.32	Male	wP	56	56	18	18	11	11
78	78	26.30	Female	wP	37	37	11	11	56	56
79	79	32.32	Male	wP	30	30	2	2	9	9
80	80	27.30	Female	wP	74	74	21	21	8	8
81	81	26.30	Male	wP	54	54	12	12	12	12
82	82	21.28	Female	aP	NA	NA	35	35	5	5
83	83	20.34	Female	aP	63	63	19	19	70	70
84	84	22.34	Female	aP	59	59	24	24	34	34
85	85	19.39	Female	aP	80	80	40	40	35	35
86	86	21.40	Female	aP	47	47	23	23	32	32
87	87	19.39	Male	aP	NA	NA	31	31	15	15
88	88	19.39	Male	aP	NA	NA	37	37	7	7
89	89	22.49	Female	aP	85	85	43	43	27	27
90	90	20.49	Female	aP	76	76	34	34	47	47
91	91	21.49	Male	aP	57	57	8	8	66	66
92	92	19.54	Female	aP	67	67	16	16	60	60
93	93	23.56	Female	aP	45	45	45	45	53	53
94	94	20.55	Male	aP	32	32	9	9	70	70
95	95	21.55	Female	aP	24	24	27	27	60	60
96	96	19.54	Male	aP	64	64	14	14	68	68

Model Training and Evaluation

Models been developed and trained using different predictors or combination of predictors. Two types of models been trained for each task:

1) Trivial Models

2) Multi-predictor Models

Spearman correlation coefficient has been used as evaluation matrices.

1) Training Approach : Trivial Model

Two models has been constructed for each task and the fold change of each task using Spearman Correlation Coefficient.

1. Age of Participants

2. Baseline values of task variables

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]

corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df_2020[,x], df_2020[,y], 
                                                  method="spearman", 
                                                  use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df_2020[,ranked_cols]), 
                                                      method="spearman", 
                                                      conf.level = 0.95, 
                                                      na.rm = TRUE)})$p

corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Model Evaluation using Spearman Rank Correlation Coefficient

Heatmap of Spearman correlation coefficient for each task against age at boost or the baseline of the task. Columns of the Heatmap indicating the task and the task fold change, and rows are indicating the model type; age base or baseline base models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))

2) Training Approach : Multi-predictors LASSO Models

For each task 4 models have been developed using combination of predictors as follow:
1. 3 Baseline: Baseline of all 3 tasks combined are being used to train the models for each of the tasks.
2. 3 Baseline + Demographic: Baseline of all three tasks combined with demographic data including; Age, Biological Sex and Vaccine type are being used to train models for each of the tasks.

source(here("./src/codebase.R"))
set.seed(0)

gs_2020 <- df_2020
# gs_2020 <- data.frame(df_pivot)

# 
gs_2020$infancy_vac <- as.numeric(factor(gs_2020$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2020$biological_sex <- as.numeric(factor(gs_2020$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2020[,"Monocytes_D1_FC"] <- gs_2020[, "Monocytes_D1"] / gs_2020[, "Monocytes_D0"]
gs_2020[,"CCL3_D3_FC"] <- gs_2020[, "CCL3_D3"] / gs_2020[, "CCL3_D0"]
gs_2020[,"IgG_PT_D14_FC"] <- gs_2020[, "IgG_PT_D14"] / gs_2020[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   gs_2020[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2020[, ltCol]+1)
}

gs_2020[,"normalized_CCL3_D3_FC"] <- gs_2020[, "normalized_CCL3_D3"] / gs_2020[, "normalized_CCL3_D0"]
gs_2020[,"normalized_IgG_PT_D14_FC"] <- gs_2020[, "normalized_IgG_PT_D14"] / gs_2020[, "normalized_IgG_PT_D0"]



gs_2021 <- df_2021
# gs_2020 <- data.frame(df_pivot)

# 
gs_2021$infancy_vac <- as.numeric(factor(gs_2021$infancy_vac, labels = c(1,2),
                                     levels = c("wP", "aP"), exclude = NA))
gs_2021$biological_sex <- as.numeric(factor(gs_2021$biological_sex,
                                        labels = c(1,2), exclude = NA))

gs_2021[,"Monocytes_D1_FC"] <- gs_2021[, "Monocytes_D1"] / gs_2021[, "Monocytes_D0"]
gs_2021[,"CCL3_D3_FC"] <- gs_2021[, "CCL3_D3"] / gs_2021[, "CCL3_D0"]
gs_2021[,"IgG_PT_D14_FC"] <- gs_2021[, "IgG_PT_D14"] / gs_2021[, "IgG_PT_D0"]


for (ltCol in LOG_TRANS_COLS){
   gs_2021[, paste("normalized", ltCol, sep = "_")] <- log2(gs_2021[, ltCol]+1)
}

gs_2021[,"normalized_CCL3_D3_FC"] <- gs_2021[, "normalized_CCL3_D3"] / gs_2021[, "normalized_CCL3_D0"]
gs_2021[,"normalized_IgG_PT_D14_FC"] <- gs_2021[, "normalized_IgG_PT_D14"] / gs_2021[, "normalized_IgG_PT_D0"]

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Heatmap of Spearman correlation of predicted against actual values for each task. Columns of the Heatmap indicating the task and the task fold change. Rows are indicating different models. Crosses indicating insignificant correlations (p-values > 0.05).

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

Feature Selection

For each model, we assessed which features contribute to non-zero coefficients (slopes). For this purpose the minimum lambda, that gives minimum mean cross-validated error, been used for prediction.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
set.seed(1)

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2020, y, x)
res_base3 <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3) <- c("Variable", y)
res_base3$Models <- "3 Baselines"

x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")
y <- c("normalized_IgG_PT_D14", "normalized_IgG_PT_D14_FC", "Monocytes_D1", 
       "Monocytes_D1_FC", "normalized_CCL3_D3", "normalized_CCL3_D3_FC")

coefList <- coefficientFunction(gs_2020, y, x)
res_base3D <- coefList %>% reduce(full_join, by="Variable")
colnames(res_base3D) <- c("Variable", y)
res_base3D$Models <- "Demography + 3 Baselines"

df_list <- list(res_base3, res_base3D)
coeffDf <- df_list %>% reduce(full_join, by= c("Variable", "Models",y))
coeffDf$Variable <- gsub("normalized_", "", coeffDf$Variable)

# knitr::opts_chunk$set(fig.width=16, fig.height=18)
coeffDf2 <- coeffDf

rownames(coeffDf2) <- paste(coeffDf2$Models, coeffDf2$Variable, sep="_")
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3"] <- "3.1) CCL3-D3-Rank"

colnames(coeffDf2)[colnames(coeffDf2)=="normalized_IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(coeffDf2)[colnames(coeffDf2)=="normalized_CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"

coeffDf2 <- coeffDf2[coeffDf2$Variable!="(Intercept)",]
# coeffDf2["Demography + 3 Baselines_age_at_boost", ] <- 0
# coeffDf2["Demography + 3 Baselines_age_at_boost", "Variable"] <- "age_at_boost"

tcoeffDf2 <- coeffDf2[, !names(coeffDf2) %in% c("Variable", "Models")]

Feature Importance using Coefficients (Slope)

Heatmap of slopes between each task and each of the predictors used to train LASSO model. The far left column of the heatmap represents model type; 3 Baseline models or 3 Baseline + Demographic models. The second column indicates features (predictors) being used to train models. Slopes between each task and a particular predictor, from each model type, is shown in the body of the heatmap. Columns of the heatmap body represent tasks. Slopes above zero are shown in red and those bellow zero are shown in blue. Slope of zero indicates that there is no relationship between coresponding task and predictor in that particular model.

modelAnno = rowAnnotation(
  Models = anno_block(gp = gpar(fill = 2:4),
        labels = c("3 Baseline", "Demography + 3 Baseline"),
        labels_gp = gpar(col = "white", fontsize = 10, fontface = "bold")),
        annotation_name_side = "top")

baseAnno = rowAnnotation(labels = anno_text(coeffDf2$Variable, which = "row",
                                            gp = gpar(fontsize = 12, fontface = "bold")),
    width = unit(30, "mm"))

tcoeffDf2 <- tcoeffDf2 %>% replace(is.na(.), 0)

rowOrder <- c("3 Baselines_IgG_PT_D0", "3 Baselines_Monocytes_D0", "3 Baselines_CCL3_D0", 
             # "Demography + 3 Baselines_age_at_boost",
             "Demography + 3 Baselines_biological_sex",
             "Demography + 3 Baselines_infancy_vac", "Demography + 3 Baselines_IgG_PT_D0", 
             "Demography + 3 Baselines_Monocytes_D0", "Demography + 3 Baselines_CCL3_D0")

rowNames <- c("IgG_PT_D0", "Monocytes_D0", "CCL3_D0", 
             # "age_at_boost",
             "biological_sex",
             "infancy_vac", "IgG_PT_D0", 
             "Monocytes_D0", "CCL3_D0")

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")

maxCorr <- max(abs(as.matrix(tcoeffDf2)), na.rm = T)
col_rnorm = colorRamp2(c(-maxCorr, 0, maxCorr), c("blue", "white","red"))
h1 <- Heatmap(tcoeffDf2, name = "Coefficient", column_names_rot = 45,
              cell_fun = function(j, i, x, y, width, height, fill) {
                  grid.text(sprintf("%.2f", tcoeffDf2[i, j]), x, y, gp = gpar(fontsize = 14, fontface = "bold"))},
              column_names_side = "top",
              na_col = "white",
              show_row_names=F,
              row_order = rowOrder,
              row_labels = rowNames,
              column_order = colOrders,
              height = nrow(tcoeffDf2)*unit(10, "mm"),
              width = ncol(tcoeffDf2)*unit(30, "mm"),
              col = col_rnorm, show_row_dend = FALSE,
              show_column_dend = FALSE,
              row_split = factor(coeffDf2$Model,
                                 levels = c("3 Baselines",
                                            "Demography + 3 Baselines",
                                            "12 Baselines",
                                            "Demography + 12 Baselines")),
              row_title = NULL,
              column_names_gp = gpar(fontsize = 12, fontface = "bold"),
               heatmap_legend_param = list( title_gp=gpar(fontsize = 12, fontface = "bold"),
                                          labels_gp=gpar(fontsize = 12)))

png(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", sep = "")),
    width = 10, height = 6, units = 'in', res = 300 )
modelAnno + baseAnno+ h1
dev.off()

knitr::include_graphics(here(paste("./results/",  "/LASSO_coefficientHeatmap.png", 
                                   sep = "")), dpi = 100)

Generate predictions for 2021 tasks

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf<- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes-D1-Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes-D1-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3-D3-Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3-D3-FC-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG-PT-D14-titer-Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG-PT-D14-FC-Rank"

colOrders <- c("1.1) IgG-PT-D14-titer-Rank", "1.2) IgG-PT-D14-FC-Rank", 
               "2.1) Monocytes-D1-Rank", "2.2) Monocytes-D1-FC-Rank",
               "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")


maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf[,colOrders]),
         tl.col = 'black',
         # insig='blank',
         na.label=" ", 
         addCoef.col = 1,
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf[,colOrders]),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))

#### Session Info

sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GGally_2.1.2          ellipse_0.4.3         GetoptLong_1.0.5     
##  [4] ComplexHeatmap_2.14.0 colorRamp2_0.1.0      formattable_0.2.1    
##  [7] here_1.0.1            glmnet_4.1-6          Matrix_1.5-1         
## [10] kableExtra_1.3.4      data.table_1.14.8     corrplot_0.92        
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] purrr_1.0.1           readr_2.1.4           tibble_3.2.0         
## [19] tidyverse_2.0.0       tidyr_1.3.0           ggplot2_3.4.1        
## [22] dplyr_1.1.0           discretefit_0.1.2     pROC_1.18.0          
## [25] verification_1.42     dtw_1.23-1            proxy_0.4-27         
## [28] CircStats_0.2-6       MASS_7.3-58.1         boot_1.3-28          
## [31] fields_14.1           viridis_0.6.2         viridisLite_0.4.1    
## [34] spam_2.9-1           
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_0.63.0  RColorBrewer_1.1-3  doParallel_1.0.17  
##  [4] webshot_0.5.4       httr_1.4.5          rprojroot_2.0.3    
##  [7] tools_4.2.2         bslib_0.4.2         utf8_1.2.3         
## [10] R6_2.5.1            BiocGenerics_0.44.0 colorspace_2.1-0   
## [13] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [16] compiler_4.2.2      cli_3.6.0           rvest_1.0.3        
## [19] xml2_1.3.3          sass_0.4.5          scales_1.2.1       
## [22] systemfonts_1.0.4   digest_0.6.31       rmarkdown_2.20     
## [25] svglite_2.1.1       pkgconfig_2.0.3     htmltools_0.5.4    
## [28] highr_0.10          fastmap_1.1.1       maps_3.4.1         
## [31] GlobalOptions_0.1.2 htmlwidgets_1.6.1   rlang_1.0.6        
## [34] rstudioapi_0.14     shape_1.4.6         jquerylib_0.1.4    
## [37] generics_0.1.3      jsonlite_1.8.4      magrittr_2.0.3     
## [40] dotCall64_1.0-2     S4Vectors_0.36.2    Rcpp_1.0.10        
## [43] munsell_0.5.0       fansi_1.0.4         lifecycle_1.0.3    
## [46] stringi_1.7.12      yaml_2.3.7          plyr_1.8.8         
## [49] parallel_4.2.2      crayon_1.5.2        lattice_0.20-45    
## [52] splines_4.2.2       circlize_0.4.15     hms_1.1.2          
## [55] knitr_1.42          pillar_1.8.1        rjson_0.2.21       
## [58] stats4_4.2.2        codetools_0.2-18    glue_1.6.2         
## [61] evaluate_0.20       vctrs_0.5.2         png_0.1-8          
## [64] tzdb_0.3.0          foreach_1.5.2       gtable_0.3.1       
## [67] reshape_0.8.9       clue_0.3-64         cachem_1.0.7       
## [70] xfun_0.37           survival_3.4-0      iterators_1.0.14   
## [73] IRanges_2.32.0      cluster_2.1.4       timechange_0.2.0   
## [76] ellipsis_0.3.2

Simple Computational Models for Predicting Pertussis Boost Response

2023-09-05

Setup: Load libraries and helper functions

Setup: Variables used through out the script

Tasks:

Setup: Load dataset, combine different experiments

Setup: Dataset log transform , Ranking and fold change

Model construction

A. Modeling Approach: Age of the participants

Modeling steps:

B. Modeling approach: Baseline values of task variables

Modeling steps:

Model Training and Evaluation

1) Training Approach : Trivial Model

Model Evaluation using Spearman Rank Correlation Coefficient

2) Training Approach : Multi-predictors LASSO Models

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction

Feature Selection

Feature Importance using Coefficients (Slope)

Generate predictions for 2021 tasks

Model Evaluation using Spearman Rank Correlation Coefficient for LASSO prediction