Simple Computational Models for Predicting Pertussis Boost Response.

2023-08-21

Setup: Load libraries and helper functions

suppressPackageStartupMessages({
  library(verification)
  library(pROC)
  library(discretefit)
  library(dplyr)
  library(ggplot2)
  library(tidyr)
  library(tidyverse)
  library(corrplot)
  library(data.table)
  library(here)
  library(readr)
  library(kableExtra)
  library(glmnet)
  library(here)
})
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'

Setup: Variables

DATASET <- c("2020_dataset", "2021_dataset")
TIMEPOINTS <- c(0, 1, 3, 14)
dataFile <- "combined_dataset2020_2021.csv"

META_COLS <- c("specimen_id", "subject_id", "timepoint", "dataset", "biological_sex", "infancy_vac", "age_at_boost")
ABTITER_COLS <- c("IgG_PT",'IgG_FHA','IgG_PRN','IgG1_PT','IgG1_FHA','IgG4_PT', 'IgG4_FHA')
RNA_COLS <- c("CCL3", "IL6", "NFKBIA")
CELL_COLS <- c("Monocytes", "CD4Tcells")

DEMOGRAPHY_COLS <- c("age_at_boost", "biological_sex", "infancy_vac")

TASK_COLS <- c("Monocytes_D1", "CCL3_D3", "IgG_PT_D14")
TASKS_BASELINES <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
BASE_COLS <- c("Monocytes_D0", "CD4Tcells_D0", "IgG_PT_D0", 'IgG_FHA_D0', 
               'IgG_PRN_D0','IgG1_PT_D0', 'IgG1_FHA_D0', 'IgG4_PT_D0', 
               'IgG4_FHA_D0', "CCL3_D0", "IL6_D0", "NFKBIA_D0")

LOG_TRANS_COLS <- c("CCL3_D0", "IgG_PT_D0","IgG_FHA_D0",'IgG_PRN_D0','IgG1_PT_D0','IgG1_FHA_D0',
                    'IgG4_PT_D0','IgG4_FHA_D0', "IL6_D0", "NFKBIA_D0",  "CCL3_D3",  "IgG_PT_D14")

Setup: Load dataset, combine different experiments

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
df_source <- readRDS(here("./data/master_normalized_data_challenge2_train.RDS"))

metaDf <- data.frame(df_source[["subject_specimen"]])
metaDf["age_at_boost"] <- as.numeric(round(difftime(metaDf$date_of_boost, metaDf$year_of_birth,units="weeks")/52, 2))
metaDf <- metaDf[, META_COLS]

abtiterDf <- data.frame(df_source[["plasma_antibody_levels_wide"]])
abtiterDf <- data.frame(abtiterDf[, ABTITER_COLS])
colnames(abtiterDf) <- ABTITER_COLS
abtiterDf["specimen_id"] <- as.numeric(row.names(abtiterDf))

rnaDf <- data.frame(df_source[["pbmc_gene_expression_wide"]])
tasks_seq <- c('ENSG00000277632','ENSG00000136244','ENSG00000100906')
for (i in 1:length(tasks_seq)){
  rnaDf <- data.frame(rnaDf %>% rename_at(vars(starts_with(tasks_seq[i])), ~RNA_COLS[i]))
}
rnaDf <- data.frame(rnaDf[, RNA_COLS])
colnames(rnaDf) <- RNA_COLS
rnaDf["specimen_id"] <- as.numeric(row.names(rnaDf))

cellDf <- data.frame(df_source[["pbmc_cell_frequency_wide"]])
cellDf <- data.frame(cellDf[, CELL_COLS])
colnames(cellDf) <- CELL_COLS
cellDf["specimen_id"] <- as.numeric(row.names(cellDf))

list_df <- list(metaDf, cellDf, abtiterDf, rnaDf)
df_merge <- list_df %>% reduce(full_join, by="specimen_id")
df_merge <- df_merge[df_merge$timepoint %in% TIMEPOINTS, ]

df_pivot <- df_merge[, names(df_merge)!="specimen_id"] %>% 
  pivot_wider(id_cols=c("subject_id", "dataset", "biological_sex", 
                        "infancy_vac", "age_at_boost"), 
              names_from = timepoint, 
              values_from = all_of(c(CELL_COLS, RNA_COLS, ABTITER_COLS)), 
              names_sep = "_D")

df_pivot <- df_pivot[df_pivot$dataset %in% DATASET, ]

Setup: Dataset log transform , Ranking and fold change
Fold change for actual values been generated by dividing the actual value of each task by it’s baseline Fold change for normalised values been generated by dividing the log transformed value of each task by it’s log transformed baseline

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

fc_cols <- paste(targetY, "FC", sep="_")

ranked_cols <- paste(c("age_at_boost", targetX, targetY, fc_cols), "Rank", sep="_")

df <- df_pivot[, c("subject_id", DEMOGRAPHY_COLS, targetX, targetY)]

rankingFunction <- function(x) {
  as.numeric(rank(-x, ties.method = "average", na.last = "keep"))
}

targetX <- c("Monocytes_D0", "CCL3_D0", "IgG_PT_D0")
targetY <- c("Monocytes_D1","CCL3_D3", "IgG_PT_D14")

df[,"Monocytes_D1_FC"] <- df[, "Monocytes_D1"] / df[, "Monocytes_D0"]
df[,"CCL3_D3_FC"] <- df[, "CCL3_D3"] / df[, "CCL3_D0"]
df[,"IgG_PT_D14_FC"] <- df[, "IgG_PT_D14"] / df[, "IgG_PT_D0"]

df[, ranked_cols] <- apply(df[, c("age_at_boost", targetX, targetY, fc_cols)], 2, rankingFunction)
df <- data.frame(df)

Table for the rank of Age based Models

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", "2.1) Monocytes-D1-Rank", 
             "2.2) Monocytes-D1-FC-Rank", "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
ageModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS, 
                      "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                      "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                      "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]


ageModel_df$Monocytes_D1_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_Rank <- df$age_at_boost_Rank

ageModel_df$Monocytes_D1_FC_Rank <- df$age_at_boost_Rank
ageModel_df$CCL3_D3_FC_Rank <- df$age_at_boost_Rank
ageModel_df$IgG_PT_D14_FC_Rank <- df$age_at_boost_Rank

colnames(ageModel_df)[colnames(ageModel_df)=="subject_id"] <- "Subject ID"
colnames(ageModel_df)[colnames(ageModel_df)=="age_at_boost"] <- "Age" 
colnames(ageModel_df)[colnames(ageModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(ageModel_df)[colnames(ageModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(ageModel_df)[colnames(ageModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(ageModel_df)[colnames(ageModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"  

knitr::kable(head(ageModel_df), "pipe", align = "lccrr", booktabs=TRUE) %>% kable_styling(full_width = F)
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
1 30.80 Female wP 22 22 22 22 22 22
2 51.25 Female wP 1 1 1 1 1 1
3 33.89 Female wP 13 13 13 13 13 13
4 28.76 Male wP 30 30 30 30 30 30
5 25.75 Male wP 42 42 42 42 42 42
6 28.87 Female wP 27 27 27 27 27 27

Table for the rank of Baseline based Models

expCols <- c("1.1) IgG-PT-D14-Rank", "1.2) IgG-PT-D14-FC-Rank", "2.1) Monocytes-D1-Rank", 
             "2.2) Monocytes-D1-FC-Rank", "3.1) CCL3-D3-Rank", "3.2) CCL3-D3-FC-Rank")
baselineModel_df <- df[, c("subject_id", DEMOGRAPHY_COLS, 
                           "IgG_PT_D14_Rank", "IgG_PT_D14_FC_Rank", 
                            "Monocytes_D1_Rank", "Monocytes_D1_FC_Rank", 
                            "CCL3_D3_Rank", "CCL3_D3_FC_Rank")]


baselineModel_df$Monocytes_D1_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_Rank <- df$IgG_PT_D14_Rank

baselineModel_df$Monocytes_D1_FC_Rank <- df$Monocytes_D1_Rank
baselineModel_df$CCL3_D3_FC_Rank <- df$CCL3_D3_Rank
baselineModel_df$IgG_PT_D14_FC_Rank <- df$IgG_PT_D14_Rank

colnames(baselineModel_df)[colnames(baselineModel_df)=="subject_id"] <- "Subject ID"
colnames(baselineModel_df)[colnames(baselineModel_df)=="age_at_boost"] <- "Age" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="infancy_vac"] <- "Vaccine Priming Status" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="biological_sex"] <- "Biological Sex at Birth" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_Rank"] <- "2.1) Monocytes_D1_Rank"
colnames(baselineModel_df)[colnames(baselineModel_df)=="Monocytes_D1_FC_Rank"] <- "2.2) Monocytes_D1_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_Rank"] <- "3.1) CCL3_D3_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="CCL3_D3_FC_Rank"] <- "3.2) CCL3_D3_FC_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_Rank"] <- "1.1) IgG_PT_D14_titer_Rank" 
colnames(baselineModel_df)[colnames(baselineModel_df)=="IgG_PT_D14_FC_Rank"] <- "1.2) IgG_PT_D14_FC_Rank"    

knitr::kable(head(baselineModel_df), "pipe", align = "lccrr", booktabs=TRUE) %>% kable_styling(full_width = F)
Subject ID Age Biological Sex at Birth Vaccine Priming Status 1.1) IgG_PT_D14_titer_Rank 1.2) IgG_PT_D14_FC_Rank 2.1) Monocytes_D1_Rank 2.2) Monocytes_D1_FC_Rank 3.1) CCL3_D3_Rank 3.2) CCL3_D3_FC_Rank
1 30.80 Female wP 20 20 NA NA 21 21
2 51.25 Female wP NA NA NA NA NA NA
3 33.89 Female wP 40 40 NA NA 36 36
4 28.76 Male wP 35 35 53 53 64 64
5 25.75 Male wP 48 48 NA NA 42 42
6 28.87 Female wP 29 29 1 1 45 45

1) Trivial Model;

  1. Calculate ranks of predictors e.g. baseline values and age.
  2. Calculate ranks of tasks
  3. calculate spearman correlation of step 1 and 2 ranks
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
x <- c("age_at_boost_Rank", ranked_cols[grepl('D0', ranked_cols)])
y <- ranked_cols[!ranked_cols %in% x ]


corDf <- data.frame()
corP_df <- data.frame()

corrTest <- suppressPackageStartupMessages({t(cor(df[,x], df[,y], method="spearman", use= "pairwise.complete.obs"))})
corrPval <- suppressPackageStartupMessages({cor.mtest(as.matrix(df[,ranked_cols]), method="spearman", conf.level = 0.95, na.rm = TRUE)})$p


corDf["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corDf["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "age_at_boost_Rank"]
corDf["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "age_at_boost_Rank"]
corDf["Age", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "age_at_boost_Rank"]
corDf["Age", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "age_at_boost_Rank"]

corDf["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrTest["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrTest["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank"]
corDf["Baseline", "2.1) Monocytes-D1-Rank"] <- corrTest["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corDf["Baseline",  "2.2) Monocytes-D1-FC-Rank"] <- corrTest["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corDf["Baseline", "3.1) CCL3-D3-Rank"] <- corrTest["CCL3_D3_Rank", "CCL3_D0_Rank"]
corDf["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrTest["CCL3_D3_FC_Rank", "CCL3_D0_Rank" ]

corP_df["Age", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "age_at_boost_Rank"]
corP_df["Age", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]
corP_df["Age", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_Rank", "age_at_boost_Rank"]

corP_df["Baseline", "1.1) IgG-PT-D14-titer-Rank"] <- corrPval["IgG_PT_D14_Rank", "IgG_PT_D0_Rank"]
corP_df["Baseline", "1.2) IgG-PT-D14-FC-Rank"] <- corrPval["IgG_PT_D14_FC_Rank", "IgG_PT_D0_Rank" ]
corP_df["Baseline", "2.1) Monocytes-D1-Rank"] <- corrPval["Monocytes_D1_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "2.2) Monocytes-D1-FC-Rank"] <- corrPval["Monocytes_D1_FC_Rank", "Monocytes_D0_Rank"]
corP_df["Baseline", "3.1) CCL3-D3-Rank"] <- corrPval["CCL3_D3_Rank", "CCL3_D0_Rank"]
corP_df["Baseline", "3.2) CCL3-D3-FC-Rank"] <- corrPval["CCL3_D3_FC_Rank", "CCL3_D0_Rank"]

Figure1: Heatmap of Spearman Correlation

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
maxCorr <- max(abs(as.matrix(corDf)), na.rm = T)
corrplot(as.matrix(corDf), tl.col = 'black', 
         addCoef.col = 1,
         mar=c(0,0,0,2),
         number.cex = 1.2, tl.srt = 45,
         p.mat = as.matrix(corP_df), sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2, col = colorRampPalette(c("blue", "white","red"))(100))
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
set.seed(0)
source(here("./src/codebase.R"))
df1 <- data.frame(df_pivot)

df1$infancy_vac <- as.numeric(factor(df1$infancy_vac, labels = c(1,2), levels = c("wP", "aP"), exclude = NA))
df1$biological_sex <- as.numeric(factor(df1$biological_sex, labels = c(1,2), exclude = NA))

for (ltCol in LOG_TRANS_COLS){
   df1[, paste("normalized", ltCol, sep = "_")] <- log2(df1[, ltCol]+1)
}

corrDf <- data.frame()
pvalDf <- data.frame()

x <- c("Monocytes_D0", "normalized_CCL3_D0", "normalized_IgG_PT_D0")

y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "3Baseline", y, x)
corrDf <- data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])


x <- c("age_at_boost", "infancy_vac", "biological_sex",
       "Monocytes_D0", "normalized_IgG_PT_D0", "normalized_CCL3_D0")

y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "Demography_3Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])


x <- c("Monocytes_D0", "CD4Tcells_D0", "normalized_IgG_PT_D0", 'normalized_IgG1_FHA_D0',
       'normalized_IgG_FHA_D0','normalized_IgG_PRN_D0', 'normalized_IgG1_PT_D0', 'normalized_IgG4_PT_D0',
        'normalized_IgG4_FHA_D0', "normalized_CCL3_D0", "normalized_IL6_D0", "normalized_NFKBIA_D0")

y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "14Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

x <- c("age_at_boost", "infancy_vac", "biological_sex", "Monocytes_D0", "CD4Tcells_D0", "normalized_IgG_PT_D0",
       'normalized_IgG_FHA_D0','normalized_IgG_PRN_D0', 'normalized_IgG1_PT_D0','normalized_IgG1_FHA_D0',
       'normalized_IgG4_PT_D0', 'normalized_IgG4_FHA_D0', "normalized_CCL3_D0", "normalized_IL6_D0", "normalized_NFKBIA_D0")

y <- c("Monocytes_D1", "normalized_CCL3_D3", "normalized_IgG_PT_D14")
res <- lassoFunction(df1, corrDf, pvalDf, "Demography_14Baseline", y, x)
corrDf <-data.frame(res[[1]])
pvalDf <- data.frame(res[[2]])

Figure2: Heatmap of Spearman Correlation for LASSO prediction

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

colOrder <- c("Monocytes_D1", "Monocytes_D1_FC", "CCL3_D3",
              "CCL3_D3_FC", "IgG_PT_D14", "IgG_PT_D14_FC")

colnames(corrDf)[colnames(corrDf)=="X3Baseline"] <- "3 Baseline "
colnames(corrDf)[colnames(corrDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"
colnames(corrDf)[colnames(corrDf)=="X14Baseline"] <- "14 Baseline"
colnames(corrDf)[colnames(corrDf)=="Demography_14Baseline"] <- "Demography + 14 Baseline"
#
colnames(pvalDf)[colnames(pvalDf)=="X3Baseline"] <- "3 Baseline "
colnames(pvalDf)[colnames(pvalDf)=="Demography_3Baseline"] <- "Demography + 3 Baseline"
colnames(pvalDf)[colnames(pvalDf)=="X14Baseline"] <- "14 Baseline"
colnames(pvalDf)[colnames(pvalDf)=="Demography_14 Baseline"] <- "Demography + 14 Baseline"

corrDf <- t(corrDf)[, colOrder]
pvalDf <- t(pvalDf)[, colOrder]
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1"] <- "2.1) Monocytes_D1_Rank"
colnames(corrDf)[colnames(corrDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes_D1_FC_Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3"] <- "3.1) CCL3_D3_Rank"
colnames(corrDf)[colnames(corrDf)=="CCL3_D3_FC"] <- "3.2) CCL3_D3_FC_Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14"] <- "1.1) IgG_PT_D14_titer_Rank"
colnames(corrDf)[colnames(corrDf)=="IgG_PT_D14_FC"] <- "1.2) IgG_PT_D14_FC_Rank"
#
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1"] <- "2.1) Monocytes_D1_Rank"
colnames(pvalDf)[colnames(pvalDf)=="Monocytes_D1_FC"] <- "2.2) Monocytes_D1_FC_Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3"] <- "3.1) CCL3_D3_Rank"
colnames(pvalDf)[colnames(pvalDf)=="CCL3_D3_FC"] <- "3.2) CCL3_D3_FC_Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14"] <- "1.1) IgG_PT_D14_titer_Rank"
colnames(pvalDf)[colnames(pvalDf)=="IgG_PT_D14_FC"] <- "1.2) IgG_PT_D14_FC_Rank"

maxCorr <- max(abs(as.matrix(corrDf)), na.rm = T)
corrplot(as.matrix(corrDf),
         tl.col = 'black',
         addCoef.col = 1,
         # mar=c(0,0,0,3),
         number.cex = 1.2, tl.srt = 60,
         p.mat = as.matrix(pvalDf),
         sig.level = 0.05,
         col.lim = c(-maxCorr, maxCorr),
         cl.ratio = 0.2,
         col = colorRampPalette(c("blue", "white","red"))(100))