Two demo models are built to illustrate model construction and the submission-file preparation process for the 3rd (public) CMI-PB challenge.
Model 1: Baseline (day 0) IgG antibody levels against PT as a predictor for the two antibody-level tasks
Model 2: Age as a predictor for all tasks
library(lubridate)
library(tidyverse)
library(DT)
library(BiocStyle)
# ---- Paths ----
# Root of the preprocessing checkout. Every input read below lives under its
# `data/` subdirectory. The path is machine-specific.
base_dir <- "/home/pshinde/repos/cmi-pb/public_challenge/training_preprocess/"
data_dir <- paste0(base_dir, "data/")
# load(paste0(base_dir, "code/codebase1.R"))

# ---- Inputs ----
# Empty submission template whose column names define the submission layout.
submission_template <- readr::read_tsv(
  paste0(data_dir, "3rdChallengeSubmissionTemplate.tsv")
)

# Subject-level table for the 2023BD challenge dataset.
subject_2023BD <- readr::read_tsv(
  paste0(data_dir, "raw_datasets/challenge_dataset/2023BD_subject.tsv")
)

# Specimen table with the subject columns attached. No `by` is supplied, so
# dplyr joins on every column name the two tables have in common.
# NOTE(review): presumably that is just `subject_id` -- confirm against the data.
specimen_2023BD <- left_join(
  readr::read_tsv(
    paste0(data_dir, "raw_datasets/challenge_dataset/2023BD_specimen.tsv")
  ),
  subject_2023BD
)

# Plasma antibody titer measurements for the same dataset.
plasma_ab_titer_2023BD <- readr::read_tsv(
  paste0(data_dir, "raw_datasets/challenge_dataset/2023BD_plasma_ab_titer.tsv")
)
## More resources
# Identifier and demographic columns that are carried over from the template
# unchanged into every submission table.
kept_columns <- select(
  submission_template,
  SubjectID, Age, BiologicalSexAtBirth, VaccinePrimingStatus
)

# All template column names, in template order. They are re-applied
# positionally to each assembled submission table further down.
submission_template_col_names <- names(submission_template)
# ---- Model 1: baseline IgG-PT rank ----
# Keep only IgG measurements against the PT antigen taken at planned day 0
# relative to boost, then rank subjects by `MFI_normalised` (base `rank()`:
# the smallest value receives rank 1).
# The join to the specimen table has no `by`, so it uses every shared column
# name. NOTE(review): presumably `specimen_id` -- confirm against the data.
IgG_baseline_model_df <- plasma_ab_titer_2023BD %>%
  left_join(specimen_2023BD) %>%
  filter(
    isotype == "IgG",
    antigen == "PT",
    planned_day_relative_to_boost == 0
  ) %>%
  dplyr::select(SubjectID = subject_id, MFI_normalised) %>%
  mutate(IgG_baseline_rank = rank(MFI_normalised)) %>%
  dplyr::select(-MFI_normalised)

# The same baseline rank is submitted as the prediction for both task 1
# columns; the helper rank column is dropped afterwards.
IgG_baseline_final_model_df <- IgG_baseline_model_df %>%
  mutate(
    task1_1 = IgG_baseline_rank,
    task1_2 = IgG_baseline_rank
  ) %>%
  dplyr::select(-IgG_baseline_rank)

# Interactive preview of the per-subject baseline ranks.
datatable(IgG_baseline_model_df)
# Model 1 only predicts the two task 1 columns, so the task 2.x and 3.x
# columns are taken from the template as they are (keyed by SubjectID).
kept_columns_v2 <- select(
  submission_template,
  SubjectID,
  "2.1) Monocytes-D1-Rank",
  "2.2) Monocytes-D1-FC-Rank",
  "3.1) CCL3-D3-Rank",
  "3.2) CCL3-D3-FC-Rank"
)

# Assemble the submission table left to right: identifier/demographic columns,
# then the IgG-derived task 1 columns, then the untouched template columns.
IgG_model_submission_df <- left_join(
  kept_columns,
  IgG_baseline_final_model_df,
  by = "SubjectID"
)
IgG_model_submission_df <- left_join(
  IgG_model_submission_df,
  kept_columns_v2,
  by = "SubjectID"
)

# Restore the template's column names. This is positional, so it relies on
# the column order produced by the joins above matching the template order.
colnames(IgG_model_submission_df) <- submission_template_col_names

# Interactive preview of the finished IgG-model submission table.
datatable(IgG_model_submission_df)
## Construct the model
# ---- Model 2: age rank ----
# `age_at_boost` is the interval from `year_of_birth` to `date_of_boost`
# divided by one year; `age_rank` is its rank across all subjects (base
# `rank()`: the smallest value receives rank 1). Both columns are added to
# the subject table in place.
subject_2023BD <- subject_2023BD %>%
  mutate(
    age_at_boost = interval(ymd(year_of_birth), ymd(date_of_boost)) / years(1),
    age_rank = rank(age_at_boost)
  )

# One row per subject: the identifier (renamed to the template's `SubjectID`)
# and the age rank.
age_model_df <- subject_2023BD %>%
  dplyr::select(SubjectID = subject_id, age_rank)

# The age rank is used as the prediction for every task column; the helper
# rank column is dropped once it has been copied.
age_final_model_df <- age_model_df %>%
  mutate(
    task1_1 = age_rank,
    task1_2 = age_rank,
    task2_1 = age_rank,
    task2_2 = age_rank,
    task3_1 = age_rank,
    task3_2 = age_rank
  ) %>%
  dplyr::select(-age_rank)

# Interactive previews of the rank table and of the per-task table.
datatable(age_model_df)
datatable(age_final_model_df)
## 2.2) Prepare submission file
# Attach the six age-derived task columns to the identifier/demographic
# columns kept from the template, matching rows on SubjectID.
age_model_submission_df <- left_join(
  kept_columns,
  age_final_model_df,
  by = "SubjectID"
)

# Restore the template's column names. This is positional, so it relies on
# the joined column order matching the template order.
colnames(age_model_submission_df) <- submission_template_col_names

# Interactive preview of the finished age-model submission table.
datatable(age_model_submission_df)

# Uncomment to write the submission file to disk.
# readr::write_tsv(age_model_submission_df, paste0(data_dir, "myAgeModel_submission_v20240810.tsv"))