CMI-PB Team’s Data Processing Approach for Second Challenge Dataset

The training dataset for the second challenge comprises two multi-omics datasets (designated as 2020 and 2021) that require processing and normalization to generate computable matrices suitable for subsequent model development. While the procedures for data processing and normalization are inherently user-specific, the CMI-PB team has devised a data processing method, drawing inspiration from the approach used in the internal 1st CMI-PB challenge. The codebase is also available on GitHub. If you have specific questions, please contact us via Solutions center.

Download and read 2nd challenge data from CMI-PB website

The data files for the 2nd CMI-PB challenge can be accessed at [https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/raw_datasets/]. They are available for direct file download or as R data objects. For our notebook, we chose to download the data as R data objects. These objects contain both demographical metadata of subjects and experimental data, including:

Plasma antibody titers: Plasma antibodies against Tdap were measured at all time points using Luminex assay,
Plasma cytokine concentrations: Plasma cytokine concentration analysis using OLINK assay,
PBMC gene expression: Gene expression analysis (RNAseq) of bulk peripheral blood mononuclear cells (PBMCs),
PBMC cell frequency: Cell frequency analysis of PBMC subsets were performed before and after booster vaccination until day 14.

source("codebase.R")

master_database_data <- readRDS(paste0(dir_rds_objects, "master_harmonized_training_data.RDS"))

training_dataset <- subset_dataset(master_database_data, c("2020_dataset", "2021_dataset"))
test_dataset <- subset_dataset(master_database_data, c("2022_dataset"))

subject_specimen <- master_database_data$subject_specimen  %>% 
  mutate(timepoint = planned_day_relative_to_boost)

training_subject_specimen <- training_dataset$subject_specimen
test_subject_specimen <- test_dataset$subject_specimen

gene_90_38_export <- read_tsv(paste0(base_dir, "data/gene_90_38_export.tsv"))

mito_genes <- gene_90_38_export %>%
  filter(substr(display_label, 1,3) == "MT-")

gene_90_38_shortlist <- gene_90_38_export %>%
  filter(biotype == "protein_coding") %>%
  filter(!versioned_ensembl_gene_id %in% mito_genes$versioned_ensembl_gene_id)

batch.factors = c("timepoint","infancy_vac","biological_sex","dataset")
data_obj = training_dataset

test_subject_specimen_baseline <- subject_specimen %>% 
  filter(dataset %in% c("2022_dataset")) %>% 
  filter(timepoint %in% c(-30, -15, 0))

subject_specimen_baseline <- subject_specimen %>% 
    filter(timepoint %in% c(-30, -15, 0))

Antibody titers

abtiter_wide_before <- data_obj$plasma_antibody_levels$long %>%
  dplyr::select(isotype_antigen, specimen_id, MFI) %>%
  pivot_wider(names_from = "isotype_antigen", values_from = MFI) %>%
  column_to_rownames("specimen_id")%>%
  t() 

pvca_analysis(abtiter_wide_before, data_obj$subject_specimen, batch.factors, plot_title = "Antibody titer:  Raw data")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

## Apply data normalization and batch correction
abtiter_data_processed = processAbtiter(data_obj, BatchCorrection = TRUE)

## Joining with `by = join_by(specimen_id)`
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

pvca_analysis(abtiter_data_processed$normalized_data, data_obj$subject_specimen, batch.factors, plot_title = "Antibody titer: Normalization")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

pvca_analysis(abtiter_data_processed$batchCorrected_data, data_obj$subject_specimen, batch.factors, plot_title = "Antibody titer:  Normalization and batch correction")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

Cell frequency Analysis

## Before  normalization
cell_wide_before <- data_obj$pbmc_cell_frequency$wide %>%
  column_to_rownames("specimen_id")%>%
  t() 

pvca_analysis(cell_wide_before, data_obj$subject_specimen, batch.factors, plot_title = "Cell frequency:  Raw data")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

## Apply data normalization and batch correction
cytof_data_processed = processCellFreq(data_obj, BatchCorrection = TRUE)

## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(cell_type_name, dataset)`
## Joining with `by = join_by(specimen_id)`
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Found40Missing Data Values
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

pvca_analysis(cytof_data_processed$normalized_data, data_obj$subject_specimen, batch.factors, plot_title = "Cell frequency: Normalization")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

pvca_analysis(cytof_data_processed$batchCorrected_data, data_obj$subject_specimen, batch.factors, plot_title = "Cell frequency:  Normalization and batch correction")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

Olink Analysis

## Before batch correction
olink_wide_before <- data_obj$plasma_cytokine_concentrations$wide  %>%
  column_to_rownames("specimen_id")%>%
  t() 

pvca_analysis(olink_wide_before, data_obj$subject_specimen, batch.factors, plot_title = "Cytokine concetrations:  Raw data")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

## Apply data normalization and batch correction
olink_data_processed = processOlink(data_obj, BatchCorrection = TRUE)

## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(protein_id, dataset)`
## Joining with `by = join_by(specimen_id)`

## Found 3 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.

## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Found6Missing Data Values 
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

pvca_analysis(cytof_data_processed$normalized_data, data_obj$subject_specimen, batch.factors, plot_title = "Cytokine concetrations: Normalization")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

pvca_analysis(cytof_data_processed$batchCorrected_data, data_obj$subject_specimen, batch.factors, plot_title = "Cytokine concetrations: Normalization and batch correction")

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

Gene expression data Analysis

rnaseq_countData <- data_obj$pbmc_gene_expression_wide$wide %>%
  column_to_rownames("specimen_id") %>%
  t()  %>%
  as.data.frame()  

colnames(rnaseq_countData) = as.integer(colnames(rnaseq_countData))

rnaseq_metaData <- data_obj$subject_specimen %>%
  filter(specimen_id %in% colnames(rnaseq_countData)) %>%
  mutate(specimen_id1 = specimen_id) %>%
  column_to_rownames("specimen_id1") 

#' Identify genes with tpm >=1 that are present in at least 80% of either the aP(wP) cohort and absent in at least 80% of wP(aP) cohort.
threshold_proportion_greater_than_1 = 0.8

tpm_sum_infancy_subgroup <- rnaseq_countData %>%
  rownames_to_column("versioned_ensembl_gene_id") %>%
  pivot_longer(!versioned_ensembl_gene_id, values_to = "tpm", names_to = "specimen_id") %>%
  mutate(specimen_id = as.integer(specimen_id)) %>%
  left_join(subject_specimen) %>%
  group_by(dataset, versioned_ensembl_gene_id, infancy_vac) %>%
  #group_by(versioned_ensembl_gene_id, infancy_vac) %>%
  summarise(proportion_greater_than_1 = mean(tpm >= 1)) %>%
  pivot_wider(names_from = infancy_vac, values_from = proportion_greater_than_1)  %>%
  mutate(gene_meets_criterion_aP = aP >= threshold_proportion_greater_than_1 & wP <= (1 - threshold_proportion_greater_than_1),
         gene_meets_criterion_wP = wP >= threshold_proportion_greater_than_1 & aP <= (1 - threshold_proportion_greater_than_1)
         )  %>%
  filter((gene_meets_criterion_aP == TRUE & gene_meets_criterion_wP == FALSE) || (gene_meets_criterion_aP == FALSE & gene_meets_criterion_wP == TRUE))

## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset', 'versioned_ensembl_gene_id'. You
## can override using the `.groups` argument.

#' Create a shortlist of genes (tpm >= 1) in at least 30% of the specimens.

tpm_shortlist <- rnaseq_countData %>%
  rownames_to_column("versioned_ensembl_gene_id") %>%
  filter(versioned_ensembl_gene_id %in% gene_90_38_shortlist$versioned_ensembl_gene_id) %>%
  pivot_longer(!versioned_ensembl_gene_id, values_to = "tpm", names_to = "specimen_id") %>%
  mutate(specimen_id = as.integer(specimen_id)) %>%
  left_join(subject_specimen) %>%
  group_by(versioned_ensembl_gene_id) %>%
  #group_by(versioned_ensembl_gene_id, infancy_vac) %>%
  summarise(proportion = mean(tpm >= 1))  %>%
  filter(proportion >= 0.3)

## Joining with `by = join_by(specimen_id)`

## Before batch correction
rnaseq_countData_v2 <- rnaseq_countData %>%
  rownames_to_column("versioned_ensembl_gene_id") %>%
  filter(versioned_ensembl_gene_id %in% gene_90_38_shortlist$versioned_ensembl_gene_id) %>%
  filter(!versioned_ensembl_gene_id %in% tpm_sum_infancy_subgroup$versioned_ensembl_gene_id) %>%
  filter(versioned_ensembl_gene_id %in% tpm_shortlist$versioned_ensembl_gene_id) %>%
  column_to_rownames("versioned_ensembl_gene_id")

mad_2020 <- mad_calculations(rnaseq_countData_v2, data_obj$subject_specimen, c("2020_dataset"))

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

mad_2021 <- mad_calculations(rnaseq_countData_v2, data_obj$subject_specimen, c("2021_dataset"))

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

mad_shotlisted_genes = intersect(mad_2020$gene_id, mad_2021$gene_id)

rnaseq_countData_v3 <- rnaseq_countData_v2 %>%
  rownames_to_column("versioned_ensembl_gene_id") %>%
  filter(versioned_ensembl_gene_id %in% mad_shotlisted_genes) %>%
  column_to_rownames("versioned_ensembl_gene_id")
  
pvca_analysis_rnaseq(rnaseq_countData_v3, data_obj$subject_specimen, batch.factors, plot_title = "RNASeq: Raw data")

## Cluster size 8242 broken into 8152 90 
## Cluster size 8152 broken into 467 7685 
## Done cluster 467 
## Cluster size 7685 broken into 1250 6435 
## Done cluster 1250 
## Cluster size 6435 broken into 4290 2145 
## Cluster size 4290 broken into 2371 1919 
## Cluster size 2371 broken into 891 1480 
## Done cluster 891 
## Done cluster 1480 
## Done cluster 2371 
## Cluster size 1919 broken into 778 1141 
## Done cluster 778 
## Done cluster 1141 
## Done cluster 1919 
## Done cluster 4290 
## Cluster size 2145 broken into 1073 1072 
## Done cluster 1073 
## Done cluster 1072 
## Done cluster 2145 
## Done cluster 6435 
## Done cluster 7685 
## Done cluster 8152 
## Done cluster 90

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

batch_lebels = as.data.frame(colnames(rnaseq_countData_v3)) %>%
      rename(specimen_id = starts_with("colnames")) %>%
      mutate(specimen_id = as.integer(specimen_id)) %>%
      left_join(rnaseq_metaData) %>%
      dplyr::select(dataset)

## Joining with `by = join_by(specimen_id)`

rnaseq_batchCorrected = sva::ComBat_seq(as.matrix(rnaseq_countData_v3), batch = batch_lebels$dataset)

## Found 2 batches
## Using null model in ComBat-seq.
## Adjusting for 0 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data

pvca_analysis_rnaseq(rnaseq_batchCorrected, data_obj$subject_specimen, batch.factors, plot_title = "RNASeq: Batch correction")

## Cluster size 8242 broken into 8140 102 
## Cluster size 8140 broken into 7679 461 
## Cluster size 7679 broken into 6445 1234 
## Cluster size 6445 broken into 4448 1997 
## Cluster size 4448 broken into 2552 1896 
## Cluster size 2552 broken into 1426 1126 
## Done cluster 1426 
## Done cluster 1126 
## Done cluster 2552 
## Cluster size 1896 broken into 1285 611 
## Done cluster 1285 
## Done cluster 611 
## Done cluster 1896 
## Done cluster 4448 
## Cluster size 1997 broken into 1246 751 
## Done cluster 1246 
## Done cluster 751 
## Done cluster 1997 
## Done cluster 6445 
## Done cluster 1234 
## Done cluster 7679 
## Done cluster 461 
## Done cluster 8140 
## Done cluster 102

## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## boundary (singular) fit: see help('isSingular')
## Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`Joining with `by = join_by(specimen_id)`

rnaseq_normalised_data = list(
      
      metadata = rnaseq_metaData,
      raw_data = rnaseq_countData_v3,
      batchCorrected_data = rnaseq_batchCorrected
    )

Save normalized data

master_normalized_data <- list(
  
  subject_specimen = training_subject_specimen,
  abtiter_wide = abtiter_data_processed,
  plasma_cytokine_concentrations = olink_data_processed,
  pbmc_cell_frequency = cytof_data_processed,
  pbmc_gene_expression = rnaseq_normalised_data
  
)

sapply(master_normalized_data, dim)

## $subject_specimen
## [1] 729  14
## 
## $abtiter_wide
## NULL
## 
## $plasma_cytokine_concentrations
## NULL
## 
## $pbmc_cell_frequency
## NULL
## 
## $pbmc_gene_expression
## NULL

Save data as RDS File

saveRDS(master_normalized_data, file = paste0(dir_rds_objects, "master_processed_training_data.RDS"))

#sessioninfo::session_info()