CMI-PB 2nd challenge: Process prediction datasets similar to training dataset

Download all data files

We manually downloaded data files manually from the following repositories: 1) Raw prediction datasets: CMI-PB https site. 2) Processed training datasets for ID mapping: CMI-PB https site.

Alternatively, RCurl package can be used to download data files.

#base_dir = "/home/pramod/Documents/GitHub/second-challenge-prediction-dataset-preparation/"
base_dir = "../"


## `codebase.R` installs required packages and all house keepinf functions
source(paste0(base_dir, "scripts/codebase.R"))


dir_data = paste0(base_dir, "data/")
dir_raw_prediction <- paste0(base_dir, "data/raw_datasets/prediction/")
dir_processed_prediction <- paste0(base_dir, "data/processed_datasets/prediction/")

Read data

Read the training data object to associate each feature with its corresponding ID

master_processed_training_data_obj = read_rds(paste0(dir_data, "master_processed_training_data.RDS"))

features_training_abtiter = rownames(master_processed_training_data_obj$abtiter_wide$batchCorrected_data)
features_training_pbmc_gene_expression = rownames(master_processed_training_data_obj$pbmc_gene_expression$batchCorrected_data)
features_training_plasma_cytokine_concentrations = rownames(master_processed_training_data_obj$plasma_cytokine_concentrations$batchCorrected_data)
features_training_pbmc_cell_frequency = rownames(master_processed_training_data_obj$pbmc_cell_frequency$batchCorrected_data)

Read subject and specimen ID mapping inforamtion

d2022_subject <- read_tsv(paste0(dir_raw_prediction, "2022BD_subject.tsv"))

## Rows: 22 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (5): infancy_vac, biological_sex, ethnicity, race, dataset
## dbl  (1): subject_id
## date (2): year_of_birth, date_of_boost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d2022_specimen <- read_tsv(paste0(dir_raw_prediction, "2022BD_specimen.tsv"))

## Rows: 210 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): specimen_type
## dbl (5): specimen_id, subject_id, actual_day_relative_to_boost, planned_day_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## create new object subject_specimen
d2022_subject_specimen <- d2022_specimen %>%
  left_join(d2022_subject) %>%
  mutate(timepoint = planned_day_relative_to_boost)

## Joining with `by = join_by(subject_id)`

Read experimental data: four assays

d2022_pbmc_abtiter <- read_tsv(paste0(dir_raw_prediction, "2022BD_plasma_ab_titer.tsv"))

## Rows: 2170 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): isotype, antigen, unit
## dbl (4): specimen_id, MFI, MFI_normalised, lower_limit_of_detection
## lgl (1): is_antigen_specific
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d2022_pbmc_cell <- read_tsv(paste0(dir_raw_prediction, "2022BD_pbmc_cell_frequency.tsv"))

## Rows: 3100 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): cell_type_name
## dbl (2): specimen_id, percent_live_cell
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d2022_pbmc_gene <- read_tsv(paste0(dir_raw_prediction, "2022BD_pbmc_gene_expression.tsv"))

## Rows: 3673026 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): versioned_ensembl_gene_id
## dbl (3): specimen_id, raw_count, tpm
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d2022_plasma_cytokine <- read_tsv(paste0(dir_raw_prediction, "2022BD_plasma_cytokine_concentration.tsv"))

## Rows: 2520 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): protein_id, quality_control, unit
## dbl (4): specimen_id, lower_limit_of_quantitation, protein_expression, upper...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Process datasets

abtiter data:

Identify feature overlaps between the prediction dataset and the training abtiter dataset; reformat the prediction abtiter data to align with the training data structure.

#head(master_processed_training_data_obj$abtiter_wide$batchCorrected_data)

d2022_pbmc_abtiter_v1 = d2022_pbmc_abtiter %>%
  mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
  filter(isotype_antigen %in% features_training_abtiter) %>%
  dplyr::select(specimen_id, isotype_antigen, MFI_normalised) %>%
  pivot_wider(names_from = specimen_id, values_from = MFI_normalised) 

d2022_pbmc_abtiter_v1_mat = d2022_pbmc_abtiter_v1 %>%
  column_to_rownames("isotype_antigen")

## Check training Vs prediction dataset dimentions
all(rownames(d2022_pbmc_abtiter_v1_mat) %in% rownames(master_processed_training_data_obj$abtiter_wide$batchCorrected_data))

## [1] TRUE

Gene expression data:

Identify feature overlaps between the prediction dataset and the training gene expression dataset; reformat the prediction gene expression data to align with the training data structure.

#head(master_processed_training_data_obj$pbmc_gene_expression$batchCorrected_data)

d2022_pbmc_gene_v1 = d2022_pbmc_gene %>%
  filter(versioned_ensembl_gene_id %in% features_training_pbmc_gene_expression) %>% ## Filter
  dplyr::select(specimen_id, versioned_ensembl_gene_id, tpm) %>%
  pivot_wider(names_from = specimen_id, values_from = tpm) 

d2022_pbmc_gene_v1_mat = d2022_pbmc_gene_v1 %>%
  column_to_rownames("versioned_ensembl_gene_id")

## Check training Vs prediction dataset dimentions
all(rownames(d2022_pbmc_gene_v1_mat) %in% rownames(master_processed_training_data_obj$pbmc_gene_expression$batchCorrected_data))

## [1] TRUE

Cell Frequency Analysis:

Identify the median at baseline (day 0) in a manner similar to the training dataset. Then, identify feature overlaps between the prediction dataset and the training cell frequency dataset; subsequently, reformat the prediction cell frequency data to align with the training data structure.

## Perform median normalization
cell_median_D0 <- d2022_pbmc_cell %>%
    left_join(d2022_subject_specimen) %>%
    filter(specimen_id %in% unique(d2022_subject_specimen[d2022_subject_specimen$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
    group_by(dataset, cell_type_name)  %>%
    summarise(median = median(percent_live_cell, na.rm = T))

## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.

## Check if all cell_types have benn captured 
all(cell_median_D0$cell_type_name %in% d2022_pbmc_cell$cell_type_name)

## [1] TRUE

d2022_pbmc_cell_normalized_pre <-  d2022_pbmc_cell  %>%
    left_join(d2022_subject_specimen) %>%
    left_join(cell_median_D0) %>%
    mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, NA, percent_live_cell/median))

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(cell_type_name, dataset)`

## Reshape dataframe in wide format
d2022_pbmc_cell_normalized_pre_v1 <- d2022_pbmc_cell_normalized_pre  %>%
    filter(cell_type_name %in% features_training_pbmc_cell_frequency) %>% ## Filter
    dplyr::select(cell_type_name, specimen_id, percent_live_cell_normalized) %>%
    pivot_wider(names_from = "cell_type_name", values_from = percent_live_cell_normalized) 

d2022_pbmc_cell_normalized_pre_v1_mat = d2022_pbmc_cell_normalized_pre_v1 %>%
    column_to_rownames("specimen_id")%>%
    t() 

## Check training Vs prediction dataset dimentions
all(rownames(d2022_pbmc_cell_normalized_pre_v1_mat) %in% rownames(master_processed_training_data_obj$pbmc_cell_frequency$batchCorrected_data))

## [1] TRUE

## Note: This step is optional and should be selected according to the specific needs of the modeling process. For the preparation of the training dataset, KNN imputation was utilized.

#d2022_pbmc_cell_normalized_pre_v1_mat_imputed = d2022_pbmc_cell_normalized_pre_v1_mat[rowMeans(is.na(d2022_pbmc_cell_normalized_pre_v1_mat)) < 1, ] %>%
#    as.matrix() %>%
#    impute.knn() %>%
#    .$data

Plasma Cytokine Analysis:

Identify the median at baseline (day 0) in a manner similar to the training dataset. Then, identify feature overlaps between the prediction dataset and the training plasma cytokine dataset; subsequently, reformat the prediction plasma cytokine data to align with the training data structure.

## Perform median normalization
cytokine_median_D0 <- d2022_plasma_cytokine %>%
    left_join(d2022_subject_specimen) %>%
    filter(specimen_id %in% unique(d2022_subject_specimen[d2022_subject_specimen$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
    group_by(dataset, protein_id)  %>%
    summarise(median = median(protein_expression, na.rm = T))

## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.

d2022_plasma_cytokine_normalized_pre <-  d2022_plasma_cytokine  %>%
    left_join(d2022_subject_specimen) %>%
    left_join(cytokine_median_D0) %>%
    mutate(protein_expression_normalized = if_else(is.na(protein_expression) == T, NA, protein_expression/median))

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(protein_id, dataset)`

## Reshape dataframe in wide format
d2022_plasma_cytokine_normalized_pre_v1 <- d2022_plasma_cytokine_normalized_pre  %>%
    filter(protein_id %in% features_training_plasma_cytokine_concentrations) %>% ## Filter
    dplyr::select(protein_id, specimen_id, protein_expression_normalized) %>%
    pivot_wider(names_from = "protein_id", values_from = protein_expression_normalized)

d2022_plasma_cytokine_normalized_pre_v1_mat <- d2022_plasma_cytokine_normalized_pre_v1  %>%
    column_to_rownames("specimen_id")%>%
    t() 

## Check training Vs prediction dataset dimentions
all(rownames(d2022_plasma_cytokine_normalized_pre_v1_mat) %in% rownames(master_processed_training_data_obj$plasma_cytokine_concentrations$batchCorrected_data))

## [1] TRUE

## Note: This step is optional and should be selected according to the specific needs of the modeling process. For the preparation of the training dataset, KNN imputation was utilized.

#d2022_plasma_cytokine_normalized_pre_v1_mat_imputed = d2022_plasma_cytokine_normalized_pre_v1_mat[rowMeans(is.na(d2022_plasma_cytokine_normalized_pre_v1_mat)) < 1, ] %>%
#    as.matrix() %>%
#    impute.knn() %>%
#    .$data

Save normalized data

Prepare the RDS object

master_processed_prediction_data <- list(
  
  subject_specimen = d2022_subject_specimen,
  abtiter = list(
              raw_from_database = d2022_pbmc_abtiter,
              processed_similar_to_training = d2022_pbmc_abtiter_v1_mat
          ),
  plasma_cytokine_concentrations = list(
              raw_from_database = d2022_plasma_cytokine,
              processed_similar_to_training = d2022_plasma_cytokine_normalized_pre_v1_mat
          ),
  pbmc_cell_frequency = list(
              raw_from_database = d2022_pbmc_cell,
              processed_similar_to_training = d2022_pbmc_cell_normalized_pre_v1_mat
          ),
  pbmc_gene_expression = list(
              raw_from_database = d2022_pbmc_gene,
              processed_similar_to_training = d2022_pbmc_gene_v1_mat
          )
  
)

Save the data in both RDS format and as individual TSV files for versatile use and accessibility

Note: Uncomment this section when you are ready to save the files.

## Save RDS object

#saveRDS(master_processed_prediction_data, file = paste0(dir_processed_prediction, "master_processed_prediction_data.RDS"))

## Save individual data files

#write_tsv(d2022_subject_specimen, paste0(dir_processed_prediction, "subject_specimen.tsv"))
#write.table(d2022_pbmc_abtiter_v1_mat, 
#                file = paste0(dir_processed_prediction, "abtiter_processed_data.tsv"),  
#                sep = "\t", row.names = TRUE, quote = FALSE
#            )

#write.table(d2022_plasma_cytokine_normalized_pre_v1_mat, 
#                file = paste0(dir_processed_prediction, "plasma_cytokine_concentrations_processed_data.tsv"),  
#                sep = "\t", row.names = TRUE, quote = FALSE
#            )

#write.table(d2022_pbmc_cell_normalized_pre_v1_mat, 
#                file = paste0(dir_processed_prediction, "pbmc_cell_frequency_processed_data.tsv"),  
#                sep = "\t", row.names = TRUE, quote = FALSE
#            )

#write.table(d2022_pbmc_gene_v1_mat, 
#                file = paste0(dir_processed_prediction, "pbmc_gene_expression_processed_data.tsv"),  
#                sep = "\t", row.names = TRUE, quote = FALSE
#            )

session_info()

sessioninfo::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31)
##  os       Ubuntu 20.04.6 LTS
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/Los_Angeles
##  date     2024-01-05
##  pandoc   2.18 @ /usr/lib/rstudio/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version date (UTC) lib source
##  abind         1.4-5   2016-07-21 [1] CRAN (R 4.3.2)
##  backports     1.4.1   2021-12-13 [1] CRAN (R 4.3.2)
##  base64enc     0.1-3   2015-07-28 [1] CRAN (R 4.3.2)
##  BiocManager * 1.30.22 2023-08-08 [1] CRAN (R 4.3.2)
##  BiocStyle   * 2.30.0  2023-10-24 [1] Bioconductor
##  bit           4.0.5   2022-11-15 [1] CRAN (R 4.3.2)
##  bit64         4.0.5   2020-08-30 [1] CRAN (R 4.3.2)
##  bookdown      0.36    2023-10-16 [1] CRAN (R 4.3.2)
##  broom         1.0.5   2023-06-09 [1] CRAN (R 4.3.2)
##  bslib         0.5.1   2023-08-11 [1] CRAN (R 4.3.2)
##  cachem        1.0.8   2023-05-01 [1] CRAN (R 4.3.2)
##  callr         3.7.3   2022-11-02 [1] CRAN (R 4.3.2)
##  car           3.1-2   2023-03-30 [1] CRAN (R 4.3.2)
##  carData       3.0-5   2022-01-06 [1] CRAN (R 4.3.2)
##  checkmate     2.3.0   2023-10-25 [1] CRAN (R 4.3.2)
##  cli           3.6.1   2023-03-23 [1] CRAN (R 4.3.2)
##  cluster       2.1.4   2022-08-22 [4] CRAN (R 4.2.1)
##  colorspace    2.1-0   2023-01-23 [1] CRAN (R 4.3.2)
##  corrplot    * 0.92    2021-11-18 [1] CRAN (R 4.3.2)
##  crayon        1.5.2   2022-09-29 [1] CRAN (R 4.3.2)
##  data.table    1.14.8  2023-02-17 [1] CRAN (R 4.3.2)
##  devtools    * 2.4.5   2022-10-11 [1] CRAN (R 4.3.2)
##  digest        0.6.33  2023-07-07 [1] CRAN (R 4.3.2)
##  dplyr       * 1.1.3   2023-09-03 [1] CRAN (R 4.3.2)
##  ellipsis      0.3.2   2021-04-29 [1] CRAN (R 4.3.2)
##  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.2)
##  fansi         1.0.5   2023-10-08 [1] CRAN (R 4.3.2)
##  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.2)
##  forcats     * 1.0.0   2023-01-29 [1] CRAN (R 4.3.2)
##  foreign       0.8-85  2023-09-09 [4] CRAN (R 4.3.1)
##  Formula       1.2-5   2023-02-24 [1] CRAN (R 4.3.2)
##  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.2)
##  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.2)
##  ggplot2     * 3.4.4   2023-10-12 [1] CRAN (R 4.3.2)
##  ggpubr      * 0.6.0   2023-02-10 [1] CRAN (R 4.3.2)
##  ggsignif      0.6.4   2022-10-13 [1] CRAN (R 4.3.2)
##  glue          1.6.2   2022-02-24 [1] CRAN (R 4.3.2)
##  gridExtra     2.3     2017-09-09 [1] CRAN (R 4.3.2)
##  gtable        0.3.4   2023-08-21 [1] CRAN (R 4.3.2)
##  Hmisc       * 5.1-1   2023-09-12 [1] CRAN (R 4.3.2)
##  hms           1.1.3   2023-03-21 [1] CRAN (R 4.3.2)
##  htmlTable     2.4.2   2023-10-29 [1] CRAN (R 4.3.2)
##  htmltools     0.5.7   2023-11-03 [1] CRAN (R 4.3.2)
##  htmlwidgets   1.6.2   2023-03-17 [1] CRAN (R 4.3.2)
##  httpuv        1.6.12  2023-10-23 [1] CRAN (R 4.3.2)
##  impute      * 1.76.0  2023-10-24 [1] Bioconductor
##  jquerylib     0.1.4   2021-04-26 [1] CRAN (R 4.3.2)
##  jsonlite      1.8.7   2023-06-29 [1] CRAN (R 4.3.2)
##  knitr         1.45    2023-10-30 [1] CRAN (R 4.3.2)
##  later         1.3.1   2023-05-02 [1] CRAN (R 4.3.2)
##  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.2)
##  lubridate   * 1.9.3   2023-09-27 [1] CRAN (R 4.3.2)
##  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.2)
##  memoise       2.0.1   2021-11-26 [1] CRAN (R 4.3.2)
##  mime          0.12    2021-09-28 [1] CRAN (R 4.3.2)
##  miniUI        0.1.1.1 2018-05-18 [1] CRAN (R 4.3.2)
##  munsell       0.5.0   2018-06-12 [1] CRAN (R 4.3.2)
##  nnet          7.3-19  2023-05-03 [4] CRAN (R 4.3.1)
##  pacman      * 0.5.1   2019-03-11 [1] CRAN (R 4.3.2)
##  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.2)
##  pkgbuild      1.4.2   2023-06-26 [1] CRAN (R 4.3.2)
##  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.2)
##  pkgload       1.3.3   2023-09-22 [1] CRAN (R 4.3.2)
##  prettyunits   1.2.0   2023-09-24 [1] CRAN (R 4.3.2)
##  processx      3.8.2   2023-06-30 [1] CRAN (R 4.3.2)
##  profvis       0.3.8   2023-05-02 [1] CRAN (R 4.3.2)
##  promises      1.2.1   2023-08-10 [1] CRAN (R 4.3.2)
##  ps            1.7.5   2023-04-18 [1] CRAN (R 4.3.2)
##  purrr       * 1.0.2   2023-08-10 [1] CRAN (R 4.3.2)
##  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.2)
##  Rcpp          1.0.11  2023-07-06 [1] CRAN (R 4.3.2)
##  readr       * 2.1.4   2023-02-10 [1] CRAN (R 4.3.2)
##  remotes       2.4.2.1 2023-07-18 [1] CRAN (R 4.3.2)
##  rlang         1.1.2   2023-11-04 [1] CRAN (R 4.3.2)
##  rmarkdown     2.25    2023-09-18 [1] CRAN (R 4.3.2)
##  rpart         4.1.21  2023-10-09 [4] CRAN (R 4.3.1)
##  rstatix       0.7.2   2023-02-01 [1] CRAN (R 4.3.2)
##  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.2)
##  sass          0.4.7   2023-07-15 [1] CRAN (R 4.3.2)
##  scales        1.2.1   2022-08-20 [1] CRAN (R 4.3.2)
##  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.2)
##  shiny         1.7.5.1 2023-10-14 [1] CRAN (R 4.3.2)
##  stringi       1.8.1   2023-11-13 [1] CRAN (R 4.3.2)
##  stringr     * 1.5.0   2022-12-02 [1] CRAN (R 4.3.2)
##  tibble      * 3.2.1   2023-03-20 [1] CRAN (R 4.3.2)
##  tidyr       * 1.3.0   2023-01-24 [1] CRAN (R 4.3.2)
##  tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.3.2)
##  tidyverse   * 2.0.0   2023-02-22 [1] CRAN (R 4.3.2)
##  timechange    0.2.0   2023-01-11 [1] CRAN (R 4.3.2)
##  tzdb          0.4.0   2023-05-12 [1] CRAN (R 4.3.2)
##  urlchecker    1.0.1   2021-11-30 [1] CRAN (R 4.3.2)
##  usethis     * 2.2.2   2023-07-06 [1] CRAN (R 4.3.2)
##  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.2)
##  vctrs         0.6.4   2023-10-12 [1] CRAN (R 4.3.2)
##  vroom         1.6.4   2023-10-02 [1] CRAN (R 4.3.2)
##  withr         2.5.2   2023-10-30 [1] CRAN (R 4.3.2)
##  xfun          0.41    2023-11-01 [1] CRAN (R 4.3.2)
##  xtable        1.8-4   2019-04-21 [1] CRAN (R 4.3.2)
##  yaml          2.3.7   2023-01-23 [1] CRAN (R 4.3.2)
## 
##  [1] /home/pramod/R/x86_64-pc-linux-gnu-library/4.3
##  [2] /usr/local/lib/R/site-library
##  [3] /usr/lib/R/site-library
##  [4] /usr/lib/R/library
## 
## ──────────────────────────────────────────────────────────────────────────────