We manually downloaded data files manually from the following repositories: 1) Raw prediction datasets: CMI-PB https site. 2) Processed training datasets for ID mapping: CMI-PB https site.
Alternatively, RCurl package can be used to download data files.
#base_dir = "/home/pramod/Documents/GitHub/second-challenge-prediction-dataset-preparation/"
base_dir = "../"
## `codebase.R` installs required packages and all house keepinf functions
source(paste0(base_dir, "scripts/codebase.R"))
dir_data = paste0(base_dir, "data/")
dir_raw_prediction <- paste0(base_dir, "data/raw_datasets/prediction/")
dir_processed_prediction <- paste0(base_dir, "data/processed_datasets/prediction/")
master_processed_training_data_obj = read_rds(paste0(dir_data, "master_processed_training_data.RDS"))
features_training_abtiter = rownames(master_processed_training_data_obj$abtiter_wide$batchCorrected_data)
features_training_pbmc_gene_expression = rownames(master_processed_training_data_obj$pbmc_gene_expression$batchCorrected_data)
features_training_plasma_cytokine_concentrations = rownames(master_processed_training_data_obj$plasma_cytokine_concentrations$batchCorrected_data)
features_training_pbmc_cell_frequency = rownames(master_processed_training_data_obj$pbmc_cell_frequency$batchCorrected_data)
d2022_subject <- read_tsv(paste0(dir_raw_prediction, "2022BD_subject.tsv"))
## Rows: 22 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): infancy_vac, biological_sex, ethnicity, race, dataset
## dbl (1): subject_id
## date (2): year_of_birth, date_of_boost
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2022_specimen <- read_tsv(paste0(dir_raw_prediction, "2022BD_specimen.tsv"))
## Rows: 210 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): specimen_type
## dbl (5): specimen_id, subject_id, actual_day_relative_to_boost, planned_day_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## create new object subject_specimen
d2022_subject_specimen <- d2022_specimen %>%
left_join(d2022_subject) %>%
mutate(timepoint = planned_day_relative_to_boost)
## Joining with `by = join_by(subject_id)`
d2022_pbmc_abtiter <- read_tsv(paste0(dir_raw_prediction, "2022BD_plasma_ab_titer.tsv"))
## Rows: 2170 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): isotype, antigen, unit
## dbl (4): specimen_id, MFI, MFI_normalised, lower_limit_of_detection
## lgl (1): is_antigen_specific
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2022_pbmc_cell <- read_tsv(paste0(dir_raw_prediction, "2022BD_pbmc_cell_frequency.tsv"))
## Rows: 3100 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): cell_type_name
## dbl (2): specimen_id, percent_live_cell
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2022_pbmc_gene <- read_tsv(paste0(dir_raw_prediction, "2022BD_pbmc_gene_expression.tsv"))
## Rows: 3673026 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): versioned_ensembl_gene_id
## dbl (3): specimen_id, raw_count, tpm
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d2022_plasma_cytokine <- read_tsv(paste0(dir_raw_prediction, "2022BD_plasma_cytokine_concentration.tsv"))
## Rows: 2520 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): protein_id, quality_control, unit
## dbl (4): specimen_id, lower_limit_of_quantitation, protein_expression, upper...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Identify feature overlaps between the prediction dataset and the training abtiter dataset; reformat the prediction abtiter data to align with the training data structure.
#head(master_processed_training_data_obj$abtiter_wide$batchCorrected_data)
d2022_pbmc_abtiter_v1 = d2022_pbmc_abtiter %>%
mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
filter(isotype_antigen %in% features_training_abtiter) %>%
dplyr::select(specimen_id, isotype_antigen, MFI_normalised) %>%
pivot_wider(names_from = specimen_id, values_from = MFI_normalised)
d2022_pbmc_abtiter_v1_mat = d2022_pbmc_abtiter_v1 %>%
column_to_rownames("isotype_antigen")
## Check training Vs prediction dataset dimentions
all(rownames(d2022_pbmc_abtiter_v1_mat) %in% rownames(master_processed_training_data_obj$abtiter_wide$batchCorrected_data))
## [1] TRUE
Identify feature overlaps between the prediction dataset and the training gene expression dataset; reformat the prediction gene expression data to align with the training data structure.
#head(master_processed_training_data_obj$pbmc_gene_expression$batchCorrected_data)
d2022_pbmc_gene_v1 = d2022_pbmc_gene %>%
filter(versioned_ensembl_gene_id %in% features_training_pbmc_gene_expression) %>% ## Filter
dplyr::select(specimen_id, versioned_ensembl_gene_id, tpm) %>%
pivot_wider(names_from = specimen_id, values_from = tpm)
d2022_pbmc_gene_v1_mat = d2022_pbmc_gene_v1 %>%
column_to_rownames("versioned_ensembl_gene_id")
## Check training Vs prediction dataset dimentions
all(rownames(d2022_pbmc_gene_v1_mat) %in% rownames(master_processed_training_data_obj$pbmc_gene_expression$batchCorrected_data))
## [1] TRUE
Identify the median at baseline (day 0) in a manner similar to the training dataset. Then, identify feature overlaps between the prediction dataset and the training cell frequency dataset; subsequently, reformat the prediction cell frequency data to align with the training data structure.
## Perform median normalization
cell_median_D0 <- d2022_pbmc_cell %>%
left_join(d2022_subject_specimen) %>%
filter(specimen_id %in% unique(d2022_subject_specimen[d2022_subject_specimen$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
group_by(dataset, cell_type_name) %>%
summarise(median = median(percent_live_cell, na.rm = T))
## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
## Check if all cell_types have benn captured
all(cell_median_D0$cell_type_name %in% d2022_pbmc_cell$cell_type_name)
## [1] TRUE
d2022_pbmc_cell_normalized_pre <- d2022_pbmc_cell %>%
left_join(d2022_subject_specimen) %>%
left_join(cell_median_D0) %>%
mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, NA, percent_live_cell/median))
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(cell_type_name, dataset)`
## Reshape dataframe in wide format
d2022_pbmc_cell_normalized_pre_v1 <- d2022_pbmc_cell_normalized_pre %>%
filter(cell_type_name %in% features_training_pbmc_cell_frequency) %>% ## Filter
dplyr::select(cell_type_name, specimen_id, percent_live_cell_normalized) %>%
pivot_wider(names_from = "cell_type_name", values_from = percent_live_cell_normalized)
d2022_pbmc_cell_normalized_pre_v1_mat = d2022_pbmc_cell_normalized_pre_v1 %>%
column_to_rownames("specimen_id")%>%
t()
## Check training Vs prediction dataset dimentions
all(rownames(d2022_pbmc_cell_normalized_pre_v1_mat) %in% rownames(master_processed_training_data_obj$pbmc_cell_frequency$batchCorrected_data))
## [1] TRUE
## Note: This step is optional and should be selected according to the specific needs of the modeling process. For the preparation of the training dataset, KNN imputation was utilized.
#d2022_pbmc_cell_normalized_pre_v1_mat_imputed = d2022_pbmc_cell_normalized_pre_v1_mat[rowMeans(is.na(d2022_pbmc_cell_normalized_pre_v1_mat)) < 1, ] %>%
# as.matrix() %>%
# impute.knn() %>%
# .$data
Identify the median at baseline (day 0) in a manner similar to the training dataset. Then, identify feature overlaps between the prediction dataset and the training plasma cytokine dataset; subsequently, reformat the prediction plasma cytokine data to align with the training data structure.
## Perform median normalization
cytokine_median_D0 <- d2022_plasma_cytokine %>%
left_join(d2022_subject_specimen) %>%
filter(specimen_id %in% unique(d2022_subject_specimen[d2022_subject_specimen$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
group_by(dataset, protein_id) %>%
summarise(median = median(protein_expression, na.rm = T))
## Joining with `by = join_by(specimen_id)`
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
d2022_plasma_cytokine_normalized_pre <- d2022_plasma_cytokine %>%
left_join(d2022_subject_specimen) %>%
left_join(cytokine_median_D0) %>%
mutate(protein_expression_normalized = if_else(is.na(protein_expression) == T, NA, protein_expression/median))
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(protein_id, dataset)`
## Reshape dataframe in wide format
d2022_plasma_cytokine_normalized_pre_v1 <- d2022_plasma_cytokine_normalized_pre %>%
filter(protein_id %in% features_training_plasma_cytokine_concentrations) %>% ## Filter
dplyr::select(protein_id, specimen_id, protein_expression_normalized) %>%
pivot_wider(names_from = "protein_id", values_from = protein_expression_normalized)
d2022_plasma_cytokine_normalized_pre_v1_mat <- d2022_plasma_cytokine_normalized_pre_v1 %>%
column_to_rownames("specimen_id")%>%
t()
## Check training Vs prediction dataset dimentions
all(rownames(d2022_plasma_cytokine_normalized_pre_v1_mat) %in% rownames(master_processed_training_data_obj$plasma_cytokine_concentrations$batchCorrected_data))
## [1] TRUE
## Note: This step is optional and should be selected according to the specific needs of the modeling process. For the preparation of the training dataset, KNN imputation was utilized.
#d2022_plasma_cytokine_normalized_pre_v1_mat_imputed = d2022_plasma_cytokine_normalized_pre_v1_mat[rowMeans(is.na(d2022_plasma_cytokine_normalized_pre_v1_mat)) < 1, ] %>%
# as.matrix() %>%
# impute.knn() %>%
# .$data
master_processed_prediction_data <- list(
subject_specimen = d2022_subject_specimen,
abtiter = list(
raw_from_database = d2022_pbmc_abtiter,
processed_similar_to_training = d2022_pbmc_abtiter_v1_mat
),
plasma_cytokine_concentrations = list(
raw_from_database = d2022_plasma_cytokine,
processed_similar_to_training = d2022_plasma_cytokine_normalized_pre_v1_mat
),
pbmc_cell_frequency = list(
raw_from_database = d2022_pbmc_cell,
processed_similar_to_training = d2022_pbmc_cell_normalized_pre_v1_mat
),
pbmc_gene_expression = list(
raw_from_database = d2022_pbmc_gene,
processed_similar_to_training = d2022_pbmc_gene_v1_mat
)
)
## Save RDS object
#saveRDS(master_processed_prediction_data, file = paste0(dir_processed_prediction, "master_processed_prediction_data.RDS"))
## Save individual data files
#write_tsv(d2022_subject_specimen, paste0(dir_processed_prediction, "subject_specimen.tsv"))
#write.table(d2022_pbmc_abtiter_v1_mat,
# file = paste0(dir_processed_prediction, "abtiter_processed_data.tsv"),
# sep = "\t", row.names = TRUE, quote = FALSE
# )
#write.table(d2022_plasma_cytokine_normalized_pre_v1_mat,
# file = paste0(dir_processed_prediction, "plasma_cytokine_concentrations_processed_data.tsv"),
# sep = "\t", row.names = TRUE, quote = FALSE
# )
#write.table(d2022_pbmc_cell_normalized_pre_v1_mat,
# file = paste0(dir_processed_prediction, "pbmc_cell_frequency_processed_data.tsv"),
# sep = "\t", row.names = TRUE, quote = FALSE
# )
#write.table(d2022_pbmc_gene_v1_mat,
# file = paste0(dir_processed_prediction, "pbmc_gene_expression_processed_data.tsv"),
# sep = "\t", row.names = TRUE, quote = FALSE
# )
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os Ubuntu 20.04.6 LTS
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Los_Angeles
## date 2024-01-05
## pandoc 2.18 @ /usr/lib/rstudio/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 4.3.2)
## backports 1.4.1 2021-12-13 [1] CRAN (R 4.3.2)
## base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.3.2)
## BiocManager * 1.30.22 2023-08-08 [1] CRAN (R 4.3.2)
## BiocStyle * 2.30.0 2023-10-24 [1] Bioconductor
## bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.2)
## bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.2)
## bookdown 0.36 2023-10-16 [1] CRAN (R 4.3.2)
## broom 1.0.5 2023-06-09 [1] CRAN (R 4.3.2)
## bslib 0.5.1 2023-08-11 [1] CRAN (R 4.3.2)
## cachem 1.0.8 2023-05-01 [1] CRAN (R 4.3.2)
## callr 3.7.3 2022-11-02 [1] CRAN (R 4.3.2)
## car 3.1-2 2023-03-30 [1] CRAN (R 4.3.2)
## carData 3.0-5 2022-01-06 [1] CRAN (R 4.3.2)
## checkmate 2.3.0 2023-10-25 [1] CRAN (R 4.3.2)
## cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.2)
## cluster 2.1.4 2022-08-22 [4] CRAN (R 4.2.1)
## colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.2)
## corrplot * 0.92 2021-11-18 [1] CRAN (R 4.3.2)
## crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.2)
## data.table 1.14.8 2023-02-17 [1] CRAN (R 4.3.2)
## devtools * 2.4.5 2022-10-11 [1] CRAN (R 4.3.2)
## digest 0.6.33 2023-07-07 [1] CRAN (R 4.3.2)
## dplyr * 1.1.3 2023-09-03 [1] CRAN (R 4.3.2)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.3.2)
## evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.2)
## fansi 1.0.5 2023-10-08 [1] CRAN (R 4.3.2)
## fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.2)
## foreign 0.8-85 2023-09-09 [4] CRAN (R 4.3.1)
## Formula 1.2-5 2023-02-24 [1] CRAN (R 4.3.2)
## fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.2)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.2)
## ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.3.2)
## ggpubr * 0.6.0 2023-02-10 [1] CRAN (R 4.3.2)
## ggsignif 0.6.4 2022-10-13 [1] CRAN (R 4.3.2)
## glue 1.6.2 2022-02-24 [1] CRAN (R 4.3.2)
## gridExtra 2.3 2017-09-09 [1] CRAN (R 4.3.2)
## gtable 0.3.4 2023-08-21 [1] CRAN (R 4.3.2)
## Hmisc * 5.1-1 2023-09-12 [1] CRAN (R 4.3.2)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.2)
## htmlTable 2.4.2 2023-10-29 [1] CRAN (R 4.3.2)
## htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.2)
## htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.2)
## httpuv 1.6.12 2023-10-23 [1] CRAN (R 4.3.2)
## impute * 1.76.0 2023-10-24 [1] Bioconductor
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.3.2)
## jsonlite 1.8.7 2023-06-29 [1] CRAN (R 4.3.2)
## knitr 1.45 2023-10-30 [1] CRAN (R 4.3.2)
## later 1.3.1 2023-05-02 [1] CRAN (R 4.3.2)
## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.2)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.2)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.3.2)
## mime 0.12 2021-09-28 [1] CRAN (R 4.3.2)
## miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.3.2)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.2)
## nnet 7.3-19 2023-05-03 [4] CRAN (R 4.3.1)
## pacman * 0.5.1 2019-03-11 [1] CRAN (R 4.3.2)
## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.2)
## pkgbuild 1.4.2 2023-06-26 [1] CRAN (R 4.3.2)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.2)
## pkgload 1.3.3 2023-09-22 [1] CRAN (R 4.3.2)
## prettyunits 1.2.0 2023-09-24 [1] CRAN (R 4.3.2)
## processx 3.8.2 2023-06-30 [1] CRAN (R 4.3.2)
## profvis 0.3.8 2023-05-02 [1] CRAN (R 4.3.2)
## promises 1.2.1 2023-08-10 [1] CRAN (R 4.3.2)
## ps 1.7.5 2023-04-18 [1] CRAN (R 4.3.2)
## purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.2)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.2)
## Rcpp 1.0.11 2023-07-06 [1] CRAN (R 4.3.2)
## readr * 2.1.4 2023-02-10 [1] CRAN (R 4.3.2)
## remotes 2.4.2.1 2023-07-18 [1] CRAN (R 4.3.2)
## rlang 1.1.2 2023-11-04 [1] CRAN (R 4.3.2)
## rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.2)
## rpart 4.1.21 2023-10-09 [4] CRAN (R 4.3.1)
## rstatix 0.7.2 2023-02-01 [1] CRAN (R 4.3.2)
## rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.2)
## sass 0.4.7 2023-07-15 [1] CRAN (R 4.3.2)
## scales 1.2.1 2022-08-20 [1] CRAN (R 4.3.2)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.2)
## shiny 1.7.5.1 2023-10-14 [1] CRAN (R 4.3.2)
## stringi 1.8.1 2023-11-13 [1] CRAN (R 4.3.2)
## stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.3.2)
## tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.2)
## tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.3.2)
## tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.2)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.2)
## timechange 0.2.0 2023-01-11 [1] CRAN (R 4.3.2)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.2)
## urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.3.2)
## usethis * 2.2.2 2023-07-06 [1] CRAN (R 4.3.2)
## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.2)
## vctrs 0.6.4 2023-10-12 [1] CRAN (R 4.3.2)
## vroom 1.6.4 2023-10-02 [1] CRAN (R 4.3.2)
## withr 2.5.2 2023-10-30 [1] CRAN (R 4.3.2)
## xfun 0.41 2023-11-01 [1] CRAN (R 4.3.2)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.3.2)
## yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.2)
##
## [1] /home/pramod/R/x86_64-pc-linux-gnu-library/4.3
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library
##
## ──────────────────────────────────────────────────────────────────────────────