Subject and specimen data
subject_2020 <- read_tsv(paste0(base_data_dir, "2020LD_subject.tsv"), show_col_types = FALSE)
subject_2021 <- read_tsv(paste0(base_data_dir, "2021LD_subject.tsv"), show_col_types = FALSE)
specimen_2020 <- read_tsv(paste0(base_data_dir, "2020LD_specimen.tsv"), show_col_types = FALSE)
specimen_2021 <- read_tsv(paste0(base_data_dir, "2021LD_specimen.tsv"), show_col_types = FALSE)
metadata_2020 <- specimen_2020 %>%
left_join(subject_2020) %>%
mutate(timepoint = planned_day_relative_to_boost)
## Joining with `by = join_by(subject_id)`
metadata_2021 <- left_join(specimen_2021, subject_2021)
## Joining with `by = join_by(subject_id)`
abtiter_2021 <- read_tsv(paste0(base_data_dir, "2021LD_plasma_ab_titer.tsv")) %>%
filter(!isotype %in% "IgE") %>%
mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
select(specimen_id, isotype_antigen, MFI_normalised) %>%
pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
column_to_rownames("specimen_id")
## Rows: 8085 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): isotype, antigen, unit
## dbl (4): specimen_id, MFI, MFI_normalised, lower_limit_of_detection
## lgl (1): is_antigen_specific
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
abtiter_2020 <- read_tsv(paste0(base_data_dir, "2020LD_plasma_ab_titer.tsv")) %>%
filter(!isotype %in% "IgE") %>%
mutate(isotype_antigen = paste0(isotype, "_", antigen)) %>%
filter(isotype_antigen %in% colnames(abtiter_2021)) %>%
select(specimen_id, isotype_antigen, MFI_normalised) %>%
pivot_wider(names_from = isotype_antigen, values_from = MFI_normalised) %>%
column_to_rownames("specimen_id")
## Rows: 31520 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): isotype, antigen, unit
## dbl (4): specimen_id, MFI, MFI_normalised, lower_limit_of_detection
## lgl (1): is_antigen_specific
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
abtiter_2021 = abtiter_2021 %>%
select(colnames(abtiter_2021))
colnames(abtiter_2020) == colnames(abtiter_2021)
## Warning in colnames(abtiter_2020) == colnames(abtiter_2021): longer object
## length is not a multiple of shorter object length
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
batch.factors = c("timepoint","infancy_vac","biological_sex")
## PVCA analysis
mat_data = abtiter_2020 %>%
t()
# PVCA_now(mat_data, metadata_2020, batch.factors)
cytof_2021 <- read_tsv(paste0(base_data_dir, "2021LD_pbmc_cell_frequency.tsv")) %>%
select(specimen_id, cell_type_name, percent_live_cell) %>%
pivot_wider(names_from = cell_type_name, values_from = percent_live_cell) %>%
column_to_rownames("specimen_id")
## Rows: 8250 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): cell_type_name
## dbl (2): specimen_id, percent_live_cell
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cytof_2020 <- read_tsv(paste0(base_data_dir, "2020LD_pbmc_cell_frequency.tsv")) %>%
select(specimen_id, cell_type_name, percent_live_cell) %>%
filter(cell_type_name %in% colnames(cytof_2021)) %>%
pivot_wider(names_from = cell_type_name, values_from = percent_live_cell) %>%
column_to_rownames("specimen_id")
## Rows: 2483 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): cell_type_name
## dbl (2): specimen_id, percent_live_cell
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(cytof_2020) == colnames(cytof_2021)
## Warning in colnames(cytof_2020) == colnames(cytof_2021): longer object length
## is not a multiple of shorter object length
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE
batch.factors = c("timepoint","infancy_vac","biological_sex")
## PVCA analysis
mat_data = cytof_2020 %>%
t()
# PVCA_now(mat_data, metadata_2020, batch.factors)
olink_2021 <- read_tsv(paste0(base_data_dir, "2021LD_plasma_cytokine_concentration.tsv")) %>%
select(specimen_id, protein_id, protein_expression) %>%
pivot_wider(names_from = protein_id, values_from = protein_expression) %>%
column_to_rownames("specimen_id")
## Rows: 8100 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): protein_id, quality_control, unit
## dbl (4): specimen_id, lower_limit_of_quantitation, protein_expression, upper...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
olink_2020 <- read_tsv(paste0(base_data_dir, "2020LD_plasma_cytokine_concentration.tsv")) %>%
select(specimen_id, protein_id, protein_expression) %>%
filter(protein_id %in% colnames(olink_2021)) %>%
pivot_wider(names_from = protein_id, values_from = protein_expression) %>%
column_to_rownames("specimen_id")
## Rows: 23670 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): protein_id, quality_control, unit
## dbl (4): specimen_id, lower_limit_of_quantitation, protein_expression, upper...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(olink_2020) == colnames(olink_2021)
## Warning in colnames(olink_2020) == colnames(olink_2021): longer object length
## is not a multiple of shorter object length
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
batch.factors = c("timepoint","infancy_vac","biological_sex")
## PVCA analysis
mat_data = olink_2020 %>%
t()
# PVCA_now(mat_data, metadata_2020, batch.factors)
rnaseq_2021 <- read_tsv(paste0(base_data_dir, "2021LD_pbmc_gene_expression.tsv")) %>%
select(specimen_id, versioned_ensembl_gene_id, tpm) #%>%
## Rows: 10494360 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): versioned_ensembl_gene_id
## dbl (3): specimen_id, raw_count, tpm
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
#column_to_rownames("specimen_id")
rnaseq_2020 <- read_tsv(paste0(base_data_dir, "2020LD_pbmc_gene_expression.tsv")) %>%
select(specimen_id, versioned_ensembl_gene_id, tpm) #%>%
## Rows: 10494360 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): versioned_ensembl_gene_id
## dbl (3): specimen_id, raw_count, tpm
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
#column_to_rownames("specimen_id")
#' Create a shortlist of genes (tpm >= 1) in at least 30% of the specimens.
tpm_shortlist <- rnaseq_2020 %>%
#rownames_to_column("versioned_ensembl_gene_id") %>%
#filter(versioned_ensembl_gene_id %in% gene_90_38_shortlist$versioned_ensembl_gene_id) %>%
#pivot_longer(!versioned_ensembl_gene_id, values_to = "tpm", names_to = "specimen_id") %>%
mutate(specimen_id = as.integer(specimen_id)) %>%
#left_join(subject_specimen) %>%
group_by(versioned_ensembl_gene_id) %>%
#group_by(versioned_ensembl_gene_id, infancy_vac) %>%
summarise(proportion = mean(tpm >= 1)) %>%
filter(proportion >= 0.8)
rnaseq_2020_v1 <- rnaseq_2020 %>%
filter(versioned_ensembl_gene_id %in% tpm_shortlist$versioned_ensembl_gene_id) %>%
pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
column_to_rownames("specimen_id")
tpm_shortlist_2021 <- rnaseq_2021 %>%
#rownames_to_column("versioned_ensembl_gene_id") %>%
#filter(versioned_ensembl_gene_id %in% gene_90_38_shortlist$versioned_ensembl_gene_id) %>%
#pivot_longer(!versioned_ensembl_gene_id, values_to = "tpm", names_to = "specimen_id") %>%
mutate(specimen_id = as.integer(specimen_id)) %>%
#left_join(subject_specimen) %>%
group_by(versioned_ensembl_gene_id) %>%
#group_by(versioned_ensembl_gene_id, infancy_vac) %>%
summarise(proportion = mean(tpm >= 1)) %>%
filter(proportion >= 0.8)
rnaseq_2021_v1 <- rnaseq_2021 %>%
filter(versioned_ensembl_gene_id %in% tpm_shortlist_2021$versioned_ensembl_gene_id) %>%
pivot_wider(names_from = versioned_ensembl_gene_id, values_from = tpm) %>%
column_to_rownames("specimen_id")
batch.factors = c("timepoint","infancy_vac","biological_sex")
## PVCA analysis
mat_data = rnaseq_2020_v1 %>%
t()
# PVCA_now(mat_data, metadata_2020, batch.factors)
PVCA_now <- function(mat_data, subject_specimen, batch.factors, plot_title = "PVCA analysis"){
## test
# mat_data = olink_data_processed$wide
# subject_specimen = metadata_2020
counts_data <- mat_data[rowMeans(is.na(mat_data)) < 1, ] %>%
as.matrix() %>%
impute.knn() %>%
.$data
subject_specimen <- subject_specimen %>%
dplyr::filter(specimen_id %in% colnames(counts_data)) %>%
dplyr::select(specimen_id, timepoint, infancy_vac, biological_sex, dataset) %>%
dplyr::mutate(specimen_id1 = specimen_id) %>%
column_to_rownames("specimen_id1")
subject_specimen <- subject_specimen[colnames(counts_data), ]
phenoData = AnnotatedDataFrame(data = subject_specimen)
exprset = ExpressionSet(as.matrix(counts_data),
phenoData = phenoData)
pvcaObj_after <- pvca_run(exprset, batch.factors = batch.factors, threshold =0.6)
print(plot_pvca(pvcaObj_after, paste0(plot_title)))
#calculate principle components cell components
#pca_plots = plot_pca(counts_data, subject_specimen, paste0(plot_title))
#plot(pca_plots$PC12)
#plot(pca_plots$PC13)
#plot(pca_plots$PC23)
}
MOFA
# library(MOFA2)
# library(MOFAdata)
# library(data.table)
# library(ggplot2)
# library(tidyverse)
multiome_2020_backbone = rnaseq_2020_v1 %>%
rownames_to_column("specimen_id") %>%
select(specimen_id)
abtiter_2020_multi <- multiome_2020_backbone %>%
left_join(abtiter_2020 %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
cytof_2020_multi <- multiome_2020_backbone %>%
left_join(cytof_2020 %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
olink_2020_multi <- multiome_2020_backbone %>%
left_join(olink_2020 %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
df_make_mofa = list(
rnaseq = t(rnaseq_2020_v1),
cytof = t(cytof_2020_multi),
abtiter = t(abtiter_2020_multi),
olink = t(olink_2020_multi)
)
# lapply(df_make_mofa)
MOFAobject <- create_mofa(df_make_mofa)
## Creating MOFA object from a list of matrices (features as rows, sample as columns)...
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject)
model_opts <- get_default_model_options(MOFAobject)
model_opts$num_factors <- 15
train_opts <- get_default_training_options(MOFAobject)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42
MOFAobject <- prepare_mofa(MOFAobject,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
## Warning in prepare_mofa(MOFAobject, data_options = data_opts, model_options =
## model_opts, : Some view(s) have a lot of features, it is recommended to perform
## a more stringent feature selection before creating the MOFA object....
## Checking data options...
## Checking training options...
## Checking model options...
MOFAobject <- run_mofa(MOFAobject, use_basilisk = TRUE)
## Warning in run_mofa(MOFAobject, use_basilisk = TRUE): No output filename provided. Using /tmp/Rtmpgh5ASc/mofa_20231016-155831.hdf5 to store the trained model.
## Connecting to the mofapy2 package using basilisk.
## Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.
plot_factor_cor(MOFAobject)

plot_variance_explained(MOFAobject, max_r2=15)

plot_variance_explained(MOFAobject, plot_total = T)[[2]]

multiome_2021_backbone = rnaseq_2021_v1 %>%
rownames_to_column("specimen_id") %>%
select(specimen_id)
abtiter_2021_multi <- multiome_2021_backbone %>%
left_join(abtiter_2021 %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
cytof_2021_multi <- multiome_2021_backbone %>%
left_join(cytof_2021 %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
olink_2021_multi <- multiome_2021_backbone %>%
left_join(olink_2021 %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
df_make_mofa = list(
rnaseq = t(rnaseq_2021_v1),
cytof = t(cytof_2021_multi),
abtiter = t(abtiter_2021_multi),
olink = t(olink_2021_multi)
)
#lapply(df_make_mofa)
MOFAobject_2021 <- create_mofa(df_make_mofa)
## Creating MOFA object from a list of matrices (features as rows, sample as columns)...
plot_data_overview(MOFAobject)

data_opts <- get_default_data_options(MOFAobject_2021)
model_opts <- get_default_model_options(MOFAobject_2021)
model_opts$num_factors <- 15
train_opts <- get_default_training_options(MOFAobject_2021)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42
MOFAobject_2021 <- prepare_mofa(MOFAobject_2021,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
## Warning in prepare_mofa(MOFAobject_2021, data_options = data_opts,
## model_options = model_opts, : Some view(s) have a lot of features, it is
## recommended to perform a more stringent feature selection before creating the
## MOFA object....
## Checking data options...
## Checking training options...
## Checking model options...
MOFAobject_2021 <- run_mofa(MOFAobject_2021, use_basilisk = TRUE)
## Warning in run_mofa(MOFAobject_2021, use_basilisk = TRUE): No output filename provided. Using /tmp/Rtmpgh5ASc/mofa_20231016-155848.hdf5 to store the trained model.
## Connecting to the mofapy2 package using basilisk.
## Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.
plot_factor_cor(MOFAobject_2021)

plot_variance_explained(MOFAobject_2021, max_r2=15)

plot_variance_explained(MOFAobject_2021, plot_total = T)[[2]]

Process datasets | Inspect 2021 dataset
cytokine_2021_median_D0 <- cytof_2021_multi %>%
rownames_to_column("specimen_id") %>%
mutate(specimen_id = as.numeric(specimen_id)) %>%
pivot_longer(!specimen_id, names_to = "cell_type_name", values_to = "count") %>%
#left_join(metadata_2020[c("specimen_id")]) %>%
filter(specimen_id %in% unique(metadata_2021[metadata_2021$planned_day_relative_to_boost == 0,]$specimen_id)) %>%
group_by( cell_type_name) %>%
summarise(median = median(count, na.rm = T))
cytof_2021_normalized_pre <- cytof_2021_multi %>%
rownames_to_column("specimen_id") %>%
mutate(specimen_id = as.numeric(specimen_id)) %>%
pivot_longer(!specimen_id, names_to = "cell_type_name", values_to = "percent_live_cell") %>%
left_join(metadata_2021[c("specimen_id", "dataset")]) %>%
left_join(cytokine_2021_median_D0) %>%
mutate(percent_live_cell_normalized = if_else(is.na(percent_live_cell) == T, NA, percent_live_cell/median))
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(cell_type_name)`
cytof_2021_normalized <- cytof_2021_normalized_pre %>%
select(specimen_id, cell_type_name, percent_live_cell_normalized) %>%
pivot_wider(names_from = cell_type_name, values_from = percent_live_cell_normalized) %>%
column_to_rownames("specimen_id")
cytof_2021_normalized_multi <- multiome_2021_backbone %>%
left_join(cytof_2021_normalized %>% rownames_to_column("specimen_id")) %>%
column_to_rownames("specimen_id")
## Joining with `by = join_by(specimen_id)`
cytof_2021_normalized_multi_imputed = cytof_2021_normalized_multi %>%
as.matrix() %>%
impute.knn() %>%
.$data
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 15 rows with more than 50 % entries missing;
## mean imputation used for these rows
cytof_2021_normalized_multi_imputed_log = log(cytof_2021_normalized_multi_imputed + 1)
df_make_mofa = list(
rnaseq = t(rnaseq_2021_v1),
cytof = t(cytof_2021_normalized_multi_imputed_log),
abtiter = t(abtiter_2021_multi),
olink = t(olink_2021_multi)
)
#lapply(df_make_mofa)
MOFAobject_2021 <- create_mofa(df_make_mofa)
## Creating MOFA object from a list of matrices (features as rows, sample as columns)...
plot_data_overview(MOFAobject_2021)

data_opts <- get_default_data_options(MOFAobject_2021)
model_opts <- get_default_model_options(MOFAobject_2021)
model_opts$num_factors <- 15
train_opts <- get_default_training_options(MOFAobject_2021)
train_opts$convergence_mode <- "fast"
train_opts$seed <- 42
MOFAobject_2021 <- prepare_mofa(MOFAobject_2021,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
## Warning in prepare_mofa(MOFAobject_2021, data_options = data_opts,
## model_options = model_opts, : Some view(s) have a lot of features, it is
## recommended to perform a more stringent feature selection before creating the
## MOFA object....
## Checking data options...
## Checking training options...
## Checking model options...
MOFAobject_2021 <- run_mofa(MOFAobject_2021, use_basilisk = TRUE)
## Warning in run_mofa(MOFAobject_2021, use_basilisk = TRUE): No output filename provided. Using /tmp/Rtmpgh5ASc/mofa_20231016-155857.hdf5 to store the trained model.
## Connecting to the mofapy2 package using basilisk.
## Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.
plot_factor_cor(MOFAobject_2021)

plot_variance_explained(MOFAobject_2021, max_r2=15)

plot_variance_explained(MOFAobject_2021, plot_total = T)[[2]]
