Download all data file as described in here: https://www.cmi-pb.org/mycmipb/2021_prediction_challenge/#step_2
base_path = "/home/pramod/Documents/cmipb_project/2021_cmipb_challenge/"
## Read subject data
subject_2020 <- readr::read_csv(paste0(base_path, "2020LD_subject.csv"))
subject_2021 <- readr::read_csv(paste0(base_path, "2021BD_subject.csv"))
subject <- rbind(subject_2020, subject_2021)
## Read specimen data
specimen_2020 <- readr::read_csv(paste0(base_path, "2020LD_specimen.csv"))
specimen_2021 <- readr::read_csv(paste0(base_path, "2021BD_specimen.csv"))
specimen <- rbind(specimen_2020, specimen_2021)
## Join subject and specimen data
clinical_data <- dplyr::inner_join(subject, specimen)
#clinical_data <- clinical_data[clinical_data$planned_day_relative_to_boost <= 90, ]
clinical_data$Timepoint <- as.factor(clinical_data$planned_day_relative_to_boost)
## Read live_cell_percentages datasets
cell_freq_2020 <- readr::read_csv(paste0(base_path, "2020LD_live_cell_percentages.csv")) %>%
dplyr::left_join(., clinical_data)
cell_freq_2021 <- readr::read_csv(paste0(base_path, "2021BD_live_cell_percentages.csv")) %>%
dplyr::left_join(., clinical_data)
cell_freq <- rbind(cell_freq_2020, cell_freq_2021)
## Read rnaseq datasets
rnaseq_2020 <- readr::read_csv(paste0(base_path, "2020LD_rnaseq.csv")) %>%
dplyr::left_join(., clinical_data)
rnaseq_2021 <- readr::read_csv(paste0(base_path, "2021BD_rnaseq.csv")) %>%
dplyr::left_join(., clinical_data)
rnaseq <- rbind(rnaseq_2020, rnaseq_2021)
#rnaseq <- dplyr::left_join(rnaseq , clinical_data)
## Read olink datasets
olink_2020 <- readr::read_csv(paste0(base_path, "2020LD_olink_prot_exp.csv")) %>%
dplyr::left_join(., clinical_data)
olink_2021 <- readr::read_csv(paste0(base_path, "2021BD_olink_prot_exp.csv")) %>%
dplyr::left_join(., clinical_data)
olink <- rbind(olink_2020, olink_2021)
#olink <- dplyr::left_join(olink , clinical_data)
## Read Ab titer datasets
abtiter_2020 <- readr::read_csv(paste0(base_path, "2020LD_ab_titer.csv")) %>%
dplyr::left_join(., clinical_data)
abtiter_2021 <- readr::read_csv(paste0(base_path, "2021BD_ab_titer.csv")) %>%
dplyr::left_join(., clinical_data)
abtiter <- rbind(abtiter_2020, abtiter_2021)
print("All files read successfully")## [1] "All files read successfully"
demographic_plots <- function(clinical_data, count_data, plotTitle){
print(plotTitle)
print(paste0("Number of subjects: ", length(unique( count_data$subject_id))))
print(paste0("Number of specimen: ", length(unique( count_data$specimen_id))))
plot_data <- count_data %>%
distinct(., subject_id, biological_sex,race, ethnicity, infancy_vac, dataset) %>%
group_by(dataset)
# Distribution of subjects by sex
p1 <- plot_data %>%
ggplot(., aes(x=dataset, fill = biological_sex)) +
geom_bar(position = "dodge") +
theme_minimal() +
ggtitle(plotTitle)
# Distribution of subjects by ethnicity
p2 <- plot_data %>%
ggplot(., aes(x=dataset, fill = ethnicity)) +
geom_bar(position = "dodge") +
theme_minimal()
# Distribution of subjects by race
p3 <- plot_data %>%
ggplot(., aes(x=dataset, fill = race)) +
geom_bar(position = "dodge") +
theme_minimal()
# Distribution of subjects by infancy_vac
p4 <- plot_data %>%
ggplot(., aes(x=dataset, fill = infancy_vac)) +
geom_bar(position = "dodge") +
theme_minimal()
plotDF <- clinical_data %>%
filter(specimen_id %in% count_data$specimen_id) %>%
add_column(flag = 1) %>%
dplyr::select(subject_id, Timepoint, flag) %>%
distinct() %>%
pivot_wider(names_from = Timepoint, values_from = flag) %>%
dplyr::select(-subject_id) %>%
group_by_all() %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
rowid_to_column() %>%
pivot_longer(cols = -c(rowid, n), names_to = "Timepoint") %>%
filter(!is.na(value))
p5 <- ggplot(data = plotDF,
mapping = aes(x = as.numeric(Timepoint), y = rowid)) +
geom_point(mapping = aes(size = n)) +
geom_line(mapping = aes(group = rowid)) +
scale_y_continuous(breaks = 1:max(plotDF$rowid),
labels = distinct(dplyr::select(plotDF,rowid, n))$n) +
labs(y = "Number of Subjects", x = "Timepoint") +
ggtitle(plotTitle) +
theme_bw()+
theme(axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1))
#plot as grid
knitr::opts_chunk$set(fig.width=8, fig.height=6)
hlay1 <- rbind(c(1,2),
c(3,3))
hlay2 <- rbind(c(1,2))
grid.arrange(p5, p4, ncol = 2, layout_matrix=hlay2)
grid.arrange(p1, p2, p3, ncol = 2, layout_matrix=hlay1)
#knitr::opts_chunk$set(fig.width=6, fig.height=4.5)
#plot(p5)
}demographic_plots(clinical_data, rnaseq, "RNASeq data")## [1] "RNASeq data"
## [1] "Number of subjects: 72"
## [1] "Number of specimen: 216"
## `summarise()` has grouped output by '0', '1', '3', '7'. You can override using the `.groups` argument.
demographic_plots(clinical_data, olink, "OLINK data")## [1] "OLINK data"
## [1] "Number of subjects: 54"
## [1] "Number of specimen: 126"
## `summarise()` has grouped output by '0', '1', '3', '7'. You can override using the `.groups` argument.
demographic_plots(clinical_data, cell_freq, "Cell frequencies data")## [1] "Cell frequencies data"
## [1] "Number of subjects: 53"
## [1] "Number of specimen: 133"
## `summarise()` has grouped output by '0', '1', '3', '7'. You can override using the `.groups` argument.
demographic_plots(clinical_data, abtiter, "Ab titer data")## [1] "Ab titer data"
## [1] "Number of subjects: 91"
## [1] "Number of specimen: 427"
## `summarise()` has grouped output by '0', '1', '3', '7', '14', '30', '90', '386', '428'. You can override using the `.groups` argument.
knitr::opts_chunk$set(fig.width=6, fig.height=4.5)
upset_plot1 <- function(mat)
{
plot <- UpSet(mat,
comb_col = c("red", "blue", "brown", "black")[comb_degree(mat)],
top_annotation = upset_top_annotation(mat, add_numbers = TRUE, gp = gpar(col = comb_degree(mat))),
right_annotation = upset_right_annotation(mat, add_numbers = TRUE)
)
}
lt_all <- list(
olink2020 = unique(olink_2020$subject_id),
rnaseq2020 = unique(rnaseq_2020$subject_id),
cell_freq2020 = unique(cell_freq_2020$subject_id),
abtiter2020 = unique(abtiter_2020$subject_id),
olink2021 = unique(olink_2021$subject_id),
rnaseq2021 = unique(rnaseq_2021$subject_id),
cell_freq2021 = unique(cell_freq_2021$subject_id),
abtiter2021 = unique(abtiter_2021$subject_id)
)
mat_all = make_comb_mat(lt_all)
plot <- upset_plot1(mat_all)
plotlt_protein <- list(
uniprot_2020 = unique(olink_2020$uniprot_id),
uniprot_2021 = unique(olink_2021$uniprot_id)
)
mat_protein = make_comb_mat(lt_protein)
plot <- upset_plot1(mat_protein)
plotlt_rnaseq <- list(
rnaseq_2020 = unique(rnaseq_2020$versioned_ensembl_gene_id),
rnaseq_2021 = unique(rnaseq_2021$versioned_ensembl_gene_id)
)
mat_rnaseq = make_comb_mat(lt_rnaseq)
plot <- upset_plot1(mat_rnaseq)
plotlt_cell_freq <- list(
cell_freq_2020 = unique(cell_freq_2020$cell_type_name),
cell_freq_2021 = unique(cell_freq_2021$cell_type_name)
)
mat_cell_freq = make_comb_mat(lt_cell_freq)
plot <- upset_plot1(mat_cell_freq)
plotlt_abtiter <- list(
abtiter_2020 = unique(abtiter_2020$antigen),
abtiter_2021 = unique(abtiter_2021$antigen)
)
mat_abtiter = make_comb_mat(lt_abtiter)
plot <- upset_plot1(mat_abtiter)
plot