Reading and ploting data

2021 CMI-PB Challenge resource

The CMI-PB team

2022-01-07

Read entire dataset (all files)

Download all data file as described in here: https://www.cmi-pb.org/mycmipb/2021_prediction_challenge/#step_2

base_path = "/home/pramod/Documents/cmipb_project/2021_cmipb_challenge/"


## Read subject data
subject_2020 <- readr::read_csv(paste0(base_path, "2020LD_subject.csv"))
subject_2021 <- readr::read_csv(paste0(base_path, "2021BD_subject.csv"))
subject <- rbind(subject_2020, subject_2021)

## Read specimen data

specimen_2020 <- readr::read_csv(paste0(base_path, "2020LD_specimen.csv"))
specimen_2021 <- readr::read_csv(paste0(base_path, "2021BD_specimen.csv"))
specimen <- rbind(specimen_2020, specimen_2021)

## Join subject and specimen data

clinical_data <- dplyr::inner_join(subject, specimen)
#clinical_data <- clinical_data[clinical_data$planned_day_relative_to_boost <= 90, ]

clinical_data$Timepoint <- as.factor(clinical_data$planned_day_relative_to_boost)

## Read live_cell_percentages datasets

cell_freq_2020 <- readr::read_csv(paste0(base_path, "2020LD_live_cell_percentages.csv")) %>%
  dplyr::left_join(., clinical_data)

cell_freq_2021 <- readr::read_csv(paste0(base_path, "2021BD_live_cell_percentages.csv")) %>%
  dplyr::left_join(., clinical_data)

cell_freq <- rbind(cell_freq_2020, cell_freq_2021)


## Read rnaseq datasets

rnaseq_2020 <- readr::read_csv(paste0(base_path, "2020LD_rnaseq.csv")) %>%
  dplyr::left_join(., clinical_data)

rnaseq_2021 <- readr::read_csv(paste0(base_path, "2021BD_rnaseq.csv")) %>%
  dplyr::left_join(., clinical_data)

rnaseq <- rbind(rnaseq_2020, rnaseq_2021)
#rnaseq   <- dplyr::left_join(rnaseq   , clinical_data)

## Read olink datasets

olink_2020 <- readr::read_csv(paste0(base_path, "2020LD_olink_prot_exp.csv")) %>%
  dplyr::left_join(., clinical_data)

olink_2021 <- readr::read_csv(paste0(base_path, "2021BD_olink_prot_exp.csv")) %>%
  dplyr::left_join(., clinical_data)

olink <- rbind(olink_2020, olink_2021)
#olink <- dplyr::left_join(olink , clinical_data)

## Read Ab titer datasets

abtiter_2020 <- readr::read_csv(paste0(base_path, "2020LD_ab_titer.csv")) %>%
  dplyr::left_join(., clinical_data)

abtiter_2021 <- readr::read_csv(paste0(base_path, "2021BD_ab_titer.csv")) %>%
  dplyr::left_join(., clinical_data)

abtiter <- rbind(abtiter_2020, abtiter_2021)

print("All files read successfully")
## [1] "All files read successfully"

Exploratory plot 1: Demographic data and Longitudinal sample mapping

demographic_plots <- function(clinical_data, count_data, plotTitle){
    
    print(plotTitle)
    print(paste0("Number of subjects: ", length(unique( count_data$subject_id))))
    print(paste0("Number of specimen: ", length(unique( count_data$specimen_id))))

  plot_data <- count_data %>% 
    distinct(., subject_id, biological_sex,race, ethnicity, infancy_vac, dataset) %>% 
    group_by(dataset) 
  
  # Distribution of subjects by sex
  p1 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = biological_sex)) + 
    geom_bar(position = "dodge") +
    theme_minimal() +
    ggtitle(plotTitle)
  
  # Distribution of subjects by ethnicity
  p2 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = ethnicity)) + 
    geom_bar(position = "dodge") +
    theme_minimal() 
  
  # Distribution of subjects by race
  p3 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = race)) + 
     geom_bar(position = "dodge") +
    theme_minimal() 
  
  # Distribution of subjects by infancy_vac
  p4 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = infancy_vac)) + 
     geom_bar(position = "dodge") +
    theme_minimal() 
  
  plotDF <- clinical_data %>%
    filter(specimen_id %in% count_data$specimen_id) %>%
    add_column(flag = 1) %>%
    dplyr::select(subject_id, Timepoint, flag) %>%
    distinct() %>%
    pivot_wider(names_from = Timepoint, values_from = flag) %>%
    dplyr::select(-subject_id) %>%
    group_by_all() %>%
    summarize(n = n()) %>%
    arrange(desc(n)) %>%
    rowid_to_column() %>%
    pivot_longer(cols = -c(rowid, n), names_to = "Timepoint") %>%
    filter(!is.na(value))
  
  p5 <- ggplot(data = plotDF,
         mapping = aes(x = as.numeric(Timepoint), y = rowid)) +
    geom_point(mapping = aes(size = n)) +
    geom_line(mapping = aes(group = rowid)) +
    scale_y_continuous(breaks = 1:max(plotDF$rowid), 
                       labels = distinct(dplyr::select(plotDF,rowid, n))$n) +
    labs(y = "Number of Subjects", x = "Timepoint") +
    ggtitle(plotTitle) +
    theme_bw()+
    theme(axis.ticks.y = element_blank(),
          axis.text.x  = element_text(angle = 45, hjust = 1))
  
   #plot as grid
   knitr::opts_chunk$set(fig.width=8, fig.height=6) 
   hlay1 <- rbind(c(1,2),
              c(3,3))
   hlay2 <- rbind(c(1,2))
   
   grid.arrange(p5, p4, ncol = 2, layout_matrix=hlay2)
   grid.arrange(p1, p2, p3, ncol = 2, layout_matrix=hlay1)
   
   
   #knitr::opts_chunk$set(fig.width=6, fig.height=4.5) 
   #plot(p5)

}
demographic_plots(clinical_data, rnaseq, "RNASeq data")
## [1] "RNASeq data"
## [1] "Number of subjects: 72"
## [1] "Number of specimen: 216"
## `summarise()` has grouped output by '0', '1', '3', '7'. You can override using the `.groups` argument.

demographic_plots(clinical_data, olink, "OLINK data")
## [1] "OLINK data"
## [1] "Number of subjects: 54"
## [1] "Number of specimen: 126"
## `summarise()` has grouped output by '0', '1', '3', '7'. You can override using the `.groups` argument.

demographic_plots(clinical_data, cell_freq, "Cell frequencies data")
## [1] "Cell frequencies data"
## [1] "Number of subjects: 53"
## [1] "Number of specimen: 133"
## `summarise()` has grouped output by '0', '1', '3', '7'. You can override using the `.groups` argument.

demographic_plots(clinical_data, abtiter, "Ab titer data")
## [1] "Ab titer data"
## [1] "Number of subjects: 91"
## [1] "Number of specimen: 427"
## `summarise()` has grouped output by '0', '1', '3', '7', '14', '30', '90', '386', '428'. You can override using the `.groups` argument.

Exploratory plot 2: Common subjects and features between datasets

knitr::opts_chunk$set(fig.width=6, fig.height=4.5) 
upset_plot1 <- function(mat)
{
plot <- UpSet(mat,
              comb_col = c("red", "blue", "brown", "black")[comb_degree(mat)],
              top_annotation = upset_top_annotation(mat, add_numbers = TRUE, gp = gpar(col = comb_degree(mat))),
              right_annotation = upset_right_annotation(mat, add_numbers = TRUE)
              )

}

lt_all <- list(
            olink2020 = unique(olink_2020$subject_id), 
            rnaseq2020 = unique(rnaseq_2020$subject_id),
            cell_freq2020 = unique(cell_freq_2020$subject_id),
            abtiter2020 = unique(abtiter_2020$subject_id),
            olink2021 = unique(olink_2021$subject_id), 
            rnaseq2021 = unique(rnaseq_2021$subject_id),
            cell_freq2021 = unique(cell_freq_2021$subject_id),
            abtiter2021 = unique(abtiter_2021$subject_id)
            
            )

mat_all = make_comb_mat(lt_all)

plot <- upset_plot1(mat_all)
plot

lt_protein <- list(
  uniprot_2020 = unique(olink_2020$uniprot_id),
  uniprot_2021 = unique(olink_2021$uniprot_id)
)

mat_protein = make_comb_mat(lt_protein)

plot <- upset_plot1(mat_protein)

plot

lt_rnaseq <- list(
  rnaseq_2020 = unique(rnaseq_2020$versioned_ensembl_gene_id),
  rnaseq_2021 = unique(rnaseq_2021$versioned_ensembl_gene_id)
)

mat_rnaseq  = make_comb_mat(lt_rnaseq)

plot <- upset_plot1(mat_rnaseq)

plot

lt_cell_freq <- list(
  cell_freq_2020 = unique(cell_freq_2020$cell_type_name),
  cell_freq_2021 = unique(cell_freq_2021$cell_type_name)
)

mat_cell_freq  = make_comb_mat(lt_cell_freq)

plot <- upset_plot1(mat_cell_freq)

plot

lt_abtiter <- list(
  abtiter_2020 = unique(abtiter_2020$antigen),
  abtiter_2021 = unique(abtiter_2021$antigen)
)

mat_abtiter  = make_comb_mat(lt_abtiter)

plot <- upset_plot1(mat_abtiter)

plot