dataExploration.R

library(readr)
library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

library(tibble)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggupset)
library(circlize)

## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.14.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library(patchwork)
# library(GenomicRanges)
print(getwd())

## [1] "/Users/rnili/Desktop/Challegne_2_data"

base_path <-  "/Users/rnili/Desktop/Challegne_2_data"
list.files(path=paste0(base_path,"/2020_LD"))

## [1] "2020LD_pbmc_cell_frequency.tsv"      
## [2] "2020LD_pbmc_gene_expression.tsv"     
## [3] "2020LD_plasma_ab_titer.tsv"          
## [4] "2020LD_plasma_protein_expression.tsv"
## [5] "2020LD_specimen.tsv"                 
## [6] "2020LD_subject.tsv"

list.files(path=paste0(base_path))

##  [1] "2020_LD"                                   
##  [2] "2021_LD"                                   
##  [3] "2022_BD"                                   
##  [4] "Ab_titer.pdf"                              
##  [5] "antigen.pdf"                               
##  [6] "Cell_frequency.pdf"                        
##  [7] "cell_type_name.pdf"                        
##  [8] "clinical_data.csv"                         
##  [9] "dataExploration.log"                       
## [10] "dataExploration.pdf"                       
## [11] "dataExploration.R"                         
## [12] "dataExploration.spin.R"                    
## [13] "dataExploration.spin.Rmd"                  
## [14] "dataExploration.tex"                       
## [15] "dataExploration1.html"                     
## [16] "Demographic_Longitudinal_sampleMapping.pdf"
## [17] "freq_cell.csv"                             
## [18] "freq_cell0.csv"                            
## [19] "freq_cell1.csv"                            
## [20] "freq_cell2.csv"                            
## [21] "freq_cell3.csv"                            
## [22] "gene_id.pdf"                               
## [23] "mat_all.csv"                               
## [24] "Olink.pdf"                                 
## [25] "RNASeq.pdf"                                
## [26] "subjects.pdf"                              
## [27] "uniprot.pdf"

folders = c("2020_LD", "2021_LD", "2022_BD")

info_binder <- function(path, folders, sufix){
  df_total <- data.frame()

  for (i in 1:length(folders)){
    name = gsub('_', '', folders[i])

    file_name <- sprintf("/%s/%s_%s.tsv", folders[i], name, sufix)
    df <- readr::read_tsv(paste0(base_path, file_name), show_col_types = FALSE)
    df_total <- rbind(df_total, df)
  }
  return(df_total)
}


experiments_binder <- function(path, clinical_data, folders, sufix){
  df_total <- data.frame()
  
  for (i in 1:length(folders)){
    name = gsub('_', '', folders[i])
    
    file_name <- sprintf("/%s/%s_%s.tsv", folders[i], name, sufix)
    df <- readr::read_tsv(paste0(base_path, file_name), show_col_types = FALSE)%>%
        dplyr::left_join(., clinical_data)
    df$year <- name
    
    # print(header(df)) 
    df_total <- rbind(df_total, df)
  }
  return(df_total)
}


demographic_plots <- function(clinical_data, count_data, plotTitle){
  print(plotTitle)
  print(paste0("Number of subjects: ", length(unique( count_data$subject_id))))
  print(paste0("Number of specimen: ", length(unique( count_data$specimen_id))))
  # 
  # print(colnames(count_data))
  # 
  # print(count_data$dataset)
  
  plot_data <- count_data %>% 
    distinct(., subject_id, biological_sex,race, ethnicity, infancy_vac, dataset) %>% 
    group_by(dataset) 
  
  print(plot_data)
  
  # Distribution of subjects by sex
  p1 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = biological_sex)) + 
    geom_bar(position = "dodge") +
    theme_minimal() +
    ggtitle(plotTitle) +
    # theme_minimal() 
    theme(axis.ticks.y = element_blank(),
          axis.text.x  = element_text(angle = 45, hjust = 1))+
    ggtitle(plotTitle)
  
  # Distribution of subjects by ethnicity
  p2 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = ethnicity)) + 
    geom_bar(position = "dodge") +
    # theme_minimal()+
    theme(axis.ticks.y = element_blank(),
          axis.text.x  = element_text(angle = 45, hjust = 1))+
    plot_layout(widths = 5)
  
  # Distribution of subjects by race
  p3 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = race)) + 
    geom_bar(position = "dodge") +
    # theme_minimal()+
    theme(axis.ticks.y = element_blank(),
          axis.text.x  = element_text(angle = 45, hjust = 1))+
    plot_layout(widths = 5)
  
  # Distribution of subjects by infancy_vac
  p4 <- plot_data %>%
    ggplot(., aes(x=dataset, fill = infancy_vac)) + 
    geom_bar(position = "dodge") +
    # theme_minimal() + 
    theme(axis.ticks.y = element_blank(),
          axis.text.x  = element_text(angle = 45, hjust = 1))+
    plot_layout(widths = 5)
  
  plotDF <- clinical_data %>%
    filter(specimen_id %in% count_data$specimen_id) %>%
    add_column(flag = 1) %>%
    dplyr::select(subject_id, Timepoint, flag) %>%
    distinct() %>%
    pivot_wider(names_from = Timepoint, values_from = flag) %>%
    dplyr::select(-subject_id) %>%
    group_by_all() %>%
    summarize(n = n()) %>%
    arrange(desc(n)) %>%
    rowid_to_column() %>%
    pivot_longer(cols = -c(rowid, n), names_to = "Timepoint") %>%
    filter(!is.na(value))
  
  print(plotDF)
  
  p5 <- ggplot(data = plotDF,
               mapping = aes(x = as.numeric(Timepoint), y = rowid)) +
    geom_point(mapping = aes(size = n)) +
    geom_line(mapping = aes(group = rowid)) +
    scale_y_continuous(breaks = 1:max(plotDF$rowid), 
                       labels = distinct(dplyr::select(plotDF,rowid, n))$n) +
    labs(y = "Number of Subjects", x = "Timepoint") +
    ggtitle(plotTitle) +
    theme_bw()+
    theme(axis.ticks.y = element_blank(),
          axis.text.x  = element_text(angle = 45, hjust = 1))+plot_layout(widths = 5)
  
  #plot as grid
  knitr::opts_chunk$set(fig.width=14, fig.height=6) 
  hlay1 <- rbind(c(1,2),
                 c(3,3))
  hlay2 <- rbind(c(1,2))
  
  # options(repr.plot.width =12, repr.plot.height =4)
  grid.arrange(p5, p4, ncol = 2, layout_matrix=hlay2)
  # options(repr.plot.width =12, repr.plot.height =6)
  grid.arrange(p1, p2, p3, ncol = 2, layout_matrix=hlay1)
  
}


upSetExpList <- function(years, olinkDF, rnaDF, cellDF, abtiterDF){
  exp_list = list()
  for (i in 1:length(years)){
    olinkln <- paste('olink',years[i],sep='')
    exp_list[[olinkln]] <- unlist(unique(olinkDF[grep(years[i], olinkDF$dataset), 'subject_id']), use.names = FALSE)
    
    rnaln <- paste('rnaseq',years[i],sep='')
    exp_list[[rnaln]] <- unlist(unique(rnaDF[grep(years[i], rnaDF$dataset), 'subject_id']), use.names = FALSE)

    cellln <- paste('cell_freq',years[i],sep='')
    exp_list[[cellln]] <- unlist(unique(cellDF[grep(years[i], cellDF$dataset), 'subject_id']), use.names = FALSE)

    abtiterln <- paste('abtiter',years[i],sep='')
    exp_list[[abtiterln]] <- unlist(unique(abtiterDF[grep(years[i], abtiterDF$dataset), 'subject_id']), use.names = FALSE)
  }
  return(exp_list)
}


upSetList <- function(years, df, target_col){
  exp_list = list()
  for (i in 1:length(years)){
    uniprotln <- paste(target_col,years[i],sep='')
    exp_list[[uniprotln]] <- unlist(unique(df[grep(years[i], df$dataset), target_col]), use.names = FALSE)
  }
  return(exp_list)
}

upset_plot1 <- function(mat){
  plot <- UpSet(mat,
                comb_col = c("red", "blue", "brown", "black")[comb_degree(mat)],
                top_annotation = upset_top_annotation(mat, add_numbers = TRUE, gp = gpar(col = comb_degree(mat))),
                right_annotation = upset_right_annotation(mat, add_numbers = TRUE)
  )
  
}


## Read subject data
subject <- info_binder(base_path, folders, "subject")
print(dim(subject))

## [1] 118   8

## Read specimen data
specimen <- info_binder(base_path, folders, "specimen")
print(dim(specimen))

## [1] 939   6

## Timepoint: days passed from the boost date for taking the specimen relevantto the visit number

## Join subject and specimen data
clinical_data <- dplyr::inner_join(subject, specimen, multiple = "all")

## Joining with `by = join_by(subject_id)`

clinical_data$Timepoint <- as.factor(clinical_data$planned_day_relative_to_boost)


## Read live_cell_percentages datasets
cell_freq <- experiments_binder(base_path, clinical_data, folders, "pbmc_cell_frequency")

## Joining with `by = join_by(specimen_id)`

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

print(dim(cell_freq))

## [1] 13895    17

## Read rnaseq datasets
rnaseq <- experiments_binder(base_path, clinical_data, folders, "pbmc_gene_expression")

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

print(dim(rnaseq))

## [1] 29559114       18

## Read olink datasets
olink <- experiments_binder(base_path, clinical_data, folders, "plasma_protein_expression")

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

# print(dim(olink))
# print(colnames(olink))
# print(as.factor(olink$dataset))

## Read Ab titer datasets
abtiter <- experiments_binder(base_path, clinical_data, folders, "plasma_ab_titer")

## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`
## Joining with `by = join_by(specimen_id)`

print(dim(abtiter))

## [1] 41705    22

colnames(abtiter)

##  [1] "specimen_id"                   "isotype"                      
##  [3] "is_antigen_specific"           "antigen"                      
##  [5] "MFI"                           "MFI_normalised"               
##  [7] "unit"                          "lower_limit_of_detection"     
##  [9] "subject_id"                    "infancy_vac"                  
## [11] "biological_sex"                "ethnicity"                    
## [13] "race"                          "year_of_birth"                
## [15] "date_of_boost"                 "dataset"                      
## [17] "actual_day_relative_to_boost"  "planned_day_relative_to_boost"
## [19] "specimen_type"                 "visit"                        
## [21] "Timepoint"                     "year"

abtiter$antigen = toupper(abtiter$antigen)

# Exploratory plot 1
# e_plot1 <- demographic_plots(clinical_data, rnaseq, "RNASeq data")
# plotDF <- clinical_data %>%
#   filter(specimen_id %in% cell_freq$specimen_id) %>%
#   add_column(flag = 1) %>%
#   dplyr::select(subject_id, Timepoint, flag) %>%
#   distinct() %>%
#   pivot_wider(names_from = Timepoint, values_from = flag) %>%
#   dplyr::select(-subject_id) %>%
#   group_by_all() %>%
#   summarize(n = n()) %>%
#   arrange(desc(n)) %>%
#   rowid_to_column() %>%
#   pivot_longer(cols = -c(rowid, n), names_to = "Timepoint") %>%
#   filter(!is.na(value))

# print(tail(data.frame(plotDF), 100))
# write.csv(clinical_data, paste0(base_path, '/clinical_data.csv'), row.names =FALSE)
# write.csv(plotDF, paste0(base_path, '/freq_cell3.csv'), row.names =FALSE)
# print(dim(data.frame(plotDF)))

# pdf(paste0(base_path, "/RNASeq.pdf"), paper='USr', width=10, height=6)
    # width = 30, height = 20 , units = 'in', res = 300 )
demographic_plots(clinical_data, rnaseq, "RNASeq data")

## [1] "RNASeq data"
## [1] "Number of subjects: 93"
## [1] "Number of specimen: 507"
## # A tibble: 93 × 6
## # Groups:   dataset [3]
##    subject_id biological_sex race                    ethnicity   infan…¹ dataset
##         <dbl> <chr>          <chr>                   <chr>       <chr>   <chr>  
##  1          9 Male           Asian                   Not Hispan… aP      2020_d…
##  2         13 Male           White                   Not Hispan… aP      2020_d…
##  3         18 Female         Unknown or Not Reported Hispanic o… aP      2020_d…
##  4         27 Female         Asian                   Not Hispan… aP      2020_d…
##  5         29 Male           White                   Hispanic o… aP      2020_d…
##  6         10 Female         Asian                   Not Hispan… wP      2020_d…
##  7         15 Male           Asian                   Not Hispan… wP      2020_d…
##  8         20 Female         White                   Not Hispan… wP      2020_d…
##  9         21 Male           White                   Not Hispan… wP      2020_d…
## 10         22 Female         White                   Not Hispan… wP      2020_d…
## # … with 83 more rows, and abbreviated variable name ¹infancy_vac

## `summarise()` has grouped output by '0', '1', '3', '7', '14', '-30'. You can
## override using the `.groups` argument.

## # A tibble: 12 × 4
##    rowid     n Timepoint value
##    <int> <int> <chr>     <dbl>
##  1     1    72 0             1
##  2     1    72 1             1
##  3     1    72 3             1
##  4     1    72 7             1
##  5     1    72 14            1
##  6     2    21 0             1
##  7     2    21 1             1
##  8     2    21 3             1
##  9     2    21 7             1
## 10     2    21 14            1
## 11     2    21 -30           1
## 12     2    21 -15           1

# dev.off()

# pdf(paste0(base_path, "/Olink.pdf"), paper='USr', width=10, height=6)
demographic_plots(clinical_data, olink, "OLINK data")

## [1] "OLINK data"
## [1] "Number of subjects: 74"
## [1] "Number of specimen: 326"
## # A tibble: 74 × 6
## # Groups:   dataset [3]
##    subject_id biological_sex race                    ethnicity   infan…¹ dataset
##         <dbl> <chr>          <chr>                   <chr>       <chr>   <chr>  
##  1         21 Male           White                   Not Hispan… wP      2020_d…
##  2         33 Male           More Than One Race      Hispanic o… wP      2020_d…
##  3         31 Female         Asian                   Not Hispan… wP      2020_d…
##  4         26 Female         Unknown or Not Reported Hispanic o… wP      2020_d…
##  5         47 Female         White                   Not Hispan… aP      2020_d…
##  6          4 Male           Asian                   Not Hispan… wP      2020_d…
##  7         11 Female         Unknown or Not Reported Hispanic o… wP      2020_d…
##  8         29 Male           White                   Hispanic o… aP      2020_d…
##  9         20 Female         White                   Not Hispan… wP      2020_d…
## 10          6 Female         White                   Not Hispan… wP      2020_d…
## # … with 64 more rows, and abbreviated variable name ¹infancy_vac

## `summarise()` has grouped output by '0', '1', '3', '7', '14', '-30'. You can
## override using the `.groups` argument.

## # A tibble: 12 × 4
##    rowid     n Timepoint value
##    <int> <int> <chr>     <dbl>
##  1     1    54 0             1
##  2     1    54 1             1
##  3     1    54 3             1
##  4     1    54 7             1
##  5     1    54 14            1
##  6     2    16 0             1
##  7     2    16 -30           1
##  8     2    16 -15           1
##  9     3     3 0             1
## 10     3     3 -15           1
## 11     4     1 -30           1
## 12     4     1 -15           1

# dev.off()

# pdf(paste0(base_path, "/Cell_frequency.pdf"), paper='USr', width=10, height=6)
demographic_plots(clinical_data, cell_freq, "Cell frequencies data")

## [1] "Cell frequencies data"
## [1] "Number of subjects: 74"
## [1] "Number of specimen: 327"
## # A tibble: 74 × 6
## # Groups:   dataset [3]
##    subject_id biological_sex race                    ethnicity   infan…¹ dataset
##         <dbl> <chr>          <chr>                   <chr>       <chr>   <chr>  
##  1         29 Male           White                   Hispanic o… aP      2020_d…
##  2         36 Female         White                   Hispanic o… aP      2020_d…
##  3         44 Female         More Than One Race      Hispanic o… aP      2020_d…
##  4         45 Female         Asian                   Not Hispan… aP      2020_d…
##  5         46 Female         Unknown or Not Reported Not Hispan… aP      2020_d…
##  6         47 Female         White                   Not Hispan… aP      2020_d…
##  7         48 Female         White                   Not Hispan… aP      2020_d…
##  8         49 Female         White                   Not Hispan… aP      2020_d…
##  9         52 Male           More Than One Race      Not Hispan… aP      2020_d…
## 10         55 Female         Asian                   Not Hispan… aP      2020_d…
## # … with 64 more rows, and abbreviated variable name ¹infancy_vac

## `summarise()` has grouped output by '0', '1', '3', '7', '14', '-30'. You can
## override using the `.groups` argument.

## # A tibble: 10 × 4
##    rowid     n Timepoint value
##    <int> <int> <chr>     <dbl>
##  1     1    53 0             1
##  2     1    53 1             1
##  3     1    53 3             1
##  4     1    53 7             1
##  5     1    53 14            1
##  6     2    20 0             1
##  7     2    20 -30           1
##  8     2    20 -15           1
##  9     3     1 0             1
## 10     3     1 -15           1

# dev.off()


# pdf(paste0(base_path, "/Ab_titer.pdf"), paper='USr', width=10, height=6)
demographic_plots(clinical_data, abtiter, "Ab titer data")

## [1] "Ab titer data"
## [1] "Number of subjects: 111"
## [1] "Number of specimen: 685"
## # A tibble: 111 × 6
## # Groups:   dataset [3]
##    subject_id biological_sex race                    ethnicity   infan…¹ dataset
##         <dbl> <chr>          <chr>                   <chr>       <chr>   <chr>  
##  1          1 Female         White                   Not Hispan… wP      2020_d…
##  2          3 Female         White                   Unknown     wP      2020_d…
##  3          4 Male           Asian                   Not Hispan… wP      2020_d…
##  4          5 Male           Asian                   Not Hispan… wP      2020_d…
##  5          6 Female         White                   Not Hispan… wP      2020_d…
##  6          7 Female         More Than One Race      Hispanic o… wP      2020_d…
##  7          9 Male           Asian                   Not Hispan… aP      2020_d…
##  8         10 Female         Asian                   Not Hispan… wP      2020_d…
##  9         11 Female         Unknown or Not Reported Hispanic o… wP      2020_d…
## 10         12 Male           Asian                   Not Hispan… wP      2020_d…
## # … with 101 more rows, and abbreviated variable name ¹infancy_vac

## `summarise()` has grouped output by '0', '3', '7', '14', '30', '120', '464',
## '1', '423', '386', '511', '447', '448', '428', '415', '543', '402', '407',
## '396', '366', '-30'. You can override using the `.groups` argument.

## # A tibble: 122 × 4
##    rowid     n Timepoint value
##    <int> <int> <chr>     <dbl>
##  1     1    70 0             1
##  2     1    70 3             1
##  3     1    70 7             1
##  4     1    70 14            1
##  5     1    70 30            1
##  6     1    70 120           1
##  7     1    70 1             1
##  8     2    20 0             1
##  9     2    20 -30           1
## 10     2    20 -15           1
## # … with 112 more rows

# dev.off()

years = c('2020', '2021', '2022')

lt_all = upSetExpList(years, olink, rnaseq, cell_freq, abtiter)
# rnaseq
# lt_all
# print(lt_all)
# print(cbind(lt_all$olink2020, lt_all$olink2021))
# print(lt_all$olink2021) 


knitr::opts_chunk$set(fig.width=6, fig.height=4.5) 
mat_all <- make_comb_mat(lt_all)
# print(mat_all)
write.csv(mat_all, paste0(base_path, '/mat_all.csv'))
plot <- upset_plot1(mat_all)
# pdf(paste0(base_path, "/subjects.pdf"), paper='USr', width=10, height=6)
plot

# dev.off()

lt_protein <- upSetList(years, olink, 'protein_id')
# lt_protein
print(typeof(lt_protein$protein_id2021))

## [1] "character"

mat_protein = make_comb_mat(lt_protein)
plot <- upset_plot1(mat_protein)
# pdf(paste0(base_path, "/uniprot.pdf"), paper='USr', width=10, height=6)
plot

# dev.off()


lt_rnaseq <- upSetList(years, rnaseq, 'versioned_ensembl_gene_id')
mat_rnaseq  = make_comb_mat(lt_rnaseq)
plot <- upset_plot1(mat_rnaseq)

# pdf(paste0(base_path, "/gene_id.pdf"), paper='USr', width=10, height=6)
plot

# dev.off()


print(colnames(cell_freq))

##  [1] "specimen_id"                   "cell_type_name"               
##  [3] "percent_live_cell"             "subject_id"                   
##  [5] "infancy_vac"                   "biological_sex"               
##  [7] "ethnicity"                     "race"                         
##  [9] "year_of_birth"                 "date_of_boost"                
## [11] "dataset"                       "actual_day_relative_to_boost" 
## [13] "planned_day_relative_to_boost" "specimen_type"                
## [15] "visit"                         "Timepoint"                    
## [17] "year"

lt_cell_freq <- upSetList(years, cell_freq, 'cell_type_name')
mat_cell_freq  = make_comb_mat(lt_cell_freq)
plot <- upset_plot1(mat_cell_freq)

# pdf(paste0(base_path, "/cell_type_name.pdf"), paper='USr', width=10, height=6)
plot

# dev.off()

print(colnames(abtiter))

##  [1] "specimen_id"                   "isotype"                      
##  [3] "is_antigen_specific"           "antigen"                      
##  [5] "MFI"                           "MFI_normalised"               
##  [7] "unit"                          "lower_limit_of_detection"     
##  [9] "subject_id"                    "infancy_vac"                  
## [11] "biological_sex"                "ethnicity"                    
## [13] "race"                          "year_of_birth"                
## [15] "date_of_boost"                 "dataset"                      
## [17] "actual_day_relative_to_boost"  "planned_day_relative_to_boost"
## [19] "specimen_type"                 "visit"                        
## [21] "Timepoint"                     "year"

lt_abtiter <- upSetList(years, abtiter, 'antigen')
print(lt_abtiter)

## $antigen2020
##  [1] "TOTAL"   "PT"      "PRN"     "FHA"     "ACT"     "LOS"     "FELD1"  
##  [8] "BETV1"   "LOLP1"   "MEASLES" "PTM"     "FIM2/3"  "TT"      "DT"     
## [15] "OVA"     "PD1"    
## 
## $antigen2021
## [1] "PRN"    "DT"     "FHA"    "FIM2/3" "TT"     "PT"     "OVA"   
## 
## $antigen2022
## [1] "DT"     "FHA"    "FIM2/3" "OVA"    "PRN"    "PT"     "TT"

mat_abtiter  = make_comb_mat(lt_abtiter)
plot <- upset_plot1(mat_abtiter)

# pdf(paste0(base_path, "/antigen.pdf"), paper='USr', width=10, height=6)
plot

# dev.off()

# ## Read subject data
# subject_2020 <- readr::read_tsv(paste0(base_path, "/2020_LD/2020LD_subject.tsv"), show_col_types = FALSE)
# # print(subject_2020[, 1:4])
# # subject_2020 <- read.delim(paste0(base_path, "/2020_LD/2020LD_subject.tsv"), sep="\t")
# subject_2021 <- readr::read_tsv(paste0(base_path, "/2021_LD/2021LD_subject.tsv"), show_col_types = FALSE)
# subject_2022 <- readr::read_tsv(paste0(base_path, "/2022_BD/2022BD_subject.tsv"), show_col_types = FALSE)
# 
# subject <- rbind(subject_2020, subject_2021, subject_2022)
# print(dim(subject))
# # print(subject)
# 
# ## Read specimen data
# specimen_2020 <- readr::read_tsv(paste0(base_path, "/2020_LD/2020LD_specimen.tsv"), show_col_types = FALSE)
# specimen_2021 <- readr::read_tsv(paste0(base_path, "/2021_LD/2021LD_specimen.tsv"), show_col_types = FALSE)
# specimen_2022 <- readr::read_tsv(paste0(base_path, "/2022_BD/2022BD_specimen.tsv"), show_col_types = FALSE)
# 
# specimen = rbind(specimen_2020, specimen_2021, specimen_2022)
# print(specimen)
# 
# ## Join subject and specimen data
# clinical_data <- dplyr::inner_join(subject, specimen, multiple = "all")
# print(clinical_data)
# 
# # typeof(clinical_data$planned_day_relative_to_boost)
# clinical_data$Timepoint <- as.factor(clinical_data$planned_day_relative_to_boost)
# # typeof(clinical_data$Timepoint)
# # print(clinical_data$Timepoint)
# 
# ## Read live_cell_percentages datasets
# cell_freq_2020 <- readr::read_tsv(paste0(base_path, "/2020_LD/2020LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE) %>% dplyr::left_join(., clinical_data)
# cell_freq_2021 <- readr::read_tsv(paste0(base_path, "/2021_LD/2021LD_pbmc_cell_frequency.tsv"), show_col_types = FALSE) %>% dplyr::left_join(., clinical_data)
# cell_freq_2022 <- readr::read_tsv(paste0(base_path, "/2022_BD/2022BD_pbmc_cell_frequency.tsv"), show_col_types = FALSE) %>% dplyr::left_join(., clinical_data)
# 
# cell_freq <- rbind(cell_freq_2020, cell_freq_2021, cell_freq_2022)
# print(dim(cell_freq))
# 
# ## Read rnaseq datasets

## Read olink datasets
olink_2020 <- readr::read_tsv(paste0(base_path, "/2020_LD/2020LD_plasma_protein_expression.tsv"), show_col_types = FALSE) %>% dplyr::left_join(., clinical_data)

## Joining with `by = join_by(specimen_id)`

olink_2021 <- readr::read_tsv(paste0(base_path, "/2021_LD/2021LD_plasma_protein_expression.tsv"), show_col_types = FALSE) %>% dplyr::left_join(., clinical_data)

## Joining with `by = join_by(specimen_id)`

olink_2022 <- readr::read_tsv(paste0(base_path, "/2022_BD/2022BD_plasma_protein_expression.tsv"), show_col_types = FALSE) %>% dplyr::left_join(., clinical_data)

## Joining with `by = join_by(specimen_id)`

lt_protein <- list(
  uniprot_2020 = unique(olink_2020$protein_id),
  uniprot_2021 = unique(olink_2021$protein_id),
  uniprot_2022 = unique(olink_2022$protein_id)
)


# print(typeof(lt_protein$uniprot_2021))
mat_protein = make_comb_mat(lt_protein)

plot <- upset_plot1(mat_protein)

plot

dataExploration.R

rnili

2023-03-05