Background

Some scripts to harvest data from the EGA API and find out what state it is in.

DACs

Retrieve all DACs housed at EGA, there are currently around 1500 (as at 2021-10-22) so set limit to 2000.

# dacs<- GET("https://ega-archive.org/metadata/v2/dacs?limit=2000")
# 
# dacs
tibble_response <- function(list){
  list %>%
    enframe() %>%
    pivot_wider() %>%
    mutate_all(~ifelse(is.null(.x[[1]]), list(NA), .x)) %>%
    unnest(cols = c(alias, egaStableId, centerName, creationTime, title, url, released, 
    published, contacts)) %>%
    mutate(emails = unlist(lapply(contacts, function(x) x$email)),
           organisation = unlist(lapply(contacts, function(x) {if(is.null(x$organisation)){
             return(NA)
           } else {
             return(x$organisation)
           }}))) %>%
    mutate_if(is.logical, as.character)
}
# dacs_content <- content(dacs)[["response"]][["result"]]
# df_list <- lapply(dacs_content, tibble_response)
# df_bound <- bind_rows(df_list)
# saveRDS(df_bound, "outputs/dacs_bound_table.rds")
df_bound <- readRDS("outputs/dacs_bound_table.rds")

Now I have harvested all the DACs in EGA, filter to Australian research institutes by using the email (hopefully)

au_researchers <- df_bound %>%
  filter(str_detect(emails, "au$")) %>%
  separate(emails, sep="@", remove=FALSE, into = c("person", "institute"))
glimpse(au_researchers)
## Rows: 116
## Columns: 13
## $ alias        <chr> "ARC Linkage Project Grant LPO990067", "ena-DAC-QCMG-14-1…
## $ egaStableId  <chr> "EGAC00001000022", "EGAC00001000142", "EGAC00001000142", …
## $ centerName   <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RESEARCH"…
## $ creationTime <chr> "2011-08-05T10:45.000Z", "2013-11-14T01:31.000Z", "2013-1…
## $ title        <chr> "ARC Linkage Project Grant LPO990067", "DAC overseeing se…
## $ url          <chr> "not provided", "not provided", "not provided", "not prov…
## $ released     <chr> "RELEASED", "NOT_RELEASED", "NOT_RELEASED", "NOT_RELEASED…
## $ published    <chr> "TRUE", "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE…
## $ contacts     <list> ["Melanie Bahlo", "bahlo@wehi.edu.au", "The Walter and E…
## $ emails       <chr> "bahlo@wehi.edu.au", "n.waddell@imb.uq.edu.au", "m.quinn@…
## $ person       <chr> "bahlo", "n.waddell", "m.quinn", "j.saunus", "s.lakhani",…
## $ institute    <chr> "wehi.edu.au", "imb.uq.edu.au", "uq.edu.au", "uq.edu.au",…
## $ organisation <chr> "The Walter and Eliza Hall Institute of Medical Research"…

Count of DACs per centerName submitted. It is clear there are a few issues with normalization of this field.

au_researchers %>%
  group_by(egaStableId, centerName) %>%
  tally() %>%
  ggplot(aes(centerName)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

SACGF - South Australia centre for cancer biology - ACRF Cancer Genomics Facility

Count of DACs per organisation, again this field doesn’t seem to be used consistently.

au_researchers %>%
  group_by(egaStableId, organisation) %>%
  tally() %>%
  ggplot(aes(organisation)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

Count of DACs by the institutional part of the email address of the contacts for each DAC, gives a bit of a clearer picture.

au_researchers %>%
  group_by(egaStableId, institute) %>%
  tally() %>%
  ggplot(aes(institute)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

Manually curate each institute to get nicer and more understandable labels. Here I may lose some detail, that is, groups that are housed within institutes but should give a clearer and simpler picture.

au_researchers <- au_researchers %>%
  mutate(tidy_center_names = case_when(
    str_detect(institute, "unimelb") ~ "UMELB", # University of Melbourne
    str_detect(institute, "uq") ~ "UQ",         # University of Queensland
    str_detect(institute, "syd") ~ "USYD",      # University of Sydney
    str_detect(institute, "anu") ~ "ANU",       # Australian National University
    str_detect(institute, "wehi") ~ "WEHI",     # Walter and Eliza Hall Institute
    str_detect(institute, "qut") ~ "QUT",       # Queensland University of Technology
    str_detect(institute, "garvan") ~ "GARVAN", # Garvan Institute of Medical Research
    str_detect(institute, "mcri") ~ "MCRI",     # Murdoch Children's Research Institute
    str_detect(institute, "unsw") ~ "UNSW",     # University of New South Wales
    str_detect(institute, "uow") ~ "UOW",       # University of Wollongong
    str_detect(institute, "griffith") ~ "GRIFFITH", # Griffith University
    str_detect(institute, "ccia") ~ "CCIA",         # Children's Cancer Institute
    str_detect(institute, "mh.org.au") ~ "ROYALMELB", # Royal Melbourne Hospital
    str_detect(institute, "centenary") ~ "CENTENARY", # Centenary Insitute
    str_detect(institute, "adelaide") ~ "UADEL", # University of Adelaide/SA Cancer Genomics Facility
    str_detect(institute, "qimr") ~ "QIMRB",    # QIMR Berghofer Medical Research Institute
    str_detect(institute, "lh") ~ "LIFEHOUSE",  # Chris O'Brien Lifehouse - not-for-profit Cancer Treatement centre
    str_detect(institute, "health.nsw") ~ "NSWHEALTH", # NSW health department
    str_detect(institute, "sswahs.nsw") ~ "NSWHEALTH", # Sydney South-West Health Department
    str_detect(institute, "telethon") ~ "TELETHON",  # Telethon Kids Institute
    TRUE ~ institute
  ))

Get centerName lookup table for nice names

tidy_center_lookup <- au_researchers %>%
  group_by(centerName, tidy_center_names) %>%
  tally() %>%
  select(-n)

Set institutes colour palette

length(levels(as.factor(tidy_center_lookup$tidy_center_names)))
## [1] 20
institute_colour_pal = pal_d3(palette="category20")(20)
names(institute_colour_pal) <- levels(as.factor(tidy_center_lookup$tidy_center_names))

Count of DACs per institute using curated names

au_researchers %>%
  group_by(egaStableId, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(tidy_center_names, fill=tidy_center_names)) +
  geom_bar() +
  scale_fill_manual(values = institute_colour_pal) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Institute") +
  ylab("DAC count") +
  labs(title="Count of DACs per institute")

Contacts per DAC

contacts_per_dac <- au_researchers %>%
  group_by(egaStableId) %>%
  tally() %>%
  arrange(-n) %>%
  ggplot(aes(n)) +
  geom_histogram(fill=biocommons_pal['purple'], bins=7) +
  scale_x_continuous(breaks = seq(0,7,1)) +
  theme_bw() +
  xlab("Number of contacts per DAC") +
  ylab("Frequency") 
  # + labs(title="Frequency of DACs with certain number of contacts specified on the DAC")

contacts_per_dac

ggsave("plots/contacts_per_dac_hist.png")
## Saving 9 x 5 in image

The majority of DACs (49) out of 71 only have a single point of contact.

au_researchers %>%
  group_by(egaStableId) %>%
  summarise(n = n(),
            emails = paste0(person, collapse=";")) %>%
  arrange(n)
## # A tibble: 71 × 3
##    egaStableId         n emails       
##    <chr>           <int> <chr>        
##  1 EGAC00001000022     1 bahlo        
##  2 EGAC00001000192     1 nic.waddell  
##  3 EGAC00001000220     1 rob.king     
##  4 EGAC00001000229     1 matt.brown   
##  5 EGAC00001000230     1 chovens      
##  6 EGAC00001000261     1 aghs         
##  7 EGAC00001000368     1 m.budzinska  
##  8 EGAC00001000465     1 d.lambert    
##  9 EGAC00001000565     1 sarah.dunstan
## 10 EGAC00001000577     1 majewski     
## # … with 61 more rows

Institutes per DAC

au_researchers %>%
  group_by(egaStableId, tidy_center_names) %>%
  tally() %>%
  select(-n) %>%
  group_by(egaStableId) %>%
  tally() %>%
  arrange(-n) %>%
  ggplot(aes(n)) +
  geom_histogram(fill=biocommons_pal['teal'], bins=5) +
  theme_bw() +
  xlab("Count of institutes") +
  ylab("Frequency of DACs") +
  labs(title="Frequency of number of institutes specified per DAC")

Most DACs have contacts from a single institute, 9 have DACs across multiple institutes with one DAC having contacts from 5 separate institutes. This is a consortium called the Sydney Head and Neck Cancer Institute (SHNCI).

Contacts on DACs

QIMRB

QIMRB has highest number of DACs. Contact emails tend to be individual researchers. This has the potential to cause issues if those individual researchers change institutes.

au_researchers %>%
  filter(tidy_center_names == "QIMRB") %>%
  select(person) %>%
  group_by(person) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 19 × 2
##    person                  n
##    <chr>               <int>
##  1 nic.waddell             8
##  2 john.pearson            3
##  3 ann-marie.patch         1
##  4 bryan.day               1
##  5 christian.engwerda      1
##  6 felicity.newell         1
##  7 georgia.trench          1
##  8 gregory.quaife-ryan     1
##  9 hong.you                1
## 10 james.hudson            1
## 11 juliet.french           1
## 12 katia.nones             1
## 13 mark.smyth              1
## 14 michael.quinn           1
## 15 olga.kondrashova        1
## 16 peter.johansson         1
## 17 ross.koufariotis        1
## 18 susanna.ng              1
## 19 tobias.bald             1

University of Melbourne

au_researchers %>%
  filter(tidy_center_names == "UMELB") %>%
  select(person, centerName) %>%
  group_by(person, centerName) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 15 × 3
## # Groups:   person [10]
##    person          centerName     n
##    <chr>           <chr>      <int>
##  1 jungch          PP_AUS         2
##  2 bjpope          UM             1
##  3 chovens         PP_GER         1
##  4 chovens         UM             1
##  5 chovens         Unimelb        1
##  6 chovens         UOM-CAP        1
##  7 chovens         UOMCAP         1
##  8 cmerom          UOMCAP         1
##  9 con             UM             1
## 10 con             Unimelb        1
## 11 epilepsy-austin WEHI           1
## 12 lachlan.coin    UMel           1
## 13 s.berkovic      WEHI           1
## 14 sarah.dunstan   NUS            1
## 15 scheffer        WEHI           1

Garvan institute

au_researchers %>%
  filter(tidy_center_names == "GARVAN") %>%
  select(person, centerName) %>%
  group_by(person, centerName) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 8 × 3
## # Groups:   person [8]
##   person    centerName     n
##   <chr>     <chr>      <int>
## 1 e.benn    GIMR           2
## 2 grants    KCCG           2
## 3 e.ali     KCCG           1
## 4 m.cowley  KCCG           1
## 5 m.mccabe  KCCG           1
## 6 mgrb      GIMR           1
## 7 n.watkins KCCG           1
## 8 s.mueller SHNCI          1

Children’s Cancer Institute

au_researchers %>%
  filter(tidy_center_names == "CCIA") %>%
  select(person, centerName) %>%
  group_by(person, centerName) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 4 × 3
## # Groups:   person [4]
##   person    centerName     n
##   <chr>     <chr>      <int>
## 1 zero      ZERO           3
## 2 mcowley   SHNCI          1
## 3 mgauthier SHNCI          1
## 4 pekert    MCRI           1

DACs over time

Increasing number of DACs over time, until 2021, perhaps covid related.

au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y"))) %>%
  filter(creationYear > 1982) %>%  #remove 1980 DAC as obvious error
  ggplot(aes(x=creationYear)) + 
  geom_bar(fill=biocommons_pal['pink']) +
  scale_x_continuous(breaks = seq(1980,2022, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("DAC count")

dacs per institute over time dodged, potentially a bit ugly but shows more institutes being involved in later years - 2018-2020

au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y"))) %>%
  filter(creationYear > 1982) %>%   #remove 1980 DAC as obvious error
  ggplot(aes(x=creationYear, fill=tidy_center_names)) + 
  geom_bar(position="dodge", width=0.9) +
  scale_x_continuous(breaks = seq(1980,2022, 1), name="Institute") +
  scale_fill_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom")

dacs per institute over time, if dac has contacts from multiple insitutes, will be counted more than once

au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y"))) %>%
  filter(creationYear > 1982) %>%  #remove 1980 DAC as obvious error
  ggplot(aes(x=creationYear, fill=tidy_center_names)) + 
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2022, 1)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  xlab("Year of submission") +
  ylab("DAC count") +
  labs(title="DACs submitted to EGA each year 2011-2021 coloured by institute of contacts")

Large increase in DACs in 2020 and an increase in the number of institutes submitting DACs to EGA.

Linking DACs and Datasets

Do cross reference from the listed DAC IDs to what datasets are present

For a given DAC, how many datasets does it manage

datasets_per_dac <- function(ega_stable_id){
  api_request <- paste0("https://ega-archive.org/metadata/v2/datasets?queryBy=dac&queryId=", ega_stable_id, "&limit=0")
  response <- GET(api_request)
  if (length(content(response)[["response"]][["result"]]) > 0){
    return(content(response)[["response"]][["result"]]) 
  } else {
    return(list(NA))
  }
}

datasets_per_dac("EGAC00001000951")
## [[1]]
## [1] NA

get datasets for all dacs

# dac_datasets <- au_researchers %>%
#   group_by(egaStableId, centerName, tidy_center_names) %>%
#   tally() %>%
#   rowwise() %>%
#   mutate(datasets = list(datasets_per_dac(egaStableId))) %>%
#   ungroup()
# saveRDS(dac_datasets, "outputs/dac_datasets.rds")
dac_datasets <- readRDS("outputs/dac_datasets.rds")

Ghost DACs

Some DACs do not have any associated datasets, I think this is probably due to any datasets for those DACs not yet being released, or some may have been submitted for testing.

dac_datasets %>%
  rowwise() %>%
  unnest() %>%
  filter(is.na(datasets))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
## # A tibble: 17 × 5
##    egaStableId     centerName tidy_center_names     n datasets 
##    <chr>           <chr>      <chr>             <int> <list>   
##  1 EGAC00001000142 QCMG       UQ                    4 <lgl [1]>
##  2 EGAC00001000229 QLDI       UQ                    1 <lgl [1]>
##  3 EGAC00001000368 CENT       CENTENARY             1 <lgl [1]>
##  4 EGAC00001000497 RPAH       USYD                  2 <lgl [1]>
##  5 EGAC00001000498 RPAH       USYD                  2 <lgl [1]>
##  6 EGAC00001000700 QCMG       QIMRB                 1 <lgl [1]>
##  7 EGAC00001000773 PP_GER     UMELB                 1 <lgl [1]>
##  8 EGAC00001000951 PP_AUS     UMELB                 1 <lgl [1]>
##  9 EGAC00001000952 PP_AUS     UMELB                 1 <lgl [1]>
## 10 EGAC00001001094 GIMR       GARVAN                1 <lgl [1]>
## 11 EGAC00001001097 GIMR       GARVAN                1 <lgl [1]>
## 12 EGAC00001001447 sacgf      UADEL                 1 <lgl [1]>
## 13 EGAC00001001631 QCMG       QIMRB                 1 <lgl [1]>
## 14 EGAC00001001696 UFRN       TELETHON              1 <lgl [1]>
## 15 EGAC00001001836 WEHI       WEHI                  1 <lgl [1]>
## 16 EGAC00001001863 ZERO       CCIA                  1 <lgl [1]>
## 17 EGAC00001001921 QCMG       QIMRB                 2 <lgl [1]>

Get information about the Ghost DAcs

ghost_dacs <- dac_datasets %>%
  rowwise() %>%
  unnest() %>%
  filter(is.na(datasets)) %>%
  pull(egaStableId)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
au_researchers %>%
  filter(egaStableId %in% ghost_dacs)
## # A tibble: 23 × 14
##    alias      egaStableId centerName creationTime title url   released published
##    <chr>      <chr>       <chr>      <chr>        <chr> <chr> <chr>    <chr>    
##  1 ena-DAC-Q… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  2 ena-DAC-Q… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  3 ena-DAC-Q… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  4 ena-DAC-Q… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  5 ena-DAC-Q… EGAC000010… QLDI       2014-08-12T… UQ D… <NA>  NOT_REL… FALSE    
##  6 ena-DAC-C… EGAC000010… CENT       2015-07-29T… CI L… <NA>  NOT_REL… FALSE    
##  7 ena-DAC-R… EGAC000010… RPAH       2016-06-04T… Depa… NA    NOT_REL… FALSE    
##  8 ena-DAC-R… EGAC000010… RPAH       2016-06-04T… Depa… NA    NOT_REL… FALSE    
##  9 ena-DAC-R… EGAC000010… RPAH       2016-06-06T… Depa… NA    NOT_REL… FALSE    
## 10 ena-DAC-R… EGAC000010… RPAH       2016-06-06T… Depa… NA    NOT_REL… FALSE    
## # … with 13 more rows, and 6 more variables: contacts <list>, emails <chr>,
## #   person <chr>, institute <chr>, organisation <chr>, tidy_center_names <chr>

Weird that there are DACs that are not released even though they were submitted many years ago, e.g. 2013-2016. Some are fairly new, e.g. earlier this year, so perhaps will be released at a later date.

Datasets per DAC

show counts of datasets per dac

dac_datasets %>%
  rowwise() %>%
  unnest() %>%
  ggplot(aes(egaStableId, fill=tidy_center_names)) +
  geom_bar() +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  coord_flip() +
  theme_bw()+
  theme(legend.position = "bottom")
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`

datasets_per_dac_plot <- dac_datasets %>%
  group_by(egaStableId) %>%
  summarise(datasets_obj = unique(datasets)) %>%
  rowwise() %>%
  mutate(num_datasets = length(datasets_obj)) %>%
  ungroup() %>%
  ggplot(aes(num_datasets)) +
  geom_histogram(bins=9, fill=biocommons_pal['blue']) +
  scale_x_continuous(breaks = seq(0,8,1)) +
  theme_bw() +
  xlab("Count of datasets per DAC") +
  ylab("Frequency")

datasets_per_dac_plot

ggsave("plots/datasets_per_dac.png")
## Saving 9 x 5 in image

Of the total of 71 individual DACs with Australian resarchers as contacts, the vast majority (59) have a single dataset attached to them.

The South Australian Cancer Genomics Facility has the highest number of datasets attached to a single DAC (8).

There are a handful of other examples that have more than one dataset governed by a single DAC.

Datasets

Tease out the information in the datasets table

datasets <- dac_datasets %>%
  select(datasets) %>%
  unnest() %>% 
  filter(!is.na(datasets))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
dataset_list <- datasets$datasets
datasets_df_list <- lapply(dataset_list, function(x) pivot_wider(enframe(unlist(x))))
bound_datasets <- bind_rows(datasets_df_list) %>% 
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y")),
         releasedDate = as.Date(releasedDate),
         releasedYear = as.numeric(format(releasedDate, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  left_join(tidy_center_lookup)
## Joining, by = "centerName"
glimpse(bound_datasets)
## Rows: 197
## Columns: 25
## $ alias             <chr> "Hepatitis C IL28B resequencing study", "20210729_EG…
## $ egaStableId       <chr> "EGAD00001000032", "EGAD00001007973", "EGAD000010079…
## $ centerName        <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RESE…
## $ creationTime      <date> 2011-08-22, 2021-08-05, 2021-08-05, 2021-08-05, 202…
## $ published         <chr> "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRU…
## $ released          <chr> "RELEASED", "RELEASED", "RELEASED", "RELEASED", "REL…
## $ releasedDate      <date> 2015-05-27, 2001-01-01, 2001-01-01, 2001-01-01, 200…
## $ title             <chr> "Hepatitis C IL28B pooled resequencing study with 10…
## $ description       <chr> "Hepatitis C IL28B pooled resequencing study with 10…
## $ technology        <chr> "Illumina Genome Analyzer IIx", NA, NA, NA, NA, NA, …
## $ numSamples        <int> 4, 106, 106, 106, 106, 106, 106, 44, 44, 44, 4, 4, 1…
## $ datasetTypes      <chr> "sample", "sample", "sample", "sample", "sample", "s…
## $ policyStableId    <chr> "EGAP00001000022", "EGAP00001000187", "EGAP000010001…
## $ availableInBeacon <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE"…
## $ accessType        <chr> "CONTROLLED", "CONTROLLED", "CONTROLLED", "CONTROLLE…
## $ technology1       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Ill…
## $ technology2       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Ill…
## $ technology3       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Nex…
## $ datasetTypes1     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Who…
## $ datasetTypes2     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Tra…
## $ technology4       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ datasetTypes3     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ creationYear      <dbl> 2011, 2021, 2021, 2021, 2021, 2021, 2021, 2014, 2014…
## $ releasedYear      <dbl> 2015, 2001, 2001, 2001, 2001, 2001, 2001, 2015, 2015…
## $ tidy_center_names <chr> "WEHI", "QIMRB", "QUT", "UQ", "QIMRB", "QUT", "UQ", …
bound_datasets %>%
  filter(creationYear < 1990)
## # A tibble: 11 × 25
##    alias     egaStableId centerName creationTime published released releasedDate
##    <chr>     <chr>       <chr>      <date>       <chr>     <chr>    <date>      
##  1 ena-DATA… EGAD000010… TKI_AGHS   1980-01-01   TRUE      RELEASED 2016-03-02  
##  2 Illumina… EGAD000100… TKI_AGHS   1980-01-01   TRUE      RELEASED 2015-07-28  
##  3 ena-DATA… EGAD000010… GARVAN     1980-01-01   TRUE      RELEASED 2016-08-17  
##  4 ena-DATA… EGAD000010… GARVAN     1980-01-01   TRUE      RELEASED 2016-08-17  
##  5 ena-DATA… EGAD000010… GARVAN     1980-01-01   TRUE      RELEASED 2016-08-17  
##  6 ena-DATA… EGAD000010… GARVAN     1980-01-01   TRUE      RELEASED 2016-08-17  
##  7 HanChine… EGAD000100… <NA>       1980-01-01   TRUE      RELEASED 2015-05-27  
##  8 HanChine… EGAD000100… <NA>       1980-01-01   TRUE      RELEASED 2015-05-27  
##  9 HanChine… EGAD000100… <NA>       1980-01-01   TRUE      RELEASED 2015-05-27  
## 10 HanChine… EGAD000100… <NA>       1980-01-01   TRUE      RELEASED 2015-05-27  
## 11 HanChine… EGAD000100… <NA>       1980-01-01   TRUE      RELEASED 2015-05-27  
## # … with 18 more variables: title <chr>, description <chr>, technology <chr>,
## #   numSamples <int>, datasetTypes <chr>, policyStableId <chr>,
## #   availableInBeacon <chr>, accessType <chr>, technology1 <chr>,
## #   technology2 <chr>, technology3 <chr>, datasetTypes1 <chr>,
## #   datasetTypes2 <chr>, technology4 <chr>, datasetTypes3 <chr>,
## #   creationYear <dbl>, releasedYear <dbl>, tidy_center_names <chr>

There is something weird going on with the released and creation years, some of the creation years are way too early, i.e. 1980, but the released date looks like it could be accurate. Also some of the released dates look very wrong, e.g. 2001 when the study was created much later like 2015. So made a function to take the released year if the creation year is obviously wrong, but to take the creation year if the released year is obviously wrong.

fix_year <- function(released_year, creation_year){
  if(creation_year < 1990){
    return(released_year)
  }
}
bound_datasets<- bound_datasets %>%
  rowwise() %>%
  mutate(fixedYear = case_when(creationYear < 1990 ~ releasedYear,
                               releasedYear < creationYear ~ creationYear,
                               TRUE ~ releasedYear)) 

Datasets over time

bound_datasets %>%
  group_by(egaStableId, fixedYear) %>%
  tally() %>%
  ggplot(aes(fixedYear)) +
  geom_bar(fill=biocommons_pal['yellow']) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets released per year")

Datasets per year, stacked by contributing institute. Datasets with more than one contributing institute are counted more than once.

bound_datasets %>%
  group_by(egaStableId, fixedYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(fixedYear, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2022, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets released per year") +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust=1))

joint_institutes <- bound_datasets %>%
  group_by(egaStableId, fixedYear, numSamples) %>%
  summarise(joined_names = paste0(unique(tidy_center_names), collapse=";"),
            n = n())
## `summarise()` has grouped output by 'egaStableId', 'fixedYear'. You can
## override using the `.groups` argument.

Set joint institutes colour palette

length(levels(as.factor(joint_institutes$joined_names)))
## [1] 21
joint_institute_colour_pal = c("#771155", "#AA4488", "#CC99BB", "#114477", "#4477AA", "#77AADD", "#117777", "#44AAAA", "#77CCCC", "#117744", "#44AA77", "#88CCAA", "#777711", "#AAAA44", "#DDDD77", "#774411", "#AA7744", "#DDAA77", "#771122", "#AA4455", "#DD7788")
names(joint_institute_colour_pal) <- levels(as.factor(joint_institutes$joined_names))
joint_institutes %>%
  ggplot(aes(as.factor(fixedYear), fill=joined_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = joint_institute_colour_pal, name="Institutes", guide=guide_legend(title.position="top")) +
  scale_x_discrete() +
  theme_bw(base_size = 18) +
  xlab("Year") +
  ylab("Dataset count") +
  # labs(title="Datasets per year") +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust=1)) +
  labs(title = "Australian datasets in EGA")

ggsave("plots/datasets_per_year.png")
## Saving 15 x 9 in image
bound_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(creationYear, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year") +
  theme(legend.position = "bottom")

bound_datasets %>%
  filter(creationYear > 1980) %>%
  left_join(tidy_center_lookup) %>%
  group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(creationYear, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year") +
  theme(legend.position = "bottom")
## Joining, by = c("centerName", "tidy_center_names")

Samples submitted per year

bound_datasets %>%
  left_join(tidy_center_lookup) %>%
  group_by(creationYear, tidy_center_names) %>%
  summarise(summed_by_institute = sum(numSamples)) %>%
  ggplot(aes(x=creationYear, y=summed_by_institute, col=tidy_center_names, label=tidy_center_names)) +
  # geom_jitter() +
  geom_text(check_overlap = TRUE) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "none")
## Joining, by = c("centerName", "tidy_center_names")
## `summarise()` has grouped output by 'creationYear'. You can override using the
## `.groups` argument.

bound_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(creationYear, tidy_center_names) %>%
  summarise(summed_by_institute = sum(numSamples)) %>%
  ggplot(aes(x=creationYear, y=summed_by_institute, col=tidy_center_names, label=tidy_center_names)) +
  # geom_jitter() +
  geom_text(check_overlap = TRUE) +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("Sum of submitted samples per institute") +
  xlab("Year of submission")
## `summarise()` has grouped output by 'creationYear'. You can override using the
## `.groups` argument.

bound_datasets %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.character(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  group_by(egaStableId, creationYear, creationTime, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup) %>%
  ggplot(aes(x=creationTime, y=numSamples, col=tidy_center_names)) +
  geom_point(shape=1, size=4) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs("Number of samples submitted per dataset over time, coloured by institute") +
  ylab("Number of samples per dataset") +
  xlab("Year of submission")
## Joining, by = "centerName"

bound_datasets %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.character(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  filter(creationYear > 1980) %>%
  group_by(egaStableId, creationYear, creationTime, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup) %>%
  ggplot(aes(x=creationTime, y=numSamples, col=tidy_center_names)) +
  geom_beeswarm(shape=1, size=4) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs("Number of samples submitted per dataset over time, coloured by institute") +
  ylab("Number of samples per dataset") +
  xlab("Year of submission")
## Joining, by = "centerName"
## Warning in f(...): The default behavior of beeswarm has changed in version
## 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis.
## Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis
## choice.

trying out log scale but don’t think it is very good visually.

bound_datasets %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.character(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  group_by(egaStableId, creationYear, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup) %>%
  ggplot(aes(x=creationYear, y=numSamples, col=tidy_center_names)) +
  geom_jitter(shape=1, size=3) +
  scale_color_manual(values = institute_colour_pal) +
  scale_y_continuous(trans = log10_trans()) +
  theme(legend.position = "bottom") +
  theme_bw()
## Joining, by = "centerName"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 3 rows containing missing values (geom_point).

bound_datasets %>%
  ungroup() |>
  select(egaStableId, starts_with("technology"), starts_with("dataset")) %>%
  pivot_longer(!egaStableId, 
               names_to = "data_category",
               values_to = "type") %>%
  filter(!is.na(type)) %>%
  group_by(egaStableId, data_category, type) %>%
  tally() %>% 
  mutate(data_category = ifelse(str_detect(data_category, "technology"), "technology", "datasetType")) |>
  ggplot(aes(x=type)) +
    facet_wrap(~data_category, scales="free", ncol = 1) +
    geom_bar() +
    coord_flip()

bound_datasets %>%
  group_by(datasetTypes1, datasetTypes2, datasetTypes3) %>%
  tally()
## # A tibble: 7 × 4
## # Groups:   datasetTypes1, datasetTypes2 [6]
##   datasetTypes1           datasetTypes2                      datasetTypes3     n
##   <chr>                   <chr>                              <chr>         <int>
## 1 Exome sequencing        Amplicon sequencing                <NA>              4
## 2 Genotyping by array     Transcriptome profiling by high-t… <NA>              1
## 3 Whole genome sequencing Amplicon sequencing                <NA>              1
## 4 Whole genome sequencing Exome sequencing                   <NA>              9
## 5 Whole genome sequencing Transcriptome profiling by high-t… Genomic vari…     1
## 6 Whole genome sequencing Transcriptome profiling by high-t… <NA>              7
## 7 <NA>                    <NA>                               <NA>            174

Amount of data

files_per_dataset <- function(dataset_accession){
  api_request <- paste0("https://ega-archive.org/metadata/v2/files?queryBy=dataset&queryId=", dataset_accession, "&limit=0")
  response <- GET(api_request)
  if(status_code(response) == 200){
    this_content <- content(response)
    num_files <- this_content[["response"]][["numTotalResults"]]
    sizes <- lapply(this_content[["response"]][["result"]], function(x) x[["fileSize"]])
    sizes_length <- length(sizes)
    summed_size <- do.call(sum, sizes)
    file_formats <- paste0(unique(lapply(this_content[["response"]][["result"]], function(x) x[["fileFormat"]])), collapse=";")
    return(tibble(dataset_accession, num_files, sizes_length, summed_size, file_formats))
  } else {
    return(NA)
  }
}
dataset_list <- unique(bound_datasets$egaStableId)
# file_size_list <- lapply(dataset_list, files_per_dataset)
# bound_files_dataset <- bind_rows(file_size_list)
# files_and_datasets <- bound_files_dataset %>%
#   left_join(bound_datasets, by=c("dataset_accession" = "egaStableId"))
# 
# saveRDS(file_size_list, "outputs/file_size_list.rds")
# saveRDS(files_and_datasets, "outputs/files_joined_datasets.rds")
files_and_datasets <- readRDS("outputs/files_joined_datasets.rds")

Total data stored in EGA by aussie researchers

per_dataset_file_summary <- files_and_datasets %>%
  group_by(dataset_accession, title, num_files, summed_size) %>%
  tally()
per_dataset_file_summary 
## # A tibble: 85 × 5
## # Groups:   dataset_accession, title, num_files [85]
##    dataset_accession title                           num_files summed_size     n
##    <chr>             <chr>                               <int>       <dbl> <int>
##  1 EGAD00001000032   "Hepatitis C IL28B pooled rese…         0     0           1
##  2 EGAD00001000845   "Matched tumour/normal .bam fi…        44     7.38e12     3
##  3 EGAD00001000967   "Cancer-Normal chronic myeloid…         8     1.87e10     1
##  4 EGAD00001001343   "WGS on patients 1-4, study of…       354     1.94e12     1
##  5 EGAD00001001344   "WGS on patients 5-7, study of…       140     6.27e11     1
##  6 EGAD00001001345   "RNA-seq on patients 1-4, stud…       168     2.46e11     1
##  7 EGAD00001001660   "Genomic analysis of HPV-posit…        24     4.20e11     4
##  8 EGAD00001001661   "Exome data for an Australian …         1     8.44e 8     1
##  9 EGAD00001002001   "Australian genomes"                  166     1.08e13     1
## 10 EGAD00001003119   "TP53 in ovarian cancer panel …        76     2.28e 8     1
## # … with 75 more rows
saveRDS(per_dataset_file_summary, "outputs/per_dataset_file_summary.rds")

Total data stored in EGA by aussie researchers:

  • 14673 files

  • 325.4687165 TB of data

Summary of dataset sizes in TB:

round(summary(per_dataset_file_summary$summed_size)/1e+12, 2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.08    3.83    1.11   88.93
per_dataset_file_summary %>%
ggplot(aes(summed_size)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Datasets with a size greater than 15TB

per_dataset_file_summary %>%
  filter(summed_size > 15*1e+12) %>%
  arrange(-summed_size)
## # A tibble: 5 × 5
## # Groups:   dataset_accession, title, num_files [5]
##   dataset_accession title                            num_files summed_size     n
##   <chr>             <chr>                                <int>       <dbl> <int>
## 1 EGAD00001004940   MGRB dataset                          2571     8.89e13     1
## 2 EGAD00001006902   WGS fastq for EGAS00001004572         1962     8.26e13     1
## 3 EGAD00001005095   MGRB dataset. Samples that were…      1441     4.65e13     1
## 4 EGAD00001004494   Familial Breast cancer dataset         156     1.80e13     6
## 5 EGAD00001005287   Deep multi-region WGS of lung c…        68     1.63e13     1

There aren’t too many datasets that are greater than 15TB, but some institutes have a single submission account and may be submitting on behalf of multiple labs.

files_and_datasets %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime) %>%
  filter(creationYear > 1980) %>%
  tally() %>%
  mutate(file_size_gb = summed_size/1e+9) %>%
    ggplot(aes(x=creationTime, y=file_size_gb)) +
    geom_point() +
    geom_abline(intercept = 10000, slope=0, color="red") +
    scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
    # scale_y_continuous(trans = log10_trans()) +
    theme_bw() +
    ylab("Total file size per dataset (GB)") +
    xlab("Year of submission") +
    labs(title="Total file size per dataset over time")

How much data submitted in the past 4 years?

files_and_datasets %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime) %>%
  filter(creationYear >= 2018) %>%
  tally() %>%
  pull(summed_size) %>%
  sum()/1e+12
## [1] 301.6727
sum_size_plot <- files_and_datasets %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationYear, datasetTypes, title) %>%
  filter(creationYear > 1980) %>%
  tally() %>%
  mutate(file_size_gb = summed_size/1e+9) %>%
    ggplot(aes(x=creationYear, y=file_size_gb, col=centerName, label=title, fill=datasetTypes)) +
    geom_beeswarm() +
    geom_abline(intercept = 10000, slope=0, color="red") +
    # scale_y_continuous(trans = log10_trans()) +
    scale_x_continuous(breaks=seq(1980,2022,1)) +
    theme_bw() +
    ylab("Total file size per dataset (GB)") +
    xlab("Year of submission") +
    labs(title="Total file size per data set over time (info on hover)") +
  theme(legend.position = "none")

ggplotly(sum_size_plot)
files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime, title) %>%
  tally() %>%
    ggplot(aes(x=creationTime, y=num_files)) +
    geom_point() +
    scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
    theme_bw() +
    ylab("Total files per dataset") +
    xlab("Year of submission") +
    labs(title="Total number of files per data set over time")

# num_files_plot <- files_and_datasets %>%
#   filter(creationYear > 1980) %>%
#   group_by(dataset_accession, summed_size, centerName, num_files, creationTime, creationYear, title, datasetTypes) %>%
#   tally() %>%
#     ggplot(aes(x=creationYear, y=num_files, label=title, col=centerName, fill=datasetTypes)) +
#     geom_beeswarm() +
#     # scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
#     theme_bw() +
#     ylab("Total files per dataset") +
#     xlab("Year of submission") +
#     labs(title="Total files per data set over time (info on hover)") +
#     theme(legend.position = "none")
# ggplotly(num_files_plot)

Sample metadata

What sample metadata is submitted alongside datasets? What phenotypes are covered? Are minimal requirements met?

Minimal requirements are gender, subject_id and phenotype (preferably ontologised, recommends EFO)

samples_per_dataset <- function(dataset_accession){
  api_request <- paste0("https://ega-archive.org/metadata/v2/samples?queryBy=dataset&queryId=", dataset_accession, "&limit=0")
  response <- GET(api_request)
  if(status_code(response) == 200){
    content <- content(response)
    num_samples <- content[["response"]][["numTotalResults"]]
    genders <- unlist(lapply(content[["response"]][["result"]], function(x) x[["gender"]]))
    if (is.null(genders)){
      genders <- as.character(rep(NA, num_samples))
    }
    phenotypes <- unlist(lapply(content[["response"]][["result"]], function(x)x[["phenotype"]]))
    if (is.null(phenotypes)){
      phenotypes <- as.character(rep(NA, num_samples))
    }
    attributes <- unlist(lapply(content[["response"]][["result"]], function(x) x[["attributes"]]))
    if (is.null(attributes)){
      attributes <- as.character(rep(NA, num_samples))
    }
    subjectIds <- unlist(lapply(content[["response"]][["result"]], function(x) x[["subjectId"]]))
    if (is.null(attributes)){
      subjectIds <- as.character(rep(NA, num_samples))
    }
    return(tibble(dataset_accession, num_samples, genders, phenotypes, attributes, subjectIds))
  } else {
    return(NA)
  }
}
sample_md_list <- lapply(per_dataset_file_summary$dataset_accession, samples_per_dataset)
sample_md_tbl <- bind_rows(sample_md_list)
total_samples <- sample_md_tbl %>%
  group_by(dataset_accession, num_samples) %>%
  tally() %>%
  pull(num_samples) %>%
  sum()

Total of 15,945 in the EGA from Aussie researchers

Breakdown of genders

sample_md_tbl %>%
  ggplot(aes(genders, fill=genders, label = genders)) +
  geom_bar() +
  scale_fill_pretty_d("Bold", na.value="grey") +
  coord_flip() +
  theme_bw()

sample_md_tbl %>%
  group_by(genders) %>%
  tally()
## # A tibble: 4 × 2
##   genders     n
##   <chr>   <int>
## 1 female   5868
## 2 male     5991
## 3 unknown   705
## 4 <NA>     3381

About 25% samples do not have a specified gender.

sample_md_tbl %>%
  ggplot(aes(phenotypes)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

sample_md_tbl %>%
  rowwise() %>%
  mutate(ontologised = ifelse(str_detect(phenotypes, "EFO"), TRUE, FALSE)) %>%
  filter(ontologised) %>%
  group_by(dataset_accession) %>%
  summarise(count = n(), phenotypes = paste0(unique(phenotypes), collapse=";"))
## # A tibble: 13 × 3
##    dataset_accession count phenotypes                         
##    <chr>             <int> <chr>                              
##  1 EGAD00001001343      15 EFO:0001663                        
##  2 EGAD00001001344       6 EFO:0001663                        
##  3 EGAD00001001345      12 EFO:0001663                        
##  4 EGAD00001003203      37 EFO:0000096                        
##  5 EGAD00001003247      37 EFO:0000096                        
##  6 EGAD00001003282      37 EFO:0000096                        
##  7 EGAD00001004179     183 CML, EFO_0000339                   
##  8 EGAD00001004190     239 EFO:0000203;EFO:0001378;EFO:0003073
##  9 EGAD00001004833       3 EFO:1000796                        
## 10 EGAD00001004861     396 PMDS, EFO_0000198;TMN, EFO_1000575 
## 11 EGAD00001005287      68 EFO:0000571;EFO:0000708;EFO:0000702
## 12 EGAD00001005807      48 EFO:0010105                        
## 13 EGAD00001005949      39 EFO_0000222

13 studies our of 85, 15% use the recommended EFO ontology in some form

sample_md_tbl %>%
  ggplot(aes(attributes)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

How many NA subject IDs?

sum(is.na(sample_md_tbl$subjectIds))
## [1] 3337

3337 samples have no subjectId specified

No samples have any extra attribute metadata

Phenotype files

Do projects have phenotype files attached that could contain more information?

dbGaP data

did a search of the dbGaP for keyword ‘Australia’

dbgap_aus <- read_csv("data/dbgap_australia.csv")
## Rows: 48 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): accession, name, description, Parent study, Study Disease/Focus, ...
## date  (2): Release Date, Embargo Release Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
length(dbgap_aus$accession)
## [1] 48
dbgap_aus %>%
  mutate(release_year = as.numeric(format(`Release Date`, "%Y"))) %>%
  ggplot(aes(release_year, fill=`NIH Institute`)) +
  geom_bar() +
  scale_x_continuous(breaks = seq(2009, 2022, 1)) +
  scale_y_continuous(breaks = seq(0, 9, 1)) +
  scale_fill_tableau(palette="Tableau 20", guide=guide_legend(title.position="top")) +
  theme_bw(base_size = 18) +
  theme(panel.grid.minor = element_blank()) + 
  ylab("Count of datasets") +
  xlab("Release year") +
  theme(legend.position = "bottom") +
  labs(title="Australian datasets in dbGaP")

ggsave("plots/dgbap_datasets_per_year.png")
## Saving 9 x 5 in image

How many from NCI and NINDS

dbgap_aus %>%
  pull(`NIH Institute`) %>%
  table()
## .
## NCATS   NCI   NEI NHGRI NHLBI NIAAA NIAID NIAMS NICHD  NIDA NIDCR NIDDK  NIMH 
##     2    18     1     3     1     1     3     1     1     2     1     1     5 
## NINDS 
##     8