Background

Some scripts to harvest data from the EGA API and find out what state it is in.

DACs

Retrieve all DACs housed at EGA, there are currently around 1500 (as at 2021-10-22) so set limit to 2000.

# dacs<- GET("https://ega-archive.org/metadata/v2/dacs?limit=2000")
# 
# dacs

tibble_response <- function(list){
  list %>%
    enframe() %>%
    pivot_wider() %>%
    mutate_all(~ifelse(is.null(.x[[1]]), list(NA), .x)) %>%
    unnest(cols = c(alias, egaStableId, centerName, creationTime, title, url, released, 
    published, contacts)) %>%
    mutate(emails = unlist(lapply(contacts, function(x) x$email)),
           organisation = unlist(lapply(contacts, function(x) {if(is.null(x$organisation)){
             return(NA)
           } else {
             return(x$organisation)
           }}))) %>%
    mutate_if(is.logical, as.character)
}

# dacs_content <- content(dacs)[["response"]][["result"]]
# df_list <- lapply(dacs_content, tibble_response)
# df_bound <- bind_rows(df_list)
# saveRDS(df_bound, "outputs/dacs_bound_table.rds")
df_bound <- readRDS("outputs/dacs_bound_table.rds")

Now I have harvested all the DACs in EGA, filter to Australian research institutes by using the email (hopefully)

au_researchers <- df_bound %>%
  filter(str_detect(emails, "au$")) %>%
  separate(emails, sep="@", remove=FALSE, into = c("person", "institute"))
glimpse(au_researchers)

## Rows: 116
## Columns: 13
## $ alias        <chr> "ARC Linkage Project Grant LPO990067", "ena-DAC-QCMG-14-…
## $ egaStableId  <chr> "EGAC00001000022", "EGAC00001000142", "EGAC00001000142",…
## $ centerName   <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RESEARCH…
## $ creationTime <chr> "2011-08-05T10:45.000Z", "2013-11-14T01:31.000Z", "2013-…
## $ title        <chr> "ARC Linkage Project Grant LPO990067", "DAC overseeing s…
## $ url          <chr> "not provided", "not provided", "not provided", "not pro…
## $ released     <chr> "RELEASED", "NOT_RELEASED", "NOT_RELEASED", "NOT_RELEASE…
## $ published    <chr> "TRUE", "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRU…
## $ contacts     <list> [["Melanie Bahlo", "bahlo@wehi.edu.au", "The Walter and…
## $ emails       <chr> "bahlo@wehi.edu.au", "n.waddell@imb.uq.edu.au", "m.quinn…
## $ person       <chr> "bahlo", "n.waddell", "m.quinn", "j.saunus", "s.lakhani"…
## $ institute    <chr> "wehi.edu.au", "imb.uq.edu.au", "uq.edu.au", "uq.edu.au"…
## $ organisation <chr> "The Walter and Eliza Hall Institute of Medical Research…

Count of DACs per centerName submitted. It is clear there are a few issues with normalization of this field.

au_researchers %>%
  group_by(egaStableId, centerName) %>%
  tally() %>%
  ggplot(aes(centerName)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

SACGF - South Australia centre for cancer biology - ACRF Cancer Genomics Facility

Count of DACs per organisation, again this field doesn’t seem to be used consistently.

au_researchers %>%
  group_by(egaStableId, organisation) %>%
  tally() %>%
  ggplot(aes(organisation)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

Count of DACs by the institutional part of the email address of the contacts for each DAC, gives a bit of a clearer picture.

au_researchers %>%
  group_by(egaStableId, institute) %>%
  tally() %>%
  ggplot(aes(institute)) +
  geom_bar() +
  coord_flip() +
  theme_bw()

Manually curate each institute to get nicer and more understandable labels. Here I may lose some detail, that is, groups that are housed within institutes but should give a clearer and simpler picture.

au_researchers <- au_researchers %>%
  mutate(tidy_center_names = case_when(
    str_detect(institute, "unimelb") ~ "UMELB", # University of Melbourne
    str_detect(institute, "uq") ~ "UQ",         # University of Queensland
    str_detect(institute, "syd") ~ "USYD",      # University of Sydney
    str_detect(institute, "anu") ~ "ANU",       # Australian National University
    str_detect(institute, "wehi") ~ "WEHI",     # Walter and Eliza Hall Institute
    str_detect(institute, "qut") ~ "QUT",       # Queensland University of Technology
    str_detect(institute, "garvan") ~ "GARVAN", # Garvan Institute of Medical Research
    str_detect(institute, "mcri") ~ "MCRI",     # Murdoch Children's Research Institute
    str_detect(institute, "unsw") ~ "UNSW",     # University of New South Wales
    str_detect(institute, "uow") ~ "UOW",       # University of Wollongong
    str_detect(institute, "griffith") ~ "GRIFFITH", # Griffith University
    str_detect(institute, "ccia") ~ "CCIA",         # Children's Cancer Institute
    str_detect(institute, "mh.org.au") ~ "ROYALMELB", # Royal Melbourne Hospital
    str_detect(institute, "centenary") ~ "CENTENARY", # Centenary Insitute
    str_detect(institute, "adelaide") ~ "UADEL", # University of Adelaide/SA Cancer Genomics Facility
    str_detect(institute, "qimr") ~ "QIMRB",    # QIMR Berghofer Medical Research Institute
    str_detect(institute, "lh") ~ "LIFEHOUSE",  # Chris O'Brien Lifehouse - not-for-profit Cancer Treatement centre
    str_detect(institute, "health.nsw") ~ "NSWHEALTH", # NSW health department
    str_detect(institute, "sswahs.nsw") ~ "NSWHEALTH", # Sydney South-West Health Department
    str_detect(institute, "telethon") ~ "TELETHON",  # Telethon Kids Institute
    TRUE ~ institute
  ))

Get centerName lookup table for nice names

tidy_center_lookup <- au_researchers %>%
  group_by(centerName, tidy_center_names) %>%
  tally() %>%
  select(-n)

Set institutes colour palette

length(levels(as.factor(tidy_center_lookup$tidy_center_names)))

## [1] 19

institute_colour_pal = pal_d3(palette="category20")(19)
names(institute_colour_pal) <- levels(as.factor(tidy_center_lookup$tidy_center_names))

Count of DACs per institute using curated names

au_researchers %>%
  group_by(egaStableId, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(tidy_center_names, fill=tidy_center_names)) +
  geom_bar() +
  scale_fill_manual(values = institute_colour_pal) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("Institute") +
  ylab("DAC count") +
  labs(title="Count of DACs per institute")

Contacts per DAC

au_researchers %>%
  group_by(egaStableId) %>%
  tally() %>%
  arrange(-n) %>%
  ggplot(aes(n)) +
  geom_histogram(fill=biocommons_pal['purple'], bins=7) +
  scale_x_continuous(breaks = seq(0,7,1)) +
  theme_bw() +
  xlab("Number of contacts per DAC") +
  ylab("Frequency") +
  labs(title="Frequency of DACs with certain number of contacts specified on the DAC")

The majority of DACs (49) only have a single point of contact.

Institutes per DAC

au_researchers %>%
  group_by(egaStableId, tidy_center_names) %>%
  tally() %>%
  select(-n) %>%
  group_by(egaStableId) %>%
  tally() %>%
  arrange(-n) %>%
  ggplot(aes(n)) +
  geom_histogram(fill=biocommons_pal['teal'], bins=5) +
  theme_bw() +
  xlab("Count of institutes") +
  ylab("Frequency of DACs") +
  labs(title="Frequency of number of institutes specified per DAC")

Most DACs have contacts from a single institute, 9 have DACs across multiple institutes with one DAC having contacts from 5 separate institutes. This is a consortium called the Sydney Head and Neck Cancer Institute (SHNCI).

Contacts on DACs

QIMRB

QIMRB has highest number of DACs. Contact emails tend to be individual researchers. This has the potential to cause issues if those individual researchers change institutes.

au_researchers %>%
  filter(tidy_center_names == "QIMRB") %>%
  select(person) %>%
  group_by(person) %>%
  tally() %>%
  arrange(desc(n))

## # A tibble: 19 x 2
##    person                  n
##    <chr>               <int>
##  1 nic.waddell             8
##  2 john.pearson            3
##  3 ann-marie.patch         1
##  4 bryan.day               1
##  5 christian.engwerda      1
##  6 felicity.newell         1
##  7 georgia.trench          1
##  8 gregory.quaife-ryan     1
##  9 hong.you                1
## 10 james.hudson            1
## 11 juliet.french           1
## 12 katia.nones             1
## 13 mark.smyth              1
## 14 michael.quinn           1
## 15 olga.kondrashova        1
## 16 peter.johansson         1
## 17 ross.koufariotis        1
## 18 susanna.ng              1
## 19 tobias.bald             1

University of Melbourne

au_researchers %>%
  filter(tidy_center_names == "UMELB") %>%
  select(person, centerName) %>%
  group_by(person, centerName) %>%
  tally() %>%
  arrange(desc(n))

## # A tibble: 15 x 3
## # Groups:   person [10]
##    person          centerName     n
##    <chr>           <chr>      <int>
##  1 jungch          PP_AUS         2
##  2 bjpope          UM             1
##  3 chovens         PP_GER         1
##  4 chovens         UM             1
##  5 chovens         Unimelb        1
##  6 chovens         UOM-CAP        1
##  7 chovens         UOMCAP         1
##  8 cmerom          UOMCAP         1
##  9 con             UM             1
## 10 con             Unimelb        1
## 11 epilepsy-austin WEHI           1
## 12 lachlan.coin    UMel           1
## 13 s.berkovic      WEHI           1
## 14 sarah.dunstan   NUS            1
## 15 scheffer        WEHI           1

Garvan institute

au_researchers %>%
  filter(tidy_center_names == "GARVAN") %>%
  select(person, centerName) %>%
  group_by(person, centerName) %>%
  tally() %>%
  arrange(desc(n))

## # A tibble: 8 x 3
## # Groups:   person [8]
##   person    centerName     n
##   <chr>     <chr>      <int>
## 1 e.benn    GIMR           2
## 2 grants    KCCG           2
## 3 e.ali     KCCG           1
## 4 m.cowley  KCCG           1
## 5 m.mccabe  KCCG           1
## 6 mgrb      GIMR           1
## 7 n.watkins KCCG           1
## 8 s.mueller SHNCI          1

Children’s Cancer Institute

au_researchers %>%
  filter(tidy_center_names == "CCIA") %>%
  select(person, centerName) %>%
  group_by(person, centerName) %>%
  tally() %>%
  arrange(desc(n))

## # A tibble: 4 x 3
## # Groups:   person [4]
##   person    centerName     n
##   <chr>     <chr>      <int>
## 1 zero      ZERO           3
## 2 mcowley   SHNCI          1
## 3 mgauthier SHNCI          1
## 4 pekert    MCRI           1

DACs over time

Increasing number of DACs over time, until 2021, perhaps covid related.

au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y"))) %>%
  filter(creationYear > 1982) %>%  #remove 1980 DAC as obvious error
  ggplot(aes(x=creationYear)) + 
  geom_bar(fill=biocommons_pal['pink']) +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("DAC count")

dacs per institute over time dodged, potentially a bit ugly but shows more institutes being involved in later years - 2018-2020

au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y"))) %>%
  filter(creationYear > 1982) %>%   #remove 1980 DAC as obvious error
  ggplot(aes(x=creationYear, fill=tidy_center_names)) + 
  geom_bar(position="dodge", width=0.9) +
  scale_x_continuous(breaks = seq(1980,2021, 1), name="Institute") +
  scale_fill_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom")

dacs per institute over time, if dac has contacts from multiple insitutes, will be counted more than once

au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y"))) %>%
  filter(creationYear > 1982) %>%  #remove 1980 DAC as obvious error
  ggplot(aes(x=creationYear, fill=tidy_center_names)) + 
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  xlab("Year of submission") +
  ylab("DAC count") +
  labs(title="DACs submitted to EGA each year 2011-2021 coloured by institute of contacts")

Large increase in DACs in 2020 and an increase in the number of institutes submitting DACs to EGA.

Linking DACs and Datasets

Do cross reference from the listed DAC IDs to what datasets are present

For a given DAC, how many datasets does it manage

datasets_per_dac <- function(ega_stable_id){
  api_request <- paste0("https://ega-archive.org/metadata/v2/datasets?queryBy=dac&queryId=", ega_stable_id, "&limit=0")
  response <- GET(api_request)
  if (length(content(response)[["response"]][["result"]]) > 0){
    return(content(response)[["response"]][["result"]]) 
  } else {
    return(list(NA))
  }
}

datasets_per_dac("EGAC00001000951")

## [[1]]
## [1] NA

get datasets for all dacs

# dac_datasets <- au_researchers %>%
#   group_by(egaStableId, centerName, tidy_center_names) %>%
#   tally() %>%
#   rowwise() %>%
#   mutate(datasets = list(datasets_per_dac(egaStableId))) %>%
#   ungroup()
# saveRDS(dac_datasets, "outputs/dac_datasets.rds")
dac_datasets <- readRDS("outputs/dac_datasets.rds")

Ghost DACs

Some DACs do not have any associated datasets, I think this is probably due to any datasets for those DACs not yet being released, or some may have been submitted for testing.

dac_datasets %>%
  rowwise() %>%
  unnest() %>%
  filter(is.na(datasets))

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`

## # A tibble: 24 x 5
##    egaStableId     centerName tidy_center_names     n datasets 
##    <chr>           <chr>      <chr>             <int> <list>   
##  1 EGAC00001000142 QCMG       UQ                    4 <lgl [1]>
##  2 EGAC00001000229 QLDI       UQ                    1 <lgl [1]>
##  3 EGAC00001000368 CENT       CENTENARY             1 <lgl [1]>
##  4 EGAC00001000497 RPAH       USYD                  2 <lgl [1]>
##  5 EGAC00001000498 RPAH       USYD                  2 <lgl [1]>
##  6 EGAC00001000700 QCMG       QIMRB                 1 <lgl [1]>
##  7 EGAC00001000773 PP_GER     UMELB                 1 <lgl [1]>
##  8 EGAC00001000951 PP_AUS     UMELB                 1 <lgl [1]>
##  9 EGAC00001000952 PP_AUS     UMELB                 1 <lgl [1]>
## 10 EGAC00001001090 QIMR       QIMRB                 1 <lgl [1]>
## # … with 14 more rows

Get information about the Ghost DAcs

ghost_dacs <- dac_datasets %>%
  rowwise() %>%
  unnest() %>%
  filter(is.na(datasets)) %>%
  pull(egaStableId)

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`

au_researchers %>%
  filter(egaStableId %in% ghost_dacs)

## # A tibble: 33 x 14
##    alias egaStableId centerName creationTime title url   released published
##    <chr> <chr>       <chr>      <chr>        <chr> <chr> <chr>    <chr>    
##  1 ena-… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  2 ena-… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  3 ena-… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  4 ena-… EGAC000010… QCMG       2013-11-14T… DAC … not … NOT_REL… FALSE    
##  5 ena-… EGAC000010… QLDI       2014-08-12T… UQ D… <NA>  NOT_REL… FALSE    
##  6 ena-… EGAC000010… CENT       2015-07-29T… CI L… <NA>  NOT_REL… FALSE    
##  7 ena-… EGAC000010… RPAH       2016-06-04T… Depa… NA    NOT_REL… FALSE    
##  8 ena-… EGAC000010… RPAH       2016-06-04T… Depa… NA    NOT_REL… FALSE    
##  9 ena-… EGAC000010… RPAH       2016-06-06T… Depa… NA    NOT_REL… FALSE    
## 10 ena-… EGAC000010… RPAH       2016-06-06T… Depa… NA    NOT_REL… FALSE    
## # … with 23 more rows, and 6 more variables: contacts <list>, emails <chr>,
## #   person <chr>, institute <chr>, organisation <chr>, tidy_center_names <chr>

Weird that there are DACs that are not released even though they were submitted many years ago, e.g. 2013-2016. Some are fairly new, e.g. earlier this year, so perhaps will be released at a later date.

Datasets per DAC

show counts of datasets per dac

dac_datasets %>%
  rowwise() %>%
  unnest() %>%
  ggplot(aes(egaStableId, fill=tidy_center_names)) +
  geom_bar() +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  coord_flip() +
  theme_bw()+
  theme(legend.position = "bottom")

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`

dac_datasets %>%
  group_by(egaStableId) %>%
  summarise(datasets_obj = unique(datasets)) %>%
  rowwise() %>%
  mutate(num_datasets = length(datasets_obj)) %>%
  ungroup() %>%
  ggplot(aes(num_datasets)) +
  geom_histogram(bins=9, fill=biocommons_pal['blue']) +
  scale_x_continuous(breaks = seq(0,8,1)) +
  theme_bw() +
  xlab("Count of datasets per DAC") +
  ylab("Frequency")

## `summarise()` ungrouping output (override with `.groups` argument)

Of the total of 71 individual DACs with Australian resarchers as contacts, the vast majority (59) have a single dataset attached to them.

The South Australian Cancer Genomics Facility has the highest number of datasets attached to a single DAC (8).

There are a handful of other examples that have more than one dataset governed by a single DAC.

Datasets

Tease out the information in the datasets table

datasets <- dac_datasets %>%
  select(datasets) %>%
  unnest() %>% 
  filter(!is.na(datasets))

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`

dataset_list <- datasets$datasets
datasets_df_list <- lapply(dataset_list, function(x) pivot_wider(enframe(unlist(x))))
bound_datasets <- bind_rows(datasets_df_list) %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.numeric(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  left_join(tidy_center_lookup)

## Joining, by = "centerName"

glimpse(bound_datasets)

## Rows: 168
## Columns: 23
## $ alias             <chr> "Hepatitis C IL28B resequencing study", "20210729_E…
## $ egaStableId       <chr> "EGAD00001000032", "EGAD00001007973", "EGAD00001007…
## $ centerName        <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RES…
## $ creationTime      <date> 2011-08-22, 2021-08-05, 2021-08-05, 2021-08-05, 20…
## $ published         <chr> "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TR…
## $ released          <chr> "RELEASED", "RELEASED", "RELEASED", "RELEASED", "RE…
## $ releasedDate      <chr> "2015-05-27T00:00.000Z", "2001-01-01T00:00.000Z", "…
## $ title             <chr> "Hepatitis C IL28B pooled resequencing study with 1…
## $ description       <chr> "Hepatitis C IL28B pooled resequencing study with 1…
## $ technology        <chr> "Illumina Genome Analyzer IIx", NA, NA, NA, NA, NA,…
## $ numSamples        <int> 4, 106, 106, 106, 106, 106, 106, 44, 44, 44, 4, 183…
## $ datasetTypes      <chr> "sample", "sample", "sample", "sample", "sample", "…
## $ policyStableId    <chr> "EGAP00001000022", "EGAP00001000187", "EGAP00001000…
## $ availableInBeacon <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE…
## $ accessType        <chr> "CONTROLLED", "CONTROLLED", "CONTROLLED", "CONTROLL…
## $ technology1       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Illumi…
## $ technology2       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Illumi…
## $ technology3       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "NextSe…
## $ datasetTypes1     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Whole …
## $ datasetTypes2     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Transc…
## $ technology4       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ creationYear      <dbl> 2011, 2021, 2021, 2021, 2021, 2021, 2021, 2014, 201…
## $ tidy_center_names <chr> "WEHI", "QIMRB", "QUT", "UQ", "QIMRB", "QUT", "UQ",…

Datasets over time

bound_datasets %>%
  group_by(egaStableId, creationYear) %>%
  tally() %>%
  ggplot(aes(creationYear)) +
  geom_bar(fill=biocommons_pal['yellow']) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year")

without 1980

bound_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(egaStableId, creationYear) %>%
  tally() %>%
  ggplot(aes(creationYear)) +
  geom_bar(fill=biocommons_pal['yellow']) +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year")

Datasets per year, stacked by contributing institute. Datasets with more than one contributing institute are counted more than once.

bound_datasets %>%
  group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(creationYear, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year") +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust=1))

bound_datasets %>%
  mutate(na_year = ifelse(creationYear == 1980, NA, creationYear)) %>%
  group_by(egaStableId, na_year, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(na_year, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year") +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust=1))

## Warning: Removed 9 rows containing non-finite values (stat_count).

bound_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(creationYear, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year") +
  theme(legend.position = "bottom")

bound_datasets %>%
  filter(creationYear > 1980) %>%
  left_join(tidy_center_lookup) %>%
  group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(creationYear, fill=tidy_center_names)) +
  geom_bar(width=0.9) +
  scale_fill_manual(values = institute_colour_pal, name="Institute") +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title="Datasets submitted per year") +
  theme(legend.position = "bottom")

## Joining, by = c("centerName", "tidy_center_names")

Samples submitted per year

bound_datasets %>%
  left_join(tidy_center_lookup) %>%
  group_by(creationYear, tidy_center_names) %>%
  summarise(summed_by_institute = sum(numSamples)) %>%
  ggplot(aes(x=creationYear, y=summed_by_institute, col=tidy_center_names, label=tidy_center_names)) +
  # geom_jitter() +
  geom_text(check_overlap = TRUE) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "none")

## Joining, by = c("centerName", "tidy_center_names")

## `summarise()` regrouping output by 'creationYear' (override with `.groups` argument)

bound_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(creationYear, tidy_center_names) %>%
  summarise(summed_by_institute = sum(numSamples)) %>%
  ggplot(aes(x=creationYear, y=summed_by_institute, col=tidy_center_names, label=tidy_center_names)) +
  # geom_jitter() +
  geom_text(check_overlap = TRUE) +
  scale_x_continuous(breaks = seq(1980,2021, 1)) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("Sum of submitted samples per institute") +
  xlab("Year of submission")

## `summarise()` regrouping output by 'creationYear' (override with `.groups` argument)

bound_datasets %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.character(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  group_by(egaStableId, creationYear, creationTime, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup) %>%
  ggplot(aes(x=creationTime, y=numSamples, col=tidy_center_names)) +
  geom_point(shape=1, size=4) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs("Number of samples submitted per dataset over time, coloured by institute") +
  ylab("Number of samples per dataset") +
  xlab("Year of submission")

## Joining, by = "centerName"

bound_datasets %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.character(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  filter(creationYear > 1980) %>%
  group_by(egaStableId, creationYear, creationTime, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup) %>%
  ggplot(aes(x=creationTime, y=numSamples, col=tidy_center_names)) +
  geom_beeswarm(shape=1, size=4) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs("Number of samples submitted per dataset over time, coloured by institute") +
  ylab("Number of samples per dataset") +
  xlab("Year of submission")

## Joining, by = "centerName"

## Warning in f(...): The default behavior of beeswarm has changed in version
## 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis.
## Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis
## choice.

trying out log scale but don’t think it is very good visually.

bound_datasets %>%
  mutate(creationTime = as.Date(creationTime),
         creationYear = as.character(format(creationTime, "%Y")),
         numSamples = as.integer(numSamples)) %>%
  group_by(egaStableId, creationYear, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup) %>%
  ggplot(aes(x=creationYear, y=numSamples, col=tidy_center_names)) +
  geom_jitter(shape=1, size=3) +
  scale_color_manual(values = institute_colour_pal) +
  scale_y_continuous(trans = log10_trans()) +
  theme(legend.position = "bottom") +
  theme_bw()

## Joining, by = "centerName"

Amount of data

files_per_dataset <- function(dataset_accession){
  api_request <- paste0("https://ega-archive.org/metadata/v2/files?queryBy=dataset&queryId=", dataset_accession, "&limit=0")
  response <- GET(api_request)
  if(status_code(response) == 200){
    this_content <- content(response)
    num_files <- this_content[["response"]][["numTotalResults"]]
    sizes <- lapply(this_content[["response"]][["result"]], function(x) x[["fileSize"]])
    sizes_length <- length(sizes)
    summed_size <- do.call(sum, sizes)
    return(tibble(dataset_accession, num_files, sizes_length, summed_size))
  } else {
    return(NA)
  }
}

# test_response <- files_per_dataset(dataset_list[51])

# dataset_list <- unique(bound_datasets$egaStableId)
# file_size_list <- lapply(dataset_list, files_per_dataset)
# bound_files_dataset <- bind_rows(file_size_list)
# files_and_datasets <- bound_files_dataset %>%
#   left_join(bound_datasets, by=c("dataset_accession" = "egaStableId"))

# saveRDS(file_size_list, "outputs/file_size_list.rds")
# saveRDS(files_and_datasets, "outputs/files_joined_datasets.rds")
files_and_datasets <- readRDS("outputs/files_joined_datasets.rds")

Total data stored in EGA by aussie researchers

per_dataset_file_summary <- files_and_datasets %>%
  group_by(dataset_accession, num_files, summed_size) %>%
  tally()

Total data stored in EGA by aussie researchers: * 14673 files * 325.4687165 TB of data

files_and_datasets %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime) %>%
  filter(creationYear > 1980) %>%
  tally() %>%
  mutate(file_size_gb = summed_size/1e+9) %>%
    ggplot(aes(x=creationTime, y=file_size_gb)) +
    geom_point() +
    geom_abline(intercept = 10000, slope=0, color="red") +
    scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
    # scale_y_continuous(trans = log10_trans()) +
    theme_bw() +
    ylab("Total file size per dataset (GB)") +
    xlab("Year of submission") +
    labs(title="Total file size per data set over time")

sum_size_plot <- files_and_datasets %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationYear, datasetTypes, title) %>%
  filter(creationYear > 1980) %>%
  tally() %>%
  mutate(file_size_gb = summed_size/1e+9) %>%
    ggplot(aes(x=creationYear, y=file_size_gb, col=centerName, label=title, fill=datasetTypes)) +
    geom_beeswarm() +
    geom_abline(intercept = 10000, slope=0, color="red") +
    # scale_y_continuous(trans = log10_trans()) +
    scale_x_continuous(breaks=seq(1980,2022,1)) +
    theme_bw() +
    ylab("Total file size per dataset (GB)") +
    xlab("Year of submission") +
    labs(title="Total file size per data set over time") +
  theme(legend.position = "none")

ggplotly(sum_size_plot)

files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime, title) %>%
  tally() %>%
    ggplot(aes(x=creationTime, y=num_files)) +
    geom_point() +
    scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
    theme_bw() +
    ylab("Total files per dataset") +
    xlab("Year of submission") +
    labs(title="Total files per data set over time")

num_files_plot <- files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime, creationYear, title, datasetTypes) %>%
  tally() %>%
    ggplot(aes(x=creationYear, y=num_files, label=title, col=centerName, fill=datasetTypes)) +
    geom_beeswarm() +
    # scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
    theme_bw() +
    ylab("Total files per dataset") +
    xlab("Year of submission") +
    labs(title="Total files per data set over time") +
    theme(legend.position = "none")
ggplotly(num_files_plot)

Australian Data in the EGA

Marion Shadbolt

2021-10-25