Some scripts to harvest data from the EGA API and find out what state it is in.
Retrieve all DACs housed at EGA, there are currently around 1500 (as at 2021-10-22) so set limit to 2000.
# dacs<- GET("https://ega-archive.org/metadata/v2/dacs?limit=2000")
#
# dacs
tibble_response <- function(list){
list %>%
enframe() %>%
pivot_wider() %>%
mutate_all(~ifelse(is.null(.x[[1]]), list(NA), .x)) %>%
unnest(cols = c(alias, egaStableId, centerName, creationTime, title, url, released,
published, contacts)) %>%
mutate(emails = unlist(lapply(contacts, function(x) x$email)),
organisation = unlist(lapply(contacts, function(x) {if(is.null(x$organisation)){
return(NA)
} else {
return(x$organisation)
}}))) %>%
mutate_if(is.logical, as.character)
}
# dacs_content <- content(dacs)[["response"]][["result"]]
# df_list <- lapply(dacs_content, tibble_response)
# df_bound <- bind_rows(df_list)
# saveRDS(df_bound, "outputs/dacs_bound_table.rds")
df_bound <- readRDS("outputs/dacs_bound_table.rds")
Now I have harvested all the DACs in EGA, filter to Australian research institutes by using the email (hopefully)
au_researchers <- df_bound %>%
filter(str_detect(emails, "au$")) %>%
separate(emails, sep="@", remove=FALSE, into = c("person", "institute"))
glimpse(au_researchers)
## Rows: 116
## Columns: 13
## $ alias <chr> "ARC Linkage Project Grant LPO990067", "ena-DAC-QCMG-14-1…
## $ egaStableId <chr> "EGAC00001000022", "EGAC00001000142", "EGAC00001000142", …
## $ centerName <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RESEARCH"…
## $ creationTime <chr> "2011-08-05T10:45.000Z", "2013-11-14T01:31.000Z", "2013-1…
## $ title <chr> "ARC Linkage Project Grant LPO990067", "DAC overseeing se…
## $ url <chr> "not provided", "not provided", "not provided", "not prov…
## $ released <chr> "RELEASED", "NOT_RELEASED", "NOT_RELEASED", "NOT_RELEASED…
## $ published <chr> "TRUE", "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE…
## $ contacts <list> ["Melanie Bahlo", "bahlo@wehi.edu.au", "The Walter and E…
## $ emails <chr> "bahlo@wehi.edu.au", "n.waddell@imb.uq.edu.au", "m.quinn@…
## $ person <chr> "bahlo", "n.waddell", "m.quinn", "j.saunus", "s.lakhani",…
## $ institute <chr> "wehi.edu.au", "imb.uq.edu.au", "uq.edu.au", "uq.edu.au",…
## $ organisation <chr> "The Walter and Eliza Hall Institute of Medical Research"…
Count of DACs per centerName submitted. It is clear there are a few issues with normalization of this field.
au_researchers %>%
group_by(egaStableId, centerName) %>%
tally() %>%
ggplot(aes(centerName)) +
geom_bar() +
coord_flip() +
theme_bw()
SACGF - South Australia centre for cancer biology - ACRF Cancer Genomics Facility
Count of DACs per organisation, again this field doesn’t seem to be used consistently.
au_researchers %>%
group_by(egaStableId, organisation) %>%
tally() %>%
ggplot(aes(organisation)) +
geom_bar() +
coord_flip() +
theme_bw()
Count of DACs by the institutional part of the email address of the contacts for each DAC, gives a bit of a clearer picture.
au_researchers %>%
group_by(egaStableId, institute) %>%
tally() %>%
ggplot(aes(institute)) +
geom_bar() +
coord_flip() +
theme_bw()
Manually curate each institute to get nicer and more understandable labels. Here I may lose some detail, that is, groups that are housed within institutes but should give a clearer and simpler picture.
au_researchers <- au_researchers %>%
mutate(tidy_center_names = case_when(
str_detect(institute, "unimelb") ~ "UMELB", # University of Melbourne
str_detect(institute, "uq") ~ "UQ", # University of Queensland
str_detect(institute, "syd") ~ "USYD", # University of Sydney
str_detect(institute, "anu") ~ "ANU", # Australian National University
str_detect(institute, "wehi") ~ "WEHI", # Walter and Eliza Hall Institute
str_detect(institute, "qut") ~ "QUT", # Queensland University of Technology
str_detect(institute, "garvan") ~ "GARVAN", # Garvan Institute of Medical Research
str_detect(institute, "mcri") ~ "MCRI", # Murdoch Children's Research Institute
str_detect(institute, "unsw") ~ "UNSW", # University of New South Wales
str_detect(institute, "uow") ~ "UOW", # University of Wollongong
str_detect(institute, "griffith") ~ "GRIFFITH", # Griffith University
str_detect(institute, "ccia") ~ "CCIA", # Children's Cancer Institute
str_detect(institute, "mh.org.au") ~ "ROYALMELB", # Royal Melbourne Hospital
str_detect(institute, "centenary") ~ "CENTENARY", # Centenary Insitute
str_detect(institute, "adelaide") ~ "UADEL", # University of Adelaide/SA Cancer Genomics Facility
str_detect(institute, "qimr") ~ "QIMRB", # QIMR Berghofer Medical Research Institute
str_detect(institute, "lh") ~ "LIFEHOUSE", # Chris O'Brien Lifehouse - not-for-profit Cancer Treatement centre
str_detect(institute, "health.nsw") ~ "NSWHEALTH", # NSW health department
str_detect(institute, "sswahs.nsw") ~ "NSWHEALTH", # Sydney South-West Health Department
str_detect(institute, "telethon") ~ "TELETHON", # Telethon Kids Institute
TRUE ~ institute
))
Get centerName lookup table for nice names
tidy_center_lookup <- au_researchers %>%
group_by(centerName, tidy_center_names) %>%
tally() %>%
select(-n)
Set institutes colour palette
length(levels(as.factor(tidy_center_lookup$tidy_center_names)))
## [1] 20
institute_colour_pal = pal_d3(palette="category20")(20)
names(institute_colour_pal) <- levels(as.factor(tidy_center_lookup$tidy_center_names))
Count of DACs per institute using curated names
au_researchers %>%
group_by(egaStableId, tidy_center_names) %>%
tally() %>%
ggplot(aes(tidy_center_names, fill=tidy_center_names)) +
geom_bar() +
scale_fill_manual(values = institute_colour_pal) +
coord_flip() +
theme_bw() +
theme(legend.position = "none") +
xlab("Institute") +
ylab("DAC count") +
labs(title="Count of DACs per institute")
Contacts per DAC
contacts_per_dac <- au_researchers %>%
group_by(egaStableId) %>%
tally() %>%
arrange(-n) %>%
ggplot(aes(n)) +
geom_histogram(fill=biocommons_pal['purple'], bins=7) +
scale_x_continuous(breaks = seq(0,7,1)) +
theme_bw() +
xlab("Number of contacts per DAC") +
ylab("Frequency")
# + labs(title="Frequency of DACs with certain number of contacts specified on the DAC")
contacts_per_dac
ggsave("plots/contacts_per_dac_hist.png")
## Saving 9 x 5 in image
The majority of DACs (49) out of 71 only have a single point of contact.
au_researchers %>%
group_by(egaStableId) %>%
summarise(n = n(),
emails = paste0(person, collapse=";")) %>%
arrange(n)
## # A tibble: 71 × 3
## egaStableId n emails
## <chr> <int> <chr>
## 1 EGAC00001000022 1 bahlo
## 2 EGAC00001000192 1 nic.waddell
## 3 EGAC00001000220 1 rob.king
## 4 EGAC00001000229 1 matt.brown
## 5 EGAC00001000230 1 chovens
## 6 EGAC00001000261 1 aghs
## 7 EGAC00001000368 1 m.budzinska
## 8 EGAC00001000465 1 d.lambert
## 9 EGAC00001000565 1 sarah.dunstan
## 10 EGAC00001000577 1 majewski
## # … with 61 more rows
Institutes per DAC
au_researchers %>%
group_by(egaStableId, tidy_center_names) %>%
tally() %>%
select(-n) %>%
group_by(egaStableId) %>%
tally() %>%
arrange(-n) %>%
ggplot(aes(n)) +
geom_histogram(fill=biocommons_pal['teal'], bins=5) +
theme_bw() +
xlab("Count of institutes") +
ylab("Frequency of DACs") +
labs(title="Frequency of number of institutes specified per DAC")
Most DACs have contacts from a single institute, 9 have DACs across
multiple institutes with one DAC having contacts from 5 separate
institutes. This is a consortium called the
Sydney Head and Neck Cancer Institute (SHNCI).
QIMRB has highest number of DACs. Contact emails tend to be individual researchers. This has the potential to cause issues if those individual researchers change institutes.
au_researchers %>%
filter(tidy_center_names == "QIMRB") %>%
select(person) %>%
group_by(person) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 19 × 2
## person n
## <chr> <int>
## 1 nic.waddell 8
## 2 john.pearson 3
## 3 ann-marie.patch 1
## 4 bryan.day 1
## 5 christian.engwerda 1
## 6 felicity.newell 1
## 7 georgia.trench 1
## 8 gregory.quaife-ryan 1
## 9 hong.you 1
## 10 james.hudson 1
## 11 juliet.french 1
## 12 katia.nones 1
## 13 mark.smyth 1
## 14 michael.quinn 1
## 15 olga.kondrashova 1
## 16 peter.johansson 1
## 17 ross.koufariotis 1
## 18 susanna.ng 1
## 19 tobias.bald 1
au_researchers %>%
filter(tidy_center_names == "UMELB") %>%
select(person, centerName) %>%
group_by(person, centerName) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 15 × 3
## # Groups: person [10]
## person centerName n
## <chr> <chr> <int>
## 1 jungch PP_AUS 2
## 2 bjpope UM 1
## 3 chovens PP_GER 1
## 4 chovens UM 1
## 5 chovens Unimelb 1
## 6 chovens UOM-CAP 1
## 7 chovens UOMCAP 1
## 8 cmerom UOMCAP 1
## 9 con UM 1
## 10 con Unimelb 1
## 11 epilepsy-austin WEHI 1
## 12 lachlan.coin UMel 1
## 13 s.berkovic WEHI 1
## 14 sarah.dunstan NUS 1
## 15 scheffer WEHI 1
au_researchers %>%
filter(tidy_center_names == "GARVAN") %>%
select(person, centerName) %>%
group_by(person, centerName) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 8 × 3
## # Groups: person [8]
## person centerName n
## <chr> <chr> <int>
## 1 e.benn GIMR 2
## 2 grants KCCG 2
## 3 e.ali KCCG 1
## 4 m.cowley KCCG 1
## 5 m.mccabe KCCG 1
## 6 mgrb GIMR 1
## 7 n.watkins KCCG 1
## 8 s.mueller SHNCI 1
au_researchers %>%
filter(tidy_center_names == "CCIA") %>%
select(person, centerName) %>%
group_by(person, centerName) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 4 × 3
## # Groups: person [4]
## person centerName n
## <chr> <chr> <int>
## 1 zero ZERO 3
## 2 mcowley SHNCI 1
## 3 mgauthier SHNCI 1
## 4 pekert MCRI 1
Increasing number of DACs over time, until 2021, perhaps covid related.
au_researchers %>%
group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
tally() %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.numeric(format(creationTime, "%Y"))) %>%
filter(creationYear > 1982) %>% #remove 1980 DAC as obvious error
ggplot(aes(x=creationYear)) +
geom_bar(fill=biocommons_pal['pink']) +
scale_x_continuous(breaks = seq(1980,2022, 1)) +
theme_bw() +
xlab("Year of submission") +
ylab("DAC count")
dacs per institute over time dodged, potentially a bit ugly but shows more institutes being involved in later years - 2018-2020
au_researchers %>%
group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
tally() %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.numeric(format(creationTime, "%Y"))) %>%
filter(creationYear > 1982) %>% #remove 1980 DAC as obvious error
ggplot(aes(x=creationYear, fill=tidy_center_names)) +
geom_bar(position="dodge", width=0.9) +
scale_x_continuous(breaks = seq(1980,2022, 1), name="Institute") +
scale_fill_manual(values = institute_colour_pal) +
theme_bw() +
theme(legend.position = "bottom")
dacs per institute over time, if dac has contacts from multiple insitutes, will be counted more than once
au_researchers %>%
group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
tally() %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.numeric(format(creationTime, "%Y"))) %>%
filter(creationYear > 1982) %>% #remove 1980 DAC as obvious error
ggplot(aes(x=creationYear, fill=tidy_center_names)) +
geom_bar(width=0.9) +
scale_fill_manual(values = institute_colour_pal, name="Institute") +
scale_x_continuous(breaks = seq(1980,2022, 1)) +
theme_bw() +
theme(legend.position = "bottom") +
xlab("Year of submission") +
ylab("DAC count") +
labs(title="DACs submitted to EGA each year 2011-2021 coloured by institute of contacts")
Large increase in DACs in 2020 and an increase in the number of institutes submitting DACs to EGA.
Do cross reference from the listed DAC IDs to what datasets are present
For a given DAC, how many datasets does it manage
datasets_per_dac <- function(ega_stable_id){
api_request <- paste0("https://ega-archive.org/metadata/v2/datasets?queryBy=dac&queryId=", ega_stable_id, "&limit=0")
response <- GET(api_request)
if (length(content(response)[["response"]][["result"]]) > 0){
return(content(response)[["response"]][["result"]])
} else {
return(list(NA))
}
}
datasets_per_dac("EGAC00001000951")
## [[1]]
## [1] NA
get datasets for all dacs
# dac_datasets <- au_researchers %>%
# group_by(egaStableId, centerName, tidy_center_names) %>%
# tally() %>%
# rowwise() %>%
# mutate(datasets = list(datasets_per_dac(egaStableId))) %>%
# ungroup()
# saveRDS(dac_datasets, "outputs/dac_datasets.rds")
dac_datasets <- readRDS("outputs/dac_datasets.rds")
Some DACs do not have any associated datasets, I think this is probably due to any datasets for those DACs not yet being released, or some may have been submitted for testing.
dac_datasets %>%
rowwise() %>%
unnest() %>%
filter(is.na(datasets))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
## # A tibble: 17 × 5
## egaStableId centerName tidy_center_names n datasets
## <chr> <chr> <chr> <int> <list>
## 1 EGAC00001000142 QCMG UQ 4 <lgl [1]>
## 2 EGAC00001000229 QLDI UQ 1 <lgl [1]>
## 3 EGAC00001000368 CENT CENTENARY 1 <lgl [1]>
## 4 EGAC00001000497 RPAH USYD 2 <lgl [1]>
## 5 EGAC00001000498 RPAH USYD 2 <lgl [1]>
## 6 EGAC00001000700 QCMG QIMRB 1 <lgl [1]>
## 7 EGAC00001000773 PP_GER UMELB 1 <lgl [1]>
## 8 EGAC00001000951 PP_AUS UMELB 1 <lgl [1]>
## 9 EGAC00001000952 PP_AUS UMELB 1 <lgl [1]>
## 10 EGAC00001001094 GIMR GARVAN 1 <lgl [1]>
## 11 EGAC00001001097 GIMR GARVAN 1 <lgl [1]>
## 12 EGAC00001001447 sacgf UADEL 1 <lgl [1]>
## 13 EGAC00001001631 QCMG QIMRB 1 <lgl [1]>
## 14 EGAC00001001696 UFRN TELETHON 1 <lgl [1]>
## 15 EGAC00001001836 WEHI WEHI 1 <lgl [1]>
## 16 EGAC00001001863 ZERO CCIA 1 <lgl [1]>
## 17 EGAC00001001921 QCMG QIMRB 2 <lgl [1]>
Get information about the Ghost DAcs
ghost_dacs <- dac_datasets %>%
rowwise() %>%
unnest() %>%
filter(is.na(datasets)) %>%
pull(egaStableId)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
au_researchers %>%
filter(egaStableId %in% ghost_dacs)
## # A tibble: 23 × 14
## alias egaStableId centerName creationTime title url released published
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ena-DAC-Q… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 2 ena-DAC-Q… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 3 ena-DAC-Q… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 4 ena-DAC-Q… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 5 ena-DAC-Q… EGAC000010… QLDI 2014-08-12T… UQ D… <NA> NOT_REL… FALSE
## 6 ena-DAC-C… EGAC000010… CENT 2015-07-29T… CI L… <NA> NOT_REL… FALSE
## 7 ena-DAC-R… EGAC000010… RPAH 2016-06-04T… Depa… NA NOT_REL… FALSE
## 8 ena-DAC-R… EGAC000010… RPAH 2016-06-04T… Depa… NA NOT_REL… FALSE
## 9 ena-DAC-R… EGAC000010… RPAH 2016-06-06T… Depa… NA NOT_REL… FALSE
## 10 ena-DAC-R… EGAC000010… RPAH 2016-06-06T… Depa… NA NOT_REL… FALSE
## # … with 13 more rows, and 6 more variables: contacts <list>, emails <chr>,
## # person <chr>, institute <chr>, organisation <chr>, tidy_center_names <chr>
Weird that there are DACs that are not released even though they were submitted many years ago, e.g. 2013-2016. Some are fairly new, e.g. earlier this year, so perhaps will be released at a later date.
show counts of datasets per dac
dac_datasets %>%
rowwise() %>%
unnest() %>%
ggplot(aes(egaStableId, fill=tidy_center_names)) +
geom_bar() +
scale_fill_manual(values = institute_colour_pal, name="Institute") +
coord_flip() +
theme_bw()+
theme(legend.position = "bottom")
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
datasets_per_dac_plot <- dac_datasets %>%
group_by(egaStableId) %>%
summarise(datasets_obj = unique(datasets)) %>%
rowwise() %>%
mutate(num_datasets = length(datasets_obj)) %>%
ungroup() %>%
ggplot(aes(num_datasets)) +
geom_histogram(bins=9, fill=biocommons_pal['blue']) +
scale_x_continuous(breaks = seq(0,8,1)) +
theme_bw() +
xlab("Count of datasets per DAC") +
ylab("Frequency")
datasets_per_dac_plot
ggsave("plots/datasets_per_dac.png")
## Saving 9 x 5 in image
Of the total of 71 individual DACs with Australian resarchers as contacts, the vast majority (59) have a single dataset attached to them.
The South Australian Cancer Genomics Facility has the highest number of datasets attached to a single DAC (8).
There are a handful of other examples that have more than one dataset governed by a single DAC.
Tease out the information in the datasets table
datasets <- dac_datasets %>%
select(datasets) %>%
unnest() %>%
filter(!is.na(datasets))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
dataset_list <- datasets$datasets
datasets_df_list <- lapply(dataset_list, function(x) pivot_wider(enframe(unlist(x))))
bound_datasets <- bind_rows(datasets_df_list) %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.numeric(format(creationTime, "%Y")),
releasedDate = as.Date(releasedDate),
releasedYear = as.numeric(format(releasedDate, "%Y")),
numSamples = as.integer(numSamples)) %>%
left_join(tidy_center_lookup)
## Joining, by = "centerName"
glimpse(bound_datasets)
## Rows: 197
## Columns: 25
## $ alias <chr> "Hepatitis C IL28B resequencing study", "20210729_EG…
## $ egaStableId <chr> "EGAD00001000032", "EGAD00001007973", "EGAD000010079…
## $ centerName <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RESE…
## $ creationTime <date> 2011-08-22, 2021-08-05, 2021-08-05, 2021-08-05, 202…
## $ published <chr> "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRU…
## $ released <chr> "RELEASED", "RELEASED", "RELEASED", "RELEASED", "REL…
## $ releasedDate <date> 2015-05-27, 2001-01-01, 2001-01-01, 2001-01-01, 200…
## $ title <chr> "Hepatitis C IL28B pooled resequencing study with 10…
## $ description <chr> "Hepatitis C IL28B pooled resequencing study with 10…
## $ technology <chr> "Illumina Genome Analyzer IIx", NA, NA, NA, NA, NA, …
## $ numSamples <int> 4, 106, 106, 106, 106, 106, 106, 44, 44, 44, 4, 4, 1…
## $ datasetTypes <chr> "sample", "sample", "sample", "sample", "sample", "s…
## $ policyStableId <chr> "EGAP00001000022", "EGAP00001000187", "EGAP000010001…
## $ availableInBeacon <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE"…
## $ accessType <chr> "CONTROLLED", "CONTROLLED", "CONTROLLED", "CONTROLLE…
## $ technology1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Ill…
## $ technology2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Ill…
## $ technology3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Nex…
## $ datasetTypes1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Who…
## $ datasetTypes2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Tra…
## $ technology4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ datasetTypes3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ creationYear <dbl> 2011, 2021, 2021, 2021, 2021, 2021, 2021, 2014, 2014…
## $ releasedYear <dbl> 2015, 2001, 2001, 2001, 2001, 2001, 2001, 2015, 2015…
## $ tidy_center_names <chr> "WEHI", "QIMRB", "QUT", "UQ", "QIMRB", "QUT", "UQ", …
bound_datasets %>%
filter(creationYear < 1990)
## # A tibble: 11 × 25
## alias egaStableId centerName creationTime published released releasedDate
## <chr> <chr> <chr> <date> <chr> <chr> <date>
## 1 ena-DATA… EGAD000010… TKI_AGHS 1980-01-01 TRUE RELEASED 2016-03-02
## 2 Illumina… EGAD000100… TKI_AGHS 1980-01-01 TRUE RELEASED 2015-07-28
## 3 ena-DATA… EGAD000010… GARVAN 1980-01-01 TRUE RELEASED 2016-08-17
## 4 ena-DATA… EGAD000010… GARVAN 1980-01-01 TRUE RELEASED 2016-08-17
## 5 ena-DATA… EGAD000010… GARVAN 1980-01-01 TRUE RELEASED 2016-08-17
## 6 ena-DATA… EGAD000010… GARVAN 1980-01-01 TRUE RELEASED 2016-08-17
## 7 HanChine… EGAD000100… <NA> 1980-01-01 TRUE RELEASED 2015-05-27
## 8 HanChine… EGAD000100… <NA> 1980-01-01 TRUE RELEASED 2015-05-27
## 9 HanChine… EGAD000100… <NA> 1980-01-01 TRUE RELEASED 2015-05-27
## 10 HanChine… EGAD000100… <NA> 1980-01-01 TRUE RELEASED 2015-05-27
## 11 HanChine… EGAD000100… <NA> 1980-01-01 TRUE RELEASED 2015-05-27
## # … with 18 more variables: title <chr>, description <chr>, technology <chr>,
## # numSamples <int>, datasetTypes <chr>, policyStableId <chr>,
## # availableInBeacon <chr>, accessType <chr>, technology1 <chr>,
## # technology2 <chr>, technology3 <chr>, datasetTypes1 <chr>,
## # datasetTypes2 <chr>, technology4 <chr>, datasetTypes3 <chr>,
## # creationYear <dbl>, releasedYear <dbl>, tidy_center_names <chr>
There is something weird going on with the released and creation years, some of the creation years are way too early, i.e. 1980, but the released date looks like it could be accurate. Also some of the released dates look very wrong, e.g. 2001 when the study was created much later like 2015. So made a function to take the released year if the creation year is obviously wrong, but to take the creation year if the released year is obviously wrong.
fix_year <- function(released_year, creation_year){
if(creation_year < 1990){
return(released_year)
}
}
bound_datasets<- bound_datasets %>%
rowwise() %>%
mutate(fixedYear = case_when(creationYear < 1990 ~ releasedYear,
releasedYear < creationYear ~ creationYear,
TRUE ~ releasedYear))
Datasets over time
bound_datasets %>%
group_by(egaStableId, fixedYear) %>%
tally() %>%
ggplot(aes(fixedYear)) +
geom_bar(fill=biocommons_pal['yellow']) +
theme_bw() +
xlab("Year of submission") +
ylab("Dataset count") +
labs(title="Datasets released per year")
Datasets per year, stacked by contributing institute. Datasets with more than one contributing institute are counted more than once.
bound_datasets %>%
group_by(egaStableId, fixedYear, numSamples, tidy_center_names) %>%
tally() %>%
ggplot(aes(fixedYear, fill=tidy_center_names)) +
geom_bar(width=0.9) +
scale_fill_manual(values = institute_colour_pal, name="Institute") +
scale_x_continuous(breaks = seq(1980,2022, 1)) +
theme_bw() +
xlab("Year of submission") +
ylab("Dataset count") +
labs(title="Datasets released per year") +
theme(legend.position = "bottom") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust=1))
joint_institutes <- bound_datasets %>%
group_by(egaStableId, fixedYear, numSamples) %>%
summarise(joined_names = paste0(unique(tidy_center_names), collapse=";"),
n = n())
## `summarise()` has grouped output by 'egaStableId', 'fixedYear'. You can
## override using the `.groups` argument.
Set joint institutes colour palette
length(levels(as.factor(joint_institutes$joined_names)))
## [1] 21
joint_institute_colour_pal = c("#771155", "#AA4488", "#CC99BB", "#114477", "#4477AA", "#77AADD", "#117777", "#44AAAA", "#77CCCC", "#117744", "#44AA77", "#88CCAA", "#777711", "#AAAA44", "#DDDD77", "#774411", "#AA7744", "#DDAA77", "#771122", "#AA4455", "#DD7788")
names(joint_institute_colour_pal) <- levels(as.factor(joint_institutes$joined_names))
joint_institutes %>%
ggplot(aes(as.factor(fixedYear), fill=joined_names)) +
geom_bar(width=0.9) +
scale_fill_manual(values = joint_institute_colour_pal, name="Institutes", guide=guide_legend(title.position="top")) +
scale_x_discrete() +
theme_bw(base_size = 18) +
xlab("Year") +
ylab("Dataset count") +
# labs(title="Datasets per year") +
theme(legend.position = "bottom") +
theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust=1)) +
labs(title = "Australian datasets in EGA")
ggsave("plots/datasets_per_year.png")
## Saving 15 x 9 in image
bound_datasets %>%
filter(creationYear > 1980) %>%
group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
tally() %>%
ggplot(aes(creationYear, fill=tidy_center_names)) +
geom_bar(width=0.9) +
scale_fill_manual(values = institute_colour_pal, name="Institute") +
scale_x_continuous(breaks = seq(1980,2021, 1)) +
theme_bw() +
xlab("Year of submission") +
ylab("Dataset count") +
labs(title="Datasets submitted per year") +
theme(legend.position = "bottom")
bound_datasets %>%
filter(creationYear > 1980) %>%
left_join(tidy_center_lookup) %>%
group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
tally() %>%
ggplot(aes(creationYear, fill=tidy_center_names)) +
geom_bar(width=0.9) +
scale_fill_manual(values = institute_colour_pal, name="Institute") +
scale_x_continuous(breaks = seq(1980,2021, 1)) +
theme_bw() +
xlab("Year of submission") +
ylab("Dataset count") +
labs(title="Datasets submitted per year") +
theme(legend.position = "bottom")
## Joining, by = c("centerName", "tidy_center_names")
Samples submitted per year
bound_datasets %>%
left_join(tidy_center_lookup) %>%
group_by(creationYear, tidy_center_names) %>%
summarise(summed_by_institute = sum(numSamples)) %>%
ggplot(aes(x=creationYear, y=summed_by_institute, col=tidy_center_names, label=tidy_center_names)) +
# geom_jitter() +
geom_text(check_overlap = TRUE) +
scale_color_manual(values = institute_colour_pal) +
theme_bw() +
theme(legend.position = "none")
## Joining, by = c("centerName", "tidy_center_names")
## `summarise()` has grouped output by 'creationYear'. You can override using the
## `.groups` argument.
bound_datasets %>%
filter(creationYear > 1980) %>%
group_by(creationYear, tidy_center_names) %>%
summarise(summed_by_institute = sum(numSamples)) %>%
ggplot(aes(x=creationYear, y=summed_by_institute, col=tidy_center_names, label=tidy_center_names)) +
# geom_jitter() +
geom_text(check_overlap = TRUE) +
scale_x_continuous(breaks = seq(1980,2021, 1)) +
scale_color_manual(values = institute_colour_pal) +
theme_bw() +
theme(legend.position = "none") +
ylab("Sum of submitted samples per institute") +
xlab("Year of submission")
## `summarise()` has grouped output by 'creationYear'. You can override using the
## `.groups` argument.
bound_datasets %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.character(format(creationTime, "%Y")),
numSamples = as.integer(numSamples)) %>%
group_by(egaStableId, creationYear, creationTime, numSamples, centerName) %>%
tally() %>%
left_join(tidy_center_lookup) %>%
ggplot(aes(x=creationTime, y=numSamples, col=tidy_center_names)) +
geom_point(shape=1, size=4) +
scale_color_manual(values = institute_colour_pal) +
theme_bw() +
theme(legend.position = "bottom") +
labs("Number of samples submitted per dataset over time, coloured by institute") +
ylab("Number of samples per dataset") +
xlab("Year of submission")
## Joining, by = "centerName"
bound_datasets %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.character(format(creationTime, "%Y")),
numSamples = as.integer(numSamples)) %>%
filter(creationYear > 1980) %>%
group_by(egaStableId, creationYear, creationTime, numSamples, centerName) %>%
tally() %>%
left_join(tidy_center_lookup) %>%
ggplot(aes(x=creationTime, y=numSamples, col=tidy_center_names)) +
geom_beeswarm(shape=1, size=4) +
scale_color_manual(values = institute_colour_pal) +
theme_bw() +
theme(legend.position = "bottom") +
labs("Number of samples submitted per dataset over time, coloured by institute") +
ylab("Number of samples per dataset") +
xlab("Year of submission")
## Joining, by = "centerName"
## Warning in f(...): The default behavior of beeswarm has changed in version
## 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis.
## Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis
## choice.
trying out log scale but don’t think it is very good visually.
bound_datasets %>%
mutate(creationTime = as.Date(creationTime),
creationYear = as.character(format(creationTime, "%Y")),
numSamples = as.integer(numSamples)) %>%
group_by(egaStableId, creationYear, numSamples, centerName) %>%
tally() %>%
left_join(tidy_center_lookup) %>%
ggplot(aes(x=creationYear, y=numSamples, col=tidy_center_names)) +
geom_jitter(shape=1, size=3) +
scale_color_manual(values = institute_colour_pal) +
scale_y_continuous(trans = log10_trans()) +
theme(legend.position = "bottom") +
theme_bw()
## Joining, by = "centerName"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 3 rows containing missing values (geom_point).
bound_datasets %>%
ungroup() |>
select(egaStableId, starts_with("technology"), starts_with("dataset")) %>%
pivot_longer(!egaStableId,
names_to = "data_category",
values_to = "type") %>%
filter(!is.na(type)) %>%
group_by(egaStableId, data_category, type) %>%
tally() %>%
mutate(data_category = ifelse(str_detect(data_category, "technology"), "technology", "datasetType")) |>
ggplot(aes(x=type)) +
facet_wrap(~data_category, scales="free", ncol = 1) +
geom_bar() +
coord_flip()
bound_datasets %>%
group_by(datasetTypes1, datasetTypes2, datasetTypes3) %>%
tally()
## # A tibble: 7 × 4
## # Groups: datasetTypes1, datasetTypes2 [6]
## datasetTypes1 datasetTypes2 datasetTypes3 n
## <chr> <chr> <chr> <int>
## 1 Exome sequencing Amplicon sequencing <NA> 4
## 2 Genotyping by array Transcriptome profiling by high-t… <NA> 1
## 3 Whole genome sequencing Amplicon sequencing <NA> 1
## 4 Whole genome sequencing Exome sequencing <NA> 9
## 5 Whole genome sequencing Transcriptome profiling by high-t… Genomic vari… 1
## 6 Whole genome sequencing Transcriptome profiling by high-t… <NA> 7
## 7 <NA> <NA> <NA> 174
files_per_dataset <- function(dataset_accession){
api_request <- paste0("https://ega-archive.org/metadata/v2/files?queryBy=dataset&queryId=", dataset_accession, "&limit=0")
response <- GET(api_request)
if(status_code(response) == 200){
this_content <- content(response)
num_files <- this_content[["response"]][["numTotalResults"]]
sizes <- lapply(this_content[["response"]][["result"]], function(x) x[["fileSize"]])
sizes_length <- length(sizes)
summed_size <- do.call(sum, sizes)
file_formats <- paste0(unique(lapply(this_content[["response"]][["result"]], function(x) x[["fileFormat"]])), collapse=";")
return(tibble(dataset_accession, num_files, sizes_length, summed_size, file_formats))
} else {
return(NA)
}
}
dataset_list <- unique(bound_datasets$egaStableId)
# file_size_list <- lapply(dataset_list, files_per_dataset)
# bound_files_dataset <- bind_rows(file_size_list)
# files_and_datasets <- bound_files_dataset %>%
# left_join(bound_datasets, by=c("dataset_accession" = "egaStableId"))
#
# saveRDS(file_size_list, "outputs/file_size_list.rds")
# saveRDS(files_and_datasets, "outputs/files_joined_datasets.rds")
files_and_datasets <- readRDS("outputs/files_joined_datasets.rds")
Total data stored in EGA by aussie researchers
per_dataset_file_summary <- files_and_datasets %>%
group_by(dataset_accession, title, num_files, summed_size) %>%
tally()
per_dataset_file_summary
## # A tibble: 85 × 5
## # Groups: dataset_accession, title, num_files [85]
## dataset_accession title num_files summed_size n
## <chr> <chr> <int> <dbl> <int>
## 1 EGAD00001000032 "Hepatitis C IL28B pooled rese… 0 0 1
## 2 EGAD00001000845 "Matched tumour/normal .bam fi… 44 7.38e12 3
## 3 EGAD00001000967 "Cancer-Normal chronic myeloid… 8 1.87e10 1
## 4 EGAD00001001343 "WGS on patients 1-4, study of… 354 1.94e12 1
## 5 EGAD00001001344 "WGS on patients 5-7, study of… 140 6.27e11 1
## 6 EGAD00001001345 "RNA-seq on patients 1-4, stud… 168 2.46e11 1
## 7 EGAD00001001660 "Genomic analysis of HPV-posit… 24 4.20e11 4
## 8 EGAD00001001661 "Exome data for an Australian … 1 8.44e 8 1
## 9 EGAD00001002001 "Australian genomes" 166 1.08e13 1
## 10 EGAD00001003119 "TP53 in ovarian cancer panel … 76 2.28e 8 1
## # … with 75 more rows
saveRDS(per_dataset_file_summary, "outputs/per_dataset_file_summary.rds")
Total data stored in EGA by aussie researchers:
14673 files
325.4687165 TB of data
Summary of dataset sizes in TB:
round(summary(per_dataset_file_summary$summed_size)/1e+12, 2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.08 3.83 1.11 88.93
per_dataset_file_summary %>%
ggplot(aes(summed_size)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Datasets with a size greater than 15TB
per_dataset_file_summary %>%
filter(summed_size > 15*1e+12) %>%
arrange(-summed_size)
## # A tibble: 5 × 5
## # Groups: dataset_accession, title, num_files [5]
## dataset_accession title num_files summed_size n
## <chr> <chr> <int> <dbl> <int>
## 1 EGAD00001004940 MGRB dataset 2571 8.89e13 1
## 2 EGAD00001006902 WGS fastq for EGAS00001004572 1962 8.26e13 1
## 3 EGAD00001005095 MGRB dataset. Samples that were… 1441 4.65e13 1
## 4 EGAD00001004494 Familial Breast cancer dataset 156 1.80e13 6
## 5 EGAD00001005287 Deep multi-region WGS of lung c… 68 1.63e13 1
There aren’t too many datasets that are greater than 15TB, but some institutes have a single submission account and may be submitting on behalf of multiple labs.
files_and_datasets %>%
group_by(dataset_accession, summed_size, centerName, num_files, creationTime) %>%
filter(creationYear > 1980) %>%
tally() %>%
mutate(file_size_gb = summed_size/1e+9) %>%
ggplot(aes(x=creationTime, y=file_size_gb)) +
geom_point() +
geom_abline(intercept = 10000, slope=0, color="red") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
# scale_y_continuous(trans = log10_trans()) +
theme_bw() +
ylab("Total file size per dataset (GB)") +
xlab("Year of submission") +
labs(title="Total file size per dataset over time")
How much data submitted in the past 4 years?
files_and_datasets %>%
group_by(dataset_accession, summed_size, centerName, num_files, creationTime) %>%
filter(creationYear >= 2018) %>%
tally() %>%
pull(summed_size) %>%
sum()/1e+12
## [1] 301.6727
sum_size_plot <- files_and_datasets %>%
group_by(dataset_accession, summed_size, centerName, num_files, creationYear, datasetTypes, title) %>%
filter(creationYear > 1980) %>%
tally() %>%
mutate(file_size_gb = summed_size/1e+9) %>%
ggplot(aes(x=creationYear, y=file_size_gb, col=centerName, label=title, fill=datasetTypes)) +
geom_beeswarm() +
geom_abline(intercept = 10000, slope=0, color="red") +
# scale_y_continuous(trans = log10_trans()) +
scale_x_continuous(breaks=seq(1980,2022,1)) +
theme_bw() +
ylab("Total file size per dataset (GB)") +
xlab("Year of submission") +
labs(title="Total file size per data set over time (info on hover)") +
theme(legend.position = "none")
ggplotly(sum_size_plot)
files_and_datasets %>%
filter(creationYear > 1980) %>%
group_by(dataset_accession, summed_size, centerName, num_files, creationTime, title) %>%
tally() %>%
ggplot(aes(x=creationTime, y=num_files)) +
geom_point() +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme_bw() +
ylab("Total files per dataset") +
xlab("Year of submission") +
labs(title="Total number of files per data set over time")
# num_files_plot <- files_and_datasets %>%
# filter(creationYear > 1980) %>%
# group_by(dataset_accession, summed_size, centerName, num_files, creationTime, creationYear, title, datasetTypes) %>%
# tally() %>%
# ggplot(aes(x=creationYear, y=num_files, label=title, col=centerName, fill=datasetTypes)) +
# geom_beeswarm() +
# # scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
# theme_bw() +
# ylab("Total files per dataset") +
# xlab("Year of submission") +
# labs(title="Total files per data set over time (info on hover)") +
# theme(legend.position = "none")
# ggplotly(num_files_plot)
What sample metadata is submitted alongside datasets? What phenotypes are covered? Are minimal requirements met?
Minimal requirements are gender, subject_id and phenotype (preferably ontologised, recommends EFO)
samples_per_dataset <- function(dataset_accession){
api_request <- paste0("https://ega-archive.org/metadata/v2/samples?queryBy=dataset&queryId=", dataset_accession, "&limit=0")
response <- GET(api_request)
if(status_code(response) == 200){
content <- content(response)
num_samples <- content[["response"]][["numTotalResults"]]
genders <- unlist(lapply(content[["response"]][["result"]], function(x) x[["gender"]]))
if (is.null(genders)){
genders <- as.character(rep(NA, num_samples))
}
phenotypes <- unlist(lapply(content[["response"]][["result"]], function(x)x[["phenotype"]]))
if (is.null(phenotypes)){
phenotypes <- as.character(rep(NA, num_samples))
}
attributes <- unlist(lapply(content[["response"]][["result"]], function(x) x[["attributes"]]))
if (is.null(attributes)){
attributes <- as.character(rep(NA, num_samples))
}
subjectIds <- unlist(lapply(content[["response"]][["result"]], function(x) x[["subjectId"]]))
if (is.null(attributes)){
subjectIds <- as.character(rep(NA, num_samples))
}
return(tibble(dataset_accession, num_samples, genders, phenotypes, attributes, subjectIds))
} else {
return(NA)
}
}
sample_md_list <- lapply(per_dataset_file_summary$dataset_accession, samples_per_dataset)
sample_md_tbl <- bind_rows(sample_md_list)
total_samples <- sample_md_tbl %>%
group_by(dataset_accession, num_samples) %>%
tally() %>%
pull(num_samples) %>%
sum()
Total of 15,945 in the EGA from Aussie researchers
Breakdown of genders
sample_md_tbl %>%
ggplot(aes(genders, fill=genders, label = genders)) +
geom_bar() +
scale_fill_pretty_d("Bold", na.value="grey") +
coord_flip() +
theme_bw()
sample_md_tbl %>%
group_by(genders) %>%
tally()
## # A tibble: 4 × 2
## genders n
## <chr> <int>
## 1 female 5868
## 2 male 5991
## 3 unknown 705
## 4 <NA> 3381
About 25% samples do not have a specified gender.
sample_md_tbl %>%
ggplot(aes(phenotypes)) +
geom_bar() +
coord_flip() +
theme_bw()
sample_md_tbl %>%
rowwise() %>%
mutate(ontologised = ifelse(str_detect(phenotypes, "EFO"), TRUE, FALSE)) %>%
filter(ontologised) %>%
group_by(dataset_accession) %>%
summarise(count = n(), phenotypes = paste0(unique(phenotypes), collapse=";"))
## # A tibble: 13 × 3
## dataset_accession count phenotypes
## <chr> <int> <chr>
## 1 EGAD00001001343 15 EFO:0001663
## 2 EGAD00001001344 6 EFO:0001663
## 3 EGAD00001001345 12 EFO:0001663
## 4 EGAD00001003203 37 EFO:0000096
## 5 EGAD00001003247 37 EFO:0000096
## 6 EGAD00001003282 37 EFO:0000096
## 7 EGAD00001004179 183 CML, EFO_0000339
## 8 EGAD00001004190 239 EFO:0000203;EFO:0001378;EFO:0003073
## 9 EGAD00001004833 3 EFO:1000796
## 10 EGAD00001004861 396 PMDS, EFO_0000198;TMN, EFO_1000575
## 11 EGAD00001005287 68 EFO:0000571;EFO:0000708;EFO:0000702
## 12 EGAD00001005807 48 EFO:0010105
## 13 EGAD00001005949 39 EFO_0000222
13 studies our of 85, 15% use the recommended EFO ontology in some form
sample_md_tbl %>%
ggplot(aes(attributes)) +
geom_bar() +
coord_flip() +
theme_bw()
How many NA subject IDs?
sum(is.na(sample_md_tbl$subjectIds))
## [1] 3337
3337 samples have no subjectId specified
No samples have any extra attribute metadata
Do projects have phenotype files attached that could contain more information?
did a search of the dbGaP for keyword ‘Australia’
dbgap_aus <- read_csv("data/dbgap_australia.csv")
## Rows: 48 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): accession, name, description, Parent study, Study Disease/Focus, ...
## date (2): Release Date, Embargo Release Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
length(dbgap_aus$accession)
## [1] 48
dbgap_aus %>%
mutate(release_year = as.numeric(format(`Release Date`, "%Y"))) %>%
ggplot(aes(release_year, fill=`NIH Institute`)) +
geom_bar() +
scale_x_continuous(breaks = seq(2009, 2022, 1)) +
scale_y_continuous(breaks = seq(0, 9, 1)) +
scale_fill_tableau(palette="Tableau 20", guide=guide_legend(title.position="top")) +
theme_bw(base_size = 18) +
theme(panel.grid.minor = element_blank()) +
ylab("Count of datasets") +
xlab("Release year") +
theme(legend.position = "bottom") +
labs(title="Australian datasets in dbGaP")
ggsave("plots/dgbap_datasets_per_year.png")
## Saving 9 x 5 in image
How many from NCI and NINDS
dbgap_aus %>%
pull(`NIH Institute`) %>%
table()
## .
## NCATS NCI NEI NHGRI NHLBI NIAAA NIAID NIAMS NICHD NIDA NIDCR NIDDK NIMH
## 2 18 1 3 1 1 3 1 1 2 1 1 5
## NINDS
## 8