Some scripts to harvest data from the EGA API and find out what state it is in.
Retrieve all DACs housed at EGA, there are currently around 1500 (as at 2021-10-22) so set limit to 2000.
# dacs<- GET("https://ega-archive.org/metadata/v2/dacs?limit=2000")
#
# dacs
# Flatten one DAC record from the EGA API response into a tibble.
#
# Args:
#   list: a single parsed DAC record (a named list). The record is expected to
#     carry the fields named in the `unnest()` call below.
#
# Returns:
#   A tibble in which the `contacts` list has been unnested, with two extra
#   character columns, `emails` and `organisation`, extracted from each
#   contact. Logical columns (e.g. all-NA placeholders) are converted to
#   character so that tibbles from different records can be row-bound.
tibble_response <- function(list) {
  # Extract one field from a contact record; NA when the contact is not a
  # list (e.g. an NA placeholder) or the field is absent. The original
  # `unlist(lapply(...))` dropped NULL emails, which shortened the vector and
  # made `mutate()` fail.
  contact_field <- function(contact, field) {
    value <- if (is.list(contact)) contact[[field]] else NULL
    if (is.null(value)) NA_character_ else as.character(value)
  }

  list %>%
    enframe() %>%
    pivot_wider() %>%
    # Replace NULL fields with NA so that unnest() does not drop the row
    mutate(across(everything(), ~ ifelse(is.null(.x[[1]]), list(NA), .x))) %>%
    unnest(cols = c(
      alias, egaStableId, centerName, creationTime, title, url, released,
      published, contacts
    )) %>%
    mutate(
      emails = vapply(contacts, contact_field, character(1), field = "email"),
      organisation = vapply(
        contacts, contact_field, character(1),
        field = "organisation"
      )
    ) %>%
    mutate(across(where(is.logical), as.character))
}
# The harvesting steps are kept commented out; the table they produce is read
# back from the RDS file written by the saveRDS() call below.
# dacs_content <- content(dacs)[["response"]][["result"]]
# df_list <- lapply(dacs_content, tibble_response)
# df_bound <- bind_rows(df_list)
# saveRDS(df_bound, "outputs/dacs_bound_table.rds")
df_bound <- readRDS("outputs/dacs_bound_table.rds")
Now I have harvested all the DACs in EGA, filter to Australian research institutes by using the email (hopefully)
# Keep only contacts whose email address ends in ".au", then split each
# address at "@" into the local part (`person`) and the domain (`institute`).
au_researchers <- df_bound %>%
  # The dot is escaped so that only a genuine ".au" suffix matches; the
  # previous pattern "au$" accepted any address that merely ended in "au".
  filter(str_detect(emails, "\\.au$")) %>%
  separate(emails, sep = "@", remove = FALSE, into = c("person", "institute"))
glimpse(au_researchers)
## Rows: 116
## Columns: 13
## $ alias <chr> "ARC Linkage Project Grant LPO990067", "ena-DAC-QCMG-14-…
## $ egaStableId <chr> "EGAC00001000022", "EGAC00001000142", "EGAC00001000142",…
## $ centerName <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RESEARCH…
## $ creationTime <chr> "2011-08-05T10:45.000Z", "2013-11-14T01:31.000Z", "2013-…
## $ title <chr> "ARC Linkage Project Grant LPO990067", "DAC overseeing s…
## $ url <chr> "not provided", "not provided", "not provided", "not pro…
## $ released <chr> "RELEASED", "NOT_RELEASED", "NOT_RELEASED", "NOT_RELEASE…
## $ published <chr> "TRUE", "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRU…
## $ contacts <list> [["Melanie Bahlo", "bahlo@wehi.edu.au", "The Walter and…
## $ emails <chr> "bahlo@wehi.edu.au", "n.waddell@imb.uq.edu.au", "m.quinn…
## $ person <chr> "bahlo", "n.waddell", "m.quinn", "j.saunus", "s.lakhani"…
## $ institute <chr> "wehi.edu.au", "imb.uq.edu.au", "uq.edu.au", "uq.edu.au"…
## $ organisation <chr> "The Walter and Eliza Hall Institute of Medical Research…
Count of DACs per centerName submitted. It is clear there are a few issues with normalization of this field.
# Bar chart: number of distinct DACs recorded against each centerName.
# count() collapses the per-contact rows to one row per DAC/centerName pair.
au_researchers %>%
  count(egaStableId, centerName) %>%
  ggplot(aes(x = centerName)) +
  geom_bar() +
  theme_bw() +
  coord_flip()
SACGF - South Australia centre for cancer biology - ACRF Cancer Genomics Facility
Count of DACs per organisation, again this field doesn’t seem to be used consistently.
# Bar chart: number of distinct DACs per contact organisation.
# count() collapses the per-contact rows to one row per DAC/organisation pair.
au_researchers %>%
  count(egaStableId, organisation) %>%
  ggplot(aes(x = organisation)) +
  geom_bar() +
  theme_bw() +
  coord_flip()
Count of DACs by the institutional part of the email address of the contacts for each DAC, gives a bit of a clearer picture.
# Bar chart: number of distinct DACs per email domain of their contacts.
# count() collapses the per-contact rows to one row per DAC/domain pair.
au_researchers %>%
  count(egaStableId, institute) %>%
  ggplot(aes(x = institute)) +
  geom_bar() +
  theme_bw() +
  coord_flip()
Manually curate each institute to get nicer and more understandable labels. Here I may lose some detail, that is, groups that are housed within institutes but should give a clearer and simpler picture.
# Map each email domain to a short, curated institute label. case_when() uses
# the first matching rule, so the order of the rules matters; domains that
# match no rule keep their original value.
# NOTE(review): short patterns such as "uq", "syd" and "lh" match anywhere in
# the domain and could capture unrelated domains - confirm against the data.
au_researchers <- au_researchers %>%
  mutate(tidy_center_names = case_when(
    str_detect(institute, "unimelb") ~ "UMELB", # University of Melbourne
    str_detect(institute, "uq") ~ "UQ", # University of Queensland
    str_detect(institute, "syd") ~ "USYD", # University of Sydney
    str_detect(institute, "anu") ~ "ANU", # Australian National University
    str_detect(institute, "wehi") ~ "WEHI", # Walter and Eliza Hall Institute
    str_detect(institute, "qut") ~ "QUT", # Queensland University of Technology
    str_detect(institute, "garvan") ~ "GARVAN", # Garvan Institute of Medical Research
    str_detect(institute, "mcri") ~ "MCRI", # Murdoch Children's Research Institute
    str_detect(institute, "unsw") ~ "UNSW", # University of New South Wales
    str_detect(institute, "uow") ~ "UOW", # University of Wollongong
    str_detect(institute, "griffith") ~ "GRIFFITH", # Griffith University
    str_detect(institute, "ccia") ~ "CCIA", # Children's Cancer Institute
    # fixed() makes the dots literal instead of regex wildcards
    str_detect(institute, fixed("mh.org.au")) ~ "ROYALMELB", # Royal Melbourne Hospital
    str_detect(institute, "centenary") ~ "CENTENARY", # Centenary Institute
    str_detect(institute, "adelaide") ~ "UADEL", # University of Adelaide/SA Cancer Genomics Facility
    str_detect(institute, "qimr") ~ "QIMRB", # QIMR Berghofer Medical Research Institute
    str_detect(institute, "lh") ~ "LIFEHOUSE", # Chris O'Brien Lifehouse - not-for-profit cancer treatment centre
    str_detect(institute, fixed("health.nsw")) ~ "NSWHEALTH", # NSW health department
    str_detect(institute, fixed("sswahs.nsw")) ~ "NSWHEALTH", # Sydney South-West Health Department
    str_detect(institute, "telethon") ~ "TELETHON", # Telethon Kids Institute
    TRUE ~ institute
  ))
Get centerName lookup table for nice names
# Lookup table with one row per observed (centerName, curated name) pair.
# A centerName may map to more than one curated name.
tidy_center_lookup <- au_researchers %>%
  count(centerName, tidy_center_names) %>%
  select(-n)
Set institutes colour palette
# Number of distinct curated institute names (decides the palette size).
nlevels(as.factor(tidy_center_lookup$tidy_center_names))
## [1] 19
# Named colour vector: one colour per curated institute name. The palette size
# is derived from the data rather than hard-coded, so it stays in step when
# the set of institutes changes.
# NOTE(review): presumably "category20" provides at most 20 colours - confirm
# before the number of institutes grows beyond that.
institute_levels <- levels(as.factor(tidy_center_lookup$tidy_center_names))
institute_colour_pal <- pal_d3(palette = "category20")(length(institute_levels))
names(institute_colour_pal) <- institute_levels
Count of DACs per institute using curated names
# Bar chart: number of distinct DACs per curated institute name, coloured
# with the shared institute palette.
au_researchers %>%
  count(egaStableId, tidy_center_names) %>%
  ggplot(aes(x = tidy_center_names, fill = tidy_center_names)) +
  geom_bar() +
  coord_flip() +
  scale_fill_manual(values = institute_colour_pal) +
  labs(
    title = "Count of DACs per institute",
    x = "Institute",
    y = "DAC count"
  ) +
  theme_bw() +
  theme(legend.position = "none")
Contacts per DAC
# Histogram of the number of contact rows recorded against each DAC.
au_researchers %>%
  count(egaStableId) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = n)) +
  geom_histogram(bins = 7, fill = biocommons_pal["purple"]) +
  scale_x_continuous(breaks = 0:7) +
  labs(
    title = "Frequency of DACs with certain number of contacts specified on the DAC",
    x = "Number of contacts per DAC",
    y = "Frequency"
  ) +
  theme_bw()
The majority of DACs (49) only have a single point of contact.
Institutes per DAC
# Histogram of the number of distinct curated institutes among each DAC's
# contacts: reduce to unique DAC/institute pairs, then count pairs per DAC.
au_researchers %>%
  distinct(egaStableId, tidy_center_names) %>%
  count(egaStableId) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = n)) +
  geom_histogram(bins = 5, fill = biocommons_pal["teal"]) +
  labs(
    title = "Frequency of number of institutes specified per DAC",
    x = "Count of institutes",
    y = "Frequency of DACs"
  ) +
  theme_bw()
Most DACs have contacts from a single institute; 9 DACs have contacts across multiple institutes, with one DAC having contacts from 5 separate institutes. This is a consortium called the Sydney Head and Neck Cancer Institute (SHNCI).
QIMRB has highest number of DACs. Contact emails tend to be individual researchers. This has the potential to cause issues if those individual researchers change institutes.
# Contact rows per person (email local part) at QIMRB, most frequent first.
au_researchers %>%
  filter(tidy_center_names == "QIMRB") %>%
  count(person) %>%
  arrange(desc(n))
## # A tibble: 19 x 2
## person n
## <chr> <int>
## 1 nic.waddell 8
## 2 john.pearson 3
## 3 ann-marie.patch 1
## 4 bryan.day 1
## 5 christian.engwerda 1
## 6 felicity.newell 1
## 7 georgia.trench 1
## 8 gregory.quaife-ryan 1
## 9 hong.you 1
## 10 james.hudson 1
## 11 juliet.french 1
## 12 katia.nones 1
## 13 mark.smyth 1
## 14 michael.quinn 1
## 15 olga.kondrashova 1
## 16 peter.johansson 1
## 17 ross.koufariotis 1
## 18 susanna.ng 1
## 19 tobias.bald 1
# Contact rows per person and centerName for UMELB, most frequent first.
# "drop_last" leaves the result grouped by person, as tally() would.
au_researchers %>%
  filter(tidy_center_names == "UMELB") %>%
  group_by(person, centerName) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  arrange(desc(n))
## # A tibble: 15 x 3
## # Groups: person [10]
## person centerName n
## <chr> <chr> <int>
## 1 jungch PP_AUS 2
## 2 bjpope UM 1
## 3 chovens PP_GER 1
## 4 chovens UM 1
## 5 chovens Unimelb 1
## 6 chovens UOM-CAP 1
## 7 chovens UOMCAP 1
## 8 cmerom UOMCAP 1
## 9 con UM 1
## 10 con Unimelb 1
## 11 epilepsy-austin WEHI 1
## 12 lachlan.coin UMel 1
## 13 s.berkovic WEHI 1
## 14 sarah.dunstan NUS 1
## 15 scheffer WEHI 1
# Contact rows per person and centerName for GARVAN, most frequent first.
# "drop_last" leaves the result grouped by person, as tally() would.
au_researchers %>%
  filter(tidy_center_names == "GARVAN") %>%
  group_by(person, centerName) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  arrange(desc(n))
## # A tibble: 8 x 3
## # Groups: person [8]
## person centerName n
## <chr> <chr> <int>
## 1 e.benn GIMR 2
## 2 grants KCCG 2
## 3 e.ali KCCG 1
## 4 m.cowley KCCG 1
## 5 m.mccabe KCCG 1
## 6 mgrb GIMR 1
## 7 n.watkins KCCG 1
## 8 s.mueller SHNCI 1
# Contact rows per person and centerName for CCIA, most frequent first.
# "drop_last" leaves the result grouped by person, as tally() would.
au_researchers %>%
  filter(tidy_center_names == "CCIA") %>%
  group_by(person, centerName) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  arrange(desc(n))
## # A tibble: 4 x 3
## # Groups: person [4]
## person centerName n
## <chr> <chr> <int>
## 1 zero ZERO 3
## 2 mcowley SHNCI 1
## 3 mgauthier SHNCI 1
## 4 pekert MCRI 1
Increasing number of DACs over time, until 2021, perhaps covid related.
# Bar chart of DACs per creation year. Rows are first reduced to one per
# DAC/institute combination, so a DAC with contacts at several institutes is
# counted once per institute.
au_researchers %>%
  count(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  mutate(
    creationTime = as.Date(creationTime),
    creationYear = as.numeric(format(creationTime, "%Y"))
  ) %>%
  filter(creationYear > 1982) %>% # drop the 1980 DAC, an obvious error
  ggplot(aes(x = creationYear)) +
  geom_bar(fill = biocommons_pal["pink"]) +
  scale_x_continuous(breaks = 1980:2021) +
  labs(x = "Year of submission", y = "DAC count") +
  theme_bw()
dacs per institute over time dodged, potentially a bit ugly but shows more institutes being involved in later years - 2018-2020
# Dodged bar chart of DACs per creation year, one bar per curated institute.
# Rows are first reduced to one per DAC/institute combination.
au_researchers %>%
  group_by(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  tally() %>%
  mutate(
    creationTime = as.Date(creationTime),
    creationYear = as.numeric(format(creationTime, "%Y"))
  ) %>%
  filter(creationYear > 1982) %>% # remove 1980 DAC as obvious error
  ggplot(aes(x = creationYear, fill = tidy_center_names)) +
  geom_bar(position = "dodge", width = 0.9) +
  scale_x_continuous(breaks = seq(1980, 2021, 1)) +
  # "Institute" is the legend title; it was previously attached to the x
  # scale, which mislabelled the year axis.
  scale_fill_manual(values = institute_colour_pal, name = "Institute") +
  theme_bw() +
  theme(legend.position = "bottom") +
  xlab("Year of submission") +
  ylab("DAC count")
DACs per institute over time. If a DAC has contacts from multiple institutes, it will be counted more than once.
# Stacked bar chart of DACs per creation year, filled by curated institute.
# Rows are first reduced to one per DAC/institute combination.
au_researchers %>%
  count(alias, egaStableId, tidy_center_names, title, creationTime) %>%
  mutate(
    creationTime = as.Date(creationTime),
    creationYear = as.numeric(format(creationTime, "%Y"))
  ) %>%
  filter(creationYear > 1982) %>% # drop the 1980 DAC, an obvious error
  ggplot(aes(x = creationYear, fill = tidy_center_names)) +
  geom_bar(width = 0.9) +
  scale_x_continuous(breaks = 1980:2021) +
  scale_fill_manual(name = "Institute", values = institute_colour_pal) +
  labs(
    title = "DACs submitted to EGA each year 2011-2021 coloured by institute of contacts",
    x = "Year of submission",
    y = "DAC count"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
Large increase in DACs in 2020 and an increase in the number of institutes submitting DACs to EGA.
Do cross reference from the listed DAC IDs to what datasets are present
For a given DAC, how many datasets does it manage
# Fetch the datasets that the EGA metadata API lists for one DAC.
#
# Args:
#   ega_stable_id: DAC accession, e.g. "EGAC00001000951".
#
# Returns:
#   The `result` list from the API response, or `list(NA)` when the request
#   fails or the DAC has no listed datasets.
datasets_per_dac <- function(ega_stable_id) {
  api_request <- paste0(
    "https://ega-archive.org/metadata/v2/datasets?queryBy=dac&queryId=",
    ega_stable_id, "&limit=0"
  )
  response <- GET(api_request)
  # Same status check as files_per_dataset(): do not parse error bodies
  if (status_code(response) != 200) {
    warning(
      "EGA request for ", ega_stable_id, " returned HTTP status ",
      status_code(response),
      call. = FALSE
    )
    return(list(NA))
  }
  # Parse the body once instead of once for the test and once for the return
  result <- content(response)[["response"]][["result"]]
  if (length(result) > 0) {
    result
  } else {
    list(NA)
  }
}
# Example call for a single DAC accession (performs a live API request).
datasets_per_dac("EGAC00001000951")
## [[1]]
## [1] NA
get datasets for all dacs
# The per-DAC API harvest is kept commented out; its result is read back from
# the RDS file written by the saveRDS() call below.
# dac_datasets <- au_researchers %>%
# group_by(egaStableId, centerName, tidy_center_names) %>%
# tally() %>%
# rowwise() %>%
# mutate(datasets = list(datasets_per_dac(egaStableId))) %>%
# ungroup()
# saveRDS(dac_datasets, "outputs/dac_datasets.rds")
dac_datasets <- readRDS("outputs/dac_datasets.rds")
Some DACs do not have any associated datasets, I think this is probably due to any datasets for those DACs not yet being released, or some may have been submitted for testing.
# Rows for DACs that have no datasets: after unnesting, these carry the NA
# placeholder that datasets_per_dac() returns for an empty result.
# `cols` is given explicitly (unnest() warns without it) and the redundant
# rowwise() has been dropped.
dac_datasets %>%
  unnest(cols = c(datasets)) %>%
  filter(is.na(datasets))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
## # A tibble: 24 x 5
## egaStableId centerName tidy_center_names n datasets
## <chr> <chr> <chr> <int> <list>
## 1 EGAC00001000142 QCMG UQ 4 <lgl [1]>
## 2 EGAC00001000229 QLDI UQ 1 <lgl [1]>
## 3 EGAC00001000368 CENT CENTENARY 1 <lgl [1]>
## 4 EGAC00001000497 RPAH USYD 2 <lgl [1]>
## 5 EGAC00001000498 RPAH USYD 2 <lgl [1]>
## 6 EGAC00001000700 QCMG QIMRB 1 <lgl [1]>
## 7 EGAC00001000773 PP_GER UMELB 1 <lgl [1]>
## 8 EGAC00001000951 PP_AUS UMELB 1 <lgl [1]>
## 9 EGAC00001000952 PP_AUS UMELB 1 <lgl [1]>
## 10 EGAC00001001090 QIMR QIMRB 1 <lgl [1]>
## # … with 14 more rows
Get information about the ghost DACs (DACs with no associated datasets)
# Accessions of the DACs whose datasets entry is the NA placeholder.
# `cols` is given explicitly (unnest() warns without it) and the redundant
# rowwise() has been dropped.
ghost_dacs <- dac_datasets %>%
  unnest(cols = c(datasets)) %>%
  filter(is.na(datasets)) %>%
  pull(egaStableId)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
# Contact rows belonging to the DACs listed in ghost_dacs.
filter(au_researchers, egaStableId %in% ghost_dacs)
## # A tibble: 33 x 14
## alias egaStableId centerName creationTime title url released published
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ena-… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 2 ena-… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 3 ena-… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 4 ena-… EGAC000010… QCMG 2013-11-14T… DAC … not … NOT_REL… FALSE
## 5 ena-… EGAC000010… QLDI 2014-08-12T… UQ D… <NA> NOT_REL… FALSE
## 6 ena-… EGAC000010… CENT 2015-07-29T… CI L… <NA> NOT_REL… FALSE
## 7 ena-… EGAC000010… RPAH 2016-06-04T… Depa… NA NOT_REL… FALSE
## 8 ena-… EGAC000010… RPAH 2016-06-04T… Depa… NA NOT_REL… FALSE
## 9 ena-… EGAC000010… RPAH 2016-06-06T… Depa… NA NOT_REL… FALSE
## 10 ena-… EGAC000010… RPAH 2016-06-06T… Depa… NA NOT_REL… FALSE
## # … with 23 more rows, and 6 more variables: contacts <list>, emails <chr>,
## # person <chr>, institute <chr>, organisation <chr>, tidy_center_names <chr>
Weird that there are DACs that are not released even though they were submitted many years ago, e.g. 2013-2016. Some are fairly new, e.g. earlier this year, so perhaps will be released at a later date.
show counts of datasets per dac
# Bar per DAC showing the number of unnested dataset rows, filled by curated
# institute. `cols` is given explicitly (unnest() warns without it) and the
# redundant rowwise() has been dropped.
# NOTE(review): DACs without datasets still contribute one NA placeholder row
# here, so they appear with a bar of height 1 - confirm this is intended.
dac_datasets %>%
  unnest(cols = c(datasets)) %>%
  ggplot(aes(egaStableId, fill = tidy_center_names)) +
  geom_bar() +
  scale_fill_manual(values = institute_colour_pal, name = "Institute") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "bottom")
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
# Histogram of the number of datasets attached to each DAC.
# dac_datasets may hold several rows per DAC, each with the same datasets
# list, so the lists are de-duplicated per DAC before counting.
dac_datasets %>%
  group_by(egaStableId) %>%
  summarise(datasets_obj = unique(datasets), .groups = "drop") %>%
  mutate(
    num_datasets = vapply(
      datasets_obj,
      # list(NA) is the "no datasets" placeholder from datasets_per_dac();
      # it has length 1 and was previously counted as one dataset.
      function(x) if (identical(x, list(NA))) 0L else length(x),
      integer(1)
    )
  ) %>%
  ggplot(aes(num_datasets)) +
  geom_histogram(bins = 9, fill = biocommons_pal["blue"]) +
  scale_x_continuous(breaks = seq(0, 8, 1)) +
  theme_bw() +
  xlab("Count of datasets per DAC") +
  ylab("Frequency")
## `summarise()` ungrouping output (override with `.groups` argument)
Of the total of 71 individual DACs with Australian researchers as contacts, the vast majority (59) have a single dataset attached to them.
The South Australian Cancer Genomics Facility has the highest number of datasets attached to a single DAC (8).
There are a handful of other examples that have more than one dataset governed by a single DAC.
Tease out the information in the datasets table
# One row per dataset record, with the NA placeholders for DACs without
# datasets removed. `cols` is given explicitly (unnest() warns without it).
datasets <- dac_datasets %>%
  select(datasets) %>%
  unnest(cols = c(datasets)) %>%
  filter(!is.na(datasets))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(datasets)`
# Flatten every dataset record into a one-row tibble, bind them together and
# attach the curated institute name(s) for each centerName.
dataset_list <- datasets$datasets
# unlist() flattens nested fields; repeated field names are given numeric
# suffixes, which become separate columns (e.g. technology1, technology2).
datasets_df_list <- lapply(
  dataset_list,
  function(x) pivot_wider(enframe(unlist(x)))
)
bound_datasets <- bind_rows(datasets_df_list) %>%
  mutate(
    creationTime = as.Date(creationTime),
    creationYear = as.numeric(format(creationTime, "%Y")),
    numSamples = as.integer(numSamples)
  ) %>%
  # The key is explicit so the join cannot silently pick up other shared
  # columns. A centerName with several curated names yields one row per name.
  left_join(tidy_center_lookup, by = "centerName")
## Joining, by = "centerName"
# Print the columns and leading values of the flattened dataset table.
glimpse(bound_datasets)
## Rows: 168
## Columns: 23
## $ alias <chr> "Hepatitis C IL28B resequencing study", "20210729_E…
## $ egaStableId <chr> "EGAD00001000032", "EGAD00001007973", "EGAD00001007…
## $ centerName <chr> "THE WALTER AND ELIZA HALL INSTITUTE OF MEDICAL RES…
## $ creationTime <date> 2011-08-22, 2021-08-05, 2021-08-05, 2021-08-05, 20…
## $ published <chr> "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TR…
## $ released <chr> "RELEASED", "RELEASED", "RELEASED", "RELEASED", "RE…
## $ releasedDate <chr> "2015-05-27T00:00.000Z", "2001-01-01T00:00.000Z", "…
## $ title <chr> "Hepatitis C IL28B pooled resequencing study with 1…
## $ description <chr> "Hepatitis C IL28B pooled resequencing study with 1…
## $ technology <chr> "Illumina Genome Analyzer IIx", NA, NA, NA, NA, NA,…
## $ numSamples <int> 4, 106, 106, 106, 106, 106, 106, 44, 44, 44, 4, 183…
## $ datasetTypes <chr> "sample", "sample", "sample", "sample", "sample", "…
## $ policyStableId <chr> "EGAP00001000022", "EGAP00001000187", "EGAP00001000…
## $ availableInBeacon <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE…
## $ accessType <chr> "CONTROLLED", "CONTROLLED", "CONTROLLED", "CONTROLL…
## $ technology1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Illumi…
## $ technology2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Illumi…
## $ technology3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "NextSe…
## $ datasetTypes1 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Whole …
## $ datasetTypes2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Transc…
## $ technology4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ creationYear <dbl> 2011, 2021, 2021, 2021, 2021, 2021, 2021, 2014, 201…
## $ tidy_center_names <chr> "WEHI", "QIMRB", "QUT", "UQ", "QIMRB", "QUT", "UQ",…
Datasets over time
# Bar chart of datasets per creation year; count() reduces the joined table
# to one row per dataset/year pair before plotting.
bound_datasets %>%
  count(egaStableId, creationYear) %>%
  ggplot(aes(x = creationYear)) +
  geom_bar(fill = biocommons_pal["yellow"]) +
  labs(
    title = "Datasets submitted per year",
    x = "Year of submission",
    y = "Dataset count"
  ) +
  theme_bw()
without 1980
# Same chart as above with the 1980 records excluded and one axis break per
# year.
bound_datasets %>%
  filter(creationYear > 1980) %>%
  count(egaStableId, creationYear) %>%
  ggplot(aes(x = creationYear)) +
  geom_bar(fill = biocommons_pal["yellow"]) +
  scale_x_continuous(breaks = 1980:2021) +
  labs(
    title = "Datasets submitted per year",
    x = "Year of submission",
    y = "Dataset count"
  ) +
  theme_bw()
Datasets per year, stacked by contributing institute. Datasets with more than one contributing institute are counted more than once.
# Stacked bars of datasets per creation year, filled by curated institute.
# count() leaves one row per dataset/institute combination.
bound_datasets %>%
  count(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  ggplot(aes(x = creationYear, fill = tidy_center_names)) +
  geom_bar(width = 0.9) +
  scale_x_continuous(breaks = 1980:2021) +
  scale_fill_manual(name = "Institute", values = institute_colour_pal) +
  labs(
    title = "Datasets submitted per year",
    x = "Year of submission",
    y = "Dataset count"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, vjust = 0.9, hjust = 1)
  )
# Same stacked chart with the year 1980 recoded to NA, so ggplot drops those
# rows (with a warning) instead of stretching the x axis.
bound_datasets %>%
  mutate(na_year = na_if(creationYear, 1980)) %>%
  count(egaStableId, na_year, numSamples, tidy_center_names) %>%
  ggplot(aes(x = na_year, fill = tidy_center_names)) +
  geom_bar(width = 0.9) +
  scale_x_continuous(breaks = 1980:2021) +
  scale_fill_manual(name = "Institute", values = institute_colour_pal) +
  labs(
    title = "Datasets submitted per year",
    x = "Year of submission",
    y = "Dataset count"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, vjust = 0.9, hjust = 1)
  )
## Warning: Removed 9 rows containing non-finite values (stat_count).
# Stacked bars of datasets per creation year by curated institute, with the
# 1980 records filtered out.
bound_datasets %>%
  filter(creationYear > 1980) %>%
  count(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  ggplot(aes(x = creationYear, fill = tidy_center_names)) +
  geom_bar(width = 0.9) +
  scale_x_continuous(breaks = 1980:2021) +
  scale_fill_manual(name = "Institute", values = institute_colour_pal) +
  labs(
    title = "Datasets submitted per year",
    x = "Year of submission",
    y = "Dataset count"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Stacked bars of datasets per creation year by curated institute, with the
# 1980 records filtered out.
# bound_datasets already carries tidy_center_names from its own join with
# tidy_center_lookup, so the second left_join (on both shared columns) added
# nothing and has been removed.
bound_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  tally() %>%
  ggplot(aes(creationYear, fill = tidy_center_names)) +
  geom_bar(width = 0.9) +
  scale_fill_manual(values = institute_colour_pal, name = "Institute") +
  scale_x_continuous(breaks = seq(1980, 2021, 1)) +
  theme_bw() +
  xlab("Year of submission") +
  ylab("Dataset count") +
  labs(title = "Datasets submitted per year") +
  theme(legend.position = "bottom")
## Joining, by = c("centerName", "tidy_center_names")
Samples submitted per year
# Total samples per creation year and curated institute, drawn as text labels.
# bound_datasets can hold the same dataset/institute pair several times
# (dataset records are unnested once per dac_datasets row, then joined to the
# lookup), so rows are de-duplicated before summing; otherwise numSamples is
# counted more than once. The redundant re-join with tidy_center_lookup has
# been removed.
bound_datasets %>%
  distinct(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  group_by(creationYear, tidy_center_names) %>%
  summarise(summed_by_institute = sum(numSamples), .groups = "drop_last") %>%
  ggplot(aes(
    x = creationYear, y = summed_by_institute,
    col = tidy_center_names, label = tidy_center_names
  )) +
  # geom_jitter() +
  geom_text(check_overlap = TRUE) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "none")
## Joining, by = c("centerName", "tidy_center_names")
## `summarise()` regrouping output by 'creationYear' (override with `.groups` argument)
# Total samples per creation year and curated institute (1980 records
# excluded), drawn as text labels.
# bound_datasets can hold the same dataset/institute pair several times
# (dataset records are unnested once per dac_datasets row, then joined to the
# lookup), so rows are de-duplicated before summing; otherwise numSamples is
# counted more than once.
bound_datasets %>%
  filter(creationYear > 1980) %>%
  distinct(egaStableId, creationYear, numSamples, tidy_center_names) %>%
  group_by(creationYear, tidy_center_names) %>%
  summarise(summed_by_institute = sum(numSamples), .groups = "drop_last") %>%
  ggplot(aes(
    x = creationYear, y = summed_by_institute,
    col = tidy_center_names, label = tidy_center_names
  )) +
  # geom_jitter() +
  geom_text(check_overlap = TRUE) +
  scale_x_continuous(breaks = seq(1980, 2021, 1)) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("Sum of submitted samples per institute") +
  xlab("Year of submission")
## `summarise()` regrouping output by 'creationYear' (override with `.groups` argument)
# Samples per dataset against creation date, coloured by curated institute.
# creationTime and numSamples were already converted when bound_datasets was
# built, so the repeated conversions have been dropped.
bound_datasets %>%
  # one row per dataset before re-attaching the curated institute name(s)
  count(egaStableId, creationTime, numSamples, centerName) %>%
  left_join(tidy_center_lookup, by = "centerName") %>%
  ggplot(aes(x = creationTime, y = numSamples, col = tidy_center_names)) +
  geom_point(shape = 1, size = 4) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom") +
  # labs() needs a named argument; the unnamed string never became the title
  labs(title = "Number of samples submitted per dataset over time, coloured by institute") +
  ylab("Number of samples per dataset") +
  xlab("Year of submission")
## Joining, by = "centerName"
# Beeswarm of samples per dataset against creation date (1980 records
# excluded), coloured by curated institute.
# The filter now uses the numeric creationYear from bound_datasets; the
# previous code converted the year to character first, so the "> 1980" test
# was a string comparison.
bound_datasets %>%
  filter(creationYear > 1980) %>%
  # one row per dataset before re-attaching the curated institute name(s)
  count(egaStableId, creationTime, numSamples, centerName) %>%
  left_join(tidy_center_lookup, by = "centerName") %>%
  ggplot(aes(x = creationTime, y = numSamples, col = tidy_center_names)) +
  geom_beeswarm(shape = 1, size = 4) +
  scale_color_manual(values = institute_colour_pal) +
  theme_bw() +
  theme(legend.position = "bottom") +
  # labs() needs a named argument; the unnamed string never became the title
  labs(title = "Number of samples submitted per dataset over time, coloured by institute") +
  ylab("Number of samples per dataset") +
  xlab("Year of submission")
## Joining, by = "centerName"
## Warning in f(...): The default behavior of beeswarm has changed in version
## 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis.
## Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis
## choice.
trying out log scale but don’t think it is very good visually.
# Jittered samples-per-dataset plot on a log10 y axis, with the year treated
# as a discrete (character) x value.
bound_datasets %>%
  mutate(creationYear = format(creationTime, "%Y")) %>%
  group_by(egaStableId, creationYear, numSamples, centerName) %>%
  tally() %>%
  left_join(tidy_center_lookup, by = "centerName") %>%
  ggplot(aes(x = creationYear, y = numSamples, col = tidy_center_names)) +
  geom_jitter(shape = 1, size = 3) +
  scale_color_manual(values = institute_colour_pal) +
  scale_y_continuous(trans = log10_trans()) +
  # theme_bw() is a complete theme and must come first; placed after theme()
  # it reset the legend position.
  theme_bw() +
  theme(legend.position = "bottom")
## Joining, by = "centerName"
# Summarise the files that the EGA metadata API lists for one dataset.
#
# Args:
#   dataset_accession: dataset accession to query.
#
# Returns:
#   A one-row tibble with `dataset_accession`, `num_files` (the API's
#   numTotalResults), `sizes_length` (number of file records returned) and
#   `summed_size` (sum of the records' fileSize values), or `NA` when the
#   request does not return HTTP 200.
#   NOTE(review): an NA return is presumably incompatible with a later
#   bind_rows() over the results - confirm how failures should be handled.
files_per_dataset <- function(dataset_accession) {
  api_request <- paste0(
    "https://ega-archive.org/metadata/v2/files?queryBy=dataset&queryId=",
    dataset_accession, "&limit=0"
  )
  response <- GET(api_request)
  if (status_code(response) != 200) {
    return(NA)
  }
  payload <- content(response)[["response"]]
  num_files <- payload[["numTotalResults"]]
  sizes <- lapply(payload[["result"]], function(x) x[["fileSize"]])
  sizes_length <- length(sizes)
  # Sum as double: summing integer sizes overflows to NA (with a warning)
  # once the total exceeds the 32-bit integer range.
  summed_size <- sum(as.numeric(unlist(sizes)))
  tibble(dataset_accession, num_files, sizes_length, summed_size)
}
# The per-dataset file harvest is kept commented out; the joined table is read
# back from the RDS file written by the second saveRDS() call below.
# test_response <- files_per_dataset(dataset_list[51])
# dataset_list <- unique(bound_datasets$egaStableId)
# file_size_list <- lapply(dataset_list, files_per_dataset)
# bound_files_dataset <- bind_rows(file_size_list)
# files_and_datasets <- bound_files_dataset %>%
# left_join(bound_datasets, by=c("dataset_accession" = "egaStableId"))
# saveRDS(file_size_list, "outputs/file_size_list.rds")
# saveRDS(files_and_datasets, "outputs/files_joined_datasets.rds")
files_and_datasets <- readRDS("outputs/files_joined_datasets.rds")
Total data stored in EGA by aussie researchers
# One row per dataset with its file count and total size; `n` is the number
# of joined rows collapsed into that dataset. "drop_last" keeps the grouping
# that tally() would leave.
per_dataset_file_summary <- files_and_datasets %>%
  group_by(dataset_accession, num_files, summed_size) %>%
  summarise(n = n(), .groups = "drop_last")
Total data stored in EGA by aussie researchers: * 14673 files * 325.4687165 TB of data
# Total file size per dataset (bytes / 1e9) against creation date, with a red
# horizontal reference line at y = 10000.
files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(dataset_accession, summed_size, centerName, num_files, creationTime) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(file_size_gb = summed_size / 1e9) %>%
  ggplot(aes(x = creationTime, y = file_size_gb)) +
  geom_point() +
  geom_abline(slope = 0, intercept = 10000, color = "red") +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  # scale_y_continuous(trans = log10_trans()) +
  labs(
    title = "Total file size per data set over time",
    x = "Year of submission",
    y = "Total file size per dataset (GB)"
  ) +
  theme_bw()
# Beeswarm of total file size per dataset by creation year, rendered as an
# interactive plot. The label/col/fill aesthetics carry the title, centerName
# and datasetTypes into the plot object.
sum_size_plot <- files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(
    dataset_accession, summed_size, centerName, num_files, creationYear,
    datasetTypes, title
  ) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(file_size_gb = summed_size / 1e9) %>%
  ggplot(aes(
    x = creationYear, y = file_size_gb,
    col = centerName, label = title, fill = datasetTypes
  )) +
  geom_beeswarm() +
  geom_abline(slope = 0, intercept = 10000, color = "red") +
  # scale_y_continuous(trans = log10_trans()) +
  scale_x_continuous(breaks = 1980:2022) +
  labs(
    title = "Total file size per data set over time",
    x = "Year of submission",
    y = "Total file size per dataset (GB)"
  ) +
  theme_bw() +
  theme(legend.position = "none")
ggplotly(sum_size_plot)
# Number of files per dataset against creation date (1980 records excluded).
files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(
    dataset_accession, summed_size, centerName, num_files, creationTime,
    title
  ) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  ggplot(aes(x = creationTime, y = num_files)) +
  geom_point() +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(
    title = "Total files per data set over time",
    x = "Year of submission",
    y = "Total files per dataset"
  ) +
  theme_bw()
# Beeswarm of files per dataset by creation year, rendered as an interactive
# plot. The label/col/fill aesthetics carry the title, centerName and
# datasetTypes into the plot object.
num_files_plot <- files_and_datasets %>%
  filter(creationYear > 1980) %>%
  group_by(
    dataset_accession, summed_size, centerName, num_files, creationTime,
    creationYear, title, datasetTypes
  ) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  ggplot(aes(
    x = creationYear, y = num_files,
    label = title, col = centerName, fill = datasetTypes
  )) +
  geom_beeswarm() +
  # scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Total files per data set over time",
    x = "Year of submission",
    y = "Total files per dataset"
  ) +
  theme_bw() +
  theme(legend.position = "none")
ggplotly(num_files_plot)