library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)

## Convert human-readable sizes (e.g. "512", "4.0K", "100M", "3.2G") to bytes.
##
## x: character vector of sizes, each a non-negative decimal number optionally
##    followed by one unit letter (K, M, G or T, in either case). Leading and
##    trailing whitespace is ignored.
## Returns an unnamed numeric vector the same length as x, using binary
## multiples (1K = 1024 bytes). Entries without a parsable number, or with an
## unrecognised unit, are NA.
convb <- function(x) {
  x <- trimws(x)

  # Anchored pattern with an escaped decimal point: group 1 is the number,
  # group 3 is whatever follows it (the unit).
  ptn <- "^(\\d*(\\.\\d+)?)(.*)$"
  num <- as.numeric(sub(ptn, "\\1", x))
  unit <- toupper(sub(ptn, "\\3", x))

  # A bare number is already expressed in bytes.
  unit[!nzchar(unit)] <- "1"

  mult <- c("1" = 1, "K" = 1024, "M" = 1024^2, "G" = 1024^3, "T" = 1024^4)
  # Units missing from the table index to NA, so the product is NA as well.
  num * unname(mult[unit])
}

## Dataset: raw gist file (size_dataexp_packs.txt) addressed by a fixed
## revision hash in the path
url <- "https://gist.githubusercontent.com/nturaga/a909d5d6dbcdb6d64ee731db7a0ad257/raw/f542411fc59ddb45f27c889d7e0697ae8609f3d7/size_dataexp_packs.txt"

Read the data

# Parse the whitespace-separated listing; the file has no header row, so the
# two columns are named explicitly.
dat <- read_table(
  file = url,
  col_names = c("size", "package")
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   size = col_character(),
##   package = col_character()
## )

List of experiment data packages over 100 MB

Packages that exceed a certain size limit should be hosted in the Hubs.

## Derive a clean package name and a numeric size in bytes for every row.
dat <- dat %>%
  mutate(
    # Remove the literal "packages/" path component (fixed string, not regex).
    name = str_remove_all(package, fixed("packages/")),
    # Strip only a trailing literal ".git". The dot is escaped and the match
    # anchored so that names merely containing "git" are left intact.
    name = str_remove(name, "\\.git$"),
    # Numeric size so that rows can be filtered and sorted by magnitude.
    size_bytes = convb(size)
  )

## Data experiment packages larger than 100M, largest first.
## Only the name and the original human-readable size are shown;
## n = 200 lets print() show up to 200 rows.
dat %>%
  filter(size_bytes > convb("100M")) %>%
  arrange(desc(size_bytes)) %>%
  select(name, size) %>%
  print(n = 200)
## # A tibble: 106 × 2
##     name                                     size 
##     <chr>                                    <chr>
##   1 ChIPXpressData                           3.2G 
##   2 ELMER.data                               2.6G 
##   3 mammaPrintData                           1.8G 
##   4 depmap                                   1.6G 
##   5 RGMQLlib                                 1.2G 
##   6 RNASeqRData                              1018M
##   7 pRolocdata                               900M 
##   8 Single.mTEC.Transcriptomes               898M 
##   9 ccdata                                   896M 
##  10 ChAMPdata                                848M 
##  11 Fletcher2013b                            757M 
##  12 DAPARdata                                694M 
##  13 ListerEtAlBSseq                          661M 
##  14 curatedMetagenomicData                   646M 
##  15 MMDiffBamSubset                          634M 
##  16 RTCGA.methylation                        624M 
##  17 RTCGA.rnaseq                             615M 
##  18 rcellminerData                           567M 
##  19 davidTiling                              540M 
##  20 HD2013SGI                                529M 
##  21 systemPipeRdata                          510M 
##  22 macrophage                               495M 
##  23 tximportData                             489M 
##  24 msdata                                   488M 
##  25 FlowSorted.CordBlood.450k                483M 
##  26 chipenrich.data                          467M 
##  27 synapterdata                             421M 
##  28 Affymoe4302Expr                          420M 
##  29 msPurityData                             407M 
##  30 curatedCRCData                           401M 
##  31 SCATEData                                393M 
##  32 AssessORFData                            385M 
##  33 curatedOvarianData                       383M 
##  34 WES.1KG.WUGSC                            375M 
##  35 SVM2CRMdata                              369M 
##  36 ChIC.data                                366M 
##  37 RnBeads.mm10                             365M 
##  38 optimalFlowData                          355M 
##  39 furrowSeg                                350M 
##  40 proteomics                               347M 
##  41 RNAseqData.HNRNPC.bam.chr14              342M 
##  42 WGSmapp                                  341M 
##  43 methylationArrayAnalysis                 337M 
##  44 FlowSorted.CordBloodNorway.450k          330M 
##  45 RcisTarget.hg19.motifDBs.cisbpOnly.500bp 326M 
##  46 ccTutorial                               325M 
##  47 RnBeads.hg19                             323M 
##  48 Affyhgu133aExpr                          318M 
##  49 yeastRNASeq                              310M 
##  50 KEGGdzPathwaysGEO                        302M 
##  51 FlowSorted.Blood.450k                    300M 
##  52 brgedata                                 293M 
##  53 curatedBreastData                        285M 
##  54 hapmapsnp6                               279M 
##  55 FlowSorted.DLPFC.450k                    249M 
##  56 bsseqData                                243M 
##  57 ConnectivityMap                          238M 
##  58 nanotubes                                236M 
##  59 SNPhoodData                              235M 
##  60 hpAnnot                                  230M 
##  61 MetaGxPancreas                           229M 
##  62 Affyhgu133Plus2Expr                      226M 
##  63 ChIPexoQualExample                       221M 
##  64 mtbls2                                   217M 
##  65 RTCGA.PANCAN12                           215M 
##  66 RnBeads.rn5                              215M 
##  67 TargetScoreData                          208M 
##  68 VariantToolsData                         206M 
##  69 aracne.networks                          205M 
##  70 BeadArrayUseCases                        204M 
##  71 seqpac                                   199M 
##  72 pd.atdschip.tiling                       196M 
##  73 TCGAWorkflowData                         195M 
##  74 PCHiCdata                                192M 
##  75 RnBeads.mm9                              189M 
##  76 oct4                                     188M 
##  77 lydata                                   184M 
##  78 seqc                                     181M 
##  79 MEDIPSData                               179M 
##  80 SCATE                                    159M 
##  81 breakpointRdata                          158M 
##  82 LungCancerACvsSCCGEO                     155M 
##  83 recountmethylation                       151M 
##  84 CardinalWorkflows                        147M 
##  85 seventyGeneData                          147M 
##  86 chromstaRData                            142M 
##  87 Hiiragi2013                              140M 
##  88 DmelSGI                                  140M 
##  89 HiCDataHumanIMR90                        138M 
##  90 epimutacionsData                         137M 
##  91 minionSummaryData                        136M 
##  92 RnBeads.hg38                             135M 
##  93 CCl4                                     134M 
##  94 metaMSdata                               130M 
##  95 OMICsPCAdata                             128M 
##  96 MOFAdata                                 122M 
##  97 topdownrdata                             121M 
##  98 NanoporeRNASeq                           111M 
##  99 BioImageDbs                              111M 
## 100 Fletcher2013a                            110M 
## 101 RTCGA.mutations                          106M 
## 102 pumadata                                 104M 
## 103 geneLenDataBase                          102M 
## 104 minfiData                                102M 
## 105 chipseqDB                                102M 
## 106 dorothea                                 101M