library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
## Convert human-readable sizes (e.g. "512", "1.5K", "100M", "3.2G") to bytes.
##
## x: character vector of sizes. An optional K/M/G/T suffix (any case) is a
##    binary multiplier (1024^1 .. 1024^4); no suffix means plain bytes.
##    Numeric input is taken to be in bytes already and is returned as-is.
## Returns an unnamed numeric vector as long as x. NA inputs give NA; inputs
## that cannot be parsed give NA and raise a warning.
convb <- function(x) {
  # Numbers need no parsing; going through as.character() would turn 1e5 into
  # "1e+05", which the size pattern cannot read.
  if (is.numeric(x)) {
    return(as.numeric(x))
  }

  x <- trimws(as.character(x))

  # Anchored, with the decimal point escaped: an unescaped "." lets any
  # character join the number, so "12K5" would be read as the number "12K5".
  # Group 1 is the number, group 3 the unit suffix.
  ptn <- "^(\\d*(\\.\\d+)?)\\s*([A-Za-z]*)$"
  parsed <- grepl(ptn, x)

  num <- rep(NA_real_, length(x))
  num[parsed] <- as.numeric(sub(ptn, "\\1", x[parsed]))

  unit <- rep(NA_character_, length(x))
  unit[parsed] <- toupper(sub(ptn, "\\3", x[parsed]))
  # A bare number carries no suffix; map it to the identity multiplier.
  unit[parsed & unit == ""] <- "1"

  mult <- c("1" = 1, "K" = 1024, "M" = 1024^2, "G" = 1024^3, "T" = 1024^4)
  # Unknown suffixes index to NA, so the product is NA for them.
  bytes <- num * unname(mult[unit])

  bad <- is.na(bytes) & !is.na(x)
  if (any(bad)) {
    warning(
      "convb(): could not parse size(s): ",
      paste(unique(x[bad]), collapse = ", "),
      call. = FALSE
    )
  }
  bytes
}
## Dataset
## Remote listing parsed into two character columns: a human-readable size
## and the package path.
url <- "https://gist.githubusercontent.com/nturaga/a909d5d6dbcdb6d64ee731db7a0ad257/raw/f542411fc59ddb45f27c889d7e0697ae8609f3d7/size_dataexp_packs.txt"
dat <- read_table(
  file = url,
  col_names = c("size", "package")
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## size = col_character(),
## package = col_character()
## )
Packages over a certain size limit should be hosted in the Hubs.
## Derive a bare package name and a numeric size in bytes.
## Both patterns are matched literally with fixed(): as a regex, ".git" means
## "any character followed by git", which would also eat e.g. the "igit" in a
## name containing "digit".
dat <- dat %>%
  mutate(
    name = str_replace_all(package, fixed("packages/"), ""),
    name = str_replace_all(name, fixed(".git"), ""),
    size_bytes = convb(size)
  )
## data experiment packages over 100M
## Keep packages above the 100M threshold, largest first, and show only the
## name and the original size string. n = 200 raises the tibble row limit so
## the whole result is printed rather than the default preview.
dat %>%
  filter(size_bytes > convb("100M")) %>%
  arrange(-size_bytes) %>%
  select(name, size) %>%
  print(n = 200)
## # A tibble: 106 × 2
## name size
## <chr> <chr>
## 1 ChIPXpressData 3.2G
## 2 ELMER.data 2.6G
## 3 mammaPrintData 1.8G
## 4 depmap 1.6G
## 5 RGMQLlib 1.2G
## 6 RNASeqRData 1018M
## 7 pRolocdata 900M
## 8 Single.mTEC.Transcriptomes 898M
## 9 ccdata 896M
## 10 ChAMPdata 848M
## 11 Fletcher2013b 757M
## 12 DAPARdata 694M
## 13 ListerEtAlBSseq 661M
## 14 curatedMetagenomicData 646M
## 15 MMDiffBamSubset 634M
## 16 RTCGA.methylation 624M
## 17 RTCGA.rnaseq 615M
## 18 rcellminerData 567M
## 19 davidTiling 540M
## 20 HD2013SGI 529M
## 21 systemPipeRdata 510M
## 22 macrophage 495M
## 23 tximportData 489M
## 24 msdata 488M
## 25 FlowSorted.CordBlood.450k 483M
## 26 chipenrich.data 467M
## 27 synapterdata 421M
## 28 Affymoe4302Expr 420M
## 29 msPurityData 407M
## 30 curatedCRCData 401M
## 31 SCATEData 393M
## 32 AssessORFData 385M
## 33 curatedOvarianData 383M
## 34 WES.1KG.WUGSC 375M
## 35 SVM2CRMdata 369M
## 36 ChIC.data 366M
## 37 RnBeads.mm10 365M
## 38 optimalFlowData 355M
## 39 furrowSeg 350M
## 40 proteomics 347M
## 41 RNAseqData.HNRNPC.bam.chr14 342M
## 42 WGSmapp 341M
## 43 methylationArrayAnalysis 337M
## 44 FlowSorted.CordBloodNorway.450k 330M
## 45 RcisTarget.hg19.motifDBs.cisbpOnly.500bp 326M
## 46 ccTutorial 325M
## 47 RnBeads.hg19 323M
## 48 Affyhgu133aExpr 318M
## 49 yeastRNASeq 310M
## 50 KEGGdzPathwaysGEO 302M
## 51 FlowSorted.Blood.450k 300M
## 52 brgedata 293M
## 53 curatedBreastData 285M
## 54 hapmapsnp6 279M
## 55 FlowSorted.DLPFC.450k 249M
## 56 bsseqData 243M
## 57 ConnectivityMap 238M
## 58 nanotubes 236M
## 59 SNPhoodData 235M
## 60 hpAnnot 230M
## 61 MetaGxPancreas 229M
## 62 Affyhgu133Plus2Expr 226M
## 63 ChIPexoQualExample 221M
## 64 mtbls2 217M
## 65 RTCGA.PANCAN12 215M
## 66 RnBeads.rn5 215M
## 67 TargetScoreData 208M
## 68 VariantToolsData 206M
## 69 aracne.networks 205M
## 70 BeadArrayUseCases 204M
## 71 seqpac 199M
## 72 pd.atdschip.tiling 196M
## 73 TCGAWorkflowData 195M
## 74 PCHiCdata 192M
## 75 RnBeads.mm9 189M
## 76 oct4 188M
## 77 lydata 184M
## 78 seqc 181M
## 79 MEDIPSData 179M
## 80 SCATE 159M
## 81 breakpointRdata 158M
## 82 LungCancerACvsSCCGEO 155M
## 83 recountmethylation 151M
## 84 CardinalWorkflows 147M
## 85 seventyGeneData 147M
## 86 chromstaRData 142M
## 87 Hiiragi2013 140M
## 88 DmelSGI 140M
## 89 HiCDataHumanIMR90 138M
## 90 epimutacionsData 137M
## 91 minionSummaryData 136M
## 92 RnBeads.hg38 135M
## 93 CCl4 134M
## 94 metaMSdata 130M
## 95 OMICsPCAdata 128M
## 96 MOFAdata 122M
## 97 topdownrdata 121M
## 98 NanoporeRNASeq 111M
## 99 BioImageDbs 111M
## 100 Fletcher2013a 110M
## 101 RTCGA.mutations 106M
## 102 pumadata 104M
## 103 geneLenDataBase 102M
## 104 minfiData 102M
## 105 chipseqDB 102M
## 106 dorothea 101M