The goal is to calculate quantiles on aggregated long Rle's. The information is split by chr, and combining it naively fails.
suppressMessages(library("IRanges")) ## Create dummy data lengths <- c(249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566, 155270560, 59373566) chrdata <- lapply(lengths, function(x) { Rle(rep(0, x)) }) ## Just to see visualize explore it (doesn't matter if the type is ## numeric-Rle or integer-Rle) head(chrdata)
## [[1]] ## numeric-Rle of length 249250621 with 1 run ## Lengths: 249250621 ## Values : 0 ## ## [[2]] ## numeric-Rle of length 243199373 with 1 run ## Lengths: 243199373 ## Values : 0 ## ## [[3]] ## numeric-Rle of length 198022430 with 1 run ## Lengths: 198022430 ## Values : 0 ## ## [[4]] ## numeric-Rle of length 191154276 with 1 run ## Lengths: 191154276 ## Values : 0 ## ## [[5]] ## numeric-Rle of length 180915260 with 1 run ## Lengths: 180915260 ## Values : 0 ## ## [[6]] ## numeric-Rle of length 171115067 with 1 run ## Lengths: 171115067 ## Values : 0
## Trying to combine unlist(RleList(chrdata), use.names = FALSE)
## Error: error in evaluating the argument 'x' in selecting a method for function 'unlist': Error in .Call2("Rle_constructor", values, lengths, check, 0L, PACKAGE = "IRanges") :
## integer overflow while summing elements in 'lengths'
## Calls: RleList ... -> Rle -> Rle -> .local -> .Call2 -> .Call
traceback()
## No traceback available
## Works manually up to the first 12 tmp <- c(chrdata[[1]], chrdata[[2]], chrdata[[3]], chrdata[[4]], chrdata[[5]], chrdata[[6]], chrdata[[7]], chrdata[[8]], chrdata[[9]], chrdata[[10]], chrdata[[11]], chrdata[[12]]) ## For the first 13 it breaks tmp <- c(chrdata[[1]], chrdata[[2]], chrdata[[3]], chrdata[[4]], chrdata[[5]], chrdata[[6]], chrdata[[7]], chrdata[[8]], chrdata[[9]], chrdata[[10]], chrdata[[11]], chrdata[[12]], chrdata[[13]])
## Error: integer overflow while summing elements in 'lengths'
traceback()
## No traceback available
## Note the main issue is related to the 2^31 ceiling plot(cumsum(lengths), main = "Length of Rle exceeding 2^31") abline(v = 13, col = "red") abline(h = 2^31, col = "orange")
One idea: leave the Rle-world and use some kind of weighted quantile.
## Manually combine in two parts part1 <- unlist(RleList(chrdata[1:12]), use.names = FALSE) part2 <- unlist(RleList(chrdata[13:length(chrdata)]), use.names = FALSE) parts <- list(part1, part2) ## Reduce number of runs in the Rle's by sorting (kind of doing table()) sortedParts <- lapply(parts, function(x) sort(x)) ## Leave Rle-world info <- lapply(sortedParts, function(y) { data.frame(value = runValue(y), length = runLength(y)) }) info <- do.call(rbind, info) ## Format info$length <- as.numeric(info$length) ## Collapse collapsed <- tapply(info$length, info$value, sum) weights <- as.integer(names(collapsed)) ## Calculate weighted quantile of interest library("Hmisc")
wtd.quantile(collapsed, weights = weights, probs = 0.5)
## 50% ## 3.096e+09
Is there a way to calculate the quantiles of the aggregated Rle's without all this machinery? Are there other possibly more efficients ways to do so?
## Reproducibility sessionInfo()
## R version 3.0.2 (2013-09-25) ## Platform: x86_64-apple-darwin10.8.0 (64-bit) ## ## locale: ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 ## ## attached base packages: ## [1] splines parallel stats graphics grDevices utils datasets ## [8] methods base ## ## other attached packages: ## [1] Hmisc_3.12-2 Formula_1.1-1 survival_2.37-4 ## [4] IRanges_1.20.4 BiocGenerics_0.8.0 knitr_1.5 ## ## loaded via a namespace (and not attached): ## [1] cluster_1.14.4 evaluate_0.5.1 formatR_0.10 grid_3.0.2 ## [5] highr_0.3 lattice_0.20-24 rpart_4.1-3 stats4_3.0.2 ## [9] stringr_0.6.2 tools_3.0.2