library(dplyr)
library(ggplot2)
#' Time a saveRDS()/readRDS() round trip through a given connection type
#'
#' Serialises `df` to a temporary file through a connection created by
#' `con_fun`, reads the file back, and reports both timings together with
#' the size of the file on disk.
#'
#' @param df Object to serialise (anything `saveRDS()` accepts).
#' @param con_fun Connection constructor, called as `con_fun(path, ...)`;
#'   e.g. `file`, `gzfile`, `bzfile` or `xzfile`.
#' @param ... Further arguments forwarded to `con_fun`
#'   (e.g. `compression = 6`).
#' @return A one-row tibble with columns `save` and `load` (elapsed
#'   seconds) and `size` (file size in MiB).
roundtrip <- function(df, con_fun, ...) {
  path <- tempfile()
  con <- con_fun(path, ...)
  # Release the connection and delete the temporary file on exit, so that
  # repeated benchmark runs do not accumulate files in tempdir().
  on.exit(
    {
      close(con)
      unlink(path)
    },
    add = TRUE
  )

  # Writing goes through the connection; reading goes through the path.
  save_time <- system.time(saveRDS(df, con))[["elapsed"]]
  load_time <- system.time(readRDS(path))[["elapsed"]]
  size_mib <- file.info(path)$size / 1024^2

  tibble(save = save_time, load = load_time, size = size_mib)
}
# Benchmark input: 1e5 uniform random numbers, stored twice as columns x and y.
x <- runif(1e5)
df <- data.frame(x = x, y = x)

# Run the uncompressed baseline first, then every codec at levels 1, 6 and 9
# (gz, then bz, then xz), and stack the one-row results into a single table.
times <- local({
  codecs <- list(gz = gzfile, bz = bzfile, xz = xzfile)
  # expand.grid() varies its first column fastest, so rows are ordered
  # gz 1/6/9, bz 1/6/9, xz 1/6/9.
  grid <- expand.grid(
    level = c(1, 6, 9),
    type = names(codecs),
    stringsAsFactors = FALSE
  )

  raw_run <- roundtrip(df, file)
  compressed_runs <- Map(
    function(type, level) roundtrip(df, codecs[[type]], compression = level),
    grid$type,
    grid$level
  )

  out <- bind_rows(c(list(raw_run), unname(compressed_runs)))
  out$type <- c("raw", grid$type)
  out$level <- c(0, grid$level)
  out
})
times
## Source: local data frame [10 x 5]
##
## save load size type level
## 1 0.005 0.003 1.5260477 raw 0
## 2 0.051 0.009 1.0302591 gz 1
## 3 0.215 0.009 1.0162907 gz 6
## 4 2.211 0.009 0.9845800 gz 9
## 5 0.165 0.076 0.9183054 bz 1
## 6 0.164 0.081 0.9084072 bz 6
## 7 0.170 0.096 0.8796549 bz 9
## 8 0.145 0.041 0.4402008 xz 1
## 9 0.623 0.042 0.4317474 xz 6
## 10 0.683 0.048 0.4317474 xz 9
Default compression (gzip, level 6) is ~40x slower to save, ~3x slower to load, and only reduces size by ~30% compared with no compression.
# Shared plot skeleton: compression level on the x axis, one coloured
# point-and-line series per compression type. No y aesthetic is mapped here;
# add one with `+ aes(y = ...)` to produce a concrete plot.
base <- ggplot(times, aes(level, colour = type)) +
  geom_point(size = 4) +
  # `linewidth` replaces the `size` argument for lines, which is deprecated
  # as of ggplot2 3.4.0.
  geom_line(linewidth = 1)
Compression level has little impact on load times. Uncompressed is fastest (about 3x faster than gzip), followed by gzip, then xz, and finally bzip2, which is more than an order of magnitude slower than uncompressed.
# Load time by compression level, on a log10 y axis.
base +
  aes(y = load) +
  scale_y_log10()
Compression level has a major impact on save times for gz and xz, but almost none for bz.
# Save time by compression level, on a log10 y axis.
base +
  aes(y = save) +
  scale_y_log10()
Compression level has relatively little impact on size, which is mostly determined by the compression algorithm (although this might depend on this specific dataset).
# File size (MiB) by compression level, on a linear y axis.
base +
  aes(y = size)