Load the packages, set the random seed, and define the timing function:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#>
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#>
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(data.table)
# Seed the random number generator so the rnorm() draws used to build the
# benchmark data are reproducible from run to run.
set.seed(1014)
# Time an expression three times in a row.
#
# `code` is captured unevaluated and then evaluated three separate times in
# the environment time3() was called from; each run is wrapped in
# system.time().
#
# Returns a matrix with one row of system.time() measurements per run.
time3 <- function(code) {
  quoted <- substitute(code)
  # Look up the caller's frame once, here, so that all three runs evaluate in
  # the same environment no matter how the timing calls are nested below.
  caller <- parent.frame()
  runs <- lapply(seq_len(3), function(i) system.time(eval(quoted, caller)))
  do.call(rbind, runs)
}
Create the data frame and initialise the dplyr and data.table objects:
# Build the benchmark data. Each (x, y) pair of an upper-case and a lower-case
# letter forms one group of `n` rows, giving roughly 50 million rows in total.
n <- ceiling(5e7 / 26^2) # rows per (x, y) group
df <- data.frame(
  x = rep(LETTERS, each = 26 * n),
  # Only one alphabet's worth of y (26 * n values) is generated here;
  # data.frame() recycles it a whole number of times to the length of x.
  y = rep(letters, each = n),
  v = rnorm(n * 26^2),
  stringsAsFactors = FALSE
)

# The same data as a dplyr data frame grouped by x and y, and as a data.table
# keyed on x and y.
dplyr <- group_by(df, x, y)
dt <- data.table(df, key = "x,y")
Compare filtering (base data frame vs. dplyr):
# Base R: select the rows where x is "R" and y is "h" by indexing the plain
# data frame with a logical vector.
time3(df[df$x=="R" & df$y=="h", ])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 7.012 0.496 7.508 0 0
#> [2,] 6.934 0.492 7.424 0 0
#> [3,] 6.923 0.514 7.436 0 0
# dplyr: the same row selection, using filter() on the data frame that was
# grouped by x and y.
time3(filter(dplyr, x == "R" & y == "h"))
#> user.self sys.self elapsed user.child sys.child
#> [1,] 3.358 0.318 3.676 0 0
#> [2,] 3.346 0.320 3.665 0 0
#> [3,] 3.344 0.311 3.655 0 0
Compare grouped summarise (data.table vs. dplyr):
# data.table: sum v within each value of x.
# NOTE(review): this groups by x only, whereas the dplyr object timed in the
# other summarise benchmark is grouped by x and y, so the two timings may not
# be like-for-like -- confirm whether `by = list(x, y)` was intended.
time3(dt[, sum(v), by = x])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.679 0.377 1.056 0 0
#> [2,] 0.636 0.163 0.800 0 0
#> [3,] 0.636 0.166 0.802 0 0
# dplyr: sum v within each group of the grouped data frame (grouped by x and
# y when it was created).
time3(summarise(dplyr, sum(v)))
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.091 0.179 0.269 0 0
#> [2,] 0.047 0.001 0.048 0 0
#> [3,] 0.048 0.000 0.048 0 0