Load packages, set random seed and set up timing function
library(dplyr)
#>
#> Attaching package: 'dplyr'
#>
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#>
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(data.table)
set.seed(1014)
time3 <- function(code) {
code <- substitute(code)
rbind(
system.time(eval(code, parent.frame())),
system.time(eval(code, parent.frame())),
system.time(eval(code, parent.frame()))
)
}
Create data frame and initialise dplyr and data table objects.
n <- ceiling(5e7 / 26 ^ 2) # 50 million rows
df <- data.frame(
x = rep(LETTERS,each = 26 * n),
y = rep(letters,each = n),
v = rnorm(n * 26^2),
stringsAsFactors = FALSE
)
dplyr <- group_by(df, x, y)
dt <- data.table(df, key="x,y")
Compare filtering:
time3(df[df$x=="R" & df$y=="h", ])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 7.098 0.493 7.593 0 0
#> [2,] 7.130 0.545 7.677 0 0
#> [3,] 7.284 0.521 7.825 0 0
time3(dt[dt$x=="R" & dt$y=="h", ])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 4.196 0.345 4.542 0 0
#> [2,] 4.196 0.351 4.557 0 0
#> [3,] 4.199 0.356 4.555 0 0
time3(filter(dplyr, x == "R", y == "h"))
#> user.self sys.self elapsed user.child sys.child
#> [1,] 3.437 0.361 3.798 0 0
#> [2,] 3.474 0.372 3.845 0 0
#> [3,] 3.477 0.427 3.905 0 0
Compare filters implemented with joins.
# data table uses keys
time3(dt[J("R", "h"), ])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.003 0.001 0.003 0 0
#> [2,] 0.003 0.000 0.002 0 0
#> [3,] 0.003 0.000 0.003 0 0
# dplyr does not
time3(inner_join(dplyr, data.frame(x = "R", y = "h", stringsAsFactors = FALSE),
by = c("x", "y")))
#> user.self sys.self elapsed user.child sys.child
#> [1,] 1.946 0.094 2.042 0 0
#> [2,] 1.938 0.090 2.029 0 0
#> [3,] 1.951 0.093 2.044 0 0
Compare grouped summarise:
time3(dt[, sum(v), by = list(x, y)])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.854 0.249 1.102 0 0
#> [2,] 0.867 0.253 1.121 0 0
#> [3,] 0.857 0.255 1.112 0 0
time3(summarise(dplyr, sum(v)))
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.044 0 0.045 0 0
#> [2,] 0.046 0 0.045 0 0
#> [3,] 0.045 0 0.044 0 0
Use 10,000 groups instead of ~700:
df$z <- sample(1e5, n, rep = TRUE)
dplyr <- group_by(df, z)
dt <- data.table(df, key="z")
time3(dt[, sum(v), by = z])
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.672 0.126 0.799 0 0
#> [2,] 0.657 0.125 0.782 0 0
#> [3,] 0.658 0.126 0.784 0 0
time3(summarise(dplyr, sum(v)))
#> user.self sys.self elapsed user.child sys.child
#> [1,] 0.092 0.181 0.273 0 0
#> [2,] 0.045 0.000 0.045 0 0
#> [3,] 0.045 0.000 0.045 0 0