More groups

Load packages, set random seed and set up timing function

library(dplyr)

#> 
#> Attaching package: 'dplyr'
#> 
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> 
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

library(data.table)
set.seed(1014)

time3 <- function(code) {
  code <- substitute(code)

  rbind(
    system.time(eval(code, parent.frame())),
    system.time(eval(code, parent.frame())),
    system.time(eval(code, parent.frame()))    
  )  
}

Create data frame and initialise dplyr and data table objects.

n <- ceiling(5e7 / 26 ^ 2)   # 50 million rows
df <- data.frame(
  x = rep(LETTERS,each = 26 * n),
  y = rep(letters,each = n),
  v = rnorm(n * 26^2),
  stringsAsFactors = FALSE
)

dplyr <- group_by(df, x, y)
dt <- data.table(df, key="x,y")

Compare filtering:

time3(df[df$x=="R" & df$y=="h", ])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     7.098    0.493   7.593          0         0
#> [2,]     7.130    0.545   7.677          0         0
#> [3,]     7.284    0.521   7.825          0         0

time3(dt[dt$x=="R" & dt$y=="h", ])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     4.196    0.345   4.542          0         0
#> [2,]     4.196    0.351   4.557          0         0
#> [3,]     4.199    0.356   4.555          0         0

time3(filter(dplyr, x == "R", y == "h"))

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     3.437    0.361   3.798          0         0
#> [2,]     3.474    0.372   3.845          0         0
#> [3,]     3.477    0.427   3.905          0         0

Compare filters implemented with joins.

# data table uses keys
time3(dt[J("R", "h"), ])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.003    0.001   0.003          0         0
#> [2,]     0.003    0.000   0.002          0         0
#> [3,]     0.003    0.000   0.003          0         0

# dplyr does not
time3(inner_join(dplyr, data.frame(x = "R", y = "h", stringsAsFactors = FALSE), 
  by = c("x", "y")))

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     1.946    0.094   2.042          0         0
#> [2,]     1.938    0.090   2.029          0         0
#> [3,]     1.951    0.093   2.044          0         0

Compare grouped summarise:

time3(dt[, sum(v), by = list(x, y)])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.854    0.249   1.102          0         0
#> [2,]     0.867    0.253   1.121          0         0
#> [3,]     0.857    0.255   1.112          0         0

time3(summarise(dplyr, sum(v)))

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.044        0   0.045          0         0
#> [2,]     0.046        0   0.045          0         0
#> [3,]     0.045        0   0.044          0         0

More groups

Use 10,000 groups instead of ~700:

df$z <- sample(1e5, n, rep = TRUE)
dplyr <- group_by(df, z)
dt <- data.table(df, key="z")

time3(dt[, sum(v), by = z])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.672    0.126   0.799          0         0
#> [2,]     0.657    0.125   0.782          0         0
#> [3,]     0.658    0.126   0.784          0         0

time3(summarise(dplyr, sum(v)))

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.092    0.181   0.273          0         0
#> [2,]     0.045    0.000   0.045          0         0
#> [3,]     0.045    0.000   0.045          0         0