Load the packages, set the random seed, and define the timing helper.

library(dplyr)

#> 
#> Attaching package: 'dplyr'
#> 
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> 
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

library(data.table)
# Fix the RNG seed so the rnorm() draws used to build the data are repeatable.
set.seed(1014)

# Time an expression three times in a row.
#
# `code` is captured unevaluated and evaluated once per run in the caller's
# frame, so any assignments made by the expression land in the caller's
# environment. Returns a 3-row matrix holding one system.time() result per
# row (columns user.self, sys.self, elapsed, user.child, sys.child).
time3 <- function(code) {
  expr <- substitute(code)
  # Resolve the caller's frame here: inside the closure below, parent.frame()
  # would point at a different frame.
  caller <- parent.frame()

  runs <- lapply(seq_len(3L), function(i) system.time(eval(expr, caller)))
  do.call(rbind, runs)
}

Create the data frame, then initialise the dplyr (grouped) and data.table (keyed) objects from it.

# Simulated benchmark data: every (upper-case, lower-case) letter pair forms
# one group of `n` rows, giving 26^2 groups and roughly 50 million rows overall.
n <- ceiling(5e7 / 26^2)
df <- data.frame(
  x = rep(LETTERS, each = 26 * n),
  # a..z in runs of n, repeated once per value of x
  y = rep(letters, each = n, times = 26),
  v = rnorm(26^2 * n),
  stringsAsFactors = FALSE
)

# Build the two objects under test from the same data frame: a data.table
# keyed on (x, y) and a dplyr object grouped by x and y.
# NOTE: the grouped object is named `dplyr`, like the package; the benchmark
# calls below pass it as their data argument.
dt <- data.table(df, key = c("x", "y"))
dplyr <- group_by(df, x, y)

Compare filtering (base R subsetting vs. dplyr `filter()`):

# Base R baseline: build a logical mask over all rows and subset the plain data frame with `[`.
time3(df[df$x=="R" & df$y=="h", ])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     7.012    0.496   7.508          0         0
#> [2,]     6.934    0.492   7.424          0         0
#> [3,]     6.923    0.514   7.436          0         0

# Same row condition expressed with dplyr's filter() on the object created by group_by(df, x, y).
time3(filter(dplyr, x == "R" & y == "h"))

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     3.358    0.318   3.676          0         0
#> [2,]     3.346    0.320   3.665          0         0
#> [3,]     3.344    0.311   3.655          0         0

Compare grouped summarise (data.table vs. dplyr):

# Group by both key columns so data.table aggregates the same (x, y) groups as
# the dplyr object (built with group_by(df, x, y)). The previous `by = x`
# produced only one group per value of x, so the two timings were not
# measuring the same work.
# NOTE: the timings recorded below were measured with `by = x`; re-run to refresh.
time3(dt[, sum(v), by = list(x, y)])

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.679    0.377   1.056          0         0
#> [2,]     0.636    0.163   0.800          0         0
#> [3,]     0.636    0.166   0.802          0         0

# Sum v within each group of `dplyr`, i.e. the grouping set by group_by(df, x, y).
time3(summarise(dplyr, sum(v)))

#>      user.self sys.self elapsed user.child sys.child
#> [1,]     0.091    0.179   0.269          0         0
#> [2,]     0.047    0.001   0.048          0         0
#> [3,]     0.048    0.000   0.048          0         0