The purpose of these benchmarks is to be as fair as possible, to help understand the relative performance tradeoffs of the different approaches. If you think my implementation of the base or data.table equivalents is suboptimal, please let me know better ways.
Also note that I consider any significant performance difference between dplyr_dt and dt_raw to be a bug in dplyr: for individual operations there should be very little overhead to calling data.table via dplyr. However, data.table may be significantly faster when performing the same sequence of operations as dplyr. This is because dplyr currently uses an eager evaluation approach, so the individual calls to [.data.table don't get as much information about the desired result as a single hand-written call to [.data.table would.
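For example (a sketch, not benchmarked here, using the players_dt and batting_dt tables defined below and an arbitrary yearID cutoff):

# via dplyr: each verb becomes its own [.data.table call
summarise(filter(players_dt, yearID > 2000), ab = mean(AB))

# by hand: one call, so data.table sees the whole query at once
batting_dt[yearID > 2000, list(ab = mean(AB)), by = playerID]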
The following benchmarks explore the performance on a somewhat realistic example: the Batting dataset from the Lahman package. It contains 96,600 records on the batting careers of players from 1871 to 2012. (A quick peek at the raw data follows the setup code below.)
The first code block defines two alternative backends for the Batting dataset, along with grouped versions for operations performed by player:
library(Lahman)          # provides the Batting, Master and HallOfFame data
library(dplyr)
library(data.table)
library(microbenchmark)
batting_df <- tbl_df(Batting)
players_df <- group_by(batting_df, playerID)
batting_dt <- tbl_dt(Batting)
players_dt <- group_by(batting_dt, playerID)
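A quick peek at the raw data (output not shown):

dim(Batting)
head(Batting[, c("playerID", "yearID", "G", "AB")])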
Compute the average number of at bats for each player:
microbenchmark(
dplyr_df = summarise(players_df, ab = mean(AB)),
dplyr_dt = summarise(players_dt, ab = mean(AB)),
dt_raw = players_dt[, list(ab = mean(AB)), by = playerID],
base = tapply(batting_df$AB, batting_df$playerID, FUN = mean),
times = 5,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 0.643 0.705 0.819 0.821 1.01 5
#> dplyr_dt 19.013 21.527 22.832 25.560 28.11 5
#> dt_raw 17.441 17.543 18.983 20.976 22.23 5
#> base 199.193 208.250 209.519 215.019 238.85 5
NB: the base implementation captures the computation but not the output format: tapply() returns a named vector rather than a data frame, so it produces considerably less output.
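To see the difference, compare what each call returns (output not shown):

# summarise() returns a tbl with one row per player
class(summarise(players_df, ab = mean(AB)))

# tapply() returns a named numeric vector; turning it back into a
# data frame would take extra work not measured above
class(tapply(batting_df$AB, batting_df$playerID, FUN = mean))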
However, this comparison is slightly unfair because both data.table and summarise.tbl_df use tricks to find a more efficient implementation of mean(). data.table calls a C implementation of the mean (using .External(Cfastmean, B, FALSE) and thus avoiding the overhead of S3 method dispatch). dplyr::summarise uses a hybrid evaluation technique, where common functions are implemented purely in C++, avoiding R function call overhead.
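# mean_ calls the internal mean directly, skipping S3 dispatch,
# so that all approaches pay the same (minimal) dispatch cost: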
mean_ <- function(x) .Internal(mean(x))
microbenchmark(
dplyr_df = summarise(players_df, ab = mean_(AB)),
dplyr_dt = summarise(players_dt, ab = mean_(AB)),
dt_raw = players_dt[, list(ab = mean_(AB)), by = playerID],
base = tapply(batting_df$AB, batting_df$playerID, FUN = mean_),
times = 5,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 10.7 10.9 13.3 16.9 21.5 5
#> dplyr_dt 18.2 18.5 18.8 20.5 22.6 5
#> dt_raw 16.8 17.4 19.6 20.0 20.6 5
#> base 96.3 96.9 97.9 99.3 101.5 5
Arrange by year within each player:
microbenchmark(
dplyr_df = arrange(players_df, yearID),
dplyr_dt = arrange(players_dt, yearID),
dt_raw = batting_dt[order(playerID, yearID), ],
base = batting_df[order(batting_df$playerID, batting_df$yearID), ],
times = 2,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 26.8 26.8 27.8 28.9 28.9 2
#> dplyr_dt 165.4 165.4 183.4 201.5 201.5 2
#> dt_raw 147.1 147.1 163.7 180.4 180.4 2
#> base 159.2 159.2 159.2 159.2 159.2 2
Find the year for which each player played the most games:
microbenchmark(
dplyr_df = filter(players_df, G == max(G)),
dplyr_dt = filter(players_dt, G == max(G)),
base = batting_df[ave(batting_df$G, batting_df$playerID, FUN = max) ==
batting_df$G, ],
times = 2,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 28.7 28.7 29.1 29.6 29.6 2
#> dplyr_dt 41.9 41.9 43.8 45.7 45.7 2
#> base 128.3 128.3 128.5 128.8 128.8 2
I'm not aware of a single-line data.table equivalent (see SO 16573995); suggestions are welcome, and one idiom from that thread is sketched below. dplyr doesn't currently support hybrid evaluation for logical comparisons, but it is scheduled for 0.2 (see #113); this should give an additional 10-20x speedup.
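A rough version of that idiom (untested here, and relying on data.table's .I) finds the row indices of the per-player maxima and then subsets:

# row indices of the max-G rows within each player, then subset
idx <- players_dt[, .I[G == max(G)], by = playerID]$V1
batting_dt[idx, ]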
Rank years based on number of at bats:
microbenchmark(
dplyr_df = mutate(players_df, rank = rank(desc(AB))),
dplyr_dt = mutate(players_dt, rank = rank(desc(AB))),
dt_raw = players_dt[, list(rank = rank(desc(AB))), by = playerID],
times = 2,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 622 622 637 653 653 2
#> dplyr_dt 634 634 637 640 640 2
#> dt_raw 628 628 634 640 640 2
Compute year of career:
microbenchmark(
dplyr_df = mutate(players_df, cyear = yearID - min(yearID) + 1),
dplyr_dt = mutate(players_dt, cyear = yearID - min(yearID) + 1),
dt_raw = players_dt[, list(cyear = yearID - min(yearID) + 1), by = playerID],
times = 5,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 31.3 32.4 70.8 71.3 73.0 5
#> dplyr_dt 43.4 43.9 44.1 47.3 47.9 5
#> dt_raw 24.0 26.6 27.7 28.8 33.0 5
Rank is a relatively expensive operation and min() is relatively cheap, showing the relative performance overhead of the different techniques.
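To get a rough feel for the cost of the two operations themselves, independent of any grouping machinery (exact times will vary):

x <- rnorm(1e4)
microbenchmark(rank(desc(x)), min(x), times = 10)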
dplyr currently has some support for hybrid evaluation of window functions:
microbenchmark(
dplyr_df = mutate(players_df, rank = min_rank(AB)),
dplyr_dt = mutate(players_dt, rank = min_rank(AB)),
dt_raw = players_dt[, list(rank = min_rank(AB)), by = playerID],
times = 2,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 32.1 32.1 33.3 34.6 34.6 2
#> dplyr_dt 669.4 669.4 679.4 689.4 689.4 2
#> dt_raw 646.1 646.1 652.1 658.1 658.1 2
We conclude with some quick comparisons of joins. First we create two new datasets: master, which contains demographic information on each player, and hall_of_fame, which contains all players inducted into the Hall of Fame.
master_df <- tbl_df(Master) %.% select(playerID, hofID, birthYear)
hall_of_fame_df <- tbl_df(HallOfFame) %.% filter(inducted == "Y") %.%
select(hofID, votedBy, category)
master_dt <- tbl_dt(Master) %.% select(playerID, hofID, birthYear)
hall_of_fame_dt <- tbl_dt(HallOfFame) %.% filter(inducted == "Y") %.%
select(hofID, votedBy, category)
microbenchmark(
dplyr_df = left_join(master_df, hall_of_fame_df, by = "hofID"),
dplyr_dt = left_join(master_dt, hall_of_fame_dt, by = "hofID"),
base = merge(master_df, hall_of_fame_df, by = "hofID", all.x = TRUE),
times = 10,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 0.978 0.985 1.09 1.30 2.17 10
#> dplyr_dt 3.187 3.311 3.39 3.95 10.58 10
#> base 33.563 34.740 40.98 44.16 86.78 10
microbenchmark(
dplyr_df = inner_join(master_df, hall_of_fame_df, by = "hofID"),
dplyr_dt = inner_join(master_dt, hall_of_fame_dt, by = "hofID"),
base = merge(master_df, hall_of_fame_df, by = "hofID"),
times = 10,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 0.902 0.936 0.957 0.999 1.19 10
#> dplyr_dt 2.081 2.155 2.259 2.443 2.93 10
#> base 2.518 3.026 3.080 3.315 3.72 10
microbenchmark(
dplyr_df = semi_join(master_df, hall_of_fame_df, by = "hofID"),
dplyr_dt = semi_join(master_dt, hall_of_fame_dt, by = "hofID"),
times = 10,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 0.88 0.887 0.893 0.908 1.15 10
#> dplyr_dt 1.36 1.367 1.378 1.413 1.77 10
microbenchmark(
dplyr_df = anti_join(master_df, hall_of_fame_df, by = "hofID"),
dplyr_dt = anti_join(master_dt, hall_of_fame_dt, by = "hofID"),
times = 10,
unit = "ms"
)
#> Unit: milliseconds
#> expr min lq median uq max neval
#> dplyr_df 1.28 1.33 1.37 1.41 1.43 10
#> dplyr_dt 2.42 2.80 2.84 3.25 3.55 10