Fixing dplyr’s behavior for summarise() on empty data frames (#3274) led to a review of all hybrid handlers for this corner case. Indeed, some hybrid handlers didn’t account for this in some cases.

Microbenchmark to the rescue, using flights dataset with integer and numeric columns that have some NA values or don’t have NA values:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nycflights13)
summarise_at(flights, vars(arr_time, day, arr_delay, distance), funs(sum(is.na(.))))
## # A tibble: 1 x 4
##   arr_time   day arr_delay distance
##      <int> <int>     <int>    <int>
## 1     8713     0      9430        0

Benchmarking code, run for both master and the pull request:

library(nycflights13)
library(dplyr)

res <- microbenchmark::microbenchmark(
  integer_mean = summarise(flights, mean(arr_time)),
  integer_var = summarise(flights, var(arr_time)),
  integer_sum = summarise(flights, sum(arr_time)),

  integer2_mean = summarise(flights, mean(day)),
  integer2_var = summarise(flights, var(day)),
  integer2_sum = summarise(flights, sum(day)),

  real_mean = summarise(flights, mean(arr_delay)),
  real_var = summarise(flights, var(arr_delay)),
  real_sum = summarise(flights, sum(arr_delay)),

  real2_mean = summarise(flights, mean(distance)),
  real2_var = summarise(flights, var(distance)),
  real2_sum = summarise(flights, sum(distance)),

  integer_mean_na_rm = summarise(flights, mean(arr_time, na.rm = TRUE)),
  integer_var_na_rm = summarise(flights, var(arr_time, na.rm = TRUE)),
  integer_sum_na_rm = summarise(flights, sum(arr_time, na.rm = TRUE)),

  integer2_mean_na_rm = summarise(flights, mean(day, na.rm = TRUE)),
  integer2_var_na_rm = summarise(flights, var(day, na.rm = TRUE)),
  integer_2sum_na_rm = summarise(flights, sum(day, na.rm = TRUE)),

  real_mean_na_rm = summarise(flights, mean(arr_delay, na.rm = TRUE)),
  real_var_na_rm = summarise(flights, var(arr_delay, na.rm = TRUE)),
  real_sum_na_rm = summarise(flights, sum(arr_delay, na.rm = TRUE)),

  real2_mean_na_rm = summarise(flights, mean(distance, na.rm = TRUE)),
  real2_var_na_rm = summarise(flights, var(distance, na.rm = TRUE)),
  real2_sum_na_rm = summarise(flights, sum(distance, na.rm = TRUE)),

  unit = "ms",
  times = 10
) %>%
  summary() %>%
  as_tibble()

file <- paste0(git2r::revparse_single(git2r::repository("."), "HEAD")@sha, ".rds")
saveRDS(res, file)

res

The 2 suffix means that no NA values are present.

Comparing results:

master <-
  readRDS("cf542082bcaf408c81d2f4b8c60fe9714989514e.rds") %>% 
  mutate(code = "master")
pr <-
  readRDS("0d22c5653592da9e86d6fb24617a3545a45287e0.rds") %>% 
  mutate(code = "pr")

both <-
  bind_rows(master, pr) %>% 
  mutate(
    type = gsub("2?_.*$", "", expr),
    has_na = !grepl("2", expr),
    fun = gsub("^[^_]+_([^_]+)(_na_rm)?$", "\\1", expr),
    na_rm = grepl("na_rm", expr)
  ) %>%
  select(-expr) %>%
  mutate_at(vars(code, type, fun), funs(factor(., levels = unique(.))))

library(ggplot2)
both %>%
  mutate(time_ms = time / 1e6) %>%
  ggplot(aes(x = fun, y = time_ms, fill = code)) +
  geom_col(position = "dodge") +
  facet_wrap(type + na_rm ~ has_na, labeller = label_both, scales = "free_y", ncol = 2)

both %>%
  tidyr::spread(code, time) %>% 
  lm((pr / master) ~ ., .) %>% 
  summary()
## 
## Call:
## lm(formula = (pr/master) ~ ., data = .)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.053676 -0.030997 -0.009526  0.028278  0.075004 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.10211    0.02141  51.470   <2e-16 ***
## typereal    -0.02679    0.01748  -1.532    0.143    
## has_naTRUE  -0.01588    0.01748  -0.908    0.376    
## funvar       0.01275    0.02141   0.595    0.559    
## funsum       0.00549    0.02141   0.256    0.801    
## na_rmTRUE   -0.04818    0.01748  -2.756    0.013 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04283 on 18 degrees of freedom
## Multiple R-squared:  0.382,  Adjusted R-squared:  0.2103 
## F-statistic: 2.225 on 5 and 18 DF,  p-value: 0.0965