Fixing dplyr’s behavior for summarise() on empty data frames (#3274) led to a review of all hybrid handlers for this corner case. Indeed, some hybrid handlers didn’t account for this in some cases.
is.na(x + NA) and avoided an early return, at least for the REALSXP versionMicrobenchmark to the rescue, using flights dataset with integer and numeric columns that have some NA values or don’t have NA values:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nycflights13)
summarise_at(flights, vars(arr_time, day, arr_delay, distance), funs(sum(is.na(.))))
## # A tibble: 1 x 4
## arr_time day arr_delay distance
## <int> <int> <int> <int>
## 1 8713 0 9430 0
Benchmarking code, run for both master and the pull request:
library(nycflights13)
library(dplyr)
res <- microbenchmark::microbenchmark(
integer_mean = summarise(flights, mean(arr_time)),
integer_var = summarise(flights, var(arr_time)),
integer_sum = summarise(flights, sum(arr_time)),
integer2_mean = summarise(flights, mean(day)),
integer2_var = summarise(flights, var(day)),
integer2_sum = summarise(flights, sum(day)),
real_mean = summarise(flights, mean(arr_delay)),
real_var = summarise(flights, var(arr_delay)),
real_sum = summarise(flights, sum(arr_delay)),
real2_mean = summarise(flights, mean(distance)),
real2_var = summarise(flights, var(distance)),
real2_sum = summarise(flights, sum(distance)),
integer_mean_na_rm = summarise(flights, mean(arr_time, na.rm = TRUE)),
integer_var_na_rm = summarise(flights, var(arr_time, na.rm = TRUE)),
integer_sum_na_rm = summarise(flights, sum(arr_time, na.rm = TRUE)),
integer2_mean_na_rm = summarise(flights, mean(day, na.rm = TRUE)),
integer2_var_na_rm = summarise(flights, var(day, na.rm = TRUE)),
integer_2sum_na_rm = summarise(flights, sum(day, na.rm = TRUE)),
real_mean_na_rm = summarise(flights, mean(arr_delay, na.rm = TRUE)),
real_var_na_rm = summarise(flights, var(arr_delay, na.rm = TRUE)),
real_sum_na_rm = summarise(flights, sum(arr_delay, na.rm = TRUE)),
real2_mean_na_rm = summarise(flights, mean(distance, na.rm = TRUE)),
real2_var_na_rm = summarise(flights, var(distance, na.rm = TRUE)),
real2_sum_na_rm = summarise(flights, sum(distance, na.rm = TRUE)),
unit = "ms",
times = 10
) %>%
summary() %>%
as_tibble()
file <- paste0(git2r::revparse_single(git2r::repository("."), "HEAD")@sha, ".rds")
saveRDS(res, file)
res
The 2 suffix means that no NA values are present.
Comparing results:
master <-
readRDS("cf542082bcaf408c81d2f4b8c60fe9714989514e.rds") %>%
mutate(code = "master")
pr <-
readRDS("0d22c5653592da9e86d6fb24617a3545a45287e0.rds") %>%
mutate(code = "pr")
both <-
bind_rows(master, pr) %>%
mutate(
type = gsub("2?_.*$", "", expr),
has_na = !grepl("2", expr),
fun = gsub("^[^_]+_([^_]+)(_na_rm)?$", "\\1", expr),
na_rm = grepl("na_rm", expr)
) %>%
select(-expr) %>%
mutate_at(vars(code, type, fun), funs(factor(., levels = unique(.))))
library(ggplot2)
both %>%
mutate(time_ms = time / 1e6) %>%
ggplot(aes(x = fun, y = time_ms, fill = code)) +
geom_col(position = "dodge") +
facet_wrap(type + na_rm ~ has_na, labeller = label_both, scales = "free_y", ncol = 2)
both %>%
tidyr::spread(code, time) %>%
lm((pr / master) ~ ., .) %>%
summary()
##
## Call:
## lm(formula = (pr/master) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053676 -0.030997 -0.009526 0.028278 0.075004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.10211 0.02141 51.470 <2e-16 ***
## typereal -0.02679 0.01748 -1.532 0.143
## has_naTRUE -0.01588 0.01748 -0.908 0.376
## funvar 0.01275 0.02141 0.595 0.559
## funsum 0.00549 0.02141 0.256 0.801
## na_rmTRUE -0.04818 0.01748 -2.756 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04283 on 18 degrees of freedom
## Multiple R-squared: 0.382, Adjusted R-squared: 0.2103
## F-statistic: 2.225 on 5 and 18 DF, p-value: 0.0965