# data.table versus dplyr

This code shows the difference in execution time, and in the rate at which execution time increases, for an analysis function coded in two different approaches: 1) data.table, and 2) dplyr. This analysis is the result of working with this function and using profvis to optimize it. The end result is that the data.table version is much faster across all simulations and scales much, much better than the dplyr version. Please read through and check out the profvis results: data.table and dplyr.
A few notes:

- I used profvis to profile the execution time of each function; the full results are here for dplyr and here for data.table.
- I like dplyr and use it for everything. This is not intended as a critique, just some information on execution speed.
- My use of dplyr may not be optimum and may be contributing to the speed issues.

library("ggplot2")
library("profvis")
library("dplyr")
library("tidyr")
library("knitr")
The function being tested here computes the aoristic weight for a series of simulated data. What an aoristic weight is really doesn't have much to do with this post, but if you are interested, more information can be found here. This somewhat obscure function is useful here because it is an example of the sort of function that an analyst would have to code themselves, and therefore run into the decision of using dplyr or data.table. Obviously, you could write this in C++ or do something else to optimize it if that was your priority. However, this is a more realistic comparison (IMHO), as much of an analyst's time is spent balancing optimization versus getting-stuff-done.
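To make the idea concrete, here is a toy calculation (hypothetical values, not part of the simulation below): an event of known duration contributes bin.width / duration to each fixed-width time bin it overlaps, so the weights sum to 1 when the event spans whole bins.

```r
# Toy aoristic weight calculation (hypothetical event, for illustration only):
# an event spanning years 200-500 with 100-year bins overlaps three bins,
# and each bin receives bin.width / duration of the event.
bin.width <- 100
event_start <- 200
event_end <- 500
duration <- event_end - event_start     # 300 years
n_bins <- duration / bin.width          # 3 overlapped bins
weight_per_bin <- bin.width / duration  # 1/3 per bin
sum(rep(weight_per_bin, n_bins))        # weights sum to 1
```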
Note: there is a little weirdness with negative dates and start/end dates. This is because I started out with BCE dates and then switched to years BP dates to match another analysis. So, it is a little bit of a hodge-podge in dealing with BCE and BP.
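Concretely, the conversion in the simulation loop below looks like this (a toy example with made-up values):

```r
# Toy example of the date handling (made-up values): simulated dates start as
# negative calendar-style years (BCE), get capped at year 0, and are then
# converted to positive years BP with abs().
begin_bce <- -3000
end_bce <- min(begin_bce + 3500, 0)  # cap at year 0, as the loop's ifelse() does
c(begin_BP = abs(begin_bce), end_BP = abs(end_bce))
```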
time_step_size <- 100 # years
seq_begin <- -5000 # years ago
seq_end <- 0 # years ago
### BE AWARE! - max sample sizes over 10K, up to 50K, can take a fairly long time!
max_sim_sites <- 50000
sim_site_count_seq <- c(10,seq(50,max_sim_sites,50))
Below are the two versions of the same function.
## data.table version

my_aorist_DT <- function(events, start.date = 0, end.date = 2000, bin.width = 100) {
require(data.table)
setDT(events)
time_steps <- data.table(
bin.no = seq_len(abs(end.date)/bin.width),
Start = seq(start.date,(end.date-bin.width), by = bin.width),
End = seq((start.date+bin.width),end.date, by = bin.width))
setkey(time_steps, Start, End)
overlap_names <- data.table::foverlaps(events, time_steps,
type = "any", which = FALSE)
overlap_names <- overlap_names[i.Start != End & i.End != Start]
overlap_names[, duration := (i.End - i.Start)]
overlap_names[, W := (bin.width / duration)]
ov_sum <- overlap_names[, .(aorist = sum(W),
median_step_W = median(W),
site_count = .N), keyby=.(bin.no)]
setkey(ov_sum, bin.no)
setkey(time_steps, bin.no)
ov_sum2 <- ov_sum[time_steps, nomatch = NA]
ov_sum2[is.na(ov_sum2)] <- 0
ov_sum2[, bin := paste0(Start, "-", End)]
return(ov_sum2)
}
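As a quick sanity check, the data.table version can be run on a tiny hand-made events table (a sketch; the Start, End, and name columns mirror what the simulation loop below constructs, in years BP):

```r
# Tiny hand-made events table (hypothetical values) to sanity-check the function.
toy_events <- data.frame(name = 1:3,
                         Start = c(0, 100, 400),
                         End = c(300, 500, 600))
my_aorist_DT(toy_events, end.date = 1000, bin.width = 100)
```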
## dplyr version

my_aorist_dplyr <- function(events, weight = 1, start.date = 0, end.date = 2000,
bin.width = 100, round_int = 4) {
require(dplyr)
require(data.table)
time_steps <- data.frame(
bin.no = seq_len(abs(end.date)/bin.width),
Start = seq(start.date,(end.date-bin.width), by = bin.width),
End = seq((start.date+bin.width),end.date, by = bin.width))
setDT(time_steps)
setDT(events)
setkey(time_steps, Start, End)
overlap_names <- data.table::foverlaps(events, time_steps,
type = "any", which = FALSE) %>%
filter(i.Start != End & i.End != Start)
aorist <- overlap_names %>%
data.frame() %>%
group_by(name) %>%
mutate(site_count = n(),
W = (bin.width / (i.End - i.Start)),
W = round(W,round_int)) %>%
arrange(name, bin.no) %>%
group_by(bin.no) %>%
summarise(aorist = sum(W),
median_step_W = median(W),
site_count = n()) %>%
right_join(., time_steps, by = "bin.no") %>%
replace_na(list(aorist = 0, site_count = 0, median_step_W = 0)) %>%
mutate(bin = paste0(Start, "-", End)) %>%
dplyr::select(bin, bin.no, aorist)
return(aorist)
}
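Both versions should produce the same aoristic weights; here is a sketch of an equivalence check using the toy_events table defined above (the tolerance absorbs the rounding applied inside the dplyr version):

```r
# The two implementations should agree on the aoristic weights (toy_events as
# defined above); merge on bin.no so the comparison is order-independent.
res_DT <- my_aorist_DT(toy_events, end.date = 1000, bin.width = 100)
res_dplyr <- my_aorist_dplyr(toy_events, end.date = 1000, bin.width = 100)
cmp <- merge(as.data.frame(res_DT)[, c("bin.no", "aorist")],
             as.data.frame(res_dplyr)[, c("bin.no", "aorist")],
             by = "bin.no", suffixes = c("_DT", "_dplyr"))
all.equal(cmp$aorist_DT, cmp$aorist_dplyr, tolerance = 1e-3)
```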
This loop iterates over the 1001 site/event counts in sim_site_count_seq. For each iteration, both functions are executed and the time each takes to run is recorded in a table.
# set up container for results
execute_time_results <- matrix(nrow = length(sim_site_count_seq), ncol = 4)
colnames(execute_time_results) <- c("site_count", "dplyr", "data.table", "RMSE")
# commented out progress bar, but might be good for you
# pb <- txtProgressBar(min = 0, max = length(sim_site_count_seq), style = 3, char="*")
# for i in the number of different site_counts
for(i in seq_along(sim_site_count_seq)){
sim_site_count <- sim_site_count_seq[i]
# simulate a set of sites
sites <- data.frame(
name = seq_len(sim_site_count),
begin = sample(seq(seq_begin,seq_end, by = 100), sim_site_count, replace = TRUE)) %>%
mutate(end = begin + sample(seq(100, 3000, by = 100),
length(begin), replace = TRUE),
end = ifelse(end > 0, 0, end)) %>%
dplyr::select(begin, end, name) %>%
mutate(begin = abs(begin), end = abs(end)) %>%
dplyr::rename(Start = end, End = begin)
## run functions with Sys.time() to get execution time
aor_dplyr.start.time <- Sys.time()
aor_dplyr <- my_aorist_dplyr(sites, end.date = 5000, bin.width = 100)
aor_dplyr.end.time <- Sys.time()
aor_DT.start.time <- Sys.time()
aor_DT <- my_aorist_DT(sites, end.date = 5000, bin.width = 100)
aor_DT.end.time <- Sys.time()
# gather execution time results
# difftime() with explicit units keeps all measurements in seconds
aor_dplyr.execute.time <- as.numeric(difftime(aor_dplyr.end.time, aor_dplyr.start.time, units = "secs"))
aor_DT.execute.time <- as.numeric(difftime(aor_DT.end.time, aor_DT.start.time, units = "secs"))
execute_time_results[i,1] <- sim_site_count
execute_time_results[i,2] <- aor_dplyr.execute.time
execute_time_results[i,3] <- aor_DT.execute.time
# RMSE: square the differences before averaging
execute_time_results[i,4] <- round(sqrt(mean((aor_DT$aorist - aor_dplyr$aorist)^2)), 5)
# commented out progress bar advance
# setTxtProgressBar(pb, i)
}
# commented out progress bar close.
# close(pb)
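For what it's worth, Sys.time() deltas are a fairly coarse way to time code; system.time() is a drop-in alternative that reports elapsed seconds directly (a sketch, using the sites table left over from the final loop iteration):

```r
# Alternative timing with system.time() (a sketch): "elapsed" is wall-clock
# seconds, which sidesteps difftime unit surprises entirely.
elapsed_DT <- system.time(
  my_aorist_DT(sites, end.date = 5000, bin.width = 100)
)["elapsed"]
elapsed_DT
```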
The execution-time results are summarized below to make some sense of them.
# add time difference
execute_time_results <- data.frame(execute_time_results) %>%
mutate(diff = dplyr - data.table)
# summarise execution results
execute_time_summary <- execute_time_results %>%
summarise(site_count_min = floor(min(site_count)),
site_count_max = floor(max(site_count)),
mean_dplyr = round(mean(dplyr),3),
mean_DT = round(mean(data.table),3),
max_diff = round(max(diff),3), # largest dplyr disadvantage
min_diff = round(min(diff),4), # negative when data.table was slower
mean_diff = round(mean(diff),3),
mean_RMSE = mean(RMSE)) %>%
t()
colnames(execute_time_summary) <- "measure"
# print results
kable(execute_time_summary, digits = 3)
| | measure |
|---|---|
| site_count_min | 10.000 |
| site_count_max | 50000.000 |
| mean_dplyr | 0.995 |
| mean_DT | 0.095 |
| max_diff | 1.981 |
| min_diff | -0.002 |
| mean_diff | 0.900 |
| mean_RMSE | 0.006 |
# http://stackoverflow.com/a/17257422/2259277
number_ticks <- function(n){
function(limits) pretty(limits, n)
}
# gather results for plotting
plot_dat <- execute_time_results %>%
dplyr::select(site_count, dplyr, data.table) %>%
gather(model, time, -site_count)
ggplot(plot_dat, aes(x = site_count, y = time, group = model, color = model)) +
geom_smooth(size = 1, method = "loess") +
scale_y_continuous(breaks = number_ticks(20)) +
theme_bw()
# look at execution rate over samples
aor_dplyr_lm <- lm(dplyr ~ site_count, data = execute_time_results)
aor_dplyr_lm <- (as.numeric(coef(aor_dplyr_lm))[2]) * 10000
aor_DT_lm <- lm(data.table ~ site_count, data = execute_time_results)
aor_DT_lm <- (as.numeric(coef(aor_DT_lm))[2]) * 10000
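One way to pull the per-size differences quoted in the next paragraph (a sketch, assuming they come straight from the results table; recall diff = dplyr - data.table):

```r
# Differences at the three site counts quoted below (a sketch; these rows
# exist because 10, 24950, and 50000 are all in sim_site_count_seq).
execute_time_results %>%
  filter(site_count %in% c(10, 24950, 50000)) %>%
  dplyr::select(site_count, diff)
```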
The data.table approach is 0.124 seconds faster at 10 sites/events, is 0.922 seconds faster at 24,950 sites/events, and is 1.735 seconds faster at 50,000 sites/events. For every 10,000 sites/events added, the data.table approach's execution time increases by 0.0277 seconds, while dplyr's increases by 0.3765 seconds. Finally, profvis profiles a single run of each function:
profvis({
aor_dplyr_profile <- my_aorist_dplyr(sites, end.date = 5000, bin.width = 100)
})
profvis({
aor_DT_profile <- my_aorist_DT(sites, end.date = 5000, bin.width = 100)
})
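If you want to share the interactive profiles (as linked at the top of this post), the profvis objects can be saved as standalone HTML pages; a sketch, assuming the htmlwidgets package is installed:

```r
# Save a profvis result as a standalone HTML page (sketch; requires htmlwidgets).
p <- profvis({
  my_aorist_DT(sites, end.date = 5000, bin.width = 100)
})
htmlwidgets::saveWidget(p, "profvis_data_table.html")
```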