Source for this document.
@dattali asked, “what’s a safe way to iterate over rows of a data frame?” The example was to convert each row into a list and return a list of lists, indexed first by column, then by row.
A number of people gave suggestions on Twitter, which I’ve collected here. I’ve benchmarked these methods with data of various sizes; scroll down to see a plot of times.
library(purrr)
library(dplyr)
library(tidyr)
# @dattali
# Using apply (only safe when all cols are same type)
f_apply <- function(df) {
apply(df, 1, function(row) as.list(row))
}
# @drob
# split + lapply
f_split_lapply <- function(df) {
df <- split(df, seq_len(nrow(df)))
lapply(df, function(row) as.list(row))
}
# @winston_chang
# lapply over row indices
f_lapply_row <- function(df) {
lapply(seq_len(nrow(df)), function(i) as.list(df[i,,drop=FALSE]))
}
# @winston_chang
# lapply + lapply: Treat data frame like a list, and the slice out lists
f_lapply_lapply <- function(df) {
cols <- seq_len(length(df))
names(cols) <- names(df)
lapply(seq_len(nrow(df)), function(row) {
lapply(cols, function(col) {
df[[col]][[row]]
})
})
}
# @winston_chang
# lapply + lapply v2: Same as lapply_lapply, but explicitly convert df to a list
f_lapply_lapply2 <- function(df) {
rows <- seq_len(nrow(df))
cols <- seq_len(length(df))
names(cols) <- names(df)
df <- as.list(df)
lapply(rows, function(row) {
lapply(cols, function(col) {
df[[col]][[row]]
})
})
}
# @winston_chang
# nested_for: Same as lapply_lapply2, but use a for loop instead of lapply()
f_nested_for <- function(df) {
nrows <- nrow(df)
ncols <- length(df)
row_idxs <- seq_len(nrows)
col_idxs <- seq_len(ncols)
colnames <- names(df)
df <- as.list(df)
res <- vector("list", nrows)
for (i in row_idxs) {
row <- vector("list", ncols)
for (j in col_idxs) {
row[[j]] <- df[[j]][[i]]
}
names(row) <- colnames
res[[i]] <- row
}
res
}
# @ Tomasz Kalinowski
# .mapply
f_mapply <- function(df) {
.mapply(list, unclass(df), NULL)
}
# @JennyBryan
# purrr::pmap
f_pmap <- function(df) {
pmap(df, list)
}
# purrr::list_transpose
f_list_transpose <- function(df) {
list_transpose(as.list(df))
}
# purrr::transpose: This is superseded by list_transpose, but the old version is
# much faster.
f_transpose <- function(df) {
transpose(as.list(df))
}
Benchmark each of them, using data sets with varying numbers of rows:
run_benchmark <- function(nrow) {
# Make some data
df <- data.frame(
x = rnorm(nrow),
y = runif(nrow),
z = runif(nrow)
)
res <- list(
apply = system.time(f_apply(df)),
split_lapply = system.time(f_split_lapply(df)),
lapply_row = system.time(f_lapply_row(df)),
lapply_lapply = system.time(f_lapply_lapply(df)),
lapply_lapply2 = system.time(f_lapply_lapply2(df)),
nested_for = system.time(f_nested_for(df)),
mapply = system.time(f_mapply(df)),
pmap = system.time(f_pmap(df)),
list_transpose = system.time(f_list_transpose(df)),
transpose = system.time(f_transpose(df))
)
# Get elapsed times
res <- lapply(res, `[[`, "elapsed")
# Add nrow to front
res <- c(nrow = nrow, res)
res
}
# Run the benchmarks for various size data
all_times <- lapply(1:5, function(n) {
run_benchmark(10^n)
})
# Convert to data frame
times <- lapply(all_times, as.data.frame)
times <- do.call(rbind, times)
knitr::kable(times)
| nrow | apply | split_lapply | lapply_row | lapply_lapply | lapply_lapply2 | nested_for | mapply | pmap | list_transpose | transpose |
|---|---|---|---|---|---|---|---|---|---|---|
| 1e+01 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.004 | 0.000 | 0.002 | 0.001 | 0.004 |
| 1e+02 | 0.001 | 0.003 | 0.003 | 0.003 | 0.003 | 0.000 | 0.001 | 0.001 | 0.004 | 0.002 |
| 1e+03 | 0.002 | 0.015 | 0.014 | 0.008 | 0.002 | 0.001 | 0.001 | 0.001 | 0.037 | 0.001 |
| 1e+04 | 0.018 | 0.162 | 0.155 | 0.085 | 0.022 | 0.005 | 0.005 | 0.010 | 0.381 | 0.002 |
| 1e+05 | 0.299 | 1.813 | 1.518 | 0.872 | 0.214 | 0.043 | 0.071 | 0.105 | 3.625 | 0.011 |
This plot shows the number of seconds needed to process n rows, for each method. Both the x and y use log scales, so each step along the x scale represents a 10x increase in number of rows, and each step along the y scale represents a 10x increase in time.
library(ggplot2)
library(scales)
library(ggrepel)
# Convert to long format
times_long <- gather(times, method, seconds, -nrow)
# Set order of methods, for plots
times_long$method <- factor(times_long$method,
levels = c("apply", "split_lapply", "lapply_row",
"lapply_lapply", "lapply_lapply2", "nested_for", "mapply",
"pmap", "list_transpose", "transpose")
)
# Set up a column for labels
times_long$end_label <- sprintf("%s (%0.2fs)", times_long$method, times_long$seconds)
times_long$end_label[times_long$nrow != max(times_long$nrow)] <- NA
log10_breaks <- trans_breaks("log10", function(x) 10 ^ x)
log10_mbreaks <- function(x) {
limits <- c(floor(log10(x[1])), ceiling(log10(x[2])))
breaks <- 10 ^ seq(limits[1], limits[2])
unlist(lapply(breaks, function(x) x * seq(0.1, 0.9, by = 0.1)))
}
log10_labels <- trans_format("log10", math_format(10 ^ .x))
# Plot with log-log axes
ggplot(times_long, aes(x = nrow, y = seconds, colour = method)) +
geom_point(size = 2) +
geom_line(linewidth = 1) +
geom_label_repel(aes(label = end_label), point.padding = 1,
direction = "y", nudge_x = 1.5) +
annotation_logticks(sides = "trbl") +
guides(colour = "none") +
theme_bw() +
scale_y_log10(
breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks
) +
scale_x_log10(
breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks
)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 40 rows containing missing values (`geom_label_repel()`).