apply family

# Global chunk options: set warning, error, message and tidy to FALSE for
# every chunk of this document.
opts_chunk$set(warning = FALSE, error = FALSE, message = FALSE, tidy = FALSE)

Dummy datasets (a data.frame, and a list of data.frames):

# Example data.frame: the built-in iris measurements.
df <- iris
# Example list: three data.frames of standard-normal noise (10 rows, columns
# x and y), labelled A, B and C.
ld <- setNames(
  replicate(3, data.frame(x = rnorm(10), y = rnorm(10)), simplify = FALSE),
  LETTERS[1:3]
)
str(df)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Show the structure of the list of data.frames.
str(ld)
## List of 3
##  $ A:'data.frame':   10 obs. of  2 variables:
##   ..$ x: num [1:10] 0.318 -0.107 -0.772 -0.899 -0.256 ...
##   ..$ y: num [1:10] -1.653 0.749 -0.799 1.283 -1.044 ...
##  $ B:'data.frame':   10 obs. of  2 variables:
##   ..$ x: num [1:10] -0.292 -0.45 -1.265 -0.642 0.446 ...
##   ..$ y: num [1:10] 1.207 0.14 0.969 -0.231 0.376 ...
##  $ C:'data.frame':   10 obs. of  2 variables:
##   ..$ x: num [1:10] -1.314 -1.892 -1.6 -0.573 0.854 ...
##   ..$ y: num [1:10] 1.0367 0.8039 -0.0995 -1.48 -1.2969 ...

Naming convention: **ply functions (namely llply, ldply, laply, ddply, dlply, daply, alply, adply, aaply) follow a convenient nomenclature whereby the first letter refers to the type of the input (a for array, d for data.frame, l for list) and the second letter to the type of the output. Thus, llply works on each element of a list and returns a list, while adply works on each slice of an array and returns a data.frame. Of course, the function applied to each subset should have compatible behaviour.

Here's an example:

# Apply transform to every data.frame in ld, adding a column z = y^2, and
# show a text progress bar while doing so.
llply(ld, transform, z = y^2, .progress = "text")
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |=================================================================| 100%
## $A
##           x       y      z
## 1   0.31791 -1.6533 2.7332
## 2  -0.10695  0.7486 0.5603
## 3  -0.77196 -0.7995 0.6392
## 4  -0.89935  1.2832 1.6467
## 5  -0.25630 -1.0443 1.0905
## 6   0.11536  0.8378 0.7019
## 7   0.02464 -0.9568 0.9154
## 8   0.24214 -0.4570 0.2088
## 9  -1.18019  2.0997 4.4086
## 10  0.18165 -1.3804 1.9056
## 
## $B
##          x        y         z
## 1  -0.2923  1.20718 1.4572902
## 2  -0.4495  0.13960 0.0194879
## 3  -1.2654  0.96869 0.9383669
## 4  -0.6421 -0.23099 0.0533545
## 5   0.4462  0.37605 0.1414125
## 6  -0.1208  1.70897 2.9205749
## 7   1.5694 -0.65589 0.4301878
## 8  -0.9846 -1.17199 1.3735510
## 9  -0.8279 -0.02682 0.0007191
## 10 -1.0417 -0.46854 0.2195334
## 
## $C
##          x        y        z
## 1  -1.3138  1.03667 1.074695
## 2  -1.8917  0.80394 0.646313
## 3  -1.6004 -0.09949 0.009899
## 4  -0.5732 -1.48004 2.190524
## 5   0.8537 -1.29694 1.682041
## 6   0.5151 -0.77693 0.603619
## 7  -1.2700  1.64763 2.714681
## 8   0.2653 -0.13564 0.018398
## 9   0.3490  0.36592 0.133900
## 10  1.0267  1.28048 1.639630

We applied the function transform to each data.frame in the list, adding a new variable z defined as z=y^2. If the calculation is slow, it is sometimes nice to have a progress bar (default is none).

Sometimes the function only has side-effects and we don't care about saving results; a typical example is plotting. There's a special output character for this, `_`, which discards the results:

# Call plot on each element of ld; only the side effect (the plots) matters.
l_ply(ld, plot)

plot of chunk l_ply plot of chunk l_ply plot of chunk l_ply

Working on data.frames, one needs to specify how to form the subsets,

# Split df by Species and reduce each subset to one row holding the largest
# sepal length and the standard deviation of the petal width.
ddply(
  df,
  "Species",
  summarise,
  maxsepallength = max(Sepal.Length),
  petalsd = sd(Petal.Width)
)
##      Species maxsepallength petalsd
## 1     setosa            5.8  0.1054
## 2 versicolor            7.0  0.1978
## 3  virginica            7.9  0.2747

where summarise is a convenient function to create a new data.frame with the arguments (…) that follow in the call.

Similarly, working on arrays, one specifies which margin to slice.

Finally, my favorite is m*ply, which takes multiple input arguments:

# Evaluate a Gaussian or Lorentzian peak on a regular grid of 200 points
# spanning nu0 - 5 * gamma to nu0 + 5 * gamma.
#
# Arguments:
#   nu0    centre of the peak.
#   gamma  width parameter: the sd passed to dnorm() for "gauss", the gamma
#          term of the Lorentzian formula for "lorentz".
#   I0     multiplicative factor applied to the line shape.
#   fun    line shape, "gauss" (default) or "lorentz"; checked by match.arg().
#   offset constant added to every value.
#   ...    extra arguments forwarded to dnorm(); only used when fun = "gauss".
#
# Returns a data.frame with columns x (the grid) and value (the scaled line
# shape plus offset).
peak <- function(nu0 = 100, gamma = 10, I0 = 1,
                 fun = c("gauss", "lorentz"), offset = 0, ...) {
  fun <- match.arg(fun)
  # length.out is spelled out in full to avoid partial argument matching.
  x <- seq(nu0 - 5 * gamma, nu0 + 5 * gamma, length.out = 200)
  result <- switch(fun,
    gauss = I0 * dnorm(x, nu0, gamma, ...),
    lorentz = I0 * gamma / pi / ((x - nu0)^2 + gamma^2)
  )

  data.frame(x = x, value = result + offset)
}

# Build every combination of peak centre (nu0), width (gamma) and line shape
# (fun); fun is kept as character rather than converted to a factor.
params <- expand.grid(
  nu0 = c(100, 120, 130),
  # length.out is spelled out in full to avoid partial argument matching.
  gamma = seq(10, 30, length.out = 3),
  fun = c("gauss", "lorentz"),
  stringsAsFactors = FALSE
)
head(params)
##   nu0 gamma   fun
## 1 100    10 gauss
## 2 120    10 gauss
## 3 130    10 gauss
## 4 100    20 gauss
## 5 120    20 gauss
## 6 130    20 gauss

# Apply peak to every row of params (each column supplies one argument, with
# offset = 2 passed to every call) and combine the results into one data.frame.
# NOTE(review): `all` is also the name of a base R function; a more
# distinctive variable name would be clearer.
all <- mdply(params, peak, offset=2)
str(all)
## 'data.frame':    3600 obs. of  5 variables:
##  $ nu0  : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ gamma: num  10 10 10 10 10 10 10 10 10 10 ...
##  $ fun  : chr  "gauss" "gauss" "gauss" "gauss" ...
##  $ x    : num  50 50.5 51 51.5 52 ...
##  $ value: num  2 2 2 2 2 ...

The results are in long format, and are thus easily visualised with ggplot2:

library(ggplot2)
# Draw value against x as lines: colour encodes gamma, line type encodes fun,
# and each nu0 gets its own panel (single column, free scales).
ggplot(all, aes(x, value, colour=factor(gamma), linetype=factor(fun))) +
  facet_wrap(~nu0, ncol=1, scales="free") + 
  geom_line() +
  theme_minimal() +
  labs(colour="Width", linetype="Function")

plot of chunk plotting