本文档描述R中apply函数族的用法。

1. apply

apply(array, margin, function, ...)
Applies a function to sections of an array and returns the results in an array.

m <- matrix(1:16, ncol = 4)
print(m)
     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
[3,]    3    7   11   15
[4,]    4    8   12   16
apply(m, 1, sum)  # 按行
[1] 28 32 36 40
apply(m, 2, sum)  # 按列
[1] 10 26 42 58
apply(m, 1, function(x) sum(x) + 2)  # 使用自定义(匿名)函数
[1] 30 34 38 42

2. lapply

lapply(list, function, ...)
Applies a function to elements in a list or a vector and returns the results in a list.

df1 <- data.frame(m)
print(df1)
  X1 X2 X3 X4
1  1  5  9 13
2  2  6 10 14
3  3  7 11 15
4  4  8 12 16
lapply(df1, sum)
$X1
[1] 10

$X2
[1] 26

$X3
[1] 42

$X4
[1] 58
# 使用lapply代替显式循环
lapply(1:5, function(x) x^2)
[[1]]
[1] 1

[[2]]
[1] 4

[[3]]
[1] 9

[[4]]
[1] 16

[[5]]
[1] 25

3. sapply

sapply(list, function, ..., simplify = TRUE)
Applies a function to elements in a list and returns the results in a vector, matrix or a list.

# 返回向量
sapply(df1, sum)
X1 X2 X3 X4 
10 26 42 58 
# 若function返回向量,则sapply返回矩阵
sapply(df1, range)
     X1 X2 X3 X4
[1,]  1  5  9 13
[2,]  4  8 12 16

4. vapply

vapply(list, function, function.value, ...)

# vapply指定返回值的类型,若与指定的类型不符就会报错
vapply(df1, fivenum, c(Min. = 0, "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))  
         X1  X2   X3   X4
Min.    1.0 5.0  9.0 13.0
1st Qu. 1.5 5.5  9.5 13.5
Median  2.5 6.5 10.5 14.5
3rd Qu. 3.5 7.5 11.5 15.5
Max.    4.0 8.0 12.0 16.0
vapply(df1, fivenum, c(Min. = "", "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))
Error in vapply(df1, fivenum, c(Min. = "", `1st Qu.` = 0, Median = 0, : 值的种类必需是'character',
 但FUN(X[[1]])结果的种类却是'double'

5. tapply

tapply(array, indices, function, ..., simplify = TRUE)
Applies a function to each cell of a ragged array.

x1 <- runif(16)
index1 <- rep(1:4, 4)  # 分组变量1
index2 <- c(rep(1, 8), rep(2, 8))  # 分组变量2
df2 <- data.frame(x1, index1, index2)
print(df2)
           x1 index1 index2
1  0.97019462      1      1
2  0.98782881      2      1
3  0.95165966      3      1
4  0.88527474      4      1
5  0.34325478      1      1
6  0.66533882      2      1
7  0.62725130      3      1
8  0.34001986      4      1
9  0.31779573      1      2
10 0.20940271      2      2
11 0.05079445      3      2
12 0.72363907      4      2
13 0.45179433      1      2
14 0.11114236      2      2
15 0.71166645      3      2
16 0.33201999      4      2
tapply(df2$x1, df2$index1, mean)  # 按分组变量1求各组均值
        1         2         3         4 
0.5207599 0.4934282 0.5853430 0.5702384 
tapply(df2$x1, list(df2$index1, df2$index2), mean)  # 按分组变量1和分组变量2的交叉求各组均值
          1         2
1 0.6567247 0.3847950
2 0.8265838 0.1602725
3 0.7894555 0.3812304
4 0.6126473 0.5278295

6. mapply

mapply(function, ..., MoreArgs = NULL, simplify = TRUE)
Apply a function to multiple list or vector arguments.

mapply(rep, 1:5, 5:1)
[[1]]
[1] 1 1 1 1 1

[[2]]
[1] 2 2 2 2

[[3]]
[1] 3 3 3

[[4]]
[1] 4 4

[[5]]
[1] 5
mapply(sum, 1:5, MoreArgs = list(x = 10))
[1] 11 12 13 14 15

7. sweep

sweep(array, margin, stats, function, ...)
Returns an array like the input array with stats swept out.

colMeans(df1)  # df1每列均值
  X1   X2   X3   X4 
 2.5  6.5 10.5 14.5 
sweep(df1, 2, colMeans(df1), "-")  # df1每列数据减去当列均值
    X1   X2   X3   X4
1 -1.5 -1.5 -1.5 -1.5
2 -0.5 -0.5 -0.5 -0.5
3  0.5  0.5  0.5  0.5
4  1.5  1.5  1.5  1.5

8. by & aggregate

by(data.frame, indices, function, ..., simplify = TRUE)
by() is an object-oriented wrapper for tapply applied to data frames.

# by是tapply的面向对象版本
by(df2$x1, list(df2$index1, df2$index2), mean)
: 1
: 1
[1] 0.6567247
-------------------------------------------------------- 
: 2
: 1
[1] 0.8265838
-------------------------------------------------------- 
: 3
: 1
[1] 0.7894555
-------------------------------------------------------- 
: 4
: 1
[1] 0.6126473
-------------------------------------------------------- 
: 1
: 2
[1] 0.384795
-------------------------------------------------------- 
: 2
: 2
[1] 0.1602725
-------------------------------------------------------- 
: 3
: 2
[1] 0.3812304
-------------------------------------------------------- 
: 4
: 2
[1] 0.5278295

aggregate(data.frame, indices, function, ..., simplify = TRUE)
Splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form.

# aggregate可以使用返回值为多值的函数
aggregate(df2$x1, list(df2$index1, df2$index2), fivenum)
  Group.1 Group.2        x.1        x.2        x.3        x.4        x.5
1       1       1 0.34325478 0.34325478 0.65672470 0.97019462 0.97019462
2       2       1 0.66533882 0.66533882 0.82658381 0.98782881 0.98782881
3       3       1 0.62725130 0.62725130 0.78945548 0.95165966 0.95165966
4       4       1 0.34001986 0.34001986 0.61264730 0.88527474 0.88527474
5       1       2 0.31779573 0.31779573 0.38479503 0.45179433 0.45179433
6       2       2 0.11114236 0.11114236 0.16027253 0.20940271 0.20940271
7       3       2 0.05079445 0.05079445 0.38123045 0.71166645 0.71166645
8       4       2 0.33201999 0.33201999 0.52782953 0.72363907 0.72363907