knitr::opts_chunk$set(echo = TRUE)

apply函数簇提供一种计算框架,我们把这个框架记住了,编写相应的函数放进去可以循环地实现我们的目的。编写处理函数,也就是解决问题的核心应该是我们应该关心的,而不是如何来执行这个函数,这个功能交给apply家族(当然还有一些其他的函数)。

1 apply

apply函数是最常用的代替for循环的函数。apply函数可以对矩阵、数据框、数组(二维、多维),按行或列进行循环计算,对子元素进行迭代,并把子元素以参数传递的形式给自定义的FUN函数中,并以返回计算结果。

library(DT)
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(purrr)
library(blogdown)
library(knitr)
knitr::include_graphics("apply.png")

x_df <- data.frame(x = 1:10,
                   y = letters[1:10],
                   z = rep(c("A","B"),each = 5))
str(x_df)
## 'data.frame':    10 obs. of  3 variables:
##  $ x: int  1 2 3 4 5 6 7 8 9 10
##  $ y: Factor w/ 10 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10
##  $ z: Factor w/ 2 levels "A","B": 1 1 1 1 1 2 2 2 2 2
y_df <- data.frame(x = 1:10,
                   y = letters[1:10],
                   z = rep(c("A","B"),each = 5),stringsAsFactors = FALSE)
str(y_df)
## 'data.frame':    10 obs. of  3 variables:
##  $ x: int  1 2 3 4 5 6 7 8 9 10
##  $ y: chr  "a" "b" "c" "d" ...
##  $ z: chr  "A" "A" "A" "A" ...
# 我们发现data_frame自动将字符识别为factor形式,可以添加参数stringsAsFactors = FALSE
apply(y_df,2,length)
##  x  y  z 
## 10 10 10
apply(y_df,1,length)
##  [1] 3 3 3 3 3 3 3 3 3 3
x <- tibble(a = 1:10,
            b = 2:11)
near(apply(x,2,identity),x)
##          a    b
##  [1,] TRUE TRUE
##  [2,] TRUE TRUE
##  [3,] TRUE TRUE
##  [4,] TRUE TRUE
##  [5,] TRUE TRUE
##  [6,] TRUE TRUE
##  [7,] TRUE TRUE
##  [8,] TRUE TRUE
##  [9,] TRUE TRUE
## [10,] TRUE TRUE
ma <- matrix(c(1:4,1,6:8),nrow = 2)
apply(ma,1, table)
## [[1]]
## 
## 1 3 7 
## 2 1 1 
## 
## [[2]]
## 
## 2 4 6 8 
## 1 1 1 1
# 值得注意的是,apply函数时可以针对数组进行计算你的,就是说数组未必是2维的
z <- array(1:24, dim = 2:4)
zseq <- apply(z, 1:2, function(x) seq_len(max(x)))
z
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]   13   15   17
## [2,]   14   16   18
## 
## , , 4
## 
##      [,1] [,2] [,3]
## [1,]   19   21   23
## [2,]   20   22   24
apply(z, 1, function(x) max(x))
## [1] 23 24
apply(z, 2, max)
## [1] 20 22 24
apply(z,3,max)
## [1]  6 12 18 24
ma <- matrix(c(rnorm(100)), nrow = 10)
dat<-ma
table(apply(dat,2,function(x) sum(x>0.5) )>2) # 列求和大于2
## 
## FALSE  TRUE 
##     3     7
table(apply(dat,1,function(x) sum(x>0.5) )>3) # 行求和大于3
## 
## FALSE  TRUE 
##     6     4
dat <- dat[apply(dat,1,function(x) sum(x>.5) )>3,
        apply(dat,2,function(x) sum(x>.5) )>2]
# 按行列求平均值和标准差
x_function <- function(x){
  c(n = sum(!is.na(x)),
  mean = mean(x),
  min = min(x),
  max = max(x))
}
ma <- matrix(c(rnorm(100)), nrow = 10)
ma %>% datatable()
apply(ma,2,function(x){max = max(x)})
##  [1] 1.5169424 0.9931905 1.6149066 1.1810736 0.2338709 1.5257351 1.6666046
##  [8] 3.1679849 1.8535513 2.7100855
apply(ma,2,FUN = x_function)
##            [,1]       [,2]        [,3]       [,4]       [,5]        [,6]
## n    10.0000000 10.0000000 10.00000000 10.0000000 10.0000000 10.00000000
## mean  0.1033192 -0.2833261  0.05821862 -0.7385798 -0.6408561 -0.01056467
## min  -1.2419795 -2.3682003 -0.99672982 -2.5044803 -1.5155723 -1.74879317
## max   1.5169424  0.9931905  1.61490660  1.1810736  0.2338709  1.52573513
##           [,7]       [,8]       [,9]      [,10]
## n    10.000000 10.0000000 10.0000000 10.0000000
## mean -0.122865  0.3681661  0.4728179  0.3862402
## min  -2.271421 -1.6655034 -1.1428838 -0.8655464
## max   1.666605  3.1679849  1.8535513  2.7100855

2 lapply

lapply函数是一个最基础循环操作函数之一,用来对list、data.frame数据集进行循环,并返回和X长度同样的list结构作为结果集,通过lapply的开头的第一个字母’l’就可以判断返回结果集的类型。

Usage
lapply(X, FUN, ...)
Arguments
X   a vector (atomic or list) or an `[expression](http://127.0.0.1:22572/help/library/base/help/expression)` object. Other objects (including classed objects) will be coerced by`base::[as.list](http://127.0.0.1:22572/help/library/base/help/as.list)`.

FUN   the function to be applied to each element of `X`: see ‘Details’. In the case of functions like `+`, `%*%`, the function name must be backquoted or quoted.
# Examples
require(stats); require(graphics)
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
x
## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $beta
## [1]  0.04978707  0.13533528  0.36787944  1.00000000  2.71828183  7.38905610
## [7] 20.08553692
## 
## $logic
## [1]  TRUE FALSE FALSE  TRUE
# compute the list mean for each list element
lapply(x, mean)
## $a
## [1] 5.5
## 
## $beta
## [1] 4.535125
## 
## $logic
## [1] 0.5
sapply(x, mean)
##        a     beta    logic 
## 5.500000 4.535125 0.500000
# median and quartiles for each list element
lapply(x, quantile, probs = 1:3/4)
## $a
##  25%  50%  75% 
## 3.25 5.50 7.75 
## 
## $beta
##       25%       50%       75% 
## 0.2516074 1.0000000 5.0536690 
## 
## $logic
## 25% 50% 75% 
## 0.0 0.5 1.0
sapply(x, quantile)
##          a        beta logic
## 0%    1.00  0.04978707   0.0
## 25%   3.25  0.25160736   0.0
## 50%   5.50  1.00000000   0.5
## 75%   7.75  5.05366896   1.0
## 100% 10.00 20.08553692   1.0

lapply就可以很方便地把list数据集进行循环操作,还可以用data.frame数据集按列进行循环,但如果传入的数据集是一个向量或矩阵对象,那么直接使用lapply就不能达到想要的效果了。

list(x = 1:10, y = exp(-3:3),logic = c(TRUE,FALSE,FALSE,TRUE))$x %>% class()
## [1] "integer"
x <- cbind(x1=3, x2=c(2:1,4:5))
x; class(x)
##      x1 x2
## [1,]  3  2
## [2,]  3  1
## [3,]  3  4
## [4,]  3  5
## [1] "matrix"
lapply(x, quantile, probs = 1:3/4)
## [[1]]
## 25% 50% 75% 
##   3   3   3 
## 
## [[2]]
## 25% 50% 75% 
##   3   3   3 
## 
## [[3]]
## 25% 50% 75% 
##   3   3   3 
## 
## [[4]]
## 25% 50% 75% 
##   3   3   3 
## 
## [[5]]
## 25% 50% 75% 
##   2   2   2 
## 
## [[6]]
## 25% 50% 75% 
##   1   1   1 
## 
## [[7]]
## 25% 50% 75% 
##   4   4   4 
## 
## [[8]]
## 25% 50% 75% 
##   5   5   5

lapply会分别循环矩阵中的每个值,而不是按行或按列进行分组计算。

3 sapply

sapply函数是一个简化版的lapply,sapply增加了2个参数simplify和USE.NAMES,主要就是让输出看起来更友好,返回值为向量,而不是list对象。在上面lapply中已经演示过了,如果saplly中simplify=FALSE和USE.NAMES=FALSE,那么完全sapply函数就等于lapply函数了。

data <- list(x = 1:100,y = rep(c(FALSE,TRUE),each = 25,time = 2),z = exp(-3:10))
lapply(data,length)
## $x
## [1] 100
## 
## $y
## [1] 100
## 
## $z
## [1] 14
sapply(data,length)
##   x   y   z 
## 100 100  14
sapply(data,max)
##        x        y        z 
##   100.00     1.00 22026.47

4 vapply

vapply类似于sapply,提供了FUN.VALUE参数,用来控制返回值的行名,这样可以让程序更健壮。可以对数据框的数据进行累计求和,并对每一行设置行名row.names,就比spply多一个命名的功能

i39 <- sapply(3:9, seq) # list of vectors
i39
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] 1 2 3 4
## 
## [[3]]
## [1] 1 2 3 4 5
## 
## [[4]]
## [1] 1 2 3 4 5 6
## 
## [[5]]
## [1] 1 2 3 4 5 6 7
## 
## [[6]]
## [1] 1 2 3 4 5 6 7 8
## 
## [[7]]
## [1] 1 2 3 4 5 6 7 8 9
sapply(i39, fivenum)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]  1.0  1.0    1  1.0  1.0  1.0    1
## [2,]  1.5  1.5    2  2.0  2.5  2.5    3
## [3,]  2.0  2.5    3  3.5  4.0  4.5    5
## [4,]  2.5  3.5    4  5.0  5.5  6.5    7
## [5,]  3.0  4.0    5  6.0  7.0  8.0    9
vapply(i39, fivenum,c(Min. = 0, "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))
##         [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## Min.     1.0  1.0    1  1.0  1.0  1.0    1
## 1st Qu.  1.5  1.5    2  2.0  2.5  2.5    3
## Median   2.0  2.5    3  3.5  4.0  4.5    5
## 3rd Qu.  2.5  3.5    4  5.0  5.5  6.5    7
## Max.     3.0  4.0    5  6.0  7.0  8.0    9
## sapply(*, "array") -- artificial example
(v <- structure(10*(5:8), names = LETTERS[1:4]))
##  A  B  C  D 
## 50 60 70 80
f2 <- function(x, y) outer(rep(x, length.out = 3), y)
(a2 <- sapply(v, f2, y = 2*(1:5), simplify = "array"))
## , , A
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  100  200  300  400  500
## [2,]  100  200  300  400  500
## [3,]  100  200  300  400  500
## 
## , , B
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  120  240  360  480  600
## [2,]  120  240  360  480  600
## [3,]  120  240  360  480  600
## 
## , , C
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  140  280  420  560  700
## [2,]  140  280  420  560  700
## [3,]  140  280  420  560  700
## 
## , , D
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  160  320  480  640  800
## [2,]  160  320  480  640  800
## [3,]  160  320  480  640  800
a.2 <- vapply(v, f2, outer(1:3, 1:5), y = 2*(1:5))
a.2
## , , A
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  100  200  300  400  500
## [2,]  100  200  300  400  500
## [3,]  100  200  300  400  500
## 
## , , B
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  120  240  360  480  600
## [2,]  120  240  360  480  600
## [3,]  120  240  360  480  600
## 
## , , C
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  140  280  420  560  700
## [2,]  140  280  420  560  700
## [3,]  140  280  420  560  700
## 
## , , D
## 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  160  320  480  640  800
## [2,]  160  320  480  640  800
## [3,]  160  320  480  640  800
stopifnot(dim(a2) == c(3,5,4), all.equal(a2, a.2),identical(dimnames(a2), list(NULL,NULL,LETTERS[1:4])))

5 mapply

mapply也是sapply的变形函数,类似多变量的sapply,但是参数定义有些变化。第一参数为自定义的FUN函数,第二个参数’…’可以接收多个数据,作为FUN函数的参数调用。

Usage

mapply(FUN, ...,
       MoreArgs = NULL, 
       SIMPLIFY = TRUE,
       USE.NAMES = TRUE)

MoreArgs: 参数列表
SIMPLIFY: 是否数组化,当值array时,输出结果按数组进行分组
USE.NAMES: 如果X为字符串,TRUE设置字符串为数据名,FALSE不设置
比如,比较3个向量大小,按索引顺序取较大的值。
mapply(rep, 1:4, 4:1)
## [[1]]
## [1] 1 1 1 1
## 
## [[2]]
## [1] 2 2 2
## 
## [[3]]
## [1] 3 3
## 
## [[4]]
## [1] 4
mapply(rep, times = 1:4, x = 4:1)
## [[1]]
## [1] 4
## 
## [[2]]
## [1] 3 3
## 
## [[3]]
## [1] 2 2 2
## 
## [[4]]
## [1] 1 1 1 1
mapply(rep, times = 1:4, MoreArgs = list(x = 42))
## [[1]]
## [1] 42
## 
## [[2]]
## [1] 42 42
## 
## [[3]]
## [1] 42 42 42
## 
## [[4]]
## [1] 42 42 42 42
mapply(function(x, y) seq_len(x) + y,c(a =  1, b = 2, c = 3),  # names from first
       c(A = 10, B = 0, C = -10))
## $a
## [1] 11
## 
## $b
## [1] 1 2
## 
## $c
## [1] -9 -8 -7
word <- function(C, k) paste(rep.int(C, k), collapse = "")
word
## function(C, k) paste(rep.int(C, k), collapse = "")
function(C, k) paste(rep.int(C, k), collapse = "")
## function(C, k) paste(rep.int(C, k), collapse = "")
utils::str(mapply(word, LETTERS[1:6], 6:1, SIMPLIFY = FALSE))
## List of 6
##  $ A: chr "AAAAAA"
##  $ B: chr "BBBBB"
##  $ C: chr "CCCC"
##  $ D: chr "DDD"
##  $ E: chr "EE"
##  $ F: chr "F"