knitr::opts_chunk$set(echo = TRUE)
apply函数簇提供一种计算框架,我们把这个框架记住了,编写相应的函数放进去可以循环地实现我们的目的。编写处理函数,也就是解决问题的核心应该是我们应该关心的,而不是如何来执行这个函数,这个功能交给apply家族(当然还有一些其他的函数)。
apply函数是最常用的代替for循环的函数。apply函数可以对矩阵、数据框、数组(二维、多维),按行或列进行循环计算,对子元素进行迭代,并把子元素以参数传递的形式给自定义的FUN函数中,并以返回计算结果。
library(DT)
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(purrr)
library(blogdown)
library(knitr)
knitr::include_graphics("apply.png")
x_df <- data.frame(x = 1:10,
y = letters[1:10],
z = rep(c("A","B"),each = 5))
str(x_df)
## 'data.frame': 10 obs. of 3 variables:
## $ x: int 1 2 3 4 5 6 7 8 9 10
## $ y: Factor w/ 10 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10
## $ z: Factor w/ 2 levels "A","B": 1 1 1 1 1 2 2 2 2 2
y_df <- data.frame(x = 1:10,
y = letters[1:10],
z = rep(c("A","B"),each = 5),stringsAsFactors = FALSE)
str(y_df)
## 'data.frame': 10 obs. of 3 variables:
## $ x: int 1 2 3 4 5 6 7 8 9 10
## $ y: chr "a" "b" "c" "d" ...
## $ z: chr "A" "A" "A" "A" ...
# 我们发现data_frame自动将字符识别为factor形式,可以添加参数stringsAsFactors = FALSE
apply(y_df,2,length)
## x y z
## 10 10 10
apply(y_df,1,length)
## [1] 3 3 3 3 3 3 3 3 3 3
x <- tibble(a = 1:10,
b = 2:11)
near(apply(x,2,identity),x)
## a b
## [1,] TRUE TRUE
## [2,] TRUE TRUE
## [3,] TRUE TRUE
## [4,] TRUE TRUE
## [5,] TRUE TRUE
## [6,] TRUE TRUE
## [7,] TRUE TRUE
## [8,] TRUE TRUE
## [9,] TRUE TRUE
## [10,] TRUE TRUE
ma <- matrix(c(1:4,1,6:8),nrow = 2)
apply(ma,1, table)
## [[1]]
##
## 1 3 7
## 2 1 1
##
## [[2]]
##
## 2 4 6 8
## 1 1 1 1
# 值得注意的是,apply函数时可以针对数组进行计算你的,就是说数组未必是2维的
z <- array(1:24, dim = 2:4)
zseq <- apply(z, 1:2, function(x) seq_len(max(x)))
z
## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 7 9 11
## [2,] 8 10 12
##
## , , 3
##
## [,1] [,2] [,3]
## [1,] 13 15 17
## [2,] 14 16 18
##
## , , 4
##
## [,1] [,2] [,3]
## [1,] 19 21 23
## [2,] 20 22 24
apply(z, 1, function(x) max(x))
## [1] 23 24
apply(z, 2, max)
## [1] 20 22 24
apply(z,3,max)
## [1] 6 12 18 24
ma <- matrix(c(rnorm(100)), nrow = 10)
dat<-ma
table(apply(dat,2,function(x) sum(x>0.5) )>2) # 列求和大于2
##
## FALSE TRUE
## 3 7
table(apply(dat,1,function(x) sum(x>0.5) )>3) # 行求和大于3
##
## FALSE TRUE
## 6 4
dat <- dat[apply(dat,1,function(x) sum(x>.5) )>3,
apply(dat,2,function(x) sum(x>.5) )>2]
# 按行列求平均值和标准差
x_function <- function(x){
c(n = sum(!is.na(x)),
mean = mean(x),
min = min(x),
max = max(x))
}
ma <- matrix(c(rnorm(100)), nrow = 10)
ma %>% datatable()
apply(ma,2,function(x){max = max(x)})
## [1] 1.5169424 0.9931905 1.6149066 1.1810736 0.2338709 1.5257351 1.6666046
## [8] 3.1679849 1.8535513 2.7100855
apply(ma,2,FUN = x_function)
## [,1] [,2] [,3] [,4] [,5] [,6]
## n 10.0000000 10.0000000 10.00000000 10.0000000 10.0000000 10.00000000
## mean 0.1033192 -0.2833261 0.05821862 -0.7385798 -0.6408561 -0.01056467
## min -1.2419795 -2.3682003 -0.99672982 -2.5044803 -1.5155723 -1.74879317
## max 1.5169424 0.9931905 1.61490660 1.1810736 0.2338709 1.52573513
## [,7] [,8] [,9] [,10]
## n 10.000000 10.0000000 10.0000000 10.0000000
## mean -0.122865 0.3681661 0.4728179 0.3862402
## min -2.271421 -1.6655034 -1.1428838 -0.8655464
## max 1.666605 3.1679849 1.8535513 2.7100855
lapply函数是一个最基础循环操作函数之一,用来对list、data.frame数据集进行循环,并返回和X长度同样的list结构作为结果集,通过lapply的开头的第一个字母’l’就可以判断返回结果集的类型。
Usage
lapply(X, FUN, ...)
Arguments
X a vector (atomic or list) or an `[expression](http://127.0.0.1:22572/help/library/base/help/expression)` object. Other objects (including classed objects) will be coerced by`base::[as.list](http://127.0.0.1:22572/help/library/base/help/as.list)`.
FUN the function to be applied to each element of `X`: see ‘Details’. In the case of functions like `+`, `%*%`, the function name must be backquoted or quoted.
# Examples
require(stats); require(graphics)
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
x
## $a
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $beta
## [1] 0.04978707 0.13533528 0.36787944 1.00000000 2.71828183 7.38905610
## [7] 20.08553692
##
## $logic
## [1] TRUE FALSE FALSE TRUE
# compute the list mean for each list element
lapply(x, mean)
## $a
## [1] 5.5
##
## $beta
## [1] 4.535125
##
## $logic
## [1] 0.5
sapply(x, mean)
## a beta logic
## 5.500000 4.535125 0.500000
# median and quartiles for each list element
lapply(x, quantile, probs = 1:3/4)
## $a
## 25% 50% 75%
## 3.25 5.50 7.75
##
## $beta
## 25% 50% 75%
## 0.2516074 1.0000000 5.0536690
##
## $logic
## 25% 50% 75%
## 0.0 0.5 1.0
sapply(x, quantile)
## a beta logic
## 0% 1.00 0.04978707 0.0
## 25% 3.25 0.25160736 0.0
## 50% 5.50 1.00000000 0.5
## 75% 7.75 5.05366896 1.0
## 100% 10.00 20.08553692 1.0
lapply就可以很方便地把list数据集进行循环操作,还可以用data.frame数据集按列进行循环,但如果传入的数据集是一个向量或矩阵对象,那么直接使用lapply就不能达到想要的效果了。
list(x = 1:10, y = exp(-3:3),logic = c(TRUE,FALSE,FALSE,TRUE))$x %>% class()
## [1] "integer"
x <- cbind(x1=3, x2=c(2:1,4:5))
x; class(x)
## x1 x2
## [1,] 3 2
## [2,] 3 1
## [3,] 3 4
## [4,] 3 5
## [1] "matrix"
lapply(x, quantile, probs = 1:3/4)
## [[1]]
## 25% 50% 75%
## 3 3 3
##
## [[2]]
## 25% 50% 75%
## 3 3 3
##
## [[3]]
## 25% 50% 75%
## 3 3 3
##
## [[4]]
## 25% 50% 75%
## 3 3 3
##
## [[5]]
## 25% 50% 75%
## 2 2 2
##
## [[6]]
## 25% 50% 75%
## 1 1 1
##
## [[7]]
## 25% 50% 75%
## 4 4 4
##
## [[8]]
## 25% 50% 75%
## 5 5 5
lapply会分别循环矩阵中的每个值,而不是按行或按列进行分组计算。
sapply函数是一个简化版的lapply,sapply增加了2个参数simplify和USE.NAMES,主要就是让输出看起来更友好,返回值为向量,而不是list对象。在上面lapply中已经演示过了,如果saplly中simplify=FALSE和USE.NAMES=FALSE,那么完全sapply函数就等于lapply函数了。
data <- list(x = 1:100,y = rep(c(FALSE,TRUE),each = 25,time = 2),z = exp(-3:10))
lapply(data,length)
## $x
## [1] 100
##
## $y
## [1] 100
##
## $z
## [1] 14
sapply(data,length)
## x y z
## 100 100 14
sapply(data,max)
## x y z
## 100.00 1.00 22026.47
vapply类似于sapply,提供了FUN.VALUE参数,用来控制返回值的行名,这样可以让程序更健壮。可以对数据框的数据进行累计求和,并对每一行设置行名row.names,就比spply多一个命名的功能。
i39 <- sapply(3:9, seq) # list of vectors
i39
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 1 2 3 4
##
## [[3]]
## [1] 1 2 3 4 5
##
## [[4]]
## [1] 1 2 3 4 5 6
##
## [[5]]
## [1] 1 2 3 4 5 6 7
##
## [[6]]
## [1] 1 2 3 4 5 6 7 8
##
## [[7]]
## [1] 1 2 3 4 5 6 7 8 9
sapply(i39, fivenum)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.0 1.0 1 1.0 1.0 1.0 1
## [2,] 1.5 1.5 2 2.0 2.5 2.5 3
## [3,] 2.0 2.5 3 3.5 4.0 4.5 5
## [4,] 2.5 3.5 4 5.0 5.5 6.5 7
## [5,] 3.0 4.0 5 6.0 7.0 8.0 9
vapply(i39, fivenum,c(Min. = 0, "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## Min. 1.0 1.0 1 1.0 1.0 1.0 1
## 1st Qu. 1.5 1.5 2 2.0 2.5 2.5 3
## Median 2.0 2.5 3 3.5 4.0 4.5 5
## 3rd Qu. 2.5 3.5 4 5.0 5.5 6.5 7
## Max. 3.0 4.0 5 6.0 7.0 8.0 9
## sapply(*, "array") -- artificial example
(v <- structure(10*(5:8), names = LETTERS[1:4]))
## A B C D
## 50 60 70 80
f2 <- function(x, y) outer(rep(x, length.out = 3), y)
(a2 <- sapply(v, f2, y = 2*(1:5), simplify = "array"))
## , , A
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 100 200 300 400 500
## [2,] 100 200 300 400 500
## [3,] 100 200 300 400 500
##
## , , B
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 120 240 360 480 600
## [2,] 120 240 360 480 600
## [3,] 120 240 360 480 600
##
## , , C
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 140 280 420 560 700
## [2,] 140 280 420 560 700
## [3,] 140 280 420 560 700
##
## , , D
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 160 320 480 640 800
## [2,] 160 320 480 640 800
## [3,] 160 320 480 640 800
a.2 <- vapply(v, f2, outer(1:3, 1:5), y = 2*(1:5))
a.2
## , , A
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 100 200 300 400 500
## [2,] 100 200 300 400 500
## [3,] 100 200 300 400 500
##
## , , B
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 120 240 360 480 600
## [2,] 120 240 360 480 600
## [3,] 120 240 360 480 600
##
## , , C
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 140 280 420 560 700
## [2,] 140 280 420 560 700
## [3,] 140 280 420 560 700
##
## , , D
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 160 320 480 640 800
## [2,] 160 320 480 640 800
## [3,] 160 320 480 640 800
stopifnot(dim(a2) == c(3,5,4), all.equal(a2, a.2),identical(dimnames(a2), list(NULL,NULL,LETTERS[1:4])))
mapply也是sapply的变形函数,类似多变量的sapply,但是参数定义有些变化。第一参数为自定义的FUN函数,第二个参数’…’可以接收多个数据,作为FUN函数的参数调用。
Usage
mapply(FUN, ...,
MoreArgs = NULL,
SIMPLIFY = TRUE,
USE.NAMES = TRUE)
MoreArgs: 参数列表
SIMPLIFY: 是否数组化,当值array时,输出结果按数组进行分组
USE.NAMES: 如果X为字符串,TRUE设置字符串为数据名,FALSE不设置
比如,比较3个向量大小,按索引顺序取较大的值。
mapply(rep, 1:4, 4:1)
## [[1]]
## [1] 1 1 1 1
##
## [[2]]
## [1] 2 2 2
##
## [[3]]
## [1] 3 3
##
## [[4]]
## [1] 4
mapply(rep, times = 1:4, x = 4:1)
## [[1]]
## [1] 4
##
## [[2]]
## [1] 3 3
##
## [[3]]
## [1] 2 2 2
##
## [[4]]
## [1] 1 1 1 1
mapply(rep, times = 1:4, MoreArgs = list(x = 42))
## [[1]]
## [1] 42
##
## [[2]]
## [1] 42 42
##
## [[3]]
## [1] 42 42 42
##
## [[4]]
## [1] 42 42 42 42
mapply(function(x, y) seq_len(x) + y,c(a = 1, b = 2, c = 3), # names from first
c(A = 10, B = 0, C = -10))
## $a
## [1] 11
##
## $b
## [1] 1 2
##
## $c
## [1] -9 -8 -7
word <- function(C, k) paste(rep.int(C, k), collapse = "")
word
## function(C, k) paste(rep.int(C, k), collapse = "")
function(C, k) paste(rep.int(C, k), collapse = "")
## function(C, k) paste(rep.int(C, k), collapse = "")
utils::str(mapply(word, LETTERS[1:6], 6:1, SIMPLIFY = FALSE))
## List of 6
## $ A: chr "AAAAAA"
## $ B: chr "BBBBB"
## $ C: chr "CCCC"
## $ D: chr "DDD"
## $ E: chr "EE"
## $ F: chr "F"