本週作業與進度:
1. 研讀eapply()
2. 研讀rapply()
3. 研讀 Advanced R: Functional programming
4. 研讀 Advanced R: Functionals



R functions

R

funcationals

  • 高階函數(higher-order function):參數可定義函數,且回傳值亦為函數之函數
  • 泛函(functionals):參數為函數,而回傳值為向量或是矩陣之函數
set.seed(seed = 100)
randomize <- function(f) f(runif(1e3))
randomize(mean)
## [1] 0.5180817
randomize(sd)
## [1] 0.2929122
randomize(median)
## [1] 0.5057337
g <- function(f, x) f(x)
g(f = max, c(3, 1, 5))
## [1] 5
h <- function(f, ...) f(...)
h(f = max, c(3, 1, 5))
## [1] 5
h(f = mean, c(1, 3, 4, NA, 4), na.rm = TRUE)
## [1] 3

apply family

apply()

  • 適用於有維度的同質性資料結構,如array與matrix
?mtcars
class(mtcars)
## [1] "data.frame"
my_mtcar <- as.matrix(mtcars)
class(my_mtcar)
## [1] "matrix"
apply(X = my_mtcar, MARGIN = 2, mean)
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500
apply(X = my_mtcar, MARGIN = 2, max)
##     mpg     cyl    disp      hp    drat      wt    qsec      vs      am    gear 
##  33.900   8.000 472.000 335.000   4.930   5.424  22.900   1.000   1.000   5.000 
##    carb 
##   8.000
apply(X = my_mtcar, MARGIN = 1, function(v) colnames(mtcars)[which.max(v)])
##           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
##              "disp"              "disp"              "disp"              "disp" 
##   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
##              "disp"              "disp"              "disp"              "disp" 
##            Merc 230            Merc 280           Merc 280C          Merc 450SE 
##              "disp"              "disp"              "disp"              "disp" 
##          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
##              "disp"              "disp"              "disp"              "disp" 
##   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
##              "disp"              "disp"              "disp"              "disp" 
##       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
##              "disp"              "disp"              "disp"              "disp" 
##    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
##              "disp"              "disp"              "disp"                "hp" 
##      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
##              "disp"                "hp"                "hp"              "disp"
m = matrix(1:6, nrow = 3)
m
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
f <- function(x) x/c(2, 4)
apply(X = m, MARGIN = 1, f)
##      [,1] [,2] [,3]
## [1,]  0.5 1.00  1.5
## [2,]  1.0 1.25  1.5
# apply(m, 2, f)

lapply()

  • 適用於原子向量與異質向量-list
  • 回傳值必定為list
?lapply
set.seed(seed = 123)

d <- list(rnorm(n = 30, mean = 5, sd = 10), rnorm(n = 50, mean = 5, sd = 10), rnorm(n = 100, mean = 8, sd = 15))

lapply(X = d, mean)
## [[1]]
## [1] 4.528962
## 
## [[2]]
## [1] 5.590362
## 
## [[3]]
## [1] 7.912756
lapply(X = d, function(v) which(v > 0))
## [[1]]
##  [1]  2  3  4  5  6  7 10 11 12 13 14 16 17 19 20 22 27 28 30
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10 12 14 15 17 18 19 20 21 22 23 24 25 26 28 29
## [26] 30 31 33 36 37 38 39 40 41 43 46 47 49 50
## 
## [[3]]
##  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  15  17  18  19  22  23
## [20]  24  26  29  30  32  34  35  36  37  41  43  44  45  47  48  50  51  52  53
## [39]  54  56  58  59  61  62  66  68  69  71  72  73  75  76  77  78  79  80  81
## [58]  84  85  86  87  88  89  90  91  92  93  94  97  98  99 100
lapply(X = d, function(v, c) which(v > c), 0)
## [[1]]
##  [1]  2  3  4  5  6  7 10 11 12 13 14 16 17 19 20 22 27 28 30
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10 12 14 15 17 18 19 20 21 22 23 24 25 26 28 29
## [26] 30 31 33 36 37 38 39 40 41 43 46 47 49 50
## 
## [[3]]
##  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  15  17  18  19  22  23
## [20]  24  26  29  30  32  34  35  36  37  41  43  44  45  47  48  50  51  52  53
## [39]  54  56  58  59  61  62  66  68  69  71  72  73  75  76  77  78  79  80  81
## [58]  84  85  86  87  88  89  90  91  92  93  94  97  98  99 100
lapply(X = d, function(v, f) which(v > f(v)), f = mean)
## [[1]]
##  [1]  3  4  5  6  7 11 12 13 14 16 17 19 27 28 30
## 
## [[2]]
##  [1]  1  3  4  5  6  7 14 15 19 21 24 26 28 29 30 31 36 37 39 40 43 46 49
## 
## [[3]]
##  [1]  1  2  4  6  7  8 10 11 12 13 15 17 18 22 30 32 35 36 37 41 45 47 48 51 52
## [26] 53 56 58 59 61 68 69 71 72 73 77 79 81 84 86 87 89 90 92 94 97 98 99
  • 實作lapply()函數:
lapply2 <- function(x, f, ...) {
  out <- vector("list", length(x))
  for (i in seq_along(x)) {
    out[[i]] <- f(x[[i]], ...)
  }
  out
}
set.seed(seed = 123)
l <- replicate(20, runif(sample(1:10, 1)), simplify = FALSE)
l
## [[1]]
## [1] 0.8830174 0.9404673 0.0455565
## 
## [[2]]
## [1] 0.892419 0.551435
## 
## [[3]]
## [1] 0.9568333 0.4533342 0.6775706 0.5726334 0.1029247
## 
## [[4]]
## [1] 0.3279207 0.9545036 0.8895393 0.6928034 0.6405068
## 
## [[5]]
## [1] 0.6557058 0.7085305 0.5440660 0.5941420 0.2891597 0.1471136 0.9630242
## [8] 0.9022990 0.6907053
## 
## [[6]]
## [1] 0.02461368 0.47779597 0.75845954 0.21640794
## 
## [[7]]
## [1] 0.2316258 0.1428000 0.4145463 0.4137243 0.3688455
## 
## [[8]]
## [1] 0.13880606 0.23303410 0.46596245 0.26597264 0.85782772 0.04583117 0.44220007
## 
## [[9]]
## [1] 0.1218993 0.5609480 0.2065314 0.1275317 0.7533079 0.8950454 0.3744628
## 
## [[10]]
## [1] 0.09484066 0.38396964 0.27438364 0.81464004 0.44851634
## 
## [[11]]
## [1] 0.8123895
## 
## [[12]]
## [1] 0.7544751586 0.6292211316 0.7101824014 0.0006247733 0.4753165741
## [6] 0.2201188852 0.3798165377 0.6127710033 0.3517979092
## 
## [[13]]
## [1] 0.2436195 0.6680556 0.4176468 0.7881958
## 
## [[14]]
## [1] 0.4348927 0.9849570 0.8930511 0.8864691 0.1750527 0.1306957
## 
## [[15]]
## [1] 0.3435165 0.6567581
## 
## [[16]]
## [1] 0.18769112 0.78229430 0.09359499 0.46677904
## 
## [[17]]
## [1] 0.5999890 0.3328235 0.4886130
## 
## [[18]]
## [1] 0.48290240 0.89035022 0.91443819 0.60873498 0.41068978 0.14709469 0.93529980
## [8] 0.30122890 0.06072057
## 
## [[19]]
## [1] 0.1422943 0.5492847 0.9540912 0.5854834 0.4045103 0.6478935 0.3198206
## [8] 0.3077200 0.2197676
## 
## [[20]]
## [1] 0.9842192 0.1542023 0.0910440 0.1419069 0.6900071 0.6192565 0.8913941
# With a for loop
out <- vector("list", length(l))
for (i in seq_along(l)) {
  out[[i]] <- length(l[[i]])
}
unlist(out)
##  [1] 3 2 5 5 9 4 5 7 7 5 1 9 4 6 2 4 3 9 9 7
# With lapply
unlist(lapply(l, length))
##  [1] 3 2 5 5 9 4 5 7 7 5 1 9 4 6 2 4 3 9 9 7

sapply()

  • 為lapply()函數的簡化版:簡化lapply()的回傳結果,視狀況而定(可否簡化)簡化為向量或是矩陣
set.seed(seed = 123)
d <- list(rnorm(n = 30, mean = 5, sd = 10), rnorm(n = 50, mean = 5, sd = 10), rnorm(n = 100, mean = 8, sd = 15))

lapply(d, function(x) sum(x > 0))
## [[1]]
## [1] 19
## 
## [[2]]
## [1] 39
## 
## [[3]]
## [1] 72
sapply(d, function(x) sum(x > 0))
## [1] 19 39 72
lapply(d, quantile, probs = 1:3/4)
## [[1]]
##       25%       50%       75% 
## -1.713995  4.262667  9.886169 
## 
## [[2]]
##        25%        50%        75% 
##  0.4928728  4.6429139 11.6263363 
## 
## [[3]]
##       25%       50%       75% 
## -1.466591  7.245577 16.280677
sapply(d, quantile, probs = 1:3/4)
##          [,1]       [,2]      [,3]
## 25% -1.713995  0.4928728 -1.466591
## 50%  4.262667  4.6429139  7.245577
## 75%  9.886169 11.6263363 16.280677
lapply(d, function(x) which(x > 0))
## [[1]]
##  [1]  2  3  4  5  6  7 10 11 12 13 14 16 17 19 20 22 27 28 30
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10 12 14 15 17 18 19 20 21 22 23 24 25 26 28 29
## [26] 30 31 33 36 37 38 39 40 41 43 46 47 49 50
## 
## [[3]]
##  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  15  17  18  19  22  23
## [20]  24  26  29  30  32  34  35  36  37  41  43  44  45  47  48  50  51  52  53
## [39]  54  56  58  59  61  62  66  68  69  71  72  73  75  76  77  78  79  80  81
## [58]  84  85  86  87  88  89  90  91  92  93  94  97  98  99 100
sapply(d, function(x) which(x > 0))  
## [[1]]
##  [1]  2  3  4  5  6  7 10 11 12 13 14 16 17 19 20 22 27 28 30
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10 12 14 15 17 18 19 20 21 22 23 24 25 26 28 29
## [26] 30 31 33 36 37 38 39 40 41 43 46 47 49 50
## 
## [[3]]
##  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  15  17  18  19  22  23
## [20]  24  26  29  30  32  34  35  36  37  41  43  44  45  47  48  50  51  52  53
## [39]  54  56  58  59  61  62  66  68  69  71  72  73  75  76  77  78  79  80  81
## [58]  84  85  86  87  88  89  90  91  92  93  94  97  98  99 100

lists of functions

  • 函數可放在list中,可搭配lapply函數集中處理多項運算
  • EX:
compute_mean <- list(
  base = function(x) mean(x),
  sum = function(x) sum(x) / length(x),
  manual = function(x) {
    total <- 0
    n <- length(x)
    for (i in seq_along(x)) {
      total <- total + x[i] / n
    }
    total
  }
)
x <- runif(1e5)
system.time(compute_mean$base(x))
##    user  system elapsed 
##       0       0       0
system.time(compute_mean[[2]](x))
##    user  system elapsed 
##       0       0       0
system.time(compute_mean[["manual"]](x))
##    user  system elapsed 
##   0.009   0.001   0.009
lapply(compute_mean, function(f) f(x))
## $base
## [1] 0.4992423
## 
## $sum
## [1] 0.4992423
## 
## $manual
## [1] 0.4992423
call_fun <- function(f, ...) f(...)
lapply(compute_mean, call_fun, x)
## $base
## [1] 0.4992423
## 
## $sum
## [1] 0.4992423
## 
## $manual
## [1] 0.4992423
lapply(compute_mean, function(f) system.time(f(x)))
## $base
##    user  system elapsed 
##       0       0       0 
## 
## $sum
##    user  system elapsed 
##   0.000   0.000   0.001 
## 
## $manual
##    user  system elapsed 
##   0.005   0.000   0.005

vapply()

  • 需事先設定FUN執行結果的範本(FUN.VALUE)
  • 如與事先設定之範本不符,及產生錯誤訊息
  • 在大部分情況下,vapply()執行速度可再加快
  • 常使用在函數的定義中
set.seed(seed = 123)
d <- list(rnorm(n = 30, mean = 5, sd = 10), rnorm(n = 50, mean = 5, sd = 10), rnorm(n = 100, mean = 8, sd = 15))
vapply(X = d, max, FUN.VALUE = 0)
## [1] 22.86913 26.68956 56.61560
vapply(X = d, quantile, FUN.VALUE = c(0, 0, 0, 0, 0))
##            [,1]        [,2]       [,3]
## 0%   -14.666172 -18.0916888 -22.798708
## 25%   -1.713995   0.4928728  -1.466591
## 50%    4.262667   4.6429139   7.245577
## 75%    9.886169  11.6263363  16.280677
## 100%  22.869131  26.6895597  56.615599
vapply(X = d, quantile, FUN.VALUE = c(10, 10, 110, 10, 0))
##            [,1]        [,2]       [,3]
## 0%   -14.666172 -18.0916888 -22.798708
## 25%   -1.713995   0.4928728  -1.466591
## 50%    4.262667   4.6429139   7.245577
## 75%    9.886169  11.6263363  16.280677
## 100%  22.869131  26.6895597  56.615599
# vapply(X = d, quantile, FUN.VALUE = character(length = 5))
# Error in vapply(X = d, quantile, FUN.VALUE = character(length = 5)) : 
#   值的類型必須是 'character',
# 但是 FUN(X[[1]]) 的結果類型是 'double'

# vapply(X = d, quantile, FUN.VALUE = numeric(length = 4))
# Error in vapply(X = d, quantile, FUN.VALUE = numeric(length = 4)) : 
#   值的長度必須是 4,
# 但是 FUN(X[[1]]) 的結果長度是 5

mapply()

set.seed(seed = 123)
xs <- replicate(5, runif(10), simplify = FALSE)
ws <- replicate(5, rpois(10, 5) + 1, simplify = FALSE)

# It’s easy to use lapply() to compute the unweighted means:
unlist(lapply(xs, mean))
## [1] 0.5782475 0.5233693 0.6155837 0.5378580 0.3453964
# But, for weighted means?
weighted.mean(x = xs[[1]], w = ws[[1]])
## [1] 0.5989783
weighted.mean(x = xs[[2]], w = ws[[2]])
## [1] 0.4798772
unlist(lapply(seq_along(xs), function(i) {
  weighted.mean(xs[[i]], ws[[i]])
}))
## [1] 0.5989783 0.4798772 0.6175470 0.5619867 0.3535632
#
?mapply
mapply(FUN = weighted.mean, xs, ws)
## [1] 0.5989783 0.4798772 0.6175470 0.5619867 0.3535632
mapply(FUN = weighted.mean, xs, ws, SIMPLIFY = FALSE)
## [[1]]
## [1] 0.5989783
## 
## [[2]]
## [1] 0.4798772
## 
## [[3]]
## [1] 0.617547
## 
## [[4]]
## [1] 0.5619867
## 
## [[5]]
## [1] 0.3535632
f <- function(x, y, z) (x + y)^z
mapply(f, x = 1:5, y = seq(0, 3, length.out = 5), MoreArgs = list(z = 0.5), SIMPLIFY = FALSE)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 1.658312
## 
## [[3]]
## [1] 2.12132
## 
## [[4]]
## [1] 2.5
## 
## [[5]]
## [1] 2.828427
Map
## function (f, ...) 
## {
##     f <- match.fun(f)
##     mapply(FUN = f, ..., SIMPLIFY = FALSE)
## }
## <bytecode: 0x7fdc01e07230>
## <environment: namespace:base>

tapply()

factor

  • R中專門作為分類之用的資料型別,可視為 類別變數
  • 本質為integer, 只是多了一些屬性(attributes),如levels與class兩個屬性
set.seed(seed = 123)
v <- sample(x = month.abb, size = 50, replace = TRUE)
vf <- factor(x = v)   # 定義factor
attributes(vf)        # 查詢物件所有的屬性
## $levels
##  [1] "Apr" "Aug" "Dec" "Feb" "Jan" "Jul" "Jun" "Mar" "May" "Nov" "Oct" "Sep"
## 
## $class
## [1] "factor"
sort(vf)              # 排序,預設遞增排序
##  [1] Apr Apr Aug Aug Dec Dec Dec Dec Feb Feb Jan Jul Jul Jul Jul Jul Jun Jun Jun
## [20] Mar Mar Mar Mar Mar May May May May May May Nov Nov Nov Nov Nov Nov Oct Oct
## [39] Oct Oct Oct Oct Sep Sep Sep Sep Sep Sep Sep Sep
## Levels: Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
vf <- factor(x = v, levels = month.abb) 
sort(vf)
##  [1] Jan Feb Feb Mar Mar Mar Mar Mar Apr Apr May May May May May May Jun Jun Jun
## [20] Jul Jul Jul Jul Jul Aug Aug Sep Sep Sep Sep Sep Sep Sep Sep Oct Oct Oct Oct
## [39] Oct Oct Nov Nov Nov Nov Nov Nov Dec Dec Dec Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
levels(vf)
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
nlevels(vf)
## [1] 12
vf
##  [1] Mar Mar Oct Feb Jun Nov May Apr Jun Sep Oct Nov May Mar Nov Sep Dec Sep Sep
## [20] Mar Aug Oct Jul Oct Sep Mar Apr Jan Nov Jul May Dec Oct Jul Sep Sep Oct Jul
## [39] Nov Dec May Jul May Nov Jun Sep Feb May Aug Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
unclass(vf)
##  [1]  3  3 10  2  6 11  5  4  6  9 10 11  5  3 11  9 12  9  9  3  8 10  7 10  9
## [26]  3  4  1 11  7  5 12 10  7  9  9 10  7 11 12  5  7  5 11  6  9  2  5  8 12
## attr(,"levels")
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
typeof(vf)
## [1] "integer"

tapply()

  • 只適用於向量
  • 搭配factor,可有效率完成分組運算
set.seed(seed = 123)
dim(iris)
## [1] 150   5
sapply(iris, class)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##    "numeric"    "numeric"    "numeric"    "numeric"     "factor"
tapply(iris$Sepal.Length, INDEX = iris$Species, mean)
##     setosa versicolor  virginica 
##      5.006      5.936      6.588
tapply(X = iris$Sepal.Length, INDEX = list(iris$Species, 
                                           iris$Sepal.Width > mean(iris$Sepal.Width)), 
       FUN = mean)
##               FALSE     TRUE
## setosa     4.637500 5.076190
## versicolor 5.830952 6.487500
## virginica  6.475758 6.805882
ans <- tapply(X = iris$Sepal.Length, INDEX = list(iris$Species, 
                                           iris$Sepal.Width > mean(iris$Sepal.Width)), 
       FUN = quantile)
ans
##            FALSE     TRUE     
## setosa     Numeric,5 Numeric,5
## versicolor Numeric,5 Numeric,5
## virginica  Numeric,5 Numeric,5
mat <- matrix(1:15, nrow = 5)
mat
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    2    7   12
## [3,]    3    8   13
## [4,]    4    9   14
## [5,]    5   10   15
apply(X = mat, MARGIN = 2, cumsum)
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    3   13   23
## [3,]    6   21   36
## [4,]   10   30   50
## [5,]   15   40   65
t(apply(X = mat, MARGIN = 1, cumsum))
##      [,1] [,2] [,3]
## [1,]    1    7   18
## [2,]    2    9   21
## [3,]    3   11   24
## [4,]    4   13   27
## [5,]    5   15   30
tmp <- tapply(X = mat, INDEX = row(mat), cumsum)
tmp
## $`1`
## [1]  1  7 18
## 
## $`2`
## [1]  2  9 21
## 
## $`3`
## [1]  3 11 24
## 
## $`4`
## [1]  4 13 27
## 
## $`5`
## [1]  5 15 30
do.call(rbind, tmp)
##   [,1] [,2] [,3]
## 1    1    7   18
## 2    2    9   21
## 3    3   11   24
## 4    4   13   27
## 5    5   15   30
  • 實作tapply()功能:
tapply2 <- function(X, INDEX, FUN, ..., simplify = TRUE) {
  pieces <- split(X, INDEX)
  sapply(pieces, FUN = FUN, simplify = simplify)
}
?split

Map()與Reduce()

  • Map函數的底層為mapply(),且參數SIMPLIFY設定為FALSE
?Map
Map
## function (f, ...) 
## {
##     f <- match.fun(f)
##     mapply(FUN = f, ..., SIMPLIFY = FALSE)
## }
## <bytecode: 0x7fdc01e07230>
## <environment: namespace:base>
  • Reduce():
Reduce(`+`, 1:3) # -> ((1 + 2) + 3)
## [1] 6
Reduce(sum, 1:3) # -> sum(sum(1, 2), 3)
## [1] 6
set.seed(seed = 123)
l <- replicate(5, sample(1:10, 15, replace = T), simplify = FALSE)
str(l)
## List of 5
##  $ : int [1:15] 3 3 10 2 6 5 4 6 9 10 ...
##  $ : int [1:15] 3 8 10 7 10 9 3 4 1 7 ...
##  $ : int [1:15] 10 7 5 7 5 6 9 2 5 8 ...
##  $ : int [1:15] 5 9 10 4 6 8 6 6 7 1 ...
##  $ : int [1:15] 5 6 3 9 4 6 9 9 7 3 ...

如果你想找出所有資料的交集:

intersect(intersect(intersect(intersect(l[[1]], l[[2]]), l[[3]]), l[[4]]), l[[5]])
## [1] 5 9
Reduce(f = intersect, x = l)
## [1] 5 9

補充:rapply()

  • The recursive version of lapply()