m <- matrix(data=cbind(rnorm(30, 0), rnorm(30, 2), 
                       rnorm(30, 5)), nrow=30, ncol=3)
m
##              [,1]       [,2]     [,3]
##  [1,]  0.77168336  1.9800718 5.177931
##  [2,] -1.16892340  1.5922024 5.434991
##  [3,] -0.22924367  3.0031525 5.961884
##  [4,]  0.51566315  0.1801993 5.241008
##  [5,] -0.24349510  1.5474135 3.741760
##  [6,]  0.37641538  1.2689883 4.595778
##  [7,] -1.33640041  1.6629788 5.144250
##  [8,]  1.13349469  2.4526974 5.070174
##  [9,] -0.53577357  3.0349881 5.970443
## [10,]  0.04755581  3.1303161 6.313947
## [11,] -0.59650568  2.4775587 5.074128
## [12,] -1.02604823  0.3199054 4.412910
## [13,]  0.98665825  1.8983923 5.096624
## [14,]  0.18307471  2.1382636 4.226789
## [15,] -0.93046630 -0.2516156 4.317249
## [16,]  1.34372752  3.1226625 5.840271
## [17,] -0.17107218  2.4224861 6.473922
## [18,] -0.29197582  1.8473795 5.651875
## [19,] -0.53510253  1.0220945 4.221734
## [20,] -0.59466954  1.1572037 4.936397
## [21,]  1.69984405  2.5771975 5.057423
## [22,] -0.22620296  0.9770107 4.086539
## [23,]  0.48770419  2.1186531 3.868679
## [24,]  0.67449361  1.1942795 4.841737
## [25,] -0.40092539  0.7857326 4.982847
## [26,] -1.25122006  1.7048265 6.732674
## [27,] -1.52117518  0.7670682 4.015147
## [28,]  1.16276139  1.8674759 5.704011
## [29,] -0.50085993  2.9326783 6.848703
## [30,]  0.18223700  3.0532485 4.663616
apply(m, 1, mean)
##  [1] 2.643229 1.952757 2.911931 1.978957 1.681893 2.080394 1.823610
##  [8] 2.885455 2.823219 3.163940 2.318394 1.235589 2.660558 2.182709
## [15] 1.045056 3.435554 2.908445 2.402426 1.569575 1.832977 3.111488
## [22] 1.612449 2.158346 2.236837 1.789218 2.395427 1.087013 2.911416
## [29] 3.093507 2.633034
apply(m, 2, mean)
## [1] -0.06649156  1.79951699  5.12351470
apply(m, 2, function(x) length(x[x<0]))
## [1] 17  1  0
apply(m, 2, function(x) is.matrix(x))
## [1] FALSE FALSE FALSE
apply(m, 2, is.vector)
## [1] TRUE TRUE TRUE
apply(m, 2, function(x) mean(x[x>0]))
## [1] 0.7357933 1.8702457 5.1235147
#Using sapply and lapply
sapply(1:3, function(x) x^2)
## [1] 1 4 9
lapply(1:3, function(x) x^2)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 4
## 
## [[3]]
## [1] 9
sapply(1:3, function(x) x^2, simplify=F)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 4
## 
## [[3]]
## [1] 9
unlist(lapply(1:3, function(x) x^2))
## [1] 1 4 9
#!!!!
apply(m,2,mean)
## [1] -0.06649156  1.79951699  5.12351470
sapply(1:3, function(x) mean(m[,x]))
## [1] -0.06649156  1.79951699  5.12351470
sapply(1:3, function(x, y) mean(y[,x]), y=m)
## [1] -0.06649156  1.79951699  5.12351470

A <- matrix(1:9,3,3)
B <- matrix(4:15,4,3)
C <- matrix(c(8,8,9,9,10,10),3,2,byrow=TRUE)
MyList <- list(A,B,C)
MyList
## [[1]]
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,]    4    8   12
## [2,]    5    9   13
## [3,]    6   10   14
## [4,]    7   11   15
## 
## [[3]]
##      [,1] [,2]
## [1,]    8    8
## [2,]    9    9
## [3,]   10   10
lapply(MyList,"[",,2)
## [[1]]
## [1] 4 5 6
## 
## [[2]]
## [1]  8  9 10 11
## 
## [[3]]
## [1]  8  9 10
lapply(MyList,"[",,1)
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] 4 5 6 7
## 
## [[3]]
## [1]  8  9 10
lapply(MyList,"[",2,)
## [[1]]
## [1] 2 5 8
## 
## [[2]]
## [1]  5  9 13
## 
## [[3]]
## [1] 9 9
lapply(MyList,"[",2,2)
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 9
## 
## [[3]]
## [1] 9
lapply(MyList,"[", 2,1 )
## [[1]]
## [1] 2
## 
## [[2]]
## [1] 5
## 
## [[3]]
## [1] 9
sapply(MyList,"[", 2,1 )
## [1] 2 5 9
sapply(MyList,"[", 2,1, simplify=F)
## [[1]]
## [1] 2
## 
## [[2]]
## [1] 5
## 
## [[3]]
## [1] 9
unlist(lapply(MyList,"[", 2,1 ))
## [1] 2 5 9
Z=sapply(MyList,"[", 1,1 )
Z=rep(Z,c(3,1,2))
Z
## [1] 1 1 1 4 8 8
Q=matrix(c(rep(1, 4), rep(2, 4), rep(3, 4), rep(4, 4)),
         4,4)
Q
##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    1    2    3    4
## [3,]    1    2    3    4
## [4,]    1    2    3    4
Q=mapply(rep,1:4,4)
Q
##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    1    2    3    4
## [3,]    1    2    3    4
## [4,]    1    2    3    4

———–!!!

MyPoints=B  # just give an illustrative name to our matrix B
MyPoints
##      [,1] [,2] [,3]
## [1,]    4    8   12
## [2,]    5    9   13
## [3,]    6   10   14
## [4,]    7   11   15
MyPoints_means=apply(MyPoints,2,mean)
MyPoints_means
## [1]  5.5  9.5 13.5
MyPoints_sdev=apply(MyPoints,2,sd)
MyPoints_sdev
## [1] 1.290994 1.290994 1.290994
MyPoints_Trans1=sweep(MyPoints,2,MyPoints_means,"-")
MyPoints_Trans1
##      [,1] [,2] [,3]
## [1,] -1.5 -1.5 -1.5
## [2,] -0.5 -0.5 -0.5
## [3,]  0.5  0.5  0.5
## [4,]  1.5  1.5  1.5
MyPoints_Trans2=sweep(MyPoints_Trans1,2,MyPoints_sdev,"/")
MyPoints_Trans2
##            [,1]       [,2]       [,3]
## [1,] -1.1618950 -1.1618950 -1.1618950
## [2,] -0.3872983 -0.3872983 -0.3872983
## [3,]  0.3872983  0.3872983  0.3872983
## [4,]  1.1618950  1.1618950  1.1618950

sweep(X, MARGIN, 統計量, FUN=“-”, …)

ベクトルや行列,配列の MARGIN で指定した場所から統計量を引く(関数 scale() も同様の機能).FUN=“+” とすれば統計量を足す

( x <- matrix(1:8, ncol=4) )
##      [,1] [,2] [,3] [,4]
## [1,]    1    3    5    7
## [2,]    2    4    6    8
apply(x, 2, sum)          # 各列の最大値を求める
## [1]  3  7 11 15
apply(x, c(1,2), sqrt)    # 各要素の平方根を求める
##          [,1]     [,2]     [,3]     [,4]
## [1,] 1.000000 1.732051 2.236068 2.645751
## [2,] 1.414214 2.000000 2.449490 2.828427
sweep(x, 1, apply(x,1,mean))  # 行平均を引く
##      [,1] [,2] [,3] [,4]
## [1,]   -3   -1    1    3
## [2,]   -3   -1    1    3
sweep(x, 2, apply(x,2,mean))  # 列平均を引く
##      [,1] [,2] [,3] [,4]
## [1,] -0.5 -0.5 -0.5 -0.5
## [2,]  0.5  0.5  0.5  0.5

tapply()

グループ化されたデータの処理でよく使う関数が tapply() である.最初はとっつきにくいが,カテゴリカルデータを扱うは結構役に立つ.

 data(warpbreaks)     # 1台の織機当たりの反り破損の数
 attach(warpbreaks)   # 使うデータを固定
 fc <- factor(tension) # 要素をグループ化
 levels(fc)           # グループ化されているかを確認
## [1] "L" "M" "H"
 tapply(breaks, fc, mean) # breaks の平均をグループごとに計算
##        L        M        H 
## 36.38889 26.38889 21.66667

mapply()

トリッキーな使い方かもしれないが,mapply() を使えば規則的なリストを簡単に作成することが出来る.

 mapply(rep, 1:3, 3:1)   # rep(1,3), rep(2,2), rep(3,1) を並べる
## [[1]]
## [1] 1 1 1
## 
## [[2]]
## [1] 2 2
## 
## [[3]]
## [1] 3
 mapply(rep, 1:3, 3:1)   # rep(3,1), rep(2,2), rep(1,3) を並べる
## [[1]]
## [1] 1 1 1
## 
## [[2]]
## [1] 2 2
## 
## [[3]]
## [1] 3
Mydf <- data.frame(DepPC=c("90","91","92","93","94",
                           "75"), 
                   DProgr=c(1:120),
                   Qty=c(7:31,9:23,99:124,2:28,14:19,
                         21:29,4,3,1:9,66), 
                   Delivered=ifelse(rnorm(120)>0,TRUE,
                                    FALSE))
head(Mydf,15)
##    DepPC DProgr Qty Delivered
## 1     90      1   7     FALSE
## 2     91      2   8     FALSE
## 3     92      3   9      TRUE
## 4     93      4  10      TRUE
## 5     94      5  11      TRUE
## 6     75      6  12      TRUE
## 7     90      7  13     FALSE
## 8     91      8  14      TRUE
## 9     92      9  15     FALSE
## 10    93     10  16     FALSE
## 11    94     11  17     FALSE
## 12    75     12  18      TRUE
## 13    90     13  19      TRUE
## 14    91     14  20     FALSE
## 15    92     15  21     FALSE
tail(Mydf,5)
##     DepPC DProgr Qty Delivered
## 116    91    116   6      TRUE
## 117    92    117   7      TRUE
## 118    93    118   8     FALSE
## 119    94    119   9      TRUE
## 120    75    120  66      TRUE
sapply(Mydf, class)    # show data types for each column using sapply
##     DepPC    DProgr       Qty Delivered 
##  "factor" "integer" "numeric" "logical"
dim( Mydf)
## [1] 120   4
nrow(Mydf); ncol(Mydf) # the same, but separately
## [1] 120
## [1] 4
unique(Mydf$DepPC) # how many departments are there?
## [1] 90 91 92 93 94 75
## Levels: 75 90 91 92 93 94
aggregate(Mydf$Qty,by=Mydf["DepPC"],FUN=sum)
##   DepPC   x
## 1    75 878
## 2    90 689
## 3    91 684
## 4    92 701
## 5    93 707
## 6    94 802
library(ggplot2)
ggplot(data=aggregate(Mydf$Qty,by=Mydf["DepPC"],FUN=sum), 
       aes(x=DepPC, y=x)) +
  geom_point()+
  ggtitle("Sales per department - All")

# select only for delivered=True
Y <- Mydf[Mydf$Delivered==TRUE,]
head(Y)
##    DepPC DProgr Qty Delivered
## 3     92      3   9      TRUE
## 4     93      4  10      TRUE
## 5     94      5  11      TRUE
## 6     75      6  12      TRUE
## 8     91      8  14      TRUE
## 12    75     12  18      TRUE
#So we can repeat the plot:
  ggplot(data=aggregate(Y$Qty,by=Y["DepPC"],FUN=sum),
         aes(x=DepPC, y=x)) +
  geom_point()+
  ggtitle("Sales per department - Delivered")

  m <- as.data.frame(matrix(1:1e7, ncol = 1e5))
  system.time(colMeans(m))
##    user  system elapsed 
##     0.5     0.0     0.5
  # user  system elapsed 
  # 1.69    0.03    1.73 
  system.time(sapply(m, mean))
##    user  system elapsed 
##    0.59    0.00    0.59
  # user  system elapsed 
  # 1.50    0.03    1.60 
  system.time(apply(m, 2, mean))
##    user  system elapsed 
##    1.45    0.05    1.49
  # user  system elapsed 
  # 3.84    0.03    3.90 
  system.time(for(i in 1:ncol(m)) mean(m[, i]))
##    user  system elapsed 
##   12.17    0.00   12.19
  # user  system elapsed 
  # 13.78    0.01   13.93
  
  system.time(as.matrix(m))  #called by `colMeans` and `apply`
##    user  system elapsed 
##    0.47    0.02    0.48
  #   user  system elapsed 
  #   1.03    0.00    1.05
  system.time(for(i in 1:ncol(m)) m[, i])  #in the `for` loop
##    user  system elapsed 
##   11.41    0.00   11.40
  #   user  system elapsed 
  #  12.93    0.01   13.07


  
  mm = as.matrix(m)
  system.time(colMeans(mm))
##    user  system elapsed 
##    0.02    0.00    0.02
  #   user  system elapsed 
  #   0.01    0.00    0.01 
  system.time(apply(mm, 2, mean))
##    user  system elapsed 
##    0.92    0.00    0.93
  #   user  system elapsed 
  #   1.48    0.03    1.53 
  system.time(for(i in 1:ncol(mm)) mean(mm[, i]))
##    user  system elapsed 
##    0.75    0.00    0.75
  #   user  system elapsed 
  #   1.22    0.00    1.21
  for (i in seq_along(mtcars)) {
    means[i] <- mean(mtcars[[i]])
  }
  sds <- numeric(length(mtcars))
  for (i in seq_along(mtcars)) {
    sds[i] <- sd(mtcars[[i]])
  }
  #You can write:
  (means <- vapply(mtcars, mean, numeric(1)))
##        mpg        cyl       disp         hp       drat         wt 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250 
##       qsec         vs         am       gear       carb 
##  17.848750   0.437500   0.406250   3.687500   2.812500
  (sds <- vapply(mtcars, sd, numeric(1)))
##         mpg         cyl        disp          hp        drat          wt 
##   6.0269481   1.7859216 123.9386938  68.5628685   0.5346787   0.9784574 
##        qsec          vs          am        gear        carb 
##   1.7869432   0.5040161   0.4989909   0.7378041   1.6152000