m <- matrix(data=cbind(rnorm(30, 0), rnorm(30, 2),
rnorm(30, 5)), nrow=30, ncol=3)
m
## [,1] [,2] [,3]
## [1,] 0.77168336 1.9800718 5.177931
## [2,] -1.16892340 1.5922024 5.434991
## [3,] -0.22924367 3.0031525 5.961884
## [4,] 0.51566315 0.1801993 5.241008
## [5,] -0.24349510 1.5474135 3.741760
## [6,] 0.37641538 1.2689883 4.595778
## [7,] -1.33640041 1.6629788 5.144250
## [8,] 1.13349469 2.4526974 5.070174
## [9,] -0.53577357 3.0349881 5.970443
## [10,] 0.04755581 3.1303161 6.313947
## [11,] -0.59650568 2.4775587 5.074128
## [12,] -1.02604823 0.3199054 4.412910
## [13,] 0.98665825 1.8983923 5.096624
## [14,] 0.18307471 2.1382636 4.226789
## [15,] -0.93046630 -0.2516156 4.317249
## [16,] 1.34372752 3.1226625 5.840271
## [17,] -0.17107218 2.4224861 6.473922
## [18,] -0.29197582 1.8473795 5.651875
## [19,] -0.53510253 1.0220945 4.221734
## [20,] -0.59466954 1.1572037 4.936397
## [21,] 1.69984405 2.5771975 5.057423
## [22,] -0.22620296 0.9770107 4.086539
## [23,] 0.48770419 2.1186531 3.868679
## [24,] 0.67449361 1.1942795 4.841737
## [25,] -0.40092539 0.7857326 4.982847
## [26,] -1.25122006 1.7048265 6.732674
## [27,] -1.52117518 0.7670682 4.015147
## [28,] 1.16276139 1.8674759 5.704011
## [29,] -0.50085993 2.9326783 6.848703
## [30,] 0.18223700 3.0532485 4.663616
apply(m, 1, mean)
## [1] 2.643229 1.952757 2.911931 1.978957 1.681893 2.080394 1.823610
## [8] 2.885455 2.823219 3.163940 2.318394 1.235589 2.660558 2.182709
## [15] 1.045056 3.435554 2.908445 2.402426 1.569575 1.832977 3.111488
## [22] 1.612449 2.158346 2.236837 1.789218 2.395427 1.087013 2.911416
## [29] 3.093507 2.633034
apply(m, 2, mean)
## [1] -0.06649156 1.79951699 5.12351470
apply(m, 2, function(x) length(x[x<0]))
## [1] 17 1 0
apply(m, 2, function(x) is.matrix(x))
## [1] FALSE FALSE FALSE
apply(m, 2, is.vector)
## [1] TRUE TRUE TRUE
apply(m, 2, function(x) mean(x[x>0]))
## [1] 0.7357933 1.8702457 5.1235147
#Using sapply and lapply
sapply(1:3, function(x) x^2)
## [1] 1 4 9
lapply(1:3, function(x) x^2)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
sapply(1:3, function(x) x^2, simplify=F)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
unlist(lapply(1:3, function(x) x^2))
## [1] 1 4 9
#!!!!
apply(m,2,mean)
## [1] -0.06649156 1.79951699 5.12351470
sapply(1:3, function(x) mean(m[,x]))
## [1] -0.06649156 1.79951699 5.12351470
sapply(1:3, function(x, y) mean(y[,x]), y=m)
## [1] -0.06649156 1.79951699 5.12351470
A <- matrix(1:9,3,3)
B <- matrix(4:15,4,3)
C <- matrix(c(8,8,9,9,10,10),3,2,byrow=TRUE)
MyList <- list(A,B,C)
MyList
## [[1]]
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
##
## [[2]]
## [,1] [,2] [,3]
## [1,] 4 8 12
## [2,] 5 9 13
## [3,] 6 10 14
## [4,] 7 11 15
##
## [[3]]
## [,1] [,2]
## [1,] 8 8
## [2,] 9 9
## [3,] 10 10
lapply(MyList,"[",,2)
## [[1]]
## [1] 4 5 6
##
## [[2]]
## [1] 8 9 10 11
##
## [[3]]
## [1] 8 9 10
lapply(MyList,"[",,1)
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 4 5 6 7
##
## [[3]]
## [1] 8 9 10
lapply(MyList,"[",2,)
## [[1]]
## [1] 2 5 8
##
## [[2]]
## [1] 5 9 13
##
## [[3]]
## [1] 9 9
lapply(MyList,"[",2,2)
## [[1]]
## [1] 5
##
## [[2]]
## [1] 9
##
## [[3]]
## [1] 9
lapply(MyList,"[", 2,1 )
## [[1]]
## [1] 2
##
## [[2]]
## [1] 5
##
## [[3]]
## [1] 9
sapply(MyList,"[", 2,1 )
## [1] 2 5 9
sapply(MyList,"[", 2,1, simplify=F)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 5
##
## [[3]]
## [1] 9
unlist(lapply(MyList,"[", 2,1 ))
## [1] 2 5 9
Z=sapply(MyList,"[", 1,1 )
Z=rep(Z,c(3,1,2))
Z
## [1] 1 1 1 4 8 8
Q=matrix(c(rep(1, 4), rep(2, 4), rep(3, 4), rep(4, 4)),
4,4)
Q
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 1 2 3 4
## [3,] 1 2 3 4
## [4,] 1 2 3 4
Q=mapply(rep,1:4,4)
Q
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 1 2 3 4
## [3,] 1 2 3 4
## [4,] 1 2 3 4
MyPoints=B # just give an illustrative name to our matrix B
MyPoints
## [,1] [,2] [,3]
## [1,] 4 8 12
## [2,] 5 9 13
## [3,] 6 10 14
## [4,] 7 11 15
MyPoints_means=apply(MyPoints,2,mean)
MyPoints_means
## [1] 5.5 9.5 13.5
MyPoints_sdev=apply(MyPoints,2,sd)
MyPoints_sdev
## [1] 1.290994 1.290994 1.290994
MyPoints_Trans1=sweep(MyPoints,2,MyPoints_means,"-")
MyPoints_Trans1
## [,1] [,2] [,3]
## [1,] -1.5 -1.5 -1.5
## [2,] -0.5 -0.5 -0.5
## [3,] 0.5 0.5 0.5
## [4,] 1.5 1.5 1.5
MyPoints_Trans2=sweep(MyPoints_Trans1,2,MyPoints_sdev,"/")
MyPoints_Trans2
## [,1] [,2] [,3]
## [1,] -1.1618950 -1.1618950 -1.1618950
## [2,] -0.3872983 -0.3872983 -0.3872983
## [3,] 0.3872983 0.3872983 0.3872983
## [4,] 1.1618950 1.1618950 1.1618950
sweep(X, MARGIN, 統計量, FUN=“-”, …)
ベクトルや行列,配列の MARGIN で指定した場所から統計量を引く(関数 scale() も同様の機能).FUN=“+” とすれば統計量を足す
( x <- matrix(1:8, ncol=4) )
## [,1] [,2] [,3] [,4]
## [1,] 1 3 5 7
## [2,] 2 4 6 8
apply(x, 2, sum) # 各列の最大値を求める
## [1] 3 7 11 15
apply(x, c(1,2), sqrt) # 各要素の平方根を求める
## [,1] [,2] [,3] [,4]
## [1,] 1.000000 1.732051 2.236068 2.645751
## [2,] 1.414214 2.000000 2.449490 2.828427
sweep(x, 1, apply(x,1,mean)) # 行平均を引く
## [,1] [,2] [,3] [,4]
## [1,] -3 -1 1 3
## [2,] -3 -1 1 3
sweep(x, 2, apply(x,2,mean)) # 列平均を引く
## [,1] [,2] [,3] [,4]
## [1,] -0.5 -0.5 -0.5 -0.5
## [2,] 0.5 0.5 0.5 0.5
tapply()
グループ化されたデータの処理でよく使う関数が tapply() である.最初はとっつきにくいが,カテゴリカルデータを扱うは結構役に立つ.
data(warpbreaks) # 1台の織機当たりの反り破損の数
attach(warpbreaks) # 使うデータを固定
fc <- factor(tension) # 要素をグループ化
levels(fc) # グループ化されているかを確認
## [1] "L" "M" "H"
tapply(breaks, fc, mean) # breaks の平均をグループごとに計算
## L M H
## 36.38889 26.38889 21.66667
mapply()
トリッキーな使い方かもしれないが,mapply() を使えば規則的なリストを簡単に作成することが出来る.
mapply(rep, 1:3, 3:1) # rep(1,3), rep(2,2), rep(3,1) を並べる
## [[1]]
## [1] 1 1 1
##
## [[2]]
## [1] 2 2
##
## [[3]]
## [1] 3
mapply(rep, 1:3, 3:1) # rep(3,1), rep(2,2), rep(1,3) を並べる
## [[1]]
## [1] 1 1 1
##
## [[2]]
## [1] 2 2
##
## [[3]]
## [1] 3
Mydf <- data.frame(DepPC=c("90","91","92","93","94",
"75"),
DProgr=c(1:120),
Qty=c(7:31,9:23,99:124,2:28,14:19,
21:29,4,3,1:9,66),
Delivered=ifelse(rnorm(120)>0,TRUE,
FALSE))
head(Mydf,15)
## DepPC DProgr Qty Delivered
## 1 90 1 7 FALSE
## 2 91 2 8 FALSE
## 3 92 3 9 TRUE
## 4 93 4 10 TRUE
## 5 94 5 11 TRUE
## 6 75 6 12 TRUE
## 7 90 7 13 FALSE
## 8 91 8 14 TRUE
## 9 92 9 15 FALSE
## 10 93 10 16 FALSE
## 11 94 11 17 FALSE
## 12 75 12 18 TRUE
## 13 90 13 19 TRUE
## 14 91 14 20 FALSE
## 15 92 15 21 FALSE
tail(Mydf,5)
## DepPC DProgr Qty Delivered
## 116 91 116 6 TRUE
## 117 92 117 7 TRUE
## 118 93 118 8 FALSE
## 119 94 119 9 TRUE
## 120 75 120 66 TRUE
sapply(Mydf, class) # show data types for each column using sapply
## DepPC DProgr Qty Delivered
## "factor" "integer" "numeric" "logical"
dim( Mydf)
## [1] 120 4
nrow(Mydf); ncol(Mydf) # the same, but separately
## [1] 120
## [1] 4
unique(Mydf$DepPC) # how many departments are there?
## [1] 90 91 92 93 94 75
## Levels: 75 90 91 92 93 94
aggregate(Mydf$Qty,by=Mydf["DepPC"],FUN=sum)
## DepPC x
## 1 75 878
## 2 90 689
## 3 91 684
## 4 92 701
## 5 93 707
## 6 94 802
library(ggplot2)
ggplot(data=aggregate(Mydf$Qty,by=Mydf["DepPC"],FUN=sum),
aes(x=DepPC, y=x)) +
geom_point()+
ggtitle("Sales per department - All")
# select only for delivered=True
Y <- Mydf[Mydf$Delivered==TRUE,]
head(Y)
## DepPC DProgr Qty Delivered
## 3 92 3 9 TRUE
## 4 93 4 10 TRUE
## 5 94 5 11 TRUE
## 6 75 6 12 TRUE
## 8 91 8 14 TRUE
## 12 75 12 18 TRUE
#So we can repeat the plot:
ggplot(data=aggregate(Y$Qty,by=Y["DepPC"],FUN=sum),
aes(x=DepPC, y=x)) +
geom_point()+
ggtitle("Sales per department - Delivered")
m <- as.data.frame(matrix(1:1e7, ncol = 1e5))
system.time(colMeans(m))
## user system elapsed
## 0.5 0.0 0.5
# user system elapsed
# 1.69 0.03 1.73
system.time(sapply(m, mean))
## user system elapsed
## 0.59 0.00 0.59
# user system elapsed
# 1.50 0.03 1.60
system.time(apply(m, 2, mean))
## user system elapsed
## 1.45 0.05 1.49
# user system elapsed
# 3.84 0.03 3.90
system.time(for(i in 1:ncol(m)) mean(m[, i]))
## user system elapsed
## 12.17 0.00 12.19
# user system elapsed
# 13.78 0.01 13.93
system.time(as.matrix(m)) #called by `colMeans` and `apply`
## user system elapsed
## 0.47 0.02 0.48
# user system elapsed
# 1.03 0.00 1.05
system.time(for(i in 1:ncol(m)) m[, i]) #in the `for` loop
## user system elapsed
## 11.41 0.00 11.40
# user system elapsed
# 12.93 0.01 13.07
mm = as.matrix(m)
system.time(colMeans(mm))
## user system elapsed
## 0.02 0.00 0.02
# user system elapsed
# 0.01 0.00 0.01
system.time(apply(mm, 2, mean))
## user system elapsed
## 0.92 0.00 0.93
# user system elapsed
# 1.48 0.03 1.53
system.time(for(i in 1:ncol(mm)) mean(mm[, i]))
## user system elapsed
## 0.75 0.00 0.75
# user system elapsed
# 1.22 0.00 1.21
for (i in seq_along(mtcars)) {
means[i] <- mean(mtcars[[i]])
}
sds <- numeric(length(mtcars))
for (i in seq_along(mtcars)) {
sds[i] <- sd(mtcars[[i]])
}
#You can write:
(means <- vapply(mtcars, mean, numeric(1)))
## mpg cyl disp hp drat wt
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250
## qsec vs am gear carb
## 17.848750 0.437500 0.406250 3.687500 2.812500
(sds <- vapply(mtcars, sd, numeric(1)))
## mpg cyl disp hp drat wt
## 6.0269481 1.7859216 123.9386938 68.5628685 0.5346787 0.9784574
## qsec vs am gear carb
## 1.7869432 0.5040161 0.4989909 0.7378041 1.6152000