plyr exercise 01

1.对于R的内置数据集mtcars,利用plyr包的函数分别做以下操作:

(1)把数据集按汽车的换挡方式"am"进行分组,分别计算自动挡和手动挡中各个变量的平均数,要求分别做出输出格式为list,array和dataframe的结果;

library(plyr)
# Bind the built-in mtcars data to mt so later steps can add columns to it;
# the outer parentheses make R print the assigned value.
(mt <- mtcars)
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

# Turn am into a factor with readable labels (0 -> "automatic", 1 -> "manual").
# Levels are given explicitly so the mapping does not depend on which values
# happen to be present in the data.
mt$am <- factor(mt$am, levels = c(0, 1), labels = c("automatic", "manual"))

# numcolwise() behaves like colwise() (see ?colwise) but applies the function
# to numeric columns only and skips categorical ones, which is convenient for
# data sets with mixed column types.
# Per-am column means, returned as a list.
dlply(.data = mt, .variables = .(am), .fun = numcolwise(mean))
## $automatic
##     mpg   cyl  disp    hp  drat    wt  qsec     vs  gear  carb
## 1 17.15 6.947 290.4 160.3 3.286 3.769 18.18 0.3684 3.211 2.737
## 
## $manual
##     mpg   cyl  disp    hp drat    wt  qsec     vs  gear  carb
## 1 24.39 5.077 143.5 126.8 4.05 2.411 17.36 0.5385 4.385 2.923
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##          am
## 1 automatic
## 2    manual
# Same per-am column means, returned as an array.
daply(.data = mt, .variables = .(am), .fun = numcolwise(mean))
##            
## am          mpg   cyl   disp  hp    drat  wt    qsec  vs     gear  carb 
##   automatic 17.15 6.947 290.4 160.3 3.286 3.769 18.18 0.3684 3.211 2.737
##   manual    24.39 5.077 143.5 126.8 4.05  2.411 17.36 0.5385 4.385 2.923
# Same per-am column means, returned as a data frame.
ddply(.data = mt, .variables = .(am), .fun = numcolwise(mean))
##          am   mpg   cyl  disp    hp  drat    wt  qsec     vs  gear  carb
## 1 automatic 17.15 6.947 290.4 160.3 3.286 3.769 18.18 0.3684 3.211 2.737
## 2    manual 24.39 5.077 143.5 126.8 4.050 2.411 17.36 0.5385 4.385 2.923

(2)把数据集按照车的重量"wt"分为三组(wt<=2;2<wt<=4; 4<wt),分别计算各个车重范围内"mpg"(每加仑汽油可行驶的英里数)的平均值,要求分别做出输出格式为list,array和dataframe的结果;

# Add a grouping column that bins wt into three weight classes.
# cut() uses right-closed intervals by default, i.e. (-Inf, 2], (2, 4] and
# (4, Inf), which matches wt <= 2, 2 < wt <= 4 and wt > 4. Unlike relabelling
# the result of nested ifelse() calls, this also works when a bin is empty.
mt$class <- cut(
  mt$wt,
  breaks = c(-Inf, 2, 4, Inf),
  labels = c("wt<=2", "2<wt<=4", "wt>4")
)

# Results: mean mpg per weight class, returned as a list.
dlply(
  .data = mt,
  .variables = .(class),
  .fun = summarize,
  mean.mpg = mean(mpg)
)
## $`wt<=2`
##   mean.mpg
## 1     30.5
## 
## $`2<wt<=4`
##   mean.mpg
## 1    19.54
## 
## $`wt>4`
##   mean.mpg
## 1    12.97
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##     class
## 1   wt<=2
## 2 2<wt<=4
## 3    wt>4

# Using summarize inside daply as well would turn the result into a list, so a
# different approach is used here: average every numeric column per class and
# keep only the mpg column. The column is selected by name rather than by
# position, and drop = FALSE keeps the result a one-column array.

daply(mt, .(class), numcolwise(mean))[, "mpg", drop = FALSE]
##          
## class     mpg  
##   wt<=2   30.5 
##   2<wt<=4 19.54
##   wt>4    12.97

# Mean mpg per weight class, returned as a data frame.
ddply(
  .data = mt,
  .variables = .(class),
  .fun = summarize,
  mean.mpg = mean(mpg)
)
##     class mean.mpg
## 1   wt<=2    30.50
## 2 2<wt<=4    19.54
## 3    wt>4    12.97

(3)把数据集按照汽车的换挡方式"am"和上面设置的车重范围(即新增的"class"列)进行分组,计算不同换挡方式和车重范围组合下"mpg"(每加仑汽油可行驶的英里数)的平均值。

# Mean mpg for every am / class combination as an array. With summarize as
# .fun, combinations that have no rows are printed as NULL.
daply(
  .data = mt,
  .variables = .(am, class),
  .fun = summarize,
  mean.mpg = mean(mpg)
)
##            class
## am          wt<=2 2<wt<=4 wt>4 
##   automatic NULL  18.26   12.97
##   manual    30.5  21.68   NULL
# Mean mpg for every am / class combination present in the data, as a list.
dlply(
  .data = mt,
  .variables = .(am, class),
  .fun = summarize,
  mean.mpg = mean(mpg)
)
## $`automatic.2<wt<=4`
##   mean.mpg
## 1    18.26
## 
## $`automatic.wt>4`
##   mean.mpg
## 1    12.97
## 
## $`manual.wt<=2`
##   mean.mpg
## 1     30.5
## 
## $`manual.2<wt<=4`
##   mean.mpg
## 1    21.68
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##          am   class
## 1 automatic 2<wt<=4
## 2 automatic    wt>4
## 3    manual   wt<=2
## 4    manual 2<wt<=4
# Mean mpg for every am / class combination present in the data, as a data
# frame.
ddply(
  .data = mt,
  .variables = .(am, class),
  .fun = summarize,
  mean.mpg = mean(mpg)
)
##          am   class mean.mpg
## 1 automatic 2<wt<=4    18.26
## 2 automatic    wt>4    12.97
## 3    manual   wt<=2    30.50
## 4    manual 2<wt<=4    21.68

2.对于R的内置数据集iris,把species这一列数据除去后,生成一个新的矩阵,计算

(1)按照列的方向的数值平均值,要求做出输出格式分别为list,array和dataframe的结果;

# Drop the 5th column of iris and turn the remaining columns into a matrix.
ir <- as.matrix(iris[, -5])
# Column means (margin 2) as a list. .dims = TRUE is spelled out in full
# because T can be reassigned; per ?alply it carries the input's dimension
# names over to the result.
alply(ir, 2, mean, .dims = TRUE)
## $Sepal.Length
## [1] 5.843
## 
## $Sepal.Width
## [1] 3.057
## 
## $Petal.Length
## [1] 3.758
## 
## $Petal.Width
## [1] 1.199
## 
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##             X1
## 1 Sepal.Length
## 2  Sepal.Width
## 3 Petal.Length
## 4  Petal.Width
# Column means (margin 2) as an array; TRUE is spelled out because T can be
# reassigned.
aaply(ir, 2, mean, .drop = TRUE)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        5.843        3.057        3.758        1.199
# Column means (margin 2) as a data frame.
adply(.data = ir, .margins = 2, .fun = mean)
##             X1    V1
## 1 Sepal.Length 5.843
## 2  Sepal.Width 3.057
## 3 Petal.Length 3.758
## 4  Petal.Width 1.199

# 或者用如下的简单办法

# colMeans(ir)

# 或笨办法

# do.call(colwise(mean),list(iris[,-5]))

(2)按照行的方向的数值总和,要求做出输出格式分别为list,array和dataframe的结果。(注意:下面的示例代码和输出计算的其实是每行的平均值;如果需要的是总和,把代码中的mean换成sum即可,对应的简单办法是rowSums(ir)。)

# Output of the first two calls is omitted.
# NOTE(review): the exercise statement asks for row sums, but these calls (and
# the printed result below) compute row means; use sum / rowSums(ir) instead
# if sums are what is wanted.
alply(.data = ir, .margins = 1, .fun = mean)
adply(.data = ir, .margins = 1, .fun = mean)
aaply(.data = ir, .margins = 1, .fun = mean)
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 2.550 2.375 2.350 2.350 2.550 2.850 2.425 2.525 2.225 2.400 2.700 2.500 
##    13    14    15    16    17    18    19    20    21    22    23    24 
## 2.325 2.125 2.800 3.000 2.750 2.575 2.875 2.675 2.675 2.675 2.350 2.650 
##    25    26    27    28    29    30    31    32    33    34    35    36 
## 2.575 2.450 2.600 2.600 2.550 2.425 2.425 2.675 2.725 2.825 2.425 2.400 
##    37    38    39    40    41    42    43    44    45    46    47    48 
## 2.625 2.500 2.225 2.550 2.525 2.100 2.275 2.675 2.800 2.375 2.675 2.350 
##    49    50    51    52    53    54    55    56    57    58    59    60 
## 2.675 2.475 4.075 3.900 4.100 3.275 3.850 3.575 3.975 2.900 3.850 3.300 
##    61    62    63    64    65    66    67    68    69    70    71    72 
## 2.875 3.650 3.300 3.775 3.350 3.900 3.650 3.400 3.600 3.275 3.925 3.550 
##    73    74    75    76    77    78    79    80    81    82    83    84 
## 3.800 3.700 3.725 3.850 3.950 4.100 3.725 3.200 3.200 3.150 3.400 3.850 
##    85    86    87    88    89    90    91    92    93    94    95    96 
## 3.600 3.875 4.000 3.575 3.500 3.325 3.425 3.775 3.400 2.900 3.450 3.525 
##    97    98    99   100   101   102   103   104   105   106   107   108 
## 3.525 3.675 2.925 3.475 4.525 3.875 4.525 4.150 4.375 4.825 3.400 4.575 
##   109   110   111   112   113   114   115   116   117   118   119   120 
## 4.200 4.850 4.200 4.075 4.350 3.800 4.025 4.300 4.200 5.100 4.875 3.675 
##   121   122   123   124   125   126   127   128   129   130   131   132 
## 4.525 3.825 4.800 3.925 4.450 4.550 3.900 3.950 4.225 4.400 4.550 5.025 
##   133   134   135   136   137   138   139   140   141   142   143   144 
## 4.250 3.925 3.925 4.775 4.425 4.200 3.900 4.375 4.450 4.350 3.875 4.550 
##   145   146   147   148   149   150 
## 4.550 4.300 3.925 4.175 4.325 3.950

# 或者用如下的简单办法

# rowMeans(ir)

附注:关于summarize函数

plyr包提供的summarize(==summarise)函数可以计算一个数据框中变量的各种统计量。

max,min,mean,median,length,unique,fivenum,sqrt,lm等等,凡是能够用到数据集上的统计函数(参见library(help="stats")),通通都可以传给summarize函数,最后返回的是各个统计量组成的新数据框。

例如(见?summarise):

# Span of years covered and number of distinct teams in the baseball data.
summarise(
  baseball,
  duration = max(year) - min(year),
  nteams = length(unique(team))
)

##   duration nteams
## 1      136    132

在**ply函数中,如果传入的.fun参数是summarize,那么.fun之后的其他参数(即...)都会原样传给summarize,作为要计算的统计量。

为了说明summarize函数的工作原理,我们依次运行以下三条语句:

# Split by class and return each piece untouched (.fun = NULL works as well).
dlply(.data = mt, .variables = .(class), .fun = identity)
## $`wt<=2`
##    mpg cyl disp  hp drat    wt  qsec vs     am gear carb class
## 1 30.4   4 75.7  52 4.93 1.615 18.52  1 manual    4    2 wt<=2
## 2 33.9   4 71.1  65 4.22 1.835 19.90  1 manual    4    1 wt<=2
## 3 27.3   4 79.0  66 4.08 1.935 18.90  1 manual    4    1 wt<=2
## 4 30.4   4 95.1 113 3.77 1.513 16.90  1 manual    5    2 wt<=2
## 
## $`2<wt<=4`
##     mpg cyl  disp  hp drat    wt  qsec vs        am gear carb   class
## 1  21.0   6 160.0 110 3.90 2.620 16.46  0    manual    4    4 2<wt<=4
## 2  21.0   6 160.0 110 3.90 2.875 17.02  0    manual    4    4 2<wt<=4
## 3  22.8   4 108.0  93 3.85 2.320 18.61  1    manual    4    1 2<wt<=4
## 4  21.4   6 258.0 110 3.08 3.215 19.44  1 automatic    3    1 2<wt<=4
## 5  18.7   8 360.0 175 3.15 3.440 17.02  0 automatic    3    2 2<wt<=4
## 6  18.1   6 225.0 105 2.76 3.460 20.22  1 automatic    3    1 2<wt<=4
## 7  14.3   8 360.0 245 3.21 3.570 15.84  0 automatic    3    4 2<wt<=4
## 8  24.4   4 146.7  62 3.69 3.190 20.00  1 automatic    4    2 2<wt<=4
## 9  22.8   4 140.8  95 3.92 3.150 22.90  1 automatic    4    2 2<wt<=4
## 10 19.2   6 167.6 123 3.92 3.440 18.30  1 automatic    4    4 2<wt<=4
## 11 17.8   6 167.6 123 3.92 3.440 18.90  1 automatic    4    4 2<wt<=4
## 12 17.3   8 275.8 180 3.07 3.730 17.60  0 automatic    3    3 2<wt<=4
## 13 15.2   8 275.8 180 3.07 3.780 18.00  0 automatic    3    3 2<wt<=4
## 14 32.4   4  78.7  66 4.08 2.200 19.47  1    manual    4    1 2<wt<=4
## 15 21.5   4 120.1  97 3.70 2.465 20.01  1 automatic    3    1 2<wt<=4
## 16 15.5   8 318.0 150 2.76 3.520 16.87  0 automatic    3    2 2<wt<=4
## 17 15.2   8 304.0 150 3.15 3.435 17.30  0 automatic    3    2 2<wt<=4
## 18 13.3   8 350.0 245 3.73 3.840 15.41  0 automatic    3    4 2<wt<=4
## 19 19.2   8 400.0 175 3.08 3.845 17.05  0 automatic    3    2 2<wt<=4
## 20 26.0   4 120.3  91 4.43 2.140 16.70  0    manual    5    2 2<wt<=4
## 21 15.8   8 351.0 264 4.22 3.170 14.50  0    manual    5    4 2<wt<=4
## 22 19.7   6 145.0 175 3.62 2.770 15.50  0    manual    5    6 2<wt<=4
## 23 15.0   8 301.0 335 3.54 3.570 14.60  0    manual    5    8 2<wt<=4
## 24 21.4   4 121.0 109 4.11 2.780 18.60  1    manual    4    2 2<wt<=4
## 
## $`wt>4`
##    mpg cyl  disp  hp drat    wt  qsec vs        am gear carb class
## 1 16.4   8 275.8 180 3.07 4.070 17.40  0 automatic    3    3  wt>4
## 2 10.4   8 472.0 205 2.93 5.250 17.98  0 automatic    3    4  wt>4
## 3 10.4   8 460.0 215 3.00 5.424 17.82  0 automatic    3    4  wt>4
## 4 14.7   8 440.0 230 3.23 5.345 17.42  0 automatic    3    4  wt>4
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##     class
## 1   wt<=2
## 2 2<wt<=4
## 3    wt>4

# Pull the mpg column out of each per-class data frame.
dlply(.data = mt, .variables = .(class), .fun = summarize, mpg)
## $`wt<=2`
##    ..1
## 1 30.4
## 2 33.9
## 3 27.3
## 4 30.4
## 
## $`2<wt<=4`
##     ..1
## 1  21.0
## 2  21.0
## 3  22.8
## 4  21.4
## 5  18.7
## 6  18.1
## 7  14.3
## 8  24.4
## 9  22.8
## 10 19.2
## 11 17.8
## 12 17.3
## 13 15.2
## 14 32.4
## 15 21.5
## 16 15.5
## 17 15.2
## 18 13.3
## 19 19.2
## 20 26.0
## 21 15.8
## 22 19.7
## 23 15.0
## 24 21.4
## 
## $`wt>4`
##    ..1
## 1 16.4
## 2 10.4
## 3 10.4
## 4 14.7
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##     class
## 1   wt<=2
## 2 2<wt<=4
## 3    wt>4

# Take the mean of the extracted mpg column within each class.
dlply(.data = mt, .variables = .(class), .fun = summarize, mean(mpg))
## $`wt<=2`
##    ..1
## 1 30.5
## 
## $`2<wt<=4`
##     ..1
## 1 19.54
## 
## $`wt>4`
##     ..1
## 1 12.97
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##     class
## 1   wt<=2
## 2 2<wt<=4
## 3    wt>4

这就相当于用一个函数取数据框中的变量,再求这个变量的统计量,也就是用fun(dataframe, variable)的形式返回一个变量的数据列。

上面的最后一条语句相当于对每一个分组都进行类似下面的操作:

# class is a factor with labels "wt<=2", "2<wt<=4" and "wt>4", so a group is
# selected by its label; comparing the factor with 1 would match no rows.
summarize(subset(mt, class == "wt<=2"), mean(mpg))

而一般我们用基础R包做的时候,只能用方括号的操作stats_func(dataframe[, variable])(例如mean(mt[mt$class=="wt<=2","mpg"]))。这种形式不利于函数式的处理,不便直接应用在d*ply函数中。