plyr exercise 02

1.对于R的内置数据集mtcars,利用plyr包的函数分别做以下操作:

(1)计算每种汽车的disp/mpg,并按disp/mpg 的结果排序,输出一个包含汽车名的数据框

library(plyr)

disp_per_mpg <- splat(function(disp, mpg, ...) disp/mpg)
vehicle <- function(df) rownames(df)
df = quickdf(each(vehicle, disp_per_mpg)(mtcars))
arrange(df, disp_per_mpg)
##                vehicle disp_per_mpg
## 1       Toyota Corolla        2.097
## 2             Fiat 128        2.429
## 3          Honda Civic        2.490
## 4            Fiat X1-9        2.894
## 5         Lotus Europa        3.128
## 6        Porsche 914-2        4.627
## 7           Datsun 710        4.737
## 8        Toyota Corona        5.586
## 9           Volvo 142E        5.654
## 10           Merc 240D        6.012
## 11            Merc 230        6.175
## 12        Ferrari Dino        7.360
## 13           Mazda RX4        7.619
## 14       Mazda RX4 Wag        7.619
## 15            Merc 280        8.729
## 16           Merc 280C        9.416
## 17      Hornet 4 Drive       12.056
## 18             Valiant       12.431
## 19          Merc 450SL       15.942
## 20          Merc 450SE       16.817
## 21         Merc 450SLC       18.145
## 22   Hornet Sportabout       19.251
## 23         AMC Javelin       20.000
## 24       Maserati Bora       20.067
## 25    Dodge Challenger       20.516
## 26    Pontiac Firebird       20.833
## 27      Ford Pantera L       22.215
## 28          Duster 360       25.175
## 29          Camaro Z28       26.316
## 30   Chrysler Imperial       29.932
## 31 Lincoln Continental       44.231
## 32  Cadillac Fleetwood       45.385

(2)计算数据集中disp的最小值,最大值,平均值和方差

each(max, min, mean, var)(mtcars$disp)
##     max     min    mean     var 
##   472.0    71.1   230.7 15360.8

(3)将数据集中hp的名字改为Ghp

rename(mtcars, replace = c(hp = "Ghp"))
##                      mpg cyl  disp Ghp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

# rename(mtcars,replace=c(hp='Ghp')) #旧变量名不需要引号

(4)计算数据集中am变量中出现0的次数与出现1的次数

count(mtcars, "am")
##   am freq
## 1  0   19
## 2  1   13

2.生成5组分别服从参数为(2,4),(1,2),(1,5)(5,7)(8,9)的均匀分布

(param = data.frame(n = rep(5, 5), min = c(2, 1, 1, 5, 8), max = c(4, 2, 5, 
    7, 9)))
##   n min max
## 1 5   2   4
## 2 5   1   2
## 3 5   1   5
## 4 5   5   7
## 5 5   8   9
mlply(param, runif)
## $`1`
## [1] 3.699 3.644 2.082 3.129 3.108
## 
## $`2`
## [1] 1.147 1.898 1.406 1.414 1.156
## 
## $`3`
## [1] 4.705 4.649 3.835 3.338 1.090
## 
## $`4`
## [1] 6.494 6.392 6.339 5.582 6.145
## 
## $`5`
## [1] 8.652 8.798 8.602 8.297 8.035
## 
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##   n min max
## 1 5   2   4
## 2 5   1   2
## 3 5   1   5
## 4 5   5   7
## 5 5   8   9
alply(param, 1, splat(runif))
## $`1`
## [1] 2.628 3.472 3.303 3.955 2.990
## 
## $`2`
## [1] 1.468 1.598 1.481 1.394 1.438
## 
## $`3`
## [1] 3.983 4.304 4.929 3.476 4.134
## 
## $`4`
## [1] 5.205 5.786 6.951 5.606 5.352
## 
## $`5`
## [1] 8.613 8.752 8.571 8.156 8.796
## 
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##   n min max
## 1 5   2   4
## 2 5   1   2
## 3 5   1   5
## 4 5   5   7
## 5 5   8   9

3.

x <- data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5)

y <- data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5)

以不同的type联合x与y,并简要说出不同type之间的不同。

(x <- data.frame(k1 = c(NA, NA, 3, 4, 5), k2 = c(1, NA, NA, 4, 5), data = 1:5))
##   k1 k2 data
## 1 NA  1    1
## 2 NA NA    2
## 3  3 NA    3
## 4  4  4    4
## 5  5  5    5
(y <- data.frame(k1 = c(NA, 2, NA, 4, 5), k2 = c(NA, NA, 3, 4, 5), data = 1:5))
##   k1 k2 data
## 1 NA NA    1
## 2  2 NA    2
## 3 NA  3    3
## 4  4  4    4
## 5  5  5    5

(1)左连接:保留x的全部行,将y里面y$k1与x$k1相匹配的行(只包括k1以外的列)接到x相应行的后面,y中其他的行不出现(在下面的例子中,没有出现y$k1=2的行就是明证)。

如果参数match=“all"(默认),将保留所有y$k1与x$k1相匹配的组合。

如果参数match="first",将只保留第一个y$k1与x$k1相匹配的组合。

join(x, y, by = "k1")
##   k1 k2 data k2 data
## 1 NA  1    1 NA    1
## 2 NA  1    1  3    3
## 3 NA NA    2 NA    1
## 4 NA NA    2  3    3
## 5  3 NA    3 NA   NA
## 6  4  4    4  4    4
## 7  5  5    5  5    5
join(x, y, by = "k1", match = "first")
##   k1 k2 data k2 data
## 1 NA  1    1 NA    1
## 2 NA NA    2 NA    1
## 3  3 NA    3 NA   NA
## 4  4  4    4  4    4
## 5  5  5    5  5    5

(2)右连接:保留y的全部行,将x里面x$k1与y$k1相匹配的行(只包括k1以外的列)接到y相应行(k1以外的变量)的前面,x中其他的行不出现(在下面的例子中,出现y$k1=2的行、而没有出现x$k1=3的行就是明证)。

match参数的含义同上。

join(x, y, "k1", type = "right")
##   k1 k2 data k2 data
## 1 NA  1    1 NA    1
## 2 NA NA    2 NA    1
## 3  2 NA   NA NA    2
## 4 NA  1    1  3    3
## 5 NA NA    2  3    3
## 6  4  4    4  4    4
## 7  5  5    5  5    5

这句会报错,原因不详,可参考下面的讨论:

join(x, y, by = "k1", type = "right", match = "first")
## Error: Duplicated key in y

用match="all"就没这个问题,因为所有的行每次都会遍历一边。暂时不能理解的是,用左连接也完全不会出现这个问题,无论match参数是什么。

# 注意这里是大写的X和Y,与题目里面小写的x和y互不影响
X = data.frame(k = c(2, 1, 2), v1 = 1:3)
Y = data.frame(k = rep(2, 3), v2 = 1:3)
join(X, Y, by = "k", type = "right", match = "first")  #报错
## Error: Duplicated key in y
join(Y, X, by = "k", type = "right", match = "first")  #报错
## Error: Duplicated key in y

# 如果使用key值不重复的Y1,X右连接Y1不报错了,但Y1右连接X仍然报错
Y1 = Y
Y1[, 1] = 1:3
Y1
##   k v2
## 1 1  1
## 2 2  2
## 3 3  3
join(X, Y1, by = "k", type = "right", match = "first")  #不报错了
##   k v1 v2
## 1 1  2  1
## 2 2  1  2
## 3 3 NA  3
join(Y1, X, by = "k", type = "right", match = "first")  #仍然报错
## Error: Duplicated key in y

# 用match='all'或type='left'参数时都完全正常
join(X, Y, by = "k", type = "right", match = "all")  #正常
##   k v1 v2
## 1 2  1  1
## 2 2  3  1
## 3 2  1  2
## 4 2  3  2
## 5 2  1  3
## 6 2  3  3
join(Y, X, by = "k", type = "right", match = "all")  #正常
##   k v2 v1
## 1 2  1  1
## 2 2  2  1
## 3 2  3  1
## 4 1 NA  2
## 5 2  1  3
## 6 2  2  3
## 7 2  3  3
join(X, Y, by = "k", type = "left", match = "first")  #正常
##   k v1 v2
## 1 2  1  1
## 2 1  2 NA
## 3 2  3  1
join(Y, X, by = "k", type = "left", match = "first")  #正常
##   k v2 v1
## 1 2  1  1
## 2 2  2  1
## 3 2  3  1

# 此问题也不是因为X的key乱序排列引起的:
(X1 = arrange(X, k))
##   k v1
## 1 1  2
## 2 2  1
## 3 2  3
join(X1, Y, by = "k", type = "right", match = "first")  #仍然报错
## Error: Duplicated key in y
join(Y, X1, by = "k", type = "right", match = "first")  #仍然报错
## Error: Duplicated key in y
join(X1, Y, by = "k", type = "right", match = "all")  #正常
##   k v1 v2
## 1 2  1  1
## 2 2  3  1
## 3 2  1  2
## 4 2  3  2
## 5 2  1  3
## 6 2  3  3
join(Y, X1, by = "k", type = "right", match = "all")  #正常
##   k v2 v1
## 1 1 NA  2
## 2 2  1  1
## 3 2  2  1
## 4 2  3  1
## 5 2  1  3
## 6 2  2  3
## 7 2  3  3

结论是:在右连接中,最好不要使用重复的key值,否则会出现莫名其妙的错误。

(3)内连接:只保留x$k1=y$k1的行,也就是x$k1和y$k1的交集部分所在的行。

join(x, y, "k1", type = "inner")
##   k1 k2 data k2 data
## 1 NA  1    1 NA    1
## 2 NA  1    1  3    3
## 3 NA NA    2 NA    1
## 4 NA NA    2  3    3
## 5  4  4    4  4    4
## 6  5  5    5  5    5
join(x, y, by = "k1", type = "inner", match = "first")
##     k1 k2 data k2 data
## 1   NA  1    1 NA    1
## 1.1 NA  1    1 NA    1
## 4    4  4    4  4    4
## 5    5  5    5  5    5

(4)全连接:保留x和y的全部行,合并x$k1和y$k1相匹配的行。

match参数的含义同上。

join(x, y, "k1", type = "full")
##   k1 k2 data
## 1 NA  1    1
## 2 NA  1    1
## 3 NA NA    2
## 4 NA NA    2
## 5  3 NA    3
## 6  4  4    4
## 7  5  5    5
## 8  2 NA    2
join(x, y, "k1", type = "full", match = "first")
##   k1 k2 data
## 1 NA  1    1
## 2 NA NA    2
## 3  3 NA    3
## 4  4  4    4
## 5  5  5    5
## 6  2 NA    2