dplyr

安装、加载包

# install.packages('dplyr') install.packages('tidyr')
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

一、dplyr包相关函数

1.1 add_rownames(): 将行名转为显式变量

add_rownames(df, var = “rowname”)

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

head(mtcars %>% tbl_df())

## Source: local data frame [6 x 11]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  21.0     6   160   110  3.90 2.620 16.46     0     1     4     4
## 2  21.0     6   160   110  3.90 2.875 17.02     0     1     4     4
## 3  22.8     4   108    93  3.85 2.320 18.61     1     1     4     1
## 4  21.4     6   258   110  3.08 3.215 19.44     1     0     3     1
## 5  18.7     8   360   175  3.15 3.440 17.02     0     0     3     2
## 6  18.1     6   225   105  2.76 3.460 20.22     1     0     3     1

head(mtcars %>% add_rownames())

## Source: local data frame [6 x 12]
## 
##             rowname   mpg   cyl  disp    hp  drat    wt  qsec    vs    am
##               <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         Mazda RX4  21.0     6   160   110  3.90 2.620 16.46     0     1
## 2     Mazda RX4 Wag  21.0     6   160   110  3.90 2.875 17.02     0     1
## 3        Datsun 710  22.8     4   108    93  3.85 2.320 18.61     1     1
## 4    Hornet 4 Drive  21.4     6   258   110  3.08 3.215 19.44     1     0
## 5 Hornet Sportabout  18.7     8   360   175  3.15 3.440 17.02     0     0
## 6           Valiant  18.1     6   225   105  2.76 3.460 20.22     1     0
## Variables not shown: gear <dbl>, carb <dbl>.

1.2 all.equal.tbl_df(): 比较两个数据框是否相同

S3 method for class ‘tbl_df’

all.equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, …)

S3 method for class ‘tbl_dt’

all.equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, …)

target,current：需比较的数据框
ignore_col_order：是否忽略变量顺序，默认值为TRUE
ignore_row_order：是否忽略行顺序，默认值为TRUE
convert：默认值为FALSE，是否转换变量的数据类型

scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))]
mtcars_df <- tbl_df(mtcars)
all.equal(target = mtcars_df, current = scramble(mtcars_df))

## [1] TRUE

all.equal(target = mtcars_df, current = scramble(mtcars_df), ignore_col_order = FALSE)

## [1] "Column names same but in different order"

all.equal(target = mtcars_df, current = scramble(mtcars_df), ignore_row_order = FALSE)

##  [1] "Attributes: < Component \"row.names\": Modes: numeric, character >"              
##  [2] "Attributes: < Component \"row.names\": target is numeric, current is character >"
##  [3] "Component \"mpg\": Mean relative difference: 0.3596205"                          
##  [4] "Component \"cyl\": Mean relative difference: 0.5614035"                          
##  [5] "Component \"disp\": Mean relative difference: 0.7018268"                         
##  [6] "Component \"hp\": Mean relative difference: 0.521815"                            
##  [7] "Component \"drat\": Mean relative difference: 0.2149861"                         
##  [8] "Component \"wt\": Mean relative difference: 0.3676665"                           
##  [9] "Component \"qsec\": Mean relative difference: 0.1184257"                         
## [10] "Component \"vs\": Mean relative difference: 2"                                   
## [11] "Component \"am\": Mean relative difference: 2"                                   
## [12] "Component \"gear\": Mean relative difference: 0.3448276"                         
## [13] "Component \"carb\": Mean relative difference: 0.7297297"

1.3 arrange(): 对数据框按变量进行排序

arrange(.data, …)

mtcars[1:5, ]

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

arrange(.data = mtcars[1:5, ], cyl, disp)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 2 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 3 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

arrange(.data = mtcars[1:5, ], desc(cyl), desc(disp))

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 2 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 3 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 4 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 5 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1

arrange(.data = mtcars[1:5, ], desc(cyl, disp))

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 2 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 3 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1

arrange(.data = mtcars[1:5, ], desc(disp))

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 2 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 3 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 4 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 5 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1

1.4 between(): 判断数据是否在某个数值区间中，返回逻辑值

between(x, left, right)

x <- rnorm(10)
x

##  [1]  0.7785705 -0.4961340  0.8021603 -1.3345285 -1.0332436 -0.4169530
##  [7] -0.2867346 -0.6100643  0.5070427 -0.6159983

between(x = x, left = -1, right = 1)

##  [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE

x[between(x = x, left = -1, right = 1)]

## [1]  0.7785705 -0.4961340  0.8021603 -0.4169530 -0.2867346 -0.6100643
## [7]  0.5070427 -0.6159983

1.5 do(): 对tbl数据进行某种操作

by_cyl <- group_by(mtcars, cyl)
do(by_cyl, head(., 2))

## Source: local data frame [6 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 2  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 3  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
## 4  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
## 5  18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
## 6  14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4

models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
models[[2]]

## [[1]]
## 
## Call:
## lm(formula = mpg ~ disp, data = .)
## 
## Coefficients:
## (Intercept)         disp  
##     40.8720      -0.1351  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = mpg ~ disp, data = .)
## 
## Coefficients:
## (Intercept)         disp  
##   19.081987     0.003605  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = mpg ~ disp, data = .)
## 
## Coefficients:
## (Intercept)         disp  
##    22.03280     -0.01963

1.6 filter(): 按条件筛选行（样本）

filter(.data = mtcars, cyl == 8)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 2  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 3  16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 4  17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 5  15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 6  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 7  10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 8  14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 9  15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 10 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 11 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 12 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 13 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 14 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8

filter(.data = mtcars, cyl < 6)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 2  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 3  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 4  32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 5  30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 6  33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 7  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 8  27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 9  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 10 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 11 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

# Multiple criteria
filter(.data = mtcars, cyl < 6 & vs == 1)  # and

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 2  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 3  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 4  32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 5  30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 6  33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 7  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 8  27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 9  30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 10 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

filter(.data = mtcars, cyl < 6 | vs == 1)  # or

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 2  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 3  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 4  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 5  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 6  19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 7  17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 8  32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 9  30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 10 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 11 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 12 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 13 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 14 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 15 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

# Multiple arguments are equivalent to and
filter(.data = mtcars, cyl < 6, vs == 1)

##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 2  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 3  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 4  32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 5  30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 6  33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 7  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 8  27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 9  30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 10 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

1.7 group_by(): 将数据框按变量进行分组

1. group_by(.data, …, add = FALSE)
1. group_by_(.data, …, .dots, add = FALSE)

by_cyl <- group_by(mtcars, cyl)
n_groups(by_cyl)

## [1] 3

group_size(by_cyl)

## [1] 11  7 14

summarise(by_cyl, mean(disp), mean(hp))

## Source: local data frame [3 x 3]
## 
##     cyl mean(disp)  mean(hp)
##   <dbl>      <dbl>     <dbl>
## 1     4   105.1364  82.63636
## 2     6   183.3143 122.28571
## 3     8   353.1000 209.21429

filter(by_cyl, disp == max(disp))

## Source: local data frame [3 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
## 2  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 3  10.4     8 472.0   205  2.93 5.250 17.98     0     0     3     4

# summarise peels off a single layer of grouping
by_vs_am <- group_by(mtcars, vs, am)
by_vs <- summarise(by_vs_am, n = n())
by_vs

## Source: local data frame [4 x 3]
## Groups: vs [?]
## 
##      vs    am     n
##   (dbl) (dbl) (int)
## 1     0     0    12
## 2     0     1     6
## 3     1     0     7
## 4     1     1     7

summarise(by_vs, n = sum(n))

## Source: local data frame [2 x 2]
## 
##      vs     n
##   <dbl> <int>
## 1     0    18
## 2     1    14

# use ungroup() to remove if not wanted
summarise(ungroup(by_vs), n = sum(n))

## Source: local data frame [1 x 1]
## 
##       n
##   <int>
## 1    32

# You can group by expressions: this is just short-hand for a mutate/rename
# followed by a simple group_by
group_by(mtcars, vsam = vs + am)

## Source: local data frame [32 x 12]
## Groups: vsam [3]
## 
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb  vsam
##    (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1   21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4     1
## 2   21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4     1
## 3   22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1     2
## 4   21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1     1
## 5   18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2     0
## 6   18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1     1
## 7   14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4     0
## 8   24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2     1
## 9   22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2     1
## 10  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4     1
## ..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...

group_by(mtcars, vs2 = vs)

## Source: local data frame [32 x 12]
## Groups: vs2 [2]
## 
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb   vs2
##    (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1   21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4     0
## 2   21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4     0
## 3   22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1     1
## 4   21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1     1
## 5   18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2     0
## 6   18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1     1
## 7   14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4     0
## 8   24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2     1
## 9   22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2     1
## 10  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4     1
## ..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...

1.8 mutate(): 在数据框中增加新变量

1. mutate(.data, …)，在数据框中新增变量，且保留原有变量
1. mutate_(.data, …, .dots)
1. transmute(.data, …)，只保留新增变量，删除原有变量
1. transmute_(.data, …, .dots)

head(mutate(mtcars, displ_l = disp/61.0237, displ_2 = displ_l/2))

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb  displ_l   displ_2
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 2.621932 1.3109661
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 2.621932 1.3109661
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 1.769804 0.8849021
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 4.227866 2.1139328
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 5.899347 2.9496737
## 6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 3.687092 1.8435460

head(transmute(mtcars, displ_l = disp/61.0237, displ_2 = displ_l/2))

##    displ_l   displ_2
## 1 2.621932 1.3109661
## 2 2.621932 1.3109661
## 3 1.769804 0.8849021
## 4 4.227866 2.1139328
## 5 5.899347 2.9496737
## 6 3.687092 1.8435460

1.9 order_by(): 对变量进行排序后，再应用函数

order_by(order_by, call)

df <- data.frame(year = 2000:2005, value = (0:5)^2)
scrambled <- df[sample(nrow(df)), ]
scrambled

##   year value
## 3 2002     4
## 2 2001     1
## 4 2003     9
## 1 2000     0
## 5 2004    16
## 6 2005    25

wrong <- mutate(scrambled, running = cumsum(value))
wrong

##   year value running
## 1 2002     4       4
## 2 2001     1       5
## 3 2003     9      14
## 4 2000     0      14
## 5 2004    16      30
## 6 2005    25      55

arrange(wrong, year)

##   year value running
## 1 2000     0      14
## 2 2001     1       5
## 3 2002     4       4
## 4 2003     9      14
## 5 2004    16      30
## 6 2005    25      55

right <- mutate(scrambled, running = order_by(year, cumsum(value)))
right

##   year value running
## 1 2002     4       5
## 2 2001     1       1
## 3 2003     9      14
## 4 2000     0       0
## 5 2004    16      30
## 6 2005    25      55

arrange(right, year)

##   year value running
## 1 2000     0       0
## 2 2001     1       1
## 3 2002     4       5
## 4 2003     9      14
## 5 2004    16      30
## 6 2005    25      55

1.10 rowwise(): 按行输入

df <- expand.grid(x = 1:3, y = 3:1)
df

##   x y
## 1 1 3
## 2 2 3
## 3 3 3
## 4 1 2
## 5 2 2
## 6 3 2
## 7 1 1
## 8 2 1
## 9 3 1

rowwise(df)

## Source: local data frame [9 x 2]
## Groups: <by row>
## 
##       x     y
##   (int) (int)
## 1     1     3
## 2     2     3
## 3     3     3
## 4     1     2
## 5     2     2
## 6     3     2
## 7     1     1
## 8     2     1
## 9     3     1

df %>% rowwise() %>% do(i = seq(.$x, .$y))

## Source: local data frame [9 x 1]
## Groups: <by row>
## 
##          i
##      (chr)
## 1 <int[3]>
## 2 <int[2]>
## 3 <int[1]>
## 4 <int[2]>
## 5 <int[1]>
## 6 <int[2]>
## 7 <int[1]>
## 8 <int[2]>
## 9 <int[3]>

1.11 sample(): 抽样

用法：

1. sample_n(tbl, size, replace = FALSE, weight = NULL, .env = parent.frame())
1. sample_frac(tbl, size = 1, replace = FALSE, weight = NULL, .env = parent.frame())

# Sample fixed number per group
sample_n(mtcars, 10)

##                   mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Pontiac Firebird 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Valiant          18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 450SE       16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL       17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Toyota Corolla   33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Honda Civic      30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Fiat X1-9        27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Merc 230         22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Fiat 128         32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Mazda RX4 Wag    21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4

sample_n(mtcars, 10, replace = TRUE)

##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Ferrari Dino      19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Maserati Bora     15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Toyota Corona     21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Chrysler Imperial 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128          32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Merc 450SLC       15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3

sample_n(mtcars, 10, weight = mpg)

##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Porsche 914-2     26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Merc 450SLC       15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Fiat 128          32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Volvo 142E        21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Toyota Corolla    33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Merc 280          19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Honda Civic       30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2

by_cyl <- mtcars %>% group_by(cyl)
sample_n(by_cyl, 3)

## Source: local data frame [9 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
## 2  32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1
## 3  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 4  19.7     6 145.0   175  3.62 2.770 15.50     0     1     5     6
## 5  17.8     6 167.6   123  3.92 3.440 18.90     1     0     4     4
## 6  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
## 7  10.4     8 460.0   215  3.00 5.424 17.82     0     0     3     4
## 8  16.4     8 275.8   180  3.07 4.070 17.40     0     0     3     3
## 9  13.3     8 350.0   245  3.73 3.840 15.41     0     0     3     4

sample_n(by_cyl, 10, replace = TRUE)

## Source: local data frame [30 x 11]
## Groups: cyl [3]
## 
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##    (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1   30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
## 2   21.5     4 120.1    97  3.70 2.465 20.01     1     0     3     1
## 3   30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
## 4   33.9     4  71.1    65  4.22 1.835 19.90     1     1     4     1
## 5   24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 6   21.4     4 121.0   109  4.11 2.780 18.60     1     1     4     2
## 7   22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 8   32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1
## 9   26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2
## 10  26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2
## ..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...

sample_n(by_cyl, 3, weight = mpg/mean(mpg))

## Source: local data frame [9 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 2  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
## 3  26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2
## 4  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
## 5  19.7     6 145.0   175  3.62 2.770 15.50     0     1     5     6
## 6  18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
## 7  18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
## 8  19.2     8 400.0   175  3.08 3.845 17.05     0     0     3     2
## 9  14.7     8 440.0   230  3.23 5.345 17.42     0     0     3     4

# Sample fixed fraction per group Default is to sample all data = randomly
# resample rows sample_frac(mtcars)
sample_frac(mtcars, 0.1)

##                    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## AMC Javelin       15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Merc 230          22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Chrysler Imperial 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4

# sample_frac(mtcars, 1.5, replace = TRUE)
sample_frac(mtcars, 0.1, weight = 1/mpg)

##                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Toyota Corolla     33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Chrysler Imperial  14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4

sample_frac(by_cyl, 0.2)

## Source: local data frame [6 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  21.4     4 121.0   109  4.11 2.780 18.60     1     1     4     2
## 2  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
## 3  19.7     6 145.0   175  3.62 2.770 15.50     0     1     5     6
## 4  14.7     8 440.0   230  3.23 5.345 17.42     0     0     3     4
## 5  10.4     8 472.0   205  2.93 5.250 17.98     0     0     3     4
## 6  15.0     8 301.0   335  3.54 3.570 14.60     0     1     5     8

# sample_frac(by_cyl, 1, replace = TRUE)

1.12 select(): 选择变量

starts_with(x, ignore.case = TRUE): names starts with x
ends_with(x, ignore.case = TRUE): names ends in x
contains(x, ignore.case = TRUE): selects all variables whose name contains x
matches(x, ignore.case = TRUE): selects all variables whose name matches the regular expression x - num_range(“x”, 1:5, width = 2): selects all variables (numerically) from x01 to x05.
one_of(“x”, “y”, “z”): selects variables provided in a character vector.
everything(): selects all variables.

iris <- tbl_df(iris[1:5, ])  # so it prints a little nicer 
select(iris, starts_with("Petal"))

## Source: local data frame [5 x 2]
## 
##   Petal.Length Petal.Width
##          <dbl>       <dbl>
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2

select(iris, ends_with("Width"))

## Source: local data frame [5 x 2]
## 
##   Sepal.Width Petal.Width
##         <dbl>       <dbl>
## 1         3.5         0.2
## 2         3.0         0.2
## 3         3.2         0.2
## 4         3.1         0.2
## 5         3.6         0.2

select(iris, contains("etal"))

## Source: local data frame [5 x 2]
## 
##   Petal.Length Petal.Width
##          <dbl>       <dbl>
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2

select(iris, matches(".t."))

## Source: local data frame [5 x 4]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2

select(iris, Petal.Length, Petal.Width)

## Source: local data frame [5 x 2]
## 
##   Petal.Length Petal.Width
##          <dbl>       <dbl>
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2

vars <- c("Petal.Length", "Petal.Width")
select(iris, one_of(vars))

## Source: local data frame [5 x 2]
## 
##   Petal.Length Petal.Width
##          <dbl>       <dbl>
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2
## 4          1.5         0.2
## 5          1.4         0.2

df <- as.data.frame(matrix(runif(60), nrow = 6))
df <- tbl_df(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])
select(df, V8:V6)

## Source: local data frame [6 x 4]
## 
##           V8           V5        V2        V6
##        <dbl>        <dbl>     <dbl>     <dbl>
## 1 0.50784733 0.3705222928 0.5708563 0.2725626
## 2 0.42939419 0.4695904597 0.6392353 0.3995365
## 3 0.93737482 0.0724870036 0.4107849 0.6763092
## 4 0.11792486 0.9691334283 0.9426179 0.8371149
## 5 0.05031189 0.0007885925 0.1192577 0.2032340
## 6 0.15951781 0.3957703849 0.4203731 0.2628636

select(df, num_range("V", 4:6))

## Source: local data frame [6 x 3]
## 
##           V4           V5        V6
##        <dbl>        <dbl>     <dbl>
## 1 0.97624757 0.3705222928 0.2725626
## 2 0.08484239 0.4695904597 0.3995365
## 3 0.58344759 0.0724870036 0.6763092
## 4 0.21230545 0.9691334283 0.8371149
## 5 0.15639510 0.0007885925 0.2032340
## 6 0.80585008 0.3957703849 0.2628636

# Drop variables
select(iris, -starts_with("Petal"))

## Source: local data frame [5 x 3]
## 
##   Sepal.Length Sepal.Width Species
##          <dbl>       <dbl>  <fctr>
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa

select(iris, -ends_with("Width"))

## Source: local data frame [5 x 3]
## 
##   Sepal.Length Petal.Length Species
##          <dbl>        <dbl>  <fctr>
## 1          5.1          1.4  setosa
## 2          4.9          1.4  setosa
## 3          4.7          1.3  setosa
## 4          4.6          1.5  setosa
## 5          5.0          1.4  setosa

select(iris, -contains("etal"))

## Source: local data frame [5 x 3]
## 
##   Sepal.Length Sepal.Width Species
##          <dbl>       <dbl>  <fctr>
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa

select(iris, -matches(".t."))

## Source: local data frame [5 x 1]
## 
##   Species
##    <fctr>
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa
## 5  setosa

select(iris, -Petal.Length, -Petal.Width)

## Source: local data frame [5 x 3]
## 
##   Sepal.Length Sepal.Width Species
##          <dbl>       <dbl>  <fctr>
## 1          5.1         3.5  setosa
## 2          4.9         3.0  setosa
## 3          4.7         3.2  setosa
## 4          4.6         3.1  setosa
## 5          5.0         3.6  setosa

# Rename variables: * select() keeps only the variables you specify
select(iris, petal_length = Petal.Length)

## Source: local data frame [5 x 1]
## 
##   petal_length
##          <dbl>
## 1          1.4
## 2          1.4
## 3          1.3
## 4          1.5
## 5          1.4

# Renaming multiple variables uses a prefix:
select(iris, petal = starts_with("Petal"))

## Source: local data frame [5 x 2]
## 
##   petal1 petal2
##    <dbl>  <dbl>
## 1    1.4    0.2
## 2    1.4    0.2
## 3    1.3    0.2
## 4    1.5    0.2
## 5    1.4    0.2

# Reorder variables: keep the variable 'Species' in the front
select(iris, Species, everything())

## Source: local data frame [5 x 5]
## 
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
##    <fctr>        <dbl>       <dbl>        <dbl>       <dbl>
## 1  setosa          5.1         3.5          1.4         0.2
## 2  setosa          4.9         3.0          1.4         0.2
## 3  setosa          4.7         3.2          1.3         0.2
## 4  setosa          4.6         3.1          1.5         0.2
## 5  setosa          5.0         3.6          1.4         0.2

# * rename() keeps all variables
rename(iris, petal_length = Petal.Length)

## Source: local data frame [5 x 5]
## 
##   Sepal.Length Sepal.Width petal_length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl>  <fctr>
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa

1.13 setops: 集合操作

intersect(x, y, …) 交集
union(x, y, …) 并集
setdiff(x, y, …) 属于x且不属于y的集合
setequal(x, y, …) 判断2个集合是否相同

first <- 1:5
second <- 3:8
intersect(first, second)

## [1] 3 4 5

union(first, second)

## [1] 1 2 3 4 5 6 7 8

setdiff(first, second)

## [1] 1 2

setdiff(second, first)

## [1] 6 7 8

setequal(first, second)

## [1] FALSE

setequal(1:5, 5:1)

## [1] TRUE

1.14 summarise(): 总结汇总

# install.packages('data.table')
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, last

summarise(mtcars, mean(disp))

##   mean(disp)
## 1   230.7219

summarise(group_by(mtcars, cyl), mean(disp))

## Source: local data frame [3 x 2]
## 
##     cyl mean(disp)
##   <dbl>      <dbl>
## 1     4   105.1364
## 2     6   183.3143
## 3     8   353.1000

summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))

## Source: local data frame [3 x 3]
## 
##     cyl        m       sd
##   <dbl>    <dbl>    <dbl>
## 1     4 105.1364 26.87159
## 2     6 183.3143 41.56246
## 3     8 353.1000 67.77132

# With data frames, you can create and immediately use summaries
by_cyl <- mtcars %>% group_by(cyl)
by_cyl %>% summarise(a = n(), b = a + 1)

## Source: local data frame [3 x 3]
## 
##     cyl     a     b
##   <dbl> <int> <dbl>
## 1     4    11    12
## 2     6     7     8
## 3     8    14    15

1.15 summarise_each():

# One function
by_species <- iris %>% group_by(Species)
by_species %>% summarise_each(funs(length))

## Source: local data frame [1 x 5]
## 
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
##    <fctr>        <int>       <int>        <int>       <int>
## 1  setosa            5           5            5           5

by_species %>% summarise_each(funs(mean))

## Source: local data frame [1 x 5]
## 
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
##    <fctr>        <dbl>       <dbl>        <dbl>       <dbl>
## 1  setosa         4.86        3.28          1.4         0.2

by_species %>% summarise_each(funs(mean), Petal.Width)

## Source: local data frame [1 x 2]
## 
##   Species Petal.Width
##    <fctr>       <dbl>
## 1  setosa         0.2

by_species %>% summarise_each(funs(mean), matches("Width"))

## Source: local data frame [1 x 3]
## 
##   Species Sepal.Width Petal.Width
##    <fctr>       <dbl>       <dbl>
## 1  setosa        3.28         0.2

by_species %>% mutate_each(funs(half = ./2))

## Source: local data frame [5 x 5]
## Groups: Species [1]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          (dbl)       (dbl)        (dbl)       (dbl)  (fctr)
## 1         2.55        1.75         0.70         0.1  setosa
## 2         2.45        1.50         0.70         0.1  setosa
## 3         2.35        1.60         0.65         0.1  setosa
## 4         2.30        1.55         0.75         0.1  setosa
## 5         2.50        1.80         0.70         0.1  setosa

by_species %>% mutate_each(funs(min_rank))

## Source: local data frame [5 x 5]
## Groups: Species [1]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          (int)       (int)        (int)       (int)  (fctr)
## 1            5           4            2           1  setosa
## 2            3           1            2           1  setosa
## 3            2           3            1           1  setosa
## 4            1           2            5           1  setosa
## 5            4           5            2           1  setosa

# Two functions
by_species %>% summarise_each(funs(min, max))

## Source: local data frame [1 x 9]
## 
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min
##    <fctr>            <dbl>           <dbl>            <dbl>
## 1  setosa              4.6               3              1.3
## Variables not shown: Petal.Width_min <dbl>, Sepal.Length_max <dbl>,
##   Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>.

by_species %>% summarise_each(funs(min, max), Petal.Width, Sepal.Width)

## Source: local data frame [1 x 5]
## 
##   Species Petal.Width_min Sepal.Width_min Petal.Width_max Sepal.Width_max
##    <fctr>           <dbl>           <dbl>           <dbl>           <dbl>
## 1  setosa             0.2               3             0.2             3.6

by_species %>% summarise_each(funs(min, max), matches("Width"))

## Source: local data frame [1 x 5]
## 
##   Species Sepal.Width_min Petal.Width_min Sepal.Width_max Petal.Width_max
##    <fctr>           <dbl>           <dbl>           <dbl>           <dbl>
## 1  setosa               3             0.2             3.6             0.2

# Alternative function specification
iris %>% summarise_each(funs(ul = length(unique(.))))

## Source: local data frame [1 x 5]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <int>       <int>        <int>       <int>   <int>
## 1            5           5            3           1       1

by_species %>% summarise_each(funs(ul = length(unique(.))))

## Source: local data frame [1 x 5]
## 
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
##    <fctr>        <int>       <int>        <int>       <int>
## 1  setosa            5           5            3           1

by_species %>% summarise_each(c("min", "max"))

## Source: local data frame [1 x 9]
## 
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min
##    <fctr>            <dbl>           <dbl>            <dbl>
## 1  setosa              4.6               3              1.3
## Variables not shown: Petal.Width_min <dbl>, Sepal.Length_max <dbl>,
##   Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>.

# Alternative variable specification
summarise_each_(iris, funs(max), names(iris)[-5])

## Source: local data frame [1 x 4]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1          5.1         3.6          1.5         0.2

summarise_each_(iris, funs(max), list(quote(-Species)))

## Source: local data frame [1 x 4]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1          5.1         3.6          1.5         0.2

1.16 top_n (): 选择top n 行

top_n(x, n, wt)

gb_cyl <- group_by(mtcars_df, cyl)
top_n(gb_cyl, 2, desc(disp))

## Source: local data frame [8 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
## 2  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
## 3  16.4     8 275.8   180  3.07 4.070 17.40     0     0     3     3
## 4  17.3     8 275.8   180  3.07 3.730 17.60     0     0     3     3
## 5  15.2     8 275.8   180  3.07 3.780 18.00     0     0     3     3
## 6  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
## 7  33.9     4  71.1    65  4.22 1.835 19.90     1     1     4     1
## 8  19.7     6 145.0   175  3.62 2.770 15.50     0     1     5     6

top_n(gb_cyl, 2, disp)

## Source: local data frame [6 x 11]
## Groups: cyl [3]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
## 2  18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
## 3  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
## 4  22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2
## 5  10.4     8 472.0   205  2.93 5.250 17.98     0     0     3     4
## 6  10.4     8 460.0   215  3.00 5.424 17.82     0     0     3     4

1.17 窗口函数:

row_number(x)
ntile(x, n)
min_rank(x)
dense_rank(x)
percent_rank(x)
cume_dist(x)

x <- c(5, 1, 3, 2, 2, NA)
row_number(x)

## [1]  5  1  4  2  3 NA

min_rank(x)

## [1]  5  1  4  2  2 NA

dense_rank(x)

## [1]  4  1  3  2  2 NA

percent_rank(x)

## [1] 1.00 0.00 0.75 0.25 0.25   NA

cume_dist(x)

## [1] 1.0 0.2 0.8 0.6 0.6  NA

ntile(x, 2)

## [1]  2  1  2  1  1 NA

ntile(runif(100), 10)

##   [1]  9  5  4  5  5  9  6  9  8  7  3  2  7 10  7  8  9 10  3  2  9  5  4
##  [24]  8  6  5  6  2  6  4  8  4  9  7  2  1  2  2  2  3  8  9  6  1 10  8
##  [47]  5 10  3  9  4  6  1 10 10 10  6  3  1 10  1  8  3  1 10  8  5  7  8
##  [70]  1  2  2  4  4  6  7  8  1  4  4  5  6  5 10  1  4  7  6  5  3  9  7
##  [93]  2  3  3  7  7  1  3  9

二、tidyr包的相关函数

2.1 complete():

complete(data, …, fill = list())

library(dplyr)
library(tidyr)
df <- data_frame(group = c(1:2, 1), item_id = c(1:2, 2), item_name = c("a", 
    "b", "b"), value1 = 1:3, value2 = 4:6)
df

## Source: local data frame [3 x 5]
## 
##   group item_id item_name value1 value2
##   <dbl>   <dbl>     <chr>  <int>  <int>
## 1     1       1         a      1      4
## 2     2       2         b      2      5
## 3     1       2         b      3      6

df %>% complete(group, nesting(item_id, item_name))

## Source: local data frame [4 x 5]
## 
##   group item_id item_name value1 value2
##   <dbl>   <dbl>     <chr>  <int>  <int>
## 1     1       1         a      1      4
## 2     1       2         b      3      6
## 3     2       1         a     NA     NA
## 4     2       2         b      2      5

# You can also choose to fill in missing values
df %>% complete(group, nesting(item_id, item_name), fill = list(value1 = 0, 
    value2 = 99))

## Source: local data frame [4 x 5]
## 
##   group item_id item_name value1 value2
##   <dbl>   <dbl>     <chr>  <dbl>  <dbl>
## 1     1       1         a      1      4
## 2     1       2         b      3      6
## 3     2       1         a      0     99
## 4     2       2         b      2      5

2.2 expand(): 扩展成所有值组成的组合

expand(data, …)
crossing(…)
nesting(…)

df <- data_frame(year = c(2010, 2010, 2010, 2010, 2012, 2012, 2012), qtr = c(1, 
    2, 3, 4, 1, 2, 3), return = rnorm(7))
df

## Source: local data frame [7 x 3]
## 
##    year   qtr     return
##   <dbl> <dbl>      <dbl>
## 1  2010     1  0.4171819
## 2  2010     2 -2.1309442
## 3  2010     3 -0.7005132
## 4  2010     4  0.1543173
## 5  2012     1 -0.3485685
## 6  2012     2  0.3815484
## 7  2012     3  0.7407655

df %>% expand(year, qtr)

## Source: local data frame [8 x 2]
## 
##    year   qtr
##   <dbl> <dbl>
## 1  2010     1
## 2  2010     2
## 3  2010     3
## 4  2010     4
## 5  2012     1
## 6  2012     2
## 7  2012     3
## 8  2012     4

df %>% expand(year = 2010:2012, qtr)

## Source: local data frame [12 x 2]
## 
##     year   qtr
##    <int> <dbl>
## 1   2010     1
## 2   2010     2
## 3   2010     3
## 4   2010     4
## 5   2011     1
## 6   2011     2
## 7   2011     3
## 8   2011     4
## 9   2012     1
## 10  2012     2
## 11  2012     3
## 12  2012     4

df %>% expand(year = full_seq(year, 1), qtr)

## Source: local data frame [12 x 2]
## 
##     year   qtr
##    <dbl> <dbl>
## 1   2010     1
## 2   2010     2
## 3   2010     3
## 4   2010     4
## 5   2011     1
## 6   2011     2
## 7   2011     3
## 8   2011     4
## 9   2012     1
## 10  2012     2
## 11  2012     3
## 12  2012     4

df %>% complete(year = full_seq(year, 1), qtr)

## Source: local data frame [12 x 3]
## 
##     year   qtr     return
##    <dbl> <dbl>      <dbl>
## 1   2010     1  0.4171819
## 2   2010     2 -2.1309442
## 3   2010     3 -0.7005132
## 4   2010     4  0.1543173
## 5   2011     1         NA
## 6   2011     2         NA
## 7   2011     3         NA
## 8   2011     4         NA
## 9   2012     1 -0.3485685
## 10  2012     2  0.3815484
## 11  2012     3  0.7407655
## 12  2012     4         NA

2.3 extract(): 将一列分裂成多列

extract(data, col, into, regex = “([[:alnum:]]+)”, remove = TRUE, convert = FALSE, …)

df <- data.frame(x = c(NA, "a-b", "a-d", "b-c", "d-e"))
df

##      x
## 1 <NA>
## 2  a-b
## 3  a-d
## 4  b-c
## 5  d-e

df %>% extract(x, "A")

##      A
## 1 <NA>
## 2    a
## 3    a
## 4    b
## 5    d

df %>% extract(x, c("A", "B"), "([[:alnum:]]+)-([[:alnum:]]+)")

##      A    B
## 1 <NA> <NA>
## 2    a    b
## 3    a    d
## 4    b    c
## 5    d    e

# If no match, NA:
df %>% extract(x, c("A", "B"), "([a-d]+)-([a-d]+)")

##      A    B
## 1 <NA> <NA>
## 2    a    b
## 3    a    d
## 4    b    c
## 5 <NA> <NA>

2.4 extract_numeric(): 提取数字

extract_numeric("$1,200.34")

## [1] 1200.34

extract_numeric("-2%")

## [1] -2

extract_numeric("-2-2")

## [1] NA

extract_numeric("12abc34")

## [1] 1234

2.5 fill(): 使用缺失值前一个值向下或后一个值向上填充缺失值

fill(data, …, .direction = c(“down”, “up”))

df <- data.frame(Month = 1:12, Year = c(2000, rep(NA, 4), 2005, rep(NA, 3), 
    2006, NA, 2010))
df

##    Month Year
## 1      1 2000
## 2      2   NA
## 3      3   NA
## 4      4   NA
## 5      5   NA
## 6      6 2005
## 7      7   NA
## 8      8   NA
## 9      9   NA
## 10    10 2006
## 11    11   NA
## 12    12 2010

df %>% fill(Year, .direction = "down")

##    Month Year
## 1      1 2000
## 2      2 2000
## 3      3 2000
## 4      4 2000
## 5      5 2000
## 6      6 2005
## 7      7 2005
## 8      8 2005
## 9      9 2005
## 10    10 2006
## 11    11 2006
## 12    12 2010

df %>% fill(Year, .direction = "up")

##    Month Year
## 1      1 2000
## 2      2 2005
## 3      3 2005
## 4      4 2005
## 5      5 2005
## 6      6 2005
## 7      7 2006
## 8      8 2006
## 9      9 2006
## 10    10 2006
## 11    11 2010
## 12    12 2010

2.6 full_seq(): 创建完整序列

full_seq(x, period, tol = 1e-06)

full_seq(c(1, 2, 4, 5, 10), 1)

##  [1]  1  2  3  4  5  6  7  8  9 10

full_seq(c(2, 4, 10), 2)

## [1]  2  4  6  8 10

2.7 gather(): 宽数据转换成长数据

gather(data, key, value, …, na.rm = FALSE, convert = FALSE, factor_key = FALSE)

stocks <- data_frame(time = as.Date("2009-01-01") + 0:3, X = rnorm(4, 0, 1), 
    Y = rnorm(4, 0, 2), Z = rnorm(4, 0, 4))
stocks

## Source: local data frame [4 x 4]
## 
##         time          X         Y         Z
##       <date>      <dbl>     <dbl>     <dbl>
## 1 2009-01-01 -2.9642095  0.493950 10.318943
## 2 2009-01-02 -0.7862788 -3.017822  2.459886
## 3 2009-01-03  1.6122230 -2.072584 -3.610145
## 4 2009-01-04 -1.1993662 -4.241418  3.852937

gather(data = stocks, key = stock, value = price, -time)

## Source: local data frame [12 x 3]
## 
##          time stock      price
##        <date> <chr>      <dbl>
## 1  2009-01-01     X -2.9642095
## 2  2009-01-02     X -0.7862788
## 3  2009-01-03     X  1.6122230
## 4  2009-01-04     X -1.1993662
## 5  2009-01-01     Y  0.4939500
## 6  2009-01-02     Y -3.0178216
## 7  2009-01-03     Y -2.0725837
## 8  2009-01-04     Y -4.2414183
## 9  2009-01-01     Z 10.3189432
## 10 2009-01-02     Z  2.4598863
## 11 2009-01-03     Z -3.6101455
## 12 2009-01-04     Z  3.8529366

mini_iris <- iris[c(1, 51, 101), ]
mini_iris

## Source: local data frame [3 x 5]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl>  <fctr>
## 1          5.1         3.5          1.4         0.2  setosa
## 2           NA          NA           NA          NA      NA
## 3           NA          NA           NA          NA      NA

gather(mini_iris, key = flower_att, value = measurement, Sepal.Length, Sepal.Width, 
    Petal.Length, Petal.Width)

## Source: local data frame [12 x 3]
## 
##    Species   flower_att measurement
##     <fctr>        <chr>       <dbl>
## 1   setosa Sepal.Length         5.1
## 2       NA Sepal.Length          NA
## 3       NA Sepal.Length          NA
## 4   setosa  Sepal.Width         3.5
## 5       NA  Sepal.Width          NA
## 6       NA  Sepal.Width          NA
## 7   setosa Petal.Length         1.4
## 8       NA Petal.Length          NA
## 9       NA Petal.Length          NA
## 10  setosa  Petal.Width         0.2
## 11      NA  Petal.Width          NA
## 12      NA  Petal.Width          NA

gather(mini_iris, key = flower_att, value = measurement, -Species)

## Source: local data frame [12 x 3]
## 
##    Species   flower_att measurement
##     <fctr>        <chr>       <dbl>
## 1   setosa Sepal.Length         5.1
## 2       NA Sepal.Length          NA
## 3       NA Sepal.Length          NA
## 4   setosa  Sepal.Width         3.5
## 5       NA  Sepal.Width          NA
## 6       NA  Sepal.Width          NA
## 7   setosa Petal.Length         1.4
## 8       NA Petal.Length          NA
## 9       NA Petal.Length          NA
## 10  setosa  Petal.Width         0.2
## 11      NA  Petal.Width          NA
## 12      NA  Petal.Width          NA

mini_iris <- iris %>% group_by(Species) %>% slice(1)
mini_iris

## Source: local data frame [1 x 5]
## Groups: Species [1]
## 
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          (dbl)       (dbl)        (dbl)       (dbl)  (fctr)
## 1          5.1         3.5          1.4         0.2  setosa

mini_iris %>% gather(key = flower_att, value = measurement, -Species)

## Source: local data frame [4 x 3]
## Groups: Species [1]
## 
##   Species   flower_att measurement
##    (fctr)        (chr)       (dbl)
## 1  setosa Sepal.Length         5.1
## 2  setosa  Sepal.Width         3.5
## 3  setosa Petal.Length         1.4
## 4  setosa  Petal.Width         0.2

2.8 nest():

nest(data, …, .key = data)

iris %>% nest(-Species)

## Source: local data frame [1 x 2]
## 
##   Species           data
##    <fctr>         <list>
## 1  setosa <tbl_df [5,4]>

chickwts %>% nest(weight)

## Source: local data frame [6 x 2]
## 
##        feed            data
##      <fctr>          <list>
## 1 horsebean <tbl_df [10,1]>
## 2   linseed <tbl_df [12,1]>
## 3   soybean <tbl_df [14,1]>
## 4 sunflower <tbl_df [12,1]>
## 5  meatmeal <tbl_df [11,1]>
## 6    casein <tbl_df [12,1]>

2.9 replace_na(): 替换缺失值

replace_na(data, replace = list(), …)

library(dplyr)
df <- data_frame(x = c(1, 2, NA), y = c("a", NA, "b"))
df %>% replace_na(list(x = 0, y = "unknown"))

## Source: local data frame [3 x 2]
## 
##       x       y
##   <dbl>   <chr>
## 1     1       a
## 2     2 unknown
## 3     0       b

2.10 separate(): 将一列转换成多列

separate(data, col, into, sep = “[^[:alnum:]]+”, remove = TRUE, convert = FALSE, extra = “warn”, fill = “warn”, …)

library(dplyr)
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df %>% separate(x, c("A", "B"))

##      A    B
## 1 <NA> <NA>
## 2    a    b
## 3    a    d
## 4    b    c

# If every row doesn't split into the same number of pieces, use the extra
# and file arguments to control what happens
df <- data.frame(x = c("a", "a b", "a b c", NA))
df %>% separate(x, c("a", "b"))

##      a    b
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>

# The same behaviour but no warnings
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")

##      a    b
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>

# Another option:
df %>% separate(x, c("a", "b"), extra = "merge", fill = "left")

##      a    b
## 1 <NA>    a
## 2    a    b
## 3    a  b c
## 4 <NA> <NA>

# If only want to split specified number of times use extra = 'merge'
df <- data.frame(x = c("x: 123", "y: error: 7"))
df %>% separate(x, c("key", "value"), ": ", extra = "merge")

##   key    value
## 1   x      123
## 2   y error: 7

2.11 spread(): 将长数据转换成宽数据

spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)

library(dplyr)
stocks <- data.frame(time = as.Date("2009-01-01") + 0:9, X = rnorm(10, 0, 1), 
    Y = rnorm(10, 0, 2), Z = rnorm(10, 0, 4))
stocksm <- stocks %>% gather(stock, price, -time)
stocksm

##          time stock       price
## 1  2009-01-01     X -0.52189978
## 2  2009-01-02     X  0.80832212
## 3  2009-01-03     X  1.53476105
## 4  2009-01-04     X  1.40697005
## 5  2009-01-05     X -1.26666165
## 6  2009-01-06     X -0.66619804
## 7  2009-01-07     X -0.27924740
## 8  2009-01-08     X  1.48666964
## 9  2009-01-09     X  1.69425614
## 10 2009-01-10     X -1.35129012
## 11 2009-01-01     Y -1.92650802
## 12 2009-01-02     Y -1.04971015
## 13 2009-01-03     Y -0.06182085
## 14 2009-01-04     Y  0.47333560
## 15 2009-01-05     Y -2.79939690
## 16 2009-01-06     Y -5.98040102
## 17 2009-01-07     Y  0.84574146
## 18 2009-01-08     Y -1.33068578
## 19 2009-01-09     Y  0.35614342
## 20 2009-01-10     Y -2.83740229
## 21 2009-01-01     Z -5.27641988
## 22 2009-01-02     Z  3.39574345
## 23 2009-01-03     Z  4.30022453
## 24 2009-01-04     Z -0.39124420
## 25 2009-01-05     Z -3.19956039
## 26 2009-01-06     Z -2.83736630
## 27 2009-01-07     Z -2.80122389
## 28 2009-01-08     Z -3.04052729
## 29 2009-01-09     Z  1.71237941
## 30 2009-01-10     Z -0.08754712

stocksm %>% spread(stock, price)

##          time          X           Y           Z
## 1  2009-01-01 -0.5218998 -1.92650802 -5.27641988
## 2  2009-01-02  0.8083221 -1.04971015  3.39574345
## 3  2009-01-03  1.5347610 -0.06182085  4.30022453
## 4  2009-01-04  1.4069700  0.47333560 -0.39124420
## 5  2009-01-05 -1.2666617 -2.79939690 -3.19956039
## 6  2009-01-06 -0.6661980 -5.98040102 -2.83736630
## 7  2009-01-07 -0.2792474  0.84574146 -2.80122389
## 8  2009-01-08  1.4866696 -1.33068578 -3.04052729
## 9  2009-01-09  1.6942561  0.35614342  1.71237941
## 10 2009-01-10 -1.3512901 -2.83740229 -0.08754712

stocksm %>% spread(time, price)

##   stock 2009-01-01 2009-01-02  2009-01-03 2009-01-04 2009-01-05 2009-01-06
## 1     X -0.5218998  0.8083221  1.53476105  1.4069700  -1.266662  -0.666198
## 2     Y -1.9265080 -1.0497102 -0.06182085  0.4733356  -2.799397  -5.980401
## 3     Z -5.2764199  3.3957434  4.30022453 -0.3912442  -3.199560  -2.837366
##   2009-01-07 2009-01-08 2009-01-09  2009-01-10
## 1 -0.2792474   1.486670  1.6942561 -1.35129012
## 2  0.8457415  -1.330686  0.3561434 -2.83740229
## 3 -2.8012239  -3.040527  1.7123794 -0.08754712

# Spread and gather are complements
df <- data.frame(x = c("a", "b"), y = c(3, 4), z = c(5, 6))
df

##   x y z
## 1 a 3 5
## 2 b 4 6

df %>% spread(x, y) %>% gather(x, y, a:b, na.rm = TRUE)

##   z x y
## 1 5 a 3
## 4 6 b 4

# Use 'convert = TRUE' to produce variables of mixed type
df <- data.frame(row = rep(c(1, 51), each = 3), var = c("Sepal.Length", "Species", 
    "Species_num"), value = c(5.1, "setosa", 1, 7, "versicolor", 2))
df %>% spread(var, value) %>% str

## 'data.frame':    2 obs. of  4 variables:
##  $ row         : num  1 51
##  $ Sepal.Length: Factor w/ 6 levels "1","2","5.1",..: 3 4
##  $ Species     : Factor w/ 6 levels "1","2","5.1",..: 5 6
##  $ Species_num : Factor w/ 6 levels "1","2","5.1",..: 1 2

df %>% spread(var, value, convert = TRUE) %>% str

## 'data.frame':    2 obs. of  4 variables:
##  $ row         : num  1 51
##  $ Sepal.Length: num  5.1 7
##  $ Species     : chr  "setosa" "versicolor"
##  $ Species_num : int  1 2

dplyr

囧囧有神

2016年6月19日

安装、加载包

一、dplyr包相关函数

1.1 add_rownames(): 将行名转为显式变量

1.2 all.equal.tbl_df(): 比较两个数据框是否相同

S3 method for class ‘tbl_df’

S3 method for class ‘tbl_dt’

1.3 arrange(): 对数据框按变量进行排序

1.4 between(): 判断数据是否在某个数值区间中，返回逻辑值

1.5 do(): 对tbl数据进行某种操作

1.6 filter(): 按条件筛选行（样本）

1.7 group_by(): 将数据框按变量进行分组

1.8 mutate(): 在数据框中增加新变量

1.9 order_by(): 对变量进行排序后，再应用函数

1.10 rowwise(): 按行输入

1.11 sample(): 抽样

1.12 select(): 选择变量

1.13 setops: 集合操作

1.14 summarise(): 总结汇总

1.15 summarise_each():

1.16 top_n (): 选择top n 行

1.17 窗口函数:

二、tidyr包的相关函数

2.1 complete():

2.2 expand(): 扩展成所有值组成的组合

2.3 extract(): 将一列分裂成多列

2.4 extract_numeric(): 提取数字

2.5 fill(): 使用缺失值前一个值向下或后一个值向上填充缺失值

2.6 full_seq(): 创建完整序列

2.7 gather(): 宽数据转换成长数据

2.8 nest():

2.9 replace_na(): 替换缺失值

2.10 separate(): 将一列转换成多列

2.11 spread(): 将长数据转换成宽数据

1.12 select(): 选择变量