Data Transform using dplyr

R webinar ’The Grammar and Graphics of Data Science’편에서 dplyr demo를 정리한 문서이다. 해당 문서의 소스는 아래와 같다.

origin source : R webinar demo

Single table verbs

library(nycflights13)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

tbl_df

flights
## Source: local data frame [336,776 x 16]
## 
##    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1  2013     1   1      517         2      830        11      UA  N14228
## 2  2013     1   1      533         4      850        20      UA  N24211
## 3  2013     1   1      542         2      923        33      AA  N619AA
## 4  2013     1   1      544        -1     1004       -18      B6  N804JB
## 5  2013     1   1      554        -6      812       -25      DL  N668DN
## 6  2013     1   1      554        -4      740        12      UA  N39463
## 7  2013     1   1      555        -5      913        19      B6  N516JB
## 8  2013     1   1      557        -3      709       -14      EV  N829AS
## 9  2013     1   1      557        -3      838        -8      B6  N593JB
## 10 2013     1   1      558        -2      753         8      AA  N3ALAA
## ..  ...   ... ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

filter

keep rows matching criteria

filter(flights, dest == "IAH")
## Source: local data frame [7,198 x 16]
## 
##    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1  2013     1   1      517         2      830        11      UA  N14228
## 2  2013     1   1      533         4      850        20      UA  N24211
## 3  2013     1   1      623        -4      933         1      UA  N459UA
## 4  2013     1   1      728        -4     1041         3      UA  N488UA
## 5  2013     1   1      739         0     1104        26      UA  N37408
## 6  2013     1   1      908         0     1228         9      UA  N12216
## 7  2013     1   1     1028         2     1350        11      UA  N76508
## 8  2013     1   1     1044        -1     1352         1      UA  N667UA
## 9  2013     1   1     1114       134     1447       145      UA  N76502
## 10 2013     1   1     1205         5     1503        -2      UA  N39418
## ..  ...   ... ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

select

pick columns by name

select(flights, starts_with("arr"))
## Source: local data frame [336,776 x 2]
## 
##    arr_time arr_delay
## 1       830        11
## 2       850        20
## 3       923        33
## 4      1004       -18
## 5       812       -25
## 6       740        12
## 7       913        19
## 8       709       -14
## 9       838        -8
## 10      753         8
## ..      ...       ...

arrange

reorder rows

arrange(flights, desc(arr_delay))
## Source: local data frame [336,776 x 16]
## 
##    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1  2013     1   9      641      1301     1242      1272      HA  N384HA
## 2  2013     6  15     1432      1137     1607      1127      MQ  N504MQ
## 3  2013     1  10     1121      1126     1239      1109      MQ  N517MQ
## 4  2013     9  20     1139      1014     1457      1007      AA  N338AA
## 5  2013     7  22      845      1005     1044       989      MQ  N665MQ
## 6  2013     4  10     1100       960     1342       931      DL  N959DL
## 7  2013     3  17     2321       911      135       915      DL  N927DA
## 8  2013     7  22     2257       898      121       895      DL  N6716C
## 9  2013    12   5      756       896     1058       878      AA  N5DMAA
## 10 2013     5   3     1133       878     1250       875      MQ  N523MQ
## ..  ...   ... ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

mutate

add new variables

mutate(flights, speed = distance / air_time * 60)
## Source: local data frame [336,776 x 17]
## 
##    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1  2013     1   1      517         2      830        11      UA  N14228
## 2  2013     1   1      533         4      850        20      UA  N24211
## 3  2013     1   1      542         2      923        33      AA  N619AA
## 4  2013     1   1      544        -1     1004       -18      B6  N804JB
## 5  2013     1   1      554        -6      812       -25      DL  N668DN
## 6  2013     1   1      554        -4      740        12      UA  N39463
## 7  2013     1   1      555        -5      913        19      B6  N516JB
## 8  2013     1   1      557        -3      709       -14      EV  N829AS
## 9  2013     1   1      557        -3      838        -8      B6  N593JB
## 10 2013     1   1      558        -2      753         8      AA  N3ALAA
## ..  ...   ... ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl), speed (dbl)

summarise

reduce variables to values

by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## Source: local data frame [365 x 4]
## Groups: year, month
## 
##    year month day     delay
## 1  2013     1   1 11.548926
## 2  2013     1   2 13.858824
## 3  2013     1   3 10.987832
## 4  2013     1   4  8.951595
## 5  2013     1   5  5.732218
## 6  2013     1   6  7.148014
## 7  2013     1   7  5.417204
## 8  2013     1   8  2.553073
## 9  2013     1   9  2.276477
## 10 2013     1  10  2.844995
## ..  ...   ... ...       ...
summarise(by_day, count=n())
## Source: local data frame [365 x 4]
## Groups: year, month
## 
##    year month day count
## 1  2013     1   1   842
## 2  2013     1   2   943
## 3  2013     1   3   914
## 4  2013     1   4   915
## 5  2013     1   5   720
## 6  2013     1   6   832
## 7  2013     1   7   933
## 8  2013     1   8   899
## 9  2013     1   9   902
## 10 2013     1  10   932
## ..  ...   ... ...   ...

Multi-table verbs

semi_join(planes, flights)
## Joining by: c("tailnum", "year")
## Source: local data frame [92 x 9]
## 
##    tailnum year                    type   manufacturer           model
## 1   N37465 2013 Fixed wing multi engine         BOEING       737-924ER
## 2   N355JB 2013 Fixed wing multi engine        EMBRAER ERJ 190-100 IGW
## 3   N37471 2013 Fixed wing multi engine         BOEING       737-924ER
## 4   N292PQ 2013 Fixed wing multi engine BOMBARDIER INC     CL-600-2D24
## 5   N354JB 2013 Fixed wing multi engine        EMBRAER ERJ 190-100 IGW
## 6   N373JB 2013 Fixed wing multi engine        EMBRAER ERJ 190-100 IGW
## 7   N374JB 2013 Fixed wing multi engine        EMBRAER ERJ 190-100 IGW
## 8   N368JB 2013 Fixed wing multi engine        EMBRAER ERJ 190-100 IGW
## 9   N156UW 2013 Fixed wing multi engine         AIRBUS        A321-211
## 10  N157UW 2013 Fixed wing multi engine         AIRBUS        A321-211
## ..     ...  ...                     ...            ...             ...
## Variables not shown: engines (int), seats (int), speed (int), engine (chr)
anti_join(planes, flights)
## Joining by: c("tailnum", "year")
## Source: local data frame [3,230 x 9]
## 
##    tailnum year                    type                  manufacturer
## 1   N995DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 2   N993AT 2002 Fixed wing multi engine                        BOEING
## 3   N992AT 2002 Fixed wing multi engine                        BOEING
## 4   N991DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 5   N991AT   NA Fixed wing multi engine                        BOEING
## 6   N989DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 7   N989AT 2001 Fixed wing multi engine                        BOEING
## 8   N988DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 9   N987AT 2001 Fixed wing multi engine                        BOEING
## 10  N986DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## ..     ...  ...                     ...                           ...
## Variables not shown: model (chr), engines (int), seats (int), speed (int),
##   engine (chr)

Pipelines

x %>% f(y) == f(x, y)

hourly_delay <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(air_time, distance) %>%
  summarise( delay = mean(dep_delay), n = n() ) %>%
  filter(n > 10)
hourly_delay
## Source: local data frame [5,277 x 4]
## Groups: air_time
## 
##    air_time distance     delay   n
## 1        22      116  8.312500  32
## 2        23      116  7.391304  69
## 3        24       94 20.920000  25
## 4        24      116 18.320000  75
## 5        25       94 12.672727  55
## 6        25      116 14.714286  63
## 7        26       94 12.176471 102
## 8        26      116 27.274510  51
## 9        27       94 22.504854 103
## 10       27      116  8.869565  23
## ..      ...      ...       ... ...

Learn more

# Built-in vignettes
browseVignettes(package = "dplyr")