dplyrR webinar ’The Grammar and Graphics of Data Science’편에서 dplyr demo를 정리한 문서이다. 해당 문서의 소스는 아래와 같다.
origin source : R webinar demo
library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
flights
## Source: local data frame [336,776 x 16]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 542 2 923 33 AA N619AA
## 4 2013 1 1 544 -1 1004 -18 B6 N804JB
## 5 2013 1 1 554 -6 812 -25 DL N668DN
## 6 2013 1 1 554 -4 740 12 UA N39463
## 7 2013 1 1 555 -5 913 19 B6 N516JB
## 8 2013 1 1 557 -3 709 -14 EV N829AS
## 9 2013 1 1 557 -3 838 -8 B6 N593JB
## 10 2013 1 1 558 -2 753 8 AA N3ALAA
## .. ... ... ... ... ... ... ... ... ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
keep rows matching criteria
filter(flights, dest == "IAH")
## Source: local data frame [7,198 x 16]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 623 -4 933 1 UA N459UA
## 4 2013 1 1 728 -4 1041 3 UA N488UA
## 5 2013 1 1 739 0 1104 26 UA N37408
## 6 2013 1 1 908 0 1228 9 UA N12216
## 7 2013 1 1 1028 2 1350 11 UA N76508
## 8 2013 1 1 1044 -1 1352 1 UA N667UA
## 9 2013 1 1 1114 134 1447 145 UA N76502
## 10 2013 1 1 1205 5 1503 -2 UA N39418
## .. ... ... ... ... ... ... ... ... ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
pick columns by name
select(flights, starts_with("arr"))
## Source: local data frame [336,776 x 2]
##
## arr_time arr_delay
## 1 830 11
## 2 850 20
## 3 923 33
## 4 1004 -18
## 5 812 -25
## 6 740 12
## 7 913 19
## 8 709 -14
## 9 838 -8
## 10 753 8
## .. ... ...
reorder rows
arrange(flights, desc(arr_delay))
## Source: local data frame [336,776 x 16]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 9 641 1301 1242 1272 HA N384HA
## 2 2013 6 15 1432 1137 1607 1127 MQ N504MQ
## 3 2013 1 10 1121 1126 1239 1109 MQ N517MQ
## 4 2013 9 20 1139 1014 1457 1007 AA N338AA
## 5 2013 7 22 845 1005 1044 989 MQ N665MQ
## 6 2013 4 10 1100 960 1342 931 DL N959DL
## 7 2013 3 17 2321 911 135 915 DL N927DA
## 8 2013 7 22 2257 898 121 895 DL N6716C
## 9 2013 12 5 756 896 1058 878 AA N5DMAA
## 10 2013 5 3 1133 878 1250 875 MQ N523MQ
## .. ... ... ... ... ... ... ... ... ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
add new variables
mutate(flights, speed = distance / air_time * 60)
## Source: local data frame [336,776 x 17]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 542 2 923 33 AA N619AA
## 4 2013 1 1 544 -1 1004 -18 B6 N804JB
## 5 2013 1 1 554 -6 812 -25 DL N668DN
## 6 2013 1 1 554 -4 740 12 UA N39463
## 7 2013 1 1 555 -5 913 19 B6 N516JB
## 8 2013 1 1 557 -3 709 -14 EV N829AS
## 9 2013 1 1 557 -3 838 -8 B6 N593JB
## 10 2013 1 1 558 -2 753 8 AA N3ALAA
## .. ... ... ... ... ... ... ... ... ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl), speed (dbl)
reduce variables to values
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## Source: local data frame [365 x 4]
## Groups: year, month
##
## year month day delay
## 1 2013 1 1 11.548926
## 2 2013 1 2 13.858824
## 3 2013 1 3 10.987832
## 4 2013 1 4 8.951595
## 5 2013 1 5 5.732218
## 6 2013 1 6 7.148014
## 7 2013 1 7 5.417204
## 8 2013 1 8 2.553073
## 9 2013 1 9 2.276477
## 10 2013 1 10 2.844995
## .. ... ... ... ...
summarise(by_day, count=n())
## Source: local data frame [365 x 4]
## Groups: year, month
##
## year month day count
## 1 2013 1 1 842
## 2 2013 1 2 943
## 3 2013 1 3 914
## 4 2013 1 4 915
## 5 2013 1 5 720
## 6 2013 1 6 832
## 7 2013 1 7 933
## 8 2013 1 8 899
## 9 2013 1 9 902
## 10 2013 1 10 932
## .. ... ... ... ...
semi_join(planes, flights)
## Joining by: c("tailnum", "year")
## Source: local data frame [92 x 9]
##
## tailnum year type manufacturer model
## 1 N37465 2013 Fixed wing multi engine BOEING 737-924ER
## 2 N355JB 2013 Fixed wing multi engine EMBRAER ERJ 190-100 IGW
## 3 N37471 2013 Fixed wing multi engine BOEING 737-924ER
## 4 N292PQ 2013 Fixed wing multi engine BOMBARDIER INC CL-600-2D24
## 5 N354JB 2013 Fixed wing multi engine EMBRAER ERJ 190-100 IGW
## 6 N373JB 2013 Fixed wing multi engine EMBRAER ERJ 190-100 IGW
## 7 N374JB 2013 Fixed wing multi engine EMBRAER ERJ 190-100 IGW
## 8 N368JB 2013 Fixed wing multi engine EMBRAER ERJ 190-100 IGW
## 9 N156UW 2013 Fixed wing multi engine AIRBUS A321-211
## 10 N157UW 2013 Fixed wing multi engine AIRBUS A321-211
## .. ... ... ... ... ...
## Variables not shown: engines (int), seats (int), speed (int), engine (chr)
anti_join(planes, flights)
## Joining by: c("tailnum", "year")
## Source: local data frame [3,230 x 9]
##
## tailnum year type manufacturer
## 1 N995DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 2 N993AT 2002 Fixed wing multi engine BOEING
## 3 N992AT 2002 Fixed wing multi engine BOEING
## 4 N991DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 5 N991AT NA Fixed wing multi engine BOEING
## 6 N989DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 7 N989AT 2001 Fixed wing multi engine BOEING
## 8 N988DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## 9 N987AT 2001 Fixed wing multi engine BOEING
## 10 N986DL 1991 Fixed wing multi engine MCDONNELL DOUGLAS AIRCRAFT CO
## .. ... ... ... ...
## Variables not shown: model (chr), engines (int), seats (int), speed (int),
## engine (chr)
x %>% f(y) == f(x, y)
hourly_delay <- flights %>%
filter(!is.na(dep_delay)) %>%
group_by(air_time, distance) %>%
summarise( delay = mean(dep_delay), n = n() ) %>%
filter(n > 10)
hourly_delay
## Source: local data frame [5,277 x 4]
## Groups: air_time
##
## air_time distance delay n
## 1 22 116 8.312500 32
## 2 23 116 7.391304 69
## 3 24 94 20.920000 25
## 4 24 116 18.320000 75
## 5 25 94 12.672727 55
## 6 25 116 14.714286 63
## 7 26 94 12.176471 102
## 8 26 116 27.274510 51
## 9 27 94 22.504854 103
## 10 27 116 8.869565 23
## .. ... ... ... ...
# Built-in vignettes
browseVignettes(package = "dplyr")