dplyr Package

The dplyr package makes these steps fast and easy:

By constraining your options, it simplifies how you can think about common data manipulation tasks.

It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate those thoughts into code.

It uses efficient data storage backends, so you spend less time waiting for the computer.

Single table verbs

Dplyr aims to provide a function for each basic verb of data manipulation:
filter() (and slice()) arrange() select() (and rename()) distinct() mutate() (and transmute()) summarise() sample_n() and sample_frac() If you’ve used plyr before, many of these will be familar.

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nycflights13)
dim(flights)
## [1] 336776     16
head(flights)
## Source: local data frame [6 x 16]
## 
##    year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##   (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1  2013     1     1      517         2      830        11      UA  N14228
## 2  2013     1     1      533         4      850        20      UA  N24211
## 3  2013     1     1      542         2      923        33      AA  N619AA
## 4  2013     1     1      544        -1     1004       -18      B6  N804JB
## 5  2013     1     1      554        -6      812       -25      DL  N668DN
## 6  2013     1     1      554        -4      740        12      UA  N39463
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
colnames(flights)
##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay"
##  [6] "arr_time"  "arr_delay" "carrier"   "tailnum"   "flight"   
## [11] "origin"    "dest"      "air_time"  "distance"  "hour"     
## [16] "minute"

Exercises HEAD, TAIL, SELECT

head(select(flights,carrier,flight))
## Source: local data frame [6 x 2]
## 
##   carrier flight
##     (chr)  (int)
## 1      UA   1545
## 2      UA   1714
## 3      AA   1141
## 4      B6    725
## 5      DL    461
## 6      UA   1696
tail(select(flights,carrier,flight))
## Source: local data frame [6 x 2]
## 
##   carrier flight
##     (chr)  (int)
## 1      EV   5274
## 2      9E   3393
## 3      9E   3525
## 4      MQ   3461
## 5      MQ   3572
## 6      MQ   3531

USING PIPE LINE, SELECT

flights %>% select(carrier,flight)
## Source: local data frame [336,776 x 2]
## 
##    carrier flight
##      (chr)  (int)
## 1       UA   1545
## 2       UA   1714
## 3       AA   1141
## 4       B6    725
## 5       DL    461
## 6       UA   1696
## 7       B6    507
## 8       EV   5708
## 9       B6     79
## 10      AA    301
## ..     ...    ...
flights %>% select(carrier,flight, dep_time, arr_time)
## Source: local data frame [336,776 x 4]
## 
##    carrier flight dep_time arr_time
##      (chr)  (int)    (int)    (int)
## 1       UA   1545      517      830
## 2       UA   1714      533      850
## 3       AA   1141      542      923
## 4       B6    725      544     1004
## 5       DL    461      554      812
## 6       UA   1696      554      740
## 7       B6    507      555      913
## 8       EV   5708      557      709
## 9       B6     79      557      838
## 10      AA    301      558      753
## ..     ...    ...      ...      ...
select(flights,day,distance)
## Source: local data frame [336,776 x 2]
## 
##      day distance
##    (int)    (dbl)
## 1      1     1400
## 2      1     1416
## 3      1     1089
## 4      1     1576
## 5      1      762
## 6      1      719
## 7      1     1065
## 8      1      229
## 9      1      944
## 10     1      733
## ..   ...      ...

USING FILTER

flights %>% select(day,distance) %>% filter(day>2)
## Source: local data frame [314,932 x 2]
## 
##      day distance
##    (int)    (dbl)
## 1      3     1598
## 2      3      301
## 3      3     1576
## 4      3      529
## 5      3     1400
## 6      3     1416
## 7      3     1089
## 8      3     1576
## 9      3      944
## 10     3      502
## ..   ...      ...
flights %>% filter(day>2)
## Source: local data frame [314,932 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     3       32        33      504        22      B6  N763JB
## 2   2013     1     3       50       185      203       172      B6  N329JB
## 3   2013     1     3      235       156      700       143      B6  N618JB
## 4   2013     1     3      458        -2      650         0      US  N172US
## 5   2013     1     3      520        -5      830        10      UA  N78511
## 6   2013     1     3      532         2      851        20      UA  N76522
## 7   2013     1     3      535        -5      835       -15      AA  N618AA
## 8   2013     1     3      543        -2     1009       -13      B6  N779JB
## 9   2013     1     3      550       -10      843        -3      B6  N708JB
## 10  2013     1     3      552        -8      759        -2      DL  N954DL
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
filter(flights, month == 1, day == 1)
## Source: local data frame [842 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
flights[flights$month == 1 & flights$day == 1, ]
## Source: local data frame [842 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
filter(flights, month == 1 | month == 2)
## Source: local data frame [51,955 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
flights %>% filter(dep_time>=600, dep_time<=630)
## Source: local data frame [11,660 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      600         0      851        -7      B6  N595JB
## 2   2013     1     1      600         0      837        12      MQ  N542MQ
## 3   2013     1     1      601         1      844        -6      B6  N644JB
## 4   2013     1     1      602        -8      812        -8      DL  N971DL
## 5   2013     1     1      602        -3      821        16      MQ  N730MQ
## 6   2013     1     1      606        -4      858       -12      AA  N633AA
## 7   2013     1     1      606        -4      837        -8      DL  N3739P
## 8   2013     1     1      607         0      858       -17      UA  N53442
## 9   2013     1     1      608         8      807        32      MQ  N9EAMQ
## 10  2013     1     1      611        11      945        14      UA  N532UA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
flights %>% filter(dep_time>=600 & dep_time<=630)
## Source: local data frame [11,660 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      600         0      851        -7      B6  N595JB
## 2   2013     1     1      600         0      837        12      MQ  N542MQ
## 3   2013     1     1      601         1      844        -6      B6  N644JB
## 4   2013     1     1      602        -8      812        -8      DL  N971DL
## 5   2013     1     1      602        -3      821        16      MQ  N730MQ
## 6   2013     1     1      606        -4      858       -12      AA  N633AA
## 7   2013     1     1      606        -4      837        -8      DL  N3739P
## 8   2013     1     1      607         0      858       -17      UA  N53442
## 9   2013     1     1      608         8      807        32      MQ  N9EAMQ
## 10  2013     1     1      611        11      945        14      UA  N532UA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

Filter NA rows

flights %>% filter(is.na(dep_time))
## Source: local data frame [8,255 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1       NA        NA       NA        NA      EV  N18120
## 2   2013     1     1       NA        NA       NA        NA      AA  N3EHAA
## 3   2013     1     1       NA        NA       NA        NA      AA  N3EVAA
## 4   2013     1     1       NA        NA       NA        NA      B6  N618JB
## 5   2013     1     2       NA        NA       NA        NA      EV  N10575
## 6   2013     1     2       NA        NA       NA        NA      EV  N13949
## 7   2013     1     2       NA        NA       NA        NA      EV  N10575
## 8   2013     1     2       NA        NA       NA        NA      EV  N759EV
## 9   2013     1     2       NA        NA       NA        NA      EV  N13550
## 10  2013     1     2       NA        NA       NA        NA      AA        
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
flights %>% filter(!is.na(dep_time))
## Source: local data frame [328,521 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

Remove Columns using “-”

flights %>% select(-month,-day)
## Source: local data frame [336,776 x 14]
## 
##     year dep_time dep_delay arr_time arr_delay carrier tailnum flight
##    (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)  (int)
## 1   2013      517         2      830        11      UA  N14228   1545
## 2   2013      533         4      850        20      UA  N24211   1714
## 3   2013      542         2      923        33      AA  N619AA   1141
## 4   2013      544        -1     1004       -18      B6  N804JB    725
## 5   2013      554        -6      812       -25      DL  N668DN    461
## 6   2013      554        -4      740        12      UA  N39463   1696
## 7   2013      555        -5      913        19      B6  N516JB    507
## 8   2013      557        -3      709       -14      EV  N829AS   5708
## 9   2013      557        -3      838        -8      B6  N593JB     79
## 10  2013      558        -2      753         8      AA  N3ALAA    301
## ..   ...      ...       ...      ...       ...     ...     ...    ...
## Variables not shown: origin (chr), dest (chr), air_time (dbl), distance
##   (dbl), hour (dbl), minute (dbl)
flights %>% select(-(dep_time:arr_time))
## Source: local data frame [336,776 x 13]
## 
##     year month   day arr_delay carrier tailnum flight origin  dest
##    (int) (int) (int)     (dbl)   (chr)   (chr)  (int)  (chr) (chr)
## 1   2013     1     1        11      UA  N14228   1545    EWR   IAH
## 2   2013     1     1        20      UA  N24211   1714    LGA   IAH
## 3   2013     1     1        33      AA  N619AA   1141    JFK   MIA
## 4   2013     1     1       -18      B6  N804JB    725    JFK   BQN
## 5   2013     1     1       -25      DL  N668DN    461    LGA   ATL
## 6   2013     1     1        12      UA  N39463   1696    EWR   ORD
## 7   2013     1     1        19      B6  N516JB    507    EWR   FLL
## 8   2013     1     1       -14      EV  N829AS   5708    LGA   IAD
## 9   2013     1     1        -8      B6  N593JB     79    JFK   MCO
## 10  2013     1     1         8      AA  N3ALAA    301    LGA   ORD
## ..   ...   ...   ...       ...     ...     ...    ...    ...   ...
## Variables not shown: air_time (dbl), distance (dbl), hour (dbl), minute
##   (dbl)
flights %>% select(-contains("time"))
## Source: local data frame [336,776 x 13]
## 
##     year month   day dep_delay arr_delay carrier tailnum flight origin
##    (int) (int) (int)     (dbl)     (dbl)   (chr)   (chr)  (int)  (chr)
## 1   2013     1     1         2        11      UA  N14228   1545    EWR
## 2   2013     1     1         4        20      UA  N24211   1714    LGA
## 3   2013     1     1         2        33      AA  N619AA   1141    JFK
## 4   2013     1     1        -1       -18      B6  N804JB    725    JFK
## 5   2013     1     1        -6       -25      DL  N668DN    461    LGA
## 6   2013     1     1        -4        12      UA  N39463   1696    EWR
## 7   2013     1     1        -5        19      B6  N516JB    507    EWR
## 8   2013     1     1        -3       -14      EV  N829AS   5708    LGA
## 9   2013     1     1        -3        -8      B6  N593JB     79    JFK
## 10  2013     1     1        -2         8      AA  N3ALAA    301    LGA
## ..   ...   ...   ...       ...       ...     ...     ...    ...    ...
## Variables not shown: dest (chr), distance (dbl), hour (dbl), minute (dbl)

Renaming column

colnames(flights)
##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay"
##  [6] "arr_time"  "arr_delay" "carrier"   "tailnum"   "flight"   
## [11] "origin"    "dest"      "air_time"  "distance"  "hour"     
## [16] "minute"
flights %>% select(tail=tailnum)
## Source: local data frame [336,776 x 1]
## 
##      tail
##     (chr)
## 1  N14228
## 2  N24211
## 3  N619AA
## 4  N804JB
## 5  N668DN
## 6  N39463
## 7  N516JB
## 8  N829AS
## 9  N593JB
## 10 N3ALAA
## ..    ...

Difference between rename & select

flights %>% select(contains("time")) %>% rename(DEPARTURE_TIME=dep_time)
## Source: local data frame [336,776 x 3]
## 
##    DEPARTURE_TIME arr_time air_time
##             (int)    (int)    (dbl)
## 1             517      830      227
## 2             533      850      227
## 3             542      923      160
## 4             544     1004      183
## 5             554      812      116
## 6             554      740      150
## 7             555      913      158
## 8             557      709       53
## 9             557      838      140
## 10            558      753      138
## ..            ...      ...      ...
flights %>% select(contains("time")) %>% select(DEPARTURE_TIME=dep_time)
## Source: local data frame [336,776 x 1]
## 
##    DEPARTURE_TIME
##             (int)
## 1             517
## 2             533
## 3             542
## 4             544
## 5             554
## 6             554
## 7             555
## 8             557
## 9             557
## 10            558
## ..            ...

To select rows by position, use slice()

slice(flights,1:10)
## Source: local data frame [10 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
slice(flights,50:60)
## Source: local data frame [11 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      646         1      910        -6      UA  N569UA
## 2   2013     1     1      646         1     1023        -7      UA  N38727
## 3   2013     1     1      651        -4      936        -6      B6  N558JB
## 4   2013     1     1      652        -3      932        11      B6  N178JB
## 5   2013     1     1      653        -7      936       -33      DL  N327NW
## 6   2013     1     1      655         0     1021        -9      DL  N3763D
## 7   2013     1     1      655        -5     1037        -8      DL  N705TW
## 8   2013     1     1      655        -5     1002       -18      DL  N997DL
## 9   2013     1     1      656        -4      854         4      AA  N4WNAA
## 10  2013     1     1      656        -3      949       -10      AA  N5FMAA
## 11  2013     1     1      656        -9     1007        27      MQ  N722MQ
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

Arrange rows with arrange()

arrange() works similarly to filter() except that instead of filtering or selecting rows, it reorders them. It takes a data frame, and a set of column names (or more complicated expressions) to order by. If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns:

arrange(flights, year, month, day)
## Source: local data frame [336,776 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

Add new columns with mutate()

Besides selecting sets of existing columns, it’s often useful to add new columns that are functions of existing columns. This is the job of mutate():

mutate(flights, gain = arr_delay - dep_delay, speed = distance / air_time * 60)
## Source: local data frame [336,776 x 18]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl), gain (dbl), speed (dbl)