What is dplyr?

Load our data package

library(nycflights13)
## Warning: package 'nycflights13' was built under R version 3.1.1
suppressMessages(library(dplyr))

Package nycflights13 has data about all flights that departed NYC in 2013 - 5 datasets

  1. flights: Flights data
  2. airlines: Airline names
  3. airports: Airport metadata
  4. planes : Plane metadata
  5. weather: Hourly weather data

tbl_df

Prints nicely and prevents an accidental display of the whole dataset

tblflights <- tbl_df(flights)
head(tblflights,3) # Can also use print(flights,3) instead
## Source: local data frame [3 x 16]
## 
##   year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013     1   1      517         2      830        11      UA  N14228
## 2 2013     1   1      533         4      850        20      UA  N24211
## 3 2013     1   1      542         2      923        33      AA  N619AA
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

Basic single table (df) verbs

  1. filter: for subsetting variables
  2. select: for subsetting rows
  3. arrange: for re-ordering rows
  4. mutate: for adding new columns
  5. summarise or summarize: for reducing each group to a smaller number of summary statistics

filter: Keep rows matching criteria

# base R: tblflights[tblflights$carrier=="AA" & tblflights$origin=="LGA", ]
filter(tblflights, carrier=="AA" & origin=="LGA")
## Source: local data frame [15,459 x 16]
## 
##    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1  2013     1   1      558        -2      753         8      AA  N3ALAA
## 2  2013     1   1      559        -1      941        31      AA  N3DUAA
## 3  2013     1   1      623        13      920         5      AA  N3EMAA
## 4  2013     1   1      629        -1      824        14      AA  N3CYAA
## 5  2013     1   1      635         0     1028        48      AA  N3GKAA
## 6  2013     1   1      656        -4      854         4      AA  N4WNAA
## 7  2013     1   1      659        -1     1008        -7      AA  N3EKAA
## 8  2013     1   1      724        -6     1111        31      AA  N541AA
## 9  2013     1   1      739        -6      918       -12      AA  N4WPAA
## 10 2013     1   1      753        -2     1056       -14      AA  N3HMAA
## ..  ...   ... ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
# same as filter(tblflights, carrier=="AA", origin=="LGA")

filter again

filter(tblflights, carrier %in% c("AA", "UA"))
## Source: local data frame [91,394 x 16]
## 
##    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1  2013     1   1      517         2      830        11      UA  N14228
## 2  2013     1   1      533         4      850        20      UA  N24211
## 3  2013     1   1      542         2      923        33      AA  N619AA
## 4  2013     1   1      554        -4      740        12      UA  N39463
## 5  2013     1   1      558        -2      753         8      AA  N3ALAA
## 6  2013     1   1      558        -2      924         7      UA  N29129
## 7  2013     1   1      558        -2      923       -14      UA  N53441
## 8  2013     1   1      559        -1      941        31      AA  N3DUAA
## 9  2013     1   1      559        -1      854        -8      UA  N76515
## 10 2013     1   1      606        -4      858       -12      AA  N633AA
## ..  ...   ... ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)

select again

# dplyr approach
print(select(tblflights, dep_time, arr_time, flight),n=6)
## Source: local data frame [336,776 x 3]
## 
##    dep_time arr_time flight
## 1       517      830   1545
## 2       533      850   1714
## 3       542      923   1141
## 4       544     1004    725
## 5       554      812    461
## 6       554      740   1696
## ..      ...      ...    ...

Chaining over Nesting?

# nesting method to select carrier and dep_delay columns and filter for delays over 60 minutes
head(filter(select(tblflights, carrier, dep_delay), dep_delay > 60))
## Source: local data frame [6 x 2]
## 
##   carrier dep_delay
## 1      MQ       101
## 2      AA        71
## 3      MQ       853
## 4      UA       144
## 5      UA       134
## 6      EV        96

Chaining

arrange: Reorder rows

# base R approach to select carrier and dep_delay columns and sort by dep_delay
head(tblflights[order(tblflights$dep_delay), c("carrier", "dep_delay")])
## Source: local data frame [6 x 2]
## 
##        carrier dep_delay
## 89674       B6       -43
## 113634      DL       -33
## 64502       EV       -32
## 9620        DL       -30
## 24916       F9       -27
## 287743      MQ       -26

mutate: create new variables that are functions of existing variables

# base R approach to create a new variable - sum of squares of delays (arr and dep)
tblflights$delaysquare <- tblflights$dep_delay^2 + tblflights$arr_delay^2
head(tblflights[, c("dep_delay", "arr_delay", "delaysquare")])
## Source: local data frame [6 x 3]
## 
##   dep_delay arr_delay delaysquare
## 1         2        11         125
## 2         4        20         416
## 3         2        33        1093
## 4        -1       -18         325
## 5        -6       -25         661
## 6        -4        12         160

summarise/summarize: Reduce multiple variables to values

# base R approaches to calculate the mean arrival delays at different airports
aggregate(arr_delay ~ origin, tblflights, mean)
##   origin arr_delay
## 1    EWR     9.107
## 2    JFK     5.551
## 3    LGA     5.783
# or with(tblflights, tapply(arr_delay, origin, mean, na.rm=TRUE))

summarise_each/mutate_each: apply the same summary/mutate function(s) to multiple columns at once

# for each carrier, calculate the mean arrival and departure delays at the different origin airports
tblflights %>%
    group_by(origin) %>%
    summarise_each(funs(mean(.,na.rm=TRUE)), arr_delay, dep_delay) %>%
    head()
## Source: local data frame [3 x 3]
## 
##   origin arr_delay dep_delay
## 1    EWR     9.107     15.11
## 2    JFK     5.551     12.11
## 3    LGA     5.783     10.35

mutate_each

tblflights %>%
    select(matches("_delay")) %>%
    head(3)
## Source: local data frame [3 x 2]
## 
##   dep_delay arr_delay
## 1         2        11
## 2         4        20
## 3         2        33
tblflights %>%
    select(matches("_delay")) %>%
    mutate_each(funs(half=./2)) %>%
    head(3)
## Source: local data frame [3 x 2]
## 
##   dep_delay arr_delay
## 1         1       5.5
## 2         2      10.0
## 3         1      16.5

n_distinct(vector): counts the number of unique items in that vector

# for each destination, count the total number of flights and the number of distinct planes that flew there
tblflights %>%
    group_by(dest) %>%
    summarise(flight_count = n(), plane_count = n_distinct(tailnum)) %>%
    head()
## Source: local data frame [6 x 3]
## 
##   dest flight_count plane_count
## 1  ABQ          254         108
## 2  ACK          265          58
## 3  ALB          439         172
## 4  ANC            8           6
## 5  ATL        17215        1180
## 6  AUS         2439         993

Window Functions

Other things to play with

# for each carrier, calculate which two days of the year they had their longest departure delays--- rewrite previous with the `top_n` function
tblflights %>%
    group_by(carrier) %>%
    select(month, day, dep_delay) %>%
    filter(dep_delay!="NA") %>%
    top_n(2) %>%
    arrange(carrier, desc(dep_delay)) %>% head()

# for each month, calculate the number of flights and the change from the previous month
tblflights %>%
    group_by(month) %>%
    summarise(flight_count = n()) %>%
    mutate(change = flight_count - lag(flight_count))

# rewrite previous with the `tally` function
tblflights %>%
    group_by(month) %>%
    tally() %>%
    mutate(change = n - lag(n))

do : for doing arbitrary operations

model=tblflights %>% group_by(origin) %>% do(lm=lm(dep_delay~arr_delay+carrier,data=.)) 
## 
|================================                 | 67% ~2 s remaining     
Completed after 4 s
model %>% summarise(rsq=summary(lm)$r.squared)
## Source: local data frame [3 x 1]
## 
##      rsq
## 1 0.8636
## 2 0.8182
## 3 0.8417

Quick look at 3 of 5 nycflights13 datasets

tblflights %>% head(1) 
## Source: local data frame [1 x 17]
## 
##   year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013     1   1      517         2      830        11      UA  N14228
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl), delaysquare (dbl)
airlines %>% head(1)
## Source: local data frame [1 x 2]
## 
##   carrier              name
## 1      9E Endeavor Air Inc.
airports %>% head(1)
## Source: local data frame [1 x 7]
## 
##   faa              name   lat    lon  alt tz dst
## 1 04G Lansdowne Airport 41.13 -80.62 1044 -5   A

left_join(x,y): all x + matching y

faatblflights=tblflights %>% select(origin) %>% mutate(faa=origin)
faatblflights %>% left_join(airports) %>% head(2) 
## Joining by: "faa"
## Source: local data frame [2 x 8]
## 
##   origin faa                name   lat    lon alt tz dst
## 1    EWR EWR Newark Liberty Intl 40.69 -74.17  18 -5   A
## 2    LGA LGA          La Guardia 40.78 -73.87  22 -5   A
airports %>% left_join(faatblflights) %>% head(2)
## Joining by: "faa"
## Source: local data frame [2 x 8]
## 
##   faa                          name   lat    lon  alt tz dst origin
## 1 04G             Lansdowne Airport 41.13 -80.62 1044 -5   A     NA
## 2 06A Moton Field Municipal Airport 32.46 -85.68  264 -5   A     NA

anti_join(x, y) : all x without match in y

faatblflights %>% anti_join(airports)
## Joining by: "faa"
## Source: local data frame [0 x 2]
airports %>% anti_join(faatblflights)
## Joining by: "faa"
## Source: local data frame [1,394 x 7]
## 
##    faa                                name   lat     lon  alt  tz dst
## 1  PVC                   Provincetown Muni 42.07  -70.22    9  -5   A
## 2  GDV            Dawson Community Airport 47.14 -104.81  749  -7   A
## 3  OWB    Owensboro Daviess County Airport 37.74  -87.17  406  -6   A
## 4  ESD                Orcas Island Airport 48.71 -122.91   31  -8   A
## 5  PBI                     Palm Beach Intl 26.68  -80.10   19  -5   A
## 6  NIB                     Nikolai Airport 63.01 -154.38  427  -9   A
## 7  DLL     Baraboo Wisconsin Dells Airport 43.52  -89.77  979  -6   A
## 8  MKK                             Molokai 21.15 -157.10  454 -10   N
## 9  CFD                         Coulter Fld 30.72  -96.33  367  -6   A
## 10 BJC Rocky Mountain Metropolitan Airport 39.91 -105.12 5670  -7   A
## .. ...                                 ...   ...     ...  ... ... ...

The fun has to stop here…