0.1 Introduction

dplyr is an R package used for Data Manipulation and Exploration

Let’s get started!

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hflights)

data("hflights")

head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0
str(hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
# Converting to local data frame
flights <- tbl_df(hflights)

flights
## Source: local data frame [227,496 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## 7  2011     1          7         5    1359    1509            AA       428
## 8  2011     1          8         6    1355    1454            AA       428
## 9  2011     1          9         7    1443    1554            AA       428
## 10 2011     1         10         1    1443    1553            AA       428
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
# I can specify number of rows I want to see
print(flights, n = 30)
## Source: local data frame [227,496 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## 7  2011     1          7         5    1359    1509            AA       428
## 8  2011     1          8         6    1355    1454            AA       428
## 9  2011     1          9         7    1443    1554            AA       428
## 10 2011     1         10         1    1443    1553            AA       428
## 11 2011     1         11         2    1429    1539            AA       428
## 12 2011     1         12         3    1419    1515            AA       428
## 13 2011     1         13         4    1358    1501            AA       428
## 14 2011     1         14         5    1357    1504            AA       428
## 15 2011     1         15         6    1359    1459            AA       428
## 16 2011     1         16         7    1359    1509            AA       428
## 17 2011     1         17         1    1530    1634            AA       428
## 18 2011     1         18         2    1408    1508            AA       428
## 19 2011     1         19         3    1356    1503            AA       428
## 20 2011     1         20         4    1507    1622            AA       428
## 21 2011     1         21         5    1357    1459            AA       428
## 22 2011     1         22         6    1355    1456            AA       428
## 23 2011     1         23         7    1356    1501            AA       428
## 24 2011     1         24         1    1356    1513            AA       428
## 25 2011     1         25         2    1352    1452            AA       428
## 26 2011     1         26         3    1353    1455            AA       428
## 27 2011     1         27         4    1356    1458            AA       428
## 28 2011     1         28         5    1359    1505            AA       428
## 29 2011     1         29         6    1355    1455            AA       428
## 30 2011     1         30         7    1359    1456            AA       428
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
# Convert to a normal data frame to see all of the columns 

data.frame(head(flights))
##   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011     1          1         6    1400    1500            AA       428
## 2 2011     1          2         7    1401    1501            AA       428
## 3 2011     1          3         1    1352    1502            AA       428
## 4 2011     1          4         2    1403    1513            AA       428
## 5 2011     1          5         3    1405    1507            AA       428
## 6 2011     1          6         4    1359    1503            AA       428
##   TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1  N576AA                60      40      -10        0    IAH  DFW      224
## 2  N557AA                60      45       -9        1    IAH  DFW      224
## 3  N541AA                70      48       -8       -8    IAH  DFW      224
## 4  N403AA                70      39        3        3    IAH  DFW      224
## 5  N492AA                62      44       -3        5    IAH  DFW      224
## 6  N262AA                64      45       -7       -1    IAH  DFW      224
##   TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1      7      13         0                         0
## 2      6       9         0                         0
## 3      5      17         0                         0
## 4      9      22         0                         0
## 5      9       9         0                         0
## 6      6      13         0                         0

0.2 Filter the rows

# Base R approach

flights[flights$Month == 1 & flights$DayofMonth == 1,]
## Source: local data frame [552 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          1         6     728     840            AA       460
## 3  2011     1          1         6    1631    1736            AA      1121
## 4  2011     1          1         6    1756    2112            AA      1294
## 5  2011     1          1         6    1012    1347            AA      1700
## 6  2011     1          1         6    1211    1325            AA      1820
## 7  2011     1          1         6     557     906            AA      1994
## 8  2011     1          1         6    1824    2106            AS       731
## 9  2011     1          1         6     654    1124            B6       620
## 10 2011     1          1         6    1639    2110            B6       622
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
# dplyr  approach

# AND with ampersand
filter(.data = flights, ... = Month == 1 & DayofMonth == 1)
## Source: local data frame [552 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          1         6     728     840            AA       460
## 3  2011     1          1         6    1631    1736            AA      1121
## 4  2011     1          1         6    1756    2112            AA      1294
## 5  2011     1          1         6    1012    1347            AA      1700
## 6  2011     1          1         6    1211    1325            AA      1820
## 7  2011     1          1         6     557     906            AA      1994
## 8  2011     1          1         6    1824    2106            AS       731
## 9  2011     1          1         6     654    1124            B6       620
## 10 2011     1          1         6    1639    2110            B6       622
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
# OR with pipe sumbol
filter(.data = flights, ... = UniqueCarrier == "AA" | UniqueCarrier == "UA")
## Source: local data frame [5,316 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## 7  2011     1          7         5    1359    1509            AA       428
## 8  2011     1          8         6    1355    1454            AA       428
## 9  2011     1          9         7    1443    1554            AA       428
## 10 2011     1         10         1    1443    1553            AA       428
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
# Note : dplyr discards rownames

# You can even use the %in% operator
filter(.data = flights, ... = UniqueCarrier %in% c("AA", "UA"))
## Source: local data frame [5,316 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## 7  2011     1          7         5    1359    1509            AA       428
## 8  2011     1          8         6    1355    1454            AA       428
## 9  2011     1          9         7    1443    1554            AA       428
## 10 2011     1         10         1    1443    1553            AA       428
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)

0.3 Select Columns by Name

# Base R apporach

flights[, c("ArrTime", "DepTime")]
## Source: local data frame [227,496 x 2]
## 
##    ArrTime DepTime
## 1     1500    1400
## 2     1501    1401
## 3     1502    1352
## 4     1513    1403
## 5     1507    1405
## 6     1503    1359
## 7     1509    1359
## 8     1454    1355
## 9     1554    1443
## 10    1553    1443
## ..     ...     ...
# dplyr approach

select(.data = flights, ArrTime, DepTime)
## Source: local data frame [227,496 x 2]
## 
##    ArrTime DepTime
## 1     1500    1400
## 2     1501    1401
## 3     1502    1352
## 4     1513    1403
## 5     1507    1405
## 6     1503    1359
## 7     1509    1359
## 8     1454    1355
## 9     1554    1443
## 10    1553    1443
## ..     ...     ...
select(.data = flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
## Source: local data frame [227,496 x 7]
## 
##    Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## 1  2011     1          1      7      13      -10        0
## 2  2011     1          2      6       9       -9        1
## 3  2011     1          3      5      17       -8       -8
## 4  2011     1          4      9      22        3        3
## 5  2011     1          5      9       9       -3        5
## 6  2011     1          6      6      13       -7       -1
## 7  2011     1          7     12      15       -1       -1
## 8  2011     1          8      7      12      -16       -5
## 9  2011     1          9      8      22       44       43
## 10 2011     1         10      6      19       43       43
## ..  ...   ...        ...    ...     ...      ...      ...

0.4 Chaining or Pipelining

# nesting method

filter(select(flights, ArrDelay, DepDelay), DepDelay > 3)
## Source: local data frame [82,841 x 2]
## 
##    ArrDelay DepDelay
## 1        -3        5
## 2        44       43
## 3        43       43
## 4        29       29
## 5         5       19
## 6        84       90
## 7        -2        8
## 8        72       67
## 9        43       41
## 10        5        8
## ..      ...      ...
# chaining method

flights %>%
    select(ArrDelay, DepDelay) %>%
    filter(DepDelay > 3)
## Source: local data frame [82,841 x 2]
## 
##    ArrDelay DepDelay
## 1        -3        5
## 2        44       43
## 3        43       43
## 4        29       29
## 5         5       19
## 6        84       90
## 7        -2        8
## 8        72       67
## 9        43       41
## 10        5        8
## ..      ...      ...
# create two vectors and calculate Euclidean distance between them

x1 <- 1:5
x2 <- 3:7

(x1 - x2) ^ 2 %>% sum() %>% sqrt()
## [1] 4.472136

0.5 Arrange or Reorder Rows

flights %>% 
    select(UniqueCarrier, DepDelay) %>% 
    arrange(DepDelay)
## Source: local data frame [227,496 x 2]
## 
##    UniqueCarrier DepDelay
## 1             OO      -33
## 2             MQ      -23
## 3             XE      -19
## 4             XE      -19
## 5             CO      -18
## 6             EV      -18
## 7             XE      -17
## 8             CO      -17
## 9             XE      -17
## 10            MQ      -17
## ..           ...      ...
flights %>%
    select(UniqueCarrier, DepDelay) %>%
    arrange(desc(DepDelay))
## Source: local data frame [227,496 x 2]
## 
##    UniqueCarrier DepDelay
## 1             CO      981
## 2             AA      970
## 3             MQ      931
## 4             UA      869
## 5             MQ      814
## 6             MQ      803
## 7             CO      780
## 8             CO      758
## 9             DL      730
## 10            MQ      691
## ..           ...      ...

0.6 Mutate: Add new variables

# Create new variables which are functions of existing variables

flights %>%
    select(Distance, AirTime) %>%
    mutate(Speed = Distance / AirTime * 60)
## Source: local data frame [227,496 x 3]
## 
##    Distance AirTime    Speed
## 1       224      40 336.0000
## 2       224      45 298.6667
## 3       224      48 280.0000
## 4       224      39 344.6154
## 5       224      44 305.4545
## 6       224      45 298.6667
## 7       224      43 312.5581
## 8       224      40 336.0000
## 9       224      41 327.8049
## 10      224      45 298.6667
## ..      ...     ...      ...
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
# To store the new variable

flights <- flights %>%
                mutate(Speed = Distance / AirTime * 60)

str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame':    227496 obs. of  22 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Speed            : num  336 299 280 345 305 ...

0.7 Summarise: Reduce variables to values

head(data.frame(flights %>% select(Dest, TailNum) %>% group_by(Dest)))
##   Dest TailNum
## 1  DFW  N576AA
## 2  DFW  N557AA
## 3  DFW  N541AA
## 4  DFW  N403AA
## 5  DFW  N492AA
## 6  DFW  N262AA
flights %>% group_by(Dest) %>% summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))
## Source: local data frame [116 x 2]
## 
##    Dest  avg_delay
## 1   ABQ   7.226259
## 2   AEX   5.839437
## 3   AGS   4.000000
## 4   AMA   6.840095
## 5   ANC  26.080645
## 6   ASE   6.794643
## 7   ATL   8.233251
## 8   AUS   7.448718
## 9   AVL   9.973988
## 10  BFL -13.198807
## ..  ...        ...

0.7.1 summarize_each

Allows you to apply the same function to multiple columns at once.
Similar to summarize_each, we have mutate_each serving the same purpose as mutate but can be applied on multiple columns.

flights %>%
    group_by(UniqueCarrier) %>%
    summarise_each(funs(mean), Cancelled, Diverted)
## Source: local data frame [15 x 3]
## 
##    UniqueCarrier   Cancelled    Diverted
## 1             AA 0.018495684 0.001849568
## 2             AS 0.000000000 0.002739726
## 3             B6 0.025899281 0.005755396
## 4             CO 0.006782614 0.002627370
## 5             DL 0.015903067 0.003029156
## 6             EV 0.034482759 0.003176044
## 7             F9 0.007159905 0.000000000
## 8             FL 0.009817672 0.003272557
## 9             MQ 0.029044750 0.001936317
## 10            OO 0.013946828 0.003486707
## 11            UA 0.016409266 0.002413127
## 12            US 0.011268986 0.001469868
## 13            WN 0.015504047 0.002293629
## 14            XE 0.015495599 0.003449550
## 15            YV 0.012658228 0.000000000
flights %>%
    group_by(UniqueCarrier) %>%
    summarise_each(funs(min(., na.rm = TRUE), max(., na.rm = TRUE)), matches("Delay"))
## Source: local data frame [15 x 5]
## 
##    UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
## 1             AA          -39          -15          978          970
## 2             AS          -43          -15          183          172
## 3             B6          -44          -14          335          310
## 4             CO          -55          -18          957          981
## 5             DL          -32          -17          701          730
## 6             EV          -40          -18          469          479
## 7             F9          -24          -15          277          275
## 8             FL          -30          -14          500          507
## 9             MQ          -38          -23          918          931
## 10            OO          -57          -33          380          360
## 11            UA          -47          -11          861          869
## 12            US          -42          -17          433          425
## 13            WN          -44          -10          499          548
## 14            XE          -70          -19          634          628
## 15            YV          -32          -11           72           54
# n() - counts the number of rows in a group

# n_distinct(vector) - counts the number of unique items in a vector

flights %>%
    group_by(Month, DayofMonth) %>%
    summarize(flight_count = n()) %>%
    arrange(desc(flight_count))
## Source: local data frame [365 x 3]
## Groups: Month
## 
##    Month DayofMonth flight_count
## 1      1          3          702
## 2      1          2          678
## 3      1         20          663
## 4      1         27          663
## 5      1         13          662
## 6      1          7          661
## 7      1         14          661
## 8      1         21          661
## 9      1         28          661
## 10     1          6          660
## ..   ...        ...          ...
# Doing same thing with tally() function

flights %>%
    group_by(Month, DayofMonth) %>%
    tally(sort = TRUE)
## Source: local data frame [365 x 3]
## Groups: Month
## 
##    Month DayofMonth   n
## 1      1          3 702
## 2      1          2 678
## 3      1         20 663
## 4      1         27 663
## 5      1         13 662
## 6      1          7 661
## 7      1         14 661
## 8      1         21 661
## 9      1         28 661
## 10     1          6 660
## ..   ...        ... ...
flights %>%
    group_by(Dest) %>%
    summarise(flight_count = n(), plane_count = n_distinct(TailNum))
## Source: local data frame [116 x 3]
## 
##    Dest flight_count plane_count
## 1   ABQ         2812         716
## 2   AEX          724         215
## 3   AGS            1           1
## 4   AMA         1297         158
## 5   ANC          125          38
## 6   ASE          125          60
## 7   ATL         7886         983
## 8   AUS         5022        1015
## 9   AVL          350         142
## 10  BFL          504          70
## ..  ...          ...         ...

Grouping can sometimes be useful without summarizing

flights %>%
    group_by(Dest) %>%
    select(Cancelled) %>%
    table() %>%
    head()
##      Cancelled
## Dest     0  1
##   ABQ 2787 25
##   AEX  712 12
##   AGS    1  0
##   AMA 1265 32
##   ANC  125  0
##   ASE  120  5

0.8 Window Functions

An aggregation function - takes n inputs, return 1 value
A window function - takes n inputs, returns n values eg ranking and ordering functions, offset functions, and cumulative aggregates

Ranking and Ordering Fuctions, like min_rank
Offset Functions, like lead and lag
Cumulative Aggregates, like cummean

# for each carrier, calculate which two days of the year they had their longest departure delays

flights %>%
    group_by(UniqueCarrier) %>%
    select(Month, DayofMonth, DepDelay) %>%
    filter(min_rank(desc(DepDelay)) <= 2) %>%
    arrange(UniqueCarrier, desc(DepDelay))
## Source: local data frame [30 x 4]
## Groups: UniqueCarrier
## 
##    UniqueCarrier Month DayofMonth DepDelay
## 1             AA    12         12      970
## 2             AA    11         19      677
## 3             AS     2         28      172
## 4             AS     7          6      138
## 5             B6    10         29      310
## 6             B6     8         19      283
## 7             CO     8          1      981
## 8             CO     1         20      780
## 9             DL    10         25      730
## 10            DL     4          5      497
## ..           ...   ...        ...      ...
# using top_n() function to simplify the above command

flights %>%
    group_by(UniqueCarrier) %>%
    select(Month, DayofMonth, DepDelay) %>%
    top_n(n = 2) %>%
    arrange(UniqueCarrier, desc(DepDelay))
## Selecting by DepDelay
## Source: local data frame [30 x 4]
## Groups: UniqueCarrier
## 
##    UniqueCarrier Month DayofMonth DepDelay
## 1             AA    12         12      970
## 2             AA    11         19      677
## 3             AS     2         28      172
## 4             AS     7          6      138
## 5             B6    10         29      310
## 6             B6     8         19      283
## 7             CO     8          1      981
## 8             CO     1         20      780
## 9             DL    10         25      730
## 10            DL     4          5      497
## ..           ...   ...        ...      ...
# for each month calculate the number of flights and the change from the previous month

flights %>%
    group_by(Month) %>%
    summarise(flight_count = n()) %>%
    mutate(change = flight_count - lag(flight_count))
## Source: local data frame [12 x 3]
## 
##    Month flight_count change
## 1      1        18910     NA
## 2      2        17128  -1782
## 3      3        19470   2342
## 4      4        18593   -877
## 5      5        19172    579
## 6      6        19600    428
## 7      7        20548    948
## 8      8        20176   -372
## 9      9        18065  -2111
## 10    10        18696    631
## 11    11        18021   -675
## 12    12        19117   1096

0.9 Other useful convenience functions

# randomly sample a fixed number of rows without replacement

flights %>% sample_n(size = 10)
## Source: local data frame [10 x 22]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     7         16         6    1543    1653            XE      2069
## 2  2011     1         11         2    1325    1637            XE      2980
## 3  2011     2         20         7    1428    1757            CO      1844
## 4  2011     8         20         6    1023    1206            UA       434
## 5  2011     8          7         7    2014    2154            WN       164
## 6  2011    10         28         5    2009    2320            WN       287
## 7  2011    12         24         6     906    1216            WN      2215
## 8  2011     8          4         4     843    1327            CO      1529
## 9  2011     5         29         7    1648    1735            WN        40
## 10 2011    11          5         6    1830    2117            FL       296
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int), Speed (dbl)
# randomly sample a fraction of rows with replacement

flights %>% sample_frac(size = 0.6, replace = TRUE)
## Source: local data frame [136,498 x 22]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1  2011     2         22         2    1036    1123            WN       576
## 2  2011     2         14         1    1038    1337            XE      2029
## 3  2011     7          5         2    1759    2241            CO      1540
## 4  2011    12         29         4     706    1045            XE      4407
## 5  2011    12         21         3    1401    1730            UA       312
## 6  2011     7         18         1    2106    2207            XE      2654
## 7  2011     3         17         4    1833    2045            OO      4455
## 8  2011    12         16         5    1948    2041            WN        47
## 9  2011     9          4         7    1603    1810            XE      2944
## 10 2011     5          5         4     719     804            OO      1184
## ..  ...   ...        ...       ...     ...     ...           ...       ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int), Speed (dbl)
# dplyr approach, better formatting and adapts to your screen width

glimpse(flights) # dplyr approach to str
## Observations: 227496
## Variables:
## $ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Speed             (dbl) 336.0000, 298.6667, 280.0000, 344.6154, 305....