Data loading

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hflights)
## Warning: package 'hflights' was built under R version 3.2.2
str(hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
hflights <- tbl_df(hflights)
class(hflights)
## [1] "tbl_df"     "tbl"        "data.frame"
hflights
## Source: local data frame [227,496 x 21]
## 
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    (int) (int)      (int)     (int)   (int)   (int)         (chr)
## 1   2011     1          1         6    1400    1500            AA
## 2   2011     1          2         7    1401    1501            AA
## 3   2011     1          3         1    1352    1502            AA
## 4   2011     1          4         2    1403    1513            AA
## 5   2011     1          5         3    1405    1507            AA
## 6   2011     1          6         4    1359    1503            AA
## 7   2011     1          7         5    1359    1509            AA
## 8   2011     1          8         6    1355    1454            AA
## 9   2011     1          9         7    1443    1554            AA
## 10  2011     1         10         1    1443    1553            AA
## ..   ...   ...        ...       ...     ...     ...           ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
##   (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
##   (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)

Return a copy of hflights containing the columns Origin up to Cancelled

select(hflights,Origin, Dest, Distance, TaxiIn, TaxiOut, Cancelled)
## Source: local data frame [227,496 x 6]
## 
##    Origin  Dest Distance TaxiIn TaxiOut Cancelled
##     (chr) (chr)    (int)  (int)   (int)     (int)
## 1     IAH   DFW      224      7      13         0
## 2     IAH   DFW      224      6       9         0
## 3     IAH   DFW      224      5      17         0
## 4     IAH   DFW      224      9      22         0
## 5     IAH   DFW      224      9       9         0
## 6     IAH   DFW      224      6      13         0
## 7     IAH   DFW      224     12      15         0
## 8     IAH   DFW      224      7      12         0
## 9     IAH   DFW      224      8      22         0
## 10    IAH   DFW      224      6      19         0
## ..    ...   ...      ...    ...     ...       ...

Find the most concise way to select: columns Year up to and including DayOfWeek, columns ArrDelay up to and including Diverted. (The - operator allows you to select everything except a column or a range of columns.)

select(hflights, -DepTime, -ArrTime, -UniqueCarrier, -FlightNum, -TailNum, -ActualElapsedTime, -AirTime)
## Source: local data frame [227,496 x 14]
## 
##     Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin  Dest
##    (int) (int)      (int)     (int)    (int)    (int)  (chr) (chr)
## 1   2011     1          1         6      -10        0    IAH   DFW
## 2   2011     1          2         7       -9        1    IAH   DFW
## 3   2011     1          3         1       -8       -8    IAH   DFW
## 4   2011     1          4         2        3        3    IAH   DFW
## 5   2011     1          5         3       -3        5    IAH   DFW
## 6   2011     1          6         4       -7       -1    IAH   DFW
## 7   2011     1          7         5       -1       -1    IAH   DFW
## 8   2011     1          8         6      -16       -5    IAH   DFW
## 9   2011     1          9         7       44       43    IAH   DFW
## 10  2011     1         10         1       43       43    IAH   DFW
## ..   ...   ...        ...       ...      ...      ...    ...   ...
## Variables not shown: Distance (int), TaxiIn (int), TaxiOut (int),
##   Cancelled (int), CancellationCode (chr), Diverted (int)

dplyr provides 6 helper functions, each of which only works when used inside select().

Use select and a helper function to return a tbl copy of hflights that contains just ArrDelay and DepDelay.

select(hflights,contains("ArrDelay"), contains("DepDelay"))
## Source: local data frame [227,496 x 2]
## 
##    ArrDelay DepDelay
##       (int)    (int)
## 1       -10        0
## 2        -9        1
## 3        -8       -8
## 4         3        3
## 5        -3        5
## 6        -7       -1
## 7        -1       -1
## 8       -16       -5
## 9        44       43
## 10       43       43
## ..      ...      ...

Use a combination of helper functions and variable names to return the UniqueCarrier, FlightNum, TailNum, Cancelled, and CancellationCode columns of hflights.

select(hflights,matches("UniqueCarrier"),ends_with("Num"), contains("Cancell"))
## Source: local data frame [227,496 x 5]
## 
##    UniqueCarrier FlightNum TailNum Cancelled CancellationCode
##            (chr)     (int)   (chr)     (int)            (chr)
## 1             AA       428  N576AA         0                 
## 2             AA       428  N557AA         0                 
## 3             AA       428  N541AA         0                 
## 4             AA       428  N403AA         0                 
## 5             AA       428  N492AA         0                 
## 6             AA       428  N262AA         0                 
## 7             AA       428  N493AA         0                 
## 8             AA       428  N477AA         0                 
## 9             AA       428  N476AA         0                 
## 10            AA       428  N504AA         0                 
## ..           ...       ...     ...       ...              ...

Find the most concise way to return the following columns with select and its helper functions: DepTime, ArrTime, ActualElapsedTime, AirTime, ArrDelay, DepDelay. Use only helper functions!

select(hflights,starts_with("A"),starts_with("De"),-Dest)
## Source: local data frame [227,496 x 6]
## 
##    ArrTime ActualElapsedTime AirTime ArrDelay DepTime DepDelay
##      (int)             (int)   (int)    (int)   (int)    (int)
## 1     1500                60      40      -10    1400        0
## 2     1501                60      45       -9    1401        1
## 3     1502                70      48       -8    1352       -8
## 4     1513                70      39        3    1403        3
## 5     1507                62      44       -3    1405        5
## 6     1503                64      45       -7    1359       -1
## 7     1509                70      43       -1    1359       -1
## 8     1454                59      40      -16    1355       -5
## 9     1554                71      41       44    1443       43
## 10    1553                70      45       43    1443       43
## ..     ...               ...     ...      ...     ...      ...

basic R vs dplyr

# Exercise 1
ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")]
ex1d <- select(hflights,TaxiIn,TaxiOut,Distance)

# Exercise 2
ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")]
ex2d <- select(hflights,Year:ArrTime, -DayofMonth)

# Exercise 3
ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")]
ex3d <- select(hflights, starts_with("Ta"))

Mutating is creating

# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)

# Add the new variable GroundTime to a g1. Save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)

# Add the new variable AverageSpeed to g2. Save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)

# Print out g3
g3
## Source: local data frame [227,496 x 24]
## 
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    (int) (int)      (int)     (int)   (int)   (int)         (chr)
## 1   2011     1          1         6    1400    1500            AA
## 2   2011     1          2         7    1401    1501            AA
## 3   2011     1          3         1    1352    1502            AA
## 4   2011     1          4         2    1403    1513            AA
## 5   2011     1          5         3    1405    1507            AA
## 6   2011     1          6         4    1359    1503            AA
## 7   2011     1          7         5    1359    1509            AA
## 8   2011     1          8         6    1355    1454            AA
## 9   2011     1          9         7    1443    1554            AA
## 10  2011     1         10         1    1443    1553            AA
## ..   ...   ...        ...       ...     ...     ...           ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
##   (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
##   (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int), ActualGroundTime (int),
##   GroundTime (int), AverageSpeed (dbl)