01 - introduction to dplyr

# dplyr is an R package, a collection of functions and data sets that enhance the R language.
# install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

hflights data set

# package hflights stores data set of airline flight data containing flights that departed from Houston
# Load the hflights package
# install.packages("hflights")
library(hflights)

class(hflights)
## [1] "data.frame"

head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

summary of hflights

summary(hflights)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime        ArrTime     UniqueCarrier        FlightNum   
##  Min.   :   1   Min.   :   1   Length:227496      Min.   :   1  
##  1st Qu.:1021   1st Qu.:1215   Class :character   1st Qu.: 855  
##  Median :1416   Median :1617   Mode  :character   Median :1696  
##  Mean   :1396   Mean   :1578                      Mean   :1962  
##  3rd Qu.:1801   3rd Qu.:1953                      3rd Qu.:2755  
##  Max.   :2400   Max.   :2400                      Max.   :7290  
##  NA's   :2905   NA's   :3066                                    
##    TailNum          ActualElapsedTime    AirTime         ArrDelay      
##  Length:227496      Min.   : 34.0     Min.   : 11.0   Min.   :-70.000  
##  Class :character   1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000  
##  Mode  :character   Median :128.0     Median :107.0   Median :  0.000  
##                     Mean   :129.3     Mean   :108.1   Mean   :  7.094  
##                     3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000  
##                     Max.   :575.0     Max.   :549.0   Max.   :978.000  
##                     NA's   :3622      NA's   :3622    NA's   :3622     
##     DepDelay          Origin              Dest              Distance     
##  Min.   :-33.000   Length:227496      Length:227496      Min.   :  79.0  
##  1st Qu.: -3.000   Class :character   Class :character   1st Qu.: 376.0  
##  Median :  0.000   Mode  :character   Mode  :character   Median : 809.0  
##  Mean   :  9.445                                         Mean   : 787.8  
##  3rd Qu.:  9.000                                         3rd Qu.:1042.0  
##  Max.   :981.000                                         Max.   :3904.0  
##  NA's   :2905                                                            
##      TaxiIn           TaxiOut         Cancelled       CancellationCode  
##  Min.   :  1.000   Min.   :  1.00   Min.   :0.00000   Length:227496     
##  1st Qu.:  4.000   1st Qu.: 10.00   1st Qu.:0.00000   Class :character  
##  Median :  5.000   Median : 14.00   Median :0.00000   Mode  :character  
##  Mean   :  6.099   Mean   : 15.09   Mean   :0.01307                     
##  3rd Qu.:  7.000   3rd Qu.: 18.00   3rd Qu.:0.00000                     
##  Max.   :165.000   Max.   :163.00   Max.   :1.00000                     
##  NA's   :3066      NA's   :2947                                         
##     Diverted       
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.002853  
##  3rd Qu.:0.000000  
##  Max.   :1.000000  
##

structure of hflights

str(hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...

02 - tbl (pronounced tibble)

special kind of data.frame
make data:
- easier to look at
- easier to work with
changes how R displays your data
- does not change the data’s underlying data structure
- inherits the original class of its input

# derive a tbl from a data.frame structure
# Convert the hflights data.frame into a hflights tbl
hflights <- tbl_df(hflights)

# Display the new hflights in your console window. 
# Notice the easy-to-read layout.
hflights
## # A tibble: 227,496 x 21
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##  * <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1          1         6    1400    1500 AA           
##  2  2011     1          2         7    1401    1501 AA           
##  3  2011     1          3         1    1352    1502 AA           
##  4  2011     1          4         2    1403    1513 AA           
##  5  2011     1          5         3    1405    1507 AA           
##  6  2011     1          6         4    1359    1503 AA           
##  7  2011     1          7         5    1359    1509 AA           
##  8  2011     1          8         6    1355    1454 AA           
##  9  2011     1          9         7    1443    1554 AA           
## 10  2011     1         10         1    1443    1553 AA           
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

# inherits the original class of its input
class(hflights)
## [1] "tbl_df"     "tbl"        "data.frame"

# save the UniqueCarrier column of hflights as an object named carriers, using base R syntax only.
carriers <- hflights$UniqueCarrier

Changing labels

UniqueCarrier

unique(hflights$UniqueCarrier) # uses a confusing code system.
##  [1] "AA" "AS" "B6" "CO" "DL" "OO" "UA" "US" "WN" "EV" "F9" "FL" "MQ" "XE"
## [15] "YV"

# "lookup table"" in the form of a named vector

# When you subset the lookup table with a character string 
#   R will return the values of the lookup table 
#     that correspond to the names in the character string

two <- c("AA", "AS")
lut <- c("AA" = "American", 
         "AS" = "Alaska", 
         "B6" = "JetBlue")
two <- lut[two]
two
##         AA         AS 
## "American"   "Alaska"


lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental", 
         "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways", 
         "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier", 
         "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")

# Add a new Carrier column to hflights 
# by combining lut 
# with the UniqueCarrier column of hflights
# INCORRECT: hflights$Carrier <- c(lut,carriers)
hflights$Carrier <- lut[hflights$UniqueCarrier]

# how to check this was added correctly?
# Carrier variable does not appear when you print hflights
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier           <chr> "American", "American", "American", "America...

CancellationCode

# lists reasons why a flight was cancelled using a non-informative alphabetical code
unique(hflights$CancellationCode)
## [1] ""  "A" "B" "C" "D"

# The lookup table: converts the alphabetical codes into more meaningful strings
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA", "D" = "security", "E" = "not cancelled")

# Use lut to change the labels of the CancellationCode column of hflights. 
# Store the recoded vector in a new column Code.
# Add the Code column
hflights$Code <- lut[hflights$CancellationCode]

# check your results
glimpse(hflights)
## Observations: 227,496
## Variables: 23
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier           <chr> "American", "American", "American", "America...
## $ Code              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

03 - five verbs: select

select(), which returns a subset of the columns,

# selects the variables cyl and disp from the data frame df: e.g. mtcars
# select(df, var1, var2)
select(mtcars, cyl, disp)
##                     cyl  disp
## Mazda RX4             6 160.0
## Mazda RX4 Wag         6 160.0
## Datsun 710            4 108.0
## Hornet 4 Drive        6 258.0
## Hornet Sportabout     8 360.0
## Valiant               6 225.0
## Duster 360            8 360.0
## Merc 240D             4 146.7
## Merc 230              4 140.8
## Merc 280              6 167.6
## Merc 280C             6 167.6
## Merc 450SE            8 275.8
## Merc 450SL            8 275.8
## Merc 450SLC           8 275.8
## Cadillac Fleetwood    8 472.0
## Lincoln Continental   8 460.0
## Chrysler Imperial     8 440.0
## Fiat 128              4  78.7
## Honda Civic           4  75.7
## Toyota Corolla        4  71.1
## Toyota Corona         4 120.1
## Dodge Challenger      8 318.0
## AMC Javelin           8 304.0
## Camaro Z28            8 350.0
## Pontiac Firebird      8 400.0
## Fiat X1-9             4  79.0
## Porsche 914-2         4 120.3
## Lotus Europa          4  95.1
## Ford Pantera L        8 351.0
## Ferrari Dino          6 145.0
## Maserati Bora         8 301.0
## Volvo 142E            4 121.0

# use : to select a range of variables 
# and - to exclude some variables
# selects the four first variables except for the second one
select(mtcars, 1:4, -2)
##                      mpg  disp  hp
## Mazda RX4           21.0 160.0 110
## Mazda RX4 Wag       21.0 160.0 110
## Datsun 710          22.8 108.0  93
## Hornet 4 Drive      21.4 258.0 110
## Hornet Sportabout   18.7 360.0 175
## Valiant             18.1 225.0 105
## Duster 360          14.3 360.0 245
## Merc 240D           24.4 146.7  62
## Merc 230            22.8 140.8  95
## Merc 280            19.2 167.6 123
## Merc 280C           17.8 167.6 123
## Merc 450SE          16.4 275.8 180
## Merc 450SL          17.3 275.8 180
## Merc 450SLC         15.2 275.8 180
## Cadillac Fleetwood  10.4 472.0 205
## Lincoln Continental 10.4 460.0 215
## Chrysler Imperial   14.7 440.0 230
## Fiat 128            32.4  78.7  66
## Honda Civic         30.4  75.7  52
## Toyota Corolla      33.9  71.1  65
## Toyota Corona       21.5 120.1  97
## Dodge Challenger    15.5 318.0 150
## AMC Javelin         15.2 304.0 150
## Camaro Z28          13.3 350.0 245
## Pontiac Firebird    19.2 400.0 175
## Fiat X1-9           27.3  79.0  66
## Porsche 914-2       26.0 120.3  91
## Lotus Europa        30.4  95.1 113
## Ford Pantera L      15.8 351.0 264
## Ferrari Dino        19.7 145.0 175
## Maserati Bora       15.0 301.0 335
## Volvo 142E          21.4 121.0 109


# Use select() to print out a tbl 
#   that contains only the columns
#   ActualElapsedTime, AirTime, ArrDelay and DepDelay 
#   of hflights.
# Print out a tbl with the four columns of hflights related to delay
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## # A tibble: 227,496 x 4
##    ActualElapsedTime AirTime ArrDelay DepDelay
##  *             <int>   <int>    <int>    <int>
##  1                60      40      -10        0
##  2                60      45       -9        1
##  3                70      48       -8       -8
##  4                70      39        3        3
##  5                62      44       -3        5
##  6                64      45       -7       -1
##  7                70      43       -1       -1
##  8                59      40      -16       -5
##  9                71      41       44       43
## 10                70      45       43       43
## # ... with 227,486 more rows

# Print out a tbl 
#   with the columns Origin up to and including Cancelled 
#   of hflights.
# Print out the columns Origin up to Cancelled of hflights
select(hflights, Origin:Cancelled)
## # A tibble: 227,496 x 6
##    Origin Dest  Distance TaxiIn TaxiOut Cancelled
##  * <chr>  <chr>    <int>  <int>   <int>     <int>
##  1 IAH    DFW        224      7      13         0
##  2 IAH    DFW        224      6       9         0
##  3 IAH    DFW        224      5      17         0
##  4 IAH    DFW        224      9      22         0
##  5 IAH    DFW        224      9       9         0
##  6 IAH    DFW        224      6      13         0
##  7 IAH    DFW        224     12      15         0
##  8 IAH    DFW        224      7      12         0
##  9 IAH    DFW        224      8      22         0
## 10 IAH    DFW        224      6      19         0
## # ... with 227,486 more rows

# Find the most concise way to select:
# columns Year up to and including DayOfWeek, 
# columns ArrDelay up to and including Diverted. 
# You can examine the order of the variables in hflights
#   with names(hflights) in the console.
# Answer to last question: be concise!
names(hflights)
##  [1] "Year"              "Month"             "DayofMonth"       
##  [4] "DayOfWeek"         "DepTime"           "ArrTime"          
##  [7] "UniqueCarrier"     "FlightNum"         "TailNum"          
## [10] "ActualElapsedTime" "AirTime"           "ArrDelay"         
## [13] "DepDelay"          "Origin"            "Dest"             
## [16] "Distance"          "TaxiIn"            "TaxiOut"          
## [19] "Cancelled"         "CancellationCode"  "Diverted"         
## [22] "Carrier"           "Code"
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
## # A tibble: 227,496 x 14
##     Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest 
##  * <int> <int>      <int>     <int>    <int>    <int> <chr>  <chr>
##  1  2011     1          1         6      -10        0 IAH    DFW  
##  2  2011     1          2         7       -9        1 IAH    DFW  
##  3  2011     1          3         1       -8       -8 IAH    DFW  
##  4  2011     1          4         2        3        3 IAH    DFW  
##  5  2011     1          5         3       -3        5 IAH    DFW  
##  6  2011     1          6         4       -7       -1 IAH    DFW  
##  7  2011     1          7         5       -1       -1 IAH    DFW  
##  8  2011     1          8         6      -16       -5 IAH    DFW  
##  9  2011     1          9         7       44       43 IAH    DFW  
## 10  2011     1         10         1       43       43 IAH    DFW  
## # ... with 227,486 more rows, and 6 more variables: Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>

helper functions

starts_with(“X”): every name that starts with “X”,
ends_with(“X”): every name that ends with “X”,
contains(“X”): every name that contains “X”,
matches(“X”): every name that matches “X”, where “X” can be a regular expression,
num_range(“x”, 1:5): the variables named x01, x02, x03, x04 and x05,
one_of(x): every name that appears in x, which should be a character vector.

NB1: When you refer to columns directly inside select(), you don’t use quotes.

NB2: If you use the helper functions, you do use quotes.

# Print out a tbl containing just ArrDelay and DepDelay
select(hflights, ArrDelay, DepDelay)
## # A tibble: 227,496 x 2
##    ArrDelay DepDelay
##  *    <int>    <int>
##  1      -10        0
##  2       -9        1
##  3       -8       -8
##  4        3        3
##  5       -3        5
##  6       -7       -1
##  7       -1       -1
##  8      -16       -5
##  9       44       43
## 10       43       43
## # ... with 227,486 more rows

select(hflights, ends_with("Delay"))
## # A tibble: 227,496 x 2
##    ArrDelay DepDelay
##  *    <int>    <int>
##  1      -10        0
##  2       -9        1
##  3       -8       -8
##  4        3        3
##  5       -3        5
##  6       -7       -1
##  7       -1       -1
##  8      -16       -5
##  9       44       43
## 10       43       43
## # ... with 227,486 more rows


# Use a combination of helper functions and variable names 
# to print out only 
# UniqueCarrier, FlightNum, TailNum, Cancelled, and CancellationCode 
# columns of hflights.
select(hflights, UniqueCarrier:TailNum, starts_with("Cancell"))
## # A tibble: 227,496 x 5
##    UniqueCarrier FlightNum TailNum Cancelled CancellationCode
##  * <chr>             <int> <chr>       <int> <chr>           
##  1 AA                  428 N576AA          0 ""              
##  2 AA                  428 N557AA          0 ""              
##  3 AA                  428 N541AA          0 ""              
##  4 AA                  428 N403AA          0 ""              
##  5 AA                  428 N492AA          0 ""              
##  6 AA                  428 N262AA          0 ""              
##  7 AA                  428 N493AA          0 ""              
##  8 AA                  428 N477AA          0 ""              
##  9 AA                  428 N476AA          0 ""              
## 10 AA                  428 N504AA          0 ""              
## # ... with 227,486 more rows
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))
## # A tibble: 227,496 x 5
##    UniqueCarrier FlightNum TailNum Cancelled CancellationCode
##  * <chr>             <int> <chr>       <int> <chr>           
##  1 AA                  428 N576AA          0 ""              
##  2 AA                  428 N557AA          0 ""              
##  3 AA                  428 N541AA          0 ""              
##  4 AA                  428 N403AA          0 ""              
##  5 AA                  428 N492AA          0 ""              
##  6 AA                  428 N262AA          0 ""              
##  7 AA                  428 N493AA          0 ""              
##  8 AA                  428 N477AA          0 ""              
##  9 AA                  428 N476AA          0 ""              
## 10 AA                  428 N504AA          0 ""              
## # ... with 227,486 more rows


# Find the most concise way to return 
# the following columns
# with select 
# and its helper functions: 
# DepTime, ArrTime, ActualElapsedTime, AirTime, ArrDelay, DepDelay
# using only helper functions.
select(hflights, contains("Time"), ends_with("Delay"))
## # A tibble: 227,496 x 6
##    DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
##  *   <int>   <int>             <int>   <int>    <int>    <int>
##  1    1400    1500                60      40      -10        0
##  2    1401    1501                60      45       -9        1
##  3    1352    1502                70      48       -8       -8
##  4    1403    1513                70      39        3        3
##  5    1405    1507                62      44       -3        5
##  6    1359    1503                64      45       -7       -1
##  7    1359    1509                70      43       -1       -1
##  8    1355    1454                59      40      -16       -5
##  9    1443    1554                71      41       44       43
## 10    1443    1553                70      45       43       43
## # ... with 227,486 more rows
select(hflights, contains("Tim"), contains("Del"))
## # A tibble: 227,496 x 6
##    DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
##  *   <int>   <int>             <int>   <int>    <int>    <int>
##  1    1400    1500                60      40      -10        0
##  2    1401    1501                60      45       -9        1
##  3    1352    1502                70      48       -8       -8
##  4    1403    1513                70      39        3        3
##  5    1405    1507                62      44       -3        5
##  6    1359    1503                64      45       -7       -1
##  7    1359    1509                70      43       -1       -1
##  8    1355    1454                59      40      -16       -5
##  9    1443    1554                71      41       44       43
## 10    1443    1553                70      45       43       43
## # ... with 227,486 more rows

To see the added value of the dplyr package, it is useful to compare its syntax with base R

# Finish select call so that ex1d matches ex1r
ex1r <- hflights[c("TaxiIn", "TaxiOut", "Distance")]
ex1d <- select(hflights, starts_with("Taxi"), Distance)
ex1d <- select(hflights, contains("Taxi"), Distance)

# Finish select call so that ex2d matches ex2r
ex2r <- hflights[c("Year", "Month", "DayOfWeek", "DepTime", "ArrTime")]
ex2d <- select(hflights, Year:Month, DayOfWeek:ArrTime)
ex2d <- select(hflights, Year:ArrTime, -DayofMonth)

# Finish select call so that ex3d matches ex3r
ex3r <- hflights[c("TailNum", "TaxiIn", "TaxiOut")]
ex3d <- select(hflights, starts_with("T"))

04 - five verbs: mutate

mutate(), used to add columns from existing data,

# Create a new data frame, g1, 
# which is the data frame hflights 
# with an additional column: 
#   ActualGroundTime, the difference between
#                         ActualElapsedTime and AirTime.

# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)


# Extend g1 further, 
# by adding an additional column GroundTime. 
#       sum of the TaxiIn and TaxiOut columns
# Store the resulting data frame in g2. 

# Add the new variable GroundTime to g1. Save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)

# Check in the console that the GroundTime and ActualGroundTime columns are equal.
sum(g2$GroundTime, na.rm = TRUE)
## [1] 4755621
sum(g2$ActualGroundTime, na.rm = TRUE)
## [1] 4741967
sum(na.omit(g2$GroundTime))
## [1] 4755621
sum(na.omit(g2$ActualGroundTime))
## [1] 4741967
sum(as.numeric(na.omit(g2$GroundTime - g2$ActualGroundTime)))
## [1] 0

# Add the new variable AverageSpeed to g2. Save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)

# Print out g3
g3
## # A tibble: 227,496 x 26
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1          1         6    1400    1500 AA           
##  2  2011     1          2         7    1401    1501 AA           
##  3  2011     1          3         1    1352    1502 AA           
##  4  2011     1          4         2    1403    1513 AA           
##  5  2011     1          5         3    1405    1507 AA           
##  6  2011     1          6         4    1359    1503 AA           
##  7  2011     1          7         5    1359    1509 AA           
##  8  2011     1          8         6    1355    1454 AA           
##  9  2011     1          9         7    1443    1554 AA           
## 10  2011     1         10         1    1443    1553 AA           
## # ... with 227,486 more rows, and 19 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>, ActualGroundTime <int>,
## #   GroundTime <int>, AverageSpeed <dbl>

Add multiple variables

mutate(my_df, x = a + b, y = x + c)

# Add a second variable loss_ratio to the dataset: m1
m1 <- mutate(hflights, loss = ArrDelay - DepDelay)
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_ratio = loss / DepDelay)

m2 <- mutate(hflights, TotalTaxi = TaxiIn + TaxiOut, ActualGroundTime = ActualElapsedTime - AirTime, Diff = TotalTaxi - ActualGroundTime)

# Diff column should be zero for all observations!
head(as.numeric(na.omit(m2$Diff)),20)
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
head(m2$Diff[!is.na(m2$Diff)],20)
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

05 - five verbs: filter

filter(), that is able to return a subset of the rows,

e.g. keep observations for which a is positive filter(df, a > 0)

Logical operators:

x < y, TRUE if x is less than y
x <= y, TRUE if x is less than or equal to y
x == y, TRUE if x equals y
x != y, TRUE if x does not equal y
x >= y, TRUE if x is greater than or equal to y
x > y, TRUE if x is greater than y
x %in% c(a, b, c), TRUE if x is in the vector c(a, b, c)

# All flights that traveled 3000 miles or more
filter(hflights, Distance >= 3000)
## # A tibble: 527 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1         31         1     924    1413 CO           
##  2  2011     1         30         7     925    1410 CO           
##  3  2011     1         29         6    1045    1445 CO           
##  4  2011     1         28         5    1516    1916 CO           
##  5  2011     1         27         4     950    1344 CO           
##  6  2011     1         26         3     944    1350 CO           
##  7  2011     1         25         2     924    1337 CO           
##  8  2011     1         24         1    1144    1605 CO           
##  9  2011     1         23         7     926    1335 CO           
## 10  2011     1         22         6     942    1340 CO           
## # ... with 517 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c("JetBlue", "Southwest", "Delta"))
## # A tibble: 0 x 23
## # ... with 23 variables: Year <int>, Month <int>, DayofMonth <int>,
## #   DayOfWeek <int>, DepTime <int>, ArrTime <int>, UniqueCarrier <chr>,
## #   FlightNum <int>, TailNum <chr>, ActualElapsedTime <int>,
## #   AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## #   Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## #   Cancelled <int>, CancellationCode <chr>, Diverted <int>,
## #   Carrier <chr>, Code <chr>

# All flights where taxiing took longer than flying
#                                            actual flight
# Avoid the use of mutate() 
# do the math directly in the logical expression of filter().
filter(hflights, TaxiIn + TaxiOut > AirTime)
## # A tibble: 1,389 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1         24         1     731     904 AA           
##  2  2011     1         30         7    1959    2132 AA           
##  3  2011     1         24         1    1621    1749 AA           
##  4  2011     1         10         1     941    1113 AA           
##  5  2011     1         31         1    1301    1356 CO           
##  6  2011     1         31         1    2113    2215 CO           
##  7  2011     1         31         1    1434    1539 CO           
##  8  2011     1         31         1     900    1006 CO           
##  9  2011     1         30         7    1304    1408 CO           
## 10  2011     1         30         7    2004    2128 CO           
## # ... with 1,379 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

Boolean operators:

& (and)
| (or)
! (not)

Instead of using the & operator, you can also pass several logical tests to filter(), separated by commas.

The following two calls are completely equivalent:

filter(df, a > 0 & b > 0)
filter(df, a > 0, b > 0)

To keep the observations in df for which the variable x is not NA:

filter(df, !is.na(x))

# Use R's logical and boolean operators
# to select just the rows 
# where a flight 
#           left before 5:00 am (500) 
#     or arrived after 10:00 pm (2200)

# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime < 500 | ArrTime > 2200)
## # A tibble: 27,799 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1          4         2    2100    2207 AA           
##  2  2011     1         14         5    2119    2229 AA           
##  3  2011     1         10         1    1934    2235 AA           
##  4  2011     1         26         3    1905    2211 AA           
##  5  2011     1         30         7    1856    2209 AA           
##  6  2011     1          9         7    1938    2228 AS           
##  7  2011     1         31         1    1919    2231 CO           
##  8  2011     1         31         1    2116    2344 CO           
##  9  2011     1         31         1    1850    2211 CO           
## 10  2011     1         31         1    2102    2216 CO           
## # ... with 27,789 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay > 0,  ArrDelay < 0)
## # A tibble: 27,712 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1          2         7    1401    1501 AA           
##  2  2011     1          5         3    1405    1507 AA           
##  3  2011     1         18         2    1408    1508 AA           
##  4  2011     1         18         2     721     827 AA           
##  5  2011     1         12         3    2015    2113 AA           
##  6  2011     1         13         4    2020    2116 AA           
##  7  2011     1         26         3    2009    2103 AA           
##  8  2011     1          1         6    1631    1736 AA           
##  9  2011     1         10         1    1639    1740 AA           
## 10  2011     1         12         3    1631    1739 AA           
## # ... with 27,702 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

# All flights that were cancelled after being delayed
filter(hflights, DepDelay > 0, Cancelled == 1)
## # A tibble: 40 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1         26         3    1926      NA CO           
##  2  2011     1         11         2    1100      NA US           
##  3  2011     1         19         3    1811      NA XE           
##  4  2011     1          7         5    2028      NA XE           
##  5  2011     2          4         5    1638      NA AA           
##  6  2011     2          8         2    1057      NA CO           
##  7  2011     2          2         3     802      NA XE           
##  8  2011     2          9         3     904      NA XE           
##  9  2011     2          1         2    1508      NA OO           
## 10  2011     3         31         4    1016      NA CO           
## # ... with 30 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

Exercise

# Select the flights that had JFK as their destination: c1
  
  # flights that had JFK as their destination
  hflights$Dest[hflights$Dest == "JFK"]
##   [1] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [12] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [23] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [34] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [45] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [56] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [67] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [78] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
##  [89] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [100] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [111] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [122] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [133] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [144] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [155] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [166] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [177] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [188] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [199] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [210] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [221] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [232] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [243] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [254] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [265] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [276] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [287] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [298] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [309] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [320] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [331] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [342] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [353] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [364] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [375] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [386] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [397] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [408] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [419] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [430] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [441] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [452] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [463] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [474] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [485] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [496] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [507] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [518] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [529] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [540] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [551] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [562] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [573] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [584] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [595] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [606] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [617] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [628] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [639] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [650] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [661] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [672] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [683] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [694] "JFK" "JFK"
  length(hflights$Dest[hflights$Dest == "JFK"])
## [1] 695

  filter(hflights, Dest == "JFK")
## # A tibble: 695 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1          1         6     654    1124 B6           
##  2  2011     1          1         6    1639    2110 B6           
##  3  2011     1          2         7     703    1113 B6           
##  4  2011     1          2         7    1604    2040 B6           
##  5  2011     1          3         1     659    1100 B6           
##  6  2011     1          3         1    1801    2200 B6           
##  7  2011     1          4         2     654    1103 B6           
##  8  2011     1          4         2    1608    2034 B6           
##  9  2011     1          5         3     700    1103 B6           
## 10  2011     1          5         3    1544    1954 B6           
## # ... with 685 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>
  c1 <- filter(hflights, Dest == "JFK")

# Combine the Year, Month and DayofMonth variables to create a Date column: c2
  
  c2 <- mutate(c1, Date = paste(Year, "-", Month, "-", DayofMonth))
  head(c2$Date) # INCORRECT
## [1] "2011 - 1 - 1" "2011 - 1 - 1" "2011 - 1 - 2" "2011 - 1 - 2"
## [5] "2011 - 1 - 3" "2011 - 1 - 3"

  # using the sep attribute of paste()
  c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep = "-"))
  head(c2$Date) # CORRECT
## [1] "2011-1-1" "2011-1-1" "2011-1-2" "2011-1-2" "2011-1-3" "2011-1-3"


# Finally, select some columns to provide an overview: 
# Date, DepTime, ArrTime and TailNum, in this order. 
# Do not assign the resulting data frame to a variable; 
# just print it to the console.

# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)
## # A tibble: 695 x 4
##    Date     DepTime ArrTime TailNum
##    <chr>      <int>   <int> <chr>  
##  1 2011-1-1     654    1124 N324JB 
##  2 2011-1-1    1639    2110 N324JB 
##  3 2011-1-2     703    1113 N324JB 
##  4 2011-1-2    1604    2040 N324JB 
##  5 2011-1-3     659    1100 N229JB 
##  6 2011-1-3    1801    2200 N206JB 
##  7 2011-1-4     654    1103 N267JB 
##  8 2011-1-4    1608    2034 N267JB 
##  9 2011-1-5     700    1103 N708JB 
## 10 2011-1-5    1544    1954 N644JB 
## # ... with 685 more rows


# How many weekend flights flew a distance of more than 1000 miles but had a total taxiing time below 15 minutes?
names(hflights)
##  [1] "Year"              "Month"             "DayofMonth"       
##  [4] "DayOfWeek"         "DepTime"           "ArrTime"          
##  [7] "UniqueCarrier"     "FlightNum"         "TailNum"          
## [10] "ActualElapsedTime" "AirTime"           "ArrDelay"         
## [13] "DepDelay"          "Origin"            "Dest"             
## [16] "Distance"          "TaxiIn"            "TaxiOut"          
## [19] "Cancelled"         "CancellationCode"  "Diverted"         
## [22] "Carrier"           "Code"
head(hflights$DayOfWeek)
## [1] 6 7 1 2 3 4
filter(hflights, DayOfWeek > 5, Distance > 1000, TaxiIn + TaxiOut < 15)
## # A tibble: 1,739 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1         23         7    1535    1933 B6           
##  2  2011     1         30         7     851    1230 CO           
##  3  2011     1         30         7    2234       2 CO           
##  4  2011     1         29         6    1220    1353 CO           
##  5  2011     1         23         7     847    1213 CO           
##  6  2011     1         23         7    1224    1345 CO           
##  7  2011     1         23         7     931    1045 CO           
##  8  2011     1         22         6     942    1340 CO           
##  9  2011     1         16         7     848    1136 CO           
## 10  2011     1         16         7    1030    1406 CO           
## # ... with 1,729 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

06 - five verbs: arrange

arrange(), that reorders the rows according to single or multiple variables,

# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))

# Arrange dtc by departure delays
# i.e. shortest departure delay is at the top of the data set.
arrange(dtc, DepDelay)
## # A tibble: 68 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     7         23         6     605      NA F9           
##  2  2011     1         17         1     916      NA XE           
##  3  2011    12          1         4     541      NA US           
##  4  2011    10         12         3    2022      NA MQ           
##  5  2011     7         29         5    1424      NA CO           
##  6  2011     9         29         4    1639      NA OO           
##  7  2011     2          9         3     555      NA MQ           
##  8  2011     5          9         1     715      NA OO           
##  9  2011     1         20         4    1413      NA UA           
## 10  2011     1         17         1     831      NA WN           
## # ... with 58 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

# verify shortest departure delay is at the top 
dtc$DepDelay # informs of order in "dtc"
##  [1]  26   0  -4 135  -4   6  -9  73   8 187   2   4  28  -3  -5 156  -3
## [18]  42 548   3 131 109   5   1  87  -2  -3  -3  -5  -1  -3 110  -1  -3
## [35]  64  -6  24 271 -10  -3 110 153  -1 173  -1  37   3   0  83  -2 220
## [52]  27  64  -6  -2   1  -8  75  21 129  80  -3  64  21 103  27   0  -9
select(arrange(dtc, DepDelay),DepDelay)
## # A tibble: 68 x 1
##    DepDelay
##       <int>
##  1      -10
##  2       -9
##  3       -9
##  4       -8
##  5       -6
##  6       -6
##  7       -5
##  8       -5
##  9       -4
## 10       -4
## # ... with 58 more rows

# Arrange dtc so that 
# flights that were cancelled 
# for the same reason 
# appear next to each other.

# Arrange dtc so that cancellation reasons are grouped
names(dtc)
##  [1] "Year"              "Month"             "DayofMonth"       
##  [4] "DayOfWeek"         "DepTime"           "ArrTime"          
##  [7] "UniqueCarrier"     "FlightNum"         "TailNum"          
## [10] "ActualElapsedTime" "AirTime"           "ArrDelay"         
## [13] "DepDelay"          "Origin"            "Dest"             
## [16] "Distance"          "TaxiIn"            "TaxiOut"          
## [19] "Cancelled"         "CancellationCode"  "Diverted"         
## [22] "Carrier"           "Code"
arrange(dtc, CancellationCode)
## # A tibble: 68 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     1         20         4    1413      NA UA           
##  2  2011     1          7         5    2028      NA XE           
##  3  2011     2          4         5    1638      NA AA           
##  4  2011     2          8         2    1057      NA CO           
##  5  2011     2          1         2    1508      NA OO           
##  6  2011     2         21         1    2257      NA OO           
##  7  2011     2          9         3     555      NA MQ           
##  8  2011     3         18         5     727      NA UA           
##  9  2011     4          4         1    1632      NA DL           
## 10  2011     4          8         5    1608      NA WN           
## # ... with 58 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

# verify
select(arrange(dtc, CancellationCode),CancellationCode)
## # A tibble: 68 x 1
##    CancellationCode
##    <chr>           
##  1 A               
##  2 A               
##  3 A               
##  4 A               
##  5 A               
##  6 A               
##  7 A               
##  8 A               
##  9 A               
## 10 A               
## # ... with 58 more rows

# Arrange dtc so that flights
# by the same carrier appear next to each other
#   Within each carrier, 
#   flights that have smaller departure delays
#   appear before flights that have higher departure delays
#     Do this in a one-liner.
# Arrange dtc according to carrier and departure delays
names(dtc)
##  [1] "Year"              "Month"             "DayofMonth"       
##  [4] "DayOfWeek"         "DepTime"           "ArrTime"          
##  [7] "UniqueCarrier"     "FlightNum"         "TailNum"          
## [10] "ActualElapsedTime" "AirTime"           "ArrDelay"         
## [13] "DepDelay"          "Origin"            "Dest"             
## [16] "Distance"          "TaxiIn"            "TaxiOut"          
## [19] "Cancelled"         "CancellationCode"  "Diverted"         
## [22] "Carrier"           "Code"
arrange(dtc, UniqueCarrier, DepDelay)
## # A tibble: 68 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     8         18         4    1808      NA AA           
##  2  2011     2          4         5    1638      NA AA           
##  3  2011     7         29         5    1424      NA CO           
##  4  2011     1         26         3    1703      NA CO           
##  5  2011     8         11         4    1320      NA CO           
##  6  2011     7         25         1    1654      NA CO           
##  7  2011     1         26         3    1926      NA CO           
##  8  2011     3         31         4    1016      NA CO           
##  9  2011     2          8         2    1057      NA CO           
## 10  2011     4          4         1    1632      NA DL           
## # ... with 58 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

Reverse the order of arranging

arrange(var1) - rows from smallest to largest
arrange(desc(var1)) - rows from largest to smallest


# Arrange hflights so that 
# flights by the same carrier 
# appear next to each other 
# and within each carrier, 
#    flights that have larger departure delays 
#    appear before flights that have 
#    smaller departure delays.

# before
#   arrange(dtc, UniqueCarrier, DepDelay)
# now reverse
    arrange(hflights, UniqueCarrier, desc(DepDelay))
## # A tibble: 227,496 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011    12         12         1     650     808 AA           
##  2  2011    11         19         6    1752    1910 AA           
##  3  2011    12         22         4    1728    1848 AA           
##  4  2011    10         23         7    2305       2 AA           
##  5  2011     9         27         2    1206    1300 AA           
##  6  2011     3         17         4    1647    1747 AA           
##  7  2011     6         21         2     955    1315 AA           
##  8  2011     5         20         5    2359     130 AA           
##  9  2011     4         19         2    2023    2142 AA           
## 10  2011     5         12         4    2133      53 AA           
## # ... with 227,486 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

# Arrange flights by total delay (normal order).
arrange(hflights, DepDelay + ArrDelay)
## # A tibble: 227,496 x 23
##     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
##    <int> <int>      <int>     <int>   <int>   <int> <chr>        
##  1  2011     7          3         7    1914    2039 XE           
##  2  2011     8         31         3     934    1039 OO           
##  3  2011     8         21         7     935    1039 OO           
##  4  2011     8         28         7    2059    2206 OO           
##  5  2011     8         29         1     935    1041 OO           
##  6  2011    12         25         7     741     926 OO           
##  7  2011     1         30         7     620     812 OO           
##  8  2011     8          3         3    1741    1810 XE           
##  9  2011     8          4         4     930    1041 OO           
## 10  2011     8         18         4     939    1043 OO           
## # ... with 227,486 more rows, and 16 more variables: FlightNum <int>,
## #   TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## #   DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## #   TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## #   Diverted <int>, Carrier <chr>, Code <chr>

07 - five verbs: summarise

summarise(), which reduces each group to a single row by calculating aggregate measures.

# Print out a summary with variables min_dist and max_dist
summarise(hflights, min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 x 2
##   min_dist max_dist
##      <dbl>    <dbl>
## 1       79     3904

# Print out a summary of hflights 
# with a single variable, 
# max_div: 
# the longest Distance for diverted flights. 
# You will need one of the four other verbs to do this!
# i.e. Print out a summary with variable max_div
unique(hflights$Diverted)
## [1] 0 1
summarise(filter(hflights, Diverted == 1), max_div = max(Distance))
## # A tibble: 1 x 1
##   max_div
##     <dbl>
## 1    3904

Aggregate functions

min(x) - minimum value of vector x.
max(x) - maximum value of vector x.
mean(x) - mean value of vector x.
median(x) - median value of vector x.
quantile(x, p) - pth quantile of vector x.
sd(x) - standard deviation of vector x.
var(x) - variance of vector x.
IQR(x) - Inter Quartile Range (IQR) of vector x.
diff(range(x)) - total range of vector x.

# Remove rows that have NAs in the arrival delay column 
# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))

# Generate summary about ArrDelay column of temp1
# 1. earliest: the minimum arrival delay,
# 2. average: the average arrival delay,
# 3. latest: the longest arrival delay,
# 4. sd: the standard deviation for arrival delays.

# test:
summarise(temp1, earliest = min(temp1$ArrDelay))
## # A tibble: 1 x 1
##   earliest
##      <int>
## 1      -70

# alternatively:
summarise(temp1, earliest = min(ArrDelay))
## # A tibble: 1 x 1
##   earliest
##      <dbl>
## 1      -70

# all four
summarise(temp1, earliest = min(ArrDelay),
                 average = mean(ArrDelay),
                 latest = max(ArrDelay),
                 sd = sd(ArrDelay)
         )
## # A tibble: 1 x 4
##   earliest average latest    sd
##      <dbl>   <dbl>  <dbl> <dbl>
## 1      -70    7.09    978  30.7

# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))

# Print the maximum taxiing difference of temp2 with summarise()
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## # A tibble: 1 x 1
##   max_taxi_diff
##           <int>
## 1           160

dplyr aggregate functions

first(x) - The first element of vector x.
last(x) - The last element of vector x.
nth(x, n) - The nth element of vector x.
n() - The number of rows in the data.frame or group of observations that summarise() describes.
n_distinct(x) - The number of unique values in vector x.

# Print out a summary of hflights with the following variables:
#   n_obs: the total number of observations,
#   n_carrier: the total number of carriers,
#   n_dest: the total number of destinations,

# Generate summarizing statistics for hflights
summarise(hflights, 
          n_obs = n(), 
          n_carrier = n_distinct(UniqueCarrier), 
          n_dest = n_distinct(Dest))
## # A tibble: 1 x 3
##    n_obs n_carrier n_dest
##    <int>     <int>  <int>
## 1 227496        15    116

# All American Airline flights
aa <- filter(hflights, UniqueCarrier == "American")

# Print out a summary of aa with the following variables:
#   n_flights: the total number of flights (each observation is a flight),
#   n_canc: the total number of cancelled flights,
#   avg_delay: the average arrival delay of flights whose delay is not NA (na.rm = TRUE).

# Generate summarizing statistics for aa 
summarise(aa, 
          n_flights = n(), 
          n_canc = sum(Cancelled == 1),
          avg_delay = mean(ArrDelay, na.rm = TRUE))
## # A tibble: 1 x 3
##   n_flights n_canc avg_delay
##       <int>  <int>     <dbl>
## 1         0      0       NaN

saving intermediate results to temporary variables or nesting function calls is cumbersome and error-prone.

08 - pipe operator: chaining your functions

The %>% operator allows you to extract the first argument of a function from the arguments list and put it in front of it

# the following two commands are completely equivalent:
mean(c(1, 2, 3, NA), na.rm = TRUE)
## [1] 2
c(1, 2, 3, NA) %>% mean(na.rm = TRUE)
## [1] 2

Syntax:

# Write the 'piped' version of the English sentences.

# Take the hflights data set and then ...
hflights %>%

# Add a variable named diff that is the result of subtracting TaxiIn from TaxiOut, and then ...
mutate( diff = TaxiOut - TaxiIn ) %>%

# Pick all of the rows whose diff value does not equal NA, and then ...
# filter( diff != NA ) %>% # INCORRECT
filter( !is.na(diff) ) %>% # CORRECT

# Summarise the data set with a value named avg that is the mean diff value
summarise(avg = mean(diff))
## # A tibble: 1 x 1
##     avg
##   <dbl>
## 1  8.99

Drive or fly?


hflights %>%

    # mutate() the hflights dataset and add two variables:
    #  RealTime: 
    #   the actual elapsed time plus 100 minutes (for the overhead that flying involves) and
    #  mph: 
    #   calculated as Distance / RealTime * 60, then
    mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%

        # filter() to keep observations that have an mph that is not NA and that is below 70, finally
        filter( !is.na(mph), mph < 70) %>%

            # summarise() the result by creating four summary variables:
            # n_less, the number of observations,
            # n_dest, the number of destinations,
            # min_dist, the minimum distance and
            # max_dist, the maximum distance.
            summarise(
                      n_less = n(),
                      n_dest = n_distinct(Dest),
                      min_dist = min(Distance),
                      max_dist = max(Distance)
            )
## # A tibble: 1 x 4
##   n_less n_dest min_dist max_dist
##    <int>  <int>    <dbl>    <dbl>
## 1   6726     13       79      305

from previous exercise, some flights might be less efficient than driving in terms of speed

But is speed all that matters?

Flying imposes burdens on a traveler that driving does not:

airplane tickets are very expensive Air travelers also need to limit what they bring on their trip arrange for a pick up or a drop off

DEFINE:

preferable flights = flights that are at least 50% faster than driving,

i.e. that travel 105 mph or greater in real time

ASSUME:

cancelled or diverted flights are less preferable than driving.


hflights %>%

    mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%

        # filter() 
        # keep observations that have an mph under 105 
        # or for which Cancelled equals 1 
        # or for which Diverted equals 1.

        filter( mph < 105 | Cancelled == 1 | Diverted == 1) %>%

            # summarise() the result by creating four summary variables:
            # n_non, the number of observations,
            # n_dest, the number of destinations,
            # min_dist, the minimum distance and
            # max_dist, the maximum distance.

            summarise(
                      n_non = n(),
                      n_dest = n_distinct(Dest),
                      min_dist = min(Distance),
                      max_dist = max(Distance)
            )
## # A tibble: 1 x 4
##   n_non n_dest min_dist max_dist
##   <int>  <int>    <dbl>    <dbl>
## 1 42400    113       79     3904

The results show that almost 19% of flights appear less desirable than simply driving to the destination, which is rather surprising!

How many flights were overnight flights?

hflights %>%
filter(!is.na(DepTime), !is.na(ArrTime), DepTime> ArrTime) %>%
summarise(num = n()) # 265 flights
## # A tibble: 1 x 1
##     num
##   <int>
## 1  2718

09 - group_by

Ordered per-carrier summary

# Make an ordered per-carrier summary of hflights
hflights %>%
  group_by(UniqueCarrier) %>%
  summarise(p_canc = mean(Cancelled == 1) * 100, 
                   # percentage of cancelled flights
            avg_delay = mean(ArrDelay, na.rm = TRUE)) %>%
                   # average arrival delay of flights 
                   # whose delay does not equal NA
  arrange(avg_delay, p_canc)
## # A tibble: 15 x 3
##    UniqueCarrier p_canc avg_delay
##    <chr>          <dbl>     <dbl>
##  1 US             1.13     -0.631
##  2 AA             1.85      0.892
##  3 FL             0.982     1.85 
##  4 AS             0         3.19 
##  5 YV             1.27      4.01 
##  6 DL             1.59      6.08 
##  7 CO             0.678     6.10 
##  8 MQ             2.90      7.15 
##  9 EV             3.45      7.26 
## 10 WN             1.55      7.59 
## 11 F9             0.716     7.67 
## 12 XE             1.55      8.19 
## 13 OO             1.39      8.69 
## 14 B6             2.59      9.86 
## 15 UA             1.64     10.5
  
# SkyWest had the longest average arrival delay
# Here, we didn't use mutate() 
#       yet: when you combine it with group_by() you can do great things!

Ordered average arrival delays

# Ordered overview of average arrival delays per carrier
hflights %>%
  filter( !is.na(ArrDelay),  ArrDelay > 0 ) %>%
  group_by(UniqueCarrier) %>%
  summarise(avg = mean(ArrDelay)) %>%
  mutate(rank = rank(avg)) %>%
  arrange(rank)
## # A tibble: 15 x 3
##    UniqueCarrier   avg  rank
##    <chr>         <dbl> <dbl>
##  1 YV             18.7     1
##  2 F9             18.7     2
##  3 US             20.7     3
##  4 CO             22.1     4
##  5 AS             22.9     5
##  6 OO             24.1     6
##  7 XE             24.2     7
##  8 WN             25.3     8
##  9 FL             27.9     9
## 10 AA             28.5    10
## 11 DL             32.1    11
## 12 UA             32.5    12
## 13 MQ             38.8    13
## 14 EV             40.2    14
## 15 B6             45.5    15

No. airplanes flew to one destination

# How many airplanes only flew to one destination?
hflights %>%
  group_by(TailNum) %>%
  summarise(ndest = n_distinct(Dest)) %>%
  filter(ndest == 1) %>%
  summarise(nplanes = n())
## # A tibble: 1 x 1
##   nplanes
##     <int>
## 1    1526

Most visited destination

# Find the most visited destination for each carrier
hflights %>%
  group_by(UniqueCarrier, Dest) %>%
  summarise(n = n()
            # how often a carrier visited a particular destination
            ) %>%
  mutate(rank = rank(desc(n))) %>%
  filter(rank == 1)
## # A tibble: 15 x 4
## # Groups:   UniqueCarrier [15]
##    UniqueCarrier Dest      n  rank
##    <chr>         <chr> <int> <dbl>
##  1 AA            DFW    2105     1
##  2 AS            SEA     365     1
##  3 B6            JFK     695     1
##  4 CO            EWR    3924     1
##  5 DL            ATL    2396     1
##  6 EV            DTW     851     1
##  7 F9            DEN     837     1
##  8 FL            ATL    2029     1
##  9 MQ            DFW    2424     1
## 10 OO            COS    1335     1
## 11 UA            SFO     643     1
## 12 US            CLT    2212     1
## 13 WN            DAL    8243     1
## 14 XE            CRP    3175     1
## 15 YV            CLT      71     1

10 - dplyr and databases

data.table

# from section one
# hflights <- tbl_df(hflights)
class(hflights)
## [1] "tbl_df"     "tbl"        "data.frame"

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
hflights2 <- as.data.table(hflights)

class(hflights2)
## [1] "data.table" "data.frame"

# Even though hflights2 is a different data structure, 
# you can use the same dplyr functions 
# to manipulate hflights2 
# as you used to manipulate hflights.

# total number of unique carriers

hflights2 %>%
  summarise(n_carrier = n_distinct(UniqueCarrier))
##   n_carrier
## 1        15

# OR

  summarise(hflights2, n_carrier = n_distinct(UniqueCarrier))
##   n_carrier
## 1        15

DataCamp hosts a mySQL database

library(RMySQL)
## Loading required package: DBI
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql


# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr", 
                   host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                   port = 3306, 
                   user = "student",
                   password = "datacamp")

# data about flights that departed from New York City in 2013
# data is similar to the data in hflights
# but it does not contain information about cancellations or diversions

# Reference a MySQL table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")

# Behind the scenes, dplyr will convert the commands to the database's native language (in this case, SQL), and return the results

# glimpse at nycflights
glimpse(nycflights)
## Observations: ??
## Variables: 17
## $ id        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time  <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay <int> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time  <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay <int> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier   <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum   <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight    <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin    <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest      <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time  <int> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance  <int> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour      <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute    <int> 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...

# Ordered, grouped summary of nycflights

  nycflights %>%
    group_by(carrier) %>%
      summarise(n_flights = n(),
            avg_delay = mean(arr_delay)
            )  %>%
        arrange(avg_delay)
## Warning: Missing values are always removed in SQL.
## Use `AVG(x, na.rm = TRUE)` to silence this warning
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## # Source:     lazy query [?? x 3]
## # Database:   mysql 5.6.34-log
## #   [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## # Ordered by: avg_delay
##    carrier n_flights avg_delay
##    <chr>       <dbl>     <dbl>
##  1 AS            714    -9.86 
##  2 HA            342    -6.92 
##  3 AA          32729     0.356
##  4 DL          48110     1.63 
##  5 VX           5162     1.75 
##  6 US          20536     2.06 
##  7 UA          58665     3.50 
##  8 9E          18460     6.91 
##  9 B6          54635     9.36 
## 10 WN          12275     9.47 
## # ... with more rows

9 - dplyr (data manipulation)

Kunal Haira

2018-05-11