Prerequisites - Loading packages

library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## nycflights13
nycflights13::flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Sections: Introduction, Prerequisites, nycflights13, dplyr Basics, Filter Rows with filter(), Comparisons, Logical Operators, Missing Values.

Exercise 1.1 Had an arrival delay of two or more hours

filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
options(tibble.width=Inf)

Exercise 1.2 Flew to Houston (IAH or HOU)

filter(flights, dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
##  2        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
##  3         1 UA         496 N459UA  LGA    IAH        229     1416     6     27
##  4         3 UA         473 N488UA  LGA    IAH        238     1416     7     32
##  5        26 UA        1479 N37408  EWR    IAH        249     1400     7     39
##  6         9 UA        1220 N12216  EWR    IAH        233     1400     9      8
##  7        11 UA        1004 N76508  LGA    IAH        237     1416    10     26
##  8         1 UA         455 N667UA  EWR    IAH        229     1400    10     45
##  9       145 UA        1086 N76502  LGA    IAH        248     1416     9      0
## 10        -2 UA        1461 N39418  EWR    IAH        221     1400    12      0
##    time_hour          
##    <dttm>             
##  1 2013-01-01 05:00:00
##  2 2013-01-01 05:00:00
##  3 2013-01-01 06:00:00
##  4 2013-01-01 07:00:00
##  5 2013-01-01 07:00:00
##  6 2013-01-01 09:00:00
##  7 2013-01-01 10:00:00
##  8 2013-01-01 10:00:00
##  9 2013-01-01 09:00:00
## 10 2013-01-01 12:00:00
## # ... with 9,303 more rows

Exercise 1.3 Were operated by United, American, or Delta

filter(flights, carrier %in% c("UA","AA","DL"))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            600        -1      941            910
## 10  2013     1     1      559            600        -1      854            902
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
##  2        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
##  3        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
##  4       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
##  5        12 UA        1696 N39463  EWR    ORD        150      719     5     58
##  6         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
##  7         7 UA         194 N29129  JFK    LAX        345     2475     6      0
##  8       -14 UA        1124 N53441  EWR    SFO        361     2565     6      0
##  9        31 AA         707 N3DUAA  LGA    DFW        257     1389     6      0
## 10        -8 UA        1187 N76515  EWR    LAS        337     2227     6      0
##    time_hour          
##    <dttm>             
##  1 2013-01-01 05:00:00
##  2 2013-01-01 05:00:00
##  3 2013-01-01 05:00:00
##  4 2013-01-01 06:00:00
##  5 2013-01-01 05:00:00
##  6 2013-01-01 06:00:00
##  7 2013-01-01 06:00:00
##  8 2013-01-01 06:00:00
##  9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ... with 139,494 more rows

Exercise 1.4 Departed in summer (July, August, and September)

filter(flights, month %in% c(7,8,9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1       157 B6         915 N653JB  JFK    SFO        315     2586    20     29
##  2         0 B6        1503 N805JB  JFK    SJU        200     1598    23     59
##  3       110 B6         234 N348JB  JFK    BTV         66      266    22     45
##  4       188 B6        1371 N794JB  LGA    FLL        143     1076    21     30
##  5       120 AA         185 N324AA  JFK    LAX        297     2475    21     50
##  6       186 B6         165 N640JB  JFK    PDX        304     2454    20     51
##  7       243 VX         415 N627VA  JFK    LAX        298     2475    20      1
##  8       172 B6         425 N535JB  JFK    TPA        140     1005    21     55
##  9       177 B6        1183 N531JB  JFK    MCO        126      944    21     46
## 10       122 B6         623 N663JB  JFK    LAX        304     2475    22     45
##    time_hour          
##    <dttm>             
##  1 2013-07-01 20:00:00
##  2 2013-07-01 23:00:00
##  3 2013-07-01 22:00:00
##  4 2013-07-01 21:00:00
##  5 2013-07-01 21:00:00
##  6 2013-07-01 20:00:00
##  7 2013-07-01 20:00:00
##  8 2013-07-01 21:00:00
##  9 2013-07-01 21:00:00
## 10 2013-07-01 22:00:00
## # ... with 86,316 more rows

Exercise 1.5 Arrived more than two hours late, but didn’t leave late

filter(flights, arr_delay > 120 & dep_delay <= 0)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    27     1419           1420        -1     1754           1550
##  2  2013    10     7     1350           1350         0     1736           1526
##  3  2013    10     7     1357           1359        -2     1858           1654
##  4  2013    10    16      657            700        -3     1258           1056
##  5  2013    11     1      658            700        -2     1329           1015
##  6  2013     3    18     1844           1847        -3       39           2219
##  7  2013     4    17     1635           1640        -5     2049           1845
##  8  2013     4    18      558            600        -2     1149            850
##  9  2013     4    18      655            700        -5     1213            950
## 10  2013     5    22     1827           1830        -3     2217           2010
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1       124 MQ        3728 N1EAMQ  EWR    ORD        135      719    14     20
##  2       130 EV        5181 N611QX  LGA    MSN        117      812    13     50
##  3       124 AA        1151 N3CMAA  LGA    DFW        192     1389    13     59
##  4       122 B6           3 N703JB  JFK    SJU        225     1598     7      0
##  5       194 VX         399 N629VA  JFK    LAX        336     2475     7      0
##  6       140 UA         389 N560UA  JFK    SFO        386     2586    18     47
##  7       124 MQ        4540 N721MQ  LGA    DTW        130      502    16     40
##  8       179 AA         707 N3EXAA  LGA    DFW        234     1389     6      0
##  9       143 AA        2083 N565AA  EWR    DFW        230     1372     7      0
## 10       127 MQ        4674 N518MQ  LGA    CLE         90      419    18     30
##    time_hour          
##    <dttm>             
##  1 2013-01-27 14:00:00
##  2 2013-10-07 13:00:00
##  3 2013-10-07 13:00:00
##  4 2013-10-16 07:00:00
##  5 2013-11-01 07:00:00
##  6 2013-03-18 18:00:00
##  7 2013-04-17 16:00:00
##  8 2013-04-18 06:00:00
##  9 2013-04-18 07:00:00
## 10 2013-05-22 18:00:00
## # ... with 19 more rows

Exercise 1.6 Were delayed by at least an hour, but made up over 30 minutes in flight

filter(flights, dep_delay >= 60 & arr_delay < 30)
## # A tibble: 206 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     3     1850           1745        65     2148           2120
##  2  2013     1     3     1950           1845        65     2228           2227
##  3  2013     1     3     2015           1915        60     2135           2111
##  4  2013     1     6     1019            900        79     1558           1530
##  5  2013     1     7     1543           1430        73     1758           1735
##  6  2013     1    11     1020            920        60     1311           1245
##  7  2013     1    12     1706           1600        66     1949           1927
##  8  2013     1    12     1953           1845        68     2154           2137
##  9  2013     1    19     1456           1355        61     1636           1615
## 10  2013     1    21     1531           1430        61     1843           1815
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        28 AA         177 N332AA  JFK    SFO        314     2586    17     45
##  2         1 B6          91 N636JB  JFK    OAK        319     2576    18     45
##  3        24 9E        3525 N903XJ  JFK    ORD        110      740    19     15
##  4        28 HA          51 N385HA  JFK    HNL        611     4983     9      0
##  5        23 AA         883 N4XBAA  EWR    DFW        177     1372    14     30
##  6        26 AA         721 N434AA  LGA    DFW        201     1389     9     20
##  7        22 DL        2139 N982DL  LGA    MIA        141     1096    16      0
##  8        17 9E        3369 N919XJ  JFK    MSY        165     1182    18     45
##  9        21 EV        5469 N724EV  LGA    MEM        137      963    13     55
## 10        28 DL        2126 N710TW  JFK    SFO        345     2586    14     30
##    time_hour          
##    <dttm>             
##  1 2013-01-03 17:00:00
##  2 2013-01-03 18:00:00
##  3 2013-01-03 19:00:00
##  4 2013-01-06 09:00:00
##  5 2013-01-07 14:00:00
##  6 2013-01-11 09:00:00
##  7 2013-01-12 16:00:00
##  8 2013-01-12 18:00:00
##  9 2013-01-19 13:00:00
## 10 2013-01-21 14:00:00
## # ... with 196 more rows

Exercise 3 How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

filter(flights, is.na(dep_time))
## # A tibble: 8,255 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        NA EV        4308 N18120  EWR    RDU         NA      416    16     30
##  2        NA AA         791 N3EHAA  LGA    DFW         NA     1389    19     35
##  3        NA AA        1925 N3EVAA  LGA    MIA         NA     1096    15      0
##  4        NA B6         125 N618JB  JFK    FLL         NA     1069     6      0
##  5        NA EV        4352 N10575  EWR    CVG         NA      569    15     40
##  6        NA EV        4406 N13949  EWR    PIT         NA      319    16     20
##  7        NA EV        4434 N10575  EWR    MHT         NA      209    13     55
##  8        NA EV        4935 N759EV  EWR    ATL         NA      746    14     20
##  9        NA EV        3849 N13550  EWR    IND         NA      645    13     21
## 10        NA AA         133 <NA>    JFK    LAX         NA     2475    15     45
##    time_hour          
##    <dttm>             
##  1 2013-01-01 16:00:00
##  2 2013-01-01 19:00:00
##  3 2013-01-01 15:00:00
##  4 2013-01-01 06:00:00
##  5 2013-01-02 15:00:00
##  6 2013-01-02 16:00:00
##  7 2013-01-02 13:00:00
##  8 2013-01-02 14:00:00
##  9 2013-01-02 13:00:00
## 10 2013-01-02 15:00:00
## # ... with 8,245 more rows

Arrival time (arr_time) is also missing for these flights along with departure time (dep_time). This could represent that the flights were either canceled or rescheduled

Sections: Arrange Rows with Rows()

Exercise 1 How could you use arrange() to sort all missing values to the start?

arrange(flights, desc(is.na(dep_time)))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        NA EV        4308 N18120  EWR    RDU         NA      416    16     30
##  2        NA AA         791 N3EHAA  LGA    DFW         NA     1389    19     35
##  3        NA AA        1925 N3EVAA  LGA    MIA         NA     1096    15      0
##  4        NA B6         125 N618JB  JFK    FLL         NA     1069     6      0
##  5        NA EV        4352 N10575  EWR    CVG         NA      569    15     40
##  6        NA EV        4406 N13949  EWR    PIT         NA      319    16     20
##  7        NA EV        4434 N10575  EWR    MHT         NA      209    13     55
##  8        NA EV        4935 N759EV  EWR    ATL         NA      746    14     20
##  9        NA EV        3849 N13550  EWR    IND         NA      645    13     21
## 10        NA AA         133 <NA>    JFK    LAX         NA     2475    15     45
##    time_hour          
##    <dttm>             
##  1 2013-01-01 16:00:00
##  2 2013-01-01 19:00:00
##  3 2013-01-01 15:00:00
##  4 2013-01-01 06:00:00
##  5 2013-01-02 15:00:00
##  6 2013-01-02 16:00:00
##  7 2013-01-02 13:00:00
##  8 2013-01-02 14:00:00
##  9 2013-01-02 13:00:00
## 10 2013-01-02 15:00:00
## # ... with 336,766 more rows
arrange(flights, desc(is.na(arr_time)))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     2016           1930        46       NA           2220
##  2  2013     1     1       NA           1630        NA       NA           1815
##  3  2013     1     1       NA           1935        NA       NA           2240
##  4  2013     1     1       NA           1500        NA       NA           1825
##  5  2013     1     1       NA            600        NA       NA            901
##  6  2013     1     2     2041           2045        -4       NA           2359
##  7  2013     1     2     2145           2129        16       NA             33
##  8  2013     1     2       NA           1540        NA       NA           1747
##  9  2013     1     2       NA           1620        NA       NA           1746
## 10  2013     1     2       NA           1355        NA       NA           1459
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        NA EV        4204 N14168  EWR    OKC         NA     1325    19     30
##  2        NA EV        4308 N18120  EWR    RDU         NA      416    16     30
##  3        NA AA         791 N3EHAA  LGA    DFW         NA     1389    19     35
##  4        NA AA        1925 N3EVAA  LGA    MIA         NA     1096    15      0
##  5        NA B6         125 N618JB  JFK    FLL         NA     1069     6      0
##  6        NA B6         147 N630JB  JFK    RSW         NA     1074    20     45
##  7        NA UA        1299 N12221  EWR    RSW         NA     1068    21     29
##  8        NA EV        4352 N10575  EWR    CVG         NA      569    15     40
##  9        NA EV        4406 N13949  EWR    PIT         NA      319    16     20
## 10        NA EV        4434 N10575  EWR    MHT         NA      209    13     55
##    time_hour          
##    <dttm>             
##  1 2013-01-01 19:00:00
##  2 2013-01-01 16:00:00
##  3 2013-01-01 19:00:00
##  4 2013-01-01 15:00:00
##  5 2013-01-01 06:00:00
##  6 2013-01-02 20:00:00
##  7 2013-01-02 21:00:00
##  8 2013-01-02 15:00:00
##  9 2013-01-02 16:00:00
## 10 2013-01-02 13:00:00
## # ... with 336,766 more rows

Exercise 2 Sort flights to find the most delayed flights. Find the flights that left earliest

arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1      1272 HA          51 N384HA  JFK    HNL        640     4983     9      0
##  2      1127 MQ        3535 N504MQ  JFK    CMH         74      483    19     35
##  3      1109 MQ        3695 N517MQ  EWR    ORD        111      719    16     35
##  4      1007 AA         177 N338AA  JFK    SFO        354     2586    18     45
##  5       989 MQ        3075 N665MQ  JFK    CVG         96      589    16      0
##  6       931 DL        2391 N959DL  JFK    TPA        139     1005    19      0
##  7       915 DL        2119 N927DA  LGA    MSP        167     1020     8     10
##  8       850 DL        2007 N3762Y  JFK    PDX        313     2454    19      0
##  9       895 DL        2047 N6716C  LGA    ATL        109      762     7     59
## 10       878 AA         172 N5DMAA  EWR    MIA        149     1085    17      0
##    time_hour          
##    <dttm>             
##  1 2013-01-09 09:00:00
##  2 2013-06-15 19:00:00
##  3 2013-01-10 16:00:00
##  4 2013-09-20 18:00:00
##  5 2013-07-22 16:00:00
##  6 2013-04-10 19:00:00
##  7 2013-03-17 08:00:00
##  8 2013-06-27 19:00:00
##  9 2013-07-22 07:00:00
## 10 2013-12-05 17:00:00
## # ... with 336,766 more rows
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        48 B6          97 N592JB  JFK    DEN        265     1626    21     23
##  2       -58 DL        1715 N612DL  LGA    MSY        162     1183    20     55
##  3       -10 EV        5713 N825AS  LGA    IAD         52      229    14     40
##  4       -10 DL        1435 N934DL  LGA    TPA        139     1010    19     30
##  5       -10 F9         837 N208FR  LGA    DEN        250     1620    17     30
##  6         7 MQ        3478 N711MQ  LGA    DTW         88      502     7     55
##  7         0 EV        4361 N13994  EWR    TYS        111      631    19     32
##  8       -37 MQ        4573 N725MQ  LGA    DTW         87      502    20     55
##  9       -30 9E        3318 N929XJ  JFK    BUF         55      301    14     55
## 10       -44 B6         375 N531JB  LGA    FLL        150     1076     9     58
##    time_hour          
##    <dttm>             
##  1 2013-12-07 21:00:00
##  2 2013-02-03 20:00:00
##  3 2013-11-10 14:00:00
##  4 2013-01-11 19:00:00
##  5 2013-01-29 17:00:00
##  6 2013-08-09 07:00:00
##  7 2013-10-23 19:00:00
##  8 2013-03-30 20:00:00
##  9 2013-03-02 14:00:00
## 10 2013-05-05 09:00:00
## # ... with 336,766 more rows

Flight that left earliest was B6 carrier flight 97 and HA carrier flight 51 was the most delayed.

Sections: Select columns with select()

Exercise 3 What does the any_of() function do? Why might it be helpful in conjunction with this vector?

vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, any_of(vars))
## # A tibble: 336,776 x 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # ... with 336,766 more rows

any_of() and all_of() functions have replaced one_of function in R. all_of() will throw an error if even one of the variables in missing, but any_of() will ignore any missing variables present in the data frame.

Sections: Add new variables with mutate(), Useful Creation Functions

Exercise 2 Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

flightsairtime <- mutate(flights, 
                  dep_time = (dep_time %/% 100*60 + dep_time %% 100) %% 1440,
                  arr_time = (arr_time %/% 100*60 + arr_time %% 100) %% 1440,
                  )
nrow(filter(flightsairtime, air_time != arr_time - dep_time ))
## [1] 327150

I expected to see that air_time = arr_time - dep_time. However, I see that there are 327150 rows where airtime != arr_time - dep_time. Time zone differences could be causing this difference. Also, another reason could be flights crossing over midnight to go to the destination and that means it is next day.

Exercise 4 Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

delayed_flights <- mutate(flights, minrank_dep_delay = min_rank(desc(dep_delay)))
delayed_flights <- arrange(delayed_flights, minrank_dep_delay)
head(delayed_flights, 10)
## # A tibble: 10 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1      1272 HA          51 N384HA  JFK    HNL        640     4983     9      0
##  2      1127 MQ        3535 N504MQ  JFK    CMH         74      483    19     35
##  3      1109 MQ        3695 N517MQ  EWR    ORD        111      719    16     35
##  4      1007 AA         177 N338AA  JFK    SFO        354     2586    18     45
##  5       989 MQ        3075 N665MQ  JFK    CVG         96      589    16      0
##  6       931 DL        2391 N959DL  JFK    TPA        139     1005    19      0
##  7       915 DL        2119 N927DA  LGA    MSP        167     1020     8     10
##  8       850 DL        2007 N3762Y  JFK    PDX        313     2454    19      0
##  9       895 DL        2047 N6716C  LGA    ATL        109      762     7     59
## 10       878 AA         172 N5DMAA  EWR    MIA        149     1085    17      0
##    time_hour           minrank_dep_delay
##    <dttm>                          <int>
##  1 2013-01-09 09:00:00                 1
##  2 2013-06-15 19:00:00                 2
##  3 2013-01-10 16:00:00                 3
##  4 2013-09-20 18:00:00                 4
##  5 2013-07-22 16:00:00                 5
##  6 2013-04-10 19:00:00                 6
##  7 2013-03-17 08:00:00                 7
##  8 2013-06-27 19:00:00                 8
##  9 2013-07-22 07:00:00                 9
## 10 2013-12-05 17:00:00                10

Exercise 5 What does 1:3 + 1:10 return? Why?

1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
##  [1]  2  4  6  5  7  9  8 10 12 11

Sections: Grouped summaries with summarize(), Combining multiple operations with the Pipe, Missing Values, Counts, Useful Summary Functions, Grouping by Multiple Variables, Ungrouping

Exercise 5 Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not?

flights %>%
  group_by(carrier) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 x 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93

Exercise 6 What does the sort argument to count() do. When might you use it?

flights %>%
  count(carrier, sort = TRUE)
## # A tibble: 16 x 2
##    carrier     n
##    <chr>   <int>
##  1 UA      58665
##  2 B6      54635
##  3 EV      54173
##  4 DL      48110
##  5 AA      32729
##  6 MQ      26397
##  7 US      20536
##  8 9E      18460
##  9 WN      12275
## 10 VX       5162
## 11 FL       3260
## 12 AS        714
## 13 F9        685
## 14 YV        601
## 15 HA        342
## 16 OO         32

Sort argument to count() sorts the output, that is, the number of occurrences from highest to lowest. Sort argument in this code is sorting the carriers by the number of times it appears in the dataframe. It is useful when we want to identify which row is appearing often in the dataframe.

Sections: Grouped Mutates (and Filters)

Exercise 2 Which plane (tailnum) has the worst on-time record?

flights %>% 
  group_by(tailnum) %>%
  filter(dep_delay > 0) %>%
  summarise(avg_delay = mean(dep_delay)) %>%
  arrange(desc(avg_delay))
## # A tibble: 3,885 x 2
##    tailnum avg_delay
##    <chr>       <dbl>
##  1 N844MH       297 
##  2 N452UW       291 
##  3 N922EV       274 
##  4 N587NW       272 
##  5 N911DA       268 
##  6 N665MQ       268.
##  7 N673MQ       257 
##  8 N851NW       233 
##  9 N654UA       227 
## 10 N550NW       212.
## # ... with 3,875 more rows

Based on the above results, N844MH has worst on-time record.

Exercise 4 For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination.

flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest) %>%
  mutate(sum_arr_delay = sum(arr_delay),
         proportion_arr_delay = arr_delay / sum_arr_delay
         ) %>%
  select(dest, month, day, dep_time, carrier, flight,
         arr_delay, proportion_arr_delay) %>%
  arrange(dest, desc(proportion_arr_delay))
## # A tibble: 133,004 x 8
## # Groups:   dest [103]
##    dest  month   day dep_time carrier flight arr_delay proportion_arr_delay
##    <chr> <int> <int>    <int> <chr>    <int>     <dbl>                <dbl>
##  1 ABQ       7    22     2145 B6        1505       153               0.0341
##  2 ABQ      12    14     2223 B6          65       149               0.0332
##  3 ABQ      10    15     2146 B6          65       138               0.0308
##  4 ABQ       7    23     2206 B6        1505       137               0.0305
##  5 ABQ      12    17     2220 B6          65       136               0.0303
##  6 ABQ       7    10     2025 B6        1505       126               0.0281
##  7 ABQ       7    30     2212 B6        1505       118               0.0263
##  8 ABQ       7    28     2038 B6        1505       117               0.0261
##  9 ABQ      12     8     2049 B6          65       114               0.0254
## 10 ABQ       9     2     2212 B6        1505       109               0.0243
## # ... with 132,994 more rows

Exercise 7 Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

flights %>%
   group_by(dest) %>%
   mutate(num_carriers = n_distinct(carrier)) %>%
   filter(num_carriers >= 2) %>%
   group_by(carrier) %>%
   summarise(num_dest = n_distinct(dest)) %>%
   arrange(desc(num_dest))
## # A tibble: 16 x 2
##    carrier num_dest
##    <chr>      <int>
##  1 EV            51
##  2 9E            48
##  3 UA            42
##  4 DL            39
##  5 B6            35
##  6 AA            19
##  7 MQ            19
##  8 WN            10
##  9 OO             5
## 10 US             5
## 11 VX             4
## 12 YV             3
## 13 FL             2
## 14 AS             1
## 15 F9             1
## 16 HA             1