library(nycflights13)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

nycflights13

Note: you will probablly have to install nycflights13 using install.packages and the load it with the library command. nycflights13 is a relational database containing the following tables (data frames). This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley

data frame description
airlines Airline names
airports Airport metadata
flights Flights data
planes Planes meta data
weather Hourly data

This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley. You should start reading this chapter now, and try to complete reading it by the end of this week.

flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Task 1

  1. Change your name to author name in the YAML block at the top of this page.
  2. Install nycflights13 if necessary.
  3. Do the Exercises in section 5.2.4 of the Wickham book you will have to read the material prior to the exercises, though you have probably seen all this in the preceeding sections.

Write up your solutions to the exercises in 5.2.4 in this document, including the code chunks you use to determine the answer.

5.2.4 Exercises

1. Find all flights that

1. Had an arrival delay of two or more hours

flights %>%
  filter(arr_delay >= 120)
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

2. Flew to Houston (IAH or HOU)

filter(flights, dest == "IAH")
## # A tibble: 7,198 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
## # … with 7,188 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest == "HOU")
## # A tibble: 2,115 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1208           1158        10     1540           1502
##  2  2013     1     1     1306           1300         6     1622           1610
##  3  2013     1     1     1708           1700         8     2037           2005
##  4  2013     1     1     2030           2035        -5     2354           2342
##  5  2013     1     2      734            700        34     1045           1025
##  6  2013     1     2     1156           1158        -2     1517           1502
##  7  2013     1     2     1319           1305        14     1633           1615
##  8  2013     1     2     1810           1655        75     2146           2000
##  9  2013     1     2     2031           2035        -4     2353           2342
## 10  2013     1     3      704            700         4     1036           1025
## # … with 2,105 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. Were operated by United, American, or Delta

filter(flights, carrier == "UA")
## # A tibble: 58,665 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      558            600        -2      924            917
##  5  2013     1     1      558            600        -2      923            937
##  6  2013     1     1      559            600        -1      854            902
##  7  2013     1     1      607            607         0      858            915
##  8  2013     1     1      611            600        11      945            931
##  9  2013     1     1      623            627        -4      933            932
## 10  2013     1     1      628            630        -2     1016            947
## # … with 58,655 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, carrier == "AA")
## # A tibble: 32,729 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      558            600        -2      753            745
##  3  2013     1     1      559            600        -1      941            910
##  4  2013     1     1      606            610        -4      858            910
##  5  2013     1     1      623            610        13      920            915
##  6  2013     1     1      628            630        -2     1137           1140
##  7  2013     1     1      629            630        -1      824            810
##  8  2013     1     1      635            635         0     1028            940
##  9  2013     1     1      656            700        -4      854            850
## 10  2013     1     1      656            659        -3      949            959
## # … with 32,719 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, carrier == "DL")
## # A tibble: 48,110 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      554            600        -6      812            837
##  2  2013     1     1      602            610        -8      812            820
##  3  2013     1     1      606            610        -4      837            845
##  4  2013     1     1      615            615         0      833            842
##  5  2013     1     1      653            700        -7      936           1009
##  6  2013     1     1      655            655         0     1021           1030
##  7  2013     1     1      655            700        -5     1037           1045
##  8  2013     1     1      655            700        -5     1002           1020
##  9  2013     1     1      657            700        -3      959           1013
## 10  2013     1     1      658            700        -2      944            939
## # … with 48,100 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

4. Departed in summer (July, August, and September)

filter(flights, month == 7 | month == 8 | month == 9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

5. Arrived more than two hours late, but didn’t leave late

filter(flights, arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    27     1419           1420        -1     1754           1550
##  2  2013    10     7     1350           1350         0     1736           1526
##  3  2013    10     7     1357           1359        -2     1858           1654
##  4  2013    10    16      657            700        -3     1258           1056
##  5  2013    11     1      658            700        -2     1329           1015
##  6  2013     3    18     1844           1847        -3       39           2219
##  7  2013     4    17     1635           1640        -5     2049           1845
##  8  2013     4    18      558            600        -2     1149            850
##  9  2013     4    18      655            700        -5     1213            950
## 10  2013     5    22     1827           1830        -3     2217           2010
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

6. Were delayed by at least an hour, but made up over 30 minutes in flight

filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     2205           1720       285       46           2040
##  2  2013     1     1     2326           2130       116      131             18
##  3  2013     1     3     1503           1221       162     1803           1555
##  4  2013     1     3     1839           1700        99     2056           1950
##  5  2013     1     3     1850           1745        65     2148           2120
##  6  2013     1     3     1941           1759       102     2246           2139
##  7  2013     1     3     1950           1845        65     2228           2227
##  8  2013     1     3     2015           1915        60     2135           2111
##  9  2013     1     3     2257           2000       177       45           2224
## 10  2013     1     4     1917           1700       137     2135           1950
## # … with 1,834 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

7. Departed between midnight and 6am (inclusive)

filter(flights, between(dep_time, 0000, 0600))
## # A tibble: 9,344 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 9,334 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

2. Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?

Answer) Between is a short way of testing two inequalities at once. For example, between(x, y, z): x >= y, x <= z
filter(flights, between(month, 7, 9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, !between(dep_time, 0601, 2359))
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

Answer) 8255 flights have a missing dep_time.
8255 have a missing dep_delay, 8713 have a missing arr_time, 9430 have a missing arr_delay, and 9430 have a missing air_time.
summary(flights$dep_time)    # NA's means missing data
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1     907    1401    1349    1744    2400    8255
summary(flights)  
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 

4. Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)

Answer) 0 ^ 0 = 1 because all numbers to powers of zero are 1. Also, 1 or anything other than 0 means TRUE and 0 means FALSE.
NA ^ 0
## [1] 1
NA | TRUE
## [1] TRUE

Task 2

Exercises in 5.3.1

Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.

1. How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).

arrange(flights, desc(is.na(dep_time)), dep_time)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

2. Sort flights to find the most delayed flights. Find the flights that left earliest.

Answer) The most delayed flight is HA51, JFK to HNL, which was delayed 1,301 minutes, and the earliest leaving flight was B697, JFK to DEN, which was left 43 minutes early.
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. Sort flights to find the fastest (highest speed) flights.

Answer) The fastest flight is DL1499, LGA to ATL.
arrange(flights, desc(distance / air_time))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     5    25     1709           1700         9     1923           1937
##  2  2013     7     2     1558           1513        45     1745           1719
##  3  2013     5    13     2040           2025        15     2225           2226
##  4  2013     3    23     1914           1910         4     2045           2043
##  5  2013     1    12     1559           1600        -1     1849           1917
##  6  2013    11    17      650            655        -5     1059           1150
##  7  2013     2    21     2355           2358        -3      412            438
##  8  2013    11    17      759            800        -1     1212           1255
##  9  2013    11    16     2003           1925        38       17             36
## 10  2013    11    16     2349           2359       -10      402            440
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

4. Which flights traveled the farthest? Which traveled the shortest?

Answer) The farthest flight is HA51, JFK to HNL, which is 4,983 miles, and the shortest flight is US1632, EWR to LGA, which is 17 miles.
arrange(flights, desc(distance))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      857            900        -3     1516           1530
##  2  2013     1     2      909            900         9     1525           1530
##  3  2013     1     3      914            900        14     1504           1530
##  4  2013     1     4      900            900         0     1516           1530
##  5  2013     1     5      858            900        -2     1519           1530
##  6  2013     1     6     1019            900        79     1558           1530
##  7  2013     1     7     1042            900       102     1620           1530
##  8  2013     1     8      901            900         1     1504           1530
##  9  2013     1     9      641            900      1301     1242           1530
## 10  2013     1    10      859            900        -1     1449           1530
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, distance)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7    27       NA            106        NA       NA            245
##  2  2013     1     3     2127           2129        -2     2222           2224
##  3  2013     1     4     1240           1200        40     1333           1306
##  4  2013     1     4     1829           1615       134     1937           1721
##  5  2013     1     4     2128           2129        -1     2218           2224
##  6  2013     1     5     1155           1200        -5     1241           1306
##  7  2013     1     6     2125           2129        -4     2224           2224
##  8  2013     1     7     2124           2129        -5     2212           2224
##  9  2013     1     8     2127           2130        -3     2304           2225
## 10  2013     1     9     2126           2129        -3     2217           2224
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Task 3

Exercises 5.4.1

Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.

1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows
select(flights, c("dep_time", "dep_delay", "arr_time", "arr_delay"))
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows
select(flights, c(dep_time, dep_delay, arr_time, arr_delay))
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows
select(flights, starts_with("dep_"), starts_with("arr_"))
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows

2. What happens if you include the name of a variable multiple times in a select() call?

Answer) If I include the name of a variable multiple times in a select(), it shows only once.
select(flights, dep_time, dep_delay, arr_time, arr_delay, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows

3. What does the any_of() function do? Why might it be helpful in conjunction with this vector?

Answer) We don’t need to input the variables every single time.
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))
## # A tibble: 336,776 x 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # … with 336,766 more rows

4. Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?

Answer) Yes. It made me surprised. I didn’t know that the contains() doesn’t care capital or lower case.
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # … with 336,766 more rows

Task 4

Exercises 5.5.2

Write up your solutions to the exercises in 5.5.2 in this document, including the code chunks you use to determine the answer.

1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.

#select(flights, dep_time, sched_dep_time)
minutes <-
  mutate(flights, 
         dep_time_minutes = dep_time %/% 100 * 60 + dep_time %% 100,    # hours extraction + minutes extraction
         sched_dep_time_minutes = sched_dep_time %/% 100 * 60 + sched_dep_time %% 100)

select(minutes, dep_time, sched_dep_time, dep_time_minutes, sched_dep_time_minutes)
## # A tibble: 336,776 x 4
##    dep_time sched_dep_time dep_time_minutes sched_dep_time_minutes
##       <int>          <int>            <dbl>                  <dbl>
##  1      517            515              317                    315
##  2      533            529              333                    329
##  3      542            540              342                    340
##  4      544            545              344                    345
##  5      554            600              354                    360
##  6      554            558              354                    358
##  7      555            600              355                    360
##  8      557            600              357                    360
##  9      557            600              357                    360
## 10      558            600              358                    360
## # … with 336,766 more rows

2. Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

Answer) The format of arr_time and dep_time is hour and minutes, so I needed to fix them in minutes with %/%(hour extraction) * 60 and %%(minutes extraction).
#select(flights, air_time, arr_time, dep_time)
compare <- mutate(flights, air_time, 
                  comp_time = arr_time - dep_time)

select(compare, air_time, comp_time)
## # A tibble: 336,776 x 2
##    air_time comp_time
##       <dbl>     <int>
##  1      227       313
##  2      227       317
##  3      160       381
##  4      183       460
##  5      116       258
##  6      150       186
##  7      158       358
##  8       53       152
##  9      140       281
## 10      138       195
## # … with 336,766 more rows
compare2 <- mutate(flights, air_time,
                  compare_time = (arr_time %/% 100 * 60 + arr_time %% 100) - (dep_time %/% 100 * 60 + dep_time %% 100))

select(compare2, air_time, compare_time)
## # A tibble: 336,776 x 2
##    air_time compare_time
##       <dbl>        <dbl>
##  1      227          193
##  2      227          197
##  3      160          221
##  4      183          260
##  5      116          138
##  6      150          106
##  7      158          198
##  8       53           72
##  9      140          161
## 10      138          115
## # … with 336,766 more rows

4. Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

#arrange(flights, desc(dep_delay))
delay_rank <- mutate(flights,
               rank = min_rank(desc(dep_delay)))

delay_rank2 <- filter(delay_rank,
                     rank <= 10)

delay_rank3 <- arrange(delay_rank2, rank)

select(delay_rank3, rank, dep_delay, carrier, flight)
## # A tibble: 10 x 4
##     rank dep_delay carrier flight
##    <int>     <dbl> <chr>    <int>
##  1     1      1301 HA          51
##  2     2      1137 MQ        3535
##  3     3      1126 MQ        3695
##  4     4      1014 AA         177
##  5     5      1005 MQ        3075
##  6     6       960 DL        2391
##  7     7       911 DL        2119
##  8     8       899 DL        2007
##  9     9       898 DL        2047
## 10    10       896 AA         172

5. What does 1:3 + 1:10 return? Why?

Answer) It returns (between 1 to 3) + (between 1 to 10) in order. So, 1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10.
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
##  [1]  2  4  6  5  7  9  8 10 12 11

6. What trigonometric functions does R provide?

Answer) When I look at help tab, it says these functions give the obvious trigonometric functions. They respectively compute the cosine, sine, tangent, arc-cosine, arc-sine, arc-tangent, and the two-argument arc-tangent.
?Trig

Task 5

Exercises 5.6.7

Write up your solutions to the exercises in 5.6.7 in this document, including the code chunks you use to determine the answer.

1. Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios:

- A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.

flights %>%
  group_by(flight) %>%
  summarise(early = mean(arr_delay <= -15, na.rm = TRUE),
            late = mean(arr_delay >= 15, na.rm = TRUE)) %>%
  filter(early == .5,
         late == .5)
## # A tibble: 21 x 3
##    flight early  late
##     <int> <dbl> <dbl>
##  1    107   0.5   0.5
##  2   2072   0.5   0.5
##  3   2366   0.5   0.5
##  4   2500   0.5   0.5
##  5   2552   0.5   0.5
##  6   3495   0.5   0.5
##  7   3505   0.5   0.5
##  8   3518   0.5   0.5
##  9   3544   0.5   0.5
## 10   3651   0.5   0.5
## # … with 11 more rows

- A flight is always 10 minutes late.

flights %>%
  group_by(flight) %>%
  summarise(late_10 = mean(arr_delay >= 10, na.rm = TRUE)) %>%
  filter(late_10 == 1)    # always means 100%
## # A tibble: 98 x 2
##    flight late_10
##     <int>   <dbl>
##  1     94       1
##  2    730       1
##  3    974       1
##  4   1084       1
##  5   1226       1
##  6   1510       1
##  7   1514       1
##  8   1859       1
##  9   1868       1
## 10   1998       1
## # … with 88 more rows

- A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.

flights %>%
  group_by(flight) %>%
  summarise(early_30 = mean(arr_delay <= -30, na.rm = TRUE),
            late_30 = mean(arr_delay >= 30, na.rm = TRUE)) %>%
  filter(early_30 == .5,
         late_30 == .5)
## # A tibble: 3 x 3
##   flight early_30 late_30
##    <int>    <dbl>   <dbl>
## 1   3651      0.5     0.5
## 2   3916      0.5     0.5
## 3   3951      0.5     0.5

- 99% of the time a flight is on time. 1% of the time it’s 2 hours late.

flights %>%
  group_by(flight) %>%
  summarise(ontime = mean(arr_delay <= 0, na.rm = TRUE),
            late_120 = mean(arr_delay >= 120, na.rm = TRUE)) %>%
  filter(ontime == .99,
         late_120 == .01)
## # A tibble: 0 x 3
## # … with 3 variables: flight <int>, ontime <dbl>, late_120 <dbl>

Which is more important: arrival delay or departure delay?

Answer) Of course, the arrival delay is more important. We can arrive on time even if our departure is delayed, but delays are more important because if our arrival is delayed, we may miss our connecting flight.

2. Come up with another approach that will give you the same output as

not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count())

not_cancelled <- flights %>% 
  count(dest)
not_cancelled
## # A tibble: 105 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     265
##  3 ALB     439
##  4 ANC       8
##  5 ATL   17215
##  6 AUS    2439
##  7 AVL     275
##  8 BDL     443
##  9 BGR     375
## 10 BHM     297
## # … with 95 more rows
not_cancelled2 <- flights %>%
  count(tailnum, wt = distance)
not_cancelled2
## # A tibble: 4,044 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  250866
##  3 N10156  115966
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   25157
##  7 N10575  150194
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # … with 4,034 more rows
not_cancelled_dest <- flights %>%
  group_by(dest) %>%
  summarise(n())
not_cancelled_dest
## # A tibble: 105 x 2
##    dest  `n()`
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     265
##  3 ALB     439
##  4 ANC       8
##  5 ATL   17215
##  6 AUS    2439
##  7 AVL     275
##  8 BDL     443
##  9 BGR     375
## 10 BHM     297
## # … with 95 more rows
not_cancelled_tailnum <- flights %>%
  group_by(tailnum) %>%
  summarise(n = sum(distance))
not_cancelled_tailnum
## # A tibble: 4,044 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  250866
##  3 N10156  115966
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   25157
##  7 N10575  150194
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # … with 4,034 more rows

3. Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) ) is slightly suboptimal. Why? Which is the most important column?

Answer) arr_delay is the most important column because regardless of the departure time of the flight, there is no problem as long as the arrival time matches the schedule. However, even if the departure time of the flight is on time, if the arrival time is different from the schedule, there will be a big setback. For example, missing a connecting flight.
flights %>%
  filter(is.na(dep_delay) | is.na(arr_delay)) %>%
  select(dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay)
## # A tibble: 9,430 x 6
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1     1525           1530        -5     1934           1805        NA
##  2     1528           1459        29     2002           1647        NA
##  3     1740           1745        -5     2158           2020        NA
##  4     1807           1738        29     2251           2103        NA
##  5     1939           1840        59       29           2151        NA
##  6     1952           1930        22     2358           2207        NA
##  7     2016           1930        46       NA           2220        NA
##  8       NA           1630        NA       NA           1815        NA
##  9       NA           1935        NA       NA           2240        NA
## 10       NA           1500        NA       NA           1825        NA
## # … with 9,420 more rows

5. Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))

Answer) The worst departure airport is EWR and the worst arrival airport is CAE, and the worst carrier is F9. The interesting thing is the worst departure airports are very limited and departure delays are habitual.
bad_dep_airports <- flights %>%
  group_by(origin) %>%
  summarise(worst_dep_airport = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(worst_dep_airport))
bad_dep_airports
## # A tibble: 3 x 2
##   origin worst_dep_airport
##   <chr>              <dbl>
## 1 EWR                 15.1
## 2 JFK                 12.1
## 3 LGA                 10.3
bad_arr_airports <- flights %>%
  group_by(dest) %>%
  summarise(worst_arr_airport = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(worst_arr_airport))
bad_arr_airports
## # A tibble: 105 x 2
##    dest  worst_arr_airport
##    <chr>             <dbl>
##  1 CAE                41.8
##  2 TUL                33.7
##  3 OKC                30.6
##  4 JAC                28.1
##  5 TYS                24.1
##  6 MSN                20.2
##  7 RIC                20.1
##  8 CAK                19.7
##  9 DSM                19.0
## 10 GRR                18.2
## # … with 95 more rows
bad_carrier <- flights %>% 
  group_by(carrier) %>% 
  summarise(worst_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(worst_delay))
bad_carrier
## # A tibble: 16 x 2
##    carrier worst_delay
##    <chr>         <dbl>
##  1 F9           21.9  
##  2 FL           20.1  
##  3 EV           15.8  
##  4 YV           15.6  
##  5 OO           11.9  
##  6 MQ           10.8  
##  7 WN            9.65 
##  8 B6            9.46 
##  9 9E            7.38 
## 10 UA            3.56 
## 11 US            2.13 
## 12 VX            1.76 
## 13 DL            1.64 
## 14 AA            0.364
## 15 HA           -6.92 
## 16 AS           -9.93

6. What does the sort argument to count() do. When might you use it?

Answer) count() sorts in unmarical and alphabetical order.
flights %>%
  count(carrier)
## # A tibble: 16 x 2
##    carrier     n
##    <chr>   <int>
##  1 9E      18460
##  2 AA      32729
##  3 AS        714
##  4 B6      54635
##  5 DL      48110
##  6 EV      54173
##  7 F9        685
##  8 FL       3260
##  9 HA        342
## 10 MQ      26397
## 11 OO         32
## 12 UA      58665
## 13 US      20536
## 14 VX       5162
## 15 WN      12275
## 16 YV        601

Thank you :)