#To clear environment and re-run from the start...
#rm(list = ls())
#install.packages('nycflights13')
library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

nycflights13

Note: you will probablly have to install nycflights13 using install.packages and the load it with the library command. nycflights13 is a relational database containig the following tables (data frames). This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley

data frame description
airlines Airline names
airports Airport metadata
flights Flights data
planes Planes meta data
weather Hourly data

This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley. You should start reading this chapter now, and try to complete reading it by the end of this week.

flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

#******************************************************************************#

  1. Change your name to author name in the YAML block at the top of this page.
  2. Install nycflights13 if necessary.
  3. Do the Exercises in section 5.2.4 of the Wickham book you will have to read the material prior to the exercises, though you have probably seen all this in the preceeding sections.

Task 1

Exercise Solutions Section 5.2.4

Write up your solutions to the exercises in 5.2.4 in this document, including the code chunks you use to determine the answer.

1-1. Had arrival delay of 2 or more hours

filter(flights, arr_delay >= 120) -> long_delay
long_delay <- tibble(long_delay)
long_delay
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
ld_count <- count(long_delay)
ld_count
## # A tibble: 1 x 1
##       n
##   <int>
## 1 10200

1-2. Flew to Houston

filter(flights, (dest == 'IAH' | dest == 'HOU')) -> to_houston
to_houston <- tibble(to_houston)
to_houston
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
hou_count <- count(to_houston)
hou_count
## # A tibble: 1 x 1
##       n
##   <int>
## 1  9313

1-3. Carrier was United, American, or Delta

filter(flights, (carrier == 'DL' | carrier == 'AA' | carrier == 'UA')) -> car_type
car_type <- tibble(car_type)
car_type
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            600        -1      941            910
## 10  2013     1     1      559            600        -1      854            902
## # ... with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
type_count <- count(car_type)
type_count
## # A tibble: 1 x 1
##        n
##    <int>
## 1 139504

1-4. Summer departure (July, Aug, Sep)

filter(flights, (month == 7 | month == 8 | month == 9)) -> sum_dpt
sum_dpt <- tibble(sum_dpt)
sum_dpt
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # ... with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
sum_count <- count(sum_dpt)
sum_count
## # A tibble: 1 x 1
##       n
##   <int>
## 1 86326

1-5. Arrived more than 2 hours late but did not leave late

filter(flights, (arr_delay >= 120 & dep_delay <= 0)) -> late_no_delay
late_no_delay <- tibble(late_no_delay)
late_no_delay
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    27     1419           1420        -1     1754           1550
##  2  2013    10     7     1350           1350         0     1736           1526
##  3  2013    10     7     1357           1359        -2     1858           1654
##  4  2013    10    16      657            700        -3     1258           1056
##  5  2013    11     1      658            700        -2     1329           1015
##  6  2013     3    18     1844           1847        -3       39           2219
##  7  2013     4    17     1635           1640        -5     2049           1845
##  8  2013     4    18      558            600        -2     1149            850
##  9  2013     4    18      655            700        -5     1213            950
## 10  2013     5    22     1827           1830        -3     2217           2010
## # ... with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late_nd_count <- count(late_no_delay)
late_nd_count
## # A tibble: 1 x 1
##       n
##   <int>
## 1    29

1-6. Delayed at least 1 hour but made up at least 30 min in flight

filter(flights, (dep_delay >= 60 &  (arr_delay - dep_delay) <= -30)) -> late_made30
late_made30 <- tibble(late_made30)
late_made30
## # A tibble: 2,074 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1716           1545        91     2140           2039
##  2  2013     1     1     2205           1720       285       46           2040
##  3  2013     1     1     2326           2130       116      131             18
##  4  2013     1     3     1503           1221       162     1803           1555
##  5  2013     1     3     1821           1530       171     2131           1910
##  6  2013     1     3     1839           1700        99     2056           1950
##  7  2013     1     3     1850           1745        65     2148           2120
##  8  2013     1     3     1923           1815        68     2036           1958
##  9  2013     1     3     1941           1759       102     2246           2139
## 10  2013     1     3     1950           1845        65     2228           2227
## # ... with 2,064 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late_made30_count <- count(late_made30)
late_made30_count
## # A tibble: 1 x 1
##       n
##   <int>
## 1  2074

1-7. Departed between midnight and 6 a.m.

filter(flights, (dep_time == 2400 | dep_time <= 600)) -> late_no_delay #dataset considers midnight 2400 rather than 0000
late_no_delay <- tibble(late_no_delay)
late_no_delay
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late_nd_count <- count(late_no_delay)
late_nd_count
## # A tibble: 1 x 1
##       n
##   <int>
## 1  9373
  1. Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
#between takes a starting value and an ending value and returns all the values greater than the starting value
#and smaller than the ending value (inclusive)

#between() could be used for question 1-4, to get the months between(month, 7, 9)
#IF 0000 was midnight instead of 2400, it also could have been used there between(dep_time, 0, 600)
  1. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?
filter(flights, is.na(dep_time)) -> dep_missing
dep_missing
## # A tibble: 8,255 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # ... with 8,245 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
count(dep_missing)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  8255
#since these flights also have missing arrive time information etc., it likely indicates canceled fligths
  1. Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)
#NA^0 -- any value to the 0 is always 1

#NA | TRUE -- or returns true as long as one is true. TRUE will always be TRUE

#FALSE & NA -- and needs both to be true, otherwise it is false. A value cannot be both missing and false
#so this statement will always be FALSE

#NA * 0 -- is attempting to multiply nothing by 0, this still leave nothing there.

Task 2

Exercise Solutions Section 5.3.1

  1. Do the exercises in 5.3.1

Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.

  1. How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).
arrange(flights, desc(is.na(flights$year)), desc(is.na(flights$month)), desc(is.na(flights$day)), desc(is.na(flights$dep_time)), desc(is.na(flights$sched_dep_time)), desc(is.na(flights$dep_delay)), desc(is.na(flights$arr_time)), desc(is.na(flights$sched_arr_time)), desc(is.na(flights$arr_delay)), desc(is.na(flights$carrier)), desc(is.na(flights$flight)), desc(is.na(flights$air_time)), desc(is.na(flights$distance)), desc(is.na(flights$hour)), desc(is.na(flights$minute)), desc(is.na(flights$time_hour))) -> no_data
no_data
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Sort flights to find the most delayed flights. Find the flights that left earliest.
late <- arrange(flights, desc(dep_delay))
late
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
early <- arrange(flights, dep_delay)
early
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Sort flights to find the fastest (highest speed) flights.
fastest <- arrange(flights, desc(distance / (air_time/60)))
fastest
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     5    25     1709           1700         9     1923           1937
##  2  2013     7     2     1558           1513        45     1745           1719
##  3  2013     5    13     2040           2025        15     2225           2226
##  4  2013     3    23     1914           1910         4     2045           2043
##  5  2013     1    12     1559           1600        -1     1849           1917
##  6  2013    11    17      650            655        -5     1059           1150
##  7  2013     2    21     2355           2358        -3      412            438
##  8  2013    11    17      759            800        -1     1212           1255
##  9  2013    11    16     2003           1925        38       17             36
## 10  2013    11    16     2349           2359       -10      402            440
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Which flights travelled the farthest? Which travelled the shortest?
farthest <- arrange(flights, desc(distance))
farthest
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      857            900        -3     1516           1530
##  2  2013     1     2      909            900         9     1525           1530
##  3  2013     1     3      914            900        14     1504           1530
##  4  2013     1     4      900            900         0     1516           1530
##  5  2013     1     5      858            900        -2     1519           1530
##  6  2013     1     6     1019            900        79     1558           1530
##  7  2013     1     7     1042            900       102     1620           1530
##  8  2013     1     8      901            900         1     1504           1530
##  9  2013     1     9      641            900      1301     1242           1530
## 10  2013     1    10      859            900        -1     1449           1530
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
shortest <- arrange(flights, distance)
shortest
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7    27       NA            106        NA       NA            245
##  2  2013     1     3     2127           2129        -2     2222           2224
##  3  2013     1     4     1240           1200        40     1333           1306
##  4  2013     1     4     1829           1615       134     1937           1721
##  5  2013     1     4     2128           2129        -1     2218           2224
##  6  2013     1     5     1155           1200        -5     1241           1306
##  7  2013     1     6     2125           2129        -4     2224           2224
##  8  2013     1     7     2124           2129        -5     2212           2224
##  9  2013     1     8     2127           2130        -3     2304           2225
## 10  2013     1     9     2126           2129        -3     2217           2224
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Task 3

  1. Do the exercises 5.4.1

Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.

  1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
select(flights, dep_time:arr_delay, -c(sched_dep_time, sched_arr_time))
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
select(flights, -(year:day), -c(sched_dep_time, sched_arr_time), -(carrier:time_hour))
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
  1. What happens if you include the name of a variable multiple times in a select() call?
#It appears that the duplicated is ignored

select(flights, dep_time, dep_time, arr_time)
## # A tibble: 336,776 x 2
##    dep_time arr_time
##       <int>    <int>
##  1      517      830
##  2      533      850
##  3      542      923
##  4      544     1004
##  5      554      812
##  6      554      740
##  7      555      913
##  8      557      709
##  9      557      838
## 10      558      753
## # ... with 336,766 more rows
  1. What does the any_of() function do? Why might it be helpful in conjunction with this vector?

vars <- c(“year”, “month”, “day”, “dep_delay”, “arr_delay”)

#It selects any of the columns found in the vector and ignores the values of the vector that are not found in the data set.
#If one of the values in the vector, 'vars', was not a column name using any_of would stop an error from being thrown.

vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, any_of(vars))
## # A tibble: 336,776 x 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # ... with 336,766 more rows
  1. Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?

select(flights, contains(“TIME”))

#It appeas that, by default, the helpers turn everything into lowercase and look for matches.
#To change this default, one can use the following argument:

#contains(match, ignore.case = FALSE)

#Source:
#https://www.r-bloggers.com/2015/07/the-complete-catalog-of-argument-variations-of-select-in-dplyr/

select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # ... with 336,766 more rows
select(flights, contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 x 0

#To clear and run again...
#rm(flights_updated)

Task 4

  1. Do the exercises 5.5.2

Write up your solutions to the exercises in 5.5.2 in this document, including the code chunks you use to determine the answer.

  1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
#convert dep_time and arr_time to minutes since midnight
mutate(flights, dep_mins = (dep_time %/% 100 * 60) + (dep_time %% 100)) %>%
  mutate(flights, arr_mins = (arr_time %/% 100 * 60) + (arr_time %% 100)) -> flights_updated
#Replace any midnight departure or arrivals (min value of 1440) with 0 (mins since midnight)
replace(flights_updated$dep_mins, flights_updated$dep_mins == 1440, 0) -> flights_updated$dep_mins
replace(flights_updated$arr_mins, flights_updated$arr_mins == 1440, 0) -> flights_updated$arr_mins

#results
#flights #original data set
flights_updated #updated data set shows new columns
## # A tibble: 336,776 x 21
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 13 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   dep_mins <dbl>, arr_mins <dbl>
filter(flights_updated, dep_mins == 0) #check that all 2400 changed to 0
## # A tibble: 29 x 21
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    10    30     2400           2359         1      327            337
##  2  2013    11    27     2400           2359         1      515            445
##  3  2013    12     5     2400           2359         1      427            440
##  4  2013    12     9     2400           2359         1      432            440
##  5  2013    12     9     2400           2250        70       59           2356
##  6  2013    12    13     2400           2359         1      432            440
##  7  2013    12    19     2400           2359         1      434            440
##  8  2013    12    29     2400           1700       420      302           2025
##  9  2013     2     7     2400           2359         1      432            436
## 10  2013     2     7     2400           2359         1      443            444
## # ... with 19 more rows, and 13 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, dep_mins <dbl>,
## #   arr_mins <dbl>
filter(flights_updated, arr_mins == 0) #check that all 2400 changed to 0
## # A tibble: 150 x 21
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     2209           2155        14     2400           2337
##  2  2013     1     5     2116           2130       -14     2400             18
##  3  2013     1    13     2243           2129        74     2400           2224
##  4  2013     1    16     2138           2107        31     2400           2322
##  5  2013     1    17     2256           2249         7     2400           2357
##  6  2013     1    22     2212           2055        77     2400           2250
##  7  2013     1    22     2249           2125        84     2400           2250
##  8  2013     1    25     2055           1725       210     2400           1933
##  9  2013     1    28     2303           2250        13     2400           2354
## 10  2013     1    30     2155           1915       160     2400           2137
## # ... with 140 more rows, and 13 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   dep_mins <dbl>, arr_mins <dbl>
  1. Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?
#flights

#I expect this to equal TRUE
#print((flights$arr_time - flights$dep_time) - flights$air_time == 0)
#but in fact, all are FALSE
#This is because arr_time and dep_time are in HHMM, not pure minutes

#must use converted dep_mins and arr_mins)
#print((flights_updated$arr_mins - flights_updated$dep_mins) - flights_updated$air_time == 0)
#This still returns all FALSE

#Upon examining the info on the data set, air time does not include time on the runway etc.
#To get the desired info, I must calculate a new column, which is total time of flight 
#abs(arr_min - dep_mins) -- abs() because midnight arrivals are 0 and not 1440 now.

flights_updated <- mutate(flights_updated, total_time = abs(arr_mins - dep_mins))
flights_updated
## # A tibble: 336,776 x 22
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 14 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   dep_mins <dbl>, arr_mins <dbl>, total_time <dbl>
#however, for some flights, total_time is now less than air_time
#e.g. for the first entry, total_time is 193 and air_time is 227.
#This is due to arrival time actually being in the local time zone.
#So, total time must be adjusted for based on time zone.
#This would require running a function that checks each airport's timezone and add to total time 60, 120, or 180 minutes.
#It also would require checking for flights that left before and arrived after midnight.
  1. Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?
#I would expect dep_time - sched_dep_time = dep_delay
#print((flights$dep_time - flights$sched_dep_time) == flights$dep_delay)
#Instead, only those flights that left on time are TRUE

#This is also due to the same issue that dep_time and arr_time are in HHMM, not pure minutes
#Both need to be converted to pure minutes since midnight using mutate()
#I've already done this for dep_mins, so I will do it for sched_dep_mins

mutate(flights_updated, sched_dep_mins = (sched_dep_time %/% 100 * 60) + (sched_dep_time %% 100)) -> flights_updated
replace(flights_updated$sched_dep_mins, flights_updated$sched_dep_mins == 1440, 0) -> flights_updated$sched_dep_mins
#print((flights_updated$dep_mins - flights_updated$sched_dep_mins) == flights_updated$dep_delay)
#There still are a few FALSEs

#in all cases, these were for flight whose delay pushed them past midnight into the next day
#filter(flights_updated, dep_mins - sched_dep_mins != dep_delay)

#This filter selects all flights were this was the case
next_day <- filter(flights_updated, dep_mins < sched_dep_mins & dep_delay > 0)
next_day
## # A tibble: 1,236 x 23
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848           1835       853     1001           1950
##  2  2013     1     2       42           2359        43      518            442
##  3  2013     1     2      126           2250       156      233           2359
##  4  2013     1     3       32           2359        33      504            442
##  5  2013     1     3       50           2145       185      203           2311
##  6  2013     1     3      235           2359       156      700            437
##  7  2013     1     4       25           2359        26      505            442
##  8  2013     1     4      106           2245       141      201           2356
##  9  2013     1     5       14           2359        15      503            445
## 10  2013     1     5       37           2230       127      341            131
## # ... with 1,226 more rows, and 15 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   dep_mins <dbl>, arr_mins <dbl>, total_time <dbl>, sched_dep_mins <dbl>

#To clear and run again...
#rm(flights2)
#rm(flights3)

#make a copy for testing
flights2 <- flights_updated

#Now I can replace the dep_mins with dep_mins + 1440 (an extra 24 hours, i.e the next day)
#flights2$dep_mins[flights2$dep_mins < flights2$sched_dep_mins & flights2$dep_delay > 0] <- flights2$dep_mins + 1440
#flights2

#This gave me an error that NAs are not allowed in subscripted assignemnts.
#There are a few NAs that I need to address
sum(is.na(next_day))
## [1] 22
colSums(is.na(next_day))
##           year          month            day       dep_time sched_dep_time 
##              0              0              0              0              0 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##              0              2              0              8              0 
##         flight        tailnum         origin           dest       air_time 
##              0              0              0              0              8 
##       distance           hour         minute      time_hour       dep_mins 
##              0              0              0              0              0 
##       arr_mins     total_time sched_dep_mins 
##              2              2              0
#However, I am not doing calculations/making changes on any data that has NA as its value
#so I don't understand why it is an issue
#I am only working with dep_mins, sched_dep_mins, and dep_delay
#filter(next_day, is.na(next_day$dep_mins) | is.na(next_day$sched_dep_mins) | is.na(next_day$dep_delay))
#filter(next_day, is.na(next_day$arr_delay))

#################################################################################

#I tried creating a new column, temp_dep_time that added 1440 to all dep_times
#Then I would match where flights were delayed into the next day and swapped dep_time for temp_dep_time
#But this also ran into the subcsripted NAs error.
#Essentially the exact same error as before 
#mutate(flights2, temp_dep_mins = dep_mins + 1440) -> flights3
#flights3
#flights3$dep_mins[flights3$dep_mins < flights3$sched_dep_mins & flights3$dep_delay > 0] <- flights3$temp_dep_mins
#flights3

#At this point, I don't know how to fix it unless I drop all record with any NAs, or assign some value like -1 to all NAs. Neither option seems like a good one for this data set and the questions being examined.
  1. Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().
#?min_rank
flights3 <- flights
flights3 <- mutate(flights3, dep_rank = min_rank(desc(dep_delay)))
arrange(filter(flights3, dep_rank <= 10), by = dep_rank)
## # A tibble: 10 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ... with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, dep_rank <int>
#To handle ties, I like min_rank's approach of giving the minimum ranking to all tied elements, then moving on to the nth next ranking.
#That is pretty much how ranking is typically done.
  1. What does 1:3 + 1:10 return? Why?
#The two vectors this sequencing creates are not of equal length. Therefore, the smaller vector will wrap around until the end of the larger vector-- 1+1, 2+2, 3+3, 1+4, 2+5, 3+6, 1+7, 2+8, 3+9, 1+10
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
##  [1]  2  4  6  5  7  9  8 10 12 11
  1. What trigonometric functions does R provide?
#?Trig
#According to the documentation, R provides the cos(x), sin(x), tan(x), arc-cos(x), arc-sin(x), arc-tan(x) functions
#as well as cospi(x), sinpi(x), tanpi(x), which would be the trig calculation of x*pi

Task 5

  1. Do the exercises 5.6.7

Write up your solutions to the exercises in 5.6.7 in this document, including the code chunks you use to determine the answer.

  1. Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios:

1-A. A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.

1-B. A flight is always 10 minutes late.

1-C. A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.

1-D. 99% of the time a flight is on time. 1% of the time it’s 2 hours late.

1-E. Which is more important: arrival delay or departure delay?

#Ways to asses typical delay characteristics
#1- Length of departure/arrival delay
#2- Ratio of being late vs. early in departure/arrival
#3- Consistency of pattern
#4- Avg. number of connecting flights pre/post flight
#5- Occurrences of outlier major delays

#I would argue that arrival delay is far more important
#Arrival delay has more potential impact, especially for connecting flights, etc.
  1. Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).
####
not_cancelled <- filter(flights, !(is.na(dep_delay) | is.na(arr_delay)))

#With count
not_cancelled %>%
  count(dest)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows
not_cancelled %>%
  count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # ... with 4,027 more rows
#With group_by and summarize
not_cancelled %>%
  group_by(dest) %>%
  summarize(num_not_cancelled = n())
## # A tibble: 104 x 2
##    dest  num_not_cancelled
##    <chr>             <int>
##  1 ABQ                 254
##  2 ACK                 264
##  3 ALB                 418
##  4 ANC                   8
##  5 ATL               16837
##  6 AUS                2411
##  7 AVL                 261
##  8 BDL                 412
##  9 BGR                 358
## 10 BHM                 269
## # ... with 94 more rows
not_cancelled %>%
  group_by(tailnum) %>%
  summarize(total_distance_of_flights = sum(distance))
## # A tibble: 4,037 x 2
##    tailnum total_distance_of_flights
##    <chr>                       <dbl>
##  1 D942DN                       3418
##  2 N0EGMQ                     239143
##  3 N10156                     109664
##  4 N102UW                      25722
##  5 N103US                      24619
##  6 N104UW                      24616
##  7 N10575                     139903
##  8 N105UW                      23618
##  9 N107US                      21677
## 10 N108UW                      32070
## # ... with 4,027 more rows
  1. Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) ) is slightly suboptimal. Why? Which is the most important column?
#The most important between these two is dep_delay.
#A flight might have a reason for no arrival time info (diverted), but won't have any info if it never leaves
#i.e. There are no flights that don't leave but have arr_delay
count(filter(flights, (is.na(dep_delay) | is.na(arr_delay)))) #suboptimal
## # A tibble: 1 x 1
##       n
##   <int>
## 1  9430
count(filter(flights, (is.na(dep_delay) & !is.na(arr_delay)))) #num flights that don't have dep_delay but have arr_delay
## # A tibble: 1 x 1
##       n
##   <int>
## 1     0
count(filter(flights, is.na(dep_delay))) #proposed method for finding true num of cancelled flights
## # A tibble: 1 x 1
##       n
##   <int>
## 1  8255
  1. Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
#Create the new info columns

group_by(flights, year, month, day) %>%
  summarize(num_flights = n(),
            num_cancelled_flights = sum(is.na(dep_delay)),
            prop_cancelled = sum(is.na(dep_delay)) / n(),
            avg_dep_delay = mean(dep_delay, na.rm = TRUE),
            avg_arr_delay = mean(arr_delay, na.rm = TRUE)) -> flights3
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
flights3
## # A tibble: 365 x 8
## # Groups:   year, month [12]
##     year month   day num_flights num_cancelled_fli~ prop_cancelled avg_dep_delay
##    <int> <int> <int>       <int>              <int>          <dbl>         <dbl>
##  1  2013     1     1         842                  4        0.00475         11.5 
##  2  2013     1     2         943                  8        0.00848         13.9 
##  3  2013     1     3         914                 10        0.0109          11.0 
##  4  2013     1     4         915                  6        0.00656          8.95
##  5  2013     1     5         720                  3        0.00417          5.73
##  6  2013     1     6         832                  1        0.00120          7.15
##  7  2013     1     7         933                  3        0.00322          5.42
##  8  2013     1     8         899                  4        0.00445          2.55
##  9  2013     1     9         902                  5        0.00554          2.28
## 10  2013     1    10         932                  3        0.00322          2.84
## # ... with 355 more rows, and 1 more variable: avg_arr_delay <dbl>

Graphing…

#When there are more than ~900 flights on a given day, the number (and %) of cancellations increase significantly.
#When either arrival or departure delays are greater than ~15 minutes, there is a moderate correlation with the percentage #of flights that get cancelled that day

flights3 %>%
  ggplot() +
  geom_point(mapping = aes(x = num_flights, y = num_cancelled_flights), color = 'mediumpurple1', alpha = 0.5)

flights3 %>%
  ggplot() +
  geom_point(mapping = aes(x = num_flights, y = prop_cancelled), color = 'mediumpurple3', alpha = 0.5)

flights3 %>%
  ggplot(mapping = aes(y = prop_cancelled)) +
  geom_point(mapping = aes(x = avg_dep_delay), color = 'red', alpha = 0.5) +
  geom_point(mapping = aes(x = avg_arr_delay), color = 'green', alpha = 0.5)

rm(car_delays, car_delays_comp, car_delays_filter, car_delays_filter2, car_delays2, car_delays3)
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays_comp' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays_filter' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays_filter2' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays2' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays3' not found
  1. Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))
#Worst Delays -- raw avg minutes
car_delays_filter <- group_by(flights, carrier)
car_delays_filter %>%
  summarize(avg_dep_delay_carr = mean(dep_delay, na.rm = TRUE),
            avg_arr_delay_carr = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_arr_delay_carr)) -> car_delays
car_delays
## # A tibble: 16 x 3
##    carrier avg_dep_delay_carr avg_arr_delay_carr
##    <chr>                <dbl>              <dbl>
##  1 F9                   20.2              21.9  
##  2 FL                   18.7              20.1  
##  3 EV                   20.0              15.8  
##  4 YV                   19.0              15.6  
##  5 OO                   12.6              11.9  
##  6 MQ                   10.6              10.8  
##  7 WN                   17.7               9.65 
##  8 B6                   13.0               9.46 
##  9 9E                   16.7               7.38 
## 10 UA                   12.1               3.56 
## 11 US                    3.78              2.13 
## 12 VX                   12.9               1.76 
## 13 DL                    9.26              1.64 
## 14 AA                    8.59              0.364
## 15 HA                    4.90             -6.92 
## 16 AS                    5.80             -9.93

#Avg delays (all carriers) seen at each airport
group_by(car_delays_filter, dest) %>%
  summarize(avg_dep_delay_dest = mean(dep_delay, na.rm = TRUE),
            avg_arr_delay_dest = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_arr_delay_dest)) -> car_delays2
car_delays2
## # A tibble: 105 x 3
##    dest  avg_dep_delay_dest avg_arr_delay_dest
##    <chr>              <dbl>              <dbl>
##  1 CAE                 35.6               41.8
##  2 TUL                 34.9               33.7
##  3 OKC                 30.6               30.6
##  4 JAC                 26.5               28.1
##  5 TYS                 28.5               24.1
##  6 MSN                 23.6               20.2
##  7 RIC                 23.6               20.1
##  8 CAK                 20.8               19.7
##  9 DSM                 26.2               19.0
## 10 GRR                 19.5               18.2
## # ... with 95 more rows

#Worst delays with airport figured in
car_delays_filter2 <- group_by(flights, carrier, dest)
car_delays_filter2 %>%
  summarize(avg_dep_delay_carr = mean(dep_delay, na.rm = TRUE),
            avg_arr_delay_carr = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_arr_delay_carr)) -> car_delays3
## `summarise()` has grouped output by 'carrier'. You can override using the `.groups` argument.
car_delays3
## # A tibble: 314 x 4
## # Groups:   carrier [16]
##    carrier dest  avg_dep_delay_carr avg_arr_delay_carr
##    <chr>   <chr>              <dbl>              <dbl>
##  1 UA      STL                 77.5              110  
##  2 OO      ORD                 67                107  
##  3 OO      DTW                 61                 68.5
##  4 UA      RDU                 60                 56  
##  5 EV      CAE                 36.7               42.8
##  6 EV      TYS                 41.8               41.2
##  7 EV      PBI                 48.7               40.7
##  8 EV      TUL                 34.9               33.7
##  9 EV      OKC                 30.6               30.6
## 10 UA      JAC                 28.7               29.9
## # ... with 304 more rows
#Make a comparison based on +/- avg delay at airport
car_delays_comp <- car_delays3
car_delays_comp %>%
  left_join(select(car_delays2, c(dest, avg_arr_delay_dest)), by = 'dest') -> test

#drop_na(test) -> test
test
## # A tibble: 314 x 5
## # Groups:   carrier [16]
##    carrier dest  avg_dep_delay_carr avg_arr_delay_carr avg_arr_delay_dest
##    <chr>   <chr>              <dbl>              <dbl>              <dbl>
##  1 UA      STL                 77.5              110                11.1 
##  2 OO      ORD                 67                107                 5.88
##  3 OO      DTW                 61                 68.5               5.43
##  4 UA      RDU                 60                 56                10.1 
##  5 EV      CAE                 36.7               42.8              41.8 
##  6 EV      TYS                 41.8               41.2              24.1 
##  7 EV      PBI                 48.7               40.7               8.56
##  8 EV      TUL                 34.9               33.7              33.7 
##  9 EV      OKC                 30.6               30.6              30.6 
## 10 UA      JAC                 28.7               29.9              28.1 
## # ... with 304 more rows
car_delays_comp <- mutate(test, adj_avg_arr_delay = avg_arr_delay_carr - avg_arr_delay_dest)

#flights2 %>%
#  select(-origin,-dest) %>%
#  left_join(airlines,by = "carrier")

#select(df2, c(id,value2)), by = "id")
#Summarize the mean of the adjusted delays
car_delays_comp %>%
  summarize(mean_of_avg_delays_relative_to_all_carriers_avg = mean(adj_avg_arr_delay, na.rm = TRUE)) %>%
  arrange(desc(mean_of_avg_delays_relative_to_all_carriers_avg))
## # A tibble: 16 x 2
##    carrier mean_of_avg_delays_relative_to_all_carriers_avg
##    <chr>                                             <dbl>
##  1 OO                                               28.0  
##  2 F9                                               13.3  
##  3 B6                                                2.61 
##  4 EV                                                2.16 
##  5 MQ                                                1.49 
##  6 VX                                                0.475
##  7 AA                                               -0.153
##  8 UA                                               -0.695
##  9 WN                                               -1.23 
## 10 FL                                               -1.42 
## 11 US                                               -3.80 
## 12 YV                                               -4.65 
## 13 HA                                               -5.55 
## 14 9E                                               -6.69 
## 15 AS                                               -8.83 
## 16 DL                                               -9.87
#original
select(car_delays, -c(avg_dep_delay_carr))
## # A tibble: 16 x 2
##    carrier avg_arr_delay_carr
##    <chr>                <dbl>
##  1 F9                  21.9  
##  2 FL                  20.1  
##  3 EV                  15.8  
##  4 YV                  15.6  
##  5 OO                  11.9  
##  6 MQ                  10.8  
##  7 WN                   9.65 
##  8 B6                   9.46 
##  9 9E                   7.38 
## 10 UA                   3.56 
## 11 US                   2.13 
## 12 VX                   1.76 
## 13 DL                   1.64 
## 14 AA                   0.364
## 15 HA                  -6.92 
## 16 AS                  -9.93
#NOTE
#I feel like I actually needed to go back and make the joins with the original full flights table
#so that when the means is taken here, the number of flights per carrier per airport is weighted properly??
  1. What does the sort argument to count() do. When might you use it?
#Using the sort argument places the most common occurrences first
#This is useful when doing any analysis based on frequency or volume.
not_cancelled %>%
  count(dest, sort= TRUE)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ATL   16837
##  2 ORD   16566
##  3 LAX   16026
##  4 BOS   15022
##  5 MCO   13967
##  6 CLT   13674
##  7 SFO   13173
##  8 FLL   11897
##  9 MIA   11593
## 10 DCA    9111
## # ... with 94 more rows

Turning in your work and announcements for the remainder of the Semester

Final Project

  • Then there will be a final project. There will be no final exam. I will provide a list of three projects from which you may choose the project you will do.
  • I will provide all the data necessary.
  • You will also post your final project on RPubs.
  • The final project will be due on Midnight the day the final is scheduled.