library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map

Exercise 13.4.6

  1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays.
# Plot the location of every airport that occurs as a destination in `flights`.
# semi_join() only filters `airports`; it adds no columns from `flights`.
ggplot(
  data = semi_join(airports, flights, by = c("faa" = "dest")),
  mapping = aes(x = lon, y = lat)
) +
  borders(database = "state") +
  geom_point() +
  coord_quickmap()

# Mean arrival delay per destination (missing delays ignored), joined to the
# airport record for that destination. inner_join() drops destinations that
# have no row in `airports`.
avg_dest_delays <- inner_join(
  x = summarise(
    group_by(flights, dest),
    delay = mean(arr_delay, na.rm = TRUE)
  ),
  y = airports,
  by = c("dest" = "faa")
)

# Draw each destination on a US state map, coloured by its mean arrival delay.
ggplot(
  data = avg_dest_delays,
  mapping = aes(x = lon, y = lat, colour = delay)
) +
  borders(database = "state") +
  geom_point() +
  coord_quickmap() +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Average Delay by Arrival Times"
  )

  1. Add the location of the origin and destination (i.e. the lat and lon) to flights.
# Lookup table: airport code plus its coordinates, nothing else.
airport_locations <- select(airports, faa, lat, lon)

# Attach coordinates for both endpoints of every flight.
# The inner join adds plain `lat`/`lon` for the origin. The outer join then
# collides on those two names, so `suffix` labels the pairs `.origin`/`.dest`.
left_join(
  x = left_join(
    x = select(flights, origin, dest),
    y = airport_locations,
    by = c("origin" = "faa")
  ),
  y = airport_locations,
  by = c("dest" = "faa"),
  suffix = c(".origin", ".dest")
)
## # A tibble: 336,776 x 6
##    origin dest  lat.origin lon.origin lat.dest lon.dest
##    <chr>  <chr>      <dbl>      <dbl>    <dbl>    <dbl>
##  1 EWR    IAH         40.7      -74.2     30.0    -95.3
##  2 LGA    IAH         40.8      -73.9     30.0    -95.3
##  3 JFK    MIA         40.6      -73.8     25.8    -80.3
##  4 JFK    BQN         40.6      -73.8     NA       NA  
##  5 LGA    ATL         40.8      -73.9     33.6    -84.4
##  6 EWR    ORD         40.7      -74.2     42.0    -87.9
##  7 EWR    FLL         40.7      -74.2     26.1    -80.2
##  8 LGA    IAD         40.8      -73.9     38.9    -77.5
##  9 JFK    MCO         40.6      -73.8     28.4    -81.3
## 10 LGA    ORD         40.8      -73.9     42.0    -87.9
## # … with 336,766 more rows
  1. Is there a relationship between the age of a plane and its delays?
# Age of each plane in 2013, keyed by tail number.
# transmute() keeps only the columns named here.
plane_ages <- transmute(planes, tailnum, age = 2013 - year)

# Mean departure delay for each plane age.
# Flights without a recorded departure delay are discarded up front, so the
# mean needs no `na.rm`. Planes whose `age` is NA presumably form one extra
# group, which ggplot drops with a warning.
flights %>%
  filter(!is.na(dep_delay)) %>%
  inner_join(plane_ages, by = "tailnum") %>%
  group_by(age) %>%
  summarise(delay = mean(dep_delay)) %>%
  ggplot(mapping = aes(x = age, y = delay)) +
  geom_point() +
  geom_line()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).

There does not appear to be a simple relationship between the age of a plane and its delays. From 0 to 10 years there is a strong positive correlation; the trend then reverses, giving a negative correlation between plane age and departure delay from 10 to 30 years.

  1. What weather conditions make it more likely to see a delay?
# Pair every flight with the weather record for its origin airport and
# scheduled hour. Flights with no matching weather row are dropped.
flight_weather <- inner_join(
  flights,
  weather,
  by = c("origin", "year", "month", "day", "hour")
)

 # Mean departure delay at each recorded precipitation level.
 flight_weather %>%
   group_by(precip) %>%
   summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
   ggplot(mapping = aes(x = precip, y = delay)) +
   geom_line() +
   geom_point() +
   labs(
     x = "Precipitation",
     y = "Delays",
     title = "Precipitation vs Delays"
   )

# Mean departure delay at each recorded wind-gust speed.
flight_weather %>%
  group_by(wind_gust) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = wind_gust, y = delay)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Wind Gust",
    y = "Delays",
    title = "Wind Gust vs Delays"
  )
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

# Mean departure delay at each recorded visibility value.
flight_weather %>%
  group_by(visib) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = visib, y = delay)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Visibility",
    y = "Delays",
    title = "Visibility vs Delays"
  )

Precipitation seems to have a positive correlation with departure delays. Higher levels of precipitation correspond to longer delays.

Wind gust seems to have a positive correlation with departure delays. Higher wind-gust speeds correspond to longer delays.

Visibility seems to have a negative correlation with departure delays. Flights are more likely to be delayed when visibility is low.

  1. What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.
# Spatial pattern of arrival delays for flights on 13 June 2013.
# Each destination is drawn on a US state map, sized and coloured by its mean
# arrival delay. Destinations whose mean cannot be computed (no recorded
# arrival delay that day) are dropped by ggplot with a warning.
flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarize(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(x = lon, y = lat, size = delay, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap() +
  # x is mapped to `lon` and y to `lat`, so the labels must follow suit.
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle(" Average Delay by Arrival Times on June 13, 2013")
## Warning: Removed 3 rows containing missing values (geom_point).

On June 13, 2013, the eastern United States and the Midwest were affected by storms, which matches the regions showing the largest delays on the map.

Exercises 13.5.1

  1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)
# Flights for which no tail number was recorded.
filter(flights, is.na(tailnum))
## # A tibble: 2,512 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     2       NA           1545        NA       NA           1910
##  2  2013     1     2       NA           1601        NA       NA           1735
##  3  2013     1     3       NA            857        NA       NA           1209
##  4  2013     1     3       NA            645        NA       NA            952
##  5  2013     1     4       NA            845        NA       NA           1015
##  6  2013     1     4       NA           1830        NA       NA           2044
##  7  2013     1     5       NA            840        NA       NA           1001
##  8  2013     1     7       NA            820        NA       NA            958
##  9  2013     1     8       NA           1645        NA       NA           1838
## 10  2013     1     9       NA            755        NA       NA           1012
## # … with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose tail number has no record in `planes`, tallied per carrier
# with the largest count first.
anti_join(flights, planes, by = "tailnum") %>%
  count(carrier, sort = TRUE)
## # A tibble: 10 x 2
##    carrier     n
##    <chr>   <int>
##  1 MQ      25397
##  2 AA      22558
##  3 UA       1693
##  4 9E       1044
##  5 B6        830
##  6 US        699
##  7 FL        187
##  8 DL        110
##  9 F9         50
## 10 WN         38

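Flights with missing tail numbers also have missing departure and arrival times. It can be assumed that these flights were cancelled and never left the origin airport. Most of the tail numbers without a matching record in planes belong to just two carriers, MQ and AA, which together account for roughly 90% of the unmatched flights.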

  1. Filter flights to only show flights with planes that have flown at least 100 flights.
# Keep only the flights flown by planes that have at least 100 flights.
# Flights with a missing tail number are removed first; otherwise they would
# all be pooled into a single NA "plane" and counted together.
# `>=` (not `>`) so that planes with exactly 100 flights are included.
# The result stays grouped by `tailnum`.
flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  filter(n() >= 100)
## # A tibble: 229,202 x 19
## # Groups:   tailnum [1,201]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # … with 229,192 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.
library(fueleconomy)
# Vehicle records whose (make, model) pair appears in the table of common
# models. semi_join() filters `vehicles` without adding columns.
semi_join(
  x = fueleconomy::vehicles,
  y = fueleconomy::common,
  by = c("make", "model")
)
## # A tibble: 14,531 x 12
##       id make  model   year class   trans  drive     cyl displ fuel    hwy   cty
##    <dbl> <chr> <chr>  <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
##  1  1833 Acura Integ…  1986 Subcom… Autom… Front-…     4   1.6 Regu…    28    22
##  2  1834 Acura Integ…  1986 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  3  3037 Acura Integ…  1987 Subcom… Autom… Front-…     4   1.6 Regu…    28    22
##  4  3038 Acura Integ…  1987 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  5  4183 Acura Integ…  1988 Subcom… Autom… Front-…     4   1.6 Regu…    27    22
##  6  4184 Acura Integ…  1988 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  7  5303 Acura Integ…  1989 Subcom… Autom… Front-…     4   1.6 Regu…    27    22
##  8  5304 Acura Integ…  1989 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  9  6442 Acura Integ…  1990 Subcom… Autom… Front-…     4   1.8 Regu…    24    20
## 10  6443 Acura Integ…  1990 Subcom… Manua… Front-…     4   1.8 Regu…    26    21
## # … with 14,521 more rows
  1. Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?
# The 48 flights with the largest departure delay, each paired with the
# weather record for its origin airport and scheduled hour.
# NOTE(review): this ranks individual flights; nothing here aggregates by
# hour, so it is not literally "the 48 worst hours". Confirm the intent.
delay_weather <- head(
  arrange(
    inner_join(
      flights,
      weather,
      by = c("origin", "year", "month", "day", "hour")
    ),
    desc(dep_delay)
  ),
  n = 48
)
delay_weather
## # A tibble: 48 x 29
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # … with 38 more rows, and 21 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour.x <dttm>, temp <dbl>,
## #   dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
# Per-column summary (quartiles, mean, NA counts) of the 48 selected rows,
# covering both the flight columns and the joined weather columns.
summary(delay_weather)
##       year          month             day           dep_time     
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :  12.0  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.50   1st Qu.: 693.0  
##  Median :2013   Median : 6.000   Median :14.00   Median : 935.5  
##  Mean   :2013   Mean   : 6.292   Mean   :13.92   Mean   :1220.6  
##  3rd Qu.:2013   3rd Qu.: 9.000   3rd Qu.:19.00   3rd Qu.:1971.8  
##  Max.   :2013   Max.   :12.000   Max.   :27.00   Max.   :2346.0  
##                                                                  
##  sched_dep_time   dep_delay         arr_time      sched_arr_time
##  Min.   : 615   Min.   : 545.0   Min.   :  17.0   Min.   :  32  
##  1st Qu.:1030   1st Qu.: 635.5   1st Qu.: 791.5   1st Qu.:1296  
##  Median :1672   Median : 787.5   Median :1013.0   Median :1924  
##  Mean   :1482   Mean   : 777.0   Mean   :1049.5   Mean   :1700  
##  3rd Qu.:1845   3rd Qu.: 853.0   3rd Qu.:1293.0   3rd Qu.:2060  
##  Max.   :2100   Max.   :1301.0   Max.   :2308.0   Max.   :2249  
##                                  NA's   :1                      
##    arr_delay        carrier              flight         tailnum         
##  Min.   : 538.0   Length:48          Min.   :  23.0   Length:48         
##  1st Qu.: 638.5   Class :character   1st Qu.: 395.8   Class :character  
##  Median : 780.0   Mode  :character   Median :1706.0   Mode  :character  
##  Mean   : 772.6                      Mean   :1607.5                     
##  3rd Qu.: 851.5                      3rd Qu.:2343.0                     
##  Max.   :1272.0                      Max.   :4711.0                     
##  NA's   :1                                                              
##     origin              dest              air_time        distance     
##  Length:48          Length:48          Min.   : 41.0   Min.   : 184.0  
##  Class :character   Class :character   1st Qu.:111.5   1st Qu.: 756.5  
##  Mode  :character   Mode  :character   Median :145.0   Median :1020.0  
##                                        Mean   :185.7   Mean   :1344.0  
##                                        3rd Qu.:227.5   3rd Qu.:1707.2  
##                                        Max.   :640.0   Max.   :4983.0  
##                                        NA's   :1                       
##       hour          minute       time_hour.x                       temp      
##  Min.   : 6.0   Min.   : 0.00   Min.   :2013-01-01 18:00:00   Min.   :23.00  
##  1st Qu.:10.0   1st Qu.: 3.75   1st Qu.:2013-04-09 04:45:00   1st Qu.:41.54  
##  Median :16.5   Median :25.00   Median :2013-06-27 18:00:00   Median :58.01  
##  Mean   :14.6   Mean   :21.50   Mean   :2013-06-23 16:58:45   Mean   :57.64  
##  3rd Qu.:18.0   3rd Qu.:30.00   3rd Qu.:2013-09-12 14:00:00   3rd Qu.:77.27  
##  Max.   :21.0   Max.   :59.00   Max.   :2013-12-19 17:00:00   Max.   :89.06  
##                                                                              
##       dewp           humid           wind_dir       wind_speed    
##  Min.   :-0.04   Min.   : 26.89   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:27.73   1st Qu.: 55.35   1st Qu.:137.5   1st Qu.: 8.055  
##  Median :52.52   Median : 73.25   Median :180.0   Median :10.357  
##  Mean   :47.08   Mean   : 70.28   Mean   :191.9   Mean   :12.563  
##  3rd Qu.:69.31   3rd Qu.: 83.79   3rd Qu.:267.5   3rd Qu.:16.111  
##  Max.   :73.40   Max.   :100.00   Max.   :360.0   Max.   :33.373  
##                                                                   
##    wind_gust         precip          pressure        visib       
##  Min.   :24.17   Min.   :0.0000   Min.   :1005   Min.   : 0.120  
##  1st Qu.:25.32   1st Qu.:0.0000   1st Qu.:1011   1st Qu.: 8.000  
##  Median :25.32   Median :0.0000   Median :1017   Median :10.000  
##  Mean   :29.06   Mean   :0.0075   Mean   :1017   Mean   : 8.015  
##  3rd Qu.:33.37   3rd Qu.:0.0000   3rd Qu.:1021   3rd Qu.:10.000  
##  Max.   :40.28   Max.   :0.1400   Max.   :1033   Max.   :10.000  
##  NA's   :40                       NA's   :15                     
##   time_hour.y                 
##  Min.   :2013-01-01 18:00:00  
##  1st Qu.:2013-04-09 04:45:00  
##  Median :2013-06-27 18:00:00  
##  Mean   :2013-06-23 16:58:45  
##  3rd Qu.:2013-09-12 14:00:00  
##  Max.   :2013-12-19 17:00:00  
## 

On average, all of the weather variables were higher at these times and on these days.

  1. What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?
# Flights whose destination code has no matching row in `airports`.
anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      615            615         0     1039           1100
##  3  2013     1     1      628            630        -2     1137           1140
##  4  2013     1     1      701            700         1     1123           1154
##  5  2013     1     1      711            715        -4     1151           1206
##  6  2013     1     1      820            820         0     1254           1310
##  7  2013     1     1      820            820         0     1249           1329
##  8  2013     1     1      840            845        -5     1311           1350
##  9  2013     1     1      909            810        59     1331           1315
## 10  2013     1     1      913            918        -5     1346           1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

anti_join(flights, airports, by = c(“dest” = “faa”)) returns the flights whose destination airport is not in the airports table, i.e. not in the FAA list of destinations.

# Airports that never appear as a destination in `flights`.
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo…
##  2 06A   Moton Field Municipal A…  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo…
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo…
##  6 0A9   Elizabethton Municipal …  36.4  -82.2  1593    -5 A     America/New_Yo…
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo…
##  8 0G7   Finger Lakes Regional A…  42.9  -76.8   492    -5 A     America/New_Yo…
##  9 0P2   Shoestring Aviation Air…  39.8  -76.6  1000    -5 U     America/New_Yo…
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An…
## # … with 1,347 more rows

anti_join(airports, flights, by = c(“faa” = “dest”)) returns the US airports that do not appear as a destination in the data, meaning that there were no flights to those airports from New York in 2013.

  1. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.
# Every distinct (tail number, carrier) pairing among flights that have a
# recorded tail number.
plane_airline <- distinct(
  filter(flights, !is.na(tailnum)),
  tailnum, carrier
)

# Tail numbers paired with more than one carrier, ordered by tail number.
# The result stays grouped by `tailnum`.
multi_airline <- arrange(
  filter(group_by(plane_airline, tailnum), n() > 1),
  tailnum
)
multi_airline
## # A tibble: 34 x 2
## # Groups:   tailnum [17]
##    carrier tailnum
##    <chr>   <chr>  
##  1 9E      N146PQ 
##  2 EV      N146PQ 
##  3 9E      N153PQ 
##  4 EV      N153PQ 
##  5 9E      N176PQ 
##  6 EV      N176PQ 
##  7 9E      N181PQ 
##  8 EV      N181PQ 
##  9 9E      N197PQ 
## 10 EV      N197PQ 
## # … with 24 more rows

The hypothesis is rejected. There are seventeen planes that have been used by more than one carrier.