library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("nycflights13")
library("fueleconomy")

Exercise 13.4.6

1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here is an easy way to draw a map of the United States:

head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
head(airports)
## # A tibble: 6 x 8
##   faa   name                          lat   lon   alt    tz dst   tzone         
##   <chr> <chr>                       <dbl> <dbl> <dbl> <dbl> <chr> <chr>         
## 1 04G   Lansdowne Airport            41.1 -80.6  1044    -5 A     America/New_Y…
## 2 06A   Moton Field Municipal Airp…  32.5 -85.7   264    -6 A     America/Chica…
## 3 06C   Schaumburg Regional          42.0 -88.1   801    -6 A     America/Chica…
## 4 06N   Randall Airport              41.4 -74.4   523    -5 A     America/New_Y…
## 5 09J   Jekyll Island Airport        31.1 -81.4    11    -5 A     America/New_Y…
## 6 0A9   Elizabethton Municipal Air…  36.4 -82.2  1593    -5 A     America/New_Y…
head(weather)
## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA
## 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA
## 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA
## 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA
## 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA
## 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA
## # … with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>
airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
  borders("state") +
  geom_point() +
  xlab("Longitude") +
  ylab("Latitude")+
  coord_quickmap()

p1.13.4.6 <- flights %>% group_by(dest) %>%
  summarize(Average_Delay = mean(arr_delay, na.rm = TRUE)) %>%
   #inner vs left or right join in this case
  inner_join(airports, by = c('dest' = 'faa')) %>%
  ggplot(aes(x = lon, y = lat, color = Average_Delay)) +
  borders('state') +
  geom_point() +
  labs(title = "Average Flight Delay by Destination", 
       subtitle = "Spatial distribution of delays" , 
       caption = "Dataset: nycflights13") +
  xlab("Longitude") +
  ylab("Latitude") +
  scale_color_gradient(low = "#ffd60a", high = "#03045e") +
  coord_quickmap()
p1.13.4.6

2. Add the location of the origin and destination (i.e. the lat and lon) to flights.

q2.13.4.6 <- flights %>% 
  left_join(airports, c(origin = "faa")) %>%
  #each time we left join it copies airport fields into flights, suffix to differentiate 
  left_join(airports, c(dest = "faa"), suffix = c('.origin', '.dest')) %>%
  #left_join(airports, c(dest = "faa"))
  select(dest, origin, contains('lat'), contains('lon'))
q2.13.4.6
## # A tibble: 336,776 x 6
##    dest  origin lat.origin lat.dest lon.origin lon.dest
##    <chr> <chr>       <dbl>    <dbl>      <dbl>    <dbl>
##  1 IAH   EWR          40.7     30.0      -74.2    -95.3
##  2 IAH   LGA          40.8     30.0      -73.9    -95.3
##  3 MIA   JFK          40.6     25.8      -73.8    -80.3
##  4 BQN   JFK          40.6     NA        -73.8     NA  
##  5 ATL   LGA          40.8     33.6      -73.9    -84.4
##  6 ORD   EWR          40.7     42.0      -74.2    -87.9
##  7 FLL   EWR          40.7     26.1      -74.2    -80.2
##  8 IAD   LGA          40.8     38.9      -73.9    -77.5
##  9 MCO   JFK          40.6     28.4      -73.8    -81.3
## 10 ORD   LGA          40.8     42.0      -73.9    -87.9
## # … with 336,766 more rows

3. Is there a relationship between the age of a plane and its delays?

After fitting a linear regression line to the age versus delays plot where age is the explanatory variable and delays is the response variable it appears that delay seems to decrease slightly with the age of a plane.

plane_age <- planes %>%
  mutate(age = 2013 - year) %>%
  select(tailnum, age)
head(plane_age)
## # A tibble: 6 x 2
##   tailnum   age
##   <chr>   <dbl>
## 1 N10156      9
## 2 N102UW     15
## 3 N103US     14
## 4 N104UW     14
## 5 N10575     11
## 6 N105UW     14
p3.13.4.6 <- flights %>%
  #can also do, by = "tailnum"
  inner_join(plane_age, by = c("tailnum")) %>%
  group_by(age) %>%
  filter(!is.na(dep_delay) & !is.na(age)) %>%
  #why does mean(arr_delay) give me weird data
  summarise(delay = mean(dep_delay)) %>%
  ggplot(aes(x = age, y = delay)) +
  geom_point(color = "#001f54")+
  geom_smooth(method = lm, color = "#c9a227") +
  labs(title = "Flight Delay by Age of a Plane", 
       subtitle = "Relationship between the age of a plane and its delays" , 
       caption = "Dataset: nycflights13") +
  xlab("Age of the Plane (in years)") +
  ylab("Delay") +
  geom_line()
p3.13.4.6
## `geom_smooth()` using formula 'y ~ x'

4. What weather conditions make it more likely to see a delay?

Just looking at the individual bi-variate relationship between departure delay and each weather condition, there does not appear to be any strong correlation. While precipitation seems to make the largest difference, it is still not as prominent making it tough to use weather condition as a good predictor for flight delay.

p4.13.4.6 <- flights %>% 
  #why does by = "year" give a limit error?
  left_join(weather, by = c('year','month','day','hour', 'origin', 'time_hour')) %>% 
  filter(!is.na(temp) & !is.na(dewp) & !is.na(humid) & 
           !is.na(precip) & !is.na(pressure) & !is.na(visib) & 
           !is.na(wind_dir) & !is.na(wind_gust) & !is.na(wind_speed)) %>%
  gather(condition, value, c(temp, dewp, humid, precip, pressure, visib, wind_dir, wind_gust, wind_speed)) %>%
  filter(!is.na(dep_delay)) %>%
  ggplot(mapping = aes(x = value, y = dep_delay, color = condition)) +
  geom_point() +
  labs(title = "Flight Delay by Weather Condition", 
       subtitle = "Weather conditions that are more likely to see a delay" , 
       caption = "Dataset: nycflights13") +
  xlab("Weather Stats") +
  ylab("Delay") +
  facet_wrap(~condition, ncol = 3, scale = "free") +
  scale_color_manual( name = "Weather Condition", 
                     labels = c("Dew Point", "Humidity", "Precipitation", "Pressure", "Temperature", "Visibility", "Wind Direction", "Wind Speed"), 
                     values = c("#edc531", "#dbb42c", "#c9a227", "#682a92", "#5d2689", "#522882", "#065a82", "#1b3b6f","#21295c" )) 
p4.13.4.6

5. What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.

According to the spatial pattern of delays there appears to be large delays in the southeast region. Cross-referencing this information reveals that nationwide, more than 900 flights were canceled and 7,400 delayed, according to flight-tracking service FlightStats due to a line of strong storms moving in the mid-Atlantic on June 13, 2013.

p5.13.4.6 <- flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
  borders("state") +
  geom_point(alpha = 0.9) + 
  labs(title = "Spatial Pattern of Flights Delayed", 
       subtitle = "On June 13, 2013", 
       caption = "Dataset: nycflights13") +
  xlab("Longitude") +
  ylab("Latitude") +
  coord_quickmap() +
  scale_color_gradient2(low = "#ffd60a", mid = "#ffd60a", high = "#03045e")
p5.13.4.6
## Warning: Removed 3 rows containing missing values (geom_point).

Exercises 13.5.1

1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)

Flights that have a missing tail number all have missing values in the arrival time column, meaning that the flight was canceled. For those tailnum that don’t have a matching record in plane, it seems most of them come from the same two carriers.

q1a.13.5.1 <- flights %>%
  filter(is.na(tailnum), !is.na(arr_time)) %>%
  nrow()
q1a.13.5.1
## [1] 0
q1b.13.5.1 <- flights %>%
  anti_join(planes, by = 'tailnum') %>%
  group_by(carrier) %>%
  summarize(n = n()) %>%
  arrange(desc(n))
q1b.13.5.1
## # A tibble: 10 x 2
##    carrier     n
##    <chr>   <int>
##  1 MQ      25397
##  2 AA      22558
##  3 UA       1693
##  4 9E       1044
##  5 B6        830
##  6 US        699
##  7 FL        187
##  8 DL        110
##  9 F9         50
## 10 WN         38

2. Filter flights to only show flights with planes that have flown at least 100 flights.

q2.13.5.1 <- flights %>% 
  group_by(tailnum) %>% 
  filter(n() > 100) 
q2.13.5.1
## # A tibble: 229,202 x 19
## # Groups:   tailnum [1,201]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # … with 229,192 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

q3.13.5.1 <- fueleconomy::vehicles %>%
  semi_join(fueleconomy::common, by = c("make", "model"))
q3.13.5.1
## # A tibble: 14,531 x 12
##       id make  model   year class   trans  drive     cyl displ fuel    hwy   cty
##    <dbl> <chr> <chr>  <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
##  1  1833 Acura Integ…  1986 Subcom… Autom… Front-…     4   1.6 Regu…    28    22
##  2  1834 Acura Integ…  1986 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  3  3037 Acura Integ…  1987 Subcom… Autom… Front-…     4   1.6 Regu…    28    22
##  4  3038 Acura Integ…  1987 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  5  4183 Acura Integ…  1988 Subcom… Autom… Front-…     4   1.6 Regu…    27    22
##  6  4184 Acura Integ…  1988 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  7  5303 Acura Integ…  1989 Subcom… Autom… Front-…     4   1.6 Regu…    27    22
##  8  5304 Acura Integ…  1989 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  9  6442 Acura Integ…  1990 Subcom… Autom… Front-…     4   1.8 Regu…    24    20
## 10  6443 Acura Integ…  1990 Subcom… Manua… Front-…     4   1.8 Regu…    26    21
## # … with 14,521 more rows

4. Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?

Although, cross-referencing this information did not turn up any results, it appears to be that high precipitation, high pressure and high wind direction were recorded around June 30th and July 1st 2013 which may have caused flight delays within a 48 hour period in 2013.

p4a.13.5.1 <- flights %>% 
  ggplot(aes(x=time_hour, y=arr_delay)) +
  geom_smooth(color = "#0f4c5c") +
  labs(title = "Worst Delays over the course of 2013", 
       subtitle = "Within a 48 hour period", 
       caption = "Dataset: nycflights13") +
  xlab("Jan 2013 - Dec 2013") + 
  ylab("Delay")
p4a.13.5.1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9430 rows containing non-finite values (stat_smooth).

p4b.13.5.1 <- flights %>% 
  filter((month == 6 & day %in% c(29,30)) | (month == 7 & day %in% c(1,2))) %>%
  ggplot(aes(x=time_hour, y=arr_delay)) + 
  geom_smooth(color = "#0f4c5c") +
  labs(title = "Worst Delays over the course of 2013", 
       subtitle = "Within a 48 hour period", 
       caption = "Dataset: nycflights13") +
  xlab("Date (from 06/30 to 07/02)") + 
  ylab("Delay")
p4b.13.5.1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 221 rows containing non-finite values (stat_smooth).

q4.13.5.1 <- flights %>% 
  filter(time_hour > "2013-06-30 07:00:00" & time_hour < "2013-07-01 13:00:00" ) %>%
  left_join(weather, by = c("origin", "time_hour")) 
q4.13.5.1
## # A tibble: 1,244 x 32
##    year.x month.x day.x dep_time sched_dep_time dep_delay arr_time
##     <int>   <int> <int>    <int>          <int>     <dbl>    <int>
##  1   2013       6    30       12           2231       101      352
##  2   2013       6    30       21           2300        81      116
##  3   2013       6    30       23           2055       208      123
##  4   2013       6    30       25           2359        26      413
##  5   2013       6    30       43           2250       113      150
##  6   2013       6    30       56           2245       131      201
##  7   2013       6    30      116           2359        77      451
##  8   2013       6    30      153           2245       188      422
##  9   2013       6    30      217           2359       138      545
## 10   2013       6    30      754            800        -6     1143
## # … with 1,234 more rows, and 25 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour.x <dbl>, minute <dbl>,
## #   time_hour <dttm>, year.y <int>, month.y <int>, day.y <int>, hour.y <int>,
## #   temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>,
## #   wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>
p4c.13.5.1 <- q4.13.5.1 %>% 
  gather(key = "weather", value = "amount", 
         temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib) %>% 
  ggplot(aes(x = time_hour, y = amount, color = weather)) + 
  geom_smooth(na.rm = TRUE) + 
  labs(title = "Worst Delays over the course of 2013", 
       subtitle = "Within a 48 hour period", 
       caption = "Dataset: nycflights13") +
  xlab("Time of the Day") + 
  ylab("Weather Stats") + 
  theme(axis.text.x = element_text(angle = 80, vjust = .55)) +
  facet_wrap(vars(weather), scales = "free", ncol = 4) +
  scale_color_manual(name = "Weather Condition", 
                     labels = c("Dew Point", "Humidity", "Precipitation", "Pressure", "Temperature", "Visibility", "Wind Direction", "Wind Speed") ,  
                     values = c("#b7094c", "#a01a58", "#892b64", "#723c70", "#5c4d7d", "#455e89", "#1780a1", "#0091ad"))
p4c.13.5.1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

5. What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?

The expression anti_join(flights, airports, by = c(“dest” = “faa”)) returns the flights that went to an airport that is not in the FAA list of destinations. The expression anti_join(airports, flights, by = c(“faa” = “dest”)) returns the US airports that were not the destination of any flight in the data.

6. You might expect that there is an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you have learned above.

The hypothesis that each plane is flown by a single airline should be rejected because as seen below, there were 17 planes that were flown by over 1 carrier.

q6.13.5.1 <- flights %>%
  select(carrier, tailnum) %>%
  group_by(tailnum) %>%
  summarize(n = length(unique(carrier))) %>%
  filter(n > 1)
q6.13.5.1
## # A tibble: 18 x 2
##    tailnum     n
##    <chr>   <int>
##  1 N146PQ      2
##  2 N153PQ      2
##  3 N176PQ      2
##  4 N181PQ      2
##  5 N197PQ      2
##  6 N200PQ      2
##  7 N228PQ      2
##  8 N232PQ      2
##  9 N933AT      2
## 10 N935AT      2
## 11 N977AT      2
## 12 N978AT      2
## 13 N979AT      2
## 14 N981AT      2
## 15 N989AT      2
## 16 N990AT      2
## 17 N994AT      2
## 18 <NA>        7