# Load the flights table from a local CSV file in the working directory.
# The file name must be wrapped in straight quotes: typographic (curly)
# quotes are not string delimiters in R and make this line fail to parse.
flights <- read.csv("flights.csv")

# Install TinyTeX only when it is not already the LaTeX distribution in use,
# so that re-running this script does not attempt a second installation.
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}

Question #1

# Destination codes that occur in flights but have no matching `faa` code
# in airports (one row per distinct destination).
flights %>%
  anti_join(airports, by = c("dest" = "faa")) %>%
  distinct(dest)
## # A tibble: 4 × 1
##   dest 
##   <chr>
## 1 BQN  
## 2 SJU  
## 3 STT  
## 4 PSE

anti_join(flights, airports, by = c("dest" = "faa")) tells us which destinations in flights are not listed in the airports dataset. This could be because the airports dataset has missing or incomplete data.

# Airports (code and name) whose `faa` code never occurs as a `dest`
# in flights.
airports %>%
  anti_join(flights, by = c("faa" = "dest")) %>%
  distinct(faa, name)
## # A tibble: 1,357 × 2
##    faa   name                          
##    <chr> <chr>                         
##  1 04G   Lansdowne Airport             
##  2 06A   Moton Field Municipal Airport 
##  3 06C   Schaumburg Regional           
##  4 06N   Randall Airport               
##  5 09J   Jekyll Island Airport         
##  6 0A9   Elizabethton Municipal Airport
##  7 0G6   Williams County Airport       
##  8 0G7   Finger Lakes Regional Airport 
##  9 0P2   Shoestring Aviation Airfield  
## 10 0S9   Jefferson County Intl         
## # ℹ 1,347 more rows

anti_join(airports, flights, by = c("faa" = "dest")) returns the airports in the airports dataset that never appear as a destination in flights, because no flight from any of the NYC airports flew to them. It looks like there are 1,357 airports that were not a destination for NYC flights.

Question #2

# Identify the calendar day with the highest mean departure delay:
# average dep_delay per (year, month, day), sort descending, keep the top row.
longest_delay_day <- flights %>%
  group_by(year, month, day) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 1)
print(longest_delay_day)
## # A tibble: 1 × 4
##    year month   day avg_dep_delay
##   <int> <int> <int>         <dbl>
## 1  2013     3     8          83.5
# Attach to every flight the weather record that shares its date, origin
# and hour.
flights_weather <- left_join(
  flights,
  weather,
  by = c("year", "month", "day", "origin", "hour")
)

# Restrict to flights on the day found in `longest_delay_day`.
delayed_day_weather <- flights_weather %>%
  filter(
    year == longest_delay_day$year &
      month == longest_delay_day$month &
      day == longest_delay_day$day
  )

# Per-origin means of temperature, wind speed and precipitation, averaged
# over the joined flight rows (missing weather values are ignored).
weather_summary <- delayed_day_weather %>%
  group_by(origin) %>%
  summarise(
    across(
      c(temp, wind_speed, precip),
      ~ mean(.x, na.rm = TRUE),
      .names = "avg_{.col}"
    )
  )

print(weather_summary)
## # A tibble: 3 × 4
##   origin avg_temp avg_wind_speed avg_precip
##   <chr>     <dbl>          <dbl>      <dbl>
## 1 EWR        35.6           14.1    0.0477 
## 2 JFK        35.7           18.2    0.00478
## 3 LGA        36.3           18.2    0.0125

What immediately jumps out to me about March 8, 2013, the day with the longest average departure delay for the 3 NYC airports, is that the average temperature was about 36 degrees F. Very little precipitation was recorded, though. Googling the weather for JFK on that day, it also looks like it was cloudy. My guess is that the cold and wind on March 8, 2013 led to delays.

Question #3

# Mean speed per plane model, fastest first. Speed is distance divided by
# air_time / 60 (air_time is presumably in minutes, giving distance per hour).
# Flights missing either air_time or distance are excluded.
fastest_planes <- flights %>%
  left_join(planes, by = "tailnum") %>%
  filter(!is.na(air_time), !is.na(distance)) %>%
  mutate(speed = distance / (air_time / 60)) %>%
  group_by(model) %>%
  summarise(avg_speed = mean(speed, na.rm = TRUE)) %>%
  arrange(desc(avg_speed))

# Show the five fastest models.
head(fastest_planes, n = 5)
## # A tibble: 5 × 2
##   model     avg_speed
##   <chr>         <dbl>
## 1 777-222        483.
## 2 A330-243       480.
## 3 767-424ER      467.
## 4 757-212        456.
## 5 A319-115       455.

The fastest two models are the Boeing 777, with an average speed of 482.63 mph, and the Airbus A330, with an average speed of 480.36 mph. I don’t think it’s surprising that a Boeing is the fastest model, given Boeing’s status as a manufacturer. What I do find interesting is that the average speeds of the 777 and the A330 are both over 480 mph, while the third-fastest model, the Boeing 767, averages 466.6 mph, which is roughly 14-16 mph slower than the top two models.

A quick Google search shows that the maximum speed of common commercial aircraft (the 747, 737, and A380 in this case) is between 614 and 737 mph, so the average speeds in this dataset seem plausible. Planes probably don’t always fly at their maximum speed, and the average over a whole flight includes takeoff and landing, when the plane is slower than at cruising altitude.

Question #4

# Mean departure delay and flight count by plane age, where age is 2013 minus
# the plane's `year` from the planes table. The plane year is renamed before
# the join so it cannot collide with the `year` column in flights; flights
# with no known plane year are dropped.
plane_delays <- flights %>%
  left_join(
    select(planes, tailnum, plane_year = year),
    by = "tailnum"
  ) %>%
  filter(!is.na(plane_year)) %>%
  mutate(age = 2013 - plane_year) %>%
  group_by(age) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    n_flights = n()
  ) %>%
  arrange(age)

# Print every row rather than the default truncated tibble preview.
print(plane_delays, n = Inf)
## # A tibble: 46 × 3
##      age avg_dep_delay n_flights
##    <dbl>         <dbl>     <int>
##  1     0         10.6       4630
##  2     1          9.64      7252
##  3     2         11.8       6046
##  4     3         12.5       3797
##  5     4         11.0       6632
##  6     5         13.2      17878
##  7     6         13.7      15300
##  8     7         14.6      13203
##  9     8         14.7      14369
## 10     9         16.4      15706
## 11    10         15.0      15069
## 12    11         15.5      23741
## 13    12         14.2      26889
## 14    13         11.7      22334
## 15    14         13.1      19373
## 16    15         13.4      17231
## 17    16          9.49      6008
## 18    17         11.9       1799
## 19    18         12.9       1378
## 20    19         11.8       2714
## 21    20         13.3       3358
## 22    21         10.9       7696
## 23    22         10.7       6002
## 24    23         11.5       5394
## 25    24         11.1       3116
## 26    25         10.6       3856
## 27    26         10.1       3506
## 28    27          9.32      1800
## 29    28          9.79       994
## 30    29          6.84       115
## 31    30         13.2        246
## 32    33          6.16       109
## 33    34         16.1         64
## 34    35         17.2         26
## 35    36          4.51       187
## 36    37          6.93       544
## 37    38         13.0         92
## 38    39          9.37       103
## 39    40          6.18        22
## 40    41         17.6         25
## 41    45         10.5         43
## 42    46         10.3         22
## 43    48         -4            4
## 44    50          8.08        52
## 45    54          9.93       117
## 46    57          5.95        22
# Scatter plot of mean departure delay against plane age; point size is
# mapped to the number of flights behind each average.
plane_delays %>%
  ggplot(aes(x = age, y = avg_dep_delay, size = n_flights)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    x = "Plane Age (Years)",
    y = "Average Departure Delay (Minutes)",
    title = "Average Departure Delay by Plane Age",
    subtitle = "Point size indicates number of flights"
  )

There isn’t a clear linear relationship between plane age and average departure delay, i.e., it isn’t accurate to simply say that planes experience longer departure delays as they age. It looks like the average departure delay rises for planes from 0 to 10 years old, from about 10 minutes to about 15 minutes. Then, from 10 to 20 years old, the average departure delay decreases, from about 15 minutes back down to about 10. It is hard to say why that may be based on the data alone. Planes around 10 years of age appear to experience the longest delays here. After 20 years of age, the data become sparser and less reliable, particularly as planes approach 30 years old and older: the averages are more scattered and there are fewer flights to draw from for these older planes. That may be because fewer planes are still airworthy above age 20 – a quick Google search says that the average lifespan of a commercial airliner is 20-30 years.

# Set the knitr chunk option `echo` to TRUE for all chunks.
knitr::opts_chunk$set(echo = TRUE)
# dplyr: %>%, joins, group_by(), summarize(), arrange(), filter(), etc.
library(dplyr)
library(ggplot2)   # ggplot(), geom_point(), labs(), theme_minimal()
# nycflights13: presumably the source of airports, weather and planes — confirm.
library(nycflights13)
# Destination codes that occur in flights but have no matching `faa` code
# in airports.
flights %>%
  anti_join(airports, by = c("dest" = "faa")) %>%
  distinct(dest)

# Airports (code and name) whose `faa` code never occurs as a `dest`
# in flights.
airports %>%
  anti_join(flights, by = c("faa" = "dest")) %>%
  distinct(faa, name)
# Identify the calendar day with the highest mean departure delay:
# average dep_delay per (year, month, day), sort descending, keep the top row.
longest_delay_day <- flights %>%
  group_by(year, month, day) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 1)
print(longest_delay_day)
# Attach to every flight the weather record that shares its date, origin
# and hour.
flights_weather <- left_join(
  flights,
  weather,
  by = c("year", "month", "day", "origin", "hour")
)

# Restrict to flights on the day found in `longest_delay_day`.
delayed_day_weather <- flights_weather %>%
  filter(
    year == longest_delay_day$year &
      month == longest_delay_day$month &
      day == longest_delay_day$day
  )

# Per-origin means of temperature, wind speed and precipitation, averaged
# over the joined flight rows (missing weather values are ignored).
weather_summary <- delayed_day_weather %>%
  group_by(origin) %>%
  summarise(
    across(
      c(temp, wind_speed, precip),
      ~ mean(.x, na.rm = TRUE),
      .names = "avg_{.col}"
    )
  )

print(weather_summary)
# Mean speed per plane model, fastest first. Speed is distance divided by
# air_time / 60 (air_time is presumably in minutes, giving distance per hour).
# Flights missing either air_time or distance are excluded.
fastest_planes <- flights %>%
  left_join(planes, by = "tailnum") %>%
  filter(!is.na(air_time), !is.na(distance)) %>%
  mutate(speed = distance / (air_time / 60)) %>%
  group_by(model) %>%
  summarise(avg_speed = mean(speed, na.rm = TRUE)) %>%
  arrange(desc(avg_speed))

# Show the five fastest models.
head(fastest_planes, n = 5)
# Mean departure delay and flight count by plane age, where age is 2013 minus
# the plane's `year` from the planes table. The plane year is renamed before
# the join so it cannot collide with the `year` column in flights; flights
# with no known plane year are dropped.
plane_delays <- flights %>%
  left_join(
    select(planes, tailnum, plane_year = year),
    by = "tailnum"
  ) %>%
  filter(!is.na(plane_year)) %>%
  mutate(age = 2013 - plane_year) %>%
  group_by(age) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    n_flights = n()
  ) %>%
  arrange(age)

# Print every row rather than the default truncated tibble preview.
print(plane_delays, n = Inf)

# Scatter plot of mean departure delay against plane age; point size is
# mapped to the number of flights behind each average.
plane_delays %>%
  ggplot(aes(x = age, y = avg_dep_delay, size = n_flights)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    x = "Plane Age (Years)",
    y = "Average Departure Delay (Minutes)",
    title = "Average Departure Delay by Plane Age",
    subtitle = "Point size indicates number of flights"
  )