# Load the flights table from a CSV file in the working directory.
# The file name must be written with straight ASCII quotes; typographic
# quotes are not valid R string delimiters and make the line fail to parse.
flights <- read.csv("flights.csv")

# Install TinyTeX only when the LaTeX distribution in use is not already
# TinyTeX, so re-running the script does not trigger a reinstall attempt.
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}
# Rows of `flights` whose `dest` has no matching `faa` code in `airports`,
# collapsed to one row per distinct destination.
flights %>%
  anti_join(airports, by = c("dest" = "faa")) %>%
  distinct(dest)
## # A tibble: 4 × 1
## dest
## <chr>
## 1 BQN
## 2 SJU
## 3 STT
## 4 PSE
`anti_join(flights, airports, by = c("dest" = "faa"))` tells us which flight destinations are not listed in the airports dataset. This could be because the airports dataset has missing or incomplete data.
# Rows of `airports` whose `faa` code never occurs as a `dest` in `flights`,
# reduced to the distinct (faa, name) pairs.
airports %>%
  anti_join(flights, by = c("faa" = "dest")) %>%
  distinct(faa, name)
## # A tibble: 1,357 × 2
## faa name
## <chr> <chr>
## 1 04G Lansdowne Airport
## 2 06A Moton Field Municipal Airport
## 3 06C Schaumburg Regional
## 4 06N Randall Airport
## 5 09J Jekyll Island Airport
## 6 0A9 Elizabethton Municipal Airport
## 7 0G6 Williams County Airport
## 8 0G7 Finger Lakes Regional Airport
## 9 0P2 Shoestring Aviation Airfield
## 10 0S9 Jefferson County Intl
## # ℹ 1,347 more rows
`anti_join(airports, flights, by = c("faa" = "dest"))` returns the airports in the airports dataset that never appear as a destination in the flights data, i.e., airports that were never flown to from any of the NYC airports. There are 1,357 airports that were not a destination for NYC flights.
# Mean departure delay per calendar day (missing delays are ignored), sorted
# from highest to lowest, keeping only the top row.
longest_delay_day <- flights %>%
  group_by(year, month, day) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 1)

print(longest_delay_day)
## # A tibble: 1 × 4
## year month day avg_dep_delay
## <int> <int> <int> <dbl>
## 1 2013 3 8 83.5
# Attach to every flight the weather row that shares its date, hour and
# origin airport.
flights_weather <- left_join(
  flights,
  weather,
  by = c("year", "month", "day", "origin", "hour")
)

# Restrict the joined data to the single day stored in `longest_delay_day`.
delayed_day_weather <- filter(
  flights_weather,
  year == longest_delay_day$year,
  month == longest_delay_day$month,
  day == longest_delay_day$day
)

# Per-origin means of temperature, wind speed and precipitation, computed
# over the joined flight rows (missing values are ignored).
weather_summary <- delayed_day_weather %>%
  group_by(origin) %>%
  summarise(
    avg_temp = mean(temp, na.rm = TRUE),
    avg_wind_speed = mean(wind_speed, na.rm = TRUE),
    avg_precip = mean(precip, na.rm = TRUE)
  )

print(weather_summary)
## # A tibble: 3 × 4
## origin avg_temp avg_wind_speed avg_precip
## <chr> <dbl> <dbl> <dbl>
## 1 EWR 35.6 14.1 0.0477
## 2 JFK 35.7 18.2 0.00478
## 3 LGA 36.3 18.2 0.0125
What immediately jumps out to me about the day with the longest average departure delay for the 3 NYC airports is that the average temp was about 36 degrees F on March 8, 2013. Very little precipitation, though. Googling the weather for JFK on that day, it also looks like it was cloudy. My guess is the cold and wind on March 8, 2013 led to delays.
# Average speed per aircraft model, where speed = distance / (air_time / 60).
# NOTE(review): this is mph only if distance is in miles and air_time is in
# minutes -- confirm against the dataset documentation.
fastest_planes <- flights %>%
  left_join(planes, by = "tailnum") %>%
  # Drop flights lacking either input of the speed calculation.
  filter(!is.na(air_time), !is.na(distance)) %>%
  mutate(speed = distance / (air_time / 60)) %>%
  group_by(model) %>%
  summarise(avg_speed = mean(speed, na.rm = TRUE)) %>%
  arrange(desc(avg_speed))

head(fastest_planes, 5)
## # A tibble: 5 × 2
## model avg_speed
## <chr> <dbl>
## 1 777-222 483.
## 2 A330-243 480.
## 3 767-424ER 467.
## 4 757-212 456.
## 5 A319-115 455.
The fastest two models are the Boeing 777, with an average speed of 482.63mph, and the Airbus A330, with an average speed of 480.36mph. I don’t think it’s surprising that a Boeing is the fastest model, given their status as a manufacturer. What I do find interesting is that the 777 and the A330 both average over 480mph, while the third-fastest model, the Boeing 767, averages 466.6mph, which is ~14mph slower than the top two models.
A quick Google search shows that the max speed for common commercial aircraft (the 747, 737, and A380 in this case) is between 614 and 737mph. So the average speeds in this dataset probably make sense. Planes probably don’t always max out their speeds on flights, and the averages include takeoff and landing, when planes fly more slowly than at cruising altitude.
# Mean departure delay and flight count for each plane age.
# `year.y` is the `year` column contributed by `planes` in the join; age is
# measured relative to 2013. Flights with no known age are dropped.
plane_delays <- flights %>%
  left_join(planes, by = "tailnum") %>%
  mutate(age = 2013 - year.y) %>%
  filter(!is.na(age)) %>%
  group_by(age) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    n_flights = n()
  ) %>%
  arrange(age)

# n = Inf prints every row instead of the default truncated preview.
print(plane_delays, n = Inf)
## # A tibble: 46 × 3
## age avg_dep_delay n_flights
## <dbl> <dbl> <int>
## 1 0 10.6 4630
## 2 1 9.64 7252
## 3 2 11.8 6046
## 4 3 12.5 3797
## 5 4 11.0 6632
## 6 5 13.2 17878
## 7 6 13.7 15300
## 8 7 14.6 13203
## 9 8 14.7 14369
## 10 9 16.4 15706
## 11 10 15.0 15069
## 12 11 15.5 23741
## 13 12 14.2 26889
## 14 13 11.7 22334
## 15 14 13.1 19373
## 16 15 13.4 17231
## 17 16 9.49 6008
## 18 17 11.9 1799
## 19 18 12.9 1378
## 20 19 11.8 2714
## 21 20 13.3 3358
## 22 21 10.9 7696
## 23 22 10.7 6002
## 24 23 11.5 5394
## 25 24 11.1 3116
## 26 25 10.6 3856
## 27 26 10.1 3506
## 28 27 9.32 1800
## 29 28 9.79 994
## 30 29 6.84 115
## 31 30 13.2 246
## 32 33 6.16 109
## 33 34 16.1 64
## 34 35 17.2 26
## 35 36 4.51 187
## 36 37 6.93 544
## 37 38 13.0 92
## 38 39 9.37 103
## 39 40 6.18 22
## 40 41 17.6 25
## 41 45 10.5 43
## 42 46 10.3 22
## 43 48 -4 4
## 44 50 8.08 52
## 45 54 9.93 117
## 46 57 5.95 22
# Scatter plot of mean departure delay against plane age; point size is
# mapped to the number of flights behind each point.
ggplot(
  data = plane_delays,
  mapping = aes(x = age, y = avg_dep_delay, size = n_flights)
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Average Departure Delay by Plane Age",
    subtitle = "Point size indicates number of flights",
    x = "Plane Age (Years)",
    y = "Average Departure Delay (Minutes)"
  ) +
  theme_minimal()
There isn’t a clear linear relationship between plane age and average
departure delays, i.e., it isn’t accurate to simply say that as a plane
ages it experiences greater departure delay times. It looks like planes
from 0-10 years old experience a rise in departure delay times, from
about a 10 minute to 15 minute delay time. Then, from 10-20 years old,
planes experience a decrease in departure delay time, from 15 minutes
back down to 10. Hard to say why that may be just based on data alone.
Planes around 10 years of age appear to experience the most delays here.
After 20 years old, the data become sparser and less reliable,
particularly as planes get closer to 30 years old and older. The data is
more scattered and there are fewer flights to draw from for these older
planes. That may be because there are fewer planes that are still fit to
fly above age 20 – a quick Google search says that the average lifespan
of a commercial airliner is 20-30 years.
# Document setup: echo = TRUE is set as the default chunk option so chunk
# source code is shown in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(ggplot2) # required for the ggplot() scatter plot at the end of the script
# NOTE(review): presumably the source of the flights, airports, weather and
# planes tables used below -- confirm.
library(nycflights13)
# Rows of `flights` whose `dest` has no matching `faa` code in `airports`,
# collapsed to one row per distinct destination.
flights %>%
  anti_join(airports, by = c("dest" = "faa")) %>%
  distinct(dest)

# Rows of `airports` whose `faa` code never occurs as a `dest` in `flights`,
# reduced to the distinct (faa, name) pairs.
airports %>%
  anti_join(flights, by = c("faa" = "dest")) %>%
  distinct(faa, name)
# Mean departure delay per calendar day (missing delays are ignored), sorted
# from highest to lowest, keeping only the top row.
longest_delay_day <- flights %>%
  group_by(year, month, day) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 1)

print(longest_delay_day)
# Attach to every flight the weather row that shares its date, hour and
# origin airport.
flights_weather <- left_join(
  flights,
  weather,
  by = c("year", "month", "day", "origin", "hour")
)

# Restrict the joined data to the single day stored in `longest_delay_day`.
delayed_day_weather <- filter(
  flights_weather,
  year == longest_delay_day$year,
  month == longest_delay_day$month,
  day == longest_delay_day$day
)

# Per-origin means of temperature, wind speed and precipitation, computed
# over the joined flight rows (missing values are ignored).
weather_summary <- delayed_day_weather %>%
  group_by(origin) %>%
  summarise(
    avg_temp = mean(temp, na.rm = TRUE),
    avg_wind_speed = mean(wind_speed, na.rm = TRUE),
    avg_precip = mean(precip, na.rm = TRUE)
  )

print(weather_summary)
# Average speed per aircraft model, where speed = distance / (air_time / 60).
# NOTE(review): this is mph only if distance is in miles and air_time is in
# minutes -- confirm against the dataset documentation.
fastest_planes <- flights %>%
  left_join(planes, by = "tailnum") %>%
  # Drop flights lacking either input of the speed calculation.
  filter(!is.na(air_time), !is.na(distance)) %>%
  mutate(speed = distance / (air_time / 60)) %>%
  group_by(model) %>%
  summarise(avg_speed = mean(speed, na.rm = TRUE)) %>%
  arrange(desc(avg_speed))

head(fastest_planes, 5)
# Mean departure delay and flight count for each plane age.
# `year.y` is the `year` column contributed by `planes` in the join; age is
# measured relative to 2013. Flights with no known age are dropped.
plane_delays <- flights %>%
  left_join(planes, by = "tailnum") %>%
  mutate(age = 2013 - year.y) %>%
  filter(!is.na(age)) %>%
  group_by(age) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    n_flights = n()
  ) %>%
  arrange(age)

# n = Inf prints every row instead of the default truncated preview.
print(plane_delays, n = Inf)
# Scatter plot of mean departure delay against plane age; point size is
# mapped to the number of flights behind each point.
ggplot(
  data = plane_delays,
  mapping = aes(x = age, y = avg_dep_delay, size = n_flights)
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Average Departure Delay by Plane Age",
    subtitle = "Point size indicates number of flights",
    x = "Plane Age (Years)",
    y = "Average Departure Delay (Minutes)"
  ) +
  theme_minimal()