airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
coord_quickmap()
avg_delay <- flights %>%
group_by(dest) %>%
summarize(delay=mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c(dest = "faa"))
avg_delay %>%
ggplot(aes(lon, lat, color=delay)) +
borders("state") +
geom_point() +
coord_quickmap()
airport_locations <- airports %>%
select(faa, lat, lon)
flights %>%
select(origin, dest) %>%
left_join(airport_locations, by = c("origin" = "faa")) %>%
left_join(airport_locations, by = c("dest" = "faa"), suffix = c(".origin", ".dest"))
## # A tibble: 336,776 x 6
## origin dest lat.origin lon.origin lat.dest lon.dest
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 EWR IAH 40.7 -74.2 30.0 -95.3
## 2 LGA IAH 40.8 -73.9 30.0 -95.3
## 3 JFK MIA 40.6 -73.8 25.8 -80.3
## 4 JFK BQN 40.6 -73.8 NA NA
## 5 LGA ATL 40.8 -73.9 33.6 -84.4
## 6 EWR ORD 40.7 -74.2 42.0 -87.9
## 7 EWR FLL 40.7 -74.2 26.1 -80.2
## 8 LGA IAD 40.8 -73.9 38.9 -77.5
## 9 JFK MCO 40.6 -73.8 28.4 -81.3
## 10 LGA ORD 40.8 -73.9 42.0 -87.9
## # … with 336,766 more rows
delays_by_age <- inner_join(flights, select(planes, tailnum, plane_year=year), by = "tailnum") %>%
mutate(age = 2013 - plane_year) %>%
filter(!is.na(age)) %>%
group_by(age) %>%
summarize(avg_delay = mean(arr_delay, na.rm = TRUE))
ggplot(delays_by_age, aes(x=age, y=avg_delay)) +
geom_point()
There does seem to be a strong positive correlation between plane age and arrival delay until the plane is about 10 years old, at which point the trend reverses to a negative correlation between plane age and arrival delay.
delays_by_age <- inner_join(flights, select(planes, tailnum, plane_year=year), by = "tailnum") %>%
mutate(age = 2013 - plane_year) %>%
filter(!is.na(age)) %>%
group_by(age) %>%
summarize(avg_delay = mean(dep_delay, na.rm = TRUE))
ggplot(delays_by_age, aes(x=age, y=avg_delay)) +
geom_point()
Graphing age against departure delays shows similar trends to arrival delays.
Precipitation and wind speed seem to have a positive correlation with departure delays - higher levels of precipitation and higher wind speeds correspond to delayed flights.
Gust speed did not appear to have much of a relationship with departure delay, except in the one outlier case of an over 60mph gust. Visibility had a negative correlation with departure delays - low visibility meant the flight was more likely to be delayed.
weather_delay <- flights %>%
inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
group_by(precip) %>%
summarize(precip_delay=mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x=precip, y=precip_delay)) +
geom_point()
weather_delay <- flights %>%
inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
group_by(wind_speed) %>%
summarize(wind_delay=mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x=wind_speed, y=wind_delay)) +
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
weather_delay <- flights %>%
inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
group_by(wind_gust) %>%
summarize(gust_delay=mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x=wind_gust, y=gust_delay)) +
geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
weather_delay <- flights %>%
inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
group_by(visib) %>%
summarize(visib_delay=mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x=visib, y=visib_delay)) +
geom_point()
On June 13, 2013, there was a severe storm in the mid- to south-east part of the United States (https://en.wikipedia.org/wiki/June_12%E2%80%9313,_2013_derecho_series).
flights %>%
filter(year == 2013, month == 6, day == 13) %>%
group_by(dest) %>%
summarize(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
borders("state") +
geom_point() +
coord_quickmap()
## Warning: Removed 3 rows containing missing values (geom_point).
flights %>%
filter(is.na(tailnum))
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 NA 1545 NA NA 1910
## 2 2013 1 2 NA 1601 NA NA 1735
## 3 2013 1 3 NA 857 NA NA 1209
## 4 2013 1 3 NA 645 NA NA 952
## 5 2013 1 4 NA 845 NA NA 1015
## 6 2013 1 4 NA 1830 NA NA 2044
## 7 2013 1 5 NA 840 NA NA 1001
## 8 2013 1 7 NA 820 NA NA 958
## 9 2013 1 8 NA 1645 NA NA 1838
## 10 2013 1 9 NA 755 NA NA 1012
## # … with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Flights without tail numbers do not have departure times or air times, which may mean that they were cancelled. If the flights were cancelled, there was no plane to record since it did not take off.
top_flyers <- flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
count() %>%
filter(n>=100)
top_flyers
## # A tibble: 1,217 x 2
## # Groups: tailnum [1,217]
## tailnum n
## <chr> <int>
## 1 N0EGMQ 371
## 2 N10156 153
## 3 N10575 289
## 4 N11106 129
## 5 N11107 148
## 6 N11109 148
## 7 N11113 138
## 8 N11119 148
## 9 N11121 154
## 10 N11127 124
## # … with 1,207 more rows
fueleconomy::vehicles %>%
semi_join(fueleconomy::common, by = c("make", "model"))
## # A tibble: 14,531 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1833 Acura Integ… 1986 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 2 1834 Acura Integ… 1986 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 3 3037 Acura Integ… 1987 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 4 3038 Acura Integ… 1987 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 5 4183 Acura Integ… 1988 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 6 4184 Acura Integ… 1988 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 7 5303 Acura Integ… 1989 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 8 5304 Acura Integ… 1989 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 9 6442 Acura Integ… 1990 Subcom… Autom… Front-… 4 1.8 Regu… 24 20
## 10 6443 Acura Integ… 1990 Subcom… Manua… Front-… 4 1.8 Regu… 26 21
## # … with 14,521 more rows
delay_weather <- flights %>%
inner_join(weather, by= c("origin", "year", "month", "day", "hour")) %>%
arrange(desc(dep_delay)) %>%
slice(1:48)
delay_weather
## # A tibble: 48 x 29
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # … with 38 more rows, and 21 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour.x <dttm>, temp <dbl>,
## # dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
I do not see any strong patterns.
anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
The anti join here shows rows in flights that had destinations that are not on the FAA destinations list.
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo…
## 2 06A Moton Field Municipal A… 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo…
## 6 0A9 Elizabethton Municipal … 36.4 -82.2 1593 -5 A America/New_Yo…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo…
## 8 0G7 Finger Lakes Regional A… 42.9 -76.8 492 -5 A America/New_Yo…
## 9 0P2 Shoestring Aviation Air… 39.8 -76.6 1000 -5 U America/New_Yo…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An…
## # … with 1,347 more rows
This anti join shows FAA destinations that no flights on our flights list arrived in.
plane_airline <- flights %>%
filter(!is.na(tailnum)) %>%
distinct(tailnum, carrier)
multi_airline <- plane_airline %>%
group_by(tailnum) %>%
filter(n() > 1) %>%
arrange(tailnum)
multi_airline
## # A tibble: 34 x 2
## # Groups: tailnum [17]
## carrier tailnum
## <chr> <chr>
## 1 9E N146PQ
## 2 EV N146PQ
## 3 9E N153PQ
## 4 EV N153PQ
## 5 9E N176PQ
## 6 EV N176PQ
## 7 9E N181PQ
## 8 EV N181PQ
## 9 9E N197PQ
## 10 EV N197PQ
## # … with 24 more rows
There is not necessarily an implicit relationship between plane and airline; the data show that planes can be flown by multiple carriers (potentially due to airline partnerships such as the OneWorld Alliance, which could mean that a British Airways flight could be operated by American Airlines, for example).