library(nycflights13)
library(tidyverse)
13.4.6 (1)
Compute the average departure delay by destination:
# Bind the `flights` table found on the search path to a workspace variable.
flights <- flights

# Mean departure delay per destination. Flights with no recorded departure
# delay are removed first, so mean() never receives an NA.
delay <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(dest) %>%
  summarise(avg_delay = mean(dep_delay))
# Bind the `airports` table found on the search path to a workspace variable.
airports <- airports

# Attach airport coordinates to each destination's average delay, then draw
# the destinations over US state outlines, coloured by average delay.
# right_join() keeps every row of `delay`, including destinations that have
# no matching `faa` code in `airports`.
airports %>%
  right_join(y = delay, by = c("faa" = "dest")) %>%
  ggplot(mapping = aes(x = lon, y = lat, colour = avg_delay)) +
  borders(database = "state") +
  geom_point() +
  coord_quickmap()
Make two data frames holding each airport's code and its lon and lat columns:
# Both lookup tables start as the same three airport columns: the FAA code
# (used later as the join key) and the two coordinates.
origin_locations <- select(airports, faa, lon, lat)
dest_locations <- select(airports, faa, lon, lat)
Rename the coordinate columns with origin and destination prefixes:
# Prefix the coordinate columns so the origin and destination coordinates
# stay distinguishable once both tables are joined onto the same data frame.
origin_locations <- origin_locations %>%
  rename(origin_lon = lon, origin_lat = lat)
dest_locations <- dest_locations %>%
  rename(dest_lon = lon, dest_lat = lat)
Join the locations onto a new flights data frame:
# Add the columns of `origin_locations` (matched on the flight's origin) and
# of `dest_locations` (matched on its destination) to every flight.
# left_join() keeps all flights, even when an airport code has no match.
flights2 <- flights %>%
  left_join(origin_locations, by = c("origin" = "faa")) %>%
  left_join(dest_locations, by = c("dest" = "faa"))

# Print the first rows to check the result.
head(flights2)
Compute average arrival delay for each plane:
# Mean arrival delay per tail number, ignoring flights with no recorded
# arrival delay.
plane_delays <- flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(mean_delay = mean(arr_delay))

# Bind the `planes` table found on the search path to a workspace variable.
planes <- planes

# Add the plane metadata to each tail number's mean delay; tail numbers with
# no entry in `planes` are kept with NA metadata.
plane_data <- left_join(x = plane_delays, y = planes, by = "tailnum")
Overall there is very little correlation, but to the extent that there is one, it seems that newer planes are subject to longer delays.
# `airport_weather` was used here without being created anywhere earlier in
# the script. Build it by matching each flight to the hourly weather
# observation at its origin airport, joining on every column the two tables
# share, so the weather columns (humid, pressure, temp, ...) sit next to
# arr_delay.
airport_weather <- flights %>%
  inner_join(
    weather,
    by = c("origin", "year", "month", "day", "hour", "time_hour")
  )

# Scatter plot of arrival delay against relative humidity.
airport_weather %>%
  ggplot() +
  geom_point(mapping = aes(x = humid, y = arr_delay))
Arrival delay is only weakly correlated with humidity.
# `airport_weather` must already exist when this runs; it is not created in
# this block.
# Scatter plot of arrival delay against humidity.
ggplot(data = airport_weather) +
  geom_point(mapping = aes(x = humid, y = arr_delay))

# Scatter plot of arrival delay against pressure.
ggplot(data = airport_weather) +
  geom_point(mapping = aes(x = pressure, y = arr_delay))
No clear correlation between pressure and arrival delay
# `airport_weather` must already exist when this runs; it is not created in
# this block.
# Scatter plot of arrival delay against temperature.
ggplot(data = airport_weather) +
  geom_point(mapping = aes(x = temp, y = arr_delay))
No clear correlation between temperature and delay
# Mean arrival delay per destination for flights on 13 June 2013.
# year, month and day are compared with numeric literals; comparing them
# with strings only worked through implicit coercion to character.
june_13_2013 <- flights %>%
  filter(year == 2013, month == 6, day == 13, !is.na(arr_delay)) %>%
  group_by(dest) %>%
  summarise(avg_delay = mean(arr_delay))

# Map the destinations over US state outlines, with both colour and point
# size showing that day's average arrival delay. right_join() keeps every
# destination in `june_13_2013`, even one with no match in `airports`.
airports %>%
  right_join(june_13_2013, by = c("faa" = "dest")) %>%
  ggplot(aes(lon, lat, color = avg_delay, size = avg_delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
There were tornadoes in Ohio on June 13, 2013, and we see large delays in Ohio on that day. We also see large delays elsewhere, but these could be due in part to a ripple effect from the delays and weather in Ohio.
13.5.1
They have scheduled arrival and departure times, but no actual arrival or departure times and no air time. My guess is that they represent canceled flights.
# Tail numbers that appear on more than 100 flights.
# Flights with a missing tail number are dropped first: group_by() would
# otherwise pool them into a single NA "plane" whose count can pass the
# threshold, and every NA-tailnum flight would then be selected below.
flights_over_100 <- flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(total_flights = n()) %>%
  filter(total_flights > 100)

# Keep only the flights flown by those planes. semi_join() filters `flights`
# by the matching tail numbers without adding any columns.
flights %>%
  semi_join(flights_over_100, by = "tailnum")
# Pull both tables out of the fueleconomy package.
common <- fueleconomy::common
vehicles <- fueleconomy::vehicles

# Keep the vehicles that have a match in `common`. No `by` is given, so the
# join uses every column name the two tables share and reports which ones
# it chose.
common_vehicles <- vehicles %>%
  inner_join(common)
Joining, by = c("make", "model")
Filter for the 48 hours with the most total arrival delay:
# Total arrival delay for every scheduled hour, largest first, keeping only
# the top 48 hours. Flights with no recorded arrival delay are excluded so
# that sum() never receives an NA.
worst_hours <- flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(time_hour) %>%
  summarise(delay_this_hour = sum(arr_delay)) %>%
  arrange(desc(delay_this_hour)) %>%
  head(n = 48)
Join with weather data
# Keep only the weather observations recorded during one of the worst hours
# and carry that hour's total delay along with them.
weather_delay <- weather %>%
  inner_join(worst_hours, by = "time_hour")
Compute stats for this time period and compare with overall data
# Per-column summary statistics of the weather during the worst hours.
weather_delay %>%
  summary()
origin year month day hour temp dewp humid wind_dir
Length:143 Min. :2013 Min. : 3.00 Min. : 2.00 Min. : 8.00 Min. :30.92 Min. :28.04 Min. : 30.95 Min. : 0.0
Class :character 1st Qu.:2013 1st Qu.: 6.00 1st Qu.: 8.00 1st Qu.:17.00 1st Qu.:71.51 1st Qu.:62.33 1st Qu.: 63.53 1st Qu.:140.0
Mode :character Median :2013 Median : 7.00 Median :13.00 Median :17.00 Median :77.00 Median :69.08 Median : 78.96 Median :180.0
Mean :2013 Mean : 6.42 Mean :16.03 Mean :17.15 Mean :74.28 Mean :64.43 Mean : 74.44 Mean :180.7
3rd Qu.:2013 3rd Qu.: 7.00 3rd Qu.:23.50 3rd Qu.:18.50 3rd Qu.:82.94 3rd Qu.:71.33 3rd Qu.: 88.25 3rd Qu.:220.0
Max. :2013 Max. :10.00 Max. :30.00 Max. :20.00 Max. :96.08 Max. :75.20 Max. :100.00 Max. :360.0
NA's :2
wind_speed wind_gust precip pressure visib time_hour delay_this_hour
Min. : 0.00 Min. :16.11 Min. :0.00000 Min. : 998.4 Min. : 0.12 Min. :2013-03-08 08:00:00 Min. :5548
1st Qu.:10.36 1st Qu.:20.71 1st Qu.:0.00000 1st Qu.:1009.4 1st Qu.: 8.00 1st Qu.:2013-06-13 18:00:00 1st Qu.:5919
Median :12.66 Median :23.02 Median :0.00000 Median :1010.8 Median :10.00 Median :2013-07-07 19:00:00 Median :6442
Mean :13.37 Mean :23.62 Mean :0.02874 Mean :1011.2 Mean : 8.20 Mean :2013-06-29 07:47:49 Mean :6737
3rd Qu.:16.11 3rd Qu.:25.32 3rd Qu.:0.01000 3rd Qu.:1014.4 3rd Qu.:10.00 3rd Qu.:2013-07-28 15:00:00 3rd Qu.:7336
Max. :33.37 Max. :41.43 Max. :0.48000 Max. :1020.8 Max. :10.00 Max. :2013-10-07 17:00:00 Max. :8961
NA's :97 NA's :40
# Per-column summary statistics of the full weather table, as a baseline.
weather %>%
  summary()
origin year month day hour temp dewp humid wind_dir
Length:26115 Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 0.00 Min. : 10.94 Min. :-9.94 Min. : 12.74 Min. : 0.0
Class :character 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 6.00 1st Qu.: 39.92 1st Qu.:26.06 1st Qu.: 47.05 1st Qu.:120.0
Mode :character Median :2013 Median : 7.000 Median :16.00 Median :11.00 Median : 55.40 Median :42.08 Median : 61.79 Median :220.0
Mean :2013 Mean : 6.504 Mean :15.68 Mean :11.49 Mean : 55.26 Mean :41.44 Mean : 62.53 Mean :199.8
3rd Qu.:2013 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:17.00 3rd Qu.: 69.98 3rd Qu.:57.92 3rd Qu.: 78.79 3rd Qu.:290.0
Max. :2013 Max. :12.000 Max. :31.00 Max. :23.00 Max. :100.04 Max. :78.08 Max. :100.00 Max. :360.0
NA's :1 NA's :1 NA's :1 NA's :460
wind_speed wind_gust precip pressure visib time_hour
Min. : 0.000 Min. :16.11 Min. :0.000000 Min. : 983.8 Min. : 0.000 Min. :2013-01-01 01:00:00
1st Qu.: 6.905 1st Qu.:20.71 1st Qu.:0.000000 1st Qu.:1012.9 1st Qu.:10.000 1st Qu.:2013-04-01 21:30:00
Median : 10.357 Median :24.17 Median :0.000000 Median :1017.6 Median :10.000 Median :2013-07-01 14:00:00
Mean : 10.518 Mean :25.49 Mean :0.004469 Mean :1017.9 Mean : 9.255 Mean :2013-07-01 18:26:37
3rd Qu.: 13.809 3rd Qu.:28.77 3rd Qu.:0.000000 3rd Qu.:1023.0 3rd Qu.:10.000 3rd Qu.:2013-09-30 13:00:00
Max. :1048.361 Max. :66.75 Max. :1.210000 Max. :1042.1 Max. :10.000 Max. :2013-12-30 18:00:00
NA's :4 NA's :20778 NA's :2729
Temperature, dew point, humidity, wind speed, and precipitation were all higher on average during the hours of greatest delay, while pressure and visibility were lower.
anti_join() returns all rows from x that have no matching values in y, keeping just the columns from x.
anti_join(flights, airports, by = c("dest" = "faa")) returns the flights whose destination is not listed in airports.
anti_join(airports, flights, by = c("faa" = "dest")) returns the airports that were not the destination of any flight in the flights data.
# Carrier and tail number for every flight.
planes_carriers <- flights %>%
  select(carrier, tailnum)

# Number of distinct carriers seen for each tail number.
# Missing tail numbers are dropped first: group_by() would otherwise pool
# them into one NA group that mixes flights from unrelated planes and
# carriers and is then reported as a single multi-carrier plane.
unique_carriers <- planes_carriers %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(carriers = n_distinct(carrier))

# Tail numbers that were operated by more than one carrier.
unique_carriers %>%
  filter(carriers > 1)
It seems that some planes correspond to more than one carrier