13.3.1 Add a surrogate key to flights.
13.4.6 Q1 Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States:
flights %>%
group_by(dest) %>%
summarise(delay = mean(arr_delay)) %>%
inner_join(airports, by = c(dest = "faa")) %>%
ggplot(aes(lon, lat, color = delay)) +
borders("state") +
geom_point() +
coord_quickmap()13.4.6 Q2 Add the location of the origin and destination (i.e. the lat and lon) to flights.
| faa | lat | lon | year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | air_time | distance | hour | minute | time_hour | flight_ID |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ABQ | 35.04022 | -106.6092 | 2013 | 10 | 1 | 1955 | 2001 | -6 | 2213 | 2248 | -35 | B6 | 65 | N554JB | JFK | 230 | 1826 | 20 | 1 | 2013-10-01 20:00:00 | 27882 |
| ABQ | 35.04022 | -106.6092 | 2013 | 10 | 2 | 2010 | 2001 | 9 | 2230 | 2248 | -18 | B6 | 65 | N607JB | JFK | 238 | 1826 | 20 | 1 | 2013-10-02 20:00:00 | 28868 |
| ABQ | 35.04022 | -106.6092 | 2013 | 10 | 3 | 1955 | 2001 | -6 | 2232 | 2248 | -16 | B6 | 65 | N591JB | JFK | 251 | 1826 | 20 | 1 | 2013-10-03 20:00:00 | 29831 |
| ABQ | 35.04022 | -106.6092 | 2013 | 10 | 4 | 2017 | 2001 | 16 | 2304 | 2248 | 16 | B6 | 65 | N662JB | JFK | 257 | 1826 | 20 | 1 | 2013-10-04 20:00:00 | 30849 |
| ABQ | 35.04022 | -106.6092 | 2013 | 10 | 5 | 1959 | 1959 | 0 | 2226 | 2246 | -20 | B6 | 65 | N580JB | JFK | 242 | 1826 | 19 | 59 | 2013-10-05 19:00:00 | 31580 |
| ABQ | 35.04022 | -106.6092 | 2013 | 10 | 6 | 1959 | 2001 | -2 | 2234 | 2248 | -14 | B6 | 65 | N507JB | JFK | 240 | 1826 | 20 | 1 | 2013-10-06 20:00:00 | 32437 |
13.4.6 Q3 Is there a relationship between the age of a plane and its delays?
planes_year_old <- planes %>%
mutate(year_old = 2019 - year) %>%
select(tailnum,year_old)
flights %>%
group_by(tailnum) %>%
summarise(delay = mean(arr_delay, na.rm = T)) %>%
inner_join(planes_year_old, by = c(tailnum = "tailnum")) %>%
ggplot(mapping = aes(x = year_old, y = delay)) +
geom_point() +
labs(x = 'Average Delay', y = 'Plane Age')## Warning: Removed 76 rows containing missing values (geom_point).
There is NO relationship between the age of a plane and its delays
13.4.6 Q4 What weather conditions make it more likely to see a delay?
flight_weather <-
flights %>%
inner_join(weather, by = c(
"origin" = "origin",
"year" = "year",
"month" = "month",
"day" = "day",
"hour" = "hour"
))| year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour.x | flight_ID | temp | dewp | humid | wind_dir | wind_speed | wind_gust | precip | pressure | visib | time_hour.y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 | 1 | 39.02 | 28.04 | 64.43 | 260 | 12.65858 | NA | 0 | 1011.9 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 | 2 | 39.92 | 24.98 | 54.81 | 250 | 14.96014 | 21.86482 | 0 | 1011.4 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 | 3 | 39.02 | 26.96 | 61.63 | 260 | 14.96014 | NA | 0 | 1012.1 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 | 4 | 39.02 | 26.96 | 61.63 | 260 | 14.96014 | NA | 0 | 1012.1 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 | 5 | 39.92 | 24.98 | 54.81 | 260 | 16.11092 | 23.01560 | 0 | 1011.7 | 10 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 | 6 | 39.02 | 28.04 | 64.43 | 260 | 12.65858 | NA | 0 | 1011.9 | 10 | 2013-01-01 05:00:00 |
temp <- flight_weather %>%
filter(!is.na(dep_delay), dep_delay > 0, !is.na(temp)) %>%
ggplot() +
geom_point(mapping = aes(x = temp, y = dep_delay)) +
labs(x = 'Average Dealy', y = 'Temperature')
tempdewp <- flight_weather %>%
filter(!is.na(dep_delay), dep_delay > 0, !is.na(dewp)) %>%
ggplot() +
geom_point(mapping = aes(x = dewp, y = dep_delay)) +
labs(x = 'Average Dealy', y = 'Dew point')
dewphumid <- flight_weather %>%
filter(!is.na(dep_delay), dep_delay > 0, !is.na(humid)) %>%
ggplot() +
geom_point(mapping = aes(x = humid, y = dep_delay)) +
labs(x = 'Average Dealy', y = 'Humidity')
humidwind_speed <- flight_weather %>%
filter(!is.na(dep_delay), dep_delay > 0, !is.na(wind_speed)) %>%
ggplot() +
geom_point(mapping = aes(x = wind_speed, y = dep_delay)) +
labs(x = 'Average Dealy', y = 'Wind Speed')
wind_speedHumid and dewp cause the most delay. when wind speed is increased, delay time is decreased, probadly because when wind speed is increased too much, flights would be cancalled which is filtered in this graph (removed na values)
13.4.6 Q5 What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.
[1] 45.79083
[1] 12.63907
june13delay <- june13 %>%
group_by(hour) %>%
summarise(delay = mean(dep_delay, na.rm = T),
count = n()) %>%
arrange(hour)
kable(june13delay)| hour | delay | count |
|---|---|---|
| 5 | -2.833333 | 6 |
| 6 | 3.542169 | 84 |
| 7 | 2.811594 | 69 |
| 8 | 7.913043 | 73 |
| 9 | 23.245283 | 58 |
| 10 | 27.818182 | 50 |
| 11 | 35.615385 | 47 |
| 12 | 61.340426 | 53 |
| 13 | 51.804348 | 54 |
| 14 | 61.543860 | 67 |
| 15 | 59.169231 | 72 |
| 16 | 52.000000 | 59 |
| 17 | 63.774648 | 76 |
| 18 | 70.000000 | 60 |
| 19 | 92.946429 | 64 |
| 20 | 87.897959 | 54 |
| 21 | 83.320000 | 31 |
| 22 | 103.111111 | 9 |
| 23 | 25.333333 | 3 |
There is a lot more dealys in the afternoon and night than in the morning
weatherJune13 <- weather %>%
filter(month == 6, day == 13)
flight_weather_June13 <-
june13delay %>%
inner_join(weatherJune13, by = c(
"hour" = "hour"
))
mean(weatherJune13$dewp, na.rm = T)[1] 57.3475
[1] 41.43998
flight_weather_June13 %>%
group_by(hour) %>%
summarise(delay = mean(delay, na.rm = T),
humid = mean(humid, na.rm = T),
dewp = mean(dewp, na.rm = T),
wind_speed = mean(wind_speed, na.rm = T)) %>%
ggplot(mapping = aes(x = hour)) +
geom_line(aes(y = delay, color = "Delay")) +
geom_line(aes(y = humid*0.9, color = "Humidity")) +
scale_y_continuous(sec.axis = sec_axis(~./0.9, name = "Humidity")) +
xlab("Delay") + ylab("Hour") + ggtitle("Delay and Humidity VS. Hour")```
flight_weather_June13 %>%
group_by(hour) %>%
summarise(delay = mean(delay, na.rm = T),
humid = mean(humid, na.rm = T),
dewp = mean(dewp, na.rm = T),
wind_speed = mean(wind_speed, na.rm = T)) %>%
ggplot(mapping = aes(x = hour)) +
geom_line(aes(y = delay, color = 'Delay')) +
geom_line(aes(y = dewp*5-200, color = 'Dew Point')) +
scale_y_continuous(sec.axis = sec_axis(~./5+41, name = "Dewpoint")) +
xlab("Delay") + ylab("Hour") + ggtitle("Delay and Dewpoint VS. Hour") ```
Base on Humidity in dewp point, there is a storm in the afternoon on June 13 2013.
13.5.1 Q1 What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)
| year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour | flight_ID |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 2 | NA | 1545 | NA | NA | 1910 | NA | AA | 133 | NA | JFK | LAX | NA | 2475 | 15 | 45 | 2013-01-02 15:00:00 | 1783 |
| 2013 | 1 | 2 | NA | 1601 | NA | NA | 1735 | NA | UA | 623 | NA | EWR | ORD | NA | 719 | 16 | 1 | 2013-01-02 16:00:00 | 1785 |
| 2013 | 1 | 3 | NA | 857 | NA | NA | 1209 | NA | UA | 714 | NA | EWR | MIA | NA | 1085 | 8 | 57 | 2013-01-03 08:00:00 | 2698 |
| 2013 | 1 | 3 | NA | 645 | NA | NA | 952 | NA | UA | 719 | NA | EWR | DFW | NA | 1372 | 6 | 45 | 2013-01-03 06:00:00 | 2699 |
| 2013 | 1 | 4 | NA | 845 | NA | NA | 1015 | NA | 9E | 3405 | NA | JFK | DCA | NA | 213 | 8 | 45 | 2013-01-04 08:00:00 | 3609 |
| 2013 | 1 | 4 | NA | 1830 | NA | NA | 2044 | NA | 9E | 3716 | NA | EWR | DTW | NA | 488 | 18 | 30 | 2013-01-04 18:00:00 | 3610 |
These flights are cancelled
kable(head(flights %>%
filter(!is.na(tailnum)) %>%
anti_join(planes, by = 'tailnum') %>%
group_by(carrier) %>%
summarise(count = n()) %>%
arrange(desc(count))))| carrier | count |
|---|---|
| MQ | 25395 |
| AA | 22474 |
| UA | 1007 |
| B6 | 830 |
| FL | 187 |
| DL | 110 |
MQ and AA carriers have the most missing tailnum in planes
13.5.1 Q2 Filter flights to only show flights with planes that have flown at least 100 flights.
kable(flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
summarise(count = n()) %>%
filter(count >= 100) %>%
arrange(desc(count)) %>% top_n(20))## Selecting by count
| tailnum | count |
|---|---|
| N725MQ | 575 |
| N722MQ | 513 |
| N723MQ | 507 |
| N711MQ | 486 |
| N713MQ | 483 |
| N258JB | 427 |
| N298JB | 407 |
| N353JB | 404 |
| N351JB | 402 |
| N735MQ | 396 |
| N328AA | 393 |
| N228JB | 388 |
| N338AA | 388 |
| N327AA | 387 |
| N335AA | 385 |
| N0EGMQ | 371 |
| N274JB | 370 |
| N324JB | 370 |
| N229JB | 364 |
| N534MQ | 364 |
13.5 Q3 Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.
## Warning: Detecting old grouped_df format, replacing `vars` attribute by
## `groups`
| id | make | model | year | class | trans | drive | cyl | displ | fuel | hwy | cty |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1833 | Acura | Integra | 1986 | Subcompact Cars | Automatic 4-spd | Front-Wheel Drive | 4 | 1.6 | Regular | 28 | 22 |
| 1834 | Acura | Integra | 1986 | Subcompact Cars | Manual 5-spd | Front-Wheel Drive | 4 | 1.6 | Regular | 28 | 23 |
| 3037 | Acura | Integra | 1987 | Subcompact Cars | Automatic 4-spd | Front-Wheel Drive | 4 | 1.6 | Regular | 28 | 22 |
| 3038 | Acura | Integra | 1987 | Subcompact Cars | Manual 5-spd | Front-Wheel Drive | 4 | 1.6 | Regular | 28 | 23 |
| 4183 | Acura | Integra | 1988 | Subcompact Cars | Automatic 4-spd | Front-Wheel Drive | 4 | 1.6 | Regular | 27 | 22 |
| 4184 | Acura | Integra | 1988 | Subcompact Cars | Manual 5-spd | Front-Wheel Drive | 4 | 1.6 | Regular | 28 | 23 |
13.5 Q4 Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?
delay <- flights %>%
filter(dep_delay > 0, arr_delay > 0) %>%
group_by(month, day) %>%
summarise(count = n(),
mean_delay = mean(dep_delay+arr_delay, na.rm =T)) %>%
mutate(average_twoday = mean_delay + lag(mean_delay)) %>%
arrange(desc(average_twoday))
kable(head(delay))| month | day | count | mean_delay | average_twoday |
|---|---|---|---|---|
| 7 | 10 | 426 | 229.6526 | 378.5751 |
| 7 | 23 | 542 | 150.6070 | 368.5020 |
| 6 | 28 | 480 | 177.1875 | 360.0703 |
| 6 | 25 | 536 | 156.7705 | 359.0171 |
| 9 | 12 | 354 | 239.0876 | 351.7413 |
| 4 | 11 | 441 | 111.8231 | 338.3960 |
twoday_delay <- delay %>%
inner_join(weather, by = c("month", "day"))
twoday_delay %>%
ggplot() +
geom_point(mapping = aes(x = humid, y = average_twoday )) +
labs(y = 'Average Delay', x = 'Humidity')## Warning: Removed 856 rows containing missing values (geom_point).
When calculate for such wide range of time (48 hours) it is hard to see any patterns in weather because weather usually change in a smaller window of time (when it rains, it ususlly last a few hours).
13.5 Q5 What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you
| year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour | flight_ID |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 | 4 |
| 2013 | 1 | 1 | 615 | 615 | 0 | 1039 | 1100 | -21 | B6 | 709 | N794JB | JFK | SJU | 182 | 1598 | 6 | 15 | 2013-01-01 06:00:00 | 29 |
| 2013 | 1 | 1 | 628 | 630 | -2 | 1137 | 1140 | -3 | AA | 413 | N3BAAA | JFK | SJU | 192 | 1598 | 6 | 30 | 2013-01-01 06:00:00 | 37 |
| 2013 | 1 | 1 | 701 | 700 | 1 | 1123 | 1154 | -31 | UA | 1203 | N77296 | EWR | SJU | 188 | 1608 | 7 | 0 | 2013-01-01 07:00:00 | 69 |
| 2013 | 1 | 1 | 711 | 715 | -4 | 1151 | 1206 | -15 | B6 | 715 | N651JB | JFK | SJU | 190 | 1598 | 7 | 15 | 2013-01-01 07:00:00 | 72 |
| 2013 | 1 | 1 | 820 | 820 | 0 | 1254 | 1310 | -16 | B6 | 717 | N527JB | JFK | SJU | 190 | 1598 | 8 | 20 | 2013-01-01 08:00:00 | 126 |
Drops all observations in flights that have a match in airport.
| faa | name | lat | lon | alt | tz | dst | tzone |
|---|---|---|---|---|---|---|---|
| 04G | Lansdowne Airport | 41.13047 | -80.61958 | 1044 | -5 | A | America/New_York |
| 06A | Moton Field Municipal Airport | 32.46057 | -85.68003 | 264 | -6 | A | America/Chicago |
| 06C | Schaumburg Regional | 41.98934 | -88.10124 | 801 | -6 | A | America/Chicago |
| 06N | Randall Airport | 41.43191 | -74.39156 | 523 | -5 | A | America/New_York |
| 09J | Jekyll Island Airport | 31.07447 | -81.42778 | 11 | -5 | A | America/New_York |
| 0A9 | Elizabethton Municipal Airport | 36.37122 | -82.17342 | 1593 | -5 | A | America/New_York |
Drops all observations in airport that have a match in flights.
13.5 Q6 You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.
flights %>%
select("tailnum", "carrier") %>%
inner_join(planes, by = c("tailnum")) %>%
ggplot(aes(x = year, y = carrier)) +
geom_point(aes(alpha = 1/10))## Warning: Removed 5306 rows containing missing values (geom_point).
Hypothesis 1: Envoy(MQ) American Airline (AA) uses many old planes (alpha = 0.05)
planesVScarrier <- flights %>%
select("tailnum", "carrier") %>%
inner_join(planes, by = c("tailnum")) %>%
group_by(carrier) %>%
summarise(
count = n(),
average_year = mean(2019-year, na.rm = T)) %>%
arrange(desc(average_year))| carrier | count | average_year |
|---|---|---|
| MQ | 1000 | 41.31900 |
| AA | 10171 | 31.86943 |
| DL | 48000 | 22.37217 |
| UA | 56972 | 19.20769 |
| FL | 3073 | 17.38583 |
| EV | 54173 | 17.30900 |
flights %>% select("tailnum", "carrier") %>%
inner_join(planes, by = c("tailnum")) %>%
mutate( year_old = 2019 - year) %>%
filter(year_old < 45) %>%
ggplot(mapping = aes(x = year_old)) +
geom_histogram()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1] 18.51599
[1] 7.193425
[1] 2.193355
[1] 20.70935 16.32264
Confidential Interval for MQ is (20.7 , 16.3) Since MQ average plane year old (41.3 year old) is outside of confidential interval. I accpepted Hypothesis and concluded that MQ and AA use old planes.