pacman::p_load(tidyverse, nycflights13, maps, RColorBrewer, lubridate, knitr)
https://r4ds.had.co.nz/relational-data.html#exercises-30
delays <- filter(flights, arr_delay > 0) %>% # delays data frame of late arrivals
group_by (dest) %>% # group by month
summarize (delay = mean(arr_delay)) %>% # the mean arrival delay
arrange(delay)
# plot the delays on a US map
delays %>%
left_join(airports, c("dest" = "faa")) %>% # get the longitude and latitude for destination airports
ggplot(aes(lon, lat, color = delay, size = 1.5, alpha = 0.6) ) +
borders("state") +
geom_point(na.rm = TRUE) +
scale_color_gradient(low = "grey", high = "darkred") +
theme_light() +
coord_quickmap()
latlon <- select(airports, faa, lat, lon) # select the columns we want from airports
flights %>%
left_join(latlon, c("origin" = "faa")) %>% # join to get origin lat and lon
rename(lat_orig = lat, lon_orig = lon) %>% # rename the new columns for origin
left_join(latlon, c("origin" = "faa")) %>% # join to get destination lat & lon
rename(lat_dest = lat, lon_dest = lon) %>% # rename the new columns for destination
head(5) %>%
select(origin:lon_dest) %>%
kable()
| origin | dest | air_time | distance | hour | minute | time_hour | lat_orig | lon_orig | lat_dest | lon_dest |
|---|---|---|---|---|---|---|---|---|---|---|
| EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 | 40.69250 | -74.16867 | 40.69250 | -74.16867 |
| LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 | 40.77725 | -73.87261 | 40.77725 | -73.87261 |
| JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 | 40.63975 | -73.77893 | 40.63975 | -73.77893 |
| JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 | 40.63975 | -73.77893 | 40.63975 | -73.77893 |
| LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 | 40.77725 | -73.87261 | 40.77725 | -73.87261 |
There does not appear to be a relationship between the age of a plane and it’s delays, when looking at the entire data set. Graphs of the total delay = departure delay + arrival delay by year don’t show a linear relationship.
However looking at the graph in time segments reveals some correlations. From about 1997 to around 2004 there is a positive relationship between the age the plane and total delays. With older planes having fewer delays. And from 2004 to 2013 there is a negative relationship with younger planes having fewer total delays.
flights %>% na.omit() %>% # remove observations w/ NA
select(tailnum, dep_delay, arr_delay) %>%
mutate(tot_delay = dep_delay + arr_delay) %>% # sum up the total delay for each flight
filter(tot_delay > 0) %>% # use the records with a positive total delay
left_join(planes) %>% # join with planes to get year put in service
arrange(year) %>%
ggplot(aes(x=year, y=tot_delay)) + # plot a line graph
geom_smooth(na.rm = TRUE)
## Joining, by = "tailnum"
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Precipitation appears to be the largest factor in longer delays. Greater delays happen with precipitation around 1.18, dew point around 79, wind gust above 65, visibility below 2.5, pressure about 1000, wind speed about 33, humidity at 100, and/or wind direction around 125.
flights %>% na.omit() %>%
left_join(weather, by = c("origin", "time_hour")) %>%
select(arr_delay, dep_delay, temp:visib) %>%
mutate(tot_delay = arr_delay + dep_delay) %>%
arrange(desc(tot_delay)) %>%
filter(tot_delay > 0) %>%
pivot_longer(cols = temp:visib,
names_to = "wthr",
values_to = "amt" ) %>%
ggplot(aes(x = amt, y = tot_delay, color = wthr)) +
geom_smooth(na.rm = TRUE) +
facet_wrap(vars(wthr), scales = "free")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Storms in the mid-Atlantic caused cancellation of 900 flights and 7400 delays.
fl2 <- na.omit (flights) %>% # remove observations with NA values
filter(month == 6 & day == 13) %>%
mutate(tot_delay = arr_delay + dep_delay) %>% # create in-flight delay column
filter(tot_delay > 0) %>%
left_join(airports, c("dest" = "faa"))
fl2 %>% ggplot(aes(lon, lat, color = tot_delay, size = 2)) +
borders("state") +
geom_point(na.rm = TRUE) +
scale_color_gradient(low = "grey", high = "darkred") +
theme_light() +
coord_quickmap()
https://r4ds.had.co.nz/relational-data.html#exercises-31
Flights with a missing tailnum are flights that did not occur, they were cancelled. Flights that don’t have a matching record in planes are primarily flown by Envoy Air or American Airlines.
filter(flights, is.na(tailnum)) # find flights with missing tailnum
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 NA 1545 NA NA 1910
## 2 2013 1 2 NA 1601 NA NA 1735
## 3 2013 1 3 NA 857 NA NA 1209
## 4 2013 1 3 NA 645 NA NA 952
## 5 2013 1 4 NA 845 NA NA 1015
## 6 2013 1 4 NA 1830 NA NA 2044
## 7 2013 1 5 NA 840 NA NA 1001
## 8 2013 1 7 NA 820 NA NA 958
## 9 2013 1 8 NA 1645 NA NA 1838
## 10 2013 1 9 NA 755 NA NA 1012
## # ... with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
anti_join(flights, planes, by = "tailnum") %>% # find flights w/o matching record in planes
count(carrier) %>% # count by carrier
left_join(airlines) # find the airline name
## Joining, by = "carrier"
## # A tibble: 10 x 3
## carrier n name
## <chr> <int> <chr>
## 1 9E 1044 Endeavor Air Inc.
## 2 AA 22558 American Airlines Inc.
## 3 B6 830 JetBlue Airways
## 4 DL 110 Delta Air Lines Inc.
## 5 F9 50 Frontier Airlines Inc.
## 6 FL 187 AirTran Airways Corporation
## 7 MQ 25397 Envoy Air
## 8 UA 1693 United Air Lines Inc.
## 9 US 699 US Airways Inc.
## 10 WN 38 Southwest Airlines Co.
f100 <- flights %>%
count(tailnum) %>% # find the planes that have flown 100+ flights
filter(n >= 100)
flights %>% semi_join(f100) # filter flights for planes that have flown 100+ flights
## Joining, by = "tailnum"
## # A tibble: 230,902 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # ... with 230,892 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
vehicles contains 33.442 rows, common contains 347 rows
The semi_join of vehicles to common selects 14,531 rows from vehicles.
pacman::p_load(fueleconomy)
count(vehicles)
## # A tibble: 1 x 1
## n
## <int>
## 1 33442
count(common)
## # A tibble: 1 x 1
## n
## <int>
## 1 347
semi_join(vehicles, common) %>%
count()
## Joining, by = c("make", "model")
## # A tibble: 1 x 1
## n
## <int>
## 1 14531
The full year plot shows the greatest total delays in early July. Focusing into a few week period from late June to early July pinpoints the largest delays to around July 1. And the weather history does show heavy rain and fog on the morning of July 1.
fl2 <- na.omit (flights) %>% # remove observations with NA values
mutate(tot_delay = arr_delay + dep_delay) # create total delay column
fl2 %>% ggplot(aes(x=time_hour, y=tot_delay)) + # plot total delay for the entire year
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Focus in on a few weeks around peak total delays in the year
fl2 <- fl2 %>% filter(time_hour > "2013-06-23 00:30:00" & time_hour < "2013-07-04 23:59:59" )
fl2 %>% ggplot(aes(x=time_hour, y=tot_delay)) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# narrow down to the 48 hours around peak total delays in the year
fl2 <- fl2 %>% filter(time_hour > "2013-06-30 13:30:00" & time_hour < "2013-07-02 12:29:59" )
fl2 %>% ggplot(aes(x=time_hour, y=tot_delay)) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Looks like a storm, with fog perhaps, was coming in on the afternoon of June 30th. Visibility was low, pressure increasing, humidity was high and there was a sharp increase in rain during the morning of July 1
fl2 <- left_join(fl2,weather, by = c("origin", "time_hour"))
fl2$wind_gust <- replace_na(0)
fl2 %>% pivot_longer(cols = temp:visib,
names_to = "wthr",
values_to = "amt" ) %>%
ggplot(aes(x = time_hour, y = amt, color = wthr)) +
geom_smooth(na.rm = TRUE) +
facet_wrap(vars(wthr), scales = "free") +
theme(axis.text.x = element_text(angle = 65, vjust = .55))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 3)
There are 7,602 flights where the destination is not included in the airports dataset. There are 1,357 airports that are not destinations for a flight in the flights dataset.
anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # ... with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo~
## 2 06A Moton Field Municipal A~ 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo~
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo~
## 6 0A9 Elizabethton Municipal ~ 36.4 -82.2 1593 -5 A America/New_Yo~
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo~
## 8 0G7 Finger Lakes Regional A~ 42.9 -76.8 492 -5 A America/New_Yo~
## 9 0P2 Shoestring Aviation Air~ 39.8 -76.6 1000 -5 U America/New_Yo~
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An~
## # ... with 1,347 more rows
There are 17 planes that are each flown by two carriers, the hypothesis is rejected.
pf <- select(flights, tailnum, carrier) %>% # select desired columns, get unique rows
unique() %>%
right_join(planes, by = "tailnum") # join to planes
p1 <- pf %>%
count(tailnum, sort = TRUE) %>%
filter(n > 1) # find planes w/more than one carrier
p1 %>% left_join(pf) %>% # join to get the carrier codes
kable()
## Joining, by = "tailnum"
| tailnum | n | carrier | year | type | manufacturer | model | engines | seats | speed | engine |
|---|---|---|---|---|---|---|---|---|---|---|
| N146PQ | 2 | 9E | 2007 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N146PQ | 2 | EV | 2007 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N153PQ | 2 | 9E | 2007 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N153PQ | 2 | EV | 2007 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N176PQ | 2 | 9E | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N176PQ | 2 | EV | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N181PQ | 2 | 9E | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N181PQ | 2 | EV | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N197PQ | 2 | 9E | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N197PQ | 2 | EV | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N200PQ | 2 | 9E | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N200PQ | 2 | EV | 2008 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N228PQ | 2 | 9E | 2009 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N228PQ | 2 | EV | 2009 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N232PQ | 2 | 9E | 2009 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N232PQ | 2 | EV | 2009 | Fixed wing multi engine | BOMBARDIER INC | CL-600-2D24 | 2 | 95 | NA | Turbo-fan |
| N933AT | 2 | FL | 2000 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N933AT | 2 | DL | 2000 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N935AT | 2 | FL | 2000 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N935AT | 2 | DL | 2000 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N977AT | 2 | FL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N977AT | 2 | DL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N978AT | 2 | FL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N978AT | 2 | DL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N979AT | 2 | FL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N979AT | 2 | DL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N981AT | 2 | FL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N981AT | 2 | DL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N989AT | 2 | FL | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N989AT | 2 | DL | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N990AT | 2 | FL | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N990AT | 2 | DL | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N994AT | 2 | FL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
| N994AT | 2 | DL | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |