library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("nycflights13")
library("viridis")
## Warning: package 'viridis' was built under R version 4.0.5
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.0.5
flights_latlon <- flights %>%
inner_join(select(airports, origin = faa, origin_lat = lat, origin_lon = lon),
by = "origin"
) %>%
inner_join(select(airports, dest = faa, dest_lat = lat, dest_lon = lon),
by = "dest"
)
flights_latlon %>%
slice(1:100) %>%
ggplot(aes(
x = origin_lon, xend = dest_lon,
y = origin_lat, yend = dest_lat
)) +
borders("state") +
geom_segment(arrow = arrow(length = unit(0.1, "cm"))) +
coord_quickmap() +
labs(y = "Latitude", x = "Longitude")

flights %>%
arrange(year, month, day, sched_dep_time, carrier, flight) %>%
mutate(flight_id = row_number()) %>%
glimpse()
## Rows: 336,776
## Columns: 20
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time <int> 517, 533, 542, 544, 554, 559, 558, 559, 558, 558, 557, ~
## $ sched_dep_time <int> 515, 529, 540, 545, 558, 559, 600, 600, 600, 600, 600, ~
## $ dep_delay <dbl> 2, 4, 2, -1, -4, 0, -2, -1, -2, -2, -3, NA, 1, 0, -5, -~
## $ arr_time <int> 830, 850, 923, 1004, 740, 702, 753, 941, 849, 853, 838,~
## $ sched_arr_time <int> 819, 830, 850, 1022, 728, 706, 745, 910, 851, 856, 846,~
## $ arr_delay <dbl> 11, 20, 33, -18, 12, -4, 8, 31, -2, -3, -8, NA, -6, -7,~
## $ carrier <chr> "UA", "UA", "AA", "B6", "UA", "B6", "AA", "AA", "B6", "~
## $ flight <int> 1545, 1714, 1141, 725, 1696, 1806, 301, 707, 49, 71, 79~
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N39463", "N708~
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "EWR", "JFK", "LGA", "LGA",~
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ORD", "BOS", "ORD", "DFW",~
## $ air_time <dbl> 227, 227, 160, 183, 150, 44, 138, 257, 149, 158, 140, N~
## $ distance <dbl> 1400, 1416, 1089, 1576, 719, 187, 733, 1389, 1028, 1005~
## $ hour <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6~
## $ minute <dbl> 15, 29, 40, 45, 58, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0~
## $ flight_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ~
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
coord_quickmap()

avg_dest_delays <-
flights %>%
group_by(dest) %>%
# arrival delay NA's are cancelled flights
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c(dest = "faa"))
avg_dest_delays %>%
ggplot(aes(lon, lat, colour = delay)) +
borders("state") +
geom_point() +
coord_quickmap()

flights %>%
distinct(carrier, tailnum) %>%
left_join(planes, by = "tailnum") %>%
group_by(carrier) %>%
summarise(total_planes = n(),
not_in_planes = sum(is.na(model))) %>%
mutate(missing_pct = not_in_planes / total_planes) %>%
arrange(desc(missing_pct))
## # A tibble: 16 x 4
## carrier total_planes not_in_planes missing_pct
## <chr> <int> <int> <dbl>
## 1 MQ 238 234 0.983
## 2 AA 601 430 0.715
## 3 F9 26 3 0.115
## 4 FL 129 12 0.0930
## 5 UA 621 23 0.0370
## 6 US 290 9 0.0310
## 7 DL 629 10 0.0159
## 8 B6 193 3 0.0155
## 9 WN 583 3 0.00515
## 10 9E 204 1 0.00490
## 11 AS 84 0 0
## 12 EV 316 0 0
## 13 HA 14 0 0
## 14 OO 28 0 0
## 15 VX 53 0 0
## 16 YV 58 0 0
flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
mutate(n = n()) %>%
filter(n >= 100)
## # A tibble: 228,390 x 20
## # Groups: tailnum [1,217]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # ... with 228,380 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # n <int>
worst_hours <- flights %>%
mutate(hour = sched_dep_time %/% 100) %>%
group_by(origin, year, month, day, hour) %>%
summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(dep_delay)) %>%
slice(1:48)
## `summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can override using the `.groups` argument.