Does the dataset contain 336,776 records and 19 fields?
# Row and column counts of the flights table, in that order.
c(nrow(flights), ncol(flights))
## [1] 336776 19
Flight with the largest departure delay:
# Keep only the identifying columns, order by departure delay from largest to
# smallest, and return the single top row.
flights %>%
  select(carrier, flight, tailnum, dep_delay) %>%
  arrange(desc(dep_delay)) %>%
  slice_head(n = 1)
## # A tibble: 1 × 4
## carrier flight tailnum dep_delay
## <chr> <int> <chr> <dbl>
## 1 HA 51 N384HA 1301
Average departure delay per date (missing delays, e.g. from cancelled flights, are excluded from the mean):
# One row per calendar day with the mean departure delay; NA delays are
# removed before averaging (na.rm = TRUE).
flights %>%
  group_by(year, month, day) %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "keep"
  ) %>%
  ungroup()
## # A tibble: 365 × 4
## year month day mean_dep_delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
Tail number with the lowest average arrival delay:
# Average arrival delay per aircraft, considering only flights that have both
# a recorded arrival delay and a tail number; the lowest average comes first.
# n_flights shows how many flights each average is based on.
flights %>%
  filter(!is.na(arr_delay), !is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(
    avg_arr_delay = mean(arr_delay),
    n_flights = n(),
    .groups = "drop"
  ) %>%
  arrange(avg_arr_delay) %>%
  slice_head(n = 1)
## # A tibble: 1 × 3
## tailnum avg_arr_delay n_flights
## <chr> <dbl> <int>
## 1 N560AS -53 1
First and last departure time each day:
# Earliest and latest recorded dep_time for each day. Rows without a dep_time
# are dropped up front so min()/max() never see NA. Days are then ordered by
# their latest departure, largest value first.
flights %>%
  filter(!is.na(dep_time)) %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "keep"
  ) %>%
  ungroup() %>%
  arrange(desc(last))
## # A tibble: 365 × 5
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 2 7 27 2400
## 2 2013 2 11 1 2400
## 3 2013 3 15 11 2400
## 4 2013 3 22 37 2400
## 5 2013 3 25 13 2400
## 6 2013 4 2 9 2400
## 7 2013 4 4 14 2400
## 8 2013 4 20 7 2400
## 9 2013 5 21 110 2400
## 10 2013 6 17 2 2400
## # ℹ 355 more rows
Proportion of flights with departure delays greater than 60 minutes by month:
# Flag each flight whose departure delay exceeds 60 minutes (NA when the delay
# is missing), then take the monthly mean of that flag, ignoring the NAs, to
# get the proportion. Months are listed from highest to lowest proportion.
flights %>%
  mutate(over_60 = dep_delay > 60) %>%
  group_by(month) %>%
  summarise(
    prop_over_60 = mean(over_60, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(prop_over_60))
## # A tibble: 12 × 2
## month prop_over_60
## <int> <dbl>
## 1 7 0.134
## 2 6 0.128
## 3 12 0.0942
## 4 4 0.0916
## 5 3 0.0837
## 6 5 0.0818
## 7 8 0.0796
## 8 2 0.0698
## 9 1 0.0688
## 10 9 0.0490
## 11 10 0.0469
## 12 11 0.0402
Destinations with the most carriers:
# Reduce the table to unique (destination, carrier) pairs, then count the
# pairs per destination to get the number of distinct carriers serving it.
flights %>%
  distinct(dest, carrier) %>%
  count(dest, name = "n_carriers") %>%
  arrange(desc(n_carriers))
## # A tibble: 105 × 2
## dest n_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 95 more rows
Distance vs average arrival delay by destination (keeping only destinations with more than 20 flights and excluding HNL):
# Per-destination summary: number of flights, mean distance and mean arrival
# delay (NA values ignored in both means).
delay_by_dest <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Keep destinations with more than 20 flights ...
  filter(count > 20) %>%
  # ... and leave out HNL.
  filter(dest != "HNL")
delay_by_dest
## # A tibble: 96 × 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00
## 7 BDL 443 116 7.05
## 8 BGR 375 378 8.03
## 9 BHM 297 866. 16.9
## 10 BNA 6333 758. 11.8
## # ℹ 86 more rows
Plot the relationship between mean distance and mean arrival delay:
# Scatter of mean arrival delay against mean distance, one point per
# destination sized by its flight count, with a smoothed trend line drawn
# without a confidence band.
delay_by_dest %>%
  ggplot(mapping = aes(x = dist, y = delay)) +
  geom_point(mapping = aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'