nycflights

Author

Djeneba Kounta

## Load data

#install.packages("nycflights23")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)
# Load the flights and airlines datasets into the workspace by name.
data("flights")
data('airlines')
# Keep only the flights that have distance, arrival delay, and departure delay
# all recorded; a row missing any one of the three is dropped.
flights_nona <- filter(
  flights,
  !is.na(distance),
  !is.na(arr_delay),
  !is.na(dep_delay)
)
# One row per destination, built from the NA-free flights:
#   count         - number of flights to that destination
#   avg_dist      - mean distance flown
#   avg_arr_delay - mean arrival delay
#   avg_dep_delay - mean departure delay
# Destinations whose mean distance is 3000 or more are excluded, and the
# remaining rows are sorted from the smallest mean arrival delay upwards.
by_dest <- flights_nona |>
  group_by(dest) |>
  summarise(
    count = n(),
    avg_dist = mean(distance),
    avg_arr_delay = mean(arr_delay),
    avg_dep_delay = mean(dep_delay),
    .groups = "drop"  # return an ungrouped tibble
  ) |>
  filter(avg_dist < 3000) |>
  arrange(avg_arr_delay)
by_dest |> head()
# A tibble: 6 × 5
  dest  count avg_dist avg_arr_delay avg_dep_delay
  <chr> <int>    <dbl>         <dbl>         <dbl>
1 PNS      71    1030         -10.6         -1.24 
2 HHH     461     695.         -9.95         1.38 
3 HDN      27    1728          -9.93         8.78 
4 VPS     107     988          -9.41         2.62 
5 AVP     140      93          -8.53        -0.957
6 GSO    2857     456.         -7.77         3.81 

## Code from AI to find the top 10 destinations by average arrival delay

# Re-sort the destination summary so the largest mean arrival delay comes
# first, then keep the first ten rows.
top10 <- slice_head(arrange(by_dest, desc(avg_arr_delay)), n = 10)
top10 |> head()
# A tibble: 6 × 5
  dest  count avg_dist avg_arr_delay avg_dep_delay
  <chr> <int>    <dbl>         <dbl>         <dbl>
1 PSE     319    1617           37.6          44.2
2 RNO     129    2410.          34.4          47.0
3 ABQ     218    1825.          26.7          41.2
4 ONT     353    2429           26.1          37.3
5 BQN     957    1579.          25.6          32.2
6 SJU    5312    1602.          21.0          28.9
# Horizontal bar chart of the ten destinations in `top10`, one bar per
# destination, bar length = mean arrival delay.
#
# - The y axis uses reorder(dest, avg_arr_delay) so bars are ranked by delay
#   instead of appearing in alphabetical order.
# - No `labels =` is passed to the fill scale: the scale orders its levels
#   alphabetically, so supplying top10$dest (which is in delay order) would
#   attach the wrong airport code to each legend colour. The default labels
#   are the destination codes themselves and always match.
# - geom_col() is the direct equivalent of geom_bar(stat = "identity").
# NOTE(review): the caption credits the FAA aircraft registry; confirm that is
# the intended source for the flights data shown here.
ggplot(
  top10,
  aes(x = avg_arr_delay, y = reorder(dest, avg_arr_delay), fill = dest)
) +
  geom_col() +
  scale_fill_discrete(name = "Destinations") +
  labs(
    title = "Top 10 Most Delayed Destinations from NYC",
    x = "Average Arrival Delay",
    y = "destination",
    caption = "FAA Aircraft registry"
  )