##Load Library
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach the nycflights23 data package and load its `flights` dataset
# into the workspace.
library(nycflights23)
data(flights)
# A tibble: 6 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
# A tibble: 14 × 2
carrier name
<chr> <chr>
1 9E Endeavor Air Inc.
2 AA American Airlines Inc.
3 AS Alaska Airlines Inc.
4 B6 JetBlue Airways
5 DL Delta Air Lines Inc.
6 F9 Frontier Airlines Inc.
7 G4 Allegiant Air
8 HA Hawaiian Airlines Inc.
9 MQ Envoy Air
10 NK Spirit Air Lines
11 OO SkyWest Airlines Inc.
12 UA United Air Lines Inc.
13 WN Southwest Airlines Co.
14 YX Republic Airline
# Look up each flight's airline in `airlines` (matched on the two-letter
# carrier code), then overwrite the code in `carrier` with the full airline
# name. The joined `name` column is kept alongside it.
flights_full_airline_name <- left_join(flights, airlines, by = "carrier")
flights_full_airline_name <- mutate(
  flights_full_airline_name,
  carrier = name
)

# Preview the first rows of the joined table.
head(flights_full_airline_name)
# A tibble: 6 × 20
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>
# Add per-flight delay columns:
#   total_delay           departure delay + arrival delay, in minutes
#   total_delay_hours     floor(total_delay / 60)
#   total_delay_minutes   total_delay %% 60 (always in [0, 60))
#   total_delay_formatted human-readable "HH hours MM minutes" string
#
# The numeric hour/minute parts follow floor/modulo arithmetic, so for a
# negative delay they do not read naturally (-90 minutes -> -2 hours and
# +30 minutes). The formatted string is therefore built from the absolute
# value with an explicit leading "-" (-90 minutes -> "-01 hours 30 minutes"),
# and is NA when the delay itself is missing rather than the literal text
# "NA hours NA minutes".
flights_with_friendly_delay <- flights_full_airline_name |>
  mutate(
    total_delay = dep_delay + arr_delay,
    total_delay_hours = floor(total_delay / 60),
    total_delay_minutes = total_delay %% 60,
    total_delay_formatted = ifelse(
      is.na(total_delay),
      NA_character_,
      sprintf(
        "%s%02d hours %02d minutes",
        ifelse(!is.na(total_delay) & total_delay < 0, "-", ""),
        as.integer(abs(total_delay) %/% 60),
        as.integer(abs(total_delay) %% 60)
      )
    )
  )

# Preview the first rows with the new delay columns.
head(flights_with_friendly_delay)
# A tibble: 6 × 24
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
# ℹ 16 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, total_delay <dbl>,
# total_delay_hours <dbl>, total_delay_minutes <dbl>,
# total_delay_formatted <chr>
# The ten flights with the largest combined delay. Rows are sorted from the
# highest total_delay downwards, the first ten are kept, and each is tagged
# with Speed = "Slowest" for use later.
delay_worst <- flights_with_friendly_delay |>
  arrange(desc(total_delay)) |>
  slice_head(n = 10) |>
  mutate(Speed = "Slowest")

delay_worst
# A tibble: 10 × 25
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 12 17 1953 1340 1813 2155 1543
2 2023 10 1 1240 659 1781 1407 835
3 2023 4 25 1201 659 1742 1315 818
4 2023 2 7 2045 1700 1665 2352 2025
5 2023 4 20 926 619 1627 1135 822
6 2023 10 29 856 600 1616 1050 805
7 2023 3 17 2027 1830 1557 2346 2139
8 2023 4 30 1818 1617 1561 2001 1820
9 2023 3 29 633 525 1508 820 700
10 2023 7 14 1533 1600 1413 1935 1849
# ℹ 17 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, total_delay <dbl>,
# total_delay_hours <dbl>, total_delay_minutes <dbl>,
# total_delay_formatted <chr>, Speed <chr>
# The ten flights with the smallest combined delay. Rows are sorted from the
# lowest total_delay upwards, the first ten are kept, and each is tagged with
# Speed = "Fastest".
delay_best <- flights_with_friendly_delay |>
  arrange(total_delay) |>
  slice_head(n = 10) |>
  mutate(Speed = "Fastest")

delay_best
# A tibble: 10 × 25
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 12 25 838 845 -7 1054 1231
2 2023 12 26 1659 1710 -11 1941 2112
3 2023 4 12 1144 1200 -16 1358 1523
4 2023 12 2 2147 2225 -38 2332 35
5 2023 4 11 907 915 -8 1118 1250
6 2023 9 3 1801 1834 -33 1934 2040
7 2023 12 26 1956 1959 -3 2208 2344
8 2023 4 11 1643 1649 -6 1846 2015
9 2023 12 26 853 900 -7 1110 1238
10 2023 6 7 1547 1548 -1 1823 1955
# ℹ 17 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, total_delay <dbl>,
# total_delay_hours <dbl>, total_delay_minutes <dbl>,
# total_delay_formatted <chr>, Speed <chr>
# Plot the ten most-delayed flights: destination on the x axis, combined
# delay on the y axis, point fill showing the airline.
#
# The y aesthetic uses the numeric `total_delay` (minutes) rather than the
# formatted text column: a character column becomes a categorical axis that
# is ordered alphabetically, so point heights would not reflect the size of
# the delay. No linear smoother is drawn because the x axis (destination) is
# categorical and a regression over category positions has no meaning.
p1 <- delay_worst |>
  ggplot(aes(x = dest, y = total_delay, fill = carrier)) +
  geom_point(size = 10, shape = 21, stroke = 1) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    x = "Destination",
    y = "Total Delay (minutes)",
    title = "Top 10 Worst Delayed Flights of 2023 from NYC - Where and Who?",
    caption = "New York Flights Data 2023"
  )
p1
`geom_smooth()` using formula = 'y ~ x'
# Plot the ten least-delayed flights: destination on the x axis, combined
# delay on the y axis, point fill showing the airline.
#
# The y aesthetic uses the numeric `total_delay` (minutes) rather than the
# formatted text column: a character column becomes a categorical axis that
# is ordered alphabetically, so point heights would not reflect the size of
# the delay. No linear smoother is drawn because the x axis (destination) is
# categorical and a regression over category positions has no meaning.
p2 <- delay_best |>
  ggplot(aes(x = dest, y = total_delay, fill = carrier)) +
  geom_point(size = 10, shape = 21, stroke = 1) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    x = "Destination",
    y = "Total Delay (minutes)",
    title = "Top 10 Fastest Flights of 2023 from NYC - Where and Who?",
    caption = "New York Flights Data 2023"
  )
p2
`geom_smooth()` using formula = 'y ~ x'
# Summarise delays per airline:
#   total_flights        number of flights operated
#   avg_total_delay      mean combined delay in minutes (missing values ignored)
#   total_delay_hours    whole-hour part of the airline's summed delay
#   total_delay_minutes  remaining minutes of the summed delay, in [0, 60)
#
# The hour/minute split is derived from the airline's summed `total_delay`
# instead of adding up each flight's hour part and minute part separately.
# Separate sums are not normalised and yield pairs such as a negative hour
# total next to hundreds of thousands of minutes. The split uses
# floor/modulo arithmetic, so hours * 60 + minutes equals the summed delay.
grouped_carriers <- flights_with_friendly_delay |>
  group_by(carrier) |>
  summarise(
    total_flights = n(),
    avg_total_delay = mean(total_delay, na.rm = TRUE),
    total_delay_hours = sum(total_delay, na.rm = TRUE) %/% 60,
    total_delay_minutes = sum(total_delay, na.rm = TRUE) %% 60
  )

# View the summarized data
head(grouped_carriers)
# A tibble: 6 × 5
carrier total_flights avg_total_delay total_delay_hours total_delay_minutes
<chr> <int> <dbl> <dbl> <dbl>
1 Alaska Ai… 7843 11.9 -2290 229094
2 Allegiant… 671 -1.90 -370 20930
3 American … 40525 19.3 -8213 1258615
4 Delta Air… 61562 16.6 -15047 1906129
5 Endeavor … 54141 5.15 -23514 1679844
6 Envoy Air 357 10.6 -131 11604
# The five airlines with the highest average combined delay, tagged with
# Speed = "Slowest".
delay_worst_group <- grouped_carriers |>
  arrange(desc(avg_total_delay)) |>
  slice_head(n = 5) |>
  mutate(Speed = "Slowest")

delay_worst_group
# A tibble: 5 × 6
carrier total_flights avg_total_delay total_delay_hours total_delay_minutes
<chr> <int> <dbl> <dbl> <dbl>
1 Frontier … 1286 62.0 645 36798
2 Hawaiian … 366 44.4 85 10982
3 JetBlue A… 66169 39.2 8579 2006605
4 SkyWest A… 6432 33.5 51 204514
5 Spirit Ai… 15189 28.0 -918 469207
# ℹ 1 more variable: Speed <chr>
# The five airlines with the lowest average combined delay, tagged with
# Speed = "Fastest".
delay_best_group <- grouped_carriers |>
  arrange(avg_total_delay) |>
  slice_head(n = 5) |>
  mutate(Speed = "Fastest")

delay_best_group
# A tibble: 5 × 6
carrier total_flights avg_total_delay total_delay_hours total_delay_minutes
<chr> <int> <dbl> <dbl> <dbl>
1 Allegiant… 671 -1.90 -370 20930
2 Republic … 88785 -0.528 -46870 2767123
3 Endeavor … 54141 5.15 -23514 1679844
4 Envoy Air 357 10.6 -131 11604
5 Alaska Ai… 7843 11.9 -2290 229094
# ℹ 1 more variable: Speed <chr>
# Stack the five best and five worst airlines into one table, sort it by
# average delay, and turn `carrier` into a factor whose levels follow that
# sorted order so the plot legend lists airlines from least to most delayed.
top_bottom_avg_delay <- bind_rows(delay_best_group, delay_worst_group) |>
  arrange(avg_total_delay) |>
  mutate(carrier = factor(carrier, levels = carrier))
# Plot average delay against number of flights for the five best and five
# worst airlines, with one filled point per airline and a single linear
# trend line across all of them.
#
# `fill = carrier` is mapped only on the point layer. Mapping it in the
# global aes would also split geom_smooth() into one group per airline;
# each group would hold a single point, so no regression line could be
# fitted or drawn.
p3 <- top_bottom_avg_delay |>
  ggplot(aes(x = avg_total_delay, y = total_flights)) +
  geom_point(aes(fill = carrier), size = 8, shape = 21, stroke = 1) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    x = "Average Delay in Minutes",
    y = "Total Flights",
    title = "Top 5 and Bottom 5 Average Delayed Airlines from NYC",
    caption = "New York Flights Data 2023"
  )
p3
`geom_smooth()` using formula = 'y ~ x'
What do we see?
In this graph we see both the lowest and the highest average delays for flights from NYC. Averaged over the whole of 2023, this information can help customers and airlines see how each carrier stands against the competition. While this data doesn’t guarantee you won’t have a delay on Allegiant Air, it helps you not be surprised if you have an hour delay on your Frontier flight. What might be most impressive is each airline’s average timeliness relative to the number of flights it operates. Republic might have a slightly higher average delay than Allegiant; however, it has more flights than any other airline shown, which arguably makes it the more reliable of the two. On the other hand, Frontier, with so few flights, still has the worst average delay among its competitors and might want to investigate why that is in order to improve its business.