Load the packages
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (nycflights23)
library (dplyr)
library (ggplot2)
library (ggalluvial)
Data sets in the package “nycflights23”
# List the data sets that ship with the nycflights23 package.
data(package = "nycflights23")
# A tibble: 435,352 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
7 2023 1 1 520 510 10 948 949
8 2023 1 1 524 530 -6 645 710
9 2023 1 1 537 520 17 926 818
10 2023 1 1 547 545 2 845 852
# ℹ 435,342 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Convert Airline and Airport Codes
# Attach the airline name to every flight by matching on the carrier code.
# A left join keeps every row of `flights` and adds the `name` column from
# `airlines`.
flights1 <- left_join(flights, airlines, by = join_by(carrier))
head(flights1)
# A tibble: 6 × 20
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>
# Attach origin-airport details by matching `origin` against the `faa` code
# in `airports`. Both inputs carry a `name` column, so the result holds
# `name.x` (airline name, from flights1) and `name.y` (airport name).
flights2 <- left_join(flights1, airports, by = join_by(origin == faa))
flights2
# A tibble: 435,352 × 27
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
7 2023 1 1 520 510 10 948 949
8 2023 1 1 524 530 -6 645 710
9 2023 1 1 537 520 17 926 818
10 2023 1 1 547 545 2 845 852
# ℹ 435,342 more rows
# ℹ 19 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name.x <chr>, name.y <chr>,
# lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
[1] "year" "month" "day" "dep_time"
[5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
[9] "arr_delay" "carrier" "flight" "tailnum"
[13] "origin" "dest" "air_time" "distance"
[17] "hour" "minute" "time_hour" "name.x"
[21] "name.y" "lat" "lon" "alt"
[25] "tz" "dst" "tzone"
Dep_delay per month
library(dplyr) # already attached above; kept so this chunk runs on its own

# Keep only the columns used in the delay analysis, then drop every row that
# has a missing value in any of them.
flights2 <- na.omit(
  select(
    flights2,
    dep_delay, dep_time, sched_dep_time, carrier, month, name.x, origin, name.y
  )
)
flights2
# A tibble: 424,614 × 8
dep_delay dep_time sched_dep_time carrier month name.x origin name.y
<dbl> <int> <int> <chr> <int> <chr> <chr> <chr>
1 203 1 2038 UA 1 United Air Lin… EWR Newar…
2 78 18 2300 DL 1 Delta Air Line… JFK John …
3 47 31 2344 B6 1 JetBlue Airways JFK John …
4 173 33 2140 B6 1 JetBlue Airways JFK John …
5 228 36 2048 UA 1 United Air Lin… EWR Newar…
6 3 503 500 AA 1 American Airli… EWR Newar…
7 10 520 510 B6 1 JetBlue Airways JFK John …
8 -6 524 530 AA 1 American Airli… EWR Newar…
9 17 537 520 UA 1 United Air Lin… EWR Newar…
10 2 547 545 NK 1 Spirit Air Lin… EWR Newar…
# ℹ 424,604 more rows
# Mean departure delay for each month (one row per month).
# group_by() is kept rather than `.by` so the rows stay sorted by month.
monthly_delays <- summarise(
  group_by(flights2, month),
  avg_dep_delay = mean(dep_delay, na.rm = TRUE)
)
monthly_delays
# A tibble: 12 × 2
month avg_dep_delay
<int> <dbl>
1 1 14.0
2 2 11.0
3 3 13.0
4 4 17.7
5 5 8.39
6 6 24.4
7 7 30.5
8 8 13.5
9 9 17.3
10 10 5.28
11 11 4.40
12 12 8.33
Visualization plot
# Bar chart of the average departure delay for each month.
# The factor levels are pinned to 1:12 so every month number maps to its own
# abbreviation. Without explicit levels, factor(month, labels = month.abb)
# fails as soon as fewer than 12 distinct months are present in the data.
ggplot(
  monthly_delays,
  aes(
    x = factor(month, levels = 1:12, labels = month.abb),
    y = avg_dep_delay
  )
) +
  # geom_col() plots the y values as given (same as stat = "identity").
  geom_col(fill = "#EBA0A1") +
  labs(
    title = "Average Departure Delay by Month",
    x = "Month",
    y = "Average Departure Delay (minutes)",
    caption = "Data Source: NYC Flights 2023 Dataset"
  ) +
  theme_minimal()
Make a plot of carrier Vs avg_dep_delay
# Average departure delay per carrier, bucketed into a delay category.
# case_when() uses the first matching branch, so the second branch is only
# reached by averages above zero.
carrier_delays <- flights %>%
  group_by(carrier) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  mutate(
    delay_status = case_when(
      avg_dep_delay <= 0 ~ "On Time",
      avg_dep_delay <= 15 ~ "Slightly Delayed",
      avg_dep_delay > 15 ~ "Heavily Delayed"
    )
  )
carrier_delays
# A tibble: 14 × 3
carrier avg_dep_delay delay_status
<chr> <dbl> <chr>
1 9E 7.44 Slightly Delayed
2 AA 14.2 Slightly Delayed
3 AS 12.0 Slightly Delayed
4 B6 23.8 Heavily Delayed
5 DL 15.1 Heavily Delayed
6 F9 35.7 Heavily Delayed
7 G4 3.98 Slightly Delayed
8 HA 22.9 Heavily Delayed
9 MQ 10.5 Slightly Delayed
10 NK 18.2 Heavily Delayed
11 OO 19.8 Heavily Delayed
12 UA 17.6 Heavily Delayed
13 WN 16.1 Heavily Delayed
14 YX 4.21 Slightly Delayed
# Bar chart of each carrier's average departure delay, filled by delay
# category.
ggplot(
  carrier_delays,
  aes(x = carrier, y = avg_dep_delay, fill = delay_status)
) +
  geom_col() +
  scale_fill_manual(
    values = c(
      "On Time" = "#EBA0A1",
      "Slightly Delayed" = "#FFE700",
      "Heavily Delayed" = "#A7C6D9"
    )
  ) +
  labs(
    title = "Average Departure Delay by Carrier",
    x = "Carrier",
    y = "Average Departure Delay (minutes)",
    fill = "Delay Status",
    caption = "Source: NYC Flights 2023 Dataset"
  ) +
  theme_minimal()
# Number of carriers in each delay category, largest category first.
delay_summary <- count(
  carrier_delays,
  delay_status,
  name = "count",
  sort = TRUE
)
delay_summary
# A tibble: 2 × 2
delay_status count
<chr> <int>
1 Heavily Delayed 8
2 Slightly Delayed 6
# Alluvial diagram linking each carrier (left axis) to its delay category
# (right axis); each flow is coloured by the category it ends in.
ggplot(carrier_delays, aes(axis1 = carrier, axis2 = delay_status)) +
  geom_alluvium(aes(fill = delay_status)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  # Name the two axes so the x axis shows what each column of strata is.
  scale_x_discrete(
    limits = c("Carrier", "Delay Status"),
    expand = c(0.05, 0.05)
  ) +
  # Same palette as the carrier bar chart, so a delay category keeps one
  # colour throughout the report.
  scale_fill_manual(
    values = c(
      "On Time" = "#EBA0A1",
      "Slightly Delayed" = "#FFE700",
      "Heavily Delayed" = "#A7C6D9"
    )
  ) +
  labs(
    title = "Delay Status by Carrier",
    x = "Carrier and Delay Status",
    y = "Count",
    fill = "Delay Status",
    caption = "Source: NYC Flights 2023 Dataset"
  ) +
  theme_minimal()
The NYC Flights 2023 dataset reveals important insights into the departure performance of different carriers. It contains detailed information on more than 435,000 flights, including variables such as departure delays, carriers, and scheduled times. In my visualization, I considered key variables such as departure delay (dep_delay), departure time (dep_time), scheduled departure time (sched_dep_time), carrier, and month. I first created a bar plot of the average departure delay per month, which showed that July experienced the highest average delay (about 30.5 minutes). Following this, I examined the relationship between carriers and average departure delays. I compiled a summary table counting the carriers in each delay status: 8 of the 14 carriers were heavily delayed on average and 6 were slightly delayed. Frontier Airlines (F9) had the highest average departure delay (35.7 minutes), while American Airlines (AA) had the highest average delay among the slightly delayed carriers (14.2 minutes). The July peak suggests that increased travel during the summer vacation can lead to crowded airports and longer delays. This finding raises questions about the reasons for these delays, such as weather issues or operational challenges that airlines face during this busy month. The analysis also reveals variability in airline performance and highlights specific areas where carriers can improve their on-time performance. Overall, it shows seasonal patterns, such as more delays during busy travel months or around specific events like holidays or bad weather.