NYC_Flights23

Author

Efren A Martinez

## Load Library

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights23)
# Bring the 2023 NYC flights table into the workspace and preview its
# first rows.
data("flights")
flights |>
  head()
# A tibble: 6 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2023     1     1        1           2038       203      328              3
2  2023     1     1       18           2300        78      228            135
3  2023     1     1       31           2344        47      500            426
4  2023     1     1       33           2140       173      238           2352
5  2023     1     1       36           2048       228      223           2252
6  2023     1     1      503            500         3      808            815
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>
# Show the carrier-code -> airline-name lookup table used for the join below.
print(airlines)
# A tibble: 14 × 2
   carrier name                  
   <chr>   <chr>                 
 1 9E      Endeavor Air Inc.     
 2 AA      American Airlines Inc.
 3 AS      Alaska Airlines Inc.  
 4 B6      JetBlue Airways       
 5 DL      Delta Air Lines Inc.  
 6 F9      Frontier Airlines Inc.
 7 G4      Allegiant Air         
 8 HA      Hawaiian Airlines Inc.
 9 MQ      Envoy Air             
10 NK      Spirit Air Lines      
11 OO      SkyWest Airlines Inc. 
12 UA      United Air Lines Inc. 
13 WN      Southwest Airlines Co.
14 YX      Republic Airline      
# Attach the airline lookup to every flight and overwrite the two-letter
# carrier code with the full airline name. The joined `name` column is kept.
flights_full_airline_name <-
  left_join(flights, airlines, by = "carrier") |>
  mutate(carrier = name)

flights_full_airline_name |>
  head()
# A tibble: 6 × 20
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2023     1     1        1           2038       203      328              3
2  2023     1     1       18           2300        78      228            135
3  2023     1     1       31           2344        47      500            426
4  2023     1     1       33           2140       173      238           2352
5  2023     1     1       36           2048       228      223           2252
6  2023     1     1      503            500         3      808            815
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>
# Add a combined delay (departure + arrival, in minutes) to every flight,
# split it into hours/minutes, and build a human-readable label.
#
# Why trunc() instead of floor()/%%: for a negative delay such as -90 minutes,
# floor(-90 / 60) is -2 and -90 %% 60 is 30, which would read as
# "-2 hours 30 minutes". Truncating toward zero keeps both parts on the same
# side of zero, so 60 * total_delay_hours + total_delay_minutes always equals
# total_delay.
flights_with_friendly_delay <- flights_full_airline_name |>
  mutate(
    total_delay = dep_delay + arr_delay,
    # Whole hours, rounded toward zero (carries the sign of total_delay).
    total_delay_hours = trunc(total_delay / 60),
    # Leftover minutes with the same sign as total_delay.
    total_delay_minutes = total_delay - 60 * total_delay_hours,
    # Label shows a single leading "-" for early flights and the magnitudes
    # of the two parts; flights with a missing delay get NA instead of the
    # text "NA hours NA minutes".
    total_delay_formatted = if_else(
      is.na(total_delay),
      NA_character_,
      sprintf(
        "%s%02d hours %02d minutes",
        if_else(total_delay < 0, "-", ""),
        abs(total_delay_hours),
        abs(total_delay_minutes)
      )
    )
  )

head(flights_with_friendly_delay)
# A tibble: 6 × 24
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2023     1     1        1           2038       203      328              3
2  2023     1     1       18           2300        78      228            135
3  2023     1     1       31           2344        47      500            426
4  2023     1     1       33           2140       173      238           2352
5  2023     1     1       36           2048       228      223           2252
6  2023     1     1      503            500         3      808            815
# ℹ 16 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, total_delay <dbl>,
#   total_delay_hours <dbl>, total_delay_minutes <dbl>,
#   total_delay_formatted <chr>
# Ten flights with the largest combined delay. The Speed tag is attached up
# front so the rows stay labelled if they are combined with other sets later.
delay_worst <- flights_with_friendly_delay |>
  mutate(Speed = "Slowest") |>
  arrange(desc(total_delay)) |>
  head(n = 10)

print(delay_worst)
# A tibble: 10 × 25
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2023    12    17     1953           1340      1813     2155           1543
 2  2023    10     1     1240            659      1781     1407            835
 3  2023     4    25     1201            659      1742     1315            818
 4  2023     2     7     2045           1700      1665     2352           2025
 5  2023     4    20      926            619      1627     1135            822
 6  2023    10    29      856            600      1616     1050            805
 7  2023     3    17     2027           1830      1557     2346           2139
 8  2023     4    30     1818           1617      1561     2001           1820
 9  2023     3    29      633            525      1508      820            700
10  2023     7    14     1533           1600      1413     1935           1849
# ℹ 17 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, total_delay <dbl>,
#   total_delay_hours <dbl>, total_delay_minutes <dbl>,
#   total_delay_formatted <chr>, Speed <chr>
# Ten flights with the smallest (most negative) combined delay, tagged
# "Fastest" as the counterpart of the "Slowest" set.
delay_best <- flights_with_friendly_delay |>
  mutate(Speed = "Fastest") |>
  arrange(total_delay) |>
  head(n = 10)

print(delay_best)
# A tibble: 10 × 25
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2023    12    25      838            845        -7     1054           1231
 2  2023    12    26     1659           1710       -11     1941           2112
 3  2023     4    12     1144           1200       -16     1358           1523
 4  2023    12     2     2147           2225       -38     2332             35
 5  2023     4    11      907            915        -8     1118           1250
 6  2023     9     3     1801           1834       -33     1934           2040
 7  2023    12    26     1956           1959        -3     2208           2344
 8  2023     4    11     1643           1649        -6     1846           2015
 9  2023    12    26      853            900        -7     1110           1238
10  2023     6     7     1547           1548        -1     1823           1955
# ℹ 17 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>, total_delay <dbl>,
#   total_delay_hours <dbl>, total_delay_minutes <dbl>,
#   total_delay_formatted <chr>, Speed <chr>
library(RColorBrewer)
# Dot plot of the ten most-delayed flights: destination on x, combined delay
# on y, coloured by airline.
#
# total_delay_formatted is text, so ggplot would otherwise order the y axis
# alphabetically; fct_reorder() orders the labels by the numeric delay
# instead. No regression layer is drawn: both axes are categorical, so a
# linear fit would only relate arbitrary category positions.
p1 <- delay_worst |>
  ggplot(aes(
    x = dest,
    y = fct_reorder(total_delay_formatted, total_delay),
    fill = carrier
  )) +
  geom_point(size = 10, shape = 21, stroke = 1) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    x = "Destination",
    y = "Total Delay",
    title = "Top 10 Worst Delayed Flights of 2023 from NYC - Where and Who?",
    caption = "New York Flights Data 2023"
  )
p1
`geom_smooth()` using formula = 'y ~ x'

# Dot plot of the ten flights with the smallest combined delay: destination
# on x, combined delay on y, coloured by airline.
#
# The delay labels are text and these values are negative, so alphabetical
# axis ordering does not follow the numeric order; fct_reorder() sorts the
# labels by the underlying minutes. No regression layer is drawn: both axes
# are categorical, so a linear fit would only relate arbitrary category
# positions.
p2 <- delay_best |>
  ggplot(aes(
    x = dest,
    y = fct_reorder(total_delay_formatted, total_delay),
    fill = carrier
  )) +
  geom_point(size = 10, shape = 21, stroke = 1) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    x = "Destination",
    y = "Total Delay",
    title = "Top 10 Fastest Flights of 2023 from NYC - Where and Who?",
    caption = "New York Flights Data 2023"
  )
p2
`geom_smooth()` using formula = 'y ~ x'

# Per-airline summary: number of flights, mean combined delay (minutes), and
# the airline's total combined delay expressed as hours + leftover minutes.
#
# The hour/minute totals are derived from the summed total_delay rather than
# by summing the per-flight hour and minute columns separately; summing those
# parts independently yields pairs such as negative hours next to millions of
# "minutes" that no longer describe one duration. Here
# 60 * total_delay_hours + total_delay_minutes equals the summed delay.
# Flights with a missing delay are skipped in the delay figures but still
# counted in total_flights.
grouped_carriers <- flights_with_friendly_delay |>
  group_by(carrier) |>
  summarise(
    total_flights = n(),
    avg_total_delay = mean(total_delay, na.rm = TRUE),
    # Whole hours of the summed delay, truncated toward zero.
    total_delay_hours = trunc(sum(total_delay, na.rm = TRUE) / 60),
    # Remaining minutes (same sign as the summed delay); total_delay_hours
    # here is the per-airline value computed on the line above.
    total_delay_minutes =
      sum(total_delay, na.rm = TRUE) - 60 * total_delay_hours
  )

# View the summarized data
head(grouped_carriers)
# A tibble: 6 × 5
  carrier    total_flights avg_total_delay total_delay_hours total_delay_minutes
  <chr>              <int>           <dbl>             <dbl>               <dbl>
1 Alaska Ai…          7843           11.9              -2290              229094
2 Allegiant…           671           -1.90              -370               20930
3 American …         40525           19.3              -8213             1258615
4 Delta Air…         61562           16.6             -15047             1906129
5 Endeavor …         54141            5.15            -23514             1679844
6 Envoy Air            357           10.6               -131               11604
# Five airlines with the highest average combined delay, tagged "Slowest".
delay_worst_group <- grouped_carriers |>
  mutate(Speed = "Slowest") |>
  arrange(desc(avg_total_delay)) |>
  head(n = 5)

print(delay_worst_group)
# A tibble: 5 × 6
  carrier    total_flights avg_total_delay total_delay_hours total_delay_minutes
  <chr>              <int>           <dbl>             <dbl>               <dbl>
1 Frontier …          1286            62.0               645               36798
2 Hawaiian …           366            44.4                85               10982
3 JetBlue A…         66169            39.2              8579             2006605
4 SkyWest A…          6432            33.5                51              204514
5 Spirit Ai…         15189            28.0              -918              469207
# ℹ 1 more variable: Speed <chr>
# Five airlines with the lowest average combined delay, tagged "Fastest".
delay_best_group <- grouped_carriers |>
  mutate(Speed = "Fastest") |>
  arrange(avg_total_delay) |>
  head(n = 5)

print(delay_best_group)
# A tibble: 5 × 6
  carrier    total_flights avg_total_delay total_delay_hours total_delay_minutes
  <chr>              <int>           <dbl>             <dbl>               <dbl>
1 Allegiant…           671          -1.90               -370               20930
2 Republic …         88785          -0.528            -46870             2767123
3 Endeavor …         54141           5.15             -23514             1679844
4 Envoy Air            357          10.6                -131               11604
5 Alaska Ai…          7843          11.9               -2290              229094
# ℹ 1 more variable: Speed <chr>
# Stack the five best and five worst airlines, sort them by average delay,
# and freeze that order into the carrier factor so legends follow delay rank
# rather than alphabetical order.
top_bottom_avg_delay <- rbind(delay_best_group, delay_worst_group) |>
  arrange(avg_total_delay) |>
  mutate(carrier = factor(carrier, levels = carrier))
# Scatter of average delay against flight volume for the five best and five
# worst airlines, with one linear trend line across all ten.
#
# fill = carrier is mapped on the point layer only. When it sits in the
# plot-wide aes(), geom_smooth() inherits it as a grouping variable and tries
# to fit a separate line per airline, each from a single point, so no trend
# is drawn.
p3 <- top_bottom_avg_delay |>
  ggplot(aes(x = avg_total_delay, y = total_flights)) +
  geom_point(aes(fill = carrier), size = 8, shape = 21, stroke = 1) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    x = "Average Delay in Minutes",
    y = "Total Flights",
    title = "Top 5 and Bottom 5 Average Delayed Airlines from NYC",
    caption = "New York Flights Data 2023"
  )
p3
`geom_smooth()` using formula = 'y ~ x'

What do we see?

These graphs show the airlines with the lowest and highest average delays for flights from NYC. Averaged over the 2023 year, this information can help customers and airlines see how each carrier stands against the competition. While this data doesn’t guarantee you won’t have a delay on Allegiant Air, it helps you not be surprised if you have an hour delay on your Frontier flight. What might be most impressive is each airline’s average timeliness relative to the number of flights it operates. Republic might have more delays than Allegiant; however, it has more flights than any other airline, which almost makes it seem that it could be the more reliable of the two. On the other hand, Frontier, with so few flights, still has the worst average delay time among its competitors and might want to investigate why that is in order to improve its business.