Loading the libraries and the data

The data can be downloaded from github.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)

# Read the CSV file, specifying missing values
file_path <- "C:\\ProgramData\\MySQL\\MySQL Server 8.0\\Uploads\\airlines.csv"
airlines_data <- read_csv(file_path, na = c("-1"))
## Rows: 4 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Airline, Status
## dbl (5): Los Angeles, Phoenix, San Diego, San Francisco, Seattle
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(airlines_data, caption = "Complete Airlines Dataset")
Complete Airlines Dataset
Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
ALASKA on time 497 221 212 503 1841
ALASKA delayed 62 12 20 102 305
AM WEST on time 694 4840 383 320 201
AM WEST delayed 117 415 65 129 61

From wide data form to long form

airlines_long <- airlines_data %>%
  pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
               names_to = "Destination",
               values_to = "Result")

head (airlines_long)
## # A tibble: 6 × 4
##   Airline Status  Destination   Result
##   <chr>   <chr>   <chr>          <dbl>
## 1 ALASKA  on time Los Angeles      497
## 2 ALASKA  on time Phoenix          221
## 3 ALASKA  on time San Diego        212
## 4 ALASKA  on time San Francisco    503
## 5 ALASKA  on time Seattle         1841
## 6 ALASKA  delayed Los Angeles       62

Total Flights by Airlines

# Summarize total flights by airline
Airline_summary <- airlines_long %>%
  group_by(Airline) %>%
  summarise(total_flights = sum(Result))

kable(Airline_summary, caption = "Total Flights by Airline")
Total Flights by Airline
Airline total_flights
ALASKA 3775
AM WEST 7225
# Summarize total flights for each destination by airline
destination_summary <- airlines_long %>%
  group_by(Destination, Airline) %>%
  summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
kable(destination_summary, caption = "Total Flights by Destination")
Total Flights by Destination
Destination Airline total_flights
Los Angeles ALASKA 559
Los Angeles AM WEST 811
Phoenix ALASKA 233
Phoenix AM WEST 5255
San Diego ALASKA 232
San Diego AM WEST 448
San Francisco ALASKA 605
San Francisco AM WEST 449
Seattle ALASKA 2146
Seattle AM WEST 262
# Grouped bar chart to show total flights for each destination by airline
ggplot(data = destination_summary, aes(x = Destination, y = total_flights, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Total Flights to Each Destination by Airline",
       x = "Destination",
       y = "Total Number of Flights",
       fill = "Airline") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("ALASKA" = "#0077C8", "AM WEST" = "#C8102E"))

The graph shows that AM WEST has significantly more flights to Phoenix compared to ALASKA and also has a higher number of flights to Los Angeles and San Diego. On the other hand, ALASKA has the most flights to Seattle, and relatively higher flights to San Francisco.

Flight Delays

airlines_long <- airlines_long %>%
  group_by(Destination, Airline) %>%
  mutate(percentage = ifelse(Status == "delayed", (Result / sum(Result)) * 100, NA))

airlines_long <- airlines_long %>%
  group_by(Destination, Airline) %>%
  mutate(label_y = ifelse(Status == "delayed", 
                          lag(cumsum(Result), default = 0) + (Result / 2), 
                          NA))

print(airlines_long)
## # A tibble: 20 × 6
## # Groups:   Destination, Airline [10]
##    Airline Status  Destination   Result percentage label_y
##    <chr>   <chr>   <chr>          <dbl>      <dbl>   <dbl>
##  1 ALASKA  on time Los Angeles      497      NA        NA 
##  2 ALASKA  on time Phoenix          221      NA        NA 
##  3 ALASKA  on time San Diego        212      NA        NA 
##  4 ALASKA  on time San Francisco    503      NA        NA 
##  5 ALASKA  on time Seattle         1841      NA        NA 
##  6 ALASKA  delayed Los Angeles       62      11.1     528 
##  7 ALASKA  delayed Phoenix           12       5.15    227 
##  8 ALASKA  delayed San Diego         20       8.62    222 
##  9 ALASKA  delayed San Francisco    102      16.9     554 
## 10 ALASKA  delayed Seattle          305      14.2    1994.
## 11 AM WEST on time Los Angeles      694      NA        NA 
## 12 AM WEST on time Phoenix         4840      NA        NA 
## 13 AM WEST on time San Diego        383      NA        NA 
## 14 AM WEST on time San Francisco    320      NA        NA 
## 15 AM WEST on time Seattle          201      NA        NA 
## 16 AM WEST delayed Los Angeles      117      14.4     752.
## 17 AM WEST delayed Phoenix          415       7.90   5048.
## 18 AM WEST delayed San Diego         65      14.5     416.
## 19 AM WEST delayed San Francisco    129      28.7     384.
## 20 AM WEST delayed Seattle           61      23.3     232.
ggplot(data = airlines_long, aes(x = Destination, y = Result, fill = Status)) +
  geom_bar(stat = "identity", width = 0.7) + 
  geom_text(data = airlines_long %>% filter(Status == "delayed"),
            aes(label = paste0(round(percentage, 1), "%"), y = label_y),
            color = "black", size = 4) +
  facet_grid(~ Airline) +
  theme_minimal() +
  labs(title = "Comparison of Delays and On-Time Flights by Airline and Destination",
       x = "Destination",
       y = "Total Flights",
       fill = "Status") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))  

Alaska Airline experiences the highest percentage of delays for flights to San Francisco at 16.9%, followed by Seattle at 14.2%. Delays for Phoenix and San Diego are relatively low. AM West Airline shows the most significant percentage of delays for flights to San Francisco (28.7%), followed by Seattle (23.3%) and Los Angeles (8.5%). Phoenix has low delay percentages.

# Summarize flight status by airline
status_summary <- airlines_long %>%
  group_by(Status, Airline) %>%
  summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
status_summary <- status_summary %>%
  group_by(Airline) %>%
  mutate(percentage = ifelse(Status == "delayed", (total_flights / sum(total_flights)) * 100, NA))

print(status_summary)
## # A tibble: 4 × 4
## # Groups:   Airline [2]
##   Status  Airline total_flights percentage
##   <chr>   <chr>           <dbl>      <dbl>
## 1 delayed ALASKA            501       13.3
## 2 delayed AM WEST           787       10.9
## 3 on time ALASKA           3274       NA  
## 4 on time AM WEST          6438       NA
# Create a stacked bar chart_airlines
ggplot(data = status_summary, aes(x = Airline, y = total_flights, fill = Status)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(data = status_summary %>% filter(Status == "delayed"),
            aes(label = paste0(round(percentage, 1), "%")),
            position = position_stack(vjust = 9.5),
            color = "black", size = 4) +
  theme_minimal() +
  labs(title = "Comparison of Delays and On-Time Flights for Airlines",
       x = "Airline",
       y = "Number of Flights",
       fill = "Flight Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ALASKA Airline has 501 delays and 3274 on time flights, and delays accounted for 13.3% of their total flights. AM WEST Airline has 787 delayed flights and 6438 on time flights, and delayed flights represented 10.9% of their total flights.

# Summarize flights for each destination by flight status
destination_status <- airlines_long %>%
  group_by(Status, Destination) %>%
  summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
destination_status <- destination_status %>%
  group_by(Destination) %>%
  mutate(percentage = ifelse(Status == "delayed", (total_flights / sum(total_flights)) * 100, NA))

print(destination_status)
## # A tibble: 10 × 4
## # Groups:   Destination [5]
##    Status  Destination   total_flights percentage
##    <chr>   <chr>                 <dbl>      <dbl>
##  1 delayed Los Angeles             179      13.1 
##  2 delayed Phoenix                 427       7.78
##  3 delayed San Diego                85      12.5 
##  4 delayed San Francisco           231      21.9 
##  5 delayed Seattle                 366      15.2 
##  6 on time Los Angeles            1191      NA   
##  7 on time Phoenix                5061      NA   
##  8 on time San Diego               595      NA   
##  9 on time San Francisco           823      NA   
## 10 on time Seattle                2042      NA
# Create a stacked bar chart_destination
ggplot(data = destination_status, aes(x = Destination, y = total_flights, fill = Status)) +
  geom_bar(stat = "identity", width = 0.5) +
  theme_minimal() +
  labs(title = "Comparison of Delays and On-Time Flights by destination",
       x = "Destination",
       y = "Number of Flights",
       fill = "Flight Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

San Francisco has the highest percentage of delayed flights at 21.9%, followed by Seattle at 15.2%. Phoenix has the lowest percentage of delayed flights, with only 7.8% of flights being delayed.

Summary

Comparing flight performance city-by-city versus overall reveals that certain cities experience more delays. AM WEST performs well overall but has a high delay rate in San Francisco and a low rate in Phoenix.