Loading the libraries and the data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
file_path <- "C:\\ProgramData\\MySQL\\MySQL Server 8.0\\Uploads\\airlines.csv"
airlines_data <- read_csv(file_path)
## Rows: 4 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Airline, Status
## dbl (5): Los Angeles, Phoenix, San Diego, San Francisco, Seattle
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head (airlines_data)
## # A tibble: 4 × 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 ALASKA  delayed            62      12          20             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 AM WEST delayed           117     415          65             129      61

from wide data form to long form

airlines_long <- airlines_data %>%
  pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
               names_to = "Destination",
               values_to = "Result")

head (airlines_long)
## # A tibble: 6 × 4
##   Airline Status  Destination   Result
##   <chr>   <chr>   <chr>          <dbl>
## 1 ALASKA  on time Los Angeles      497
## 2 ALASKA  on time Phoenix          221
## 3 ALASKA  on time San Diego        212
## 4 ALASKA  on time San Francisco    503
## 5 ALASKA  on time Seattle         1841
## 6 ALASKA  delayed Los Angeles       62

Total Flights by Airlines

# Summarize total flights for each destination by airline
destination_summary <- airlines_long %>%
  group_by(Destination, Airline) %>%
  summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
print(destination_summary)
## # A tibble: 10 × 3
## # Groups:   Destination [5]
##    Destination   Airline total_flights
##    <chr>         <chr>           <dbl>
##  1 Los Angeles   ALASKA            559
##  2 Los Angeles   AM WEST           811
##  3 Phoenix       ALASKA            233
##  4 Phoenix       AM WEST          5255
##  5 San Diego     ALASKA            232
##  6 San Diego     AM WEST           448
##  7 San Francisco ALASKA            605
##  8 San Francisco AM WEST           449
##  9 Seattle       ALASKA           2146
## 10 Seattle       AM WEST           262
# Grouped bar chart to show total flights for each destination by airline
ggplot(data = destination_summary, aes(x = Destination, y = total_flights, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Total Flights to Each Destination by Airline",
       x = "Destination",
       y = "Total Number of Flights",
       fill = "Airline") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("ALASKA" = "#0077C8", "AM WEST" = "#C8102E"))

The graph shows that AM WEST has significantly more flights to Phoenix compared to ALASKA and also has a higher number of flights to Los Angeles and San Diego. On the other hand, ALASKA has the most flights to Seattle, and relatively higher flights to San Francisco.

Flight Delays

airlines_long <- airlines_long %>%
  group_by(Destination, Airline) %>%
  mutate(percentage = ifelse(Status == "delayed", (Result / sum(Result)) * 100, NA))

airlines_long <- airlines_long %>%
  group_by(Destination, Airline) %>%
  mutate(label_y = ifelse(Status == "delayed", 
                          lag(cumsum(Result), default = 0) + (Result / 2), 
                          NA))

print(airlines_long)
## # A tibble: 20 × 6
## # Groups:   Destination, Airline [10]
##    Airline Status  Destination   Result percentage label_y
##    <chr>   <chr>   <chr>          <dbl>      <dbl>   <dbl>
##  1 ALASKA  on time Los Angeles      497      NA        NA 
##  2 ALASKA  on time Phoenix          221      NA        NA 
##  3 ALASKA  on time San Diego        212      NA        NA 
##  4 ALASKA  on time San Francisco    503      NA        NA 
##  5 ALASKA  on time Seattle         1841      NA        NA 
##  6 ALASKA  delayed Los Angeles       62      11.1     528 
##  7 ALASKA  delayed Phoenix           12       5.15    227 
##  8 ALASKA  delayed San Diego         20       8.62    222 
##  9 ALASKA  delayed San Francisco    102      16.9     554 
## 10 ALASKA  delayed Seattle          305      14.2    1994.
## 11 AM WEST on time Los Angeles      694      NA        NA 
## 12 AM WEST on time Phoenix         4840      NA        NA 
## 13 AM WEST on time San Diego        383      NA        NA 
## 14 AM WEST on time San Francisco    320      NA        NA 
## 15 AM WEST on time Seattle          201      NA        NA 
## 16 AM WEST delayed Los Angeles      117      14.4     752.
## 17 AM WEST delayed Phoenix          415       7.90   5048.
## 18 AM WEST delayed San Diego         65      14.5     416.
## 19 AM WEST delayed San Francisco    129      28.7     384.
## 20 AM WEST delayed Seattle           61      23.3     232.
ggplot(data = airlines_long, aes(x = Destination, y = Result, fill = Status)) +
  geom_bar(stat = "identity", width = 0.7) + 
  geom_text(data = airlines_long %>% filter(Status == "delayed"),
            aes(label = paste0(round(percentage, 1), "%"), y = label_y),
            color = "black", size = 4) +
  facet_grid(~ Airline) +
  theme_minimal() +
  labs(title = "Comparison of Delays and On-Time Flights by Airline and Destination",
       x = "Destination",
       y = "Total Flights",
       fill = "Status") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))  

Alaska Airline experiences the highest percentage of delays for flights to San Francisco at 16.9%, followed by Seattle at 14.2%. Delays for Phoenix and San Diego are relatively low. AM West Airline shows the most significant percentage of delays for flights to San Francisco (28.7%), followed by Seattle (23.3%) and Los Angeles (8.5%). Phoenix has low delay percentages.

# Summarize flight status by airline
status_summary <- airlines_long %>%
  group_by(Status, Airline) %>%
  summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
status_summary <- status_summary %>%
  group_by(Airline) %>%
  mutate(percentage = ifelse(Status == "delayed", (total_flights / sum(total_flights)) * 100, NA))

print(status_summary)
## # A tibble: 4 × 4
## # Groups:   Airline [2]
##   Status  Airline total_flights percentage
##   <chr>   <chr>           <dbl>      <dbl>
## 1 delayed ALASKA            501       13.3
## 2 delayed AM WEST           787       10.9
## 3 on time ALASKA           3274       NA  
## 4 on time AM WEST          6438       NA
# Create a stacked bar chart_airlines
ggplot(data = status_summary, aes(x = Airline, y = total_flights, fill = Status)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(data = status_summary %>% filter(Status == "delayed"),
            aes(label = paste0(round(percentage, 1), "%")),
            position = position_stack(vjust = 9.5),
            color = "black", size = 4) +
  theme_minimal() +
  labs(title = "Comparison of Delays and On-Time Flights for Airlines",
       x = "Airline",
       y = "Number of Flights",
       fill = "Flight Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ALASKA Airline has 501 delays and 3274 on time flights, and delays accounted for 13.3% of their total flights. AM WEST Airline has 787 delayed flights and 6438 on time flights, and delayed flights represented 10.9% of their total flights.

# Summarize flights for each destination by flight status
destination_status <- airlines_long %>%
  group_by(Status, Destination) %>%
  summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
destination_status <- destination_status %>%
  group_by(Destination) %>%
  mutate(percentage = ifelse(Status == "delayed", (total_flights / sum(total_flights)) * 100, NA))

print(destination_status)
## # A tibble: 10 × 4
## # Groups:   Destination [5]
##    Status  Destination   total_flights percentage
##    <chr>   <chr>                 <dbl>      <dbl>
##  1 delayed Los Angeles             179      13.1 
##  2 delayed Phoenix                 427       7.78
##  3 delayed San Diego                85      12.5 
##  4 delayed San Francisco           231      21.9 
##  5 delayed Seattle                 366      15.2 
##  6 on time Los Angeles            1191      NA   
##  7 on time Phoenix                5061      NA   
##  8 on time San Diego               595      NA   
##  9 on time San Francisco           823      NA   
## 10 on time Seattle                2042      NA
# Create a stacked bar chart_destination
ggplot(data = destination_status, aes(x = Destination, y = total_flights, fill = Status)) +
  geom_bar(stat = "identity", width = 0.5) +
  theme_minimal() +
  labs(title = "Comparison of Delays and On-Time Flights by destination",
       x = "Destination",
       y = "Number of Flights",
       fill = "Flight Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

San Francisco has the highest percentage of delayed flights at 21.9%, followed by Seattle at 15.2%. Phoenix has the lowest percentage of delayed flights, with only 7.8% of flights being delayed.