# Loading required libraries
library(tidyr)
library(dplyr)
library(ggplot2)

# Load the dataset
url <- "https://raw.githubusercontent.com/Amish22/DS607/refs/heads/main/airline-data.csv"
airline_data <- read.csv(url)

head(airline_data)
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Data Transformation: Converting to Long Format

The dataset is initially in a wide format, where each row corresponds to a combination of airline and status (on time or delayed), and columns represent counts of flights for five destinations. To facilitate analysis, I reshaped this data into a long format.

# Converting data to long format 
tidy_airline_data <- airline_data %>%
  pivot_longer(cols = c("Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle"),
               names_to = "Destination",
               values_to = "Count")

# View the tidied data
print(tidy_airline_data)
## # A tibble: 20 × 4
##    Airline Status  Destination   Count
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los.Angeles     497
##  2 ALASKA  on time Phoenix         221
##  3 ALASKA  on time San.Diego       212
##  4 ALASKA  on time San.Francisco   503
##  5 ALASKA  on time Seattle        1841
##  6 ALASKA  delayed Los.Angeles      62
##  7 ALASKA  delayed Phoenix          12
##  8 ALASKA  delayed San.Diego        20
##  9 ALASKA  delayed San.Francisco   102
## 10 ALASKA  delayed Seattle         305
## 11 AM WEST on time Los.Angeles     694
## 12 AM WEST on time Phoenix        4840
## 13 AM WEST on time San.Diego       383
## 14 AM WEST on time San.Francisco   320
## 15 AM WEST on time Seattle         201
## 16 AM WEST delayed Los.Angeles     117
## 17 AM WEST delayed Phoenix         415
## 18 AM WEST delayed San.Diego        65
## 19 AM WEST delayed San.Francisco   129
## 20 AM WEST delayed Seattle          61

The tidied data now has three primary columns: Airline, Status, and Destination. The Count column represents the number of flights for each combination of airline, status, and destination.

Handling Missing Data

Next, I addressed missing data by replacing empty cells with zeros, ensuring that all combinations have values for accurate analysis.

# Replacing missing data with zeros
tidy_airline_data <- tidy_airline_data %>%
  mutate(Count = ifelse(is.na(Count), 0, Count))

Analysis: Total Flights by Status and Destination

I grouped the data by Airline, Status, and Destination to calculate the total number of flights in each category.

summary_data <- tidy_airline_data %>%
  group_by(Airline, Status, Destination) %>%
  summarize(Total_Flights = sum(Count), .groups = "drop")

print(summary_data)
## # A tibble: 20 × 4
##    Airline Status  Destination   Total_Flights
##    <chr>   <chr>   <chr>                 <int>
##  1 ALASKA  delayed Los.Angeles              62
##  2 ALASKA  delayed Phoenix                  12
##  3 ALASKA  delayed San.Diego                20
##  4 ALASKA  delayed San.Francisco           102
##  5 ALASKA  delayed Seattle                 305
##  6 ALASKA  on time Los.Angeles             497
##  7 ALASKA  on time Phoenix                 221
##  8 ALASKA  on time San.Diego               212
##  9 ALASKA  on time San.Francisco           503
## 10 ALASKA  on time Seattle                1841
## 11 AM WEST delayed Los.Angeles             117
## 12 AM WEST delayed Phoenix                 415
## 13 AM WEST delayed San.Diego                65
## 14 AM WEST delayed San.Francisco           129
## 15 AM WEST delayed Seattle                  61
## 16 AM WEST on time Los.Angeles             694
## 17 AM WEST on time Phoenix                4840
## 18 AM WEST on time San.Diego               383
## 19 AM WEST on time San.Francisco           320
## 20 AM WEST on time Seattle                 201
# Visualizing Total Flights by Airline, Status, and Destination
ggplot(summary_data, aes(x = Destination, y = Total_Flights, fill = Status)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~ Airline) +
  theme_minimal() +
  labs(title = "Total Flights by Status for Each Airline Across Destinations",
       x = "Destination",
       y = "Total Flights",
       fill = "Flight Status") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Calculating Delay Percentages

The delay percentage provides an easy way to compare on-time performance between the two airlines. A lower delay percentage indicates better on-time performance. This metric will be used in the visualization below to highlight differences.

# Calculating delay percentages
delay_percentage <- summary_data %>%
  pivot_wider(names_from = Status, values_from = Total_Flights) %>%
  mutate(Delay_Percentage = (delayed / (`on time` + delayed)) * 100) %>%
  select(Airline, Destination, Delay_Percentage)

print(delay_percentage)
## # A tibble: 10 × 3
##    Airline Destination   Delay_Percentage
##    <chr>   <chr>                    <dbl>
##  1 ALASKA  Los.Angeles              11.1 
##  2 ALASKA  Phoenix                   5.15
##  3 ALASKA  San.Diego                 8.62
##  4 ALASKA  San.Francisco            16.9 
##  5 ALASKA  Seattle                  14.2 
##  6 AM WEST Los.Angeles              14.4 
##  7 AM WEST Phoenix                   7.90
##  8 AM WEST San.Diego                14.5 
##  9 AM WEST San.Francisco            28.7 
## 10 AM WEST Seattle                  23.3

Visualizing Delay Percentages Across Destinations

The bar plot provides a side-by-side comparison of delay percentages for ALASKA and AM WEST across the five destinations.

ggplot(delay_percentage, aes(x = Destination, y = Delay_Percentage, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Comparison of Delay Percentages for ALASKA and AM WEST",
       x = "Destination", y = "Delay Percentage (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Findings

From the plot, we can see that AM WEST generally has higher delay percentages compared to ALASKA, indicating that it struggles more with on-time performance.

Identifying Best and Worst Performance Destinations

Next, I identified the destinations with the highest and lowest delay percentages for each airline.

# Destination with highest and lowest delay percentage for each airline
highest_delay <- delay_percentage %>%
  group_by(Airline) %>%
  slice_max(Delay_Percentage)

lowest_delay <- delay_percentage %>%
  group_by(Airline) %>%
  slice_min(Delay_Percentage)

highest_delay
## # A tibble: 2 × 3
## # Groups:   Airline [2]
##   Airline Destination   Delay_Percentage
##   <chr>   <chr>                    <dbl>
## 1 ALASKA  San.Francisco             16.9
## 2 AM WEST San.Francisco             28.7
lowest_delay
## # A tibble: 2 × 3
## # Groups:   Airline [2]
##   Airline Destination Delay_Percentage
##   <chr>   <chr>                  <dbl>
## 1 ALASKA  Phoenix                 5.15
## 2 AM WEST Phoenix                 7.90

Summary of Best and Worst Performance

The destinations with the highest and lowest delay percentages highlight which routes perform well and which need improvement.

Statistical Summary of Delay Percentages

To provide more context, I calculated the average and median delay percentages for each airline.

# Calculating average and median delay percentages
stat_summary <- delay_percentage %>%
  group_by(Airline) %>%
  summarize(Average_Delay = mean(Delay_Percentage, na.rm = TRUE),
            Median_Delay = median(Delay_Percentage),
            .groups = "drop")

stat_summary
## # A tibble: 2 × 3
##   Airline Average_Delay Median_Delay
##   <chr>           <dbl>        <dbl>
## 1 ALASKA           11.2         11.1
## 2 AM WEST          17.8         14.5
# Transforming for visualization
stat_summary_long <- stat_summary %>%
  pivot_longer(cols = c("Average_Delay", "Median_Delay"),
               names_to = "Statistic",
               values_to = "Percentage")

# Visualizing average and median delays for each airline
ggplot(stat_summary_long, aes(x = Airline, y = Percentage, fill = Statistic)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Average and Median Delay Percentages for ALASKA and AM WEST",
       x = "Airline", y = "Delay Percentage (%)") +
  scale_fill_brewer(palette = "Set2")

Summary

AM WEST has a higher average delay percentage compared to ALASKA. The high average delay indicates that AM WEST experiences more frequent delays. The median delay percentage for AM WEST is also higher than ALASKA, indicating that even typical delays are more common for AM WEST.

Overall and City-by-City Comparisons

Comparing the overall percentages with the city-by-city performance reveals that AM WEST struggles more consistently across cities, whereas ALASKA maintains relatively better performance in most locations.