# Loading required libraries
library(tidyr)
library(dplyr)
library(ggplot2)
# Load the dataset
url <- "https://raw.githubusercontent.com/Amish22/DS607/refs/heads/main/airline-data.csv"
airline_data <- read.csv(url)
head(airline_data)
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
The dataset is initially in a wide format, where each row corresponds to a combination of airline and status (on time or delayed), and columns represent counts of flights for five destinations. To facilitate analysis, I reshaped this data into a long format.
# Converting data to long format
tidy_airline_data <- airline_data %>%
pivot_longer(cols = c("Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle"),
names_to = "Destination",
values_to = "Count")
# View the tidied data
print(tidy_airline_data)
## # A tibble: 20 × 4
## Airline Status Destination Count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Francisco 129
## 20 AM WEST delayed Seattle 61
The tidied data now has three primary columns: Airline
,
Status
, and Destination
. The
Count
column represents the number of flights for each
combination of airline, status, and destination.
Next, I addressed missing data by replacing empty cells with zeros, ensuring that all combinations have values for accurate analysis.
# Replacing missing data with zeros
tidy_airline_data <- tidy_airline_data %>%
mutate(Count = ifelse(is.na(Count), 0, Count))
I grouped the data by Airline
, Status
, and
Destination
to calculate the total number of flights in
each category.
summary_data <- tidy_airline_data %>%
group_by(Airline, Status, Destination) %>%
summarize(Total_Flights = sum(Count), .groups = "drop")
print(summary_data)
## # A tibble: 20 × 4
## Airline Status Destination Total_Flights
## <chr> <chr> <chr> <int>
## 1 ALASKA delayed Los.Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San.Diego 20
## 4 ALASKA delayed San.Francisco 102
## 5 ALASKA delayed Seattle 305
## 6 ALASKA on time Los.Angeles 497
## 7 ALASKA on time Phoenix 221
## 8 ALASKA on time San.Diego 212
## 9 ALASKA on time San.Francisco 503
## 10 ALASKA on time Seattle 1841
## 11 AM WEST delayed Los.Angeles 117
## 12 AM WEST delayed Phoenix 415
## 13 AM WEST delayed San.Diego 65
## 14 AM WEST delayed San.Francisco 129
## 15 AM WEST delayed Seattle 61
## 16 AM WEST on time Los.Angeles 694
## 17 AM WEST on time Phoenix 4840
## 18 AM WEST on time San.Diego 383
## 19 AM WEST on time San.Francisco 320
## 20 AM WEST on time Seattle 201
# Visualizing Total Flights by Airline, Status, and Destination
ggplot(summary_data, aes(x = Destination, y = Total_Flights, fill = Status)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~ Airline) +
theme_minimal() +
labs(title = "Total Flights by Status for Each Airline Across Destinations",
x = "Destination",
y = "Total Flights",
fill = "Flight Status") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The delay percentage provides an easy way to compare on-time performance between the two airlines. A lower delay percentage indicates better on-time performance. This metric will be used in the visualization below to highlight differences.
# Calculating delay percentages
delay_percentage <- summary_data %>%
pivot_wider(names_from = Status, values_from = Total_Flights) %>%
mutate(Delay_Percentage = (delayed / (`on time` + delayed)) * 100) %>%
select(Airline, Destination, Delay_Percentage)
print(delay_percentage)
## # A tibble: 10 × 3
## Airline Destination Delay_Percentage
## <chr> <chr> <dbl>
## 1 ALASKA Los.Angeles 11.1
## 2 ALASKA Phoenix 5.15
## 3 ALASKA San.Diego 8.62
## 4 ALASKA San.Francisco 16.9
## 5 ALASKA Seattle 14.2
## 6 AM WEST Los.Angeles 14.4
## 7 AM WEST Phoenix 7.90
## 8 AM WEST San.Diego 14.5
## 9 AM WEST San.Francisco 28.7
## 10 AM WEST Seattle 23.3
The bar plot provides a side-by-side comparison of delay percentages for ALASKA and AM WEST across the five destinations.
ggplot(delay_percentage, aes(x = Destination, y = Delay_Percentage, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Comparison of Delay Percentages for ALASKA and AM WEST",
x = "Destination", y = "Delay Percentage (%)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
From the plot, we can see that AM WEST generally has higher delay percentages compared to ALASKA, indicating that it struggles more with on-time performance.
Next, I identified the destinations with the highest and lowest delay percentages for each airline.
# Destination with highest and lowest delay percentage for each airline
highest_delay <- delay_percentage %>%
group_by(Airline) %>%
slice_max(Delay_Percentage)
lowest_delay <- delay_percentage %>%
group_by(Airline) %>%
slice_min(Delay_Percentage)
highest_delay
## # A tibble: 2 × 3
## # Groups: Airline [2]
## Airline Destination Delay_Percentage
## <chr> <chr> <dbl>
## 1 ALASKA San.Francisco 16.9
## 2 AM WEST San.Francisco 28.7
lowest_delay
## # A tibble: 2 × 3
## # Groups: Airline [2]
## Airline Destination Delay_Percentage
## <chr> <chr> <dbl>
## 1 ALASKA Phoenix 5.15
## 2 AM WEST Phoenix 7.90
The destinations with the highest and lowest delay percentages highlight which routes perform well and which need improvement.
To provide more context, I calculated the average and median delay percentages for each airline.
# Calculating average and median delay percentages
stat_summary <- delay_percentage %>%
group_by(Airline) %>%
summarize(Average_Delay = mean(Delay_Percentage, na.rm = TRUE),
Median_Delay = median(Delay_Percentage),
.groups = "drop")
stat_summary
## # A tibble: 2 × 3
## Airline Average_Delay Median_Delay
## <chr> <dbl> <dbl>
## 1 ALASKA 11.2 11.1
## 2 AM WEST 17.8 14.5
# Transforming for visualization
stat_summary_long <- stat_summary %>%
pivot_longer(cols = c("Average_Delay", "Median_Delay"),
names_to = "Statistic",
values_to = "Percentage")
# Visualizing average and median delays for each airline
ggplot(stat_summary_long, aes(x = Airline, y = Percentage, fill = Statistic)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Average and Median Delay Percentages for ALASKA and AM WEST",
x = "Airline", y = "Delay Percentage (%)") +
scale_fill_brewer(palette = "Set2")
AM WEST has a higher average delay percentage compared to ALASKA. The high average delay indicates that AM WEST experiences more frequent delays. The median delay percentage for AM WEST is also higher than ALASKA, indicating that even typical delays are more common for AM WEST.
Comparing the overall percentages with the city-by-city performance reveals that AM WEST struggles more consistently across cities, whereas ALASKA maintains relatively better performance in most locations.