library(tidyr)
library(dplyr)
library(ggplot2)
# CSV data
url <- "https://raw.githubusercontent.com/Amish22/DS607/refs/heads/main/airline-data.csv"
airline_data <- read.csv(url)
head(airline_data)
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
The dataset is in a wide format, where each row corresponds to a combination of airline and status (on time or delayed), and columns represent counts of flights for five destinations. To facilitate analysis, I reshaped this data into a long format.
# Converting data to long format
tidy_airline_data <- airline_data %>%
pivot_longer(cols = c("Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle"),
names_to = "Destination",
values_to = "Count")
print(tidy_airline_data)
## # A tibble: 20 × 4
## Airline Status Destination Count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los.Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San.Diego 20
## 9 ALASKA delayed San.Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los.Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San.Diego 383
## 14 AM WEST on time San.Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los.Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San.Diego 65
## 19 AM WEST delayed San.Francisco 129
## 20 AM WEST delayed Seattle 61
The tidied data now has three primary columns: Airline, Status, and Destination. The Count column represents the number of flights for each combination of airline, status, and destination.
summary_data <- tidy_airline_data %>%
group_by(Airline, Status, Destination) %>%
summarize(Total_Flights = sum(Count), .groups = "drop")
print(summary_data)
## # A tibble: 20 × 4
## Airline Status Destination Total_Flights
## <chr> <chr> <chr> <int>
## 1 ALASKA delayed Los.Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San.Diego 20
## 4 ALASKA delayed San.Francisco 102
## 5 ALASKA delayed Seattle 305
## 6 ALASKA on time Los.Angeles 497
## 7 ALASKA on time Phoenix 221
## 8 ALASKA on time San.Diego 212
## 9 ALASKA on time San.Francisco 503
## 10 ALASKA on time Seattle 1841
## 11 AM WEST delayed Los.Angeles 117
## 12 AM WEST delayed Phoenix 415
## 13 AM WEST delayed San.Diego 65
## 14 AM WEST delayed San.Francisco 129
## 15 AM WEST delayed Seattle 61
## 16 AM WEST on time Los.Angeles 694
## 17 AM WEST on time Phoenix 4840
## 18 AM WEST on time San.Diego 383
## 19 AM WEST on time San.Francisco 320
## 20 AM WEST on time Seattle 201
# Percent delayed
delay_percentage <- summary_data %>%
pivot_wider(names_from = Status, values_from = Total_Flights) %>%
mutate(Delay_Percentage = (delayed / (`on time` + delayed)) * 100) %>%
select(Airline, Destination, Delay_Percentage)
print(delay_percentage)
## # A tibble: 10 × 3
## Airline Destination Delay_Percentage
## <chr> <chr> <dbl>
## 1 ALASKA Los.Angeles 11.1
## 2 ALASKA Phoenix 5.15
## 3 ALASKA San.Diego 8.62
## 4 ALASKA San.Francisco 16.9
## 5 ALASKA Seattle 14.2
## 6 AM WEST Los.Angeles 14.4
## 7 AM WEST Phoenix 7.90
## 8 AM WEST San.Diego 14.5
## 9 AM WEST San.Francisco 28.7
## 10 AM WEST Seattle 23.3
The delay percentage provides an easy way to compare on-time performance between the two airlines. A lower delay percentage indicates better on-time performance. This metric will be used in the visualization below to highlight differences.
ggplot(delay_percentage, aes(x = Destination, y = Delay_Percentage, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Comparison of Delay Percentages for ALASKA and AM WEST",
x = "Destination", y = "Delay Percentage (%)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The bar plot provides a side-by-side comparison of delay percentages for ALASKA and AM WEST across the five destinations. From the plot, we can see that AM WEST generally has higher delay percentages compared to ALASKA.
# Destination with highest and lowest delay percentage for each airline
highest_delay <- delay_percentage %>%
group_by(Airline) %>%
slice_max(Delay_Percentage)
lowest_delay <- delay_percentage %>%
group_by(Airline) %>%
slice_min(Delay_Percentage)
highest_delay
## # A tibble: 2 × 3
## # Groups: Airline [2]
## Airline Destination Delay_Percentage
## <chr> <chr> <dbl>
## 1 ALASKA San.Francisco 16.9
## 2 AM WEST San.Francisco 28.7
lowest_delay
## # A tibble: 2 × 3
## # Groups: Airline [2]
## Airline Destination Delay_Percentage
## <chr> <chr> <dbl>
## 1 ALASKA Phoenix 5.15
## 2 AM WEST Phoenix 7.90
# Statistical summary for delay percentages
stat_summary <- delay_percentage %>%
group_by(Airline) %>%
summarize(Average_Delay = mean(Delay_Percentage),
Median_Delay = median(Delay_Percentage),
.groups = "drop")
stat_summary
## # A tibble: 2 × 3
## Airline Average_Delay Median_Delay
## <chr> <dbl> <dbl>
## 1 ALASKA 11.2 11.1
## 2 AM WEST 17.8 14.5
stat_summary_long <- stat_summary %>%
pivot_longer(cols = c("Average_Delay", "Median_Delay"),
names_to = "Statistic",
values_to = "Percentage")
# Bar plot to compare average and median delays for each airline
ggplot(stat_summary_long, aes(x = Airline, y = Percentage, fill = Statistic)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Average and Median Delay Percentages for ALASKA and AM WEST",
x = "Airline", y = "Delay Percentage (%)") +
scale_fill_brewer(palette = "Set2")
AM WEST has a higher average delay percentage compared to ALASKA. The
high average delay indicates that AM WEST experiences more frequent
delays. The median delay percentage for AM WEST is also higher than
ALASKA, indicating that even typical delays are more common for AM
WEST.