IMPORT
airlines_data <- read.csv("airlines.csv")
# Display the first few rows
head(airlines_data)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
TIDYING
airlines_data$X <- ifelse(airlines_data$X == "", NA, airlines_data$X)
airlines_data <- airlines_data %>% fill(X)
# Rename columns for clarity
colnames(airlines_data) <- c("Airline", "Status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle")
# Check the result
head(airlines_data)
## Airline Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 ALASKA NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
ANALYSIS
overall_summary <- airlines_long %>%
group_by(Airline, Status) %>%
summarise(Total = sum(Count, na.rm = TRUE), .groups = 'drop')
overall_summary_wide <- overall_summary %>%
pivot_wider(names_from = Status, values_from = Total) %>%
mutate(
Total_Flights = ifelse(is.na(`on time`), 0, `on time`) + ifelse(is.na(delayed), 0, delayed),
pct_delayed = ifelse(Total_Flights > 0, 100 * ifelse(is.na(delayed), 0, delayed) / Total_Flights, 0)
)
# Display overall summary
print(overall_summary_wide)
## # A tibble: 2 × 5
## Airline delayed `on time` Total_Flights pct_delayed
## <chr> <int> <int> <int> <dbl>
## 1 ALASKA 501 3274 3775 13.3
## 2 AM WEST 787 6438 7225 10.9
city_summary <- airlines_long %>%
group_by(Airline, City, Status) %>%
summarise(Total = sum(Count, na.rm = TRUE), .groups = 'drop')
city_summary_wide <- city_summary %>%
pivot_wider(names_from = Status, values_from = Total) %>%
mutate(
Total_Flights = ifelse(is.na(`on time`), 0, `on time`) + ifelse(is.na(delayed), 0, delayed),
pct_delayed = ifelse(Total_Flights > 0, 100 * ifelse(is.na(delayed), 0, delayed) / Total_Flights, 0)
)
# Display city-level summary
print(city_summary_wide)
## # A tibble: 10 × 6
## Airline City delayed `on time` Total_Flights pct_delayed
## <chr> <chr> <int> <int> <int> <dbl>
## 1 ALASKA Los_Angeles 62 497 559 11.1
## 2 ALASKA Phoenix 12 221 233 5.15
## 3 ALASKA San_Diego 20 212 232 8.62
## 4 ALASKA San_Francisco 102 503 605 16.9
## 5 ALASKA Seattle 305 1841 2146 14.2
## 6 AM WEST Los_Angeles 117 694 811 14.4
## 7 AM WEST Phoenix 415 4840 5255 7.90
## 8 AM WEST San_Diego 65 383 448 14.5
## 9 AM WEST San_Francisco 129 320 449 28.7
## 10 AM WEST Seattle 61 201 262 23.3
city_comparison <- city_summary_wide %>%
select(Airline, City, pct_delayed) %>%
rename(City_Pct_Delayed = pct_delayed)
overall_comparison <- overall_summary_wide %>%
select(Airline, pct_delayed) %>%
rename(Overall_Pct_Delayed = pct_delayed)
# Join the city and overall percentages
comparison_table <- city_comparison %>%
left_join(overall_comparison, by = "Airline") %>%
mutate(Difference = City_Pct_Delayed - Overall_Pct_Delayed)
# Display the comparison table
print(comparison_table)
## # A tibble: 10 × 5
## Airline City City_Pct_Delayed Overall_Pct_Delayed Difference
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 ALASKA Los_Angeles 11.1 13.3 -2.18
## 2 ALASKA Phoenix 5.15 13.3 -8.12
## 3 ALASKA San_Diego 8.62 13.3 -4.65
## 4 ALASKA San_Francisco 16.9 13.3 3.59
## 5 ALASKA Seattle 14.2 13.3 0.941
## 6 AM WEST Los_Angeles 14.4 10.9 3.53
## 7 AM WEST Phoenix 7.90 10.9 -3.00
## 8 AM WEST San_Diego 14.5 10.9 3.62
## 9 AM WEST San_Francisco 28.7 10.9 17.8
## 10 AM WEST Seattle 23.3 10.9 12.4
# Calculate the average number of flights per city for each airline
flight_distribution <- airlines_long %>%
group_by(Airline, City) %>%
summarise(Total_Flights = sum(Count, na.rm = TRUE), .groups = 'drop')
# Display flight distribution
print(flight_distribution)
## # A tibble: 10 × 3
## Airline City Total_Flights
## <chr> <chr> <int>
## 1 ALASKA Los_Angeles 559
## 2 ALASKA Phoenix 233
## 3 ALASKA San_Diego 232
## 4 ALASKA San_Francisco 605
## 5 ALASKA Seattle 2146
## 6 AM WEST Los_Angeles 811
## 7 AM WEST Phoenix 5255
## 8 AM WEST San_Diego 448
## 9 AM WEST San_Francisco 449
## 10 AM WEST Seattle 262
VISUALIZE
ggplot(city_summary_wide, aes(x = City, y = pct_delayed, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "City-wise Percentage of Delayed Flights by Airline",
x = "City",
y = "Percentage Delayed (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

TEXT
City-by-City:
Los_Angeles: ALASKA performs better with 3.34% lower delay rate
Phoenix: ALASKA performs better with 2.75% lower delay rate
San_Diego: ALASKA performs better with 5.89% lower delay rate
San_Francisco: ALASKA performs better with 11.87% lower delay
rate
Seattle: ALASKA performs better with 9.07% lower delay rate
ALASKA flight distribution:
Seattle: 56.85% of flights (2146 flights)
San_Francisco: 16.03% of flights (605 flights)
Los_Angeles: 14.81% of flights (559 flights)
Phoenix: 6.17% of flights (233 flights)
San_Diego: 6.15% of flights (232 flights)
AM WEST flight distribution:
Phoenix: 72.73% of flights (5255 flights)
Los_Angeles: 11.22% of flights (811 flights)
San_Francisco: 6.21% of flights (449 flights)
San_Diego: 6.2% of flights (448 flights)
Seattle: 3.63% of flights (262 flights)
Simpson’s Paradox:
ALASKA performs better in every individual city, but as we can see
from
the data, AM WEST has better overall performance.
AM WEST has 72.73% of its flights in Phoenix (relatively low delay
rate)
ALASKA has most of its flights in Seattle (relatively higher delay
rate)