IMPORT

airlines_data <- read.csv("airlines.csv") 
 
# Display the first few rows  
head(airlines_data)  
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

TIDYING

airlines_data$X <- ifelse(airlines_data$X == "", NA, airlines_data$X)
airlines_data <- airlines_data %>% fill(X)

# Rename columns for clarity
colnames(airlines_data) <- c("Airline", "Status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle")
 
# Check the result  
head(airlines_data)  
##   Airline  Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3  ALASKA                  NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

TRANSFORM

airlines_data$Status <- ifelse(airlines_data$Status == "", NA, airlines_data$Status)

# Remove rows with NA in Status (these are likely header rows or empty rows)
airlines_data <- airlines_data %>% filter(!is.na(Status))

# Convert to long format
airlines_long <- airlines_data %>%
  pivot_longer(cols = c(Los_Angeles, Phoenix, San_Diego, San_Francisco, Seattle),
               names_to = "City",
               values_to = "Count")

# Check the long format data
print(head(airlines_long, 10))
## # A tibble: 10 × 4
##    Airline Status  City          Count
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los_Angeles     497
##  2 ALASKA  on time Phoenix         221
##  3 ALASKA  on time San_Diego       212
##  4 ALASKA  on time San_Francisco   503
##  5 ALASKA  on time Seattle        1841
##  6 ALASKA  delayed Los_Angeles      62
##  7 ALASKA  delayed Phoenix          12
##  8 ALASKA  delayed San_Diego        20
##  9 ALASKA  delayed San_Francisco   102
## 10 ALASKA  delayed Seattle         305

ANALYSIS

overall_summary <- airlines_long %>%
  group_by(Airline, Status) %>%
  summarise(Total = sum(Count, na.rm = TRUE), .groups = 'drop')

overall_summary_wide <- overall_summary %>%
  pivot_wider(names_from = Status, values_from = Total) %>%
  mutate(
    Total_Flights = ifelse(is.na(`on time`), 0, `on time`) + ifelse(is.na(delayed), 0, delayed),
    pct_delayed = ifelse(Total_Flights > 0, 100 * ifelse(is.na(delayed), 0, delayed) / Total_Flights, 0)
  )

# Display overall summary
print(overall_summary_wide)
## # A tibble: 2 × 5
##   Airline delayed `on time` Total_Flights pct_delayed
##   <chr>     <int>     <int>         <int>       <dbl>
## 1 ALASKA      501      3274          3775        13.3
## 2 AM WEST     787      6438          7225        10.9
city_summary <- airlines_long %>%
  group_by(Airline, City, Status) %>%
  summarise(Total = sum(Count, na.rm = TRUE), .groups = 'drop')

city_summary_wide <- city_summary %>%
  pivot_wider(names_from = Status, values_from = Total) %>%
  mutate(
    Total_Flights = ifelse(is.na(`on time`), 0, `on time`) + ifelse(is.na(delayed), 0, delayed),
    pct_delayed = ifelse(Total_Flights > 0, 100 * ifelse(is.na(delayed), 0, delayed) / Total_Flights, 0)
  )

# Display city-level summary
print(city_summary_wide)
## # A tibble: 10 × 6
##    Airline City          delayed `on time` Total_Flights pct_delayed
##    <chr>   <chr>           <int>     <int>         <int>       <dbl>
##  1 ALASKA  Los_Angeles        62       497           559       11.1 
##  2 ALASKA  Phoenix            12       221           233        5.15
##  3 ALASKA  San_Diego          20       212           232        8.62
##  4 ALASKA  San_Francisco     102       503           605       16.9 
##  5 ALASKA  Seattle           305      1841          2146       14.2 
##  6 AM WEST Los_Angeles       117       694           811       14.4 
##  7 AM WEST Phoenix           415      4840          5255        7.90
##  8 AM WEST San_Diego          65       383           448       14.5 
##  9 AM WEST San_Francisco     129       320           449       28.7 
## 10 AM WEST Seattle            61       201           262       23.3
city_comparison <- city_summary_wide %>%
  select(Airline, City, pct_delayed) %>%
  rename(City_Pct_Delayed = pct_delayed)

overall_comparison <- overall_summary_wide %>%
  select(Airline, pct_delayed) %>%
  rename(Overall_Pct_Delayed = pct_delayed)

# Join the city and overall percentages
comparison_table <- city_comparison %>%
  left_join(overall_comparison, by = "Airline") %>%
  mutate(Difference = City_Pct_Delayed - Overall_Pct_Delayed)

# Display the comparison table
print(comparison_table)
## # A tibble: 10 × 5
##    Airline City          City_Pct_Delayed Overall_Pct_Delayed Difference
##    <chr>   <chr>                    <dbl>               <dbl>      <dbl>
##  1 ALASKA  Los_Angeles              11.1                 13.3     -2.18 
##  2 ALASKA  Phoenix                   5.15                13.3     -8.12 
##  3 ALASKA  San_Diego                 8.62                13.3     -4.65 
##  4 ALASKA  San_Francisco            16.9                 13.3      3.59 
##  5 ALASKA  Seattle                  14.2                 13.3      0.941
##  6 AM WEST Los_Angeles              14.4                 10.9      3.53 
##  7 AM WEST Phoenix                   7.90                10.9     -3.00 
##  8 AM WEST San_Diego                14.5                 10.9      3.62 
##  9 AM WEST San_Francisco            28.7                 10.9     17.8  
## 10 AM WEST Seattle                  23.3                 10.9     12.4
# Calculate the average number of flights per city for each airline
flight_distribution <- airlines_long %>%
  group_by(Airline, City) %>%
  summarise(Total_Flights = sum(Count, na.rm = TRUE), .groups = 'drop')

# Display flight distribution
print(flight_distribution)
## # A tibble: 10 × 3
##    Airline City          Total_Flights
##    <chr>   <chr>                 <int>
##  1 ALASKA  Los_Angeles             559
##  2 ALASKA  Phoenix                 233
##  3 ALASKA  San_Diego               232
##  4 ALASKA  San_Francisco           605
##  5 ALASKA  Seattle                2146
##  6 AM WEST Los_Angeles             811
##  7 AM WEST Phoenix                5255
##  8 AM WEST San_Diego               448
##  9 AM WEST San_Francisco           449
## 10 AM WEST Seattle                 262

VISUALIZE

ggplot(city_summary_wide, aes(x = City, y = pct_delayed, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "City-wise Percentage of Delayed Flights by Airline",
       x = "City",
       y = "Percentage Delayed (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

TEXT

City-by-City:

Los_Angeles: ALASKA performs better with 3.34% lower delay rate

Phoenix: ALASKA performs better with 2.75% lower delay rate

San_Diego: ALASKA performs better with 5.89% lower delay rate

San_Francisco: ALASKA performs better with 11.87% lower delay rate

Seattle: ALASKA performs better with 9.07% lower delay rate

ALASKA flight distribution:

Seattle: 56.85% of flights (2146 flights)

San_Francisco: 16.03% of flights (605 flights)

Los_Angeles: 14.81% of flights (559 flights)

Phoenix: 6.17% of flights (233 flights)

San_Diego: 6.15% of flights (232 flights)

AM WEST flight distribution:

Phoenix: 72.73% of flights (5255 flights)

Los_Angeles: 11.22% of flights (811 flights)

San_Francisco: 6.21% of flights (449 flights)

San_Diego: 6.2% of flights (448 flights)

Seattle: 3.63% of flights (262 flights)

Simpson’s Paradox:

ALASKA performs better in every individual city, but as we can see from

the data, AM WEST has better overall performance.

AM WEST has 72.73% of its flights in Phoenix (relatively low delay rate)

ALASKA has most of its flights in Seattle (relatively higher delay rate)