library(tidyverse)
library(ggplot2)

Overview

The first objective in this process is loading the data, followed by cleaning up the data so that it takes on an expected dataframe form (i.e. empty rows, or null values when not appropriate). Next, we transform the data, moving from wide to long in order to more easily analyze. Conversely, we widen the column denoting delayed and on time flights, again, to more easily analyze. Lastly, airlines are compared at aggregate and at a city level, resulting in interesting findings (discussed in the conclusion).

Loading the data

# Load arrival delays csv from Github URL
#arrivaldelays <- read.csv(url("https://raw.githubusercontent.com/mattlucich/tidy-data/tidy/arrival-delays.csv"))

# Load arrival delays csv from Github repo (if you prefer)
df_arrival_delays <- read.csv("arrival-delays.csv")

# View the data
df_arrival_delays
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2         delayed          62      12        20           102     305
## 3                          NA                NA            NA        
## 4 AM WEST on time         694   4,840       383           320     201
## 5         delayed         117     415        65           129      61

Cleaning the data

# Filter out empty rows
df_ad_wide <- df_arrival_delays %>% filter(X.1 != "")

# If X (airline) is empty fill in value from previous row
df_ad_wide <- df_ad_wide %>% mutate(airline = ifelse(X=="", lag(X), X))

# Drop unneeded row
df_ad_wide <- df_ad_wide[-c(1)]

# Rename columns
df_ad_wide <- df_ad_wide %>% 
            rename(
              status = X.1,
              los_angeles = "Los.Angeles",
              phoenix = "Phoenix",
              san_diego = "San.Diego",
              san_franchisco = "San.Francisco",
              seattle = "Seattle"
              )

# Remove commas to convert to numeric
df_ad_wide <- as.data.frame(lapply(df_ad_wide, function(y) gsub(",", "", y)))

# Reorder columns
df_ad_wide <- df_ad_wide[, c(7, 1, 2, 3, 4, 5, 6)]
df_ad_wide
##   airline  status los_angeles phoenix san_diego san_franchisco seattle
## 1  ALASKA on time         497     221       212            503    1841
## 2  ALASKA delayed          62      12        20            102     305
## 3 AM WEST on time         694    4840       383            320     201
## 4 AM WEST delayed         117     415        65            129      61

Transforming the data

# Gather city columns into one column "city" and their values to "delay"
df_ad_long <- df_ad_wide %>% 
                        pivot_longer(
                          cols = los_angeles:seattle, 
                          names_to = "city", 
                          values_to = "delay",
                          values_drop_na = TRUE
                        )
df_ad_long
## # A tibble: 20 x 4
##    airline status  city           delay
##    <chr>   <chr>   <chr>          <chr>
##  1 ALASKA  on time los_angeles    497  
##  2 ALASKA  on time phoenix        221  
##  3 ALASKA  on time san_diego      212  
##  4 ALASKA  on time san_franchisco 503  
##  5 ALASKA  on time seattle        1841 
##  6 ALASKA  delayed los_angeles    62   
##  7 ALASKA  delayed phoenix        12   
##  8 ALASKA  delayed san_diego      20   
##  9 ALASKA  delayed san_franchisco 102  
## 10 ALASKA  delayed seattle        305  
## 11 AM WEST on time los_angeles    694  
## 12 AM WEST on time phoenix        4840 
## 13 AM WEST on time san_diego      383  
## 14 AM WEST on time san_franchisco 320  
## 15 AM WEST on time seattle        201  
## 16 AM WEST delayed los_angeles    117  
## 17 AM WEST delayed phoenix        415  
## 18 AM WEST delayed san_diego      65   
## 19 AM WEST delayed san_franchisco 129  
## 20 AM WEST delayed seattle        61

Analyzing the airline and city combinations (and more transformations)

# Concat airline and city into one column
df_ad_long$airline_city <- paste(df_ad_long$airline,df_ad_long$city)

# Widen data in order to see on-time percentage by airline by location
df_airline_city_wide <- df_ad_long %>%
    pivot_wider(names_from = status, values_from = delay)

# Add on-time percentage column
df_airline_city_wide <- df_airline_city_wide %>%
    mutate(ontime_perc = as.numeric(`on time`) / (as.numeric(`on time`) + as.numeric(delayed)))
df_airline_city_wide
## # A tibble: 10 x 6
##    airline city           airline_city           `on time` delayed ontime_perc
##    <chr>   <chr>          <chr>                  <chr>     <chr>         <dbl>
##  1 ALASKA  los_angeles    ALASKA los_angeles     497       62            0.889
##  2 ALASKA  phoenix        ALASKA phoenix         221       12            0.948
##  3 ALASKA  san_diego      ALASKA san_diego       212       20            0.914
##  4 ALASKA  san_franchisco ALASKA san_franchisco  503       102           0.831
##  5 ALASKA  seattle        ALASKA seattle         1841      305           0.858
##  6 AM WEST los_angeles    AM WEST los_angeles    694       117           0.856
##  7 AM WEST phoenix        AM WEST phoenix        4840      415           0.921
##  8 AM WEST san_diego      AM WEST san_diego      383       65            0.855
##  9 AM WEST san_franchisco AM WEST san_franchisco 320       129           0.713
## 10 AM WEST seattle        AM WEST seattle        201       61            0.767
# Histogram: ontime percentage
ggplot(df_airline_city_wide, aes(x=airline_city, weights=ontime_perc)) + 
  geom_bar(fill="turquoise3") + theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Airline City Ontime %") + ylab("Ontime %")

# Groupby airline_city and status
df_airline_city_long <- df_ad_long %>%
  group_by(airline_city, status) %>%
  summarize(count = as.numeric(delay))

# Stacked histogram: ontime and delay counts
ggplot(df_airline_city_long, aes(x=airline_city, weights=count, fill=status)) + 
  geom_bar() + theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Airline City Ontime vs Delay")

Analyzing airlines (and more transformations)

# Arrival delays by airline
df_airline_wide <- df_airline_city_wide %>%
    group_by(airline) %>%
    summarize(delay_total = sum(as.numeric(delayed)),
              ontime_total = sum(as.numeric(`on time`)))

# Add on-time percentage column
df_airline_wide <- df_airline_wide %>%
    mutate(ontime_perc = as.numeric(ontime_total) / (as.numeric(ontime_total) + as.numeric(delay_total)))
df_airline_wide
## # A tibble: 2 x 4
##   airline delay_total ontime_total ontime_perc
## * <chr>         <dbl>        <dbl>       <dbl>
## 1 ALASKA          501         3274       0.867
## 2 AM WEST         787         6438       0.891
# Histogram: ontime percentage
ggplot(df_airline_wide, aes(x=airline, weights=ontime_perc)) + 
  geom_bar(fill="turquoise3") + ggtitle("Airline City Ontime %") + ylab("Ontime %")

# Groupby airline and status
df_airline_long <- df_ad_long %>%
  group_by(airline, status) %>%
  summarize(count = sum(as.numeric(delay)))
df_airline_long
## # A tibble: 4 x 3
## # Groups:   airline [2]
##   airline status  count
##   <chr>   <chr>   <dbl>
## 1 ALASKA  delayed   501
## 2 ALASKA  on time  3274
## 3 AM WEST delayed   787
## 4 AM WEST on time  6438
# Stacked histogram: ontime and delay counts
ggplot(df_airline_long, aes(x=airline, weights=count, fill=status)) + 
  geom_bar() + ggtitle("Airline Ontime vs Delay")

Conclusion

The comparison of Alaska and AM West airlines is an ideal example of Simpson's Paradox. We can see from the table grouped by airlines and locations that Alaska beats AM West in every location comparison of on time percentage (e.g. Alaska's Los Angeles on time % vs AM West's Los Angeles on time %). However, we see that AM West has a higher overall on time percentage. This is mainly attributable to AM West having a high volume of flights to Phoenix with a relatively high on time percent as well as Alaska having a high volume of flights to Seattle with a relatively low on time percent.

....


Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.