library(tidyverse)
library(kableExtra)
library(hrbrthemes)
library(reshape2)

My Approach

My approach follows:

  • Created csv file per assignment requirments.
  • Used Readr to read the file into R
  • Utilized Tidyverse tools to tidy and transform data.
  • Presented data in table and graph formats.

Read Data into R

delay_data <- read_csv("https://raw.githubusercontent.com/MundyMSDS/DATA607/master/airline_delays.csv")

head(delay_data)
## # A tibble: 5 x 7
##   X1      X2      `Los Angeles` Phoenix `San Deigo` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 Alaska  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 <NA>    <NA>               NA      NA          NA              NA      NA
## 4 AM West on time           694    4840         383             320     201
## 5 <NA>    delayed           117     415          65             129      61

Tidy the Data

delay_data <- delay_data %>% 
  mutate(X1=if_else(is.na(X1),lag(X1),X1)) %>%
  drop_na() %>% 
  gather(`Los Angeles`, Phoenix, `San Deigo`, `San Francisco`, Seattle, key="City", value = "Count") %>% 
  spread("X2", "Count",  3:7) %>% 
  rename(Airline = X1, Delayed = delayed, `On Time` = `on time`) %>% 
  mutate(Total = Delayed + `On Time`) %>% 
  mutate(`On Time %` = `On Time` / Total) %>% 
  mutate(`Delay %` = Delayed / Total) %>%
  mutate(`Is Hub` = if_else(Airline=="Alaska" & City =="Seattle" |Airline=="AM West" & City =="Phoenix", "Yes", "No" )) %>% 
  arrange(Airline, Delayed)

Analysis

kable(delay_data, format = "markdown")
Airline City Delayed On Time Total On Time % Delay % Is Hub
Alaska Phoenix 12 221 233 0.9484979 0.0515021 No
Alaska San Deigo 20 212 232 0.9137931 0.0862069 No
Alaska Los Angeles 62 497 559 0.8890877 0.1109123 No
Alaska San Francisco 102 503 605 0.8314050 0.1685950 No
Alaska Seattle 305 1841 2146 0.8578751 0.1421249 Yes
AM West Seattle 61 201 262 0.7671756 0.2328244 No
AM West San Deigo 65 383 448 0.8549107 0.1450893 No
AM West Los Angeles 117 694 811 0.8557337 0.1442663 No
AM West San Francisco 129 320 449 0.7126949 0.2873051 No
AM West Phoenix 415 4840 5255 0.9210276 0.0789724 Yes
delay_data <- delay_data %>% 
  select(Airline, City, Delayed, `On Time`) %>%
  mutate(City = if_else(City == "Los Angeles", "LA", City)) %>% 
  mutate(City = if_else(City == "San Francisco", "SF", City)) %>% 
  mutate(City = if_else(City == "San Deigo", "SD", City)) %>%
  mutate(City = if_else(City == "Phoenix", "PHX", City)) %>%
  mutate(City = if_else(City == "Seattle", "STL", City)) %>%
  melt()

ggplot(delay_data, aes(x = City, y = value, fill = variable)) +
   geom_bar(stat = 'identity') +
   geom_text(size = 3, aes(label = value),  position = position_stack(vjust = 0.5)) +
   labs(y = "flights") +
   labs(x="", y="Count",
       title="Flight Delays",
       subtitle="Alaska Air vs. America West",
       caption="Brought to you by the Tidyverse") + 
   theme_ipsum(grid="Y") +
   facet_grid(~ Airline)

Findings

  • More flights mean more delays - Hubs (Seattle:Alaska, Phoenix:Am West) had highest delays for each airline.

  • Alaska Airlines has a better track record for delays versus AM West.

  • Alaska Air’s highest on time percentage is at American West’s hub (Phoenix)

  • Delays are positively correlated to precipitation.