library(tidyverse)
library(kableExtra)
library(hrbrthemes)
library(reshape2)My approach follows:
delay_data <- read_csv("https://raw.githubusercontent.com/MundyMSDS/DATA607/master/airline_delays.csv")
head(delay_data)## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Deigo` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM West on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
delay_data <- delay_data %>%
mutate(X1=if_else(is.na(X1),lag(X1),X1)) %>%
drop_na() %>%
gather(`Los Angeles`, Phoenix, `San Deigo`, `San Francisco`, Seattle, key="City", value = "Count") %>%
spread("X2", "Count", 3:7) %>%
rename(Airline = X1, Delayed = delayed, `On Time` = `on time`) %>%
mutate(Total = Delayed + `On Time`) %>%
mutate(`On Time %` = `On Time` / Total) %>%
mutate(`Delay %` = Delayed / Total) %>%
mutate(`Is Hub` = if_else(Airline=="Alaska" & City =="Seattle" |Airline=="AM West" & City =="Phoenix", "Yes", "No" )) %>%
arrange(Airline, Delayed)kable(delay_data, format = "markdown")| Airline | City | Delayed | On Time | Total | On Time % | Delay % | Is Hub |
|---|---|---|---|---|---|---|---|
| Alaska | Phoenix | 12 | 221 | 233 | 0.9484979 | 0.0515021 | No |
| Alaska | San Deigo | 20 | 212 | 232 | 0.9137931 | 0.0862069 | No |
| Alaska | Los Angeles | 62 | 497 | 559 | 0.8890877 | 0.1109123 | No |
| Alaska | San Francisco | 102 | 503 | 605 | 0.8314050 | 0.1685950 | No |
| Alaska | Seattle | 305 | 1841 | 2146 | 0.8578751 | 0.1421249 | Yes |
| AM West | Seattle | 61 | 201 | 262 | 0.7671756 | 0.2328244 | No |
| AM West | San Deigo | 65 | 383 | 448 | 0.8549107 | 0.1450893 | No |
| AM West | Los Angeles | 117 | 694 | 811 | 0.8557337 | 0.1442663 | No |
| AM West | San Francisco | 129 | 320 | 449 | 0.7126949 | 0.2873051 | No |
| AM West | Phoenix | 415 | 4840 | 5255 | 0.9210276 | 0.0789724 | Yes |
delay_data <- delay_data %>%
select(Airline, City, Delayed, `On Time`) %>%
mutate(City = if_else(City == "Los Angeles", "LA", City)) %>%
mutate(City = if_else(City == "San Francisco", "SF", City)) %>%
mutate(City = if_else(City == "San Deigo", "SD", City)) %>%
mutate(City = if_else(City == "Phoenix", "PHX", City)) %>%
mutate(City = if_else(City == "Seattle", "STL", City)) %>%
melt()
ggplot(delay_data, aes(x = City, y = value, fill = variable)) +
geom_bar(stat = 'identity') +
geom_text(size = 3, aes(label = value), position = position_stack(vjust = 0.5)) +
labs(y = "flights") +
labs(x="", y="Count",
title="Flight Delays",
subtitle="Alaska Air vs. America West",
caption="Brought to you by the Tidyverse") +
theme_ipsum(grid="Y") +
facet_grid(~ Airline)More flights mean more delays - Hubs (Seattle:Alaska, Phoenix:Am West) had highest delays for each airline.
Alaska Airlines has a better track record for delays versus AM West.
Alaska Air’s highest on time percentage is at American West’s hub (Phoenix)
Delays are positively correlated to precipitation.