library('dplyr')##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('tidyr')
library('ggplot2')## Warning: package 'ggplot2' was built under R version 3.3.3
# Read the raw flight counts from the working directory. Text columns are
# kept as character (not factor) so individual cells can be overwritten later.
airlines <- data.frame(
  read.csv(
    file = "airlines.csv",
    header = TRUE,
    sep = ",",
    stringsAsFactors = FALSE
  )
)

# Show the table as loaded; the echoed output follows.
airlines
## X X.1 Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Give the two unnamed leading columns meaningful names.
names(airlines)[1:2] <- c("Airline", "Status")

# The airline name appears only on the "on time" row of each pair in the
# echoed table; copy it onto the matching "delayed" rows (rows 2 and 5).
airlines$Airline[c(2, 5)] <- c("ALASKA", "AM WEST")

airlines
## Airline Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
# Remove the third row (the all-NA separator row in the echoed table above),
# keeping the original row names of the remaining rows.
keep_row <- seq_len(nrow(airlines)) != 3
airlines <- airlines[keep_row, ]
rm(keep_row)

airlines
## Airline Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
# Reshape to one row per airline/city with `delayed` and `on time` counts
# side by side, then derive the total and the delayed / on-time shares.
#
# The derived columns use the piped data's own columns (data masking) rather
# than `clean$...`: reaching back into the global object only works while the
# piped rows happen to line up with it, and breaks silently as soon as the
# pipeline filters, reorders, or groups.
clean <- airlines %>%
  # Every column except the two identifiers is a city; stack them.
  gather(Cities, Flights, -Airline, -Status) %>%
  # One column per status value ("delayed", "on time").
  spread(Status, Flights) %>%
  mutate(
    total = `on time` + delayed,
    delaypct = round(delayed / total, 3),
    ontimepct = round(`on time` / total, 3)
  )

clean
## Airline Cities delayed on time total delaypct ontimepct
## 1 ALASKA Los.Angeles 62 497 559 0.111 0.889
## 2 ALASKA Phoenix 12 221 233 0.052 0.948
## 3 ALASKA San.Diego 20 212 232 0.086 0.914
## 4 ALASKA San.Franciso 102 503 605 0.169 0.831
## 5 ALASKA Seattle 305 1841 2146 0.142 0.858
## 6 AM WEST Los.Angeles 117 694 811 0.144 0.856
## 7 AM WEST Phoenix 415 4840 5255 0.079 0.921
## 8 AM WEST San.Diego 65 383 448 0.145 0.855
## 9 AM WEST San.Franciso 129 320 449 0.287 0.713
## 10 AM WEST Seattle 61 201 262 0.233 0.767
# Delay share by city, one point per airline.
ggplot(clean, aes(delaypct, Cities)) +
  geom_point(aes(color = Airline))

# On-time share by city, one point per airline.
ggplot(clean, aes(ontimepct, Cities)) +
  geom_point(aes(color = Airline))

# In each city AM West has a higher delay ratio than Alaska.
# Collapse the per-city rows to one row per airline by summing the counts,
# then recompute the delayed / on-time shares from those sums.
airlinegrp <- clean %>%
  group_by(Airline) %>%
  summarise(
    delayed = sum(delayed),
    ontime = sum(`on time`),
    total = sum(total)
  ) %>%
  mutate(
    delaypct = round(delayed / total, 3),
    ontimepct = round(ontime / total, 3)
  )

airlinegrp
## # A tibble: 2 × 6
## Airline delayed ontime total delaypct ontimepct
## <chr> <int> <int> <int> <dbl> <dbl>
## 1 ALASKA 501 3274 3775 0.133 0.867
## 2 AM WEST 787 6438 7225 0.109 0.891
# Overall delay share, one point per airline.
ggplot(airlinegrp, aes(delaypct, Airline)) +
  geom_point(aes(color = Airline))

# Overall on-time share, one point per airline.
ggplot(airlinegrp, aes(ontimepct, Airline)) +
  geom_point(aes(color = Airline))

# Looking at the delay ratio by airline, Alaska has a higher ratio.
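# When the delay ratio is broken down by city, Alaska appears to do a better job of avoiding delays: its ratio is lower than AM West's in every city. But when the flights are grouped by airline alone, AM West has the lower overall delay ratio. This reversal happens because AM West handled more flights than Alaska (7225 for AM West versus 3775 for Alaska) and most of them (5255) were in Phoenix, where delays are rare, whereas most of Alaska's flights (2146) were in Seattle, where delays are more common. AM West did a good job with delays relative to the number of flights it handled.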