HW 4

Load Libraries

library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('tidyr')
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 3.3.3

Load Airline CSV

# Read the wide-format airline table from the working directory.
# read.csv() already returns a data.frame and defaults to header = TRUE and
# sep = ",", so neither those arguments nor a data.frame() re-wrap are needed.
# stringsAsFactors = FALSE keeps the two label columns as plain character.
airlines <- read.csv("airlines.csv", stringsAsFactors = FALSE)
airlines
##         X     X.1 Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1  ALASKA on time         497     221       212          503    1841
## 2         delayed          62      12        20          102     305
## 3                          NA      NA        NA           NA      NA
## 4 AM WEST on time         694    4840       383          320     201
## 5         delayed         117     415        65          129      61

Modify Data Frame

# Name the two unlabeled leading columns in a single assignment.
names(airlines)[1:2] <- c("Airline", "Status")
# Rows that carry a Status but no Airline inherit the airline from the row
# above. Selecting them by content replaces the hard-coded row numbers 2 and 5,
# so the step keeps working if rows are added. A fully blank row has an empty
# Status and is left untouched. Row 1 is skipped: it has no row above it.
needs_name <- which(
  !nzchar(trimws(airlines$Airline)) & nzchar(trimws(airlines$Status))
)
needs_name <- needs_name[needs_name > 1]
# Fill sequentially so consecutive unnamed rows all receive the same name.
for (i in needs_name) {
  airlines$Airline[i] <- airlines$Airline[i - 1]
}
airlines
##   Airline  Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1  ALASKA on time         497     221       212          503    1841
## 2  ALASKA delayed          62      12        20          102     305
## 3                          NA      NA        NA           NA      NA
## 4 AM WEST on time         694    4840       383          320     201
## 5 AM WEST delayed         117     415        65          129      61
# Drop blank separator rows by content rather than by position: keep only rows
# with at least one non-missing flight count in the city columns (every column
# after the two leading label columns). Original row names are preserved.
airlines <- airlines[rowSums(!is.na(airlines[, -(1:2), drop = FALSE])) > 0, ]
airlines
##   Airline  Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1  ALASKA on time         497     221       212          503    1841
## 2  ALASKA delayed          62      12        20          102     305
## 4 AM WEST on time         694    4840       383          320     201
## 5 AM WEST delayed         117     415        65          129      61

Clean Data with Tidy

# Reshape to one row per airline/city: stack city columns 3:7 into
# Cities/Flights, then spread Status so "delayed" and "on time" become columns.
clean <- airlines %>%
  gather(Cities, Flights, 3:7) %>%
  spread(Status, Flights)

# Add the flight total and the delayed / on-time shares, rounded to 3 decimals.
# Columns are referenced bare so each expression is evaluated against the data
# flowing through the pipe instead of looking up the outer `clean` object.
clean <- clean %>%
  mutate(
    total = `on time` + delayed,
    delaypct = round(delayed / total, 3),
    ontimepct = round(`on time` / total, 3)
  )

clean
##    Airline       Cities delayed on time total delaypct ontimepct
## 1   ALASKA  Los.Angeles      62     497   559    0.111     0.889
## 2   ALASKA      Phoenix      12     221   233    0.052     0.948
## 3   ALASKA    San.Diego      20     212   232    0.086     0.914
## 4   ALASKA San.Franciso     102     503   605    0.169     0.831
## 5   ALASKA      Seattle     305    1841  2146    0.142     0.858
## 6  AM WEST  Los.Angeles     117     694   811    0.144     0.856
## 7  AM WEST      Phoenix     415    4840  5255    0.079     0.921
## 8  AM WEST    San.Diego      65     383   448    0.145     0.855
## 9  AM WEST San.Franciso     129     320   449    0.287     0.713
## 10 AM WEST      Seattle      61     201   262    0.233     0.767

Plot Ratios by City and Airline

# Delay share for each city, with one coloured point per airline.
ggplot(
  data = clean,
  mapping = aes(x = delaypct, y = Cities, color = Airline)
) +
  geom_point()

# On-time share for each city, with one coloured point per airline.
ggplot(
  data = clean,
  mapping = aes(x = ontimepct, y = Cities, color = Airline)
) +
  geom_point()

In each city AM West has a higher delay ratio than Alaska.

Summarize Data by Airline

# Collapse the per-city rows into one row per airline: sum the delayed,
# on-time and total flight counts, then recompute both shares (3 decimals)
# from those sums.
airlinegrp <- clean %>%
  group_by(Airline) %>%
  summarise(
    delayed = sum(delayed),
    ontime = sum(`on time`),
    total = sum(total)
  ) %>%
  mutate(
    delaypct = round(delayed / total, 3),
    ontimepct = round(ontime / total, 3)
  )

airlinegrp
## # A tibble: 2 × 6
##   Airline delayed ontime total delaypct ontimepct
##     <chr>   <int>  <int> <int>    <dbl>     <dbl>
## 1  ALASKA     501   3274  3775    0.133     0.867
## 2 AM WEST     787   6438  7225    0.109     0.891

Plot Ratios by Just Airline

# Overall delay share, one coloured point per airline.
ggplot(
  data = airlinegrp,
  mapping = aes(x = delaypct, y = Airline, color = Airline)
) +
  geom_point()

# Overall on-time share, one coloured point per airline.
ggplot(
  data = airlinegrp,
  mapping = aes(x = ontimepct, y = Airline, color = Airline)
) +
  geom_point()

Looking at the delay ratio by airline, Alaska has a higher ratio.

Conclusion

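When the delay ratio is compared city by city, Alaska does a better job of avoiding delays: its ratio is lower than AM West's in every one of the five cities. But when the flights are grouped by airline alone, AM West has the lower overall delay ratio (0.109 versus 0.133 for Alaska). This reversal happens because the two airlines' flights are distributed very differently across the cities. AM West dealt with more flights than Alaska (7225 for AM West and 3775 for Alaska), and 5255 of them were in Phoenix, where both airlines have their lowest delay ratio, while 2146 of Alaska's flights were in Seattle, where delays are more common. AM West's overall ratio therefore looks good mainly because of where most of its flights are, not because it handles delays better than Alaska in any individual city.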