Load all libraries

library(tidyr)
library(dplyr)
library(plyr)
library(ggplot2)

Read data from .csv to a dataframe

untidyFlight <- read.table("https://raw.githubusercontent.com/ezaccountz/Week_5_Assignment/master/flight_arrivals.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
untidyFlight
##         X     X.1 Los.Angeles Phonenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497      221       212           503    1841
## 2         delayed          62       12        20           102     305
## 3                          NA       NA        NA            NA      NA
## 4 AM WEST on time         694     4840       383           320     201
## 5         delayed         117      415        65           129      61

First, rename the columns, remove the empty row(s) and fill in the airline name for the records of delayed flights

tidyFlight <- dplyr::rename(untidyFlight, "AirLine" = colnames(untidyFlight)[1], "status" = colnames(untidyFlight)[2])
tidyFlight <- tidyFlight %>%
  filter(status != "") 
tidyFlight$AirLine[seq(2,length(tidyFlight$status),2)] <- tidyFlight$AirLine[seq(1,length(tidyFlight$status),2)]
tidyFlight
##   AirLine  status Los.Angeles Phonenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497      221       212           503    1841
## 2  ALASKA delayed          62       12        20           102     305
## 3 AM WEST on time         694     4840       383           320     201
## 4 AM WEST delayed         117      415        65           129      61

Use the gather function to transform the table to be tidy

tidyFlight <- tidyFlight %>% 
  gather(destination, num_of_flights, 3:7)
tidyFlight
##    AirLine  status   destination num_of_flights
## 1   ALASKA on time   Los.Angeles            497
## 2   ALASKA delayed   Los.Angeles             62
## 3  AM WEST on time   Los.Angeles            694
## 4  AM WEST delayed   Los.Angeles            117
## 5   ALASKA on time      Phonenix            221
## 6   ALASKA delayed      Phonenix             12
## 7  AM WEST on time      Phonenix           4840
## 8  AM WEST delayed      Phonenix            415
## 9   ALASKA on time     San.Diego            212
## 10  ALASKA delayed     San.Diego             20
## 11 AM WEST on time     San.Diego            383
## 12 AM WEST delayed     San.Diego             65
## 13  ALASKA on time San.Francisco            503
## 14  ALASKA delayed San.Francisco            102
## 15 AM WEST on time San.Francisco            320
## 16 AM WEST delayed San.Francisco            129
## 17  ALASKA on time       Seattle           1841
## 18  ALASKA delayed       Seattle            305
## 19 AM WEST on time       Seattle            201
## 20 AM WEST delayed       Seattle             61

The following shows the overall delay rate of the two airlines

ddply(tidyFlight, "AirLine", transform, perct = num_of_flights / sum(num_of_flights)) %>% 
  filter(status == "delayed") %>%
  group_by(AirLine) %>% 
  dplyr::summarise(total_perct = sum(perct))
## # A tibble: 2 x 2
##   AirLine total_perct
##   <chr>         <dbl>
## 1 ALASKA        0.133
## 2 AM WEST       0.109

The difference between the two airlines is not significantly large.

The following shows the overall delay rate of the five destinations

ddply(tidyFlight, "destination", transform, perct = num_of_flights / sum(num_of_flights)) %>% 
  filter(status == "delayed") %>%
  group_by(destination) %>% 
  dplyr::summarise(total_perct = sum(perct))
## # A tibble: 5 x 2
##   destination   total_perct
##   <chr>               <dbl>
## 1 Los.Angeles        0.131 
## 2 Phonenix           0.0778
## 3 San.Diego          0.125 
## 4 San.Francisco      0.219 
## 5 Seattle            0.152

We can see that the range of the rates is quite notable.

Now we perform further analysis. The following graph shows the total number of flights by airline and destination, and the total number of delayed flights by airline and destination.

tidyFlight2 <- ddply(tidyFlight, c("AirLine", "destination"), transform, total_flights =  sum(num_of_flights)) %>% 
  filter(status == "delayed")

ggplot(data = tidyFlight2, aes(x = paste(destination,"\n",AirLine))) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Air Line and Destination") +
  ylab("Total Number of Flights") +
  geom_bar(aes(y=total_flights),stat = "identity",col = "cyan3", fill = "cyan3") +
  geom_bar(aes(y=num_of_flights),stat = "identity",col = "coral1", fill = "coral1") +
  geom_rect(mapping=aes(xmin=9.5, xmax=10, ymin=5000, ymax=5250),col = "cyan3", fill = "cyan3") +
  geom_rect(mapping=aes(xmin=9.5, xmax=10, ymin=4500, ymax=4750),col = "coral1", fill = "coral1") +
  geom_text(mapping=aes(x=9.75, y=5250, label="Total Num of Flight"), size=3, vjust=3, hjust=0.5) +
  geom_text(mapping=aes(x=9.75, y=4750, label="Total Num of Delayed"), size=3, vjust=3, hjust=0.5)

To make it easier to compare, we compute the percentage of delays by airline and destination.

tidyFlight2 <- mutate(tidyFlight2, delay_perct = num_of_flights / total_flights)
tidyFlight2
##    AirLine  status   destination num_of_flights total_flights delay_perct
## 1   ALASKA delayed   Los.Angeles             62           559  0.11091234
## 2   ALASKA delayed      Phonenix             12           233  0.05150215
## 3   ALASKA delayed     San.Diego             20           232  0.08620690
## 4   ALASKA delayed San.Francisco            102           605  0.16859504
## 5   ALASKA delayed       Seattle            305          2146  0.14212488
## 6  AM WEST delayed   Los.Angeles            117           811  0.14426634
## 7  AM WEST delayed      Phonenix            415          5255  0.07897241
## 8  AM WEST delayed     San.Diego             65           448  0.14508929
## 9  AM WEST delayed San.Francisco            129           449  0.28730512
## 10 AM WEST delayed       Seattle             61           262  0.23282443

Finally letโ€™s compare some descriptive statistics of the two airlines

print("ALASKA")
## [1] "ALASKA"
tidyFlight2 %>% 
  filter(AirLine == "ALASKA") %>% 
  select(delay_perct) %>% 
  summary()
##   delay_perct     
##  Min.   :0.05150  
##  1st Qu.:0.08621  
##  Median :0.11091  
##  Mean   :0.11187  
##  3rd Qu.:0.14212  
##  Max.   :0.16860
print("AM WEST")
## [1] "AM WEST"
tidyFlight2 %>% 
  filter(AirLine == "AM WEST") %>% 
  select(delay_perct) %>% 
  summary()
##   delay_perct     
##  Min.   :0.07897  
##  1st Qu.:0.14427  
##  Median :0.14509  
##  Mean   :0.17769  
##  3rd Qu.:0.23282  
##  Max.   :0.28731

It is very clear now that all the statistics of AM WEST are higher than ALASKA. We can conclude that ALASKA is doing better job in making their flights on time.