Overview: The assignment for this week is related to Tidying and Transforming Data of arrival delay performance of 2 airlines.

Load all the required packages.

library(tidyverse)
library(RMariaDB)

Read data from MySQL

con <- dbConnect(RMariaDB::MariaDB(), user="test", password="test", dbname="adelay", host="localhost")
con
## <MariaDBConnection>
##   Host:    localhost
##   Server:  
##   Client:
dbListTables( con )
## [1] "ad"
#apply out SELECT statement to the database we established a connection to.
arrivalDelay <- dbGetQuery( con, 'SELECT * FROM ad;' )

#display the head of the resulting R dataframe
head( arrivalDelay )
##   airlines  status LosAngeles Phoenix SanDiego SanFrancisco Seattle
## 1   Alaska on time        497     221      212          503    1841
## 2          delayed         62      12       20          102     305
## 3  Am West on time        694    4840      383          320     201
## 4          delayed        117     415       65          129      61
#I didn't include the blank line in the database

#Fill out blank field in column airlines
arrivalDelay$airlines[2]<-"Alaska"
arrivalDelay$airlines[4]<-"Am West"

Transform data from one format to another format

ad <- arrivalDelay %>%
# Tidy up the table by gather to narrow all destinations to a column
  gather(key = "destination" , value= "numberOfFlight", LosAngeles:Seattle, convert=TRUE) %>%
# Spread out the "On time" and "Delayed" status
  spread(key = status, value = numberOfFlight, convert=TRUE) %>%
# Add several new columns
  mutate("totalFlight" = delayed + `on time`,
         "delayed%" = round(delayed/totalFlight*100, 2),
         "onTime%" = round(`on time`/totalFlight*100, 2))
ad
##    airlines  destination delayed on time totalFlight delayed% onTime%
## 1    Alaska   LosAngeles      62     497         559    11.09   88.91
## 2    Alaska      Phoenix      12     221         233     5.15   94.85
## 3    Alaska     SanDiego      20     212         232     8.62   91.38
## 4    Alaska SanFrancisco     102     503         605    16.86   83.14
## 5    Alaska      Seattle     305    1841        2146    14.21   85.79
## 6   Am West   LosAngeles     117     694         811    14.43   85.57
## 7   Am West      Phoenix     415    4840        5255     7.90   92.10
## 8   Am West     SanDiego      65     383         448    14.51   85.49
## 9   Am West SanFrancisco     129     320         449    28.73   71.27
## 10  Am West      Seattle      61     201         262    23.28   76.72

Compare arrival delayed flight percentage of 2 airlines at each destination

ggplot(data=ad) + 
  geom_col(mapping= aes(x = destination, y=`delayed%`, fill=airlines), position = "dodge")+
  labs(y="arrival delay rate")

Am West Airline has a higher percentage of arrival delayed flights than Alaska Airline at all 5 destinations.

Look further into the overall arrival delayed flight percentage between 2 airlines

adsum<-ad%>%
  group_by(airlines)%>%
  summarise(sum(delayed), sum(totalFlight))
adsum
## # A tibble: 2 x 3
##   airlines `sum(delayed)` `sum(totalFlight)`
##   <chr>             <int>              <int>
## 1 Alaska              501               3775
## 2 Am West             787               7225

Compare overall arrival delayed flight percentage of 2 airlines

ggplot(data=adsum) + 
  geom_col(mapping= aes(x = airlines, y=(`sum(delayed)`/`sum(totalFlight)`*100), fill=airlines))+
  labs(y="overall arrival delay rate")

Contracy to the perception from the first diagram, the overall flight arrival delayed percentage of Am West Airline is lower than Alaska Airline. Am West Airline who has almost twice as many flights as Alaska Airline is the better overall performer among the two.