Load the relevant libraries

#make sure the Wickham libraries are loaded
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3

Read CSV File

#current directory
setwd("~/")
#read the csv file
flt_del <- read.csv("flight_delays.csv", header=T, stringsAsFactors = F)
#list the dataframe
flt_del
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4         delayed         117     415        65           129      61

Tidy the dataframe

flt_del$X[2] = flt_del$X[1]
flt_del$X[4] = flt_del$X[3]

flt_tidy <- gather(flt_del,"City","Flights", 3:7)

flt_tidy
##          X     X.1          City Flights
## 1   ALASKA on time   Los.Angeles     497
## 2   ALASKA delayed   Los.Angeles      62
## 3  AM WEST on time   Los.Angeles     694
## 4  AM WEST delayed   Los.Angeles     117
## 5   ALASKA on time       Phoenix     221
## 6   ALASKA delayed       Phoenix      12
## 7  AM WEST on time       Phoenix    4840
## 8  AM WEST delayed       Phoenix     415
## 9   ALASKA on time     San.Diego     212
## 10  ALASKA delayed     San.Diego      20
## 11 AM WEST on time     San.Diego     383
## 12 AM WEST delayed     San.Diego      65
## 13  ALASKA on time San.Francisco     503
## 14  ALASKA delayed San.Francisco     102
## 15 AM WEST on time San.Francisco     320
## 16 AM WEST delayed San.Francisco     129
## 17  ALASKA on time       Seattle    1841
## 18  ALASKA delayed       Seattle     305
## 19 AM WEST on time       Seattle     201
## 20 AM WEST delayed       Seattle      61

Group data and summarize results of groupings

#Group data by City(Airport) and Airline and get the total flights (on time and delayed) per group
flt_sum1 <- flt_tidy %>% group_by(City,X) %>% summarise(flt_sum1 = sum(Flights))

#Group data by City(Airport) and get the total flights (on time and delayed) per group (City)
flt_sum2 <- flt_tidy %>% group_by(City) %>% summarise(flt_sum2 = sum(Flights))

#Group data by Airline and get the total flights (on time and delayed) per group Airline
flt_sum7 <- flt_tidy %>% group_by(X) %>% summarise(flt_sum7 = sum(Flights))

Results show that about 75% of all the flights (in terms of the 5 airports listed) for American West are in Phoenix. For Alaska, most of its flights are in Seattle (about 55%) - which is not as proportionately large as American West.

#Combine the 2 summaries using a join
flt_com1 <- inner_join(flt_sum1, flt_sum7, by="X")

#Relative Percentage of flight between Alaska and American West per city
flt_com1 %>% mutate(per_cit = round(flt_sum1/flt_sum7,2)) %>% select(City, X, per_cit)
## Source: local data frame [10 x 3]
## Groups: City [5]
## 
##             City       X per_cit
##            (chr)   (chr)   (dbl)
## 1    Los.Angeles  ALASKA    0.15
## 2    Los.Angeles AM WEST    0.11
## 3        Phoenix  ALASKA    0.06
## 4        Phoenix AM WEST    0.73
## 5      San.Diego  ALASKA    0.06
## 6      San.Diego AM WEST    0.06
## 7  San.Francisco  ALASKA    0.16
## 8  San.Francisco AM WEST    0.06
## 9        Seattle  ALASKA    0.57
## 10       Seattle AM WEST    0.04
#generate bar plot to show relative share for Alaska and American West per City
ggplot(flt_com1, aes(x = X, y=round((flt_sum1/flt_sum7),2), fill = City)) +  geom_bar(stat="identity",position="dodge") + xlab("Airlines") + ylab("Percentage of Flights per City") 

Find the market share (in terms of number of flights) of Alaska and American West Relative to each other in the Aiports on the dataframe list. While American West has a much stronger presence in southern cities such as Phoenix and San Diego, Alaska dominates American West in Seattle. It is a much closer competition for the 2 airlines in San Francisco and Los Angeles

#Combine the 2 summaries using a join
flt_com <- inner_join(flt_sum1, flt_sum2, by="City")

#Relative market share between Alaska and American West
flt_com %>% mutate(mkt_shr = round(flt_sum1/flt_sum2,2)) %>% select(City, X, mkt_shr)
## Source: local data frame [10 x 3]
## Groups: City [5]
## 
##             City       X mkt_shr
##            (chr)   (chr)   (dbl)
## 1    Los.Angeles  ALASKA    0.41
## 2    Los.Angeles AM WEST    0.59
## 3        Phoenix  ALASKA    0.04
## 4        Phoenix AM WEST    0.96
## 5      San.Diego  ALASKA    0.34
## 6      San.Diego AM WEST    0.66
## 7  San.Francisco  ALASKA    0.57
## 8  San.Francisco AM WEST    0.43
## 9        Seattle  ALASKA    0.89
## 10       Seattle AM WEST    0.11
#generate bar plot to show relative share for Alaska and American West per City
ggplot(flt_com, aes(x = X, y=round((flt_sum1/flt_sum2),2), fill = City)) +  geom_bar(stat="identity",position="dodge") + xlab("Airlines") + ylab("Market Shares (Relative to each other)") 

Compare the delays/on time record in the five cities listed for the 2 airlines. Alaska Airlines has a better on time arrival records when compared to American West for all the 5 cities (airports)

flt_com_del <- inner_join(flt_tidy, flt_sum1, by=c("City","X"))

#determine the percentage of flights that are on time and delayed per City per Airline
flt_com_del %>% mutate(del_per = round(Flights/flt_sum1,2)) %>% select(City, X, X.1, del_per)
##             City       X     X.1 del_per
## 1    Los.Angeles  ALASKA on time    0.89
## 2    Los.Angeles  ALASKA delayed    0.11
## 3    Los.Angeles AM WEST on time    0.86
## 4    Los.Angeles AM WEST delayed    0.14
## 5        Phoenix  ALASKA on time    0.95
## 6        Phoenix  ALASKA delayed    0.05
## 7        Phoenix AM WEST on time    0.92
## 8        Phoenix AM WEST delayed    0.08
## 9      San.Diego  ALASKA on time    0.91
## 10     San.Diego  ALASKA delayed    0.09
## 11     San.Diego AM WEST on time    0.85
## 12     San.Diego AM WEST delayed    0.15
## 13 San.Francisco  ALASKA on time    0.83
## 14 San.Francisco  ALASKA delayed    0.17
## 15 San.Francisco AM WEST on time    0.71
## 16 San.Francisco AM WEST delayed    0.29
## 17       Seattle  ALASKA on time    0.86
## 18       Seattle  ALASKA delayed    0.14
## 19       Seattle AM WEST on time    0.77
## 20       Seattle AM WEST delayed    0.23
#show percentage of delayed/on time flights in Los Angeles for the 2 airlines
flt_LA = filter(flt_com_del, City=="Los.Angeles")
ggplot(flt_LA, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) +  geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(Los Angeles)")

#show percentage of delayed/on time flights in Phoenix for the 2 airlines
flt_Ph = filter(flt_com_del, City=="Phoenix")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) +  geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(Phoenix)")                                                                                                                                            #show percentage of delayed/on time flights in San Diego for the 2 airlines

flt_Ph = filter(flt_com_del, City=="San.Diego")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) +  geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(San Diego)")

#show percentage of delayed/on time flights in San Francisco for the 2 airlines
flt_Ph = filter(flt_com_del, City=="San.Francisco")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) +  geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(San Francisco)")

#show percentage of delayed/on time flights in Seattle for the 2 airlines
flt_Ph = filter(flt_com_del, City=="Seattle")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) +  geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(Seattle)")

Although Alaska beat American West in all the 5 cities in the list, overall (for the all the flights arriving the 5 cities), American West has a better record. This can be attributed to the fact that the majority of American West’s flights are in its hub in Phoenix (75%) where it has a excellent on time record (so does Alaska but it has a lot less flights to Phoenix). This disproportionately large percentage more than made up for a less stellar record in the other 4 airports. This is the reason why American West has the overall better record inspite of losing to Alaska in all the 5 airports in terms of on time record

#group flights by airline and on-time/delay status and sum the grouping
flt_sum3 <- flt_tidy %>% group_by(X,X.1) %>% summarise(flt_sum3 = sum(Flights))

#from the summary group abouve, group this by airline and sum the grouping
flt_sum4 <- flt_sum3 %>% group_by(X) %>% summarise(flt_sum4 = sum(flt_sum3))

#join the the 2 summary groupings
flt_todel <- inner_join(flt_sum3, flt_sum4, by=("X"))

#determine the overall delay/on time record for the 2 airlines
mutate(flt_todel, del_all = round(flt_sum3/flt_sum4,2)) %>% select(X, X.1,del_all)
## Source: local data frame [4 x 3]
## Groups: X [2]
## 
##         X     X.1 del_all
##     (chr)   (chr)   (dbl)
## 1  ALASKA delayed    0.13
## 2  ALASKA on time    0.87
## 3 AM WEST delayed    0.11
## 4 AM WEST on time    0.89
#show the overall delay/on time record for the 2 airlines
ggplot(flt_todel, aes(x = X, y=round((flt_sum3/flt_sum4),2), fill = X.1)) +  geom_bar(stat="identity") + xlab("Airlines") + ylab("Overall Percentage of Delayed Flights")