Load the relevant libraries
#make sure the Wickham libraries are loaded
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
Read CSV File
#current directory
setwd("~/")
#read the csv file
flt_del <- read.csv("flight_delays.csv", header=T, stringsAsFactors = F)
#list the dataframe
flt_del
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
Tidy the dataframe
flt_del$X[2] = flt_del$X[1]
flt_del$X[4] = flt_del$X[3]
flt_tidy <- gather(flt_del,"City","Flights", 3:7)
flt_tidy
## X X.1 City Flights
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Group data and summarize results of groupings
#Group data by City(Airport) and Airline and get the total flights (on time and delayed) per group
flt_sum1 <- flt_tidy %>% group_by(City,X) %>% summarise(flt_sum1 = sum(Flights))
#Group data by City(Airport) and get the total flights (on time and delayed) per group (City)
flt_sum2 <- flt_tidy %>% group_by(City) %>% summarise(flt_sum2 = sum(Flights))
#Group data by Airline and get the total flights (on time and delayed) per group Airline
flt_sum7 <- flt_tidy %>% group_by(X) %>% summarise(flt_sum7 = sum(Flights))
Results show that about 75% of all the flights (in terms of the 5 airports listed) for American West are in Phoenix. For Alaska, most of its flights are in Seattle (about 55%) - which is not as proportionately large as American West.
#Combine the 2 summaries using a join
flt_com1 <- inner_join(flt_sum1, flt_sum7, by="X")
#Relative Percentage of flight between Alaska and American West per city
flt_com1 %>% mutate(per_cit = round(flt_sum1/flt_sum7,2)) %>% select(City, X, per_cit)
## Source: local data frame [10 x 3]
## Groups: City [5]
##
## City X per_cit
## (chr) (chr) (dbl)
## 1 Los.Angeles ALASKA 0.15
## 2 Los.Angeles AM WEST 0.11
## 3 Phoenix ALASKA 0.06
## 4 Phoenix AM WEST 0.73
## 5 San.Diego ALASKA 0.06
## 6 San.Diego AM WEST 0.06
## 7 San.Francisco ALASKA 0.16
## 8 San.Francisco AM WEST 0.06
## 9 Seattle ALASKA 0.57
## 10 Seattle AM WEST 0.04
#generate bar plot to show relative share for Alaska and American West per City
ggplot(flt_com1, aes(x = X, y=round((flt_sum1/flt_sum7),2), fill = City)) + geom_bar(stat="identity",position="dodge") + xlab("Airlines") + ylab("Percentage of Flights per City")
Find the market share (in terms of number of flights) of Alaska and American West Relative to each other in the Aiports on the dataframe list. While American West has a much stronger presence in southern cities such as Phoenix and San Diego, Alaska dominates American West in Seattle. It is a much closer competition for the 2 airlines in San Francisco and Los Angeles
#Combine the 2 summaries using a join
flt_com <- inner_join(flt_sum1, flt_sum2, by="City")
#Relative market share between Alaska and American West
flt_com %>% mutate(mkt_shr = round(flt_sum1/flt_sum2,2)) %>% select(City, X, mkt_shr)
## Source: local data frame [10 x 3]
## Groups: City [5]
##
## City X mkt_shr
## (chr) (chr) (dbl)
## 1 Los.Angeles ALASKA 0.41
## 2 Los.Angeles AM WEST 0.59
## 3 Phoenix ALASKA 0.04
## 4 Phoenix AM WEST 0.96
## 5 San.Diego ALASKA 0.34
## 6 San.Diego AM WEST 0.66
## 7 San.Francisco ALASKA 0.57
## 8 San.Francisco AM WEST 0.43
## 9 Seattle ALASKA 0.89
## 10 Seattle AM WEST 0.11
#generate bar plot to show relative share for Alaska and American West per City
ggplot(flt_com, aes(x = X, y=round((flt_sum1/flt_sum2),2), fill = City)) + geom_bar(stat="identity",position="dodge") + xlab("Airlines") + ylab("Market Shares (Relative to each other)")
Compare the delays/on time record in the five cities listed for the 2 airlines. Alaska Airlines has a better on time arrival records when compared to American West for all the 5 cities (airports)
flt_com_del <- inner_join(flt_tidy, flt_sum1, by=c("City","X"))
#determine the percentage of flights that are on time and delayed per City per Airline
flt_com_del %>% mutate(del_per = round(Flights/flt_sum1,2)) %>% select(City, X, X.1, del_per)
## City X X.1 del_per
## 1 Los.Angeles ALASKA on time 0.89
## 2 Los.Angeles ALASKA delayed 0.11
## 3 Los.Angeles AM WEST on time 0.86
## 4 Los.Angeles AM WEST delayed 0.14
## 5 Phoenix ALASKA on time 0.95
## 6 Phoenix ALASKA delayed 0.05
## 7 Phoenix AM WEST on time 0.92
## 8 Phoenix AM WEST delayed 0.08
## 9 San.Diego ALASKA on time 0.91
## 10 San.Diego ALASKA delayed 0.09
## 11 San.Diego AM WEST on time 0.85
## 12 San.Diego AM WEST delayed 0.15
## 13 San.Francisco ALASKA on time 0.83
## 14 San.Francisco ALASKA delayed 0.17
## 15 San.Francisco AM WEST on time 0.71
## 16 San.Francisco AM WEST delayed 0.29
## 17 Seattle ALASKA on time 0.86
## 18 Seattle ALASKA delayed 0.14
## 19 Seattle AM WEST on time 0.77
## 20 Seattle AM WEST delayed 0.23
#show percentage of delayed/on time flights in Los Angeles for the 2 airlines
flt_LA = filter(flt_com_del, City=="Los.Angeles")
ggplot(flt_LA, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) + geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(Los Angeles)")
#show percentage of delayed/on time flights in Phoenix for the 2 airlines
flt_Ph = filter(flt_com_del, City=="Phoenix")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) + geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(Phoenix)") #show percentage of delayed/on time flights in San Diego for the 2 airlines
flt_Ph = filter(flt_com_del, City=="San.Diego")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) + geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(San Diego)")
#show percentage of delayed/on time flights in San Francisco for the 2 airlines
flt_Ph = filter(flt_com_del, City=="San.Francisco")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) + geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(San Francisco)")
#show percentage of delayed/on time flights in Seattle for the 2 airlines
flt_Ph = filter(flt_com_del, City=="Seattle")
ggplot(flt_Ph, aes(x = X, y=round((Flights/flt_sum1),2), fill = X.1)) + geom_bar(stat="identity", position="dodge") + xlab("Airlines") + ylab("Percentage Delayed/On Time(Seattle)")
Although Alaska beat American West in all the 5 cities in the list, overall (for the all the flights arriving the 5 cities), American West has a better record. This can be attributed to the fact that the majority of American West’s flights are in its hub in Phoenix (75%) where it has a excellent on time record (so does Alaska but it has a lot less flights to Phoenix). This disproportionately large percentage more than made up for a less stellar record in the other 4 airports. This is the reason why American West has the overall better record inspite of losing to Alaska in all the 5 airports in terms of on time record
#group flights by airline and on-time/delay status and sum the grouping
flt_sum3 <- flt_tidy %>% group_by(X,X.1) %>% summarise(flt_sum3 = sum(Flights))
#from the summary group abouve, group this by airline and sum the grouping
flt_sum4 <- flt_sum3 %>% group_by(X) %>% summarise(flt_sum4 = sum(flt_sum3))
#join the the 2 summary groupings
flt_todel <- inner_join(flt_sum3, flt_sum4, by=("X"))
#determine the overall delay/on time record for the 2 airlines
mutate(flt_todel, del_all = round(flt_sum3/flt_sum4,2)) %>% select(X, X.1,del_all)
## Source: local data frame [4 x 3]
## Groups: X [2]
##
## X X.1 del_all
## (chr) (chr) (dbl)
## 1 ALASKA delayed 0.13
## 2 ALASKA on time 0.87
## 3 AM WEST delayed 0.11
## 4 AM WEST on time 0.89
#show the overall delay/on time record for the 2 airlines
ggplot(flt_todel, aes(x = X, y=round((flt_sum3/flt_sum4),2), fill = X.1)) + geom_bar(stat="identity") + xlab("Airlines") + ylab("Overall Percentage of Delayed Flights")