Load packages.
library(knitr)
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
Import CSV, column header = True.
x <- getURL("https://raw.githubusercontent.com/spsstudent15/2016-01-607-06/master/flights.csv")
flights<-data.frame(read.csv(text=x, header=T))
flights
## Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
#str(flights) #check structure
Reshape data with city columns as factors.
flights2 <- flights %>%
gather("City","NumFlights",3:7)
kable(flights2)
| ALASKA |
on time |
Los.Angeles |
497 |
| ALASKA |
delayed |
Los.Angeles |
62 |
| AM WEST |
on time |
Los.Angeles |
694 |
| AM WEST |
delayed |
Los.Angeles |
117 |
| ALASKA |
on time |
Phoenix |
221 |
| ALASKA |
delayed |
Phoenix |
12 |
| AM WEST |
on time |
Phoenix |
4840 |
| AM WEST |
delayed |
Phoenix |
415 |
| ALASKA |
on time |
San.Diego |
212 |
| ALASKA |
delayed |
San.Diego |
20 |
| AM WEST |
on time |
San.Diego |
383 |
| AM WEST |
delayed |
San.Diego |
65 |
| ALASKA |
on time |
San.Francisco |
503 |
| ALASKA |
delayed |
San.Francisco |
102 |
| AM WEST |
on time |
San.Francisco |
320 |
| AM WEST |
delayed |
San.Francisco |
129 |
| ALASKA |
on time |
Seattle |
1841 |
| ALASKA |
delayed |
Seattle |
305 |
| AM WEST |
on time |
Seattle |
201 |
| AM WEST |
delayed |
Seattle |
61 |
Reshape data with Arrival factors as columns.
flights3 <- flights2 %>%
spread(Arrival,NumFlights,fill=NA)
colnames(flights3)<-c("Airline","City","Delayed","OnTime")
kable(flights3)
| ALASKA |
Los.Angeles |
62 |
497 |
| ALASKA |
Phoenix |
12 |
221 |
| ALASKA |
San.Diego |
20 |
212 |
| ALASKA |
San.Francisco |
102 |
503 |
| ALASKA |
Seattle |
305 |
1841 |
| AM WEST |
Los.Angeles |
117 |
694 |
| AM WEST |
Phoenix |
415 |
4840 |
| AM WEST |
San.Diego |
65 |
383 |
| AM WEST |
San.Francisco |
129 |
320 |
| AM WEST |
Seattle |
61 |
201 |
Add calculated column for Total Flights.
flights4<-mutate(flights3, TotalFlights=(Delayed+OnTime))
kable(flights4)
| ALASKA |
Los.Angeles |
62 |
497 |
559 |
| ALASKA |
Phoenix |
12 |
221 |
233 |
| ALASKA |
San.Diego |
20 |
212 |
232 |
| ALASKA |
San.Francisco |
102 |
503 |
605 |
| ALASKA |
Seattle |
305 |
1841 |
2146 |
| AM WEST |
Los.Angeles |
117 |
694 |
811 |
| AM WEST |
Phoenix |
415 |
4840 |
5255 |
| AM WEST |
San.Diego |
65 |
383 |
448 |
| AM WEST |
San.Francisco |
129 |
320 |
449 |
| AM WEST |
Seattle |
61 |
201 |
262 |
Add calculated column for Percent On Time.
flights5<-mutate(flights4, PercentOnTime=round((OnTime/TotalFlights),2))
kable(flights5)
| ALASKA |
Los.Angeles |
62 |
497 |
559 |
0.89 |
| ALASKA |
Phoenix |
12 |
221 |
233 |
0.95 |
| ALASKA |
San.Diego |
20 |
212 |
232 |
0.91 |
| ALASKA |
San.Francisco |
102 |
503 |
605 |
0.83 |
| ALASKA |
Seattle |
305 |
1841 |
2146 |
0.86 |
| AM WEST |
Los.Angeles |
117 |
694 |
811 |
0.86 |
| AM WEST |
Phoenix |
415 |
4840 |
5255 |
0.92 |
| AM WEST |
San.Diego |
65 |
383 |
448 |
0.85 |
| AM WEST |
San.Francisco |
129 |
320 |
449 |
0.71 |
| AM WEST |
Seattle |
61 |
201 |
262 |
0.77 |
Show percent delay by airline.
sumairline<-aggregate(cbind(Delayed,OnTime,TotalFlights) ~ Airline, data = flights5, sum)
sumairline<-mutate(sumairline,PercentOnTime=round((OnTime/TotalFlights),3))
kable(sumairline)
| ALASKA |
501 |
3274 |
3775 |
0.867 |
| AM WEST |
787 |
6438 |
7225 |
0.891 |
Show percent delay by city, ordered by percent on time ascending.
sumcity<-aggregate(cbind(Delayed,OnTime,TotalFlights) ~ City, data = flights5, sum)
sumcity<-mutate(sumcity,PercentOnTime=round((OnTime/TotalFlights),3))
sumcity<-sumcity[ order(sumcity[,5]), ]
kable(sumcity)
| 4 |
San.Francisco |
231 |
823 |
1054 |
0.781 |
| 5 |
Seattle |
366 |
2042 |
2408 |
0.848 |
| 1 |
Los.Angeles |
179 |
1191 |
1370 |
0.869 |
| 3 |
San.Diego |
85 |
595 |
680 |
0.875 |
| 2 |
Phoenix |
427 |
5061 |
5488 |
0.922 |
Show bar plots.
ggplot(flights5, aes(x = Airline, y = PercentOnTime, fill = Airline)) + geom_bar(stat = "identity", color="black") + ggtitle("Percent On Time Airline Arrivals by Airline and City") + facet_wrap(~ City, nrow = 2)+ ylim(0, 1) + theme(panel.grid.minor.x=element_blank(), panel.grid.major.x=element_blank()) +scale_fill_manual(values=c("#56B4E9", "#0000CC"))

ggplot(sumcity, aes(x = City, y = PercentOnTime)) + geom_bar(stat = "identity",fill="#9999CC",color="black") + ggtitle("Percent On Time Airline Arrivals by City")+ theme(panel.grid.minor.x=element_blank(), panel.grid.major.x=element_blank()) + coord_cartesian(ylim = c(.5, 1))

ggplot(sumairline, aes(x = Airline, y = TotalFlights)) + geom_bar(stat = "identity",fill="#ADFF2F",color="black") + ggtitle("Total Flights by Airline")+ theme(panel.grid.minor.x=element_blank(), panel.grid.major.x=element_blank())

Conclusion
Alaska Airlines has better on-time arrival percentages for these five cities.
A city comparison shows that San Francisco has the most delays of these cities and Phoenix has the least delays.
The gap between the two airlines is most significant for San Francisco and Seattle. We would need more information on what factors cause a flight to be delayed.
For example, San Francisco and Seattle might have the worst weather of all of these cities, and Alaska airlines may have special equipment or training which helps with poor weather. Alaska also has a smaller total number of flights to these cities, which may contribute to better on-time percentages.