Load the original data file from GitHub.
## Read the flight counts directly from the raw CSV on GitHub. Empty cells,
## single-space cells and the literal string "NA" are all read as missing.
flights <- read.csv(
  file = "https://raw.githubusercontent.com/SieSiongWong/DATA-607/master/Flight.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("", " ", "NA")
)
Review the dataset.
## Show the data frame exactly as it was read in.
print(flights)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
## Inspect the column types. The output below lists the two unnamed text
## columns (X, X.1) as factors.
## NOTE(review): that matches the pre-4.0 read.csv default
## (stringsAsFactors = TRUE); on R >= 4.0 they would be shown as chr.
str(flights)
## 'data.frame': 5 obs. of 7 variables:
## $ X : Factor w/ 2 levels "ALASKA","AM WEST": 1 NA NA 2 NA
## $ X.1 : Factor w/ 2 levels "delayed","on time": 2 1 NA 2 1
## $ Los.Angeles : int 497 62 NA 694 117
## $ Phoenix : int 221 12 NA 4840 415
## $ San.Diego : int 212 20 NA 383 65
## $ San.Francisco: int 503 102 NA 320 129
## $ Seattle : int 1841 305 NA 201 61
Clean the data.
## Give the two unnamed leading columns (read in as "X" and "X.1") meaningful
## names. Base R is used so the result does not depend on which package's
## rename() happens to be in scope (their argument conventions differ).
names(flights)[names(flights) == "X"] <- "Airline"
names(flights)[names(flights) == "X.1"] <- "Status"
## Drop the blank separator row (row 3 in this file). The row is removed
## because every cell in it is NA, not because of its position, so the step
## still works if rows are added to or removed from the file.
flights2 <- flights[rowSums(!is.na(flights)) > 0, ]
## Carry each airline name down to the "delayed" row beneath it.
## na.rm = FALSE keeps the column the same length even if it starts with NA
## (the default would drop leading NAs and make mutate() fail).
flights2 <- flights2 %>% mutate(Airline = na.locf(Airline, na.rm = FALSE))
## Renumber the rows 1..n after the deletion (seq_len is safe for 0 rows).
rownames(flights2) <- seq_len(nrow(flights2))
Reshape the clean data.
## The city columns are values of a variable (the destination), not variables
## themselves. Stack them into Destination/value pairs, then spread Status so
## that each airline-destination pair has one row holding both counts.
## The city columns are selected as "everything except Airline and Status"
## rather than by position, so extra or reordered columns do not break this.
flights3 <- flights2 %>%
  gather(Destination, value, -Airline, -Status) %>%
  spread(Status, value)
## Rename the delayed and on time columns. Base R is used so the result does
## not depend on which package's rename() happens to be in scope.
names(flights3)[names(flights3) == "delayed"] <- "Delayed"
names(flights3)[names(flights3) == "on time"] <- "OnTime"
## Add the total number of flights per airline and destination, and the
## proportion of those flights that were delayed.
flights4 <- mutate(
  flights3,
  Total = Delayed + OnTime,
  Proportion_Delayed = Delayed / Total
)
Arrival delays analysis for the two airlines.
## Figure 1: Proportion of delayed arrivals per destination, with the two
## airlines drawn side by side.
ggplot(flights4, aes(x = Destination, y = Proportion_Delayed, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Destination", y = "Proportion Delayed") +
  scale_fill_brewer(palette = "Set2")

## Table 1: Airline Performance by Conditional Distribution.
## Keep only the airline and the two count columns, selected by name.
flights5 <- flights4[, c("Airline", "Delayed", "OnTime")]
## Total delayed and on-time flights per airline. dplyr::summarise is called
## explicitly: summarize_each()/funs() are deprecated, and a summarise() from
## another attached package could ignore the grouping.
flights6 <- flights5 %>%
  group_by(Airline) %>%
  dplyr::summarise(Delayed = sum(Delayed), OnTime = sum(OnTime))
## The row labels below are hard-coded, so fail loudly if the grouped rows
## are not in the order the labels assume.
stopifnot(identical(as.character(flights6$Airline), c("ALASKA", "AM WEST")))
PerformanceTable <- matrix(
  c(flights6$Delayed, flights6$OnTime),
  ncol = 2, nrow = 2, byrow = FALSE
)
dimnames(PerformanceTable) <- list(c("Alaska", "Am WEST"), c("Delayed", "OnTime"))
## margin = 2 conditions on status: each column sums to 100, so a cell is an
## airline's share of all delayed (or all on-time) flights. These shares
## depend on how many flights each airline operates; they are not delay rates.
round(prop.table(PerformanceTable, 2) * 100, digits = 2)
## Delayed OnTime
## Alaska 38.9 33.71
## Am WEST 61.1 66.29
## Table 1b: Delay rate within each airline. margin = 1 conditions on the
## airline: each row sums to 100, so a cell is the percentage of that
## airline's own flights that were delayed (or on time).
round(prop.table(PerformanceTable, 1) * 100, digits = 2)
## Delayed OnTime
## Alaska 13.27 86.73
## Am WEST 10.89 89.11
## Figure 2: Barplot for Airline Performance by Conditional Distribution.
## Plots the column percentages of PerformanceTable (each airline's share of
## the delayed and of the on-time flights) as side-by-side bars.
## beside = TRUE is spelled out because T is an ordinary variable that can be
## reassigned.
barplot(
  prop.table(PerformanceTable, 2) * 100,
  beside = TRUE,
  ylab = "%",
  ylim = c(0, 100),
  main = "Flight Performance",
  col = c("deepskyblue", "darkorchid1")
)
legend(
  "center",
  legend = c("ALASKA", "AM WEST"),
  fill = c("deepskyblue", "darkorchid1"),
  cex = 0.85
)

Conclusions:
Figure 1 shows that AM West has a higher proportion of delayed arrivals than Alaska at every one of the five destination airports. For both airlines, the arrival delay rate is highest at San Francisco and lowest at Phoenix.
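Table 1 and Figure 2 show each airline's share of all delayed flights and of all on-time flights: AM West accounts for 61.1% of the delayed flights and 66.29% of the on-time flights, compared with 38.9% and 33.71% for Alaska. These shares mainly reflect that AM West operated almost twice as many flights (7,225 versus 3,775), so on their own they do not show which airline is more likely to be delayed. Using the counts in the dataset, the overall delay rate is 787/7,225 ≈ 10.9% for AM West and 501/3,775 ≈ 13.3% for Alaska, so in aggregate AM West is delayed slightly less often, even though Alaska has the lower delay rate at every individual airport (Figure 1). This reversal is an example of Simpson's paradox: most AM West flights go to Phoenix, where delays are rare, while most Alaska flights go to Seattle, where delays are more common.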