library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
airlines <- read.csv("assignment5.csv")
airlines
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

We see that there are only two airlines, Alaska and AM West, but the data is not quite tidy. For every airport, it contains two observations rather than one due to the on time and delayed column. If we’d like it to be in a wide format, we should look into doubling the number of airport observations, so there are Los.Angeles.On.Time and Los.Angeles.Delayed for example.

Meantime, we can start by simply removing the extra blank row separating the two airline observations.

airlines <- airlines[-3,]

Using the dplyr package we could have done the same in a more complicated fashion by doing airlines <- as.data.frame(slice(airlines, c(1, 2, 4, 5)))

Now let’s try to move the second set of observations into their own columns.

delays <- filter(airlines, X.1 == "delayed")

Once we have the information, we can go ahead and delete the rows from the main data frame.

airlines <- airlines[c(1, 3), ]

Now we extract the relevant information for each airport. Then we can add the information back into the data frame, airlines, but in the appropriate columns.

alaska.delay <- as.data.frame(slice(delays, 1))[, 3:7]
amwest.delay <- as.data.frame(slice(delays, 2))[, 3:7]

new.col.names <- c("Los.Angeles.Delayed", "Phoenix.Delayed", "San.Diege.Delayed", "San.Francisco.Delayed", "Seattle.Delayed")

airlines[new.col.names] <- alaska.delay
airlines
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 4 AM WEST on time         694    4840       383           320     201
##   Los.Angeles.Delayed Phoenix.Delayed San.Diege.Delayed
## 1                  62              12                20
## 4                  62              12                20
##   San.Francisco.Delayed Seattle.Delayed
## 1                   102             305
## 4                   102             305

It’s not ideal, but it’s okay that we’ve duplicated the delay times for Alaska in AM West's row. We will simply over write that information, then finish tidying up the data frame.

airlines[2, 8:12] <- amwest.delay
airlines <- airlines[, c(1, 3:12)]
colnames(airlines) <- c("Airline", "Los.Angeles.On.Time", "Phoenix.On.Time", "San.Diego.On.Time", "San.Francisco.On.Time", "Seattle.On.Time", new.col.names)

airlines
##   Airline Los.Angeles.On.Time Phoenix.On.Time San.Diego.On.Time
## 1  ALASKA                 497             221               212
## 4 AM WEST                 694            4840               383
##   San.Francisco.On.Time Seattle.On.Time Los.Angeles.Delayed
## 1                   503            1841                  62
## 4                   320             201                 117
##   Phoenix.Delayed San.Diege.Delayed San.Francisco.Delayed Seattle.Delayed
## 1              12                20                   102             305
## 4             415                65                   129              61

Perfect! We could have also renamed the remaining columns using dplyr.

rename(airlines, Airline = X, Los.Angeles.On.Time = Los.Angeles, Phoenix.On.Time = Phoenix, San.Diego.On.Time = San.Diego, San.Francisco.On.Time = San.Francisco, Seattle.On.Time = Seattle)

Now let’s analyze the arrivals and delays for both airlines.

airlines$Avg.On.Time <- mean(as.numeric(airlines[1, 2:6]))
airlines[2, 12] <- mean(as.numeric(airlines[2, 2:6]))
airlines$Avg.Delayed <- mean(as.numeric(airlines[1, 7:11]))
airlines[2, 13] <- mean(as.numeric(airlines[2, 7:11]))

a <- ggplot(data = airlines, aes(x = Airline, y = Avg.On.Time)) +
  geom_bar(stat = "identity") +
  xlab("Airlines") +
  ylab("Num of Planes")

b <- ggplot(data = airlines, aes(x = Airline, y = Avg.Delayed)) +
  geom_bar(stat = "identity") +
  xlab("Airlines") +
  ylab("Num of Planes")

a

b

At first glance, AM West seems to perform significantly better than Alaska in arrivals, and worse in delays, but that’s because we’re simply comparing the values directly. Let’s take a closer look.

sum(as.numeric(airlines[1, 2:11]))
## [1] 3775
sum(as.numeric(airlines[2, 2:11]))
## [1] 7225

AM West handles nearly twice the number of flights that Alaska does. It’d be unfair to take a straight comparison between both airlines’ performances. What if we looked at the percentage?

total.flights.alaska <- c(0, 0, 0, 0, 0)
total.flights.amwest <- c(0, 0, 0, 0, 0)

air.final <- airlines

for(i in 2:6){
  total.flights.alaska[i - 1] <- airlines[1, i] + airlines[1, i + 5]
  total.flights.amwest[i - 1] <- airlines[2, i] + airlines[2, i + 5]
}

Interesting to note that, with the sums in hand, we can see which airport seems to be the main airport hub for each airline. For Alaska, their main hub is Seattle with 2,146 total flights, and AM West uses Phoenix with 5,255 total flights.

for(i in 2:6){
  air.final[1, i] <- airlines[1, i] / total.flights.alaska[i - 1]
  air.final[2, i] <- airlines[2, i] / total.flights.amwest[i - 1]
}

for(i in 7:11){
  air.final[1, i] <- airlines[1, i] / total.flights.alaska[i - 6]
  air.final[2, i] <- airlines[2, i] / total.flights.amwest[i - 6]
}

air.final[1, 12] <- mean(as.numeric(air.final[1, 2:6]))
air.final[2, 12] <- mean(as.numeric(air.final[2, 2:6]))
air.final[1, 13] <- mean(as.numeric(air.final[1, 7:11]))
air.final[2, 13] <- mean(as.numeric(air.final[2, 7:11]))

c <- ggplot(data = air.final, aes(x = Airline, y = Avg.On.Time)) +
  geom_bar(stat = "identity") +
  xlab("Airlines") +
  ylab("Percentage of Planes")

d <- ggplot(data = air.final, aes(x = Airline, y = Avg.Delayed)) +
  geom_bar(stat = "identity") +
  xlab("Airlines") +
  ylab("Percentage of Planes")

c

d

So, adjusting for the difference in volume, it looks as though Alaska Airlines performs better on average than AM West. It has slightly more on time flights, and much less delays.