Setup Packages

knitr::opts_chunk$set(echo = TRUE)
if("tidyr" %in% rownames(installed.packages()) == FALSE) {install.packages("tidyr")}
library(tidyr)
if("dplyr" %in% rownames(installed.packages()) == FALSE) {install.packages("dplyr")}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if("stringr" %in% rownames(installed.packages()) == FALSE) {install.packages("stringr")}
library(stringr)
if("zoo" %in% rownames(installed.packages()) == FALSE) {install.packages("zoo")}
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
if("ggplot2" %in% rownames(installed.packages()) == FALSE) {install.packages("ggplot2")}
library(ggplot2)

Read in Data

raw_arr_delay_data <- read.csv("arr_delay.csv", stringsAsFactors = FALSE, na.strings = c("", " "))
raw_arr_delay_data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA    <NA>        NA            NA    <NA>
## 4 AM WEST on time         694   4,840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

Clean Up data in Data frame

#Remove Row 3 as it is an empty row
arr_delay_data <- raw_arr_delay_data[-c(3),]

#Cleanup Column Names
names(arr_delay_data) <- c("Airline", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")

#Remove commas from numeric data (Phoenix and Seattle Columns) so that they can be recognised as integer.  This is necessary to be able to summarise the data later on.
arr_delay_data$Phoenix <- as.integer(str_replace_all(arr_delay_data$Phoenix, ",",""))
arr_delay_data$Seattle <- as.integer(str_replace_all(arr_delay_data$Seattle, ",",""))


#Filling Missing data in Airline Column.  Will use the na.locf from the zoo package to fill the N/A values in the Airlines column
arr_delay_data <- transform(arr_delay_data, Airline = na.locf(Airline))

arr_delay_data
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

Rearrange Data

The data is deemed to have four variables - Airline, Status(whether on time or delayed), City, Number (of flights). Airline is in Column 1, Status in Column 2 and City and number spread among the remaining columns. There is need to put City in its own column (the content to be the remaining column names) and the Number can be in the fourth column. For this, the gather function (of the tidy package) will be used to reshape the data.

tidy_arr_delay <- arr_delay_data %>% 
  gather(City, Number, Los.Angeles:Seattle)

tidy_arr_delay
##    Airline  Status          City Number
## 1   ALASKA on time   Los.Angeles    497
## 2   ALASKA delayed   Los.Angeles     62
## 3  AM WEST on time   Los.Angeles    694
## 4  AM WEST delayed   Los.Angeles    117
## 5   ALASKA on time       Phoenix    221
## 6   ALASKA delayed       Phoenix     12
## 7  AM WEST on time       Phoenix   4840
## 8  AM WEST delayed       Phoenix    415
## 9   ALASKA on time     San.Diego    212
## 10  ALASKA delayed     San.Diego     20
## 11 AM WEST on time     San.Diego    383
## 12 AM WEST delayed     San.Diego     65
## 13  ALASKA on time San.Francisco    503
## 14  ALASKA delayed San.Francisco    102
## 15 AM WEST on time San.Francisco    320
## 16 AM WEST delayed San.Francisco    129
## 17  ALASKA on time       Seattle   1841
## 18  ALASKA delayed       Seattle    305
## 19 AM WEST on time       Seattle    201
## 20 AM WEST delayed       Seattle     61

Arrival Delays

Select delayed flights

Filter the delayed flights only. The on time flights are not of concern for the rest of the analysis. In addition, since the status column will be the same value, that column is no longer needed.

delayed_data <- tidy_arr_delay %>% 
  filter(Status == "delayed") %>% 
  select(Airline, City, Number)
delayed_data
##    Airline          City Number
## 1   ALASKA   Los.Angeles     62
## 2  AM WEST   Los.Angeles    117
## 3   ALASKA       Phoenix     12
## 4  AM WEST       Phoenix    415
## 5   ALASKA     San.Diego     20
## 6  AM WEST     San.Diego     65
## 7   ALASKA San.Francisco    102
## 8  AM WEST San.Francisco    129
## 9   ALASKA       Seattle    305
## 10 AM WEST       Seattle     61

Group delayed flights by airline

In order to summarise data by airline, the group_by function of the dplyr package will be used to group the data by airline.

delay_group <- group_by(delayed_data, Airline)

delay_group
## Source: local data frame [10 x 3]
## Groups: Airline [2]
## 
##    Airline          City Number
##      <chr>         <chr>  <int>
## 1   ALASKA   Los.Angeles     62
## 2  AM WEST   Los.Angeles    117
## 3   ALASKA       Phoenix     12
## 4  AM WEST       Phoenix    415
## 5   ALASKA     San.Diego     20
## 6  AM WEST     San.Diego     65
## 7   ALASKA San.Francisco    102
## 8  AM WEST San.Francisco    129
## 9   ALASKA       Seattle    305
## 10 AM WEST       Seattle     61

Total number of delayed flights by airline

Summarise by adding up the number of flights for each airline. A bar plot is also generated from the summary data.

airline_summary <- summarise(delay_group, sum(Number))
airline_summary
## # A tibble: 2 × 2
##   Airline `sum(Number)`
##     <chr>         <int>
## 1  ALASKA           501
## 2 AM WEST           787
barplot(airline_summary$`sum(Number)`, names.arg = airline_summary$Airline, xlab = "Airline", ylab = "Number of Delayed Flights")

The graph shows that AM West had the most delayed flights

Delays by Airline and City

Reverting to using the non-grouped data, the data is presented in graphical format by Airline on the x-axis grouped by city.

ggplot(data = delayed_data) +   
  geom_bar(
    mapping = aes(x = Airline, y = Number, fill = City), 
    stat = "identity",
    position = "dodge"
    )

The graph shows that for Alaska airlines, the most delays were in Seattle while the least were in Phoenix. For AM West airlines the most delays were in Phoenix and the least in Seattle.

The data is presented in graphical format by City on the x-axis grouped by Airline

delayed_data
##    Airline          City Number
## 1   ALASKA   Los.Angeles     62
## 2  AM WEST   Los.Angeles    117
## 3   ALASKA       Phoenix     12
## 4  AM WEST       Phoenix    415
## 5   ALASKA     San.Diego     20
## 6  AM WEST     San.Diego     65
## 7   ALASKA San.Francisco    102
## 8  AM WEST San.Francisco    129
## 9   ALASKA       Seattle    305
## 10 AM WEST       Seattle     61
ggplot(data = delayed_data) +   
  geom_bar(
    mapping = aes(x = City, y = Number, fill = Airline), 
    stat = "identity",
    position = "dodge"
    )

The graph shows that AM West had the most delays in every city except Seattle where Alaska Airlines had the most delays