Load the required packages:

library(dplyr)
library(tidyr)
library(knitr)

1. Read the given flights CSV file into an R object

# Read the source file from the Github repository
flights_data <- read.csv("https://raw.githubusercontent.com/isrini/SI_IS607/master/flights_data.csv")

head(flights_data)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4         delayed         117     415        65           129      61

2. Do the data wrangling with tidyr and dplyr

#Add the airline name in the blank rows by selecting the row with airline name and updating the  
# blank row with the aairline name.

data_row  <- seq(1, 3, by = 2)
blank_row <- seq(2, 4, by = 2)
flights_data[blank_row,1] <- flights_data[data_row,1]
head(flights_data)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61
# Using gather to make city into a variable
flights_data <- gather(flights_data, "City", "n", 3:7)
head(flights_data)
##         X     X.1        City   n
## 1  ALASKA on time Los.Angeles 497
## 2  ALASKA delayed Los.Angeles  62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5  ALASKA on time     Phoenix 221
## 6  ALASKA delayed     Phoenix  12
## use spread to make 'on-time' and 'delayed' rows into two new distinct columns
flights_data <- spread(flights_data, X.1, n)
head(flights_data)
##         X          City delayed on time
## 1  ALASKA   Los.Angeles      62     497
## 2  ALASKA       Phoenix      12     221
## 3  ALASKA     San.Diego      20     212
## 4  ALASKA San.Francisco     102     503
## 5  ALASKA       Seattle     305    1841
## 6 AM WEST   Los.Angeles     117     694
# And finally doing some column name adjustments.

# Name the first column header as 'Airline'
colnames(flights_data)[1] <- "Airline"

# Replace the space with the hyphen for the column 'on time'
colnames(flights_data)[4] <- "on_time"

# Replace the . in the city names with a space
flights_data$City <- gsub("\\."," ",flights_data$City)

Here is the transformed Tidy data set

kable(flights_data)
Airline City delayed on_time
ALASKA Los Angeles 62 497
ALASKA Phoenix 12 221
ALASKA San Diego 20 212
ALASKA San Francisco 102 503
ALASKA Seattle 305 1841
AM WEST Los Angeles 117 694
AM WEST Phoenix 415 4840
AM WEST San Diego 65 383
AM WEST San Francisco 129 320
AM WEST Seattle 61 201

3. Perform analysis to compare the arrival delays for the two airlines.

flights_data %>% 
  group_by(City, Airline) %>% 
  summarise(delay_percent = 100 * delayed/(on_time + delayed)) %>% 
  arrange(City, Airline)
## Source: local data frame [10 x 3]
## Groups: City [5]
## 
##             City Airline delay_percent
##            (chr)  (fctr)         (dbl)
## 1    Los Angeles  ALASKA     11.091234
## 2    Los Angeles AM WEST     14.426634
## 3        Phoenix  ALASKA      5.150215
## 4        Phoenix AM WEST      7.897241
## 5      San Diego  ALASKA      8.620690
## 6      San Diego AM WEST     14.508929
## 7  San Francisco  ALASKA     16.859504
## 8  San Francisco AM WEST     28.730512
## 9        Seattle  ALASKA     14.212488
## 10       Seattle AM WEST     23.282443

Visual Plotting of the arrival delay rates bewteen the two airlines

library(ggplot2)
qplot(City, 100 * delayed/(on_time + delayed), data = flights_data, color = Airline, geom = "boxplot")