First we call the libraries that we will be using

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

We need to bring in our csv file which was created using MySQL. Data is in a wide format.

flights <- read.csv("https://raw.githubusercontent.com/bkreis84/Kreis-Week-6/master/delays.csv", header = TRUE)
write.table(flights, sep = ",");
## "X","X.1","Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle"
## "1","ALASKA","on time",497,221,212,503,1841
## "2","","delayed",62,12,20,102,305
## "3","","",NA,NA,NA,NA,NA
## "4","AM WEST","on time",694,4840,383,320,201
## "5","","delayed",117,415,65,129,61
tbl_df(flights)
## Source: local data frame [5 x 7]
## 
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
##    (fctr)  (fctr)       (int)   (int)     (int)         (int)   (int)
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

The data needs to be cleaned up a bit, so we add a couple missing values and change the names of two columns in order to make them more descriptive.

flights <- flights[-3,]
flights <- rename(flights, Airline = X, Status = X.1)
flights[2, "Airline"] = "ALASKA"
flights[4, "Airline"] = "AM WEST"
tbl_df(flights)
## Source: local data frame [4 x 7]
## 
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
##    (fctr)  (fctr)       (int)   (int)     (int)         (int)   (int)
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

In order to tidy up our data we need to first use the gather function. This will change the city names, which were previously listed as column headers, into our rows which represent observations. The Key is the city names and the value column is our frequency values.

flights <- gather(flights, "City", "n", 3:7)
tbl_df(flights)
## Source: local data frame [20 x 4]
## 
##    Airline  Status          City     n
##     (fctr)  (fctr)        (fctr) (int)
## 1   ALASKA on time   Los.Angeles   497
## 2   ALASKA delayed   Los.Angeles    62
## 3  AM WEST on time   Los.Angeles   694
## 4  AM WEST delayed   Los.Angeles   117
## 5   ALASKA on time       Phoenix   221
## 6   ALASKA delayed       Phoenix    12
## 7  AM WEST on time       Phoenix  4840
## 8  AM WEST delayed       Phoenix   415
## 9   ALASKA on time     San.Diego   212
## 10  ALASKA delayed     San.Diego    20
## 11 AM WEST on time     San.Diego   383
## 12 AM WEST delayed     San.Diego    65
## 13  ALASKA on time San.Francisco   503
## 14  ALASKA delayed San.Francisco   102
## 15 AM WEST on time San.Francisco   320
## 16 AM WEST delayed San.Francisco   129
## 17  ALASKA on time       Seattle  1841
## 18  ALASKA delayed       Seattle   305
## 19 AM WEST on time       Seattle   201
## 20 AM WEST delayed       Seattle    61

We now will make variables out of the “on time” & “delayed”" values currently listed in our data set. We use the spread function for this.

flights <- spread(flights, Status, n)
tbl_df(flights)
## Source: local data frame [10 x 4]
## 
##    Airline          City delayed on time
##     (fctr)        (fctr)   (int)   (int)
## 1   ALASKA   Los.Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San.Diego      20     212
## 4   ALASKA San.Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los.Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San.Diego      65     383
## 9  AM WEST San.Francisco     129     320
## 10 AM WEST       Seattle      61     201

I was having some issues with my column name “on time,” so I found it necessary to change the name to have no spaces.

colnames(flights)[4] <- "on_time"

I now used the mutate function to add a column to the data set which shows the percentage of flights that were delayed. Looking at the results, AM WEST had higher levels of flight delays for each indvidual city as compared to ALASKA.

flights <- flights %>% 
  mutate(percent_delayed = (delayed/(on_time + delayed)*100))

flights$percent_delayed <- round(flights$percent_delayed, digits = 2)
tbl_df(flights)
## Source: local data frame [10 x 5]
## 
##    Airline          City delayed on_time percent_delayed
##     (fctr)        (fctr)   (int)   (int)           (dbl)
## 1   ALASKA   Los.Angeles      62     497           11.09
## 2   ALASKA       Phoenix      12     221            5.15
## 3   ALASKA     San.Diego      20     212            8.62
## 4   ALASKA San.Francisco     102     503           16.86
## 5   ALASKA       Seattle     305    1841           14.21
## 6  AM WEST   Los.Angeles     117     694           14.43
## 7  AM WEST       Phoenix     415    4840            7.90
## 8  AM WEST     San.Diego      65     383           14.51
## 9  AM WEST San.Francisco     129     320           28.73
## 10 AM WEST       Seattle      61     201           23.28

We then look at the performance of the airlines as a whole. It seems that AM WEST actually had a lower level of overall delays. Looking back at our individual city output, this looks like it is the result of the much higher volume of flights operated by AM WEST (particularly in Phoenix where the company also had a relatively low delay percentage)

flights2 <- flights %>% 
  group_by(Airline) %>% 
  summarise(percent_delayed = sum(delayed)/(sum(on_time) + sum(delayed))*100)
tbl_df(flights2)
## Source: local data frame [2 x 2]
## 
##   Airline percent_delayed
##    (fctr)           (dbl)
## 1  ALASKA        13.27152
## 2 AM WEST        10.89273