Step 1 - Load the Untidy Data

week_5 <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/week_5.csv")

week_5
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Step 2 - Add some Column Names

colnames(week_5)[1:2] <- c("provider","status")

Step 3 - Fill in Provider Names Where Missing

week_5[2,1] <- 'ALASKA'
week_5[5,1] <- 'AM WEST'

Step 4 - Remove Blank 3rd Row

week_5 <- week_5[-3,]

week_5
##   provider  status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        20           102     305
## 4  AM WEST on time         694    4840       383           320     201
## 5  AM WEST delayed         117     415        65           129      61

Step 5 - Gather Location/Flight Count

Ideally “Location” should be a variable/column name, with individual “Values” relating to the specific location (i.e. ‘Los.Angeles’). We can use the “gather” function to turn the various columns into location/flight_count value pairs:

week_5 <- gather(week_5,"location", "flight_count", Los.Angeles:Seattle)

week_5
##    provider  status      location flight_count
## 1    ALASKA on time   Los.Angeles          497
## 2    ALASKA delayed   Los.Angeles           62
## 3   AM WEST on time   Los.Angeles          694
## 4   AM WEST delayed   Los.Angeles          117
## 5    ALASKA on time       Phoenix          221
## 6    ALASKA delayed       Phoenix           12
## 7   AM WEST on time       Phoenix         4840
## 8   AM WEST delayed       Phoenix          415
## 9    ALASKA on time     San.Diego          212
## 10   ALASKA delayed     San.Diego           20
## 11  AM WEST on time     San.Diego          383
## 12  AM WEST delayed     San.Diego           65
## 13   ALASKA on time San.Francisco          503
## 14   ALASKA delayed San.Francisco          102
## 15  AM WEST on time San.Francisco          320
## 16  AM WEST delayed San.Francisco          129
## 17   ALASKA on time       Seattle         1841
## 18   ALASKA delayed       Seattle          305
## 19  AM WEST on time       Seattle          201
## 20  AM WEST delayed       Seattle           61

Step 6 - Spread/Cast Status

Ideally ontime flights and delayed flights should have their own variables/column names. This requires casting/spreading:

week_5 <- spread(week_5,status,flight_count)

week_5
##    provider      location delayed on time
## 1    ALASKA   Los.Angeles      62     497
## 2    ALASKA       Phoenix      12     221
## 3    ALASKA     San.Diego      20     212
## 4    ALASKA San.Francisco     102     503
## 5    ALASKA       Seattle     305    1841
## 6   AM WEST   Los.Angeles     117     694
## 7   AM WEST       Phoenix     415    4840
## 8   AM WEST     San.Diego      65     383
## 9   AM WEST San.Francisco     129     320
## 10  AM WEST       Seattle      61     201

Step 7 - Add a total column and percentage columns

Adding a column that contains the total of delayed and on time flights. Also adding a column for % of delayed. This is good for analysis purposes.

week_5$total_flights <- week_5$delayed + week_5$'on time'
week_5$delayed_pct <- week_5$delayed / week_5$total_flights

week_5
##    provider      location delayed on time total_flights delayed_pct
## 1    ALASKA   Los.Angeles      62     497           559  0.11091234
## 2    ALASKA       Phoenix      12     221           233  0.05150215
## 3    ALASKA     San.Diego      20     212           232  0.08620690
## 4    ALASKA San.Francisco     102     503           605  0.16859504
## 5    ALASKA       Seattle     305    1841          2146  0.14212488
## 6   AM WEST   Los.Angeles     117     694           811  0.14426634
## 7   AM WEST       Phoenix     415    4840          5255  0.07897241
## 8   AM WEST     San.Diego      65     383           448  0.14508929
## 9   AM WEST San.Francisco     129     320           449  0.28730512
## 10  AM WEST       Seattle      61     201           262  0.23282443

Step 8 - Comparing Across Providers

Lets look at overall delay percentages per providers

AK_delayed_pct <- sum(subset(week_5,provider == 'ALASKA')$delayed) / sum(subset(week_5,provider == 'ALASKA')$total_flights) 



AW_delayed_pct <- sum(subset(week_5,provider == 'AM WEST')$delayed) / sum(subset(week_5,provider == 'AM WEST')$total_flights) 

df <- data.frame(c("Alaska", "AM West"),c(AK_delayed_pct, AW_delayed_pct))
colnames(df) <- c("Provider", "Delay Percentage")
df
##   Provider Delay Percentage
## 1   Alaska        0.1327152
## 2  AM West        0.1089273

Conclusion

Looks like AM West has a lower delay percentage than Alaska.