week_5 <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/week_5.csv")
week_5
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
colnames(week_5)[1:2] <- c("provider","status")
week_5[2,1] <- 'ALASKA'
week_5[5,1] <- 'AM WEST'
week_5 <- week_5[-3,]
week_5
## provider status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
Ideally “Location” should be a variable/column name, with individual “Values” relating to the specific location (i.e. ‘Los.Angeles’). We can use the “gather” function to turn the various columns into location/flight_count value pairs:
week_5 <- gather(week_5,"location", "flight_count", Los.Angeles:Seattle)
week_5
## provider status location flight_count
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Ideally ontime flights and delayed flights should have their own variables/column names. This requires casting/spreading:
week_5 <- spread(week_5,status,flight_count)
week_5
## provider location delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
Adding a column that contains the total of delayed and on time flights. Also adding a column for % of delayed. This is good for analysis purposes.
week_5$total_flights <- week_5$delayed + week_5$'on time'
week_5$delayed_pct <- week_5$delayed / week_5$total_flights
week_5
## provider location delayed on time total_flights delayed_pct
## 1 ALASKA Los.Angeles 62 497 559 0.11091234
## 2 ALASKA Phoenix 12 221 233 0.05150215
## 3 ALASKA San.Diego 20 212 232 0.08620690
## 4 ALASKA San.Francisco 102 503 605 0.16859504
## 5 ALASKA Seattle 305 1841 2146 0.14212488
## 6 AM WEST Los.Angeles 117 694 811 0.14426634
## 7 AM WEST Phoenix 415 4840 5255 0.07897241
## 8 AM WEST San.Diego 65 383 448 0.14508929
## 9 AM WEST San.Francisco 129 320 449 0.28730512
## 10 AM WEST Seattle 61 201 262 0.23282443
Lets look at overall delay percentages per providers
AK_delayed_pct <- sum(subset(week_5,provider == 'ALASKA')$delayed) / sum(subset(week_5,provider == 'ALASKA')$total_flights)
AW_delayed_pct <- sum(subset(week_5,provider == 'AM WEST')$delayed) / sum(subset(week_5,provider == 'AM WEST')$total_flights)
df <- data.frame(c("Alaska", "AM West"),c(AK_delayed_pct, AW_delayed_pct))
colnames(df) <- c("Provider", "Delay Percentage")
df
## Provider Delay Percentage
## 1 Alaska 0.1327152
## 2 AM West 0.1089273
Looks like AM West has a lower delay percentage than Alaska.