Load libraries
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Load Data
fightstatus <- tbl_df(read.csv("sampleFlightData.csv", stringsAsFactors = FALSE, check.names = FALSE))
#all flight status
print(fightstatus)
## # A tibble: 4 x 7
## Flight Status `Los Angeles` Phoenix `San Diego` `San Francisco`
## <chr> <chr> <int> <int> <int> <int>
## 1 Alaska On Time 497 221 212 503
## 2 Alaska Delayed 62 12 20 102
## 3 AM WEST On Time 694 4840 383 320
## 4 AM WEST Delayed 117 415 65 129
## # ... with 1 more variables: Seattle <int>
Tidy the data per airline
#all arrival status for Alaska
fightstatus.ALASKA <- fightstatus %>%
filter(Flight == "Alaska") %>%
gather(Destination, Arrival, 3:7)
## Warning: package 'bindrcpp' was built under R version 3.4.2
print(fightstatus.ALASKA)
## # A tibble: 10 x 4
## Flight Status Destination Arrival
## <chr> <chr> <chr> <int>
## 1 Alaska On Time Los Angeles 497
## 2 Alaska Delayed Los Angeles 62
## 3 Alaska On Time Phoenix 221
## 4 Alaska Delayed Phoenix 12
## 5 Alaska On Time San Diego 212
## 6 Alaska Delayed San Diego 20
## 7 Alaska On Time San Francisco 503
## 8 Alaska Delayed San Francisco 102
## 9 Alaska On Time Seattle 1841
## 10 Alaska Delayed Seattle 305
#all arrival status for AM WEST
fightstatus.AMWEST <- fightstatus %>%
filter(Flight == "AM WEST") %>%
gather(Destination, Arrival, 3:7)
print(fightstatus.AMWEST)
## # A tibble: 10 x 4
## Flight Status Destination Arrival
## <chr> <chr> <chr> <int>
## 1 AM WEST On Time Los Angeles 694
## 2 AM WEST Delayed Los Angeles 117
## 3 AM WEST On Time Phoenix 4840
## 4 AM WEST Delayed Phoenix 415
## 5 AM WEST On Time San Diego 383
## 6 AM WEST Delayed San Diego 65
## 7 AM WEST On Time San Francisco 320
## 8 AM WEST Delayed San Francisco 129
## 9 AM WEST On Time Seattle 201
## 10 AM WEST Delayed Seattle 61
#spread the status column
fightstatus.ALASKA <- fightstatus.ALASKA %>%
spread(Status, Arrival)
Transform the data and calculate the delay percentage
#calculate and add the delay percentage
fightstatus.ALASKA <- fightstatus.ALASKA %>%
mutate(
Arrival_Total = fightstatus.ALASKA$Delayed + fightstatus.ALASKA$`On Time`,
Delay_Pct = fightstatus.ALASKA$Delayed / Arrival_Total
)
#spread the status column
fightstatus.AMWEST <- fightstatus.AMWEST %>%
spread(Status, Arrival)
#calculate and add the delay percentage
fightstatus.AMWEST <- fightstatus.AMWEST %>%
mutate(
Arrival_Total = fightstatus.AMWEST$Delayed + fightstatus.AMWEST$`On Time`,
Delay_Pct = fightstatus.AMWEST$Delayed / Arrival_Total
)
Print Delay Pct
print(fightstatus.ALASKA)
## # A tibble: 5 x 6
## Flight Destination Delayed `On Time` Arrival_Total Delay_Pct
## <chr> <chr> <int> <int> <int> <dbl>
## 1 Alaska Los Angeles 62 497 559 0.11091234
## 2 Alaska Phoenix 12 221 233 0.05150215
## 3 Alaska San Diego 20 212 232 0.08620690
## 4 Alaska San Francisco 102 503 605 0.16859504
## 5 Alaska Seattle 305 1841 2146 0.14212488
print(fightstatus.AMWEST)
## # A tibble: 5 x 6
## Flight Destination Delayed `On Time` Arrival_Total Delay_Pct
## <chr> <chr> <int> <int> <int> <dbl>
## 1 AM WEST Los Angeles 117 694 811 0.14426634
## 2 AM WEST Phoenix 415 4840 5255 0.07897241
## 3 AM WEST San Diego 65 383 448 0.14508929
## 4 AM WEST San Francisco 129 320 449 0.28730512
## 5 AM WEST Seattle 61 201 262 0.23282443