# Filter out Cancelled and Diverted flights into separate files to reduce complexity and confustion
flights_cancelled <- flights_data %>% filter(Cancelled == 1)
flights_diverted<- flights_data %>% filter(Diverted == 1)
flights_comp <- flights_data %>% filter(Diverted == 0 & Cancelled == 0)
# Create a single field with date and time together for scheduled and actual departure, scheduled and actual arrival, wheels on & wheels on date and time, then make them all POSIX format
flights_comp <- flights_comp %>%
mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)),
new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)),
new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)),
new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)),
new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
# Make fileds file POSIX standard
flights_comp$new_CRSDepTime<- as.POSIXct(flights_comp$new_CRSDepTime, format= "%Y-%m-%d %H%M")
flights_comp$new_DepTime <- as.POSIXct(flights_comp$new_DepTime, format="%Y-%m-%d %H%M")
flights_comp$new_WheelsOff <- as.POSIXct(flights_comp$new_WheelsOff, format="%Y-%m-%d %H%M")
flights_comp$new_WheelsOn <- as.POSIXct(flights_comp$new_WheelsOn, format="%Y-%m-%d %H%M")
flights_comp$new_CRSArrTime <- as.POSIXct(flights_comp$new_CRSArrTime, format="%Y-%m-%d %H%M")
flights_comp$new_ArrTime <- as.POSIXct(flights_comp$new_ArrTime, format="%Y-%m-%d %H%M")
# Save flights_comp without additional data. Create new file (flights_enhanced) with additional fields
flights_enhanced <- flights_comp %>%
mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins"))) %>%
arrange(DepDelay)
# ASSUME: Flights with a DepDelay time of less than -47 actually left the following day, and were late departing, not early
#head(flights_enhanced %>% select(CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay))
#flights_enhanced %>%
# filter(DepDelay < -47 & DepDelay > -1440 ) %>%
# select(CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay)%>%
# arrange(new_CRSDepTime)
#names(flights_enhanced)
# Recalculate Departure Delay for flights delayed overnight, set DepNextDay flag, and set departing early flights to 0
flights_enhanced <- flights_enhanced %>%
mutate(DepNextDay = ifelse(DepDelay < -47 & DepDelay > -1440, 1, 0),
DepDelay = ifelse(DepDelay < -47 & DepDelay > -1440, DepDelay + 1440, DepDelay),
DepDelay = ifelse(DepDelay < 0, 0, DepDelay))
#filter(flights_enhanced, DepDelay < -47 & DepDelay > -1440 ) %>%
# select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay )%>%
# arrange(new_CRSDepTime)
#filter(flights_enhanced, DepDelay < -0) %>%
# select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay)%>%
# arrange(new_CRSDepTime)
#names(flights_enhanced)
# Identify flights departing more than 15 minutes late
flights_enhanced <- flights_enhanced %>%
mutate(DepDel_gt15 = ifelse(DepDelay >15, 1, 0))
#filter(flights_enhanced, DepDel_gt15 == 0) %>%
# select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay, DepDel_gt15)%>%
# arrange(DepDelay)
#names(flights_enhanced)
# calculate Taxi Out time (time between actual departure and wheels off). If TaxiOut is negative, correct by adding 1440.
flights_enhanced <- flights_enhanced %>%
mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")))
#filter(flights_enhanced, TaxiOut < 0) %>%
# select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay, TaxiOut)%>%
# arrange(TaxiOut)
flights_enhanced <- flights_enhanced %>%
mutate(TaxiOut = ifelse(TaxiOut < 0, TaxiOut+1440, TaxiOut))
#filter(flights_enhanced, TaxiOut < 0) %>%
# select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay, TaxiOut)%>%
# arrange(TaxiOut)
#names(flights_enhanced)
# Calculate Taxi In time and correct for flights taxing in after midnight
flights_enhanced <- flights_enhanced %>%
mutate(TaxiIn=as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")))
#filter(flights_enhanced, TaxiIn < 0) %>%
# select(DepNextDay, TaxiIn, CRSArrTime, new_CRSArrTime, ArrTime, new_ArrTime, new_WheelsOn, #TaxiOut)%>%
# arrange(TaxiIn)
flights_enhanced <- flights_enhanced %>%
mutate(TaxiIn = ifelse(TaxiIn < 0, TaxiIn+1440, TaxiIn))
# Calculate scheduled gate to gate
#head(flights_enhanced)
flights_enhanced <- flights_enhanced %>%
mutate(CRS_G2G=as.integer(difftime(new_CRSArrTime, new_CRSDepTime, units = "mins")))
#filter(flights_enhanced, CRS_G2G <= 0) %>%
# arrange(desc(CRS_G2G))
#filter(flights_enhanced, CRS_G2G > 0) %>%
# arrange(desc(CRS_G2G))
flights_enhanced <- flights_enhanced %>%
mutate(CRS_G2G = ifelse(CRS_G2G < 0, CRS_G2G+1440, CRS_G2G))
# Calculate Actual Elapsed Time, Gate to Gate
flights_enhanced <- flights_enhanced %>%
mutate(Act_G2G =as.integer(difftime(new_ArrTime, new_DepTime, units = "mins")))
#filter(flights_enhanced, Act_G2G <= 0) %>%
# arrange(Act_G2G)
#filter(flights_enhanced, Act_G2G > 0) %>%
# arrange(Act_G2G)
flights_enhanced <- flights_enhanced %>%
mutate(Act_G2G = ifelse(Act_G2G < 0, Act_G2G+1440, Act_G2G))
# Calculate Actual Air Time
flights_enhanced <- flights_enhanced %>%
mutate(Act_air =as.integer(difftime(new_WheelsOff, new_WheelsOn, units = "mins")))
#filter(flights_enhanced, Act_air <= 0) %>%
# arrange(Act_air)
#filter(flights_enhanced, Act_air > 0) %>%
# arrange(Act_air)
flights_enhanced <- flights_enhanced %>%
mutate(Act_air = ifelse(Act_air < 0, Act_air+1440, Act_air))
# Calculate the difference (buffer) between scheduled gate to gate time and Actual Elapsed Time
flights_enhanced <- flights_enhanced %>%
mutate(CRSvsActual = (CRS_G2G - Act_G2G))
#filter(flights_enhanced, CRSvsActual <= 0) %>%
# arrange(desc(CRSvsActual))
#filter(flights_enhanced, CRSvsActual > 0) %>%
# arrange(desc(CRSvsActual))
# Calculate the air speed
flights_enhanced <- flights_enhanced %>%
mutate(Act_airspeed =as.integer(Distance/Act_air),
Total_speed = as.integer(Distance/Act_G2G))
## Warning in evalq(as.integer(Distance/Act_air), <environment>): NAs
## introduced by coercion to integer range
## Warning in evalq(as.integer(Distance/Act_G2G), <environment>): NAs
## introduced by coercion to integer range
#filter(flights_enhanced, Total_speed > 0) %>% arrange(Total_speed)
knitr::opts_chunk$set(echo = FALSE)
# * * * * * * * * * * * * * * * * * *
## MBA676 Data Management & Analytics
## Assignment 2
### Created October 8, 2017
#This assignment uses a file of domestic flight data for January 2016.
#The data in this file is incomplete and many assumptions needed to be made regarding flight dates and times.
#There was no adjustment for time zone differences, futher complicating the assumptions in the data.
## Warning: package 'ggvis' was built under R version 3.4.2
## [1] "FlightDate" "Carrier" "TailNum"
## [4] "FlightNum" "Origin" "OriginCityName"
## [7] "OriginState" "Dest" "DestCityName"
## [10] "DestState" "CRSDepTime" "DepTime"
## [13] "WheelsOff" "WheelsOn" "CRSArrTime"
## [16] "ArrTime" "Cancelled" "Diverted"
## [19] "CRSElapsedTime" "ActualElapsedTime" "Distance"
## [22] "new_CRSDepTime" "new_DepTime" "new_WheelsOff"
## [25] "new_WheelsOn" "new_CRSArrTime" "new_ArrTime"
## [28] "DepDelay" "DepNextDay" "DepDel_gt15"
## [31] "TaxiOut" "TaxiIn" "CRS_G2G"
## [34] "Act_G2G" "Act_air" "CRSvsActual"
## [37] "Act_airspeed" "Total_speed"
## Warning in kable_markdown(x = structure("Summary of delays greater than 15
## minutes by carrier with Average flight delays", .Dim = c(1L, : The table
## should have a header (column names)
| Summary of delays greater than 15 minutes by carrier with Average flight delays |
## Warning in kable_markdown(x = structure("Data restricted to delays less
## than 5 hours and distances less than 1,500 miles", .Dim = c(1L, : The table
## should have a header (column names)
| Data restricted to delays less than 5 hours and distances less than 1,500 miles |
| AA |
55.0 |
8405 |
| AS |
59.0 |
879 |
| B6 |
58.0 |
4726 |
| DL |
56.2 |
7116 |
| EV |
62.4 |
5537 |
| F9 |
65.8 |
745 |
| HA |
29.5 |
151 |
| NK |
58.2 |
2498 |
| OO |
67.7 |
7691 |
| UA |
57.9 |
4169 |
| VX |
63.7 |
749 |
| WN |
45.6 |
13870 |
## Warning in kable_markdown(x = structure("Chart of flights with actual
## elapsed time between 30 and 300 minutes and departure delay time less than
## 5 hours", .Dim = c(1L, : The table should have a header (column names)
| Chart of flights with actual elapsed time between 30 and 300 minutes and departure delay time less than 5 hours |