# Filter out Cancelled and Diverted flights into separate files to reduce complexity and confustion
flights_cancelled <- flights_data %>% filter(Cancelled == 1)
flights_diverted<- flights_data %>% filter(Diverted == 1)
flights_comp <- flights_data %>% filter(Diverted == 0 & Cancelled == 0) 
#  Create a single field with date and time together for scheduled and actual departure, scheduled and actual arrival, wheels on & wheels on date and time, then make them all POSIX format
flights_comp <- flights_comp %>%
  mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)),
         new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), 
         new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
         new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)),
         new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)),
         new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
#  Make fileds file POSIX standard
flights_comp$new_CRSDepTime<- as.POSIXct(flights_comp$new_CRSDepTime, format= "%Y-%m-%d %H%M")
flights_comp$new_DepTime <- as.POSIXct(flights_comp$new_DepTime, format="%Y-%m-%d %H%M")
flights_comp$new_WheelsOff <- as.POSIXct(flights_comp$new_WheelsOff, format="%Y-%m-%d %H%M")
flights_comp$new_WheelsOn <- as.POSIXct(flights_comp$new_WheelsOn, format="%Y-%m-%d %H%M")
flights_comp$new_CRSArrTime <- as.POSIXct(flights_comp$new_CRSArrTime, format="%Y-%m-%d %H%M")
flights_comp$new_ArrTime <- as.POSIXct(flights_comp$new_ArrTime, format="%Y-%m-%d %H%M")
#  Save flights_comp without additional data.  Create new file (flights_enhanced) with additional fields
flights_enhanced <- flights_comp %>%
  mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins"))) %>%
  arrange(DepDelay)
# ASSUME: Flights with a DepDelay time of less than -47 actually left the following day, and were late departing, not early
#head(flights_enhanced %>% select(CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay))
#flights_enhanced %>% 
#  filter(DepDelay < -47 & DepDelay > -1440 ) %>% 
#  select(CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay)%>%
#  arrange(new_CRSDepTime)

#names(flights_enhanced)
# Recalculate Departure Delay for flights delayed overnight, set DepNextDay flag, and set departing early flights to 0
flights_enhanced <- flights_enhanced %>% 
  mutate(DepNextDay = ifelse(DepDelay < -47 & DepDelay > -1440, 1, 0), 
         DepDelay = ifelse(DepDelay < -47 & DepDelay > -1440, DepDelay + 1440, DepDelay),
         DepDelay = ifelse(DepDelay <  0, 0, DepDelay)) 

#filter(flights_enhanced, DepDelay < -47 & DepDelay > -1440 ) %>% 
#  select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay )%>%
#  arrange(new_CRSDepTime)

#filter(flights_enhanced, DepDelay < -0) %>% 
#  select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay)%>%
#  arrange(new_CRSDepTime)
#names(flights_enhanced)
# Identify flights departing more than 15 minutes late
flights_enhanced <- flights_enhanced %>% 
  mutate(DepDel_gt15 = ifelse(DepDelay >15, 1, 0)) 

#filter(flights_enhanced, DepDel_gt15 == 0) %>% 
#  select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay, DepDel_gt15)%>%
#  arrange(DepDelay)
#names(flights_enhanced)
# calculate Taxi Out time (time between actual departure and wheels off).  If TaxiOut is negative, correct by adding 1440. 
flights_enhanced <- flights_enhanced %>% 
  mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")))

#filter(flights_enhanced, TaxiOut < 0) %>% 
#  select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay, TaxiOut)%>%
#  arrange(TaxiOut)

flights_enhanced <- flights_enhanced %>% 
  mutate(TaxiOut = ifelse(TaxiOut < 0, TaxiOut+1440, TaxiOut)) 


#filter(flights_enhanced, TaxiOut < 0) %>% 
#  select(DepNextDay, CRSDepTime, new_CRSDepTime, DepTime, new_DepTime, DepDelay, TaxiOut)%>%
#  arrange(TaxiOut)

#names(flights_enhanced)
# Calculate Taxi In time and correct for flights taxing in after midnight

flights_enhanced <- flights_enhanced %>% 
  mutate(TaxiIn=as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins"))) 

#filter(flights_enhanced, TaxiIn < 0) %>% 
#  select(DepNextDay, TaxiIn, CRSArrTime, new_CRSArrTime, ArrTime, new_ArrTime, new_WheelsOn, #TaxiOut)%>%
#  arrange(TaxiIn)

flights_enhanced <- flights_enhanced %>% 
  mutate(TaxiIn = ifelse(TaxiIn < 0, TaxiIn+1440, TaxiIn)) 
# Calculate scheduled gate to gate
#head(flights_enhanced)

flights_enhanced <- flights_enhanced %>% 
  mutate(CRS_G2G=as.integer(difftime(new_CRSArrTime, new_CRSDepTime, units = "mins"))) 

#filter(flights_enhanced, CRS_G2G <= 0) %>% 
#  arrange(desc(CRS_G2G))

#filter(flights_enhanced, CRS_G2G > 0) %>% 
#  arrange(desc(CRS_G2G))

flights_enhanced <- flights_enhanced %>% 
  mutate(CRS_G2G = ifelse(CRS_G2G < 0, CRS_G2G+1440, CRS_G2G))
# Calculate Actual Elapsed Time, Gate to Gate

flights_enhanced <- flights_enhanced %>% 
  mutate(Act_G2G =as.integer(difftime(new_ArrTime, new_DepTime, units = "mins"))) 

#filter(flights_enhanced, Act_G2G <= 0) %>% 
#  arrange(Act_G2G)

#filter(flights_enhanced, Act_G2G > 0) %>% 
#  arrange(Act_G2G)

flights_enhanced <- flights_enhanced %>%
  mutate(Act_G2G = ifelse(Act_G2G < 0, Act_G2G+1440, Act_G2G))
# Calculate Actual Air Time

flights_enhanced <- flights_enhanced %>% 
  mutate(Act_air =as.integer(difftime(new_WheelsOff, new_WheelsOn, units = "mins"))) 

#filter(flights_enhanced, Act_air <= 0) %>% 
#  arrange(Act_air)

#filter(flights_enhanced, Act_air > 0) %>% 
#  arrange(Act_air)

flights_enhanced <- flights_enhanced %>%
  mutate(Act_air = ifelse(Act_air < 0, Act_air+1440, Act_air))
# Calculate the difference (buffer) between scheduled gate to gate time and Actual Elapsed Time

flights_enhanced <- flights_enhanced %>% 
  mutate(CRSvsActual = (CRS_G2G - Act_G2G)) 

#filter(flights_enhanced, CRSvsActual <= 0) %>% 
#  arrange(desc(CRSvsActual))

#filter(flights_enhanced, CRSvsActual > 0) %>% 
#  arrange(desc(CRSvsActual))
# Calculate the air speed

flights_enhanced <- flights_enhanced %>% 
  mutate(Act_airspeed =as.integer(Distance/Act_air),
         Total_speed = as.integer(Distance/Act_G2G))
## Warning in evalq(as.integer(Distance/Act_air), <environment>): NAs
## introduced by coercion to integer range
## Warning in evalq(as.integer(Distance/Act_G2G), <environment>): NAs
## introduced by coercion to integer range
#filter(flights_enhanced, Total_speed > 0) %>% arrange(Total_speed)
knitr::opts_chunk$set(echo = FALSE)


# * * * * * * * * * * * * * * * * * *
## MBA676 Data Management & Analytics
## Assignment 2
### Created October 8, 2017
#This assignment uses a file of domestic flight data for January 2016. 
#The data in this file is incomplete and many assumptions needed to be made regarding flight dates and times. 
#There was no adjustment for time zone differences, futher complicating the assumptions in the data.
## Warning: package 'ggvis' was built under R version 3.4.2
##  [1] "FlightDate"        "Carrier"           "TailNum"          
##  [4] "FlightNum"         "Origin"            "OriginCityName"   
##  [7] "OriginState"       "Dest"              "DestCityName"     
## [10] "DestState"         "CRSDepTime"        "DepTime"          
## [13] "WheelsOff"         "WheelsOn"          "CRSArrTime"       
## [16] "ArrTime"           "Cancelled"         "Diverted"         
## [19] "CRSElapsedTime"    "ActualElapsedTime" "Distance"         
## [22] "new_CRSDepTime"    "new_DepTime"       "new_WheelsOff"    
## [25] "new_WheelsOn"      "new_CRSArrTime"    "new_ArrTime"      
## [28] "DepDelay"          "DepNextDay"        "DepDel_gt15"      
## [31] "TaxiOut"           "TaxiIn"            "CRS_G2G"          
## [34] "Act_G2G"           "Act_air"           "CRSvsActual"      
## [37] "Act_airspeed"      "Total_speed"
## Warning in kable_markdown(x = structure("Summary of delays greater than 15
## minutes by carrier with Average flight delays", .Dim = c(1L, : The table
## should have a header (column names)
Summary of delays greater than 15 minutes by carrier with Average flight delays
## Warning in kable_markdown(x = structure("Data restricted to delays less
## than 5 hours and distances less than 1,500 miles", .Dim = c(1L, : The table
## should have a header (column names)
Data restricted to delays less than 5 hours and distances less than 1,500 miles
Carrier Avg_flight_delay Num_filghts
AA 55.0 8405
AS 59.0 879
B6 58.0 4726
DL 56.2 7116
EV 62.4 5537
F9 65.8 745
HA 29.5 151
NK 58.2 2498
OO 67.7 7691
UA 57.9 4169
VX 63.7 749
WN 45.6 13870
## Warning in kable_markdown(x = structure("Chart of flights with actual
## elapsed time between 30 and 300 minutes and departure delay time less than
## 5 hours", .Dim = c(1L, : The table should have a header (column names)
Chart of flights with actual elapsed time between 30 and 300 minutes and departure delay time less than 5 hours