This report looks at metrics for flights departing from PWM.
An initial linear regression analysis of the full data set was performed in an effort to identify predictors of departure delays but found no conlusive linear relationships. If time were less of a constraint, a better regression analysis could be performed by converting categorical data such as location to numerical data and by including historical weather data in the anaylsis.
A report of the linear regression analysis was created and saved as a reference document for future analysis opportunities.
The linear regression analysis report can be viewed at: “http://rpubs.com/clolar/216400”
#Load the Raw Data file
RawData <- read.csv("domestic_flights_jan_2016.csv", header = TRUE, stringsAsFactors = FALSE)
#Extract records for ORD and Add a counter field "numflight"
flights <- RawData %>% filter(Origin == "PWM") %>% mutate(numflight = 1)
#Field Headings
names(flights)
## [1] "FlightDate" "Carrier" "TailNum"
## [4] "FlightNum" "Origin" "OriginCityName"
## [7] "OriginState" "Dest" "DestCityName"
## [10] "DestState" "CRSDepTime" "DepTime"
## [13] "WheelsOff" "WheelsOn" "CRSArrTime"
## [16] "ArrTime" "Cancelled" "Diverted"
## [19] "CRSElapsedTime" "ActualElapsedTime" "Distance"
## [22] "numflight"
#Structure
str(flights)
## 'data.frame': 377 obs. of 22 variables:
## $ FlightDate : chr "1/5/2016" "1/1/2016" "1/6/2016" "1/4/2016" ...
## $ Carrier : chr "AA" "AA" "AA" "AA" ...
## $ TailNum : chr "N745VJ" "N748UW" "N829AW" "N755US" ...
## $ FlightNum : int 1815 1815 1815 1815 1815 1815 850 653 1815 1815 ...
## $ Origin : chr "PWM" "PWM" "PWM" "PWM" ...
## $ OriginCityName : chr "Portland, ME" "Portland, ME" "Portland, ME" "Portland, ME" ...
## $ OriginState : chr "ME" "ME" "ME" "ME" ...
## $ Dest : chr "CLT" "CLT" "CLT" "CLT" ...
## $ DestCityName : chr "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" ...
## $ DestState : chr "NC" "NC" "NC" "NC" ...
## $ CRSDepTime : int 800 805 800 805 800 800 800 910 800 800 ...
## $ DepTime : int 906 802 755 803 758 801 755 903 817 757 ...
## $ WheelsOff : int 923 814 815 817 821 842 815 919 828 850 ...
## $ WheelsOn : int 1111 1034 1022 1037 1034 1048 1026 1130 1039 1057 ...
## $ CRSArrTime : int 1047 1050 1047 1050 1047 1047 1047 1201 1047 1047 ...
## $ ArrTime : int 1127 1040 1027 1044 1039 1053 1030 1138 1045 1102 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 167 165 167 165 167 167 167 171 167 167 ...
## $ ActualElapsedTime: int 141 158 152 161 161 172 155 155 148 185 ...
## $ Distance : int 813 813 813 813 813 813 813 813 813 813 ...
## $ numflight : num 1 1 1 1 1 1 1 1 1 1 ...
#Quick Check for records with Duplicate Tail Numbers.
pandoc.table(head(flights %>% count(length(unique(TailNum))), style = "grid"))
##
## -----------------------------
## length(unique(TailNum)) n
## ------------------------- ---
## 267 377
## -----------------------------
Confirmed records with duplicate tail numbers exist.
There are 377 Records and 267 Unique Tail Numbers
#Checking for neagative flight times
pandoc.table(flights %>% count(ArrTime > DepTime), style = "grid")
##
##
## +---------------------+-----+
## | ArrTime > DepTime | n |
## +=====================+=====+
## | TRUE | 355 |
## +---------------------+-----+
## | NA | 22 |
## +---------------------+-----+
#Check the Number of Incomplete Cases
pandoc.table(flights %>% count(!complete.cases(.)), style = "grid")
##
##
## +----------------------+-----+
## | !complete.cases(.) | n |
## +======================+=====+
## | FALSE | 355 |
## +----------------------+-----+
## | TRUE | 22 |
## +----------------------+-----+
#Check the Number of Cancelled Flights
pandoc.table(flights %>% count(Cancelled == 1), style = 'grid')
##
##
## +------------------+-----+
## | Cancelled == 1 | n |
## +==================+=====+
## | FALSE | 355 |
## +------------------+-----+
## | TRUE | 22 |
## +------------------+-----+
It appears the incomplete cases are cancelled flights
Convert the FlightDate field to a Date clase
#This code chunk was taken directly from the lecure notes.
#Converts FlightDate to a Date class
flights$FlightDate <- as.Date(flights$FlightDate, format = "%m/%d/%Y")
#This code chunk was taken directly from the lecure notes.
#Using the **sprintf()** function to add leading zeros to allow creation of date/time variables
#Add leading zeros to allow creation of date/time variables, then paste the Date information from FlightDate
flights <- flights %>%
mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)))
flights$new_CRSDepTime <- as.POSIXct(flights$new_CRSDepTime, format="%Y-%m-%d %H%M")
pandoc.table(head(flights %>% select(CRSDepTime, new_CRSDepTime), style = "grid"))
##
## --------------------------------
## CRSDepTime new_CRSDepTime
## ------------ -------------------
## 800 2016-01-05 08:00:00
##
## 805 2016-01-01 08:05:00
##
## 800 2016-01-06 08:00:00
##
## 805 2016-01-04 08:05:00
##
## 800 2016-01-07 08:00:00
##
## 800 2016-01-08 08:00:00
## --------------------------------
#The **sprintf()** function is repeated for: *DepTime, WheelsOff, WheelsOn, CRSArrTime, and ArrTime*
#DepTime
flights <- flights %>%
mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)))
flights$new_DepTime <- as.POSIXct(flights$new_DepTime, format="%Y-%m-%d %H%M")
pandoc.table(head(flights %>% select(DepTime, new_DepTime), style = "grid"))
##
## -----------------------------
## DepTime new_DepTime
## --------- -------------------
## 906 2016-01-05 09:06:00
##
## 802 2016-01-01 08:02:00
##
## 755 2016-01-06 07:55:00
##
## 803 2016-01-04 08:03:00
##
## 758 2016-01-07 07:58:00
##
## 801 2016-01-08 08:01:00
## -----------------------------
#WheelsOff
flights <- flights %>%
mutate(new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)))
flights$new_WheelsOff <- as.POSIXct(flights$new_WheelsOff, format="%Y-%m-%d %H%M")
pandoc.table(head(flights %>% select(WheelsOff, new_WheelsOff), style = "grid"))
##
## -------------------------------
## WheelsOff new_WheelsOff
## ----------- -------------------
## 923 2016-01-05 09:23:00
##
## 814 2016-01-01 08:14:00
##
## 815 2016-01-06 08:15:00
##
## 817 2016-01-04 08:17:00
##
## 821 2016-01-07 08:21:00
##
## 842 2016-01-08 08:42:00
## -------------------------------
#WheelsOn
flights <- flights %>%
mutate(new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)))
flights$new_WheelsOn <- as.POSIXct(flights$new_WheelsOn, format="%Y-%m-%d %H%M")
pandoc.table(head(flights %>% select(WheelsOn, new_WheelsOn), style = "grid"))
##
## ------------------------------
## WheelsOn new_WheelsOn
## ---------- -------------------
## 1111 2016-01-05 11:11:00
##
## 1034 2016-01-01 10:34:00
##
## 1022 2016-01-06 10:22:00
##
## 1037 2016-01-04 10:37:00
##
## 1034 2016-01-07 10:34:00
##
## 1048 2016-01-08 10:48:00
## ------------------------------
#CRSArrTime
flights <- flights %>%
mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
flights$new_CRSArrTime <- as.POSIXct(flights$new_CRSArrTime, format="%Y-%m-%d %H%M")
pandoc.table(head(flights %>% select(CRSArrTime, new_CRSArrTime), style = "grid"))
##
## --------------------------------
## CRSArrTime new_CRSArrTime
## ------------ -------------------
## 1047 2016-01-05 10:47:00
##
## 1050 2016-01-01 10:50:00
##
## 1047 2016-01-06 10:47:00
##
## 1050 2016-01-04 10:50:00
##
## 1047 2016-01-07 10:47:00
##
## 1047 2016-01-08 10:47:00
## --------------------------------
#ArrTime
flights <- flights %>%
mutate(new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
flights$new_ArrTime <- as.POSIXct(flights$new_ArrTime, format="%Y-%m-%d %H%M")
pandoc.table(head(flights %>% select(ArrTime, new_ArrTime), style = "grid"))
##
## -----------------------------
## ArrTime new_ArrTime
## --------- -------------------
## 1127 2016-01-05 11:27:00
##
## 1040 2016-01-01 10:40:00
##
## 1027 2016-01-06 10:27:00
##
## 1044 2016-01-04 10:44:00
##
## 1039 2016-01-07 10:39:00
##
## 1053 2016-01-08 10:53:00
## -----------------------------
#Using the **difftime()** function to calculate the differnce between two date/time objects.
#The **as.interger()** function is used to store the result as in integer.
flights <- flights %>% mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")))
pandoc.table(head(flights %>% select(CRSDepTime, DepTime, DepDelay), style = "grid"))
##
## ---------------------------------
## CRSDepTime DepTime DepDelay
## ------------ --------- ----------
## 800 906 66
##
## 805 802 -3
##
## 800 755 -5
##
## 805 803 -2
##
## 800 758 -2
##
## 800 801 1
## ---------------------------------
#Using the **ifelse()** function to separate flights with delays into two categories.
#Variable DepDelayMinutes for delays less then 15 minutes and DeDel15 for delays 15 minutes or more.
flights <- flights %>% mutate(DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay),
DepDel15 = ifelse(DepDelay >= 15, 1, 0))
pandoc.table(head(flights %>% select(DepDelay, DepDelayMinutes, DepDel15), style = "grid"))
##
## ---------------------------------------
## DepDelay DepDelayMinutes DepDel15
## ---------- ----------------- ----------
## 66 66 1
##
## -3 0 0
##
## -5 0 0
##
## -2 0 0
##
## -2 0 0
##
## 1 1 0
## ---------------------------------------
#Check the number of flights with more than a 15 minute delay.
pandoc.table(flights %>% count(DepDel15 == 1), style = 'grid')
##
##
## +-----------------+-----+
## | DepDel15 == 1 | n |
## +=================+=====+
## | FALSE | 318 |
## +-----------------+-----+
## | TRUE | 37 |
## +-----------------+-----+
## | NA | 22 |
## +-----------------+-----+
#There are 41991 record for flights with a delay of 15 minutes or more
#Calculating the inboud and outboud taxi times and in minutes.
#TaxiOut and TaxiIn
flights <- flights %>% mutate(TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")),
TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")))
pandoc.table(head(flights %>% select(TaxiIn, TaxiOut), style = "grid"))
##
## ------------------
## TaxiIn TaxiOut
## -------- ---------
## 16 17
##
## 6 12
##
## 5 20
##
## 7 14
##
## 5 23
##
## 5 41
## ------------------
#Calculating the difference in minutes between scheduled and actual arrival time.
#Early arrivals show negative numbers.
#ArrDelay
flights <- flights %>% mutate(ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")))
pandoc.table(head(flights %>% select(ArrDelay), style = "grid"))
##
## ----------
## ArrDelay
## ----------
## 40
##
## -10
##
## -20
##
## -6
##
## -8
##
## 6
## ----------
#Calculating the difference in minutes between scheduled and actual arrival time with early arrivals set to 0.
#Creating an Arrival Delay Indicator, 15 Minutes or More (1=Yes)
#ArrDelayMinutes and ArrDel15
flights <- flights %>% mutate( ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay),
ArrDel15 = ifelse(ArrDelay >= 15, 1, 0))
pandoc.table(head(flights %>% select(ArrDelayMinutes, ArrDel15), style = "grid"))
##
## ----------------------------
## ArrDelayMinutes ArrDel15
## ----------------- ----------
## 40 1
##
## 0 0
##
## 0 0
##
## 0 0
##
## 0 0
##
## 6 0
## ----------------------------
#Arithmetic Calculations
#FlightTimeBuffer: The difference between scheduled elapsed time and actual elapsed time.
#AirTime: The time spent in the air.
#AirSpeed: Average speed of the plane in flight (mph)
#FlightTimeBuffer
flights <- flights %>% mutate(FlightTimeBuffer = CRSElapsedTime - ActualElapsedTime)
#AirTime
flights <- flights %>% mutate(AirTime = ActualElapsedTime - TaxiOut - TaxiIn)
#AirSpeed
flights <- flights %>% mutate(AirSpeed = Distance / (AirTime / 60))
pandoc.table(head(flights %>% select(FlightTimeBuffer, AirTime, AirSpeed), style = "grid"))
##
## ---------------------------------------
## FlightTimeBuffer AirTime AirSpeed
## ------------------ --------- ----------
## 26 108 451.7
##
## 7 140 348.4
##
## 15 127 384.1
##
## 4 140 348.4
##
## 6 133 366.8
##
## -5 126 387.1
## ---------------------------------------
#Field Headings
names(flights)
## [1] "FlightDate" "Carrier" "TailNum"
## [4] "FlightNum" "Origin" "OriginCityName"
## [7] "OriginState" "Dest" "DestCityName"
## [10] "DestState" "CRSDepTime" "DepTime"
## [13] "WheelsOff" "WheelsOn" "CRSArrTime"
## [16] "ArrTime" "Cancelled" "Diverted"
## [19] "CRSElapsedTime" "ActualElapsedTime" "Distance"
## [22] "numflight" "new_CRSDepTime" "new_DepTime"
## [25] "new_WheelsOff" "new_WheelsOn" "new_CRSArrTime"
## [28] "new_ArrTime" "DepDelay" "DepDelayMinutes"
## [31] "DepDel15" "TaxiOut" "TaxiIn"
## [34] "ArrDelay" "ArrDelayMinutes" "ArrDel15"
## [37] "FlightTimeBuffer" "AirTime" "AirSpeed"
#Structure
str(flights)
## 'data.frame': 377 obs. of 39 variables:
## $ FlightDate : Date, format: "2016-01-05" "2016-01-01" ...
## $ Carrier : chr "AA" "AA" "AA" "AA" ...
## $ TailNum : chr "N745VJ" "N748UW" "N829AW" "N755US" ...
## $ FlightNum : int 1815 1815 1815 1815 1815 1815 850 653 1815 1815 ...
## $ Origin : chr "PWM" "PWM" "PWM" "PWM" ...
## $ OriginCityName : chr "Portland, ME" "Portland, ME" "Portland, ME" "Portland, ME" ...
## $ OriginState : chr "ME" "ME" "ME" "ME" ...
## $ Dest : chr "CLT" "CLT" "CLT" "CLT" ...
## $ DestCityName : chr "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" "Charlotte, NC" ...
## $ DestState : chr "NC" "NC" "NC" "NC" ...
## $ CRSDepTime : int 800 805 800 805 800 800 800 910 800 800 ...
## $ DepTime : int 906 802 755 803 758 801 755 903 817 757 ...
## $ WheelsOff : int 923 814 815 817 821 842 815 919 828 850 ...
## $ WheelsOn : int 1111 1034 1022 1037 1034 1048 1026 1130 1039 1057 ...
## $ CRSArrTime : int 1047 1050 1047 1050 1047 1047 1047 1201 1047 1047 ...
## $ ArrTime : int 1127 1040 1027 1044 1039 1053 1030 1138 1045 1102 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 167 165 167 165 167 167 167 171 167 167 ...
## $ ActualElapsedTime: int 141 158 152 161 161 172 155 155 148 185 ...
## $ Distance : int 813 813 813 813 813 813 813 813 813 813 ...
## $ numflight : num 1 1 1 1 1 1 1 1 1 1 ...
## $ new_CRSDepTime : POSIXct, format: "2016-01-05 08:00:00" "2016-01-01 08:05:00" ...
## $ new_DepTime : POSIXct, format: "2016-01-05 09:06:00" "2016-01-01 08:02:00" ...
## $ new_WheelsOff : POSIXct, format: "2016-01-05 09:23:00" "2016-01-01 08:14:00" ...
## $ new_WheelsOn : POSIXct, format: "2016-01-05 11:11:00" "2016-01-01 10:34:00" ...
## $ new_CRSArrTime : POSIXct, format: "2016-01-05 10:47:00" "2016-01-01 10:50:00" ...
## $ new_ArrTime : POSIXct, format: "2016-01-05 11:27:00" "2016-01-01 10:40:00" ...
## $ DepDelay : int 66 -3 -5 -2 -2 1 -5 -7 17 -3 ...
## $ DepDelayMinutes : num 66 0 0 0 0 1 0 0 17 0 ...
## $ DepDel15 : num 1 0 0 0 0 0 0 0 1 0 ...
## $ TaxiOut : int 17 12 20 14 23 41 20 16 11 53 ...
## $ TaxiIn : int 16 6 5 7 5 5 4 8 6 5 ...
## $ ArrDelay : int 40 -10 -20 -6 -8 6 -17 -23 -2 15 ...
## $ ArrDelayMinutes : num 40 0 0 0 0 6 0 0 0 15 ...
## $ ArrDel15 : num 1 0 0 0 0 0 0 0 0 1 ...
## $ FlightTimeBuffer : int 26 7 15 4 6 -5 12 16 19 -18 ...
## $ AirTime : int 108 140 127 140 133 126 131 131 131 127 ...
## $ AirSpeed : num 452 348 384 348 367 ...
PWM Departure Flight Delays by Destination.
#Filter Out Cancelled Flights Leaving All Completed Flights
flights %>% filter(Cancelled == 0) %>%
ggvis(~Dest, ~numflight, fill := "blue", width := 1) %>% group_by(Dest) %>%
layer_bars(width = .5) %>%
add_axis("x", title = "Destination Airport") %>%
add_axis("y", title = "Number of Departures from PWM")
JFK, BWI, DTW, and EWR had to most depatures
PWM Departure Flights with Delays Greater Than 15 Minutes by Destination.
#New Data Frame of Completed Flights
#Add a counter field "numflight"
flights %>% filter(DepDel15 == 1) %>%
ggvis(~Dest, ~DepDel15, fill := "blue", width := 1) %>% group_by(Dest) %>%
layer_bars(width = .5) %>%
add_axis("x", title = "Destination Airport") %>%
add_axis("y", title = "Number of Delayed Departures from PWM")
BWI, EWR, DTW, CLT, and JFK had the most Delays
PWM Departure Flight Cancellations by Destination.
#Filter Out Completed Flights Leaving All Cancelled Flights
flights %>% filter(Cancelled == 1) %>%
ggvis(~Dest, ~Cancelled, fill := "blue", width := 1) %>% group_by(Dest) %>%
layer_bars(width = .5) %>%
add_axis("x", title = "Destination Airport") %>%
add_axis("y", title = "Cancelled Departures from PWM", values = seq(1, 6, by = 1))
BWI, EWR, and JFK had the most cancellations
Comparison of PWM Departure Flights and Flight Delays Greater than 15 Minutes.
aggflights <- flights %>% select(Dest, numflight, Cancelled, DepDel15) %>%
#Convert "NA" to "0"
mutate(DepDel15 = ifelse(is.na(DepDel15),0,DepDel15))
#Aggregate Each Column
aggflights <- aggflights %>% group_by(Dest) %>% summarise_each(funs(sum))
Prepare Aggregate Data
aggflights <- flights %>% select(Dest, numflight, Cancelled, DepDel15) %>%
#Convert "NA" to "0"
mutate(DepDel15 = ifelse(is.na(DepDel15),0,DepDel15))
#Aggregate Each Column
aggflights <- aggflights %>% group_by(Dest) %>% summarise_each(funs(sum))
kable(aggflights, align = 'c')
| Dest | numflight | Cancelled | DepDel15 |
|---|---|---|---|
| ATL | 26 | 0 | 3 |
| BWI | 70 | 6 | 8 |
| CLT | 30 | 2 | 5 |
| DTW | 60 | 0 | 6 |
| EWR | 64 | 6 | 7 |
| JFK | 89 | 6 | 5 |
| LGA | 7 | 0 | 0 |
| ORD | 31 | 2 | 3 |
Cancellation and Delay Comparison
aggflights %>%
ggvis(~DepDel15, ~Cancelled) %>%
layer_points(fill = ~factor(Dest), size := 400, opacity := .8) %>%
add_axis("x", title = "Number of 15min Delayed Departures from PWM") %>%
add_axis("y", title = "Cancelled Departures from PWM", values = seq(1, 6, by = 1))
The Data Suggests BWI is the Worst Connection for PWM