I started out this project by loading several libraries to help me with my data-wrangling, manipulation and further visualisation of the data. Then I mirrorred manipulation of the data as explained in Unit 6.
library(dplyr)
library(ggvis)
library(knitr)
library(bindrcpp)
flights <- read.csv("domestic_flights_jan_2016.csv", stringsAsFactors = FALSE)
str(flights)
## 'data.frame': 445827 obs. of 21 variables:
## $ FlightDate : chr "1/6/2016" "1/7/2016" "1/8/2016" "1/9/2016" ...
## $ Carrier : chr "AA" "AA" "AA" "AA" ...
## $ TailNum : chr "N4YBAA" "N434AA" "N541AA" "N489AA" ...
## $ FlightNum : int 43 43 43 43 43 43 43 43 43 43 ...
## $ Origin : chr "DFW" "DFW" "DFW" "DFW" ...
## $ OriginCityName : chr "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" ...
## $ OriginState : chr "TX" "TX" "TX" "TX" ...
## $ Dest : chr "DTW" "DTW" "DTW" "DTW" ...
## $ DestCityName : chr "Detroit, MI" "Detroit, MI" "Detroit, MI" "Detroit, MI" ...
## $ DestState : chr "MI" "MI" "MI" "MI" ...
## $ CRSDepTime : int 1100 1100 1100 1100 1100 1100 1100 1100 1100 1100 ...
## $ DepTime : int 1057 1056 1055 1102 1240 1107 1059 1055 1058 1056 ...
## $ WheelsOff : int 1112 1110 1116 1115 1300 1118 1113 1107 1110 1110 ...
## $ WheelsOn : int 1424 1416 1431 1424 1617 1426 1429 1419 1420 1423 ...
## $ CRSArrTime : int 1438 1438 1438 1438 1438 1438 1438 1438 1438 1438 ...
## $ ArrTime : int 1432 1426 1445 1433 1631 1435 1438 1431 1428 1434 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 158 158 158 158 158 158 158 158 158 158 ...
## $ ActualElapsedTime: int 155 150 170 151 171 148 159 156 150 158 ...
## $ Distance : int 986 986 986 986 986 986 986 986 986 986 ...
flights %>% filter(!complete.cases(.))
flights$FlightDate <- as.Date(flights$FlightDate, format = "%m/%d/%Y")
flights %>% filter(Cancelled == 0) %>% summarize(maxcrsdep = max(CRSDepTime), maxdep = max(DepTime))
## maxcrsdep maxdep
## 1 2359 2400
flights <- flights %>%
mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)))
flights$new_CRSDepTime <- as.POSIXct(flights$new_CRSDepTime, format="%Y-%m-%d %H%M")
head(flights %>% select(new_CRSDepTime, CRSDepTime))
## new_CRSDepTime CRSDepTime
## 1 2016-01-06 11:00:00 1100
## 2 2016-01-07 11:00:00 1100
## 3 2016-01-08 11:00:00 1100
## 4 2016-01-09 11:00:00 1100
## 5 2016-01-10 11:00:00 1100
## 6 2016-01-11 11:00:00 1100
flights <- flights %>%
mutate(new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
flights$new_CRSArrTime <- as.POSIXct(flights$new_CRSArrTime, format="%Y-%m-%d %H%M")
flights <- flights %>% filter(Cancelled == 0) %>%
mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)),
new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
flights$new_DepTime <- as.POSIXct(flights$new_DepTime, format="%Y-%m-%d %H%M")
flights$new_WheelsOff <- as.POSIXct(flights$new_WheelsOff, format="%Y-%m-%d %H%M")
flights$new_WheelsOn <- as.POSIXct(flights$new_WheelsOn, format="%Y-%m-%d %H%M")
flights$new_ArrTime <- as.POSIXct(flights$new_ArrTime, format="%Y-%m-%d %H%M")
head(flights)
## FlightDate Carrier TailNum FlightNum Origin OriginCityName
## 1 2016-01-06 AA N4YBAA 43 DFW Dallas/Fort Worth, TX
## 2 2016-01-07 AA N434AA 43 DFW Dallas/Fort Worth, TX
## 3 2016-01-08 AA N541AA 43 DFW Dallas/Fort Worth, TX
## 4 2016-01-09 AA N489AA 43 DFW Dallas/Fort Worth, TX
## 5 2016-01-10 AA N439AA 43 DFW Dallas/Fort Worth, TX
## 6 2016-01-11 AA N468AA 43 DFW Dallas/Fort Worth, TX
## OriginState Dest DestCityName DestState CRSDepTime DepTime WheelsOff
## 1 TX DTW Detroit, MI MI 1100 1057 1112
## 2 TX DTW Detroit, MI MI 1100 1056 1110
## 3 TX DTW Detroit, MI MI 1100 1055 1116
## 4 TX DTW Detroit, MI MI 1100 1102 1115
## 5 TX DTW Detroit, MI MI 1100 1240 1300
## 6 TX DTW Detroit, MI MI 1100 1107 1118
## WheelsOn CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
## 1 1424 1438 1432 0 0 158
## 2 1416 1438 1426 0 0 158
## 3 1431 1438 1445 0 0 158
## 4 1424 1438 1433 0 0 158
## 5 1617 1438 1631 0 0 158
## 6 1426 1438 1435 0 0 158
## ActualElapsedTime Distance new_CRSDepTime new_CRSArrTime
## 1 155 986 2016-01-06 11:00:00 2016-01-06 14:38:00
## 2 150 986 2016-01-07 11:00:00 2016-01-07 14:38:00
## 3 170 986 2016-01-08 11:00:00 2016-01-08 14:38:00
## 4 151 986 2016-01-09 11:00:00 2016-01-09 14:38:00
## 5 171 986 2016-01-10 11:00:00 2016-01-10 14:38:00
## 6 148 986 2016-01-11 11:00:00 2016-01-11 14:38:00
## new_DepTime new_WheelsOff new_WheelsOn
## 1 2016-01-06 10:57:00 2016-01-06 11:12:00 2016-01-06 14:24:00
## 2 2016-01-07 10:56:00 2016-01-07 11:10:00 2016-01-07 14:16:00
## 3 2016-01-08 10:55:00 2016-01-08 11:16:00 2016-01-08 14:31:00
## 4 2016-01-09 11:02:00 2016-01-09 11:15:00 2016-01-09 14:24:00
## 5 2016-01-10 12:40:00 2016-01-10 13:00:00 2016-01-10 16:17:00
## 6 2016-01-11 11:07:00 2016-01-11 11:18:00 2016-01-11 14:26:00
## new_ArrTime
## 1 2016-01-06 14:32:00
## 2 2016-01-07 14:26:00
## 3 2016-01-08 14:45:00
## 4 2016-01-09 14:33:00
## 5 2016-01-10 16:31:00
## 6 2016-01-11 14:35:00
Once I completed cleaning the data, I filtered out the flights that were either cancelled or diverted in order not to skew my further analysis:
January_flights = flights %>% filter(Cancelled == 0, Diverted == 0) %>% filter(ArrTime > DepTime)
In addition to the original question of how late arrivals to PWM compares to those nation-wide, I would like to address the following questions in my analysis:
First , I would like to see what the average delay is for planes arriving in PWM in January by airline:
PWM_Late_Arrivals = January_flights %>% filter(Dest == "PWM") %>% select(Carrier, new_CRSArrTime, new_ArrTime)
PWM_Carrier_Late_Arrivals = PWM_Late_Arrivals %>% mutate(Late_Arrival_Min = as.integer(difftime(new_CRSArrTime, new_ArrTime, units = "mins")))
PWM_Carrier_Late_Arrivals = PWM_Carrier_Late_Arrivals %>%mutate(Late_Arrival =ifelse(Late_Arrival_Min <0, Late_Arrival_Min*-1, 0))
PWM_Carrier_Late_Arrivals %>% ggvis(x = ~Carrier, y =~Late_Arrival) %>% layer_bars(fill :="red")
Then, I proceeded by figuring out the average late arrival for the 5 airlines that provided their service to Portland, ME
PWMFlights <-PWM_Carrier_Late_Arrivals %>% group_by(Carrier) %>% summarise(Late_Arrival = sum(Late_Arrival))
colnames(PWMFlights) [2] <-"TotalofLateArrivalMinutes"
PWMFlights2 <-PWM_Carrier_Late_Arrivals %>% count(Carrier)
TotalPWMFlights <- inner_join(PWMFlights, PWMFlights2, by = "Carrier")
TotalPWMFlights2 <- mutate(TotalPWMFlights, TotalLateArrival = TotalofLateArrivalMinutes / n)
colnames(TotalPWMFlights2)[3]<-"NumberofFlights"
kable(TotalPWMFlights2)
| Carrier | TotalofLateArrivalMinutes | NumberofFlights | TotalLateArrival |
|---|---|---|---|
| AA | 278 | 28 | 9.928571 |
| B6 | 247 | 69 | 3.579710 |
| DL | 57 | 26 | 2.192308 |
| EV | 1224 | 144 | 8.500000 |
| WN | 319 | 59 | 5.406780 |
TotalPWMFlights2 %>% ggvis(~TotalofLateArrivalMinutes, ~NumberofFlights) %>% layer_points(fill=~Carrier) %>% layer_model_predictions(model="lm", se=T)
Then I proceeded to my second question of looking at the nation-wide numbers. In order to do this, I had to figure out which airlines had the most delays in terms of minutes:
January_flights = January_flights %>% mutate(Late_Arrival_Mins = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")))
January_flights = January_flights %>% mutate(Late_Arrival = ifelse(Late_Arrival_Mins < 0, Late_Arrival_Mins* -1, 0))
January_flights %>% ggvis(x = ~Carrier, y = ~Late_Arrival) %>% layer_bars(fill := "green")
Just like with the Portland data, by looking at this graph, one can assume that the Hawaiian Airlines is the best airline to take as it has the smallest difference between the scheduled and the actual arival times. However, it is clear that this difference is due the fact that the Hawaiian Airlines don’t offer many flights to begin with. In order to account for this, I decided to further manipulate the data to reflect the number of flights offered by each individual airline for the month of January:
January_flights2 = January_flights %>% group_by(Carrier) %>% summarise(Late_Arrival = sum(Late_Arrival))
colnames(January_flights2)[2] <- "Late_Arrival_Total"
January_flights3 = January_flights %>% count(Carrier)
January_flights4 = inner_join(January_flights2, January_flights3, by = "Carrier")
January_flights4 = mutate(January_flights4, Average_Late_Arrival = Late_Arrival_Total / n)
colnames(January_flights4) [3] <- "Number_Of_Flights"
kable(January_flights4)
| Carrier | Late_Arrival_Total | Number_Of_Flights | Average_Late_Arrival |
|---|---|---|---|
| AA | 655302 | 69775 | 9.391645 |
| AS | 146179 | 13057 | 11.195451 |
| B6 | 182102 | 20071 | 9.072891 |
| DL | 789168 | 66069 | 11.944603 |
| EV | 404051 | 39737 | 10.168131 |
| F9 | 95285 | 6633 | 14.365295 |
| HA | 34494 | 6162 | 5.597858 |
| NK | 84332 | 9865 | 8.548606 |
| OO | 440393 | 45588 | 9.660283 |
| UA | 473308 | 36244 | 13.058934 |
| VX | 44993 | 4874 | 9.231227 |
| WN | 826903 | 99051 | 8.348255 |
January_flights4 %>% ggvis(~Late_Arrival_Total, ~Number_Of_Flights) %>% layer_points(fill=~Carrier) %>% layer_model_predictions(model="lm", se=T)
Based on my brief analysis, there is a positive correlation between the number of flights offered by an airline and late arrivals. Out of five airlines offering flights to Portland, AA is the worst airline to be flying with in January, since the difference between the scheduled and the actual arrival is about 10 minutes per flight on average both on flights to Portland as well as nation-wide. DL is the airline that offers a more reliable service (2 minute difference between the scheduled and the actual arrival), however, nation-wide, this average increases to 12 minutes. The weakness of this analysis is it does not take into account flights that arrived before the scheduled times, because generally passengers do not complain when their flights arrive early.