# Load Libraries
library(dplyr)
library(plyr)
# Load File
allflights <- read.csv("domestic_flights_jan_2016.csv", header = TRUE, sep = "," )
# Convert/format Dates and Create new date/time columns
# Convert flight date format
allflights$FlightDate <- as.Date(allflights$FlightDate, format = "%m/%d/%Y")
# Scheduled Departure Time, Scheduled Arrival Time: Leave in Cancellations
# Add leading 0 to three dight scheduled departure dates; create new scheduled departure date/time column
allflights <- allflights %>% mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)), new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
# Re-format new date/time columns
allflights$new_CRSDepTime <- as.POSIXct(allflights$new_CRSDepTime, format = "%Y-%m-%d %H%M")
allflights$new_CRSArrTime <- as.POSIXct(allflights$new_CRSArrTime, format="%Y-%m-%d %H%M")
# Create object for all cancelled flights before filtering out cancels
cancels <- allflights %>% filter(Cancelled == 1)
# Departure Time, Wheels Off, Wheels On, Arrival Time: Leave out Cancellations
# Add leading 0 to three dight scheduled departure dates; create new date/time columns
allflights <- allflights %>% filter(Cancelled == 0) %>% mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)), new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
# Re-format new date/time columns
allflights$new_DepTime <- as.POSIXct(allflights$new_DepTime, format="%Y-%m-%d %H%M")
allflights$new_WheelsOff <- as.POSIXct(allflights$new_WheelsOff, format="%Y-%m-%d %H%M")
allflights$new_WheelsOn <- as.POSIXct(allflights$new_WheelsOn, format="%Y-%m-%d %H%M")
allflights$new_ArrTime <- as.POSIXct(allflights$new_ArrTime, format="%Y-%m-%d %H%M")
# Create new Calculated Columns
allflights <- allflights %>% filter(Cancelled == 0) %>%
mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")),
DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay),
DepDel15 = ifelse(DepDelay >= 15, 1, 0),
TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")),
TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")),
ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")),
ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay),
ArrDel15 = ifelse(ArrDelay >= 15, 1, 0),
AirTime = ActualElapsedTime - TaxiOut - TaxiIn,
AirSpeed = Distance/(AirTime/60))
# Round air speed to 2 decimal places
allflights <- mutate(allflights, AirSpeed = round(AirSpeed, digits = 2))
# Insert Distinct Carriers Names based on exisiting carrier codes by joining the data with a new external file
carrier_labels <- read.csv("Carrier_Labels.csv", header = TRUE, sep = ",")
colnames(allflights)[2] <- "CarrierCode"
colnames(cancels)[2] <- "CarrierCode"
allflights <- inner_join(allflights, carrier_labels, by = "CarrierCode")
cancels <- inner_join(cancels, carrier_labels, by = "CarrierCode")
# Insert counting column filled with 1 - allows for counting flights
allflights$CountCol <- c(1)
cancels$CountCol <- c(1)
# Create object for non cancelled flights
flights <- allflights %>% filter(Cancelled == 0)
# How many planes were used in Jan. 2016 and how many flights did they make on average?
# Count planes by unique tail numbers then total the number of flights
numPlanes <- n_distinct(flights$TailNum)
numFlights <- sum(flights$CountCol)
# On average how many flights did each plane take during the month and each day?
output1 <- round(numFlights/numPlanes, digits = 0)
output2 <- round((numFlights/numPlanes)/30, digits = 0)
# What is the most overall on-time carrier?
# Calculate total delay minutes by carrier
CarrierTotDelay <- ddply(flights, ~CarrierName, summarise, DelayTotal = sum(DepDelayMinutes))
# Calculate total flights by carrier
CarrierTotFlights <- ddply(flights, ~CarrierName, summarise, FlightsTotal = sum(CountCol))
# Join the previous two tables and insert an average delay per flight by carrier
CPerformance <- inner_join(CarrierTotFlights, CarrierTotDelay, by = "CarrierName")
CPerformance$AvgFlightDelay <- c(round((CPerformance$DelayTotal/CPerformance$FlightsTotal), digits = 2))
# Overall average flight delays in minutes for all carriers
output3 <- CPerformance %>% select(CarrierName, AvgFlightDelay) %>% arrange(desc(AvgFlightDelay))
# What is the most on-time carrier out of PWM?
# Filters PWM data out of full list and performs same fuctions as above
PWMDelays <- flights %>% filter(Origin == "PWM") %>% select(CarrierName, CountCol, DepDelayMinutes)
PWMCarrierTotDelay <- ddply(PWMDelays, ~CarrierName, summarise, DelayTotal = sum(DepDelayMinutes))
PWMCarrierTotFlights <- ddply(PWMDelays, ~CarrierName, summarise, FlightsTotal = sum(CountCol))
PWMPerformance <- inner_join(PWMCarrierTotFlights, PWMCarrierTotDelay, by = "CarrierName")
PWMPerformance$AvgFlightDelay <- c(round((PWMPerformance$DelayTotal/PWMPerformance$FlightsTotal), digits = 2))
output4 <- PWMPerformance %>% select(CarrierName, AvgFlightDelay) %>% arrange(AvgFlightDelay)
# What Carriers have the most cancels?
# Totals the number of cancels for each carrier for all domestic flights
CarrierCancels <- ddply(cancels, ~CarrierName, summarise, CancelledFlights = sum(CountCol))
output5 <- CarrierCancels %>% select(CarrierName, CancelledFlights) %>% arrange(desc(CancelledFlights))
# Totals the number of cancels for each carrier for all PWM flights
PWMCancels <- cancels %>% filter(Origin == "PWM") %>% select(OriginCityName, CarrierName, CountCol)
PWMCarrierCancels <- ddply(PWMCancels, ~CarrierName, summarise, CancelledFlights = sum(CountCol))
output5_2 <- PWMCarrierCancels %>% select(CarrierName, CancelledFlights) %>% arrange(desc(CancelledFlights))
# What airport had the most/least departures, implying the busiest/slowest?
# Totals the number of flights out of each airport
AirportTotFlights <- ddply(flights, ~OriginCityName, summarise, FlightsTotal = sum(CountCol))
# Lists the top 10 busiest airports
output6 <- top_n(AirportTotFlights, 10, FlightsTotal) %>% arrange(desc(FlightsTotal))
# Lists the top 10 least busy airports
output7 <- top_n(AirportTotFlights, -10, FlightsTotal) %>% arrange(FlightsTotal)
# Graph total domestic flights by day of the week: What day of the week has the most air traffic?
library(ggvis)
library(lubridate)
graph1 <- flights %>% select(CarrierName, FlightDate, CountCol) %>% group_by(DayofWeek = wday(FlightDate, label = TRUE))
graph1_2 <- ddply(graph1, ~DayofWeek, summarise, TotFlights = sum(CountCol))
output8 <- ggvis(graph1_2, ~DayofWeek, ~TotFlights) %>% layer_bars() %>% layer_text(text := ~TotFlights, fontSize := 14)
# Line graph of total flights over the course of the month
graph1_3 <- ddply(graph1, ~FlightDate, summarise, TotFlights = sum(CountCol))
output9 <- ggvis(graph1_3, x = ~FlightDate, y = ~TotFlights) %>% layer_lines(stroke := "green") %>% layer_points(fill := "black")
# Line graph of total cancelled flights over the course of the month
graph1_4 <- ddply(cancels, ~FlightDate, summarise, TotFlights = sum(CountCol))
output10 <- ggvis(graph1_4, x = ~FlightDate, y = ~TotFlights) %>% layer_lines(stroke := "red") %>% layer_points(fill := "black")
library(knitr)
Domestic Flight Data
I began by performing the date conversions, that were covered in unit 6, to the entire data set. Directly after, I created a new object called “cancels” that was a copy of “allflights” but only contained data for flights that were cancelled. My new object resulted in 11,665 observations. Next, I re-created all of the calculated columns that we covered - applying them to the original file. I also added in a column called “CountCol” to both the whole data set and my cancels object. I filled this column with the number 1. I was then able to use this column to easily count or sum various aspects of the data. Finally, once all of my date conversions were complete and additional columns added, I created an object titled “flights” which was a copy of the original data less the cancelled flights. The reason why I created “cancels” a few steps prior to “flights” was due to the fact that I filtered out all cancelled flights anyway, when I created the list of new columns.
What I Found Interesting:
In January 2016 there were 4236 commerical planes that took to the sky. On average, each of these planes flew 102 flights over the course of the month, which works out to be approximately 3 flights per day.
Planes tend to be delayed; so I wanted to look at what carriers had the best performance. I found the average flight delay by air carrier by taking the total number of delayed minutes and dividing it by the number of flights made by each carrier and found the following:
Average Delay per Flight by Carrier for all Domestic Flights
Spirit |
16.63 |
JetBlue |
16.22 |
Virgin America |
15.58 |
Skywest Airlines |
13.20 |
United |
11.71 |
Frontier |
10.28 |
Atlantic Southeast Airlines |
9.98 |
American |
9.77 |
Delta |
9.13 |
Southwest |
8.57 |
Alaska |
6.25 |
Hawaiian |
2.70 |
The above table displays the performance of each carrier across the country, but what carrier is most reliable out of Portland International Jetport?
Average Delay per Flight by Carrier for PWM Flights
JetBlue |
4.05 |
Atlantic Southeast Airlines |
9.41 |
Southwest |
11.09 |
American |
15.39 |
Delta |
20.44 |
Next, apart from delays, I wanted to look at total cancellations for the month of January for both domestic flights overall and for Portland specific flights.
Total Cancellations by Carrier for all Domestic Flights
American |
2720 |
Southwest |
2640 |
Atlantic Southeast Airlines |
1427 |
United |
1336 |
Skywest Airlines |
984 |
Delta |
974 |
JetBlue |
897 |
Spirit |
308 |
Virgin America |
159 |
Alaska |
139 |
Frontier |
77 |
Hawaiian |
4 |
Total Cancellations by Carrier for PWM Flights
Atlantic Southeast Airlines |
8 |
JetBlue |
6 |
Southwest |
6 |
American |
2 |
After looking at performance, I was curious as to what the busiest and slowest airports were, based on the number of departures, below are the results. Note that Bangor is one of the least busy airports in the country for commercial flights.
Top 10 Busiest Airports
Atlanta, GA |
29456 |
Chicago, IL |
24786 |
Denver, CO |
17317 |
Dallas/Fort Worth, TX |
16341 |
Los Angeles, CA |
16128 |
Houston, TX |
15967 |
New York, NY |
15031 |
Phoenix, AZ |
12864 |
San Francisco, CA |
12788 |
Las Vegas, NV |
11984 |
Bottom 10 Busiest Airports
St. Augustine, FL |
3 |
Roswell, NM |
4 |
Bangor, ME |
6 |
Adak Island, AK |
9 |
Pago Pago, TT |
10 |
North Bend/Coos Bay, OR |
17 |
Mammoth Lakes, CA |
22 |
Niagara Falls, NY |
29 |
Guam, TT |
31 |
Plattsburgh, NY |
31 |
Finally, I wanted to see what air traffic patterns looked like.
It appears that Tuesdays had the least amount of air traffic. I thought that this was interesting because a lot of people believe that you get a better deal when purchasing plane tickets on Tuesdays - could there be a correlation?
Total Domestic Flights by Day of the Week
After seeing the results of the overall bar chart, I wanted to see if there was a pattern over the course of the month, so I graphed total domestic flights for each day using a line chart. Right away I noticed that a pattern was present, with an exception toward the end of the month where the total flights for the day takes a deep dive.
Total Flights for Each Day of January 2016
When I saw the decrease in flights, I went to Google to try to find an explaination, afterall, the data is from January and I wondered if weather had an impact on air travel. Sure enough, I found the following article Blizzard 2016.
The article states how a significant number of flights were cancelled between Friday January 22nd and Sunday January 24th, which is exactly what the data shows on my cancellation timeline below:
Total Cancellations During January
Knowing now that there were increased cancellations due to weather, it makes sense that the cancellation data is skewed because the carrier is not at fault when the weather is bad. I would like to re-run the cancellation data during a month where weather would not be so much of a problem to better determine who has the best performance.