# Load Libraries
library(dplyr)
library(plyr)
# Load File
allflights <- read.csv("domestic_flights_jan_2016.csv", header = TRUE, sep = "," )
# Convert/format Dates and Create new date/time columns

# Convert flight date format
allflights$FlightDate <- as.Date(allflights$FlightDate, format = "%m/%d/%Y")

# Scheduled Departure Time, Scheduled Arrival Time: Leave in Cancellations
   # Add leading 0 to three dight scheduled departure dates; create new scheduled departure date/time column
allflights <- allflights %>% mutate(new_CRSDepTime = paste(FlightDate, sprintf("%04d", CRSDepTime)), new_CRSArrTime = paste(FlightDate, sprintf("%04d", CRSArrTime)))
# Re-format new date/time columns
allflights$new_CRSDepTime <- as.POSIXct(allflights$new_CRSDepTime, format = "%Y-%m-%d %H%M")
allflights$new_CRSArrTime <- as.POSIXct(allflights$new_CRSArrTime, format="%Y-%m-%d %H%M")
# Create object for all cancelled flights before filtering out cancels
cancels <- allflights %>% filter(Cancelled == 1)
# Departure Time, Wheels Off, Wheels On, Arrival Time: Leave out Cancellations
   # Add leading 0 to three dight scheduled departure dates; create new date/time columns
allflights <- allflights %>% filter(Cancelled == 0) %>% mutate(new_DepTime = paste(FlightDate, sprintf("%04d", DepTime)), new_WheelsOff = paste(FlightDate, sprintf("%04d", WheelsOff)), new_WheelsOn = paste(FlightDate, sprintf("%04d", WheelsOn)), new_ArrTime = paste(FlightDate, sprintf("%04d", ArrTime)))
# Re-format new date/time columns
allflights$new_DepTime <- as.POSIXct(allflights$new_DepTime, format="%Y-%m-%d %H%M")
allflights$new_WheelsOff <- as.POSIXct(allflights$new_WheelsOff, format="%Y-%m-%d %H%M")
allflights$new_WheelsOn <- as.POSIXct(allflights$new_WheelsOn, format="%Y-%m-%d %H%M")
allflights$new_ArrTime <- as.POSIXct(allflights$new_ArrTime, format="%Y-%m-%d %H%M")
# Create new Calculated Columns
allflights <- allflights %>% filter(Cancelled == 0) %>% 
  mutate(DepDelay = as.integer(difftime(new_DepTime, new_CRSDepTime, units = "mins")), 
         DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay), 
         DepDel15 = ifelse(DepDelay >= 15, 1, 0), 
         TaxiOut = as.integer(difftime(new_WheelsOff, new_DepTime, units = "mins")), 
         TaxiIn = as.integer(difftime(new_ArrTime, new_WheelsOn, units = "mins")), 
         ArrDelay = as.integer(difftime(new_ArrTime, new_CRSArrTime, units = "mins")), 
         ArrDelayMinutes = ifelse(ArrDelay < 0, 0, ArrDelay), 
         ArrDel15 = ifelse(ArrDelay >= 15, 1, 0),         
         AirTime = ActualElapsedTime - TaxiOut - TaxiIn, 
         AirSpeed = Distance/(AirTime/60)) 

# Round air speed to 2 decimal places
allflights <- mutate(allflights, AirSpeed = round(AirSpeed, digits = 2))
# Insert Distinct Carriers Names based on exisiting carrier codes by joining the data with a new external file
carrier_labels <- read.csv("Carrier_Labels.csv", header = TRUE, sep = ",")

colnames(allflights)[2] <- "CarrierCode"
colnames(cancels)[2] <- "CarrierCode"

allflights <- inner_join(allflights, carrier_labels, by = "CarrierCode")
cancels <- inner_join(cancels, carrier_labels, by = "CarrierCode")

# Insert counting column filled with 1 - allows for counting flights
allflights$CountCol <- c(1)
cancels$CountCol <- c(1)
# Create object for non cancelled flights
flights <- allflights %>% filter(Cancelled == 0)
# How many planes were used in Jan. 2016 and how many flights did they make on average?

# Count planes by unique tail numbers then total the number of flights
numPlanes <- n_distinct(flights$TailNum)
numFlights <- sum(flights$CountCol)

# On average how many flights did each plane take during the month and each day?
output1 <- round(numFlights/numPlanes, digits = 0)
output2 <- round((numFlights/numPlanes)/30, digits = 0)
# What is the most overall on-time carrier?

   # Calculate total delay minutes by carrier
CarrierTotDelay <- ddply(flights, ~CarrierName, summarise, DelayTotal = sum(DepDelayMinutes)) 

   # Calculate total flights by carrier
CarrierTotFlights <-  ddply(flights, ~CarrierName, summarise, FlightsTotal = sum(CountCol)) 

   # Join the previous two tables and insert an average delay per flight by carrier
CPerformance <- inner_join(CarrierTotFlights, CarrierTotDelay, by = "CarrierName")
CPerformance$AvgFlightDelay <- c(round((CPerformance$DelayTotal/CPerformance$FlightsTotal), digits = 2))

# Overall average flight delays in minutes for all carriers
output3 <- CPerformance %>% select(CarrierName, AvgFlightDelay) %>% arrange(desc(AvgFlightDelay))

# What is the most on-time carrier out of PWM?

   # Filters PWM data out of full list and performs same fuctions as above
PWMDelays <- flights %>%  filter(Origin == "PWM") %>% select(CarrierName, CountCol, DepDelayMinutes)

PWMCarrierTotDelay <- ddply(PWMDelays, ~CarrierName, summarise, DelayTotal = sum(DepDelayMinutes))
PWMCarrierTotFlights <- ddply(PWMDelays, ~CarrierName, summarise, FlightsTotal = sum(CountCol))

PWMPerformance <- inner_join(PWMCarrierTotFlights, PWMCarrierTotDelay, by = "CarrierName")
PWMPerformance$AvgFlightDelay <- c(round((PWMPerformance$DelayTotal/PWMPerformance$FlightsTotal), digits = 2))

output4 <- PWMPerformance %>% select(CarrierName, AvgFlightDelay) %>% arrange(AvgFlightDelay)
# What Carriers have the most cancels?

   # Totals the number of cancels for each carrier for all domestic flights
CarrierCancels <- ddply(cancels, ~CarrierName, summarise, CancelledFlights = sum(CountCol))

output5 <- CarrierCancels %>% select(CarrierName, CancelledFlights) %>% arrange(desc(CancelledFlights))

   # Totals the number of cancels for each carrier for all PWM flights
PWMCancels <- cancels %>%  filter(Origin == "PWM") %>% select(OriginCityName, CarrierName, CountCol)
PWMCarrierCancels <- ddply(PWMCancels, ~CarrierName, summarise, CancelledFlights = sum(CountCol))

output5_2 <- PWMCarrierCancels %>% select(CarrierName, CancelledFlights) %>% arrange(desc(CancelledFlights))
# What airport had the most/least departures, implying the busiest/slowest?

   # Totals the number of flights out of each airport
AirportTotFlights <-  ddply(flights, ~OriginCityName, summarise, FlightsTotal = sum(CountCol))
   # Lists the top 10 busiest airports
output6 <-  top_n(AirportTotFlights, 10, FlightsTotal) %>% arrange(desc(FlightsTotal))
   # Lists the top 10 least busy airports
output7 <- top_n(AirportTotFlights, -10, FlightsTotal) %>% arrange(FlightsTotal)
# Graph total domestic flights by day of the week: What day of the week has the most air traffic?

library(ggvis)
library(lubridate)

graph1 <- flights %>% select(CarrierName, FlightDate, CountCol) %>%  group_by(DayofWeek = wday(FlightDate, label = TRUE)) 
graph1_2 <- ddply(graph1, ~DayofWeek, summarise, TotFlights = sum(CountCol))
output8 <- ggvis(graph1_2, ~DayofWeek, ~TotFlights) %>% layer_bars() %>% layer_text(text := ~TotFlights, fontSize := 14)

# Line graph of total flights over the course of the month
graph1_3 <- ddply(graph1, ~FlightDate, summarise, TotFlights = sum(CountCol))
output9 <- ggvis(graph1_3, x = ~FlightDate, y = ~TotFlights) %>% layer_lines(stroke := "green") %>% layer_points(fill := "black") 

# Line graph of total cancelled flights over the course of the month
graph1_4 <- ddply(cancels, ~FlightDate, summarise, TotFlights = sum(CountCol))
output10 <- ggvis(graph1_4, x = ~FlightDate, y = ~TotFlights) %>% layer_lines(stroke := "red") %>% layer_points(fill := "black")
library(knitr)

Domestic Flight Data

I began by performing the date conversions, that were covered in unit 6, to the entire data set. Directly after, I created a new object called “cancels” that was a copy of “allflights” but only contained data for flights that were cancelled. My new object resulted in 11,665 observations. Next, I re-created all of the calculated columns that we covered - applying them to the original file. I also added in a column called “CountCol” to both the whole data set and my cancels object. I filled this column with the number 1. I was then able to use this column to easily count or sum various aspects of the data. Finally, once all of my date conversions were complete and additional columns added, I created an object titled “flights” which was a copy of the original data less the cancelled flights. The reason why I created “cancels” a few steps prior to “flights” was due to the fact that I filtered out all cancelled flights anyway, when I created the list of new columns.

What I Found Interesting:

In January 2016 there were 4236 commerical planes that took to the sky. On average, each of these planes flew 102 flights over the course of the month, which works out to be approximately 3 flights per day.
Planes tend to be delayed; so I wanted to look at what carriers had the best performance. I found the average flight delay by air carrier by taking the total number of delayed minutes and dividing it by the number of flights made by each carrier and found the following:

Average Delay per Flight by Carrier for all Domestic Flights

CarrierName AvgFlightDelay
Spirit 16.63
JetBlue 16.22
Virgin America 15.58
Skywest Airlines 13.20
United 11.71
Frontier 10.28
Atlantic Southeast Airlines 9.98
American 9.77
Delta 9.13
Southwest 8.57
Alaska 6.25
Hawaiian 2.70
The above table displays the performance of each carrier across the country, but what carrier is most reliable out of Portland International Jetport?

Average Delay per Flight by Carrier for PWM Flights

CarrierName AvgFlightDelay
JetBlue 4.05
Atlantic Southeast Airlines 9.41
Southwest 11.09
American 15.39
Delta 20.44
Next, apart from delays, I wanted to look at total cancellations for the month of January for both domestic flights overall and for Portland specific flights.

Total Cancellations by Carrier for all Domestic Flights

CarrierName CancelledFlights
American 2720
Southwest 2640
Atlantic Southeast Airlines 1427
United 1336
Skywest Airlines 984
Delta 974
JetBlue 897
Spirit 308
Virgin America 159
Alaska 139
Frontier 77
Hawaiian 4

Total Cancellations by Carrier for PWM Flights

CarrierName CancelledFlights
Atlantic Southeast Airlines 8
JetBlue 6
Southwest 6
American 2
After looking at performance, I was curious as to what the busiest and slowest airports were, based on the number of departures, below are the results. Note that Bangor is one of the least busy airports in the country for commercial flights.

Top 10 Busiest Airports

OriginCityName FlightsTotal
Atlanta, GA 29456
Chicago, IL 24786
Denver, CO 17317
Dallas/Fort Worth, TX 16341
Los Angeles, CA 16128
Houston, TX 15967
New York, NY 15031
Phoenix, AZ 12864
San Francisco, CA 12788
Las Vegas, NV 11984

Bottom 10 Busiest Airports

OriginCityName FlightsTotal
St. Augustine, FL 3
Roswell, NM 4
Bangor, ME 6
Adak Island, AK 9
Pago Pago, TT 10
North Bend/Coos Bay, OR 17
Mammoth Lakes, CA 22
Niagara Falls, NY 29
Guam, TT 31
Plattsburgh, NY 31
Finally, I wanted to see what air traffic patterns looked like.
First, I created a bar chart that showed total domestic flights for each day of the week. I attempted to add labels to the bars to more easily tell what the value of each bar was, however I could not find a way to center the label; if anyone has found a way to do this, you comment would be greatly appreciated!
It appears that Tuesdays had the least amount of air traffic. I thought that this was interesting because a lot of people believe that you get a better deal when purchasing plane tickets on Tuesdays - could there be a correlation?

Total Domestic Flights by Day of the Week

After seeing the results of the overall bar chart, I wanted to see if there was a pattern over the course of the month, so I graphed total domestic flights for each day using a line chart. Right away I noticed that a pattern was present, with an exception toward the end of the month where the total flights for the day takes a deep dive.
Note that the y-scale does not begin at 0. Again, this is a detail that I could not figure out how to change; if anyone knows how to adjust the starting point of the axes please comment. Thanks!

Total Flights for Each Day of January 2016

When I saw the decrease in flights, I went to Google to try to find an explaination, afterall, the data is from January and I wondered if weather had an impact on air travel. Sure enough, I found the following article Blizzard 2016.
The article states how a significant number of flights were cancelled between Friday January 22nd and Sunday January 24th, which is exactly what the data shows on my cancellation timeline below:

Total Cancellations During January

Knowing now that there were increased cancellations due to weather, it makes sense that the cancellation data is skewed because the carrier is not at fault when the weather is bad. I would like to re-run the cancellation data during a month where weather would not be so much of a problem to better determine who has the best performance.