Load the flights data file

# Read the January 2016 domestic flights CSV, keeping text columns as
# character vectors rather than factors, then print the column layout.
domestic_flight_data <- read.csv(
  file = "domestic_flights_jan_2016.csv",
  stringsAsFactors = FALSE
)
str(domestic_flight_data)
## 'data.frame':    445827 obs. of  21 variables:
##  $ FlightDate       : chr  "1/6/2016" "1/7/2016" "1/8/2016" "1/9/2016" ...
##  $ Carrier          : chr  "AA" "AA" "AA" "AA" ...
##  $ TailNum          : chr  "N4YBAA" "N434AA" "N541AA" "N489AA" ...
##  $ FlightNum        : int  43 43 43 43 43 43 43 43 43 43 ...
##  $ Origin           : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ OriginCityName   : chr  "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" ...
##  $ OriginState      : chr  "TX" "TX" "TX" "TX" ...
##  $ Dest             : chr  "DTW" "DTW" "DTW" "DTW" ...
##  $ DestCityName     : chr  "Detroit, MI" "Detroit, MI" "Detroit, MI" "Detroit, MI" ...
##  $ DestState        : chr  "MI" "MI" "MI" "MI" ...
##  $ CRSDepTime       : int  1100 1100 1100 1100 1100 1100 1100 1100 1100 1100 ...
##  $ DepTime          : int  1057 1056 1055 1102 1240 1107 1059 1055 1058 1056 ...
##  $ WheelsOff        : int  1112 1110 1116 1115 1300 1118 1113 1107 1110 1110 ...
##  $ WheelsOn         : int  1424 1416 1431 1424 1617 1426 1429 1419 1420 1423 ...
##  $ CRSArrTime       : int  1438 1438 1438 1438 1438 1438 1438 1438 1438 1438 ...
##  $ ArrTime          : int  1432 1426 1445 1433 1631 1435 1438 1431 1428 1434 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CRSElapsedTime   : int  158 158 158 158 158 158 158 158 158 158 ...
##  $ ActualElapsedTime: int  155 150 170 151 171 148 159 156 150 158 ...
##  $ Distance         : int  986 986 986 986 986 986 986 986 986 986 ...

I recreated some of the calculations from Unit 6.

# Rebuild the Unit 6 delay calculations on the raw flight data.

# Combine a flight date with an integer HHMM clock time (e.g. 905 -> "0905")
# and parse the result as a POSIXct timestamp in the session time zone.
hhmm_to_posixct <- function(flight_date, hhmm) {
  as.POSIXct(
    paste(flight_date, sprintf("%04d", hhmm)),
    format = "%Y-%m-%d %H%M"
  )
}

# complete.cases() is TRUE for rows without any NA, so this keeps only fully
# populated rows. Negating it would keep only the rows with missing values.
domestic_flights <- domestic_flight_data %>%
  filter(complete.cases(.))
domestic_flights$FlightDate <- as.Date(
  domestic_flights$FlightDate,
  format = "%m/%d/%Y"
)

# Sanity check: largest scheduled and actual departure clock times (HHMM)
# among flights that were not cancelled.
domestic_flights %>%
  filter(Cancelled == 0) %>%
  summarize(maxcrsdep = max(CRSDepTime), maxdep = max(DepTime))
# NOTE(review): the output below was recorded when only rows with missing
# values were kept; re-run to refresh it.
##   maxcrsdep maxdep
## 1      2359   2356

# Drop cancelled flights once, then derive timestamps and departure delays.
# NOTE(review): every timestamp reuses FlightDate, so a departure or arrival
# that falls after midnight is stamped with the previous calendar day --
# confirm whether that matters for the delay figures.
domestic_flights <- domestic_flights %>%
  filter(Cancelled == 0) %>%
  mutate(
    new_CRSDepTime = hhmm_to_posixct(FlightDate, CRSDepTime),
    new_CRSArrTime = hhmm_to_posixct(FlightDate, CRSArrTime),
    new_DepTime = hhmm_to_posixct(FlightDate, DepTime),
    # Actual arrival must come from ArrTime, not DepTime.
    new_ArrTime = hhmm_to_posixct(FlightDate, ArrTime),
    # Signed delay in whole minutes (negative means an early departure).
    DepDelay = as.integer(
      difftime(new_DepTime, new_CRSDepTime, units = "mins")
    ),
    # Early departures count as zero delay.
    DepDelayMinutes = ifelse(DepDelay < 0, 0, DepDelay)
  )

Then I wanted to see which carriers had the most flights and compare that to the distance they flew. I created a bar chart to visualize which airlines covered the greatest total distance in January 2016.

# Group the raw flights by carrier and draw a red bar chart of Distance
# against Carrier.
Carrier_Dist <- group_by(domestic_flight_data, Carrier)
# NOTE(review): stroke is a colour property in ggvis; 20 may have been meant
# for strokeWidth -- confirm the intent.
Carrier_Dist %>%
  ggvis(x = ~Carrier, y = ~Distance) %>%
  layer_bars(fill := "red", stroke := 20)

WN (Southwest) had the most flights, with 104154 observations. VX (Virgin America) had the fewest flights, with 5384 observations. I know Southwest is a popular airline, but I was surprised to find that it had the most domestic flights.

# Tabulate the number of flights per carrier and print it as a table.
# count() is exported by both plyr and dplyr with different interfaces; the
# string-column form that returns a `freq` column is plyr's, so the call is
# qualified to keep it from depending on package attach order.
Carrier_Flights <- plyr::count(domestic_flight_data, "Carrier")
pander(tbl_df(Carrier_Flights))
Carrier freq
AA 75580
AS 14205
B6 23018
DL 69711
EV 41970
F9 7099
HA 6279
NK 11047
OO 47619
UA 39761
VX 5384
WN 104154

I then used the data frame domestic_flights, which I created using the calculations from Unit 6; this decreased the number of observations from 445827 to 864. I then narrowed the data down further to include only Southwest and Virgin America flights.

# Restrict the analysis to Southwest (WN) and Virgin America (VX).
WNVX <- domestic_flights %>%
  filter(Carrier %in% c("WN", "VX"))

# Scatter Distance against DepDelayMinutes for delays strictly between
# `lower` and `upper` minutes, with points filled by carrier and one linear
# model fit per carrier.
plot_delay_window <- function(flights, lower, upper = Inf) {
  flights %>%
    filter(DepDelayMinutes > lower, DepDelayMinutes < upper) %>%
    ggvis(~DepDelayMinutes, ~Distance) %>%
    layer_points(fill = ~Carrier) %>%
    group_by(Carrier) %>%
    layer_model_predictions(model = "lm")
}

# All flights with a delay of more than 0 minutes.
plot_delay_window(WNVX, lower = 0)
## Guessing formula = Distance ~ DepDelayMinutes
# I didn't like the fit of all observations with delays of greater than 0
# minutes, so I made a new plot with delays of greater than 5 minutes.
plot_delay_window(WNVX, lower = 5)
## Guessing formula = Distance ~ DepDelayMinutes
# Again the data seen in the plot seemed to be skewed by short delay and long
# delay times, so I created a delay time range.
plot_delay_window(WNVX, lower = 0, upper = 120)
## Guessing formula = Distance ~ DepDelayMinutes

This gave me a fit that I expected (or wanted) to see. The fit suggests that departure delays are longer when the flight distance is longer.