# Load the January 2016 domestic flight records, keeping text columns as
# character vectors rather than factors.
domestic_flight_data <- read.csv(
  "domestic_flights_jan_2016.csv",
  stringsAsFactors = FALSE
)

# Print the column names, types and first few values of the loaded data.
str(domestic_flight_data)
## 'data.frame': 445827 obs. of 21 variables:
## $ FlightDate : chr "1/6/2016" "1/7/2016" "1/8/2016" "1/9/2016" ...
## $ Carrier : chr "AA" "AA" "AA" "AA" ...
## $ TailNum : chr "N4YBAA" "N434AA" "N541AA" "N489AA" ...
## $ FlightNum : int 43 43 43 43 43 43 43 43 43 43 ...
## $ Origin : chr "DFW" "DFW" "DFW" "DFW" ...
## $ OriginCityName : chr "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" "Dallas/Fort Worth, TX" ...
## $ OriginState : chr "TX" "TX" "TX" "TX" ...
## $ Dest : chr "DTW" "DTW" "DTW" "DTW" ...
## $ DestCityName : chr "Detroit, MI" "Detroit, MI" "Detroit, MI" "Detroit, MI" ...
## $ DestState : chr "MI" "MI" "MI" "MI" ...
## $ CRSDepTime : int 1100 1100 1100 1100 1100 1100 1100 1100 1100 1100 ...
## $ DepTime : int 1057 1056 1055 1102 1240 1107 1059 1055 1058 1056 ...
## $ WheelsOff : int 1112 1110 1116 1115 1300 1118 1113 1107 1110 1110 ...
## $ WheelsOn : int 1424 1416 1431 1424 1617 1426 1429 1419 1420 1423 ...
## $ CRSArrTime : int 1438 1438 1438 1438 1438 1438 1438 1438 1438 1438 ...
## $ ArrTime : int 1432 1426 1445 1433 1631 1435 1438 1431 1428 1434 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CRSElapsedTime : int 158 158 158 158 158 158 158 158 158 158 ...
## $ ActualElapsedTime: int 155 150 170 151 171 148 159 156 150 158 ...
## $ Distance : int 986 986 986 986 986 986 986 986 986 986 ...
# Keep only the rows that have no missing values in any column.
# The test used to be negated (`!complete.cases(.)`), which kept *only* the
# rows containing an NA and discarded every complete record. Any figures or
# printed output recorded from that run must be regenerated.
domestic_flights <- domestic_flight_data %>%
  filter(complete.cases(.))
# Convert FlightDate from month/day/year text (e.g. "1/6/2016") to Date.
domestic_flights <- domestic_flights %>%
  mutate(FlightDate = as.Date(FlightDate, format = "%m/%d/%Y"))

# Report the latest scheduled and actual departure clock times among
# flights that were not cancelled.
domestic_flights %>%
  filter(Cancelled == 0) %>%
  summarise(
    maxcrsdep = max(CRSDepTime),
    maxdep = max(DepTime)
  )
## maxcrsdep maxdep
## 1 2359 2356
# Combine a flight date with an integer clock time in HHMM form (e.g. 905 for
# 09:05) into a POSIXct timestamp. The clock time is zero-padded to four
# digits so that it matches the "%H%M" format.
parse_flight_time <- function(flight_date, hhmm) {
  as.POSIXct(
    paste(flight_date, sprintf("%04d", hhmm)),
    format = "%Y-%m-%d %H%M"
  )
}

# Drop cancelled flights once, then derive:
#   new_CRSDepTime / new_CRSArrTime - scheduled departure / arrival timestamps
#   new_DepTime / new_ArrTime       - actual departure / arrival timestamps
#   DepDelay                        - actual minus scheduled departure, in
#                                     whole minutes (negative when early)
#   DepDelayMinutes                 - DepDelay with early departures set to 0
# NOTE(review): every timestamp is anchored to FlightDate, so a departure or
# arrival that rolls past midnight is not moved to the following day.
domestic_flights <- domestic_flights %>%
  filter(Cancelled == 0) %>%
  mutate(
    new_CRSDepTime = parse_flight_time(FlightDate, CRSDepTime),
    new_CRSArrTime = parse_flight_time(FlightDate, CRSArrTime),
    new_DepTime = parse_flight_time(FlightDate, DepTime),
    # The actual arrival must be built from ArrTime; it was previously built
    # from DepTime, which made new_ArrTime a copy of new_DepTime.
    new_ArrTime = parse_flight_time(FlightDate, ArrTime),
    DepDelay = as.integer(
      difftime(new_DepTime, new_CRSDepTime, units = "mins")
    ),
    DepDelayMinutes = pmax(DepDelay, 0)
  )
Next, I wanted to see which carriers had the most flights and compare that with the distance flown. I created a bar chart to visualize which airlines traveled the greatest distance in January 2016.
# Group the full flight data by carrier, then draw Distance against Carrier
# as red bars.
Carrier_Dist <- group_by(domestic_flight_data, Carrier)

Carrier_Dist %>%
  ggvis(~Carrier, ~Distance) %>%
  layer_bars(fill := "red", stroke := 20)
WN (Southwest) had the most flights, with 104154 observations. VX (Virgin America) had the fewest flights, with 5384 observations. I know Southwest is a popular airline, but I was surprised to find that it had the most domestic flights.
# Count the number of flights per carrier.
# plyr::count() is named explicitly because it takes the column name as a
# string and returns the tally in a `freq` column. A bare count() resolves to
# dplyr::count() whenever dplyr is attached after plyr, and that function
# would treat the string "Carrier" as a constant instead of a column.
Carrier_Flights <- plyr::count(domestic_flight_data, "Carrier")

# Render the tally as a table; as_tibble() replaces the deprecated tbl_df().
pander(dplyr::as_tibble(Carrier_Flights))
| Carrier | freq |
|---|---|
| AA | 75580 |
| AS | 14205 |
| B6 | 23018 |
| DL | 69711 |
| EV | 41970 |
| F9 | 7099 |
| HA | 6279 |
| NK | 11047 |
| OO | 47619 |
| UA | 39761 |
| VX | 5384 |
| WN | 104154 |
I then used the data frame domestic_flights, which I created using the calculations from Unit 6; this reduced the number of observations from 445827 to 864. I then narrowed the data further to include only Southwest and Virgin America flights.
# Restrict the cleaned flight data to the WN and VX carriers.
WNVX <- domestic_flights %>%
  filter(Carrier %in% c("WN", "VX"))

# Scatter Distance against DepDelayMinutes for flights with a positive
# departure delay, colouring points by carrier and overlaying one linear-model
# fit per carrier.
WNVX %>%
  ggvis(~DepDelayMinutes, ~Distance) %>%
  filter(DepDelayMinutes > 0) %>%
  layer_points(fill = ~Carrier) %>%
  group_by(Carrier) %>%
  layer_model_predictions(model = "lm")
## Guessing formula = Distance ~ DepDelayMinutes
# I didn't like the fit when using all observations with delays greater than 0 minutes, so I made a new plot using only delays greater than 5 minutes.
# Same scatter plot and per-carrier linear fits, limited to departure delays
# of more than 5 minutes.
WNVX %>%
  ggvis(~DepDelayMinutes, ~Distance) %>%
  filter(DepDelayMinutes > 5) %>%
  layer_points(fill = ~Carrier) %>%
  group_by(Carrier) %>%
  layer_model_predictions(model = "lm")
## Guessing formula = Distance ~ DepDelayMinutes
# Again, the plot seemed to be skewed by very short and very long delays, so I restricted the delays to a range of more than 0 and less than 120 minutes.
# Same scatter plot and per-carrier linear fits, limited to departure delays
# strictly between 0 and 120 minutes.
WNVX %>%
  ggvis(~DepDelayMinutes, ~Distance) %>%
  filter(
    DepDelayMinutes > 0,
    DepDelayMinutes < 120
  ) %>%
  layer_points(fill = ~Carrier) %>%
  group_by(Carrier) %>%
  layer_model_predictions(model = "lm")
## Guessing formula = Distance ~ DepDelayMinutes