NYC Flights heatmap

pacman:: p_load(nycflights13)  # load required libraries
pacman:: p_load(RColorBrewer)
pacman:: p_load(tidyverse)
pacman:: p_load(alluvial)
flights_nona <- na.omit (flights)  %>%       # remove observations with NA values
       mutate(inflight_delay = arr_delay - dep_delay) # create in-flight delay column
head(flights_nona)
## # A tibble: 6 x 20
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ... with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, inflight_delay <dbl>
delays <- filter(flights_nona, arr_delay > 0) %>%  # delays dataframe of late arrivals
   group_by (carrier)  %>%                # group by month
   summarize (count = n(),              # creating variables: number of flights to each destination,
              dist = mean (distance),   # the mean distance flown to by each carrier,
              delay = mean(dep_delay + arr_delay)) %>%  # the mean total delay
   arrange(delay)
head(delays)                            # look at the data
## # A tibble: 6 x 4
##   carrier count  dist delay
##   <chr>   <int> <dbl> <dbl>
## 1 US       7349  578.  47.4
## 2 HA         97 4983   61.0
## 3 MQ      11693  583.  65.9
## 4 AS        189 2402   66.2
## 5 DL      16413 1207.  68.5
## 6 UA      22222 1556.  68.7
row.names(delays) <- delays$carrier      # rename the rows according to carrier codes
## Warning: Setting row names on a tibble is deprecated.
delays_mat <- data.matrix(delays)     # convert delays dataframe to a matrix  (required by heatmap)
delays_mat_ <- delays_mat[,2:4]       # remove the redundant column of carrier codes

# Call heatmap using a ColorBrewer color set, margins=c(7,10) for aspect ratio, titles of graph, x and y labels, 
# font size of x and y labels, and set up a RowSideColors bar

varcols = setNames(colorRampPalette(brewer.pal(nrow(delays_mat_), "OrRd"))(nrow(delays_mat_)), 
                   rownames(delays_mat_))   # parameter for RowSideColors
## Warning in brewer.pal(nrow(delays_mat_), "OrRd"): n too large, allowed maximum for palette OrRd is 9
## Returning the palette you asked for with that many colors
heatmap(delays_mat_, 
        Rowv = NA, Colv = NA, 
        col= colorRampPalette(brewer.pal(nrow(delays_mat_), "OrRd"))(nrow(delays_mat_)),
        s=0.6, v=1, scale="column", 
        margins=c(7,6), 
        main = "Delays by Carrier", 
        xlab = "Flight Characteristics", 
        ylab="Carrier", labCol = c("Flights","Distance","Delay"),
        cexCol=1, cexRow =1, RowSideColors = varcols)
## layout: widths =  0.05 0.2 4 , heights =  0.25 4 ; lmat=
##      [,1] [,2] [,3]
## [1,]    0    0    4
## [2,]    3    1    2
## Warning in brewer.pal(nrow(delays_mat_), "OrRd"): n too large, allowed maximum for palette OrRd is 9
## Returning the palette you asked for with that many colors

Delays by Carrier

This diagram displays the average total delay by airline carrier, total delay is the sum of departure delay and arrival delay. The delay column is sorted in descending order. Darker red indicates a greater value with gradients down to pale peach for smaller values. Skywest airlines(OO) has the longest average delay, with the second shortest distance, but it also has the fewest number of flights (10).

Airlines with shorter flights tend to have longer delays, of the seven airlines with average distance less than 1000, six have average delays greater than 60 minutes.

National and larger carriers that have more flights and/or longer distances tend to have shorter delays. But Puerto Rico International Airlines (US) defies the trend with the shortest average delay, average distance of 577.67 and 7349 flights.

carriers <- read.csv("C:/Users/Owner/Desktop/My Documents/School/DataFiles/AirportCodes.csv")
carriers
##    Code                    Airline
## 1    OO                    Skywest
## 2    9E               Endeavor Air
## 3    YV              Mesa Airlines
## 4    EV                 ExpressJet
## 5    F9                   Frontier
## 6    VX                     Virgin
## 7    WN                  Southwest
## 8    FL         Air Large European
## 9    B6                    Jetblue
## 10   AA                   American
## 11   UA                     United
## 12   DL                      Delta
## 13   AS            Alaska Airlines
## 14   MQ                  Envoy Air
## 15   HA          Hawaiian Airlines
## 16   US Puerto Rico Int'l Airlines

Delays by departure airport

I wanted to look at the trend of in flight delays, I created the following two plots by departure airport.

delays <- filter(flights_nona, arr_delay > 0) %>%  # delays dataframe of late arrivals
   group_by (origin,month)  %>%                # group by month
   summarize (count = n(),
              inflight_delay = mean(inflight_delay)) # total inflight delay
## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.
head(delays)                            # look at the data
## # A tibble: 6 x 4
## # Groups:   origin [1]
##   origin month count inflight_delay
##   <chr>  <int> <int>          <dbl>
## 1 EWR        1  4751          7.24 
## 2 EWR        2  3972          4.83 
## 3 EWR        3  4307          1.46 
## 4 EWR        4  4919          6.20 
## 5 EWR        5  4083          0.420
## 6 EWR        6  4573          3.03
alluvial_ts(delays, wave = .3, ygap = 5, col = c("#551C3E", "#18440E", "#873D99"), plotdir = 'centred', alpha=.9,
            grid = TRUE, grid.lwd = 5, xmargin = 0.2, lab.cex = .7, xlab = "2013",
            ylab = 'Departure Airport', border = NA, axis.cex = .8, leg.cex = .7,
            leg.col='white', 
            title = "Average In-flight delay minutes by month")

It looks like Newark has more delays for most of the year, and La Guardia the least. Summer 2013 wasn’t a good time to travel from JFK. And September had the least delays. I thought I’d see more differences based on the season, and there doesn’t seem to be a a very large difference between the airports.

delaysb <- filter(flights_nona, arr_delay > 0)   # delays dataframe of late arrivals
delaysb$month <- month.abb[delaysb$month]
delaysb$month <- factor(delaysb$month, month.abb)

dplot <- delaysb %>%
         ggplot() +
         geom_bar(aes(x=month, fill=origin),
            position = "dodge") +
         labs(x = "2013", y = "# Delays", title = "NYC Delays by Origin Airport") +
         scale_fill_discrete(name = "Airport", labels = c("Newark", "JFK","La Guardia"))

dplot

This plot provides another view of the delays from originating airport, it also shows the dip in delays in September 2013. For most of the year La Guardia is the best bet, with JFK being a bit better in the Fall, and Newark was not the best choice.