pacman:: p_load(nycflights13) # load required libraries
pacman:: p_load(RColorBrewer)
pacman:: p_load(tidyverse)
pacman:: p_load(alluvial)
flights_nona <- na.omit (flights) %>% # remove observations with NA values
mutate(inflight_delay = arr_delay - dep_delay) # create in-flight delay column
head(flights_nona)
## # A tibble: 6 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ... with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, inflight_delay <dbl>
delays <- filter(flights_nona, arr_delay > 0) %>% # delays dataframe of late arrivals
group_by (carrier) %>% # group by month
summarize (count = n(), # creating variables: number of flights to each destination,
dist = mean (distance), # the mean distance flown to by each carrier,
delay = mean(dep_delay + arr_delay)) %>% # the mean total delay
arrange(delay)
head(delays) # look at the data
## # A tibble: 6 x 4
## carrier count dist delay
## <chr> <int> <dbl> <dbl>
## 1 US 7349 578. 47.4
## 2 HA 97 4983 61.0
## 3 MQ 11693 583. 65.9
## 4 AS 189 2402 66.2
## 5 DL 16413 1207. 68.5
## 6 UA 22222 1556. 68.7
row.names(delays) <- delays$carrier # rename the rows according to carrier codes
## Warning: Setting row names on a tibble is deprecated.
delays_mat <- data.matrix(delays) # convert delays dataframe to a matrix (required by heatmap)
delays_mat_ <- delays_mat[,2:4] # remove the redundant column of carrier codes
# Call heatmap using a ColorBrewer color set, margins=c(7,10) for aspect ratio, titles of graph, x and y labels,
# font size of x and y labels, and set up a RowSideColors bar
varcols = setNames(colorRampPalette(brewer.pal(nrow(delays_mat_), "OrRd"))(nrow(delays_mat_)),
rownames(delays_mat_)) # parameter for RowSideColors
## Warning in brewer.pal(nrow(delays_mat_), "OrRd"): n too large, allowed maximum for palette OrRd is 9
## Returning the palette you asked for with that many colors
heatmap(delays_mat_,
Rowv = NA, Colv = NA,
col= colorRampPalette(brewer.pal(nrow(delays_mat_), "OrRd"))(nrow(delays_mat_)),
s=0.6, v=1, scale="column",
margins=c(7,6),
main = "Delays by Carrier",
xlab = "Flight Characteristics",
ylab="Carrier", labCol = c("Flights","Distance","Delay"),
cexCol=1, cexRow =1, RowSideColors = varcols)
## layout: widths = 0.05 0.2 4 , heights = 0.25 4 ; lmat=
## [,1] [,2] [,3]
## [1,] 0 0 4
## [2,] 3 1 2
## Warning in brewer.pal(nrow(delays_mat_), "OrRd"): n too large, allowed maximum for palette OrRd is 9
## Returning the palette you asked for with that many colors
This diagram displays the average total delay by airline carrier, total delay is the sum of departure delay and arrival delay. The delay column is sorted in descending order. Darker red indicates a greater value with gradients down to pale peach for smaller values. Skywest airlines(OO) has the longest average delay, with the second shortest distance, but it also has the fewest number of flights (10).
Airlines with shorter flights tend to have longer delays, of the seven airlines with average distance less than 1000, six have average delays greater than 60 minutes.
National and larger carriers that have more flights and/or longer distances tend to have shorter delays. But Puerto Rico International Airlines (US) defies the trend with the shortest average delay, average distance of 577.67 and 7349 flights.
carriers <- read.csv("C:/Users/Owner/Desktop/My Documents/School/DataFiles/AirportCodes.csv")
carriers
## Code Airline
## 1 OO Skywest
## 2 9E Endeavor Air
## 3 YV Mesa Airlines
## 4 EV ExpressJet
## 5 F9 Frontier
## 6 VX Virgin
## 7 WN Southwest
## 8 FL Air Large European
## 9 B6 Jetblue
## 10 AA American
## 11 UA United
## 12 DL Delta
## 13 AS Alaska Airlines
## 14 MQ Envoy Air
## 15 HA Hawaiian Airlines
## 16 US Puerto Rico Int'l Airlines
I wanted to look at the trend of in flight delays, I created the following two plots by departure airport.
delays <- filter(flights_nona, arr_delay > 0) %>% # delays dataframe of late arrivals
group_by (origin,month) %>% # group by month
summarize (count = n(),
inflight_delay = mean(inflight_delay)) # total inflight delay
## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.
head(delays) # look at the data
## # A tibble: 6 x 4
## # Groups: origin [1]
## origin month count inflight_delay
## <chr> <int> <int> <dbl>
## 1 EWR 1 4751 7.24
## 2 EWR 2 3972 4.83
## 3 EWR 3 4307 1.46
## 4 EWR 4 4919 6.20
## 5 EWR 5 4083 0.420
## 6 EWR 6 4573 3.03
alluvial_ts(delays, wave = .3, ygap = 5, col = c("#551C3E", "#18440E", "#873D99"), plotdir = 'centred', alpha=.9,
grid = TRUE, grid.lwd = 5, xmargin = 0.2, lab.cex = .7, xlab = "2013",
ylab = 'Departure Airport', border = NA, axis.cex = .8, leg.cex = .7,
leg.col='white',
title = "Average In-flight delay minutes by month")
It looks like Newark has more delays for most of the year, and La Guardia the least. Summer 2013 wasn’t a good time to travel from JFK. And September had the least delays. I thought I’d see more differences based on the season, and there doesn’t seem to be a a very large difference between the airports.
delaysb <- filter(flights_nona, arr_delay > 0) # delays dataframe of late arrivals
delaysb$month <- month.abb[delaysb$month]
delaysb$month <- factor(delaysb$month, month.abb)
dplot <- delaysb %>%
ggplot() +
geom_bar(aes(x=month, fill=origin),
position = "dodge") +
labs(x = "2013", y = "# Delays", title = "NYC Delays by Origin Airport") +
scale_fill_discrete(name = "Airport", labels = c("Newark", "JFK","La Guardia"))
dplot
This plot provides another view of the delays from originating airport, it also shows the dip in delays in September 2013. For most of the year La Guardia is the best bet, with JFK being a bit better in the Fall, and Newark was not the best choice.