library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Open the flights table in the data viewer for a quick visual check.
tibble::view(flights)
# Print per-column summary statistics (n, mean, sd, median, ...) from psych.
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA
library(dplyr)
# Copy the flights tibble into a plain base-R data frame and preview it.
data <- data.frame(flights)
head(data, n = 6L)
##   year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 2013     1   1      517            515         2      830            819
## 2 2013     1   1      533            529         4      850            830
## 3 2013     1   1      542            540         2      923            850
## 4 2013     1   1      544            545        -1     1004           1022
## 5 2013     1   1      554            600        -6      812            837
## 6 2013     1   1      554            558        -4      740            728
##   arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1        11      UA   1545  N14228    EWR  IAH      227     1400    5     15
## 2        20      UA   1714  N24211    LGA  IAH      227     1416    5     29
## 3        33      AA   1141  N619AA    JFK  MIA      160     1089    5     40
## 4       -18      B6    725  N804JB    JFK  BQN      183     1576    5     45
## 5       -25      DL    461  N668DN    LGA  ATL      116      762    6      0
## 6        12      UA   1696  N39463    EWR  ORD      150      719    5     58
##             time_hour
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
# Keep only the flights that have no missing value in any column.
flights_nona <- flights %>% na.omit()
# Narrow the data frame to the distance / air time / arrival delay /
# destination columns and preview the first rows.
data_variables <- select(data, distance, air_time, arr_delay, dest)
head(data_variables, n = 6L)
##   distance air_time arr_delay dest
## 1     1400      227        11  IAH
## 2     1416      227        20  IAH
## 3     1089      160        33  MIA
## 4     1576      183       -18  BQN
## 5      762      116       -25  ATL
## 6      719      150        12  ORD
# Per destination airport: mean departure delay, mean arrival delay and mean
# distance, computed on the NA-free flights.
data2 <- flights_nona %>%
  group_by(dest) %>%
  summarize(depdelay = mean(dep_delay),
            arrdelay = mean(arr_delay),
            dist = mean(distance))
# Order destinations from the longest to the shortest mean distance.
# Tibbles do not support row names (assigning them is deprecated), so convert
# to a base data frame before labelling each row with its destination code.
# The `dest` column is kept so the column positions stay unchanged.
data3 <- data2 %>%
  arrange(desc(dist)) %>%
  as.data.frame()
rownames(data3) <- data3$dest
# Convert the per-destination summary to a numeric matrix (row names carry
# over) and keep only the three measures that the heatmap displays.
flights_matrix <- data.matrix(data3)
flights_mat <- flights_matrix[, c("depdelay", "arrdelay", "dist")]
library(RColorBrewer)
# One colour per destination row, interpolated from the "RdPu" palette.
# "RdPu" provides at most 9 classes, so request exactly 9 and let
# colorRampPalette() stretch them to nrow(flights_mat) colours; asking
# brewer.pal() for more than 9 only triggers a warning and falls back to 9.
cols <- setNames(
  colorRampPalette(brewer.pal(9, "RdPu"))(nrow(flights_mat)),
  rownames(flights_mat)
)
# Heatmap of the per-destination averages. Rows keep the order of flights_mat
# and no dendrograms are drawn (Rowv = NA, Colv = NA); values are scaled
# within each column so the three measures share one colour ramp.
# The former `s = 0.6, v = 1` arguments were dropped: heatmap() has no such
# parameters, so they partially matched `symm` and `verbose`, and the latter
# caused the layout matrix to be printed to the console.
# The colour ramp is built from the 9 classes that "RdPu" actually provides.
heatmap(flights_mat,
        Rowv = NA,
        Colv = NA,
        col = colorRampPalette(brewer.pal(9, "RdPu"))(nrow(flights_mat)),
        scale = "column",
        margins = c(7, 10),
        main = "Shorter Flights mean Longer Delays",
        xlab = "Distance and Delays",
        ylab = "Destination Airport",
        labCol = c("Dep Delay", "Arr Delay", "Distance"),
        cexCol = 1, cexRow = 1,
        RowSideColors = cols)
# Colour key: sample the low end, midpoint and high end of the 9-class "RdPu"
# palette, the same classes the heatmap colour ramp is interpolated from, so
# the "max" swatch matches the darkest heatmap colour.
legend(x = "bottomright", legend = c("min", "ave", "max"),
       fill = colorRampPalette(brewer.pal(9, "RdPu"))(3))

Reflection

In my visualization, I wanted to see whether or not the length of a flight affected how much that flight suffered delays (both arrival and departure delays). I hypothesized that longer flights would be delayed more often, since there might be more preparation before the flight involved (a longer flight might mean a longer cleaning of the plane beforehand, check on fuel, etc). To test this hypothesis, I made a heatmap that visualizes the delays that the flights experienced depending on how much distance was traveled. However, I was surprised to learn that the results were actually more opposite to my hypothesis. Longer distance flights had less delays, not more. I looked into this more to find out why, and actually discovered a couple really interesting reasons. For one, flights are a lot less likely to get delayed if the airport is less crowded, and airports are less crowded in the early morning and at night. Adding onto that, longer distance flights (also referred to as ‘long-haul’ flights) are more likely to fly through the night, for reasons such as lengthy travel and numerous time zone changes. As an individual that annually takes 16 hour flights to Pakistan, I realized that, yes, our flights always took off in the very early morning or in the evening. So, because long-haul flights are flown mostly at night, and airports are less likely to suffer delays at that time, longer flights actually mean less delays.

SOURCES: https://www.cntraveler.com/stories/2013-02-25/planes-night-flights-

https://claimflights.com/why-are-so-many-flights-delayed-today/