library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Quick look at the raw data: open the flights table in the data viewer, then
# print psych's per-column descriptive statistics for it.
tibble::view(flights)
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
library(dplyr)
# Copy the flights tibble into a base data.frame and preview its first rows.
# NOTE: the name `data` shadows utils::data(); it is kept because the
# column-selection step further down refers to it.
data <- data.frame(flights)
head(data)
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## time_hour
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
# Drop every flight that has a missing value in any column, so the means
# computed later never see an NA.
flights_nona <- na.omit(flights)

# Narrow the data.frame copy down to the distance, air-time, arrival-delay and
# destination columns and preview the result.
data_variables <- data %>%
  select(distance, air_time, arr_delay, dest)
head(data_variables)
## distance air_time arr_delay dest
## 1 1400 227 11 IAH
## 2 1416 227 20 IAH
## 3 1089 160 33 MIA
## 4 1576 183 -18 BQN
## 5 762 116 -25 ATL
## 6 719 150 12 ORD
# One row per destination airport: mean departure delay, mean arrival delay
# and mean distance over the NA-free flights.
data2 <- summarise(
  group_by(flights_nona, dest),
  depdelay = mean(dep_delay),
  arrdelay = mean(arr_delay),
  dist = mean(distance)
)
# Order destinations from the longest to the shortest mean distance; this is
# the row order the heatmap will display.
data3 <- data2 %>%
  arrange(desc(dist))

# Convert to a numeric matrix and label the rows with the destination codes.
# Row names are set on the matrix rather than on the tibble, because setting
# row names on a tibble is deprecated and raises a warning.
flights_matrix <- data.matrix(data3)
rownames(flights_matrix) <- data3$dest

# Keep only the three summary columns for plotting; selecting by name (rather
# than by position 2:4) stays correct if the column order ever changes.
flights_mat <- flights_matrix[, c("depdelay", "arrdelay", "dist")]
library(RColorBrewer)
# One side-bar colour per destination, named by destination code.
# The "RdPu" palette has at most 9 classes, so request exactly 9 and let
# colorRampPalette() interpolate up to one colour per row. Asking brewer.pal()
# for nrow(flights_mat) colours returned the same 9 but emitted a warning.
cols <- setNames(
  colorRampPalette(brewer.pal(9, "RdPu"))(nrow(flights_mat)),
  rownames(flights_mat)
)
# Heatmap of mean departure delay, mean arrival delay and mean distance per
# destination. Rows keep the order of flights_mat (no clustering, no
# dendrograms) and each column is scaled independently.
#
# The former `s = 0.6, v = 1` arguments were removed: heatmap() has no such
# parameters, so R partially matched them to `symm` and `verbose`, which only
# produced a layout dump on the console. The cell colours reuse the ramp
# already stored in `cols` (names stripped) instead of calling brewer.pal()
# with more classes than the "RdPu" palette provides.
heatmap(
  flights_mat,
  Rowv = NA,
  Colv = NA,
  col = unname(cols),
  scale = "column",
  margins = c(7, 10),
  main = "Shorter Flights mean Longer Delays",
  xlab = "Distance and Delays",
  ylab = "Destination Airport",
  labCol = c("Dep Delay", "Arr Delay", "Distance"),
  cexCol = 1,
  cexRow = 1,
  RowSideColors = cols
)
# Three-step legend (low / middle / high end of the colour ramp). It is built
# from the 9-class "RdPu" palette, the same one the heatmap ramp is
# interpolated from, so the "max" swatch matches the darkest heatmap colour
# (the 8-class palette stops one shade lighter).
legend(
  "bottomright",
  legend = c("min", "ave", "max"),
  fill = colorRampPalette(brewer.pal(9, "RdPu"))(3)
)
Reflection
In my visualization, I wanted to see whether the length of a flight affected how much that flight was delayed (both arrival and departure delays). I hypothesized that longer flights would be delayed more often, since more preparation might be involved before the flight (a longer flight might mean a longer cleaning of the plane beforehand, additional fuel checks, etc.). To test this hypothesis, I made a heatmap that visualizes the delays that flights experienced depending on how much distance was traveled. However, I was surprised to learn that the results were actually the opposite of my hypothesis: longer-distance flights had fewer delays, not more. I looked into this further to find out why, and discovered a couple of really interesting reasons. For one, flights are much less likely to be delayed if the airport is less crowded, and airports are less crowded in the early morning and at night. In addition, longer-distance flights (also referred to as ‘long-haul’ flights) are more likely to fly through the night, for reasons such as lengthy travel times and numerous time zone changes. As an individual who annually takes 16-hour flights to Pakistan, I realized that, yes, our flights always took off in the very early morning or in the evening. So, because long-haul flights are flown mostly at night, and airports are less likely to suffer delays at that time, longer flights actually mean fewer delays.
SOURCES: https://www.cntraveler.com/stories/2013-02-25/planes-night-flights-
https://claimflights.com/why-are-so-many-flights-delayed-today/