library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
view(flights)
describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
library(dplyr)
library(RColorBrewer)
flights<-flights
removena_flights <- flights %>%
filter(!is.na(distance) & !is.na(sched_arr_time))
by_carrier <- removena_flights %>%
group_by(carrier,month) %>%
summarise(count = n(),
dist = mean(distance),
sched_arr_time = mean(sched_arr_time))
## `summarise()` has grouped output by 'carrier'. You can override using the
## `.groups` argument.
sched_arr_time <- filter(by_carrier, count > 20, dist < 2000)
ggplot(sched_arr_time, aes(dist, sched_arr_time)) +
geom_point(aes(size = count), alpha = 1/2) +
geom_smooth() +
scale_size_area()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
delays <- removena_flights %>%
group_by (dest) %>%
summarize (count = n(),
dist = mean (distance),
sched_arr_time = mean(sched_arr_time),
dep_delay = mean (dep_delay))
delays <- arrange(delays, desc(dep_delay))
head(delays)
## # A tibble: 6 x 5
## dest count dist sched_arr_time dep_delay
## <chr> <int> <dbl> <dbl> <dbl>
## 1 SBN 10 645. 1739. 21.1
## 2 ABQ 254 1826 2278. 13.7
## 3 ANC 8 3370 1966. 12.9
## 4 ACK 265 199 1139. 6.46
## 5 EYW 17 1207 1360. 3.65
## 6 LEX 1 604 2249 -9
delays <- removena_flights %>%
group_by (dest) %>%
summarize (count = n(),
dist = mean (distance),
delay = mean (dep_delay),
delaycost = mean(count*delay/dist))
delays <- arrange(delays, desc(delaycost))
head(delays)
## # A tibble: 6 x 5
## dest count dist delay delaycost
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ACK 265 199 6.46 8.60
## 2 ABQ 254 1826 13.7 1.91
## 3 SBN 10 645. 21.1 0.327
## 4 EYW 17 1207 3.65 0.0514
## 5 ANC 8 3370 12.9 0.0306
## 6 LEX 1 604 -9 -0.0149
#install.packages("knitr")
library(knitr)
kable(delays,
caption = "Table of Mean Distance, Mean Departure Delay, and Highest Delay Costs",
digits = 2)
| dest | count | dist | delay | delaycost |
|---|---|---|---|---|
| ACK | 265 | 199.00 | 6.46 | 8.60 |
| ABQ | 254 | 1826.00 | 13.74 | 1.91 |
| SBN | 10 | 645.40 | 21.10 | 0.33 |
| EYW | 17 | 1207.00 | 3.65 | 0.05 |
| ANC | 8 | 3370.00 | 12.88 | 0.03 |
| LEX | 1 | 604.00 | -9.00 | -0.01 |
| ALB | 439 | 143.00 | NA | NA |
| ATL | 17215 | 757.11 | NA | NA |
| AUS | 2439 | 1514.25 | NA | NA |
| AVL | 275 | 583.58 | NA | NA |
| BDL | 443 | 116.00 | NA | NA |
| BGR | 375 | 378.00 | NA | NA |
| BHM | 297 | 866.00 | NA | NA |
| BNA | 6333 | 758.21 | NA | NA |
| BOS | 15508 | 190.64 | NA | NA |
| BQN | 896 | 1578.98 | NA | NA |
| BTV | 2589 | 265.09 | NA | NA |
| BUF | 4681 | 296.81 | NA | NA |
| BUR | 371 | 2465.00 | NA | NA |
| BWI | 1781 | 179.42 | NA | NA |
| BZN | 36 | 1882.00 | NA | NA |
| CAE | 116 | 603.55 | NA | NA |
| CAK | 864 | 397.00 | NA | NA |
| CHO | 52 | 305.00 | NA | NA |
| CHS | 2884 | 632.92 | NA | NA |
| CLE | 4573 | 414.17 | NA | NA |
| CLT | 14064 | 538.03 | NA | NA |
| CMH | 3524 | 476.56 | NA | NA |
| CRW | 138 | 444.00 | NA | NA |
| CVG | 3941 | 575.16 | NA | NA |
| DAY | 1525 | 537.10 | NA | NA |
| DCA | 9705 | 211.01 | NA | NA |
| DEN | 7266 | 1614.68 | NA | NA |
| DFW | 8738 | 1383.04 | NA | NA |
| DSM | 569 | 1020.89 | NA | NA |
| DTW | 9384 | 498.13 | NA | NA |
| EGE | 213 | 1735.71 | NA | NA |
| FLL | 12055 | 1070.07 | NA | NA |
| GRR | 765 | 605.78 | NA | NA |
| GSO | 1606 | 449.84 | NA | NA |
| GSP | 849 | 595.96 | NA | NA |
| HDN | 15 | 1728.00 | NA | NA |
| HNL | 707 | 4972.67 | NA | NA |
| HOU | 2115 | 1420.16 | NA | NA |
| IAD | 5700 | 224.85 | NA | NA |
| IAH | 7198 | 1407.21 | NA | NA |
| ILM | 110 | 500.00 | NA | NA |
| IND | 2077 | 652.26 | NA | NA |
| JAC | 25 | 1875.60 | NA | NA |
| JAX | 2720 | 824.68 | NA | NA |
| LAS | 5997 | 2240.96 | NA | NA |
| LAX | 16174 | 2468.62 | NA | NA |
| LGA | 1 | 17.00 | NA | NA |
| LGB | 668 | 2465.00 | NA | NA |
| MCI | 2008 | 1097.70 | NA | NA |
| MCO | 14082 | 943.11 | NA | NA |
| MDW | 4113 | 718.05 | NA | NA |
| MEM | 1789 | 954.20 | NA | NA |
| MHT | 1009 | 207.03 | NA | NA |
| MIA | 11728 | 1091.55 | NA | NA |
| MKE | 2802 | 733.38 | NA | NA |
| MSN | 572 | 803.95 | NA | NA |
| MSP | 7185 | 1017.40 | NA | NA |
| MSY | 3799 | 1177.71 | NA | NA |
| MTJ | 15 | 1795.00 | NA | NA |
| MVY | 221 | 173.00 | NA | NA |
| MYR | 59 | 550.66 | NA | NA |
| OAK | 312 | 2576.00 | NA | NA |
| OKC | 346 | 1325.00 | NA | NA |
| OMA | 849 | 1135.57 | NA | NA |
| ORD | 17283 | 729.00 | NA | NA |
| ORF | 1536 | 288.52 | NA | NA |
| PBI | 6554 | 1028.84 | NA | NA |
| PDX | 1354 | 2445.57 | NA | NA |
| PHL | 1632 | 94.32 | NA | NA |
| PHX | 4656 | 2141.30 | NA | NA |
| PIT | 2875 | 334.06 | NA | NA |
| PSE | 365 | 1617.00 | NA | NA |
| PSP | 19 | 2378.00 | NA | NA |
| PVD | 376 | 160.00 | NA | NA |
| PWM | 2352 | 276.13 | NA | NA |
| RDU | 8163 | 426.76 | NA | NA |
| RIC | 2454 | 281.40 | NA | NA |
| ROC | 2416 | 259.25 | NA | NA |
| RSW | 3537 | 1072.85 | NA | NA |
| SAN | 2737 | 2437.30 | NA | NA |
| SAT | 686 | 1578.34 | NA | NA |
| SAV | 804 | 709.18 | NA | NA |
| SDF | 1157 | 645.98 | NA | NA |
| SEA | 3923 | 2412.67 | NA | NA |
| SFO | 13331 | 2577.92 | NA | NA |
| SJC | 329 | 2569.00 | NA | NA |
| SJU | 5819 | 1599.83 | NA | NA |
| SLC | 2467 | 1986.99 | NA | NA |
| SMF | 284 | 2521.00 | NA | NA |
| SNA | 825 | 2434.00 | NA | NA |
| SRQ | 1211 | 1044.65 | NA | NA |
| STL | 4339 | 878.72 | NA | NA |
| STT | 522 | 1626.98 | NA | NA |
| SYR | 1761 | 205.92 | NA | NA |
| TPA | 7466 | 1003.94 | NA | NA |
| TUL | 315 | 1215.00 | NA | NA |
| TVC | 101 | 652.39 | NA | NA |
| TYS | 631 | 638.81 | NA | NA |
| XNA | 1036 | 1142.51 | NA | NA |
top10 <- delays %>%
head(10) %>%
arrange(delaycost)
row.names(top10) <- top10$dest
## Warning: Setting row names on a tibble is deprecated.
delays_mat <- data.matrix(top10)
delays_mat2 <- delays_mat[,1:2]
varcols = setNames(colorRampPalette(brewer.pal(nrow(delays_mat2), "YlOrRd"))(nrow(delays_mat2)),
rownames(delays_mat2))
## Warning in brewer.pal(nrow(delays_mat2), "YlOrRd"): n too large, allowed maximum for palette YlOrRd is 9
## Returning the palette you asked for with that many colors
heatmap(delays_mat2,
Rowv = NA, Colv = NA,
col= colorRampPalette(brewer.pal(nrow(delays_mat2), "YlOrRd"))(nrow(delays_mat2)),
s=0.4, v=1, scale="column",
margins=c(7,7),
main = "Cost of Late Departures",
xlab = "Flight Characteristics",
ylab="Airport Name", labCol = c("Flights","Distance","Late Departure","Cost Index"),
cexCol=1, cexRow =1,RowSideColors = varcols)
## layout: widths = 0.05 0.2 4 , heights = 0.25 4 ; lmat=
## [,1] [,2] [,3]
## [1,] 0 0 4
## [2,] 3 1 2
## Warning in brewer.pal(nrow(delays_mat2), "YlOrRd"): n too large, allowed maximum for palette YlOrRd is 9
## Returning the palette you asked for with that many colors
For my plot, I decided to make the heatmap model by referring to the one we made in class. “Cost Index” is defined as a measure of the impact of flight departure delays on the cost of the flight at each airport. For airlines, it is a measure of the increased cost of flying to an airport due to frequent departure delays. This index is inversely proportional to distance because delays affect short flights more than long flights and because profit per seat increases with distance due to larger and more efficient aircraft used for long distances. Delays in flight delays are mainly due to weather, as it is the ability to see one’s surroundings. If the cloud cover is too thick or there is a lot of fog, it can be dangerous for planes to take off. Therefore, during the winter months, delays are often much longer due to visibility issues. A simple snow squall could end up bringing down a plane due to the inability to determine the correct take-off angle. In addition, mechanical problems can also be the cause of delays. Planes are very delicate mechanically. One thing that isn’t working properly can cause problems in other unexpected places. This makes diagnostics complicated and can take longer than expected to properly resolve these issues, leading to flight delays. Nevertheless, there can be even more delays related to mechanical problems that have nothing to do with the plane itself, because before each flight the plane must be re-inspected and the documents must be filed before the aircraft can be loaded and take off. These events are therefore added to the delay already incurred. We can also note through our graph that airports such as ACK and ABQ have a high cost index because they are frequently delayed, while airports ATL, AUS, AVL, BDL have zero cost and are less affected by delays of flights.