#install.packages("nycflights23")
library(nycflights23)
library(tidyverse)
#install.packages("fields") # run once
library(fields)
data(flights)
data(airlines)Heatmaps Treemaps and Alluvials
So many ways to visualize data
https://r-graph-gallery.com/vaccination-heatmap.html
https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html

Use the dataset NYCFlights23 to explore late arrivals
Source: FAA Aircraft registry, https://www.faa.gov/licenses_certificates/aircraft_certification/ aircraft_registry/releasable_aircraft_download/
Create an initial scatterplot with loess smoother for distance to delays
Use “group_by” together with summarize functions
flights_nona <- flights |>
filter(!is.na(distance) & !is.na(arr_delay) & !is.na(dep_delay))
# remove na's for distance, arr_delay, departure delayUse group_by and summarise to create a summary table
The table includes counts for each destination, mean distance traveled, mean arrival delay, and mean departure delay
by_dest <- flights_nona |>
group_by(dest) |> # group all destinations
summarise(count = n(), # counts totals for each destination
avg_dist = mean(distance), # calculates the mean distance traveled
avg_arr_delay = mean(arr_delay), # calculates the mean arrival delay
avg_dep_delay = mean(dep_delay), # calculates the mean dep delay
.groups = "drop") |> # remove the grouping structure after summarizing
arrange(avg_arr_delay) |>
filter(avg_dist < 3000)
head(by_dest)# A tibble: 6 × 5
dest count avg_dist avg_arr_delay avg_dep_delay
<chr> <int> <dbl> <dbl> <dbl>
1 PNS 71 1030 -10.6 -1.24
2 HHH 461 695. -9.95 1.38
3 HDN 27 1728 -9.93 8.78
4 VPS 107 988 -9.41 2.62
5 AVP 140 93 -8.53 -0.957
6 GSO 2857 456. -7.77 3.81
Heatmaps
A heatmap is a way of visualizing a table of numbers, where you substitute the numbers with colored cells. There are two fundamentally different categories of heat maps: the cluster heat map and the spatial heat map. In a cluster heat map, magnitudes are laid out into a matrix of fixed cell size whose rows and columns are discrete categories, and the sorting of rows and columns is intentional. The size of the cell is arbitrary but large enough to be clearly visible. By contrast, the position of a magnitude in a spatial heat map is forced by the location of the magnitude in that space, and there is no notion of cells; the phenomenon is considered to vary continuously. (Wikipedia)
Heatmap of average departure delays, arrival delays, distance and flight times
by_dest_matrix <- data.matrix(by_dest[, -1]) # drop dest from matrix so it won't show in heatmap
row.names(by_dest_matrix) <- by_dest$dest
# restore row names
by_dest_matrix count avg_dist avg_arr_delay avg_dep_delay
PNS 71 1030.00000 -10.56338028 -1.2394366
HHH 461 694.53145 -9.94577007 1.3752711
HDN 27 1728.00000 -9.92592593 8.7777778
VPS 107 988.00000 -9.41121495 2.6168224
AVP 140 93.00000 -8.52857143 -0.9571429
GSO 2857 455.88694 -7.76583829 3.8085404
SDF 2509 652.14787 -7.16859306 4.5990434
XNA 604 1147.00000 -6.54801325 3.6158940
CHO 1439 305.00000 -6.50382210 4.0694927
ILM 2236 497.03220 -6.34302326 3.8635957
SNA 1402 2439.20685 -6.31027104 7.1847361
DSM 695 1029.24748 -5.21870504 4.7381295
DAY 590 549.00000 -5.06779661 3.1457627
TYS 1221 645.30958 -4.72645373 5.1408681
ROA 619 405.00000 -4.70113086 3.4184168
SCE 269 207.94424 -4.69888476 4.1115242
GRR 1673 611.86910 -4.60370592 5.2982666
AVL 1565 593.25431 -4.47539936 2.4913738
OAK 213 2555.00000 -4.22065728 11.6713615
RIC 3511 290.06038 -3.83850755 4.3192823
CMH 7269 475.88926 -3.57573256 6.0020636
OMA 1079 1147.88323 -3.35125116 7.4161260
PIT 7552 330.30005 -3.27462924 5.7689354
MHT 586 209.00000 -3.15529010 4.0068259
ALB 1510 138.92980 -3.14966887 4.9165563
BDL 681 101.00000 -2.97063142 8.2085169
CVG 6324 580.70715 -2.27688172 7.2303922
STL 4777 885.74921 -1.74502826 8.8911451
GSP 2217 603.64908 -1.61569689 5.6959856
DCA 9189 212.33986 -1.51616063 7.9632169
EYW 897 1198.97993 -1.43589744 1.2006689
SEA 5154 2414.01785 -1.41249515 13.8861079
BGR 2146 383.23858 -1.38676608 6.9911463
TVC 175 650.66286 -1.36571429 4.9257143
IND 6952 656.69448 -1.32968930 8.2176352
BHM 871 866.00000 -1.15958668 8.1343284
BWI 895 184.00000 -1.07709497 9.8525140
MEM 2066 957.69264 -0.99854792 7.7531462
MSP 5805 1017.00827 -0.85839793 11.9574505
ORH 948 148.77215 -0.51054852 5.1740506
PDX 1904 2444.86134 -0.50840336 16.3256303
CLE 6399 415.57853 -0.41397093 7.3506798
ORF 5193 290.90237 -0.32486039 6.4850761
LIT 498 1085.00000 -0.27108434 5.8012048
PVD 1498 150.09279 -0.24299065 7.9532710
PWM 3937 274.79629 -0.03098806 7.4107188
SFO 11474 2578.44117 0.27871710 16.0542967
MDT 471 141.00000 0.38004246 7.8492569
SAT 1279 1577.27522 0.73885848 9.7896794
MYR 2120 556.94151 0.75613208 10.0938679
EGE 205 1736.33659 0.76585366 17.0487805
STT 759 1626.42029 0.94861660 9.6297760
PHX 5120 2142.83828 1.00000000 13.4455078
SLC 3118 1983.00449 1.18248877 18.6889031
TUL 305 1231.00000 1.52459016 11.4426230
BZN 272 1887.80882 1.62867647 13.8676471
PSP 188 2377.57447 1.85638298 17.1861702
MSN 810 805.90123 1.95802469 12.5432099
SAN 4087 2437.58356 2.15047712 15.3246880
MKE 2801 735.73831 2.41949304 11.5958586
CHS 5204 635.37567 2.56033820 9.1856264
MCI 3611 1105.39075 2.59401828 11.9437829
DTW 9997 499.05422 2.83204961 12.6795039
DAL 1322 1381.00000 3.12405446 15.1164902
RDU 11182 425.69496 3.16302987 11.8137185
BTV 3329 263.91649 3.17873235 9.1730249
DEN 9720 1616.02068 3.45370370 15.9853909
BGM 624 147.00000 3.55769231 10.0737179
ROC 3820 257.22984 3.60654450 11.4002618
OKC 830 1341.00000 3.74698795 11.6819277
SBN 24 648.08333 3.83333333 -1.1250000
HYA 193 196.51813 3.83937824 12.0259067
CLT 12548 538.50159 3.84826267 12.3792636
CAE 658 617.00000 3.95440729 8.4924012
BUF 6014 294.71034 4.17592285 10.5023279
LAX 15743 2467.28876 4.54227276 15.4996506
IAH 7438 1408.42794 4.69776822 14.9631621
BNA 8944 760.41659 4.86449016 13.0196780
PHL 346 84.99422 5.07514451 8.5982659
DFW 11317 1383.86719 5.76186268 15.6238402
BOS 18387 188.68744 6.07945831 13.9915157
ATL 17234 756.19531 6.22124869 16.7238598
SAV 3734 716.47991 6.31494376 11.3564542
ORD 17691 729.78944 6.37527556 15.4468939
SYR 3231 202.44383 6.81213247 13.8780563
LEX 1 604.00000 7.00000000 -9.0000000
LAS 5901 2237.36706 7.04016268 21.0521945
JAX 4833 827.47155 7.53362301 15.3976826
MDW 2516 725.00000 7.63831479 17.2356916
MSY 4170 1177.34820 8.00239808 16.8947242
SRQ 1983 1038.94402 8.06454866 13.5395865
AGS 20 678.00000 8.35000000 3.7500000
RSW 4628 1072.90320 8.45678479 18.4535436
AUS 4722 1513.41804 8.57475646 15.9345616
MVY 646 174.00310 8.77089783 14.7136223
MIA 15706 1090.22705 9.27034254 15.7499682
ITH 698 188.68338 9.33524355 9.7521490
TPA 8048 1003.73708 9.69396123 18.5724404
ACK 875 200.54286 10.82400000 13.4548571
IAD 3257 219.04544 11.04451950 17.0841265
BUR 335 2465.00000 11.20000000 24.1223881
MTJ 75 1796.80000 11.25333333 16.1866667
HOU 1002 1428.00000 11.28742515 19.8642715
PBI 8046 1028.36130 11.70270942 19.9043003
FLL 13782 1069.81251 12.97264548 21.9497170
JAC 190 1874.82105 14.18947368 23.1631579
MCO 17287 943.02511 14.84971366 21.9103372
SMF 704 2510.58949 17.85511364 29.4147727
SJC 186 2569.00000 20.20967742 35.8118280
SJU 5312 1602.00414 21.03859187 28.9412651
BQN 957 1579.33856 25.61128527 32.2445141
ONT 353 2429.00000 26.13881020 37.2832861
ABQ 218 1824.94037 26.71559633 41.2018349
RNO 129 2410.34884 34.43410853 47.0155039
PSE 319 1617.00000 37.55485893 44.2476489
library(viridis)
by_dest_heatmap <- heatmap(by_dest_matrix,
Rowv=NA,
Colv=NA,
col = viridis(25),
cexCol = .7, # shrink x-axis label size
scale="column",
xlab = "",
ylab = "",
keep.dendro = TRUE,
main = "Heatmap of Flight Destinations from NY Airports")
TODO: Which 6 destination airports have the highest average arrival delay from NYC?
Treemaps
Treemaps display hierarchical (tree-structured) data as a set of nested rectangles. Each branch of the tree is given a rectangle, which is then tiled with smaller rectangles representing sub-branches. A leaf node’s rectangle has an area proportional to a specified dimension of the data.[1] Often the leaf nodes are colored to show a separate dimension of the data.
When the color and size dimensions are correlated in some way with the tree structure, one can often easily see patterns that would be difficult to spot in other ways, such as whether a certain color is particularly relevant. A second advantage of treemaps is that, by construction, they make efficient use of space. As a result, they can legibly display thousands of items on the screen simultaneously.
The Downside to Treemaps
The downside of treemaps is that as the aspect ratio is optimized, the order of placement becomes less predictable. As the order becomes more stable, the aspect ratio is degraded. (Wikipedia)
Join the delay_punctuality dataset with the airlines dataset
Also remove “Inc.” or “Co.” from the Carrier Name
str(airlines)tibble [14 × 2] (S3: tbl_df/tbl/data.frame)
$ carrier: chr [1:14] "9E" "AA" "AS" "B6" ...
$ name : chr [1:14] "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...
airlines$name [1] "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc."
[4] "JetBlue Airways" "Delta Air Lines Inc." "Frontier Airlines Inc."
[7] "Allegiant Air" "Hawaiian Airlines Inc." "Envoy Air"
[10] "Spirit Air Lines" "SkyWest Airlines Inc." "United Air Lines Inc."
[13] "Southwest Airlines Co." "Republic Airline"
flights2 <- left_join(flights_nona, airlines, by = "carrier")
flights2$name <- gsub("Inc\\.|Co\\.", "", flights2$name)
head(flights2)# A tibble: 6 × 20
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, name <chr>
# Convert months from numerical to abbreviated labels
flights3 <- flights2 |>
group_by(name)|>
summarise(avg_dist = mean(distance), # calculates the mean distance traveled
avg_arr_delay = mean(arr_delay)) # calculates the mean arrival delay
flights3# A tibble: 14 × 3
name avg_dist avg_arr_delay
<chr> <dbl> <dbl>
1 "Alaska Airlines " 2481. 0.0844
2 "Allegiant Air" 723. -5.88
3 "American Airlines " 1156. 5.27
4 "Delta Air Lines " 1278. 1.64
5 "Endeavor Air " 487. -2.23
6 "Envoy Air" 725. 0.119
7 "Frontier Airlines " 969. 26.2
8 "Hawaiian Airlines " 4983 21.4
9 "JetBlue Airways" 1140. 15.6
10 "Republic Airline" 485. -4.64
11 "SkyWest Airlines " 628. 13.7
12 "Southwest Airlines " 1024. 5.76
13 "Spirit Air Lines" 1085. 9.89
14 "United Air Lines " 1246. 9.04
#flights2$month_label <- month(flights2$month, label = TRUE, abbr = TRUE)Create a treemap for NYC FLights
The index is a categorical variable - carrier (name)
The size of the box is by average distance
The heatmap color is average arrival delay
Notice how the treemap includes a legend for average arrival delay
library(RColorBrewer)
library(treemap)
treemap(flights3,
index="name", # airline
vSize="avg_dist",
vColor="avg_arr_delay",
type="manual",
palette="RdYlBu", #Use RColorBrewer palette
title = "Average Distance and Arrival Delay by Carrier", # plot title
title.legend = "Avg Arrival Delay (min)" ) # legend label Graph On-Time Performance using Departure Delay and Arrival Delay
Some of the most important data that is collected for reporting is to analyze key performance indicators (KPIs) and the subset that agencies look at the most is “On-Time Performance” (OTP) which is usually defined as arriving at the origin location within 15 minutes of the requested/scheduled pickup time. The following code will create a bidirectional bar graph that has both the departure delay percentage and arrival delay percentage for each carrier.
Calculate the percentage of flights with less than 15 minutes delay (OTP)
delay_OTP <- flights2 |>
group_by(name) |> #name = airline
summarize(Departure_Percentage = sum(dep_delay <= 15)
/ n() * 100,
Arrival_Percentage = sum(arr_delay <= 15) / n() * 100)Create a bidirectional horizontal bar chart
ggplot(delay_OTP, aes(x = -Departure_Percentage, y = reorder(name, Departure_Percentage))) +
geom_text(aes(label = paste0(round(Departure_Percentage, 0), "%")),
hjust = 1.1, size = 3.5) + #departure % labels
geom_bar(aes(fill = "Departure_Percentage"), stat = "identity", width = .75) +
geom_bar(aes(x = Arrival_Percentage, fill = "Arrival_Percentage"),
stat = "identity", width = .75) +
geom_text(aes(x = Arrival_Percentage, label = paste0(round(Arrival_Percentage, 0), "%")),
hjust =-.1, size = 3.5) + # arrival % labels
labs(x = "Departures < On-Time Performance > Arrivals",
y = "",
title = "On-Time Performance of Airline Carriers \n (Percent of Flights < 15 Minutes Delay)",
caption = "Source: FAA") +
scale_fill_manual(
name = "Performance",
breaks = c("Departure_Percentage", "Arrival_Percentage"), # Specify the order of legend items
values = c("Departure_Percentage" = "#8bd3c7", "Arrival_Percentage" = "#beb9db"),
labels = c("Departure_Percentage" = "Departure", "Arrival_Percentage" = "Arrival")
) +
scale_x_continuous(labels = abs, limits = c(-120, 120)) + # Positive negative axis
theme_minimal()