#install.packages("treemap")
#install.packages("RColorBrewer")
library(treemap)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(RColorBrewer)

Heatmaps

A heatmap is a literal way of visualizing a table of numbers, where you substitute the numbers with colored cells. There are two fundamentally different categories of heat maps: the cluster heat map and the spatial heat map. In a cluster heat map, magnitudes are laid out into a matrix of fixed cell size whose rows and columns are discrete categories, and the sorting of rows and columns is intentional. The size of the cell is arbitrary but large enough to be clearly visible. By contrast, the position of a magnitude in a spatial heat map is forced by the location of the magnitude in that space, and there is no notion of cells; the phenomenon is considered to vary continuously. (Wikipedia)

# How to make a heatmap
nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv")  
#apparently you have to use read.csv here instead of read_csv
head(nba)

##             Name  G  MIN  PTS  FGM  FGA   FGP FTM FTA   FTP X3PM X3PA  X3PP ORB
## 1   Dwyane Wade  79 38.6 30.2 10.8 22.0 0.491 7.5 9.8 0.765  1.1  3.5 0.317 1.1
## 2  LeBron James  81 37.7 28.4  9.7 19.9 0.489 7.3 9.4 0.780  1.6  4.7 0.344 1.3
## 3   Kobe Bryant  82 36.2 26.8  9.8 20.9 0.467 5.9 6.9 0.856  1.4  4.1 0.351 1.1
## 4 Dirk Nowitzki  81 37.7 25.9  9.6 20.0 0.479 6.0 6.7 0.890  0.8  2.1 0.359 1.1
## 5 Danny Granger  67 36.2 25.8  8.5 19.1 0.447 6.0 6.9 0.878  2.7  6.7 0.404 0.7
## 6  Kevin Durant  74 39.0 25.3  8.9 18.8 0.476 6.1 7.1 0.863  1.3  3.1 0.422 1.0
##   DRB TRB AST STL BLK  TO  PF
## 1 3.9 5.0 7.5 2.2 1.3 3.4 2.3
## 2 6.3 7.6 7.2 1.7 1.1 3.0 1.7
## 3 4.1 5.2 4.9 1.5 0.5 2.6 2.3
## 4 7.3 8.4 2.4 0.8 0.8 1.9 2.2
## 5 4.4 5.1 2.7 1.0 1.4 2.5 3.1
## 6 5.5 6.5 2.8 1.3 0.7 3.0 1.8

nba <- nba[order(nba$PTS),]
row.names(nba) <- nba$Name
nba <- nba[,2:19]
nba_matrix <- data.matrix(nba)
nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, 
                       col = cm.colors(256), scale="column", margins=c(5,10),
                       xlab = "NBA Player Stats",
                       ylab = "NBA Players",
                       main = "NBA Player Stats in 2008")

nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = heat.colors(256), 
                       scale="column", margins=c(5,10),
                        xlab = "NBA Player Stats",
                       ylab = "NBA Players",
                       main = "NBA Player Stats in 2008")

library(viridis) # allows use of the color palette in the heatmap

## Loading required package: viridisLite

nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv = NA, col = viridis(25, direction = -1), 
                       scale="column", margins=c(5,10),
                        xlab = "NBA Player Stats",
                       ylab = "NBA Players",
                       main = "NBA Player Stats in 2008")

Treemaps

Treemaps display hierarchical (tree-structured) data as a set of nested rectangles. Each branch of the tree is given a rectangle, which is then tiled with smaller rectangles representing sub-branches. A leaf node’s rectangle has an area proportional to a specified dimension of the data.[1] Often the leaf nodes are colored to show a separate dimension of the data.

When the color and size dimensions are correlated in some way with the tree structure, one can often easily see patterns that would be difficult to spot in other ways, such as whether a certain color is particularly relevant. A second advantage of treemaps is that, by construction, they make efficient use of space. As a result, they can legibly display thousands of items on the screen simultaneously.

The Downside to Treemaps

The downside of treemaps is that as the aspect ratio is optimized, the order of placement becomes less predictable. As the order becomes more stable, the aspect ratio is degraded. (Wikipedia)

Use Nathan Yau’s dataset from the flowingdata website: http://datasets.flowingdata.com/post-data.txt You will need the package “treemap” and the package “RColorBrewer”.

data <- read.csv("http://datasets.flowingdata.com/post-data.txt")
head(data)

##     id  views comments               category
## 1 5019 148896       28 Artistic Visualization
## 2 1416  81374       26          Visualization
## 3 1416  81374       26               Featured
## 4 3485  80819       37               Featured
## 5 3485  80819       37                Mapping
## 6 3485  80819       37           Data Sources

treemap(data, index="category", vSize="views", 
        vColor="comments", type="manual",    # note: type = "manual" changes to red yellow blue
        palette="RdYlBu")

# Notice the following: The index is a categorical variable - in this case, “category” of post The size of the box is by number of views of the post The heatmap color is by number of comments for the post Notice how the treemap includes a legend for number of comments *

Use the dataset NYCFlights13 to create a heatmap that explores Late Arrivals

#install.packages("nycflights13")
library(nycflights13)
library(RColorBrewer)
flights <- flights

Create an initial scatterplot with loess smoother for distance to delays

Use “group_by” together with summarise functions Remove observations with NA values from distand and arr_delay variables - notice number of rows changed from 336,776 to 327,346

flights_nona <- flights %>%
  filter(!is.na(distance) & !is.na(arr_delay))  # remove na's for distance and arr_delay

Use group_by and summarise to create a summary table

The table includes, counts for each tail number, mean distance traveled, and mean arrival delay

by_tailnum <- flights_nona %>%
  group_by(tailnum) %>%  # group all tailnumbers together
  summarise(count = n(),   # counts totals for each tailnumber
        dist = mean(distance), # calculates the mean distance traveled
        delay = mean(arr_delay)) # calculates the mean arrival delay
delay <- filter(by_tailnum, count > 20, dist < 2000) # only include counts > 20 and distance < 2000 mi

Average delay is only slightly related to average distance flown by a plane.

ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1/2) +
  geom_smooth() +
  scale_size_area()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Late Arrivals Affect the Usage Cost of Airports

delays <- flights_nona %>%    # create a delays dataframe by:
  group_by (dest) %>%         # grouping by point of destination
  summarize (count = n(),     # creating variables: number of flights to each destination,
             dist = mean (distance),   # the mean distance flown to each destination,
             delay = mean (arr_delay), # the mean delay of arrival to each destination,
             delaycost = mean(count*delay/dist)) # delay cost index defined as:
                                       #  [(number of flights)*delay/distance] for a destination
delays <- arrange(delays, desc(delaycost))    # sort the rows by delay cost
head(delays)                            # look at the data

## # A tibble: 6 × 5
##   dest  count  dist delay delaycost
##   <chr> <int> <dbl> <dbl>     <dbl>
## 1 DCA    9111  211.  9.07      391.
## 2 IAD    5383  225. 13.9       332.
## 3 ATL   16837  757. 11.3       251.
## 4 BOS   15022  191.  2.91      230.
## 5 CLT   13674  538.  7.36      187.
## 6 RDU    7770  427. 10.1       183.

#install.packages("knitr")
library(knitr)
kable(delays, 
      caption = "Table of Mean Distance, Mean Arrival Delay, and Highest Delay Costs",
      digits = 2)  # round values to 2 decimal places

Table of Mean Distance, Mean Arrival Delay, and Highest Delay Costs
dest	count	dist	delay	delaycost
DCA	9111	211.08	9.07	391.36
IAD	5383	224.74	13.86	332.08
ATL	16837	757.14	11.30	251.29
BOS	15022	190.74	2.91	229.53
CLT	13674	538.01	7.36	187.07
RDU	7770	426.73	10.05	183.04
RIC	2346	281.27	20.11	167.74
PHL	1541	94.34	10.13	165.42
BUF	4570	296.87	8.95	137.71
ORD	16566	729.02	5.88	133.54
ROC	2358	259.36	11.56	105.11
BWI	1687	179.35	10.73	100.90
CVG	3725	575.23	15.36	99.50
DTW	9031	498.20	5.43	98.43
CLE	4394	414.00	9.18	97.45
PWM	2288	276.03	11.66	96.65
BNA	6084	758.22	11.81	94.78
FLL	11897	1070.06	8.08	89.86
BTV	2510	265.12	8.95	84.74
MCO	13967	943.11	5.45	80.78
CMH	3326	476.55	10.60	73.99
SYR	1707	206.07	8.90	73.76
MDW	4025	718.09	12.36	69.30
MHT	932	207.38	14.79	66.46
PIT	2746	334.10	7.68	63.13
TPA	7390	1003.93	7.41	54.53
ORF	1434	288.55	10.95	54.41
PBI	6487	1028.82	8.56	53.99
MKE	2709	733.37	14.17	52.33
STL	4142	878.83	11.08	52.21
MSP	6929	1017.46	7.27	49.51
GSO	1492	449.79	14.11	46.81
CHS	2759	632.96	10.59	46.17
ALB	418	143.00	14.40	42.08
CAK	842	397.00	19.70	41.78
DEN	7169	1614.69	8.61	38.21
JAX	2623	824.71	11.84	37.67
PVD	358	160.00	16.23	36.32
DAY	1399	536.91	12.68	33.04
IND	1981	652.26	9.94	30.19
BDL	412	116.00	7.05	25.03
MCI	1885	1097.65	14.51	24.92
GRR	728	605.71	18.19	21.86
TYS	578	638.34	24.07	21.79
SDF	1104	645.96	12.67	21.65
IAH	7085	1407.18	4.24	21.35
GSP	790	595.98	15.94	21.12
MSY	3715	1177.73	6.49	20.47
MEM	1686	954.48	10.65	18.80
SAV	749	709.27	15.13	15.98
MSN	556	803.93	20.20	13.97
SFO	13173	2577.93	2.67	13.66
OMA	817	1135.56	14.70	10.58
RSW	3502	1072.85	3.24	10.57
HOU	2083	1420.26	7.18	10.52
DSM	523	1020.56	19.01	9.74
AUS	2411	1514.25	6.02	9.58
SJU	5773	1599.84	2.52	9.10
TUL	294	1215.00	33.66	8.14
BGR	358	378.00	8.03	7.60
CAE	106	603.70	41.76	7.33
OKC	315	1325.00	30.62	7.28
XNA	992	1142.44	7.47	6.48
ACK	264	199.00	4.85	6.44
BHM	269	866.00	16.88	5.24
BQN	888	1578.99	8.25	4.64
PHX	4606	2141.34	2.10	4.51
CRW	134	444.00	14.67	4.43
AVL	261	583.61	8.00	3.58
LAX	16026	2468.62	0.55	3.55
SRQ	1201	1044.64	3.08	3.54
SAN	2709	2437.28	3.14	3.49
MIA	11593	1091.54	0.30	3.18
SAT	659	1578.18	6.95	2.90
PDX	1342	2445.61	5.14	2.82
DFW	8388	1383.06	0.32	1.95
TVC	95	652.45	12.97	1.89
PSE	358	1617.00	7.87	1.74
CHO	46	305.00	9.50	1.43
SMF	282	2521.00	12.11	1.35
BUR	370	2465.00	8.18	1.23
ILM	107	500.00	4.64	0.99
EGE	207	1735.80	6.30	0.75
LAS	5952	2240.98	0.26	0.68
ABQ	254	1826.00	4.38	0.61
MYR	58	550.67	4.60	0.48
SJC	328	2569.00	3.45	0.44
OAK	309	2576.00	3.08	0.37
JAC	21	1875.90	28.10	0.31
SLC	2451	1986.99	0.18	0.22
BZN	35	1882.00	7.60	0.14
SBN	10	645.40	6.50	0.10
EYW	17	1207.00	6.35	0.09
HDN	14	1728.00	2.14	0.02
MTJ	14	1795.00	1.79	0.01
ANC	8	3370.00	-2.50	-0.01
LGB	661	2465.00	-0.06	-0.02
LEX	1	604.00	-22.00	-0.04
PSP	18	2378.00	-12.72	-0.10
HNL	701	4972.76	-1.37	-0.19
MVY	210	173.00	-0.29	-0.35
STT	518	1626.99	-3.84	-1.22
SEA	3885	2412.68	-1.10	-1.77
SNA	812	2434.00	-7.87	-2.62

Now get the top 100 delay costs to create a heatmap of those flights.

top100 <- delays %>%                  # select the 100 largest delay costs
  head(100) %>%
  arrange(delaycost)                  # sort ascending so the heatmap displays descending costs
row.names(top100) <- top100$dest      # rename the rows according to destination airport codes

## Warning: Setting row names on a tibble is deprecated.

In order to make a heatmap, convert the dataframe to matrix form

delays_mat <- data.matrix(top100)  # convert delays dataframe to a matrix  (required by heatmap)
delays_mat2 <- delays_mat[,2:5]    # remove the redundant column of destination airport codes

# Create a heatmap using colorBrewer
varcols = setNames(colorRampPalette(brewer.pal(nrow(delays_mat2), "YlGnBu"))(nrow(delays_mat2)), 
                   rownames(delays_mat2))   # parameter for RowSideColors

## Warning in brewer.pal(nrow(delays_mat2), "YlGnBu"): n too large, allowed maximum for palette YlGnBu is 9
## Returning the palette you asked for with that many colors

heatmap(delays_mat2, 
        Rowv = NA, Colv = NA, 
        col= colorRampPalette(brewer.pal(nrow(delays_mat2), "YlGnBu"))(nrow(delays_mat2)),
        s=0.6, v=1, scale="column", 
        margins=c(7,10), 
        main = "Cost of Late Arrivals", 
        xlab = "Flight Characteristics", 
        ylab="Arrival Airport", labCol = c("Flights","Distance","Delay","Cost Index"),
        cexCol=1, cexRow =1, RowSideColors = varcols)

## layout: widths =  0.05 0.2 4 , heights =  0.25 4 ; lmat=
##      [,1] [,2] [,3]
## [1,]    0    0    4
## [2,]    3    1    2

## Warning in brewer.pal(nrow(delays_mat2), "YlGnBu"): n too large, allowed maximum for palette YlGnBu is 9
## Returning the palette you asked for with that many colors

Flight only, HW part one. Note, I was not able to upload the other parts of the program

Ami

2022-10-02