#install.packages("treemap")
#install.packages("RColorBrewer")
library(treemap)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(RColorBrewer)
A heatmap is a literal way of visualizing a table of numbers, where you substitute the numbers with colored cells. There are two fundamentally different categories of heat maps: the cluster heat map and the spatial heat map. In a cluster heat map, magnitudes are laid out into a matrix of fixed cell size whose rows and columns are discrete categories, and the sorting of rows and columns is intentional. The size of the cell is arbitrary but large enough to be clearly visible. By contrast, the position of a magnitude in a spatial heat map is forced by the location of the magnitude in that space, and there is no notion of cells; the phenomenon is considered to vary continuously. (Wikipedia)
# How to make a heatmap
# Load the 2008 NBA per-game player statistics from FlowingData.
# Base read.csv() is used on purpose: it returns a plain data.frame,
# whereas read_csv() returns a tibble, and the row names assigned
# further down are deprecated on tibbles.
nba <- read.csv(file = "http://datasets.flowingdata.com/ppg2008.csv")
head(nba, n = 6)
## Name G MIN PTS FGM FGA FGP FTM FTA FTP X3PM X3PA X3PP ORB
## 1 Dwyane Wade 79 38.6 30.2 10.8 22.0 0.491 7.5 9.8 0.765 1.1 3.5 0.317 1.1
## 2 LeBron James 81 37.7 28.4 9.7 19.9 0.489 7.3 9.4 0.780 1.6 4.7 0.344 1.3
## 3 Kobe Bryant 82 36.2 26.8 9.8 20.9 0.467 5.9 6.9 0.856 1.4 4.1 0.351 1.1
## 4 Dirk Nowitzki 81 37.7 25.9 9.6 20.0 0.479 6.0 6.7 0.890 0.8 2.1 0.359 1.1
## 5 Danny Granger 67 36.2 25.8 8.5 19.1 0.447 6.0 6.9 0.878 2.7 6.7 0.404 0.7
## 6 Kevin Durant 74 39.0 25.3 8.9 18.8 0.476 6.1 7.1 0.863 1.3 3.1 0.422 1.0
## DRB TRB AST STL BLK TO PF
## 1 3.9 5.0 7.5 2.2 1.3 3.4 2.3
## 2 6.3 7.6 7.2 1.7 1.1 3.0 1.7
## 3 4.1 5.2 4.9 1.5 0.5 2.6 2.3
## 4 7.3 8.4 2.4 0.8 0.8 1.9 2.2
## 5 4.4 5.1 2.7 1.0 1.4 2.5 3.1
## 6 5.5 6.5 2.8 1.3 0.7 3.0 1.8
# Order players by points per game (ascending), label each row with the
# player's name, keep columns 2 through 19 (this drops the Name column),
# and coerce the result to a numeric matrix for heatmap().
nba <- nba[order(nba[["PTS"]]), ]
rownames(nba) <- nba[["Name"]]
nba <- nba[2:19]
nba_matrix <- data.matrix(frame = nba)
# Heatmap of the player statistics using the cyan-magenta palette.
# Rowv/Colv = NA switch off the dendrograms (and any reordering);
# values are scaled within each column.
nba_heatmap <- heatmap(
  nba_matrix,
  main = "NBA Player Stats in 2008",
  xlab = "NBA Player Stats",
  ylab = "NBA Players",
  col = cm.colors(n = 256),
  scale = "column",
  margins = c(5, 10),
  Rowv = NA,
  Colv = NA
)
# Same heatmap drawn with the heat.colors palette.
# Rowv/Colv = NA switch off the dendrograms (and any reordering);
# values are scaled within each column.
nba_heatmap <- heatmap(
  nba_matrix,
  main = "NBA Player Stats in 2008",
  xlab = "NBA Player Stats",
  ylab = "NBA Players",
  col = heat.colors(n = 256),
  scale = "column",
  margins = c(5, 10),
  Rowv = NA,
  Colv = NA
)
library(viridis) # allows use of the color palette in the heatmap
## Loading required package: viridisLite
# Same heatmap drawn with 25 viridis colours in reversed order
# (direction = -1). Rowv/Colv = NA switch off the dendrograms (and any
# reordering); values are scaled within each column.
nba_heatmap <- heatmap(
  nba_matrix,
  main = "NBA Player Stats in 2008",
  xlab = "NBA Player Stats",
  ylab = "NBA Players",
  col = viridis(n = 25, direction = -1),
  scale = "column",
  margins = c(5, 10),
  Rowv = NA,
  Colv = NA
)
Treemaps display hierarchical (tree-structured) data as a set of nested rectangles. Each branch of the tree is given a rectangle, which is then tiled with smaller rectangles representing sub-branches. A leaf node’s rectangle has an area proportional to a specified dimension of the data. Often the leaf nodes are colored to show a separate dimension of the data.
When the color and size dimensions are correlated in some way with the tree structure, one can often easily see patterns that would be difficult to spot in other ways, such as whether a certain color is particularly relevant. A second advantage of treemaps is that, by construction, they make efficient use of space. As a result, they can legibly display thousands of items on the screen simultaneously.
The downside of treemaps is that as the aspect ratio is optimized, the order of placement becomes less predictable. As the order becomes more stable, the aspect ratio is degraded. (Wikipedia)
Use Nathan Yau’s dataset from the flowingdata website (http://datasets.flowingdata.com/post-data.txt). You will need the packages “treemap” and “RColorBrewer”.
# Fetch the FlowingData post statistics and preview the first six rows.
data <- read.csv(file = "http://datasets.flowingdata.com/post-data.txt")
head(data, n = 6)
## id views comments category
## 1 5019 148896 28 Artistic Visualization
## 2 1416 81374 26 Visualization
## 3 1416 81374 26 Featured
## 4 3485 80819 37 Featured
## 5 3485 80819 37 Mapping
## 6 3485 80819 37 Data Sources
# Treemap of the posts: rectangles are grouped by category, sized by
# views and coloured by comments. With type = "manual" the vColor values
# are mapped directly onto the supplied palette (red-yellow-blue here).
treemap(
  dtf = data,
  index = "category",
  vSize = "views",
  vColor = "comments",
  type = "manual",
  palette = "RdYlBu"
)
# Notice the following:
# * The index is a categorical variable - in this case, “category” of post.
# * The size of the box is by number of views of the post.
# * The heatmap color is by number of comments for the post.
# * The treemap includes a legend for number of comments.
#install.packages("nycflights13")
library(nycflights13)
library(RColorBrewer)
# Copy the nycflights13 flights table into the workspace.
flights <- nycflights13::flights
Use “group_by” together with summarise functions. First, remove observations with NA values in the distance and arr_delay variables - notice that the number of rows changes from 336,776 to 327,346.
# Keep only the flights that have both a recorded distance and a
# recorded arrival delay.
flights_nona <- flights %>%
  filter(!is.na(distance), !is.na(arr_delay))
The table includes the count of flights for each tail number, the mean distance traveled, and the mean arrival delay.
# Per-aircraft summary: number of flights, mean distance and mean
# arrival delay for every tail number.
by_tailnum <- flights_nona %>%
  group_by(tailnum) %>%
  summarise(
    count = n(),
    dist = mean(distance),
    delay = mean(arr_delay)
  )

# Keep aircraft with more than 20 flights and a mean distance under
# 2000 mi.
delay <- by_tailnum %>%
  filter(count > 20 & dist < 2000)
# Build the per-destination delay table that the heatmap below is drawn
# from: one row per destination airport with the number of flights, the
# mean distance, the mean arrival delay and a delay cost index.
# NOTE(review): no definition of `delays` is visible earlier in this
# file, so it is reconstructed here from the columns the code below
# expects (dest, count, dist, delay, delaycost). The cost index formula
# (flights * mean delay / mean distance) is an assumption - confirm it
# against the original analysis.
delays <- flights_nona %>%
  group_by(dest) %>%
  summarise(count = n(),
            dist = mean(distance),
            delay = mean(arr_delay)) %>%
  mutate(delaycost = count * delay / dist)

# Select the 100 largest delay costs, then sort ascending so the heatmap
# displays descending costs.
top100 <- delays %>%
  arrange(desc(delaycost)) %>%
  head(100) %>%
  arrange(delaycost)

# Setting row names on a tibble is deprecated, so switch to a plain
# data.frame before naming the rows by destination airport code.
top100 <- as.data.frame(top100)
row.names(top100) <- top100$dest

delays_mat <- data.matrix(top100) # convert to a matrix (required by heatmap)
delays_mat2 <- delays_mat[, 2:5]  # drop the redundant destination code column
# Create a heatmap using colorBrewer
# "YlGnBu" provides at most 9 colours, so request exactly those 9 and let
# colorRampPalette() interpolate one colour per matrix row. Asking
# brewer.pal() for nrow(delays_mat2) colours only triggers an
# "n too large" warning and returns the same 9-colour palette.
ylgnbu_ramp <- colorRampPalette(brewer.pal(9, "YlGnBu"))(nrow(delays_mat2))

# One colour per destination, named by row, for the RowSideColors strip.
varcols <- setNames(ylgnbu_ramp, rownames(delays_mat2))

# Only arguments heatmap() documents are passed. Abbreviated names such
# as `s` or `v` would partially match `symm` and `verbose`; the latter
# makes heatmap() print its layout matrix to the console.
heatmap(delays_mat2,
        Rowv = NA, Colv = NA,
        col = ylgnbu_ramp,
        scale = "column",
        margins = c(7, 10),
        main = "Cost of Late Arrivals",
        xlab = "Flight Characteristics",
        ylab = "Arrival Airport",
        labCol = c("Flights", "Distance", "Delay", "Cost Index"),
        cexCol = 1, cexRow = 1,
        RowSideColors = varcols)