library(treemap)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(RColorBrewer)

How to create heatmaps

NBA Stats Heatmap

Load the data from url and display the first 6 rows of the data

nba <- read.csv("http://datasets.flowingdata.com/ppg2008.csv")

head(nba)

##             Name  G  MIN  PTS  FGM  FGA   FGP FTM FTA   FTP X3PM X3PA  X3PP ORB
## 1   Dwyane Wade  79 38.6 30.2 10.8 22.0 0.491 7.5 9.8 0.765  1.1  3.5 0.317 1.1
## 2  LeBron James  81 37.7 28.4  9.7 19.9 0.489 7.3 9.4 0.780  1.6  4.7 0.344 1.3
## 3   Kobe Bryant  82 36.2 26.8  9.8 20.9 0.467 5.9 6.9 0.856  1.4  4.1 0.351 1.1
## 4 Dirk Nowitzki  81 37.7 25.9  9.6 20.0 0.479 6.0 6.7 0.890  0.8  2.1 0.359 1.1
## 5 Danny Granger  67 36.2 25.8  8.5 19.1 0.447 6.0 6.9 0.878  2.7  6.7 0.404 0.7
## 6  Kevin Durant  74 39.0 25.3  8.9 18.8 0.476 6.1 7.1 0.863  1.3  3.1 0.422 1.0
##   DRB TRB AST STL BLK  TO  PF
## 1 3.9 5.0 7.5 2.2 1.3 3.4 2.3
## 2 6.3 7.6 7.2 1.7 1.1 3.0 1.7
## 3 4.1 5.2 4.9 1.5 0.5 2.6 2.3
## 4 7.3 8.4 2.4 0.8 0.8 1.9 2.2
## 5 4.4 5.1 2.7 1.0 1.4 2.5 3.1
## 6 5.5 6.5 2.8 1.3 0.7 3.0 1.8

Create a cool color heat map ordered by points.

nba <- nba[order(nba$PTS),]
row.names(nba) <- nba$Name
nba <- nba[2:19,]
nba_matrix <- data.matrix(nba)
nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, 
                       col = cm.colors(80), scale="column", margins=c(5,10),
                       xlab = "NBA Player Stats",
                       ylab = "NBA Players",
                       main = "NBA Player Stats in 2008")

Cool colored map is changed to a warm color heatmap

nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = heat.colors(80), 
                       scale="column", margins=c(5,10),
                        xlab = "NBA Player Stats",
                       ylab = "NBA Players",
                       main = "NBA Player Stats in 2008")

Warm color heatmap is changed to Viridis color palette

library(viridis)

## Loading required package: viridisLite

nba_heatmap <- heatmap(nba_matrix, Rowv=NA, col = viridis(25), 
                       scale="column", margins=c(5,10),
                        xlab = "NBA Player Stats",
                       ylab = "NBA Players",
                       main = "NBA Payer Stats in 2008")

Late Arrivals heatmap

Load data from a prebuilt dataset

library(nycflights13)
library(RColorBrewer)
data(flights)

Clean the data to ignore NA Values

flights_nona <- flights %>%
  filter(!is.na(distance) & !is.na(arr_delay))

Create a summary table

by_tailnum <- flights_nona %>%
  group_by(tailnum) %>%  
  summarise(count = n(),   
        dist = mean(distance), 
        delay = mean(arr_delay)) 
delay <- filter(by_tailnum, count > 20, dist < 2000)

Create a scatterplot, average delays are partly related to average distance

ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1/2) +
  geom_smooth() +
  scale_size_area()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Calculate the cost of usage cost of late arrivals

delays <- flights_nona %>%    
  group_by (dest) %>%         
  summarize (count = n(),     
             dist = mean (distance),   
             delay = mean (arr_delay), 
             delaycost = mean(count*delay/dist)) 
                                       
delays <- arrange(delays, desc(delaycost))    
head(delays)

## # A tibble: 6 × 5
##   dest  count  dist delay delaycost
##   <chr> <int> <dbl> <dbl>     <dbl>
## 1 DCA    9111  211.  9.07      391.
## 2 IAD    5383  225. 13.9       332.
## 3 ATL   16837  757. 11.3       251.
## 4 BOS   15022  191.  2.91      230.
## 5 CLT   13674  538.  7.36      187.
## 6 RDU    7770  427. 10.1       183.

DCA and IAD have the highest delay cost

displayed using knitr package and kable function

library(knitr)
kable(delays, 
      caption = "Table of Mean Distance, Mean Arrival Delay, and Highest Delay Costs",
      digits = 2)

Table of Mean Distance, Mean Arrival Delay, and Highest Delay Costs
dest	count	dist	delay	delaycost
DCA	9111	211.08	9.07	391.36
IAD	5383	224.74	13.86	332.08
ATL	16837	757.14	11.30	251.29
BOS	15022	190.74	2.91	229.53
CLT	13674	538.01	7.36	187.07
RDU	7770	426.73	10.05	183.04
RIC	2346	281.27	20.11	167.74
PHL	1541	94.34	10.13	165.42
BUF	4570	296.87	8.95	137.71
ORD	16566	729.02	5.88	133.54
ROC	2358	259.36	11.56	105.11
BWI	1687	179.35	10.73	100.90
CVG	3725	575.23	15.36	99.50
DTW	9031	498.20	5.43	98.43
CLE	4394	414.00	9.18	97.45
PWM	2288	276.03	11.66	96.65
BNA	6084	758.22	11.81	94.78
FLL	11897	1070.06	8.08	89.86
BTV	2510	265.12	8.95	84.74
MCO	13967	943.11	5.45	80.78
CMH	3326	476.55	10.60	73.99
SYR	1707	206.07	8.90	73.76
MDW	4025	718.09	12.36	69.30
MHT	932	207.38	14.79	66.46
PIT	2746	334.10	7.68	63.13
TPA	7390	1003.93	7.41	54.53
ORF	1434	288.55	10.95	54.41
PBI	6487	1028.82	8.56	53.99
MKE	2709	733.37	14.17	52.33
STL	4142	878.83	11.08	52.21
MSP	6929	1017.46	7.27	49.51
GSO	1492	449.79	14.11	46.81
CHS	2759	632.96	10.59	46.17
ALB	418	143.00	14.40	42.08
CAK	842	397.00	19.70	41.78
DEN	7169	1614.69	8.61	38.21
JAX	2623	824.71	11.84	37.67
PVD	358	160.00	16.23	36.32
DAY	1399	536.91	12.68	33.04
IND	1981	652.26	9.94	30.19
BDL	412	116.00	7.05	25.03
MCI	1885	1097.65	14.51	24.92
GRR	728	605.71	18.19	21.86
TYS	578	638.34	24.07	21.79
SDF	1104	645.96	12.67	21.65
IAH	7085	1407.18	4.24	21.35
GSP	790	595.98	15.94	21.12
MSY	3715	1177.73	6.49	20.47
MEM	1686	954.48	10.65	18.80
SAV	749	709.27	15.13	15.98
MSN	556	803.93	20.20	13.97
SFO	13173	2577.93	2.67	13.66
OMA	817	1135.56	14.70	10.58
RSW	3502	1072.85	3.24	10.57
HOU	2083	1420.26	7.18	10.52
DSM	523	1020.56	19.01	9.74
AUS	2411	1514.25	6.02	9.58
SJU	5773	1599.84	2.52	9.10
TUL	294	1215.00	33.66	8.14
BGR	358	378.00	8.03	7.60
CAE	106	603.70	41.76	7.33
OKC	315	1325.00	30.62	7.28
XNA	992	1142.44	7.47	6.48
ACK	264	199.00	4.85	6.44
BHM	269	866.00	16.88	5.24
BQN	888	1578.99	8.25	4.64
PHX	4606	2141.34	2.10	4.51
CRW	134	444.00	14.67	4.43
AVL	261	583.61	8.00	3.58
LAX	16026	2468.62	0.55	3.55
SRQ	1201	1044.64	3.08	3.54
SAN	2709	2437.28	3.14	3.49
MIA	11593	1091.54	0.30	3.18
SAT	659	1578.18	6.95	2.90
PDX	1342	2445.61	5.14	2.82
DFW	8388	1383.06	0.32	1.95
TVC	95	652.45	12.97	1.89
PSE	358	1617.00	7.87	1.74
CHO	46	305.00	9.50	1.43
SMF	282	2521.00	12.11	1.35
BUR	370	2465.00	8.18	1.23
ILM	107	500.00	4.64	0.99
EGE	207	1735.80	6.30	0.75
LAS	5952	2240.98	0.26	0.68
ABQ	254	1826.00	4.38	0.61
MYR	58	550.67	4.60	0.48
SJC	328	2569.00	3.45	0.44
OAK	309	2576.00	3.08	0.37
JAC	21	1875.90	28.10	0.31
SLC	2451	1986.99	0.18	0.22
BZN	35	1882.00	7.60	0.14
SBN	10	645.40	6.50	0.10
EYW	17	1207.00	6.35	0.09
HDN	14	1728.00	2.14	0.02
MTJ	14	1795.00	1.79	0.01
ANC	8	3370.00	-2.50	-0.01
LGB	661	2465.00	-0.06	-0.02
LEX	1	604.00	-22.00	-0.04
PSP	18	2378.00	-12.72	-0.10
HNL	701	4972.76	-1.37	-0.19
MVY	210	173.00	-0.29	-0.35
STT	518	1626.99	-3.84	-1.22
SEA	3885	2412.68	-1.10	-1.77
SNA	812	2434.00	-7.87	-2.62

Top 100 of the largest delay costs

top100 <- delays %>%
  head(100) %>%
  arrange(delaycost)
row.names(top100) <- top100$dest

## Warning: Setting row names on a tibble is deprecated.

Convert dataframe to matrix

delays_mat <- data.matrix(top100)
delays_mat2 <-delays_mat[,2:5]

Create a heatmap of the cost of late arrivals

heatmap(delays_mat2,
        Rowv = NA, Colv = NA,
        col= viridis(25),
        s=0.6, v=1, scale = "column",
        margins = c(7,10),
        main = "Cost of Late Arrivals",
        xlab = "Flight Characteristics",
        ylab = "Arrival Airport",
        labCol =  c("Flights", "Distance", "Delay", "Cost Index"),
        cexCol=1, cexRow=1)

## layout: widths =  0.05 4 , heights =  0.25 4 ; lmat=
##      [,1] [,2]
## [1,]    0    3
## [2,]    2    1

How to create Treemaps

Load the data from url and display the first 6 rows

data <- read.csv("http://datasets.flowingdata.com/post-data.txt")
head(data)

##     id  views comments               category
## 1 5019 148896       28 Artistic Visualization
## 2 1416  81374       26          Visualization
## 3 1416  81374       26               Featured
## 4 3485  80819       37               Featured
## 5 3485  80819       37                Mapping
## 6 3485  80819       37           Data Sources

Use RColorBrewer to change the color palette.

treemap(data, index="category", vSize="views", 
        vColor="comments", type="manual",
        palette="PiYG")

How to create Streamgraphs

Load data from a prebuilt dataset

library(dplyr)
library(streamgraph)
library(babynames)
babynames <- babynames

Create a steamgraph showing names over time

year=rep(seq(1990,2016), each=10)
name=rep(letters[1:10], 27)
value=sample(seq(0,1,0.0001), length(year))
data=data.frame(year, name, value)
streamgraph(data, key="name", value="value", date="year")

## Warning in widget_html(name, package, id = x$id, style = css(width =
## validateCssUnit(sizeInfo$width), : streamgraph_html returned an object of class
## `list` instead of a `shiny.tag`.

## Warning: `bindFillRole()` only works on htmltools::tag() objects (e.g., div(),
## p(), etc.), not objects of type 'list'.

Examine the babynames dataset

ncol(babynames)

## [1] 5

head(babynames)

## # A tibble: 6 × 5
##    year sex   name          n   prop
##   <dbl> <chr> <chr>     <int>  <dbl>
## 1  1880 F     Mary       7065 0.0724
## 2  1880 F     Anna       2604 0.0267
## 3  1880 F     Emma       2003 0.0205
## 4  1880 F     Elizabeth  1939 0.0199
## 5  1880 F     Minnie     1746 0.0179
## 6  1880 F     Margaret   1578 0.0162

str(babynames)

## tibble [1,924,665 × 5] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:1924665] 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
##  $ sex : chr [1:1924665] "F" "F" "F" "F" ...
##  $ name: chr [1:1924665] "Mary" "Anna" "Emma" "Elizabeth" ...
##  $ n   : int [1:1924665] 7065 2604 2003 1939 1746 1578 1472 1414 1320 1288 ...
##  $ prop: num [1:1924665] 0.0724 0.0267 0.0205 0.0199 0.0179 ...

Babynames that begin with S and their popularity over the years

babynames %>%
  filter(grepl("^Si", name)) %>%
  group_by(year,name) %>%
  tally(wt=n) %>%
  streamgraph("name", "n", "year")

## Warning in widget_html(name, package, id = x$id, style = css(width =
## validateCssUnit(sizeInfo$width), : streamgraph_html returned an object of class
## `list` instead of a `shiny.tag`.

## Warning: `bindFillRole()` only works on htmltools::tag() objects (e.g., div(),
## p(), etc.), not objects of type 'list'.

How to create alluvials

Load the alluvial package and a prebuilt dataset

library(alluvial)
library(ggalluvial)
data(Refugees)

Show the top 10 most affected countries

ggalluv <- ggplot(Refugees,
                  aes(x = year, y = refugees, alluvium = country)) + 
  theme_bw() +
  geom_alluvium(aes(fill = country),
                color = "white",
                  width = .1,
                  alpha = .8,
                  decreasing = FALSE) +
  scale_fill_brewer(palette = "Paired") +
  scale_x_continuous(lim = c(2002,2013))+
  ggtitle("UNHCR Recognized Refugees \n Top 10 Countries(2003-2013)\n")+
  ylab("Number of Refugees")

ggalluv

Change the y axis to show numerical values

options(scipen=999)
ggalluv

Data 110 Heatmaps etc

Shadeja Fuentes

2023-02-27

How to create heatmaps

NBA Stats Heatmap

Load the data from url and display the first 6 rows of the data

Create a cool color heat map ordered by points.

Cool colored map is changed to a warm color heatmap

Warm color heatmap is changed to Viridis color palette

Late Arrivals heatmap

Load data from a prebuilt dataset

Clean the data to ignore NA Values

Create a summary table

Calculate the cost of usage cost of late arrivals

DCA and IAD have the highest delay cost

displayed using knitr package and kable function

Top 100 of the largest delay costs

Convert dataframe to matrix

Create a heatmap of the cost of late arrivals

How to create Treemaps

Load the data from url and display the first 6 rows

Use RColorBrewer to change the color palette.

How to create Streamgraphs

Load data from a prebuilt dataset

Create a steamgraph showing names over time

Examine the babynames dataset

Babynames that begin with S and their popularity over the years

How to create alluvials

Load the alluvial package and a prebuilt dataset

Show the top 10 most affected countries

Change the y axis to show numerical values