For visualizing size of categorical data.
sample_data <- HairEyeColor %>%
as.data.frame()
sample_data
## Hair Eye Sex Freq
## 1 Black Brown Male 32
## 2 Brown Brown Male 53
## 3 Red Brown Male 10
## 4 Blond Brown Male 3
## 5 Black Blue Male 11
## 6 Brown Blue Male 50
## 7 Red Blue Male 10
## 8 Blond Blue Male 30
## 9 Black Hazel Male 10
## 10 Brown Hazel Male 25
## 11 Red Hazel Male 7
## 12 Blond Hazel Male 5
## 13 Black Green Male 3
## 14 Brown Green Male 15
## 15 Red Green Male 7
## 16 Blond Green Male 8
## 17 Black Brown Female 36
## 18 Brown Brown Female 66
## 19 Red Brown Female 16
## 20 Blond Brown Female 4
## 21 Black Blue Female 9
## 22 Brown Blue Female 34
## 23 Red Blue Female 7
## 24 Blond Blue Female 64
## 25 Black Hazel Female 5
## 26 Brown Hazel Female 29
## 27 Red Hazel Female 7
## 28 Blond Hazel Female 5
## 29 Black Green Female 2
## 30 Brown Green Female 14
## 31 Red Green Female 7
## 32 Blond Green Female 8
It groups rows according to the categories named in …, and effectively creates a column named “value”, which is proportional to the area each group would have on the graph. You can use this function to customize your own plots, or use my pre-built plotting function in the next section.
Arguments: data: the data, containing the columns
specified by area and ...
width and height: dimensions of the
graph
area: a string that names a numerical column in
data, which determines the size each category gets in the
grid
...: strings that name the columns by which the data should
be categorized into
Return:
A data.frame containing the original data, in addition to columns
xmin, ymin, width,
height defining the rectangular grid in which the row would
go into in the tree-map.
gridder <- function(data, width, height, area, ...){
# ... is the column names to group by
data <- data %>% mutate(value=data[[area]])
return(recursive_gridder(data, width=width, height=height,
area="value", xmin=0, ymin=0, ...))
}
It is called by the wrapper to recursively build the grid on each level (one recursion for each category-column named in …). For actually building the grid, it calls the grid_single_level function.
recursive_gridder <- function(data, width, height, area, xmin, ymin, ...){
# base case
if (length(c(...))==1) {
return(grid_single_level(data=data, by=c(...)[1], width, height, xmin, ymin))
}
# build the grid for the current level (e.g. the "Hair" column)
current_category <- c(...)[1]
category_grid <- data %>%
group_by_at(current_category) %>%
summarize(value=sum(value)) %>%
grid_single_level(data=., by=current_category,
width, height, xmin, ymin)
# all the different categories this level has
# (e.g. "Black", "Brown" for the "Hair" level)
data_groups <- unique(data[[current_category]])
# prepare a data.frame for recording the grids computed within each category
result <- data %>% mutate(grouped_value=0, xmin=0, ymin=0, width=0, height=0)
# for each category this level has, recurse down a level to build the grid within this level
for (group in data_groups) {
# extract the grid this category gets in the level
index <- which(category_grid[[current_category]]==group)
group_xmin = category_grid$xmin[index]
group_ymin = category_grid$ymin[index]
group_width = category_grid$width[index]
group_height = category_grid$height[index]
this_groups_data <- filter(data,data[[current_category]]==group) %>%
(function(x){do.call(recursive_gridder,
c(list(x, group_width, group_height,
area, group_xmin, group_ymin),
list(...)[-1]))
})
# record the grid for this category
result[which(result[[current_category]]==group),] = this_groups_data
}
return(result)
}
This function makes the final preparations to the data for an algorithm to compute the grid (e.g. adding zero-filled columns).
grid_single_level <- function(data, by, width=1, height=1, xmin=0, ymin=0){
# given the level specified in <by>, group_by() the data into categories
# to compute the weight each category has.
grouped_values <- data %>%
group_by_at(by) %>%
summarize(grouped_value = sum(value)) %>%
arrange(grouped_value) %>%
mutate(xmin=0, ymin=0, width=0, height=0)
# call the algorithm to compute the grid each category has, according to their weights.
group_grid <- compute_grid(grouped_values, width, height, xmin, ymin)
# left_join the grid onto the original data, on the column <by>
return(left_join(data, group_grid, by=by))
}
Similar to agglomerative (hierarchical) clustering: look for the smallest two categories, merge them into a new category, and repeat. Record the mergers in this process, and reverse it to build the grid: for each merger, split the grid their union has into two parts.
# helper function that finds the nth smallest element in a list
index_minN <- function(x, N=2){
return(order(x)[N])
}
# the agglomeration function: given the data, it performs the mergering and records (1) the categories that got merged, and (2) the size each half has. These two are needed for splitting the grid in reverse.
aggregate <- function(data, value_col){
sums <- as.double(data[[value_col]]) # the size of each (group of) category
group_indices <- as.list(1:nrow(data)) # index of each (group of) category
# the record for the mergers. Each item is another list containing
# [[index1]] and [[index2]] of the two (groups of) categories that got merged, and
# [[size]] of the sizes of the two groups.
record = list()
while (length(sums)>1){
# find the two smallest (groups of) categories
min_index = index_minN(sums, 1)
min2_index = index_minN(sums, 2)
# record the merger
record[[length(record)+1]] <- list(index1 = group_indices[[min_index]],
index2 = group_indices[[min2_index]],
size=c(sums[min_index],sums[min2_index]))
# update the sizes
sums <- c(sums[-c(min_index, min2_index)],
sums[min_index]+sums[min2_index])
# update the indices
group_indices[[length(group_indices)+1]] <- c(group_indices[[min_index]],
group_indices[[min2_index]])
group_indices <- group_indices[-c(min_index, min2_index)]
}
return(record)
}
compute_grid <- function(data, width=1, height=1, xmin=0, ymin=0){
# relabel to avoid scoping issues
width0=width
height0=height
xmin0=xmin
ymin0=ymin
# start with each category getting the entire grid.
data <- data %>% mutate(width=width0, height=height0, xmin=xmin0, ymin=ymin0)
# compute the aggregation process
greg <- aggregate(data, value_col="grouped_value")
for (merge in rev(greg)) {
# for each merger (in reverse order), compute the grids after the division.
size_proportion <- merge$size[1]/(merge$size[1]+merge$size[2])
current_specs <- data[merge$index1[1],]
current_width <- current_specs$width
current_height <- current_specs$height
current_xmin <- current_specs$xmin
current_ymin <- current_specs$ymin
if (current_width >= current_height){ # split into left & right
new_width1 <- current_width * size_proportion
new_width2 <- current_width * (1-size_proportion)
new_xmin2 <- current_xmin + new_width1
# for categories in the first group in the merger, they now get the left half
data[merge$index1,"width"] <- new_width1
data[merge$index2,"width"] <- new_width2
# same for the second group getting the right half
data[merge$index2,"xmin"] <- new_xmin2
} else { # split into top & bottom
new_height1 <- current_height * size_proportion
new_height2 <- current_height * (1-size_proportion)
new_ymin2 <- current_ymin + new_height1
# first group getting top
data[merge$index1,"height"] <- new_height1
data[merge$index2,"height"] <- new_height2
# second group getting bottom
data[merge$index2,"ymin"] <- new_ymin2
}
}
return(data)
}
Again, gridder() returns a data.frame containing the original data,
along with columns xmin, ymin,
width, height defining the rectangular grid in
which the row would go into in the tree-map.
sample_grid <- gridder(sample_data, area="Freq",width=1, height=1, "Hair","Eye")
head(sample_grid, 5)
## Hair Eye Sex Freq value grouped_value xmin ymin width
## 1 Black Brown Male 32 32 68 0.6745495 0.6470588 0.3254505
## 2 Brown Brown Male 53 53 119 0.0000000 0.0000000 0.4831081
## 3 Red Brown Male 10 10 26 0.8107156 0.4150327 0.1892844
## 4 Blond Brown Male 3 3 7 0.4831081 0.2012280 0.1343105
## 5 Black Blue Male 11 11 20 0.4831081 0.6470588 0.1914414
## height
## 1 0.35294118
## 2 0.41608392
## 3 0.23202614
## 4 0.08803724
## 5 0.17647059
# the unnamed arguments in the end should name the columns (levels) by which the data should be categorized
# grouped value indicates the sum of values in rows of the same category (e.g. Sex is not selected as a level, so Sex=Male and Sex=Female rows with the same Hair and Eye categories are summed).
# value is a copy of Freq created to standardize the column name for internal use. Please keep it if you pass this data.frame to the plotting function in the next section; if you are using this for other use, you can safely discard it.
Arguments: data: a data.frame, containing categorical
columns in by, and the numerical column named by
area.
by: a character vector, naming the columns by which the
data should be lumped into categories; their order in the vector should
correspond to their order on the graph: the first column is the the
highest level of grids.
area: a string naming the column that determines the size
of the grid each category gets.
label: how each block should be labelled:
- “all” means the categories on each level are pasted together, e.g. as
in “Black Hazel” in the sample
- “last” means only use the categories on the lowest level
- or provide a custom function to compute the label by being applied to
each row.
- anything else would result in no labels.
width and height: dimensions of the
image
linewidth: width of grid-lines, either a constant for
constant width, or “auto” for decreasing width
fill_color: a string indicating the base color for filling
the grids. A smaller grid will have a more diluted color, while a larger
grid will be closer to this specified color.
...: arguments to be passed onto geom_rect() used for
plotting.
- pass color=<color> for grid-lines
plant_tree <- function(data, by=c(), area, label="all", width=1, height=1, linewidth="auto", fill_color="#7593C5", ...){
if (linewidth == "auto"){
linewidth <- seq(0, 1, 1/length(by))[-1]
} else if (length(linewidth) == 1){
linewidth <- rep(linewidth, length(by))
}
plot <- ggplot()
for (i in length(by):1) {
new_grid <- do.call(gridder, c(list(data, width, height, area),
as.list(by[1:(length(by)-i+1)])))
if (i==1){
colors <- new_grid %>% mutate(color=width * height)
colors <- colors$color/max(colors$color)
plot = plot + geom_rect(data=new_grid,
aes(xmin=xmin, xmax=xmin+width,
ymin=ymin, ymax=ymin+height),
fill = fill_color,
alpha=colors, linewidth=linewidth[i], ...)
} else {
plot = plot + geom_rect(data=new_grid,
aes(xmin=xmin, xmax=xmin+width,
ymin=ymin, ymax=ymin+height),
fill = NA,
linewidth=linewidth[i], ...)
}
}
if (identical(label,"all")){
labeler <- function(x){
label = ""
for (category in by){label <- paste(label, x[[category]])}
return(label)
}
labels <- apply(data, 1, labeler)
} else if (identical(label,"last")) {
labeler <- function(x){x[[by[length(by)]]]}
labels <- apply(data, 1, labeler)
} else if (is.function(label)){
labels <- apply(data, 1, label)
} else {
labels <- rep("", nrow(new_grid))
}
plot = plot +
geom_fit_text(data=new_grid,
aes(xmin=xmin, xmax=xmin+width,
ymin=ymin, ymax=ymin+height,
label=labels), reflow = TRUE)
return(plot)
}
(except for grid-lines)
plant_tree(data=sample_data,
by=c("Hair","Eye"), area="Freq", color="black") +
theme_void() + theme(legend.position="none")
Remember color is the color of the grid-lines, while
fill_color is that of the fill.
labeler <- function(x){
paste(x[["Hair"]],"hair,",x[["Eye"]],"eyed",x[["Sex"]])}
plant_tree(data=sample_data,
by=c("Sex","Hair","Eye"), area="Freq",
label=labeler, color="black", fill_color = "#E19566") +
theme_void() + theme(legend.position="none")