For visualizing size of categorical data.

Sample Data

sample_data <- HairEyeColor %>% 
  as.data.frame()
sample_data
##     Hair   Eye    Sex Freq
## 1  Black Brown   Male   32
## 2  Brown Brown   Male   53
## 3    Red Brown   Male   10
## 4  Blond Brown   Male    3
## 5  Black  Blue   Male   11
## 6  Brown  Blue   Male   50
## 7    Red  Blue   Male   10
## 8  Blond  Blue   Male   30
## 9  Black Hazel   Male   10
## 10 Brown Hazel   Male   25
## 11   Red Hazel   Male    7
## 12 Blond Hazel   Male    5
## 13 Black Green   Male    3
## 14 Brown Green   Male   15
## 15   Red Green   Male    7
## 16 Blond Green   Male    8
## 17 Black Brown Female   36
## 18 Brown Brown Female   66
## 19   Red Brown Female   16
## 20 Blond Brown Female    4
## 21 Black  Blue Female    9
## 22 Brown  Blue Female   34
## 23   Red  Blue Female    7
## 24 Blond  Blue Female   64
## 25 Black Hazel Female    5
## 26 Brown Hazel Female   29
## 27   Red Hazel Female    7
## 28 Blond Hazel Female    5
## 29 Black Green Female    2
## 30 Brown Green Female   14
## 31   Red Green Female    7
## 32 Blond Green Female    8

Defining Functions for Building the Grid

The Wrapper Function

It groups rows according to the categories named in …, and effectively creates a column named “value”, which is proportional to the area each group would have on the graph. You can use this function to customize your own plots, or use my pre-built plotting function in the next section.

Arguments: data: the data, containing the columns specified by area and ...
width and height: dimensions of the graph
area: a string that names a numerical column in data, which determines the size each category gets in the grid
...: strings that name the columns by which the data should be categorized into
Return:
A data.frame containing the original data, in addition to columns xmin, ymin, width, height defining the rectangular grid in which the row would go into in the tree-map.

gridder <- function(data, width, height, area, ...){
  # ... is the column names to group by
  data <- data %>% mutate(value=data[[area]])
  return(recursive_gridder(data, width=width, height=height, 
                           area="value", xmin=0, ymin=0, ...))
}

The Recursion

It is called by the wrapper to recursively build the grid on each level (one recursion for each category-column named in …). For actually building the grid, it calls the grid_single_level function.

recursive_gridder <- function(data, width, height, area, xmin, ymin, ...){
  
  # base case
  if (length(c(...))==1) {
    return(grid_single_level(data=data, by=c(...)[1], width, height, xmin, ymin))
  }

  # build the grid for the current level (e.g. the "Hair" column)
  current_category <- c(...)[1]
  category_grid <- data %>% 
    group_by_at(current_category) %>%
    summarize(value=sum(value)) %>%
    grid_single_level(data=., by=current_category, 
                      width, height, xmin, ymin)
  
  # all the different categories this level has
  # (e.g. "Black", "Brown" for the "Hair" level)
  data_groups <- unique(data[[current_category]])
  
  # prepare a data.frame for recording the grids computed within each category
  result <- data %>% mutate(grouped_value=0, xmin=0, ymin=0, width=0, height=0)
  
  # for each category this level has, recurse down a level to build the grid within this level
  for (group in data_groups) {
    # extract the grid this category gets in the level
    index <- which(category_grid[[current_category]]==group)
    group_xmin = category_grid$xmin[index]
    group_ymin = category_grid$ymin[index]
    group_width = category_grid$width[index]
    group_height = category_grid$height[index]
    
    this_groups_data <- filter(data,data[[current_category]]==group) %>%
      (function(x){do.call(recursive_gridder,
                           c(list(x, group_width, group_height, 
                                     area, group_xmin, group_ymin),
                             list(...)[-1]))
      })
    # record the grid for this category
    result[which(result[[current_category]]==group),] = this_groups_data
  }
  return(result)
}

Building the Grid

This function makes the final preparations to the data for an algorithm to compute the grid (e.g. adding zero-filled columns).

grid_single_level <- function(data, by, width=1, height=1, xmin=0, ymin=0){

  # given the level specified in <by>, group_by() the data into categories 
  # to compute the weight each category has.
  grouped_values <- data %>% 
    group_by_at(by) %>%
    summarize(grouped_value = sum(value)) %>%
    arrange(grouped_value) %>%
    mutate(xmin=0, ymin=0, width=0, height=0)
  
  # call the algorithm to compute the grid each category has, according to their weights.
  group_grid <- compute_grid(grouped_values, width, height, xmin, ymin)

  # left_join the grid onto the original data, on the column <by>
  return(left_join(data, group_grid, by=by))
}

The Algorithm

Similar to agglomerative (hierarchical) clustering: look for the smallest two categories, merge them into a new category, and repeat. Record the mergers in this process, and reverse it to build the grid: for each merger, split the grid their union has into two parts.

# helper function that finds the nth smallest element in a list
index_minN <- function(x, N=2){
  return(order(x)[N])
}

# the agglomeration function: given the data, it performs the mergering and records (1) the categories that got merged, and (2) the size each half has. These two are needed for splitting the grid in reverse.
aggregate <- function(data, value_col){
  sums <- as.double(data[[value_col]])  # the size of each (group of) category
  group_indices <- as.list(1:nrow(data))  # index of each (group of) category
  
  # the record for the mergers. Each item is another list containing 
  # [[index1]] and [[index2]] of the two (groups of) categories that got merged, and
  # [[size]] of the sizes of the two groups.
  record = list()
  
  while (length(sums)>1){
    # find the two smallest (groups of) categories
    min_index = index_minN(sums, 1)
    min2_index = index_minN(sums, 2)
    
    # record the merger
    record[[length(record)+1]] <- list(index1 = group_indices[[min_index]],
                                       index2 = group_indices[[min2_index]], 
                                       size=c(sums[min_index],sums[min2_index]))
    
    # update the sizes
    sums <- c(sums[-c(min_index, min2_index)],
              sums[min_index]+sums[min2_index])
    
    # update the indices
    group_indices[[length(group_indices)+1]] <- c(group_indices[[min_index]],
                                                  group_indices[[min2_index]])
    group_indices <- group_indices[-c(min_index, min2_index)]
                  
  }
  return(record)
}

compute_grid <- function(data, width=1, height=1, xmin=0, ymin=0){
  
  # relabel to avoid scoping issues
  width0=width
  height0=height
  xmin0=xmin
  ymin0=ymin
  
  # start with each category getting the entire grid.
  data <- data %>% mutate(width=width0, height=height0, xmin=xmin0, ymin=ymin0)
  
  # compute the aggregation process
  greg <- aggregate(data, value_col="grouped_value")
  
  for (merge in rev(greg)) {
    # for each merger (in reverse order), compute the grids after the division.
    size_proportion <- merge$size[1]/(merge$size[1]+merge$size[2])
    current_specs <- data[merge$index1[1],]
    current_width <- current_specs$width
    current_height <- current_specs$height
    current_xmin <- current_specs$xmin
    current_ymin <- current_specs$ymin
    
    
    if (current_width >= current_height){ # split into left & right
      new_width1 <- current_width * size_proportion
      new_width2 <- current_width * (1-size_proportion)
      new_xmin2 <- current_xmin + new_width1
      # for categories in the first group in the merger, they now get the left half
      data[merge$index1,"width"] <- new_width1
      data[merge$index2,"width"] <- new_width2
      # same for the second group getting the right half
      data[merge$index2,"xmin"] <- new_xmin2
    } else {  # split into top & bottom
      new_height1 <- current_height * size_proportion
      new_height2 <- current_height * (1-size_proportion)
      new_ymin2 <- current_ymin + new_height1
      # first group getting top
      data[merge$index1,"height"] <- new_height1
      data[merge$index2,"height"] <- new_height2
      # second group getting bottom
      data[merge$index2,"ymin"] <- new_ymin2
    }
  }
  return(data)
}

Test on Sample Data

Again, gridder() returns a data.frame containing the original data, along with columns xmin, ymin, width, height defining the rectangular grid in which the row would go into in the tree-map.

sample_grid <- gridder(sample_data, area="Freq",width=1, height=1, "Hair","Eye")
head(sample_grid, 5)
##    Hair   Eye  Sex Freq value grouped_value      xmin      ymin     width
## 1 Black Brown Male   32    32            68 0.6745495 0.6470588 0.3254505
## 2 Brown Brown Male   53    53           119 0.0000000 0.0000000 0.4831081
## 3   Red Brown Male   10    10            26 0.8107156 0.4150327 0.1892844
## 4 Blond Brown Male    3     3             7 0.4831081 0.2012280 0.1343105
## 5 Black  Blue Male   11    11            20 0.4831081 0.6470588 0.1914414
##       height
## 1 0.35294118
## 2 0.41608392
## 3 0.23202614
## 4 0.08803724
## 5 0.17647059
# the unnamed arguments in the end should name the columns (levels) by which the data should be categorized

# grouped value indicates the sum of values in rows of the same category (e.g. Sex is not selected as a level, so Sex=Male and Sex=Female rows with the same Hair and Eye categories are summed).
# value is a copy of Freq created to standardize the column name for internal use. Please keep it if you pass this data.frame to the plotting function in the next section; if you are using this for other use, you can safely discard it.

Function for Plotting the Tree Map

Arguments: data: a data.frame, containing categorical columns in by, and the numerical column named by area.
by: a character vector, naming the columns by which the data should be lumped into categories; their order in the vector should correspond to their order on the graph: the first column is the the highest level of grids.
area: a string naming the column that determines the size of the grid each category gets.
label: how each block should be labelled:
- “all” means the categories on each level are pasted together, e.g. as in “Black Hazel” in the sample
- “last” means only use the categories on the lowest level
- or provide a custom function to compute the label by being applied to each row.
- anything else would result in no labels.
width and height: dimensions of the image
linewidth: width of grid-lines, either a constant for constant width, or “auto” for decreasing width
fill_color: a string indicating the base color for filling the grids. A smaller grid will have a more diluted color, while a larger grid will be closer to this specified color.
...: arguments to be passed onto geom_rect() used for plotting.
- pass color=<color> for grid-lines

plant_tree <- function(data, by=c(), area, label="all", width=1, height=1, linewidth="auto", fill_color="#7593C5", ...){
  
  if (linewidth == "auto"){
    linewidth <- seq(0, 1, 1/length(by))[-1]
  } else if (length(linewidth) == 1){
    linewidth <- rep(linewidth, length(by))
  }
  
  plot <- ggplot()
  
  for (i in length(by):1) {
    new_grid <- do.call(gridder, c(list(data, width, height, area),
                       as.list(by[1:(length(by)-i+1)])))
    
    if (i==1){
      colors <- new_grid %>% mutate(color=width * height)
      colors <- colors$color/max(colors$color)
      plot = plot + geom_rect(data=new_grid,
                          aes(xmin=xmin, xmax=xmin+width,
                              ymin=ymin, ymax=ymin+height),
                          fill = fill_color,
                          alpha=colors, linewidth=linewidth[i], ...)
    } else {
    plot = plot + geom_rect(data=new_grid,
                            aes(xmin=xmin, xmax=xmin+width,
                                ymin=ymin, ymax=ymin+height),
                            fill = NA,
                            linewidth=linewidth[i], ...)
    }
  }
  
  if (identical(label,"all")){
    labeler <- function(x){
      label = ""
      for (category in by){label <- paste(label, x[[category]])}
      return(label)
    }
    labels <- apply(data, 1, labeler)
  } else if (identical(label,"last")) {
    labeler <- function(x){x[[by[length(by)]]]}
    labels <- apply(data, 1, labeler)
  } else if (is.function(label)){
    labels <- apply(data, 1, label)
  } else {
    labels <- rep("", nrow(new_grid))
  }
      
  plot = plot + 
    geom_fit_text(data=new_grid, 
                  aes(xmin=xmin, xmax=xmin+width,
                      ymin=ymin, ymax=ymin+height,
                      label=labels), reflow = TRUE)
  
  return(plot)
}

Plotting the sample data

Full default

(except for grid-lines)

plant_tree(data=sample_data, 
           by=c("Hair","Eye"), area="Freq", color="black") +
  theme_void() + theme(legend.position="none")

A bit of customization

Remember color is the color of the grid-lines, while fill_color is that of the fill.

labeler <- function(x){
  paste(x[["Hair"]],"hair,",x[["Eye"]],"eyed",x[["Sex"]])}

plant_tree(data=sample_data,
           by=c("Sex","Hair","Eye"), area="Freq",
           label=labeler, color="black", fill_color = "#E19566") +
    theme_void() + theme(legend.position="none")