Pie Charts

Pie charts show how a whole is divided into parts.

Each slice represents the proportion of a category relative to the total.

They are commonly used when:

there are few categories
proportions are easy to compare

However, pie charts can become difficult to interpret when there are many slices.

We will use the mpg dataset and count the number of vehicles in each class.

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the mpg dataset.
data("mpg")

# One row per vehicle class; count() stores the tally in column `n`.
class_counts <- count(mpg, class)

# A single stacked bar (x = "") wrapped around the circle becomes a pie chart.
ggplot(data = class_counts, mapping = aes(x = "", y = n, fill = class)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Vehicle Class Distribution") +
  theme_void()

Explanation

geom_col() creates a stacked bar chart
coord_polar("y") converts the bar chart into a circular pie chart
theme_void() removes unnecessary axes and grid lines

# Slice sizes for the five groups.
Prop <- c(3, 7, 9, 1, 2)

# Basic pie with a text label on every slice.
pie(
  Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E")
)

# Same pie, but slices are filled with shading lines: `density` controls how
# closely the lines are spaced and `angle` sets their slope for each slice.
pie(
  Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E"),
  density = 10,
  angle = c(20, 90, 30, 10, 0)
)

Donut Charts

Donut charts are similar to pie charts but include a hole in the center. This can make labels easier to read and reduce visual clutter.

# The bar sits at x = 2 while the x range starts at 0.5, so the space between
# the centre and the bar stays empty and forms the donut hole.
ggplot(data = class_counts, mapping = aes(x = 2, y = n, fill = class)) +
  geom_col(width = 1) +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  labs(title = "Vehicle Class Distribution (Donut Chart)") +
  theme_void()

Explanation

The donut hole is created by manipulating the x-axis limits.

the bars are drawn away from the center
the middle space becomes empty

To position percentage labels around the donut, you need to:

Calculate proportions
Compute label positions
Add labels with geom_text()

library(ggplot2)
library(dplyr)
library(scales)

# Toy data: three categories and their values.
df <- data.frame(
  category = c("A", "B", "C"),
  value = c(10, 60, 30)
)

# Derive each slice's share, its start and end along the ring, and its label.
df <- mutate(
  df,
  prop = value / sum(value),       # share of the total
  ymax = cumsum(prop),             # where the slice ends
  ymin = lag(ymax, default = 0),   # where the slice starts (previous end)
  label_pos = (ymin + ymax) / 2,   # midpoint of the slice arc
  label = percent(prop)            # share formatted as a percentage
)

# Each slice is a rectangle from x = 3 to x = 4; polar coordinates bend the
# rectangles into ring segments.
ggplot(
  data = df,
  mapping = aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)
) +
  geom_rect() +
  # Percentage labels sit just outside the ring (x = 4.2 versus xmax = 4).
  geom_text(mapping = aes(x = 4.2, y = label_pos, label = label), size = 4) +
  coord_polar(theta = "y") +
  xlim(2, 4.5) +
  theme_void()

# Three categories and their raw counts.
data <- data.frame(
  category = c("A", "B", "C"),
  count = c(10, 60, 30)
)

# Share of the total held by each category.
data[["fraction"]] <- data[["count"]] / sum(data[["count"]])

# Upper edge of each ring segment: the running total of the shares.
data[["ymax"]] <- cumsum(data[["fraction"]])

# Lower edge: the previous segment's upper edge, starting from 0.
data[["ymin"]] <- c(0, data[["ymax"]][-nrow(data)])

# Labels sit halfway between the two edges.
data[["labelPosition"]] <- (data[["ymin"]] + data[["ymax"]]) / 2

# Two-line label: the category name, then its count.
data[["label"]] <- paste0(data[["category"]], "\n value: ", data[["count"]])

# Draw the donut: rectangles between x = 3 and x = 4 become ring segments in
# polar coordinates, and each label is placed mid-ring (x = 3.5).
ggplot(
  data = data,
  mapping = aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)
) +
  geom_rect() +
  geom_label(aes(y = labelPosition, label = label), x = 3.5, size = 6) +
  scale_fill_brewer(palette = 4) +
  xlim(2, 4) +
  coord_polar(theta = "y") +
  theme_void() +
  # The labels already name each category, so the legend is hidden.
  theme(legend.position = "none")

Sunburst Plots

# Install sunburstR only when it is missing, so re-running or knitting this
# document does not reinstall the package (and need network access) every time.
if (!requireNamespace("sunburstR", quietly = TRUE)) {
  install.packages("sunburstR")
}

library(sunburstR)
library(dplyr)

# Each row pairs a hyphen-separated "category-subcategory" path with a value.
sunburst_data <- data.frame(
  sequence = c(
    "Fruit-Apples", "Fruit-Bananas", "Fruit-Oranges",
    "Vegetable-Carrots", "Vegetable-Broccoli", "Vegetable-Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

# Draw the sunburst from the path/value table.
sunburst(sunburst_data)

Think of this like a nested donut plot that can show a category within a category. Sunburst plots are not ideal when:

students need precise comparisons
there are many small slices
labels become crowded

Treemap

library(treemap)
library(dplyr)

# Two-level table: three subcategories under each of the two categories.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

# Rectangles are nested by category then subcategory and sized by `value`.
treemap(
  sales_data,
  title = "Treemap of Category and Subcategory Counts",
  vSize = "value",
  index = c("category", "subcategory")
)

They can become hard to read if:

labels are too long
there are too many tiny categories
viewers need exact comparisons

Circular Packing

Circular packing visualizes nested hierarchical structures.

Large circles represent categories, while smaller circles inside represent subcategories.

This is useful for:

hierarchical datasets
taxonomies
organizational structures

library(packcircles)
library(ggforce)

# Five circle sizes to pack.
sizes <- c(50, 30, 20, 10, 5)

# Lay the circles out; the x, y and radius columns are used for plotting below.
circle_data <- circleProgressiveLayout(sizes)

library(ggplot2)

ggplot(data = circle_data) +
  geom_circle(
    mapping = aes(x0 = x, y0 = y, r = radius),
    color = "black",
    fill = "skyblue"
  ) +
  coord_equal() + # equal scaling on both axes keeps the circles round
  theme_void()

Explanation

each circle size represents a category
circles are packed efficiently to avoid overlap

library(ggraph)
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ✔ readr     2.2.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%()       masks igraph::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor()     masks scales::col_factor()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ tidyr::crossing()       masks igraph::crossing()
## ✖ purrr::discard()        masks scales::discard()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::simplify()       masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(viridis)

## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## 
## The following object is masked from 'package:scales':
## 
##     viridis_pal

# Hierarchical example data: the flare dataset provides an edge list and a
# vertex table, which together define the graph.
edges <- flare$edges
vertices <- flare$vertices
mygraph <- graph_from_data_frame(d = edges, vertices = vertices)

# Circle-packing layout; `weight = size` uses the size column of the vertex
# table to control how large each circle is drawn.
ggraph(graph = mygraph, layout = "circlepack", weight = size) +
  geom_node_circle() +
  theme_void()

# Same circle-packing plot, with each circle filled according to its depth in
# the hierarchy.
ggraph(mygraph, layout = "circlepack", weight = size) +
  geom_node_circle(aes(fill = depth)) + # add the fill designation
  scale_fill_viridis() +                # choose a colour palette
  theme_void() +
  # "none" is the documented value for hiding the legend; the original passed
  # the string "FALSE", which is not a valid legend position.
  theme(legend.position = "none")

# Circle-packing plot with text labels on the leaf nodes only.
ggraph(mygraph, layout = "circlepack", weight = size) +
  geom_node_circle(aes(fill = depth)) +
  # Text has no `fill` aesthetic; mapping it only triggered an
  # "Ignoring unknown aesthetics: fill" warning, so it is omitted here.
  geom_node_text(aes(label = shortName, filter = leaf, size = size)) +
  scale_fill_viridis() +
  theme_void() +
  # "none" is the documented value for hiding the legend (not "FALSE").
  theme(legend.position = "none")

Dendrograms

Dendrograms visualize hierarchical clustering.

They show how observations group together based on similarity.

Common uses include:

gene expression analysis
species similarity
document clustering

# Load packages
library(ggraph)
library(igraph)
library(tidyverse)

# Top level: a single root named "origin" fans out into five groups.
level1 <- data.frame(
  from = "origin",
  to = paste0("group_", seq_len(5))
)

# Second level: every group gets five subgroups, numbered 1 to 25 overall.
level2 <- data.frame(
  from = rep(level1$to, each = 5),
  to = paste0("subgroup_", seq_len(25))
)

# Stack both levels into one edge list (columns: from, to).
edges <- bind_rows(level1, level2)

# Build a graph object from the edge list.
graph <- graph_from_data_frame(edges)

# Draw the hierarchy as a non-circular dendrogram with a point on every node.
ggraph(graph, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()

# Leaf labels: text is rotated 90 degrees and nudged just below the branch
# tips; the lower y limit is extended to leave room for the text.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.01
  ) +
  ylim(-0.4, NA)

# Leaf labels plus a semi-transparent point at the end of each branch; the
# labels are nudged further down so they do not sit on top of the points.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(aes(filter = leaf), alpha = 0.6) +
  ylim(-0.5, NA)

Interpretation

each branch represents a cluster
observations that join earlier are more similar

Circular Dendrograms

# The origin/group/subgroup hierarchy drawn as a circular dendrogram.
ggraph(graph = graph, layout = "dendrogram", circular = TRUE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()

# The flare graph (mygraph) in the same circular layout, using straight links
# instead of curved diagonals.
ggraph(graph = mygraph, layout = "dendrogram", circular = TRUE) +
  geom_edge_link() +
  geom_node_point() +
  theme_void()

# Adding labels to circular plots is quite difficult.

Word Clouds

Word clouds visualize text frequency.

Words that appear more often are displayed larger.

They are commonly used in:

text analysis
social media analysis
document exploration

library(wordcloud2)

# Default word cloud built from the demoFreq data.
wordcloud2(data = demoFreq, size = 1.6)

# Use a proposed palette.
wordcloud2(data = demoFreq, size = 1.6, color = "random-dark")

# Or supply a vector of colours; it must be the same length as the input data.
wordcloud2(
  data = demoFreq,
  size = 1.6,
  color = rep_len(c("green", "blue"), nrow(demoFreq))
)

# Change the background colour.
wordcloud2(
  data = demoFreq,
  size = 1.6,
  color = "random-light",
  backgroundColor = "black"
)

Explanation

word size reflects frequency
color adds visual variation

You can customize the word cloud shape using the shape argument. Available shapes are:

circle
cardioid
diamond
triangle-forward
triangle
pentagon
star

# Draw the word cloud in a star shape via the `shape` argument.
wordcloud2(data = demoFreq, size = 0.7, shape = "star")

Homework

Task 1. Pie chart with ggplot

Use the mpg dataset to create a table counting the number of vehicles in each class. Then create a pie chart showing the distribution of vehicle classes.

Create a simple pie chart in base R using the following values: 3, 7, 9, 1, 2

Label the slices:Gr-A, Gr-B, Gr-C, Gr-D, Gr-E

Requirements:

use count(class) from dplyr
use geom_col()
convert the plot to a pie chart with coord_polar(“y”)
use theme_void()
add a title

library(ggplot2)
library(dplyr)

# Load the mpg dataset.
data("mpg")

# Tally the vehicles in each class; count() stores the tally in column `n`.
class_counts <- count(mpg, class)

# Show the table of counts.
class_counts

# ggplot2 pie chart: one stacked bar wrapped into a circle.
ggplot(data = class_counts, mapping = aes(x = "", y = n, fill = class)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Vehicle Class Distribution") +
  theme_void()

# Base R pie chart with one label per slice.
values <- c(3, 7, 9, 1, 2)
labels <- c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E")
pie(x = values, labels = labels, main = "Pie Chart")

Questions

Which vehicle class appears most common? SUVs appear to be the most common.
Why can pie charts become difficult to interpret when there are many categories? It's very hard to tell different proportions from one another when too many categories are shown.

Task 2: Donut Chart

Create a donut chart using this small dataset:

# Three categories and their counts.
data <- data.frame(
  category = c("A", "B", "C"),
  count = c(10, 60, 30)
)

# The bar sits at x = 2 while the x range starts at 0.5, which leaves the
# centre empty and turns the pie into a donut. White borders separate slices.
ggplot(data = data, mapping = aes(x = 2, y = count, fill = category)) +
  geom_col(color = "white") +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  labs(title = "Donut Chart") +
  theme_void()

  # percentage labels
  #geom_text(aes(x = 4.2, y = label_pos, label = label), size = 4) + #add percentage labels

Add labels that show either:

the percentages, or
the category names and values

library(ggplot2)

# Three categories and their counts.
data <- data.frame(
  category = c("A", "B", "C"),
  counts = c(10, 60, 30)
)

# Donut chart with a percentage label on each slice.
ggplot(data = data, mapping = aes(x = 3, y = counts, fill = category)) +
  geom_col(color = "white") +
  geom_text(
    # Each label is the slice's share of the total, rounded to one decimal.
    mapping = aes(label = paste0(round(counts / sum(counts) * 100, 1), "%")),
    # vjust = 0.5 centres each label within its stacked segment.
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  coord_polar(theta = "y") +
  xlim(2, 4.5) +
  labs(title = "Donut Chart with Percentage Labels") +
  theme_void()

Questions

Why do we need to calculate label positions manually for this plot? Donut graphs work in a circle and usually R uses cartesian points instead of polar so this must be calculated to make up for that.
Which label type is easier to read: percentages only, or category plus value? Percentages are easier to read, and legends can be added to distinguish between categories.

Task 3: Sunburst Plot

Create the following dataset:

# Each row pairs a hyphen-separated "category-subcategory" path with a value.
sunburst_data <- data.frame(
  sequence = c(
    "Fruit-Apples", "Fruit-Bananas", "Fruit-Oranges",
    "Vegetable-Carrots", "Vegetable-Broccoli", "Vegetable-Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Use the sunburstR package to create a sunburst plot.

# Install sunburstR only when it is missing, so re-running or knitting this
# document does not reinstall the package (and need network access) every time.
if (!requireNamespace("sunburstR", quietly = TRUE)) {
  install.packages("sunburstR")
}

library(dplyr)
library (sunburstR)


# Path/value table: "category-subcategory" strings and the size of each path.
sunburst_data <- data.frame(
  sequence = c(
    "Fruit-Apples", "Fruit-Bananas", "Fruit-Oranges",
    "Vegetable-Carrots", "Vegetable-Broccoli", "Vegetable-Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

# Draw the sunburst plot from the table.
sunburst(sunburst_data)

Task 4: Treemap

Using this dataset:

# Two-level table: three subcategories under each of the two categories.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Create a treemap.

Requirements

use index = c(“category”, “subcategory”)
use vSize = “value”
include a title

library(ggplot2)
# Install treemap only when it is missing. The original call asked for a
# package named "treem", which is not the package loaded below, so the install
# failed with a "package is not available" warning.
if (!requireNamespace("treemap", quietly = TRUE)) {
  install.packages("treemap")
}

library(treemap)


# Two-level table: three subcategories under each of the two categories.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

# Rectangles are nested by category then subcategory and sized by `value`.
treemap(
  sales_data,
  title = "Treemap of Sales Data",
  vSize = "value",
  index = c("category", "subcategory")
)

How is a treemap similar to a sunburst chart? Both display data with categories and subcategories, and proportions for each.
How is it different? Treemaps don't overlay the values together as well, so it's a little harder to compare values.
Which one do you think is easier to read? Sunburst plots are easier to read because the overlap makes it easier to compare subcategory values.

Task 5: Dendrogram

Create a simple hierarchy with:

one origin
four groups
three subgroups per group

Then convert it into a graph and plot it as a dendrogram. Add labels and points to the ends of the branches.

library(ggraph)
library(igraph)
library(tidyverse)

# Top level: a single root named "origin" fans out into four groups.
level1 <- data.frame(
  from = "origin",
  to = paste0("group_", seq_len(4))
)

# Second level: every group gets three subgroups, numbered 1 to 12 overall.
level2 <- data.frame(
  from = rep(level1$to, each = 3),
  to = paste0("subgroup_", seq_len(12))
)

# Stack both levels into one edge list (columns: from, to).
edges <- bind_rows(level1, level2)

# Build a graph object from the edge list.
graph <- graph_from_data_frame(edges)

# Draw the hierarchy as a non-circular dendrogram with a point on every node.
ggraph(graph, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()

# Leaf labels: text is rotated 90 degrees and nudged just below the branch
# tips; the lower y limit is extended to leave room for the text.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.01
  ) +
  ylim(-0.4, NA)

# Leaf labels plus a semi-transparent point at the end of each branch; the
# labels are nudged further down so they do not sit on top of the points.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(aes(filter = leaf), alpha = 0.6) +
  ylim(-0.5, NA)

Task 6: Word Cloud

Run the following code to create a word-based dataset from the text of Jane Austen's novels. Annotate the lines to ensure you know how it was built. This uses real text and produces meaningful words.

library(janeaustenr)
library(tidytext)
library(dplyr)
library(wordcloud2)
#loading packages

# Load the text of Jane Austen's books.
text <- austen_books()

# Break the text into individual words, then count how often each word
# occurs, ordered from most to least common.
word_counts <- text %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Load the table of common stop words such as "the" and "and".
data("stop_words")

# Pay special attention to this step; it is important for word clouds.
# Drop every stop word from the counts. The join key is named explicitly so
# the match does not depend on whichever columns the two tables happen to
# share, and dplyr no longer prints its "Joining with `by = ...`" message.
word_counts <- word_counts %>%
  anti_join(stop_words, by = "word")

Create a word cloud of the most commonly used words in the text.

# Install wordcloud2 only when it is missing, so re-running or knitting this
# document does not reinstall the package (and need network access) every time.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install.packages("wordcloud2")
}

library(wordcloud2)
library(dplyr)

# Keep the rows with the 100 highest counts.
word_counts <- slice_max(word_counts, order_by = n, n = 100)

# Default word cloud of the retained words.
wordcloud2(data = word_counts, size = 1.6)

# Use a proposed palette.
wordcloud2(data = word_counts, size = 1.6, color = "random-dark")

# Or supply a vector of colours; it must be the same length as the input data.
wordcloud2(
  data = word_counts,
  size = 1.6,
  color = rep_len(c("green", "blue"), nrow(word_counts))
)

# Change the background colour.
wordcloud2(
  data = word_counts,
  size = 1.6,
  color = "random-light",
  backgroundColor = "black"
)

Pie and Donut Charts, Dendrograms, Circular Packing, Sunburst, & WordCloud

Pie Charts

Donut Charts

Sunburst Plots

Treemap

Circular Packing

Dendrograms

Circular Dendrograms

Word Clouds

Homework

Task 1. Pie chart with ggplot

Task 2: Donut Chart

Task 3: Sunburst Plot

Task 4: Treemap

Task 5: Dendrogram

Task 6: Word Cloud