Pie charts show how a whole is divided into parts.
Each slice represents the proportion of a category relative to the total.
They are commonly used when:
However, pie charts can become difficult to interpret when there are many slices.
We will use the mpg dataset and count the number of vehicles in each class.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Make the `mpg` data set available in the workspace.
data("mpg")

# Tally the vehicles in each class; `count()` stores the tally in column `n`.
class_counts <- count(mpg, class)

# Pie chart: a single stacked bar (one empty x category) wrapped into polar
# coordinates, so each class's bar height becomes its slice angle.
ggplot(data = class_counts, mapping = aes(x = "", y = n, fill = class)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "Vehicle Class Distribution")
Explanation
# Slice sizes for the base-R pie examples.
Prop <- c(3, 7, 9, 1, 2)

# Plain pie: slices are labelled with their index.
pie(Prop)

# Same pie with a custom label on every slice.
pie(
  Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E")
)

# Shade the slices with lines instead of solid colour: `density` sets how
# closely the lines are spaced and `angle` gives each slice its own slope.
pie(
  Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E"),
  density = 10,
  angle = c(20, 90, 30, 10, 0)
)
Donut charts are similar to pie charts but include a hole in the center. This can make labels easier to read and reduce visual clutter.
# Donut chart: the stacked bar sits at x = 2 and the x-axis is stretched down
# to 0.5, so the unused inner range becomes the hole once wrapped into polar
# coordinates.
ggplot(data = class_counts, mapping = aes(x = 2, y = n, fill = class)) +
  geom_col(width = 1) +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "Vehicle Class Distribution (Donut Chart)")
Explanation
The donut hole is created by manipulating the x-axis limits.
To get percentage labels positioned around the donut like in your example, you need to:
library(ggplot2)
library(dplyr)
library(scales)
# Toy data for the labelled donut: three categories and their values.
df <- data.frame(
  category = LETTERS[1:3],
  value = c(10, 60, 30)
)
# Derive everything the labelled donut needs from the raw values:
#   prop      - share of the total held by each category
#   ymax      - where each slice ends (running total of the shares)
#   ymin      - where each slice starts (the previous slice's end, 0 for the first)
#   label_pos - midpoint of the slice, used to centre its label on the arc
#   label     - the share formatted as a percentage string
df <- mutate(
  df,
  prop = value / sum(value),
  ymax = cumsum(prop),
  ymin = dplyr::lag(ymax, default = 0),
  label_pos = (ymax + ymin) / 2,
  label = percent(prop)
)
# Draw each slice as a rectangle spanning x 3..4 and ymin..ymax, then bend the
# y-axis into a circle. The x range starts at 2 to leave the hole and runs to
# 4.5 so the labels placed at x = 4.2 sit just outside the ring.
ggplot(
  data = df,
  mapping = aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
) +
  geom_rect() +
  # Percentage label at the angular midpoint of each slice.
  geom_text(mapping = aes(x = 4.2, y = label_pos, label = label), size = 4) +
  coord_polar(theta = "y") +
  xlim(2, 4.5) +
  theme_void()
# Toy data: three categories and their counts.
data <- data.frame(
  category = c("A", "B", "C"),
  count = c(10, 60, 30)
)

# Share of the total held by each category.
data[["fraction"]] <- data[["count"]] / sum(data[["count"]])

# Upper edge of each ring segment: the running total of the shares.
data[["ymax"]] <- cumsum(data[["fraction"]])

# Lower edge of each segment: the previous segment's upper edge, 0 for the first.
data[["ymin"]] <- c(0, data[["ymax"]][-nrow(data)])

# Labels go halfway between the two edges of their segment.
data[["labelPosition"]] <- (data[["ymax"]] + data[["ymin"]]) / 2

# Two-line label: category name, then its count.
data[["label"]] <- paste0(data[["category"]], "\n value: ", data[["count"]])
# Donut with the label boxes drawn on the ring itself (x = 3.5 is the middle of
# the 3..4 band). The legend is hidden because each label already names its
# category.
ggplot(
  data = data,
  mapping = aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
) +
  geom_rect() +
  geom_label(mapping = aes(y = labelPosition, label = label), x = 3.5, size = 6) +
  scale_fill_brewer(palette = 4) +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +
  theme_void() +
  theme(legend.position = "none")
# Install sunburstR only when it is missing. An unconditional
# install.packages() call re-downloads the package on every run or knit and
# fails when there is no network connection.
if (!requireNamespace("sunburstR", quietly = TRUE)) {
  install.packages("sunburstR")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(sunburstR)
library(dplyr)
# Two-level hierarchy for the sunburst: each `sequence` value is a path whose
# levels are joined with "-" (parent first), and `value` is the size of that path.
sunburst_data <- data.frame(
  sequence = c(
    paste("Fruit", c("Apples", "Bananas", "Oranges"), sep = "-"),
    paste("Vegetable", c("Carrots", "Broccoli", "Peppers"), sep = "-")
  ),
  value = c(30, 20, 25, 15, 18, 12)
)
# Render the sunburst from the path/value table.
sunburst(data = sunburst_data)
Think of a sunburst chart as a nested donut plot that can show a category within a category. Sunburst charts are not ideal when:
library(treemap)
library(dplyr)
# Same fruit/vegetable numbers as a tidy table: one row per subcategory, with
# its parent category alongside.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)
# Treemap: rectangles are nested by category, then subcategory, and sized by
# `value`.
treemap(
  dtf = sales_data,
  index = c("category", "subcategory"),
  vSize = "value",
  title = "Treemap of Category and Subcategory Counts"
)
They can become hard to read if:
Circular packing visualizes nested hierarchical structures.
Large circles represent categories, while smaller circles inside represent subcategories.
This is useful for:
library(packcircles)
library(ggforce)
# Sizes of the circles to pack.
sizes <- c(50, 30, 20, 10, 5)

# Compute a packed layout for those sizes; the plot below reads the `x`, `y`
# and `radius` columns of the result.
circle_data <- circleProgressiveLayout(sizes)
library(ggplot2)
# Draw one circle per layout row. coord_equal() keeps the x and y scales the
# same so the circles are not stretched into ellipses.
ggplot(data = circle_data) +
  geom_circle(
    mapping = aes(x0 = x, y0 = y, r = radius),
    fill = "skyblue",
    color = "black"
  ) +
  coord_equal() +
  theme_void()
Explanation
library(ggraph)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%() masks igraph::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::compose() masks igraph::compose()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::simplify() masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:scales':
##
## viridis_pal
# Circle packing needs hierarchical data. The flare dataset supplies it as an
# edge list plus a table of vertex attributes.
edges <- flare[["edges"]]
vertices <- flare[["vertices"]]

# Build the graph object, attaching the vertex attributes to the nodes.
mygraph <- graph_from_data_frame(d = edges, vertices = vertices)

# Pack the hierarchy into nested circles; `weight = size` sizes each circle
# from the `size` column of the vertex table.
ggraph(mygraph, layout = "circlepack", weight = size) +
  geom_node_circle() +
  theme_void()
# Same packing, now filled by `depth` so nesting levels are told apart by
# colour. The viridis scale supplies the palette and the legend sits on top.
ggraph(mygraph, layout = "circlepack", weight = size) +
  geom_node_circle(mapping = aes(fill = depth)) +
  scale_fill_viridis() +
  theme_void() +
  theme(legend.position = "top")
# Circle packing with a text label on every leaf node.
#   * `filter = leaf` restricts the labels to leaf nodes.
#   * `size = size` scales each label with its node's `size` value.
#   * The text layer takes no `fill` aesthetic; mapping one there only produces
#     an "Ignoring unknown aesthetics: fill" warning, so it is not mapped.
#   * The legend is hidden with "none", the documented value for
#     `legend.position` (the string "FALSE" is not one of them).
ggraph(mygraph, layout = "circlepack", weight = size) +
  geom_node_circle(aes(fill = depth)) +
  geom_node_text(aes(label = shortName, filter = leaf, size = size)) +
  theme_void() +
  theme(legend.position = "none") +
  scale_fill_viridis()
## Warning in geom_node_text(aes(label = shortName, filter = leaf, fill = depth, :
## Ignoring unknown aesthetics: fill
Dendrograms visualize hierarchical clustering.
They show how observations group together based on similarity.
Common uses include:
# Load packages
library(ggraph)
library(igraph)
library(tidyverse)
# Top level of the hierarchy: a single root, "origin", with five groups below it.
level1 <- data.frame(
  from = "origin",
  to = paste0("group_", seq_len(5))
)

# Second level: every group gets five subgroups of its own (25 in total).
level2 <- data.frame(
  from = rep(level1[["to"]], each = 5),
  to = paste0("subgroup_", seq_len(25))
)

# Stack both levels into one from/to edge list.
edges <- bind_rows(level1, level2)

# Convert the edge list into a graph object.
graph <- graph_from_data_frame(edges)

# Lay the graph out as a (non-circular) dendrogram with curved branches and a
# point on every node.
ggraph(graph, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()
# Dendrogram with a name on every leaf. The labels are rotated 90 degrees and
# nudged slightly below the branch ends; the y-axis is extended downwards to
# make room for them.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    mapping = aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.01
  ) +
  ylim(-.4, NA)
# Labelled dendrogram with a semi-transparent point at the end of every branch.
# The labels are nudged further down so they do not overlap the points.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    mapping = aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(mapping = aes(filter = leaf), alpha = 0.6) +
  ylim(-.5, NA)
Interpretation
# Circular version of the dendrogram, with curved branches.
ggraph(
  graph,
  layout = "dendrogram",
  circular = TRUE
) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()
# Circular dendrogram with straight links in place of the curved branches.
ggraph(
  graph,
  layout = "dendrogram",
  circular = TRUE
) +
  geom_edge_link() +
  geom_node_point() +
  theme_void()
# Note: adding labels to circular plots is quite difficult.
Word clouds visualize text frequency.
Words that appear more often are displayed larger.
They are commonly used in:
library(wordcloud2)
# Default word cloud of the bundled demoFreq word/frequency table.
wordcloud2(data = demoFreq, size = 1.6)

# Colours from a built-in random palette.
wordcloud2(data = demoFreq, size = 1.6, color = "random-dark")

# Colours from an explicit vector, which must have one entry per row of the
# input data; here green and blue are recycled to that length.
wordcloud2(
  data = demoFreq,
  size = 1.6,
  color = rep_len(c("green", "blue"), nrow(demoFreq))
)

# Light random colours on a black background.
wordcloud2(
  data = demoFreq,
  size = 1.6,
  color = "random-light",
  backgroundColor = "black"
)
Explanation
You can customize the word cloud's shape using the shape argument. Available shapes are:
# Arrange the words in a star-shaped outline instead of the default shape.
wordcloud2(data = demoFreq, size = 0.7, shape = "star")
Use the mpg dataset to create a table counting the number of vehicles in each class. Then create a pie chart showing the distribution of vehicle classes.
Create a simple pie chart in base R using the following values: 3, 7, 9, 1, 2
Label the slices: Gr-A, Gr-B, Gr-C, Gr-D, Gr-E
Requirements:
# Exercise answer: pie chart of vehicles per class, built as a stacked bar
# (width 0.5) wrapped into polar coordinates.
ggplot(data = class_counts, mapping = aes(x = "", y = n, fill = class)) +
  geom_col(width = 0.5) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "Vehicle Class Distribution")
# Exercise answer: base R pie chart of the five given values.
Prop <- c(3, 7, 9, 1, 2)

# Unlabelled version first ...
pie(Prop)

# ... then the same pie with the requested slice labels.
pie(
  Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E")
)
Questions
The SUV car class seems to appear the most
It is generally hard for the human brain to compare the sizes of slices in a circle. With many categories, it can be hard to see each category's proportion and how it compares to the others.
Create a donut chart using this small dataset:
# Small dataset for the donut exercise: three categories and their counts.
data <- data.frame(
  category = LETTERS[1:3],
  count = c(10, 60, 30)
)
# Exercise answer: donut chart. The bar sits at x = 2 and the x-axis is
# stretched down to 0.5, which leaves the hole once wrapped into polar
# coordinates.
ggplot(data = data, mapping = aes(x = 2, y = count, fill = category)) +
  geom_col(width = 1) +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(
    title = "Donut Chart of DIY Numbers",
    fill = "Category"
  )
Add labels that show either:
# Add the columns the labelled donut needs:
#   prop      - share of the total count
#   ymax      - where each slice ends (running total of the shares)
#   ymin      - where each slice starts (the previous slice's end, 0 for the first)
#   label_pos - midpoint of the slice, used to centre its label on the arc
#   label     - the share formatted as a percentage string
DonutDF <- mutate(
  data,
  prop = count / sum(count),
  ymax = cumsum(prop),
  ymin = dplyr::lag(ymax, default = 0),
  label_pos = (ymax + ymin) / 2,
  label = percent(prop)
)

# Ring drawn as rectangles spanning x 3..4, with the percentage labels placed
# just outside it at x = 4.2.
ggplot(
  data = DonutDF,
  mapping = aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
) +
  geom_rect() +
  geom_text(mapping = aes(x = 4.2, y = label_pos, label = label), size = 4) +
  coord_polar(theta = "y") +
  xlim(2, 4.5) +
  theme_void()
Questions
If label positions were not manually calculated then they would not align with the correct information
Percentage values are easier to read
Create the following dataset:
# Exercise dataset: "-"-separated category/subcategory paths and their values.
sunburst_data <- data.frame(
  sequence = c(
    paste("Fruit", c("Apples", "Bananas", "Oranges"), sep = "-"),
    paste("Vegetable", c("Carrots", "Broccoli", "Peppers"), sep = "-")
  ),
  value = c(30, 20, 25, 15, 18, 12)
)
Use the sunburstR package to create a sunburst plot.
# Exercise answer: sunburst plot of the path/value table.
sunburst(data = sunburst_data)
Using this dataset:
# Exercise dataset: one row per subcategory, with its parent category and value.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)
Create a treemap.
Requirements
# Exercise answer: treemap nested by category then subcategory, with rectangle
# area taken from `value`. The title spelling is corrected ("Vegetables").
treemap(
  sales_data,
  index = c("category", "subcategory"),
  vSize = "value",
  title = "Treemap of Fruits and Vegetables"
)
A tree map and sunburst chart both show subcategories inside categories
A sunburst chart is interpreted much like a donut chart, while a treemap looks like filled boxes on a piece of paper whose sizes show the counts.
I think that the treemap is easier to read
Create a simple hierarchy with:
Then convert it into a graph and plot it as a dendrogram. Add labels and points to the ends of the branches.
# Top level: a single root, "origin", with four groups below it.
level1 <- data.frame(
  from = "origin",
  to = paste0("group_", 1:4)
)

# Second level: three subgroups under each group (12 in total).
level2 <- data.frame(
  from = rep(level1$to, each = 3),
  to = paste0("subgroup_", 1:12)
)

# Stack both levels into one from/to edge list.
edges <- bind_rows(level1, level2)

# Convert the edge list into a graph object, as the exercise asks.
graph <- graph_from_data_frame(edges)

# Plot the hierarchy as a dendrogram with a label and a point at the end of
# every branch. The y-axis is extended downwards so the rotated labels fit.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(aes(filter = leaf), alpha = 0.6) +
  ylim(-0.5, NA) +
  theme_void()
Run the following code to create a word-based dataset from the text of Jane Austen's novels (provided by the janeaustenr package). Annotate the lines to ensure you know how it was built. This uses real text and produces meaningful words.
library(janeaustenr)
library(tidytext)
library(dplyr)
library(wordcloud2)
# Load the text of Jane Austen's novels from the janeaustenr package. The
# result has a `text` column, which is tokenised below.
text <- austen_books()

# Split the `text` column into one word per row (new column `word`), then
# count how often each word occurs, most frequent first (count column `n`).
word_counts <- text %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Load the stop-word table (very common words such as "the" and "and").
data("stop_words")

# Drop every word that appears in the stop-word table. The join key is given
# explicitly so the step does not depend on dplyr guessing the shared column.
word_counts <- word_counts %>%
  anti_join(stop_words, by = "word")
## Joining with `by = join_by(word)`
Create a word cloud of the most commonly used words in the text.
# Plot only the most frequent words, as the task asks. Passing the whole
# vocabulary makes the widget slow and crowds out the words that matter.
# `with_ties = FALSE` caps the selection at exactly 150 rows.
word_counts %>%
  slice_max(order_by = n, n = 150, with_ties = FALSE) %>%
  wordcloud2(size = 1.6, color = "random-light", backgroundColor = "black")