Homework

Task 1. Pie chart with ggplot

Use the mpg dataset to create a table counting the number of vehicles in each class. Then create a pie chart showing the distribution of vehicle classes.

Create a simple pie chart in base R using the following values: 3, 7, 9, 1, 2

Label the slices: Gr-A, Gr-B, Gr-C, Gr-D, Gr-E

Requirements:

use count(class) from dplyr
use geom_col()
convert the plot to a pie chart with coord_polar("y")
use theme_void()
add a title

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Fuel-economy data bundled with ggplot2
data(mpg)

# Number of vehicles in each class; the chart below reads the `class` and
# `n` columns of this table
count_dat <- count(mpg, class)

# Pie chart 1 (ggplot2, not base R): a single stacked bar whose segment
# heights are the class counts, wrapped onto polar coordinates so that each
# class becomes a slice
ggplot(data = count_dat, mapping = aes(x = "", y = n, fill = class)) +
  geom_col() +
  coord_polar("y") +
  labs(title = "Vehicle Class Distribution") +
  theme_void()

# Pie chart 2: base R graphics
dat1 <- c(3, 7, 9, 1, 2)
# One label per slice: "Gr-" followed by the letters A to E
labs1 <- paste0("Gr-", LETTERS[1:5])

pie(x = dat1, labels = labs1)

Questions

Which vehicle class appears most common? SUV
Why can pie charts become difficult to interpret when there are many categories? With many categories the slices become thin and similar in size, so it is difficult to judge the counts from the slice angles or areas.

Task 2: Create a simple pie chart in base R using the following values:

Create a donut chart using this small dataset:

# Three categories (A, B, C) and their counts, used by the Task 2 charts
data2 <- data.frame(
  category = LETTERS[1:3],
  count = c(10, 60, 30)
)

# Base R pie chart of the Task 2 counts, one labelled slice per category
with(data2, pie(count, labels = category))

# Donut chart (ggplot2): the stacked bar sits at x = 2 while the x axis
# starts at 0.5, so the unused inner range becomes the hole once the plot is
# wrapped onto polar coordinates
library(ggplot2)
library(dplyr)
ggplot(data = data2, mapping = aes(x = 2, y = count, fill = category)) +
  geom_col() +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  labs(title = "Donut Pie Chart") +
  theme_void()

Add labels that show either:

the percentages, or
the category names and values

library(dplyr)
library(scales)
library(ggplot2)
# Slice geometry and label text for the labelled donut chart
data2_new <- mutate(
  data2,
  prop = count / sum(count),      # share of the total for each category
  ymax = cumsum(prop),            # upper boundary of each slice
  ymin = c(0, head(ymax, -1)),    # lower boundary = previous slice's ymax
  label_pos = (ymin + ymax) / 2,  # midpoint of the slice, used for the text
  label = percent(prop)           # proportion formatted as a percentage
)

# Donut chart with percentage labels: each slice is a rectangle spanning
# x = 3..4 and its own ymin..ymax, wrapped onto polar coordinates; the x axis
# starts at 2 so the range below 3 is left empty as the hole
ggplot(
  data = data2_new,
  mapping = aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)
) +
  geom_rect() +
  # percent label, centred in the ring (x = 3.5) at the slice midpoint
  geom_text(aes(x = 3.5, y = label_pos, label = label), size = 4) +
  coord_polar("y") +
  xlim(2, 4.5) +
  labs(title = "Donut Pie Chart With Percentages") +
  theme_void()

Questions

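Why do we need to calculate label positions manually for this plot? Because the slices are drawn from explicit ymin/ymax boundaries, geom_text() has no position of its own for each label; we have to supply the midpoint of every slice so that each label sits inside the slice it describes.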
Which label type is easier to read: percentages only, or category plus value? It depends on the data: category and value could provide more information, but percentages are simpler.

Task 3: Sunburst Plot

Create the following dataset:

# Hierarchy paths written as "parent-child", with one value per path
sunburst_data <- data.frame(
  sequence = c(
    paste("Fruit", c("Apples", "Bananas", "Oranges"), sep = "-"),
    paste("Vegetable", c("Carrots", "Broccoli", "Peppers"), sep = "-")
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Use the sunburstR package to create a sunburst plot.

#install.packages("sunburstR")
library(sunburstR)

## Warning: package 'sunburstR' was built under R version 4.5.3

# Sunburst plot built from the sequence/value table defined above
sunburst(sunburst_data)

Task 4: Treemap

Using this dataset:

# Two categories with three subcategories each, and one value per subcategory
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Create a treemap.

Requirements

use index = c("category", "subcategory")
use vSize = "value"
include a title

#install.packages("treemap")
library(treemap)

## Warning: package 'treemap' was built under R version 4.5.3

library(dplyr)
library(ggplot2)

# Treemap of the sales data: rectangles are sized by `value` and nested by
# category, then subcategory
treemap(
  dtf = sales_data,
  index = c("category", "subcategory"),
  vSize = "value",
  title = "Tree Map Fruits and Veggies"
)

How is a treemap similar to a sunburst chart? They both display categories and subcategories of data, with area being the marker for the frequency or count of each variable.
How is it different? The treemap has no live interaction.
Which one do you think is easier to read? Treemaps.

Task 5: Dendrogram

Create a simple hierarchy with:

one origin
four groups
three subgroups per group

Then convert it into a graph and plot it as a dendrogram. Add labels and points to the ends of the branches.

# library 
#install.packages("ggraph")
library(ggraph)

## Warning: package 'ggraph' was built under R version 4.5.3

library(igraph)

## Warning: package 'igraph' was built under R version 4.5.3

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%()       masks igraph::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor()     masks scales::col_factor()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ tidyr::crossing()       masks igraph::crossing()
## ✖ purrr::discard()        masks scales::discard()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::simplify()       masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

# Level 1: one edge from the single origin to each of the four groups.
level1 <- data.frame(
  from = "origin",
  to = paste0("group", 1:4)
)

# Level 2: three subgroup edges per group.
# rep(..., each = 3) repeats every group name three times in a row, so
# subgroup1-3 hang off group1, subgroup4-6 off group2, and so on. Passing
# `each = 3` straight to data.frame() would instead create a stray `each`
# column and recycle the four group names cyclically.
level2 <- data.frame(
  from = rep(level1$to, each = 3),
  to = paste0("subgroup", 1:12)
)

# Stack both edge lists into a single from/to table
edges <- dplyr::bind_rows(level1, level2)

# Build an igraph object from the edge table so it can be laid out and plotted
graph1 <- igraph::graph_from_data_frame(d = edges)

# Plot the hierarchy as a dendrogram.
# `leaf` is a node variable provided by the dendrogram layout; filtering on
# it restricts the points and labels to the ends of the branches.
ggraph(graph1, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point(aes(filter = leaf)) +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.05
  ) +
  ylim(-0.6, NA) +  # leave room below the leaves for the rotated labels
  theme_void() +
  labs(title = "Dendrogram")

Task 6: Word Cloud

Run the following code to create a word-based dataset from the text of Jane Austen's novels (loaded with the janeaustenr package). Annotate the lines to ensure you know how it was built. This uses real text and produces meaningful words.

library(janeaustenr)

## Warning: package 'janeaustenr' was built under R version 4.5.3

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.5.3

library(dplyr)
library(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 4.5.3

# Load the stop-word table (very common words such as "and")
data("stop_words")

# Text of the Jane Austen novels provided by janeaustenr
text <- austen_books()

# Split the text into one word per row, count each word (most frequent
# first), then drop the stop words. Removing stop words matters for word
# clouds: otherwise the most common filler words would dominate the plot.
word_counts <- text %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words)

## Joining with `by = join_by(word)`

Create a word cloud of the most commonly used words in the text.

# Word cloud of the stop-word-free word frequencies
wordcloud2(data = word_counts)

Lesson 8 Homework

Andrew Hand

2026-04-01