Homework

Task 1. Pie chart with ggplot

Vehicle & Class Distribution:

install.packages("ggplot2") #install ggplot

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

install.packages("dplyr") #install dplyr

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

library(ggplot2) #ensure ggplot installed
library(dplyr) #ensure dplyr installed

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Tally the rows of mpg per vehicle class; the tally lands in column `n`
class_counts <- count(mpg, class)

# Pie chart: a single stacked bar wrapped into polar coordinates
ggplot(data = class_counts, mapping = aes(x = "", y = n, fill = class)) +
  geom_col(width = 1) + # one full-width bar, stacked by class
  coord_polar(theta = "y") + # bar height becomes the slice angle
  theme_void() + # strip axes, grid lines, and background
  labs(title = "Vehicle Class Distribution")

Questions

Which vehicle class appears most common? SUVs appear most often.
Why can pie charts become difficult to interpret when there are many categories? The slices become too thin for us to compare them effectively.

Create Pie Chart Given Values:

# Slice sizes supplied by the assignment
Prop <- c(3, 7, 9, 1, 2)

# Step 1: pie chart with no custom labels
pie(x = Prop)

# Step 2: same pie with the group names attached
pie(
  x = Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E")
)

# Step 3: fill the slices with shading lines; `density` controls how
# closely the lines are spaced and `angle` gives one line angle per slice
pie(
  x = Prop,
  labels = c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E"),
  density = 10,
  angle = c(20, 90, 30, 10, 0)
)

Task 2: Create a simple pie chart in base R using given values:

Donut Chart:

install.packages("scales") #download scales package

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

library(scales) #ensure scales downloaded
# Toy input for the donut chart: one row per category with its raw count
data <- data.frame(category = c("A", "B", "C"), count = c(10, 60, 30))

# Derive the geometry of each donut slice from the raw counts
df <- mutate(
  data,
  prop = count / sum(count), # share of the total
  ymax = cumsum(prop), # running total: where the slice ends
  ymin = lag(ymax, default = 0), # previous slice's end: where it starts
  label_pos = (ymin + ymax) / 2, # slice midpoint, used to place the label
  label = percent(prop) # share formatted as a percentage string
)

# Donut chart: rectangles spanning x in [3, 4], bent into a ring
ggplot(
  data = df,
  mapping = aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)
) +
  geom_rect() + # one rectangle per category
  coord_polar(theta = "y") + # y is mapped to the angle
  geom_text(
    mapping = aes(x = 4.2, y = label_pos, label = label),
    size = 4
  ) + # percentage labels at x = 4.2, just beyond the rectangles' xmax
  xlim(2, 4.5) + # x range starts below xmin = 3, leaving the centre empty
  theme_void() # strip axes, grid lines, and background

Questions

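Why do we need to calculate label positions manually for this plot? The slices are drawn with geom_rect() from explicit ymin/ymax values, so the label positions aren’t automatically calculated; we have to compute the midpoint of each slice ourselves.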
Which label type is easier to read: percentages only, or category plus value? In terms of a pie chart, percentages are easier to read.

Task 3: Sunburst Plot

Create the following dataset:

# Input for the sunburst plot: each `sequence` is a "parent-child" path
# joined with a hyphen, paired with the size of that leaf in `value`
sunburst_data <- data.frame(
  sequence = c(
    "Fruit-Apples", "Fruit-Bananas", "Fruit-Oranges",
    "Vegetable-Carrots", "Vegetable-Broccoli", "Vegetable-Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Create Sunburst Plot:

install.packages("sunburstR") #install sunburst package

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

library(sunburstR) #ensure sunburst package installed
sunburst(data = sunburst_data) # draw the sunburst chart from sunburst_data

Task 4: Treemap

Using this dataset:

# Two-level sales table: three subcategories under each category
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Create Treemap

install.packages("treemap") #install treemap package

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

library(treemap) #ensure treemap installed

# Nested treemap of sales_data: tiles grouped by category, then subcategory
treemap(
  dtf = sales_data,
  index = c("category", "subcategory"), # hierarchy, outermost level first
  vSize = "value", # column that determines tile area
  title = "Treemap of Category and Subcategory Counts"
)

How is a treemap similar to a sunburst chart? It shows a range of categories sized by value.
How is it different? Sunburst charts are circular and more difficult to read, while treemaps are rectangular.
Which one do you think is easier to read? I think treemaps are easier to read and interpret.

Task 5: Dendrogram

install.packages("ggraph") #install ggraph

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

install.packages("igraph") #install igraph

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

install.packages("tidyverse") #install tidyverse

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

library(ggraph) #ensure ggraph installed
library(igraph) #ensure igraph installed

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(tidyverse) #ensure tidyverse installed

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%()       masks igraph::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor()     masks scales::col_factor()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ tidyr::crossing()       masks igraph::crossing()
## ✖ purrr::discard()        masks scales::discard()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::simplify()       masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# First tier of the hierarchy: the root fans out to four groups
level1 <- data.frame(
  from = "origin",
  to = paste0("group_", seq_len(4))
)

# Second tier: each group is the parent of three consecutive subgroups,
# giving twelve subgroups in total
level2 <- data.frame(
  from = rep(level1$to, each = 3),
  to = paste0("subgroup_", seq_len(12))
)

# Stack both tiers into a single from/to edge list
edges <- bind_rows(level1, level2)

# Build the graph object from that edge list
graph <- graph_from_data_frame(d = edges)

# Plain dendrogram: diagonal edges with a point on every node
ggraph(graph = graph, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void() # strip axes, grid lines, and background

# Dendrogram with rotated name labels on the leaf nodes
ggraph(graph = graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    mapping = aes(label = name, filter = leaf), # label leaf nodes only
    angle = 90,
    hjust = 1,
    nudge_y = -0.01
  ) +
  ylim(-0.4, NA) # lower y limit set to -0.4; upper limit left automatic

# Dendrogram with rotated leaf labels plus a translucent point on each leaf
ggraph(graph = graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    mapping = aes(label = name, filter = leaf), # label leaf nodes only
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(mapping = aes(filter = leaf), alpha = 0.6) +
  ylim(-0.5, NA) # lower y limit set to -0.5; upper limit left automatic

Task 6: Word Cloud

install.packages("janeaustenr") #install janeaustenr

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

install.packages("tidytext") #install tidytext for text cleanup

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

install.packages("wordcloud2") #install wordcloud to create word clouds

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)

library(janeaustenr) #ensure janeaustenr installed
library(tidytext) #ensure tidytext installed
library(wordcloud2) #ensure wordcloud installed

# Store the result of austen_books() under a shorter name
text <- austen_books()

# Tokenise the `text` column into a `word` column, then tally each word
# with the counts sorted in descending order
word_counts <- text %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE)

# Load the stop_words dataset into the workspace
data("stop_words")

# Keep only the rows of word_counts that have no match in stop_words
word_counts <- anti_join(word_counts, stop_words)

## Joining with `by = join_by(word)`

Create a word cloud of the most commonly used words in the Jane Austen text.

wordcloud2(data = word_counts, size = 0.8, shape = "star") # star-shaped word cloud

Lesson 9 HW

Leo Colburn

2026-04-05

Homework

Task 1. Pie chart with ggplot

Vehicle & Class Distribution:

Create Pie Chart Given Values:

Task 2: Create a simple pie chart in base R using given values:

Donut Chart:

Task 3: Sunburst Plot

Create Sunburst Plot:

Task 4: Treemap

Create Treemap

Task 5: Dendrogram

Task 6: Word Cloud