Homework

Task 1: Pie chart with ggplot

Vehicle Class Distribution:

install.packages("ggplot2") #install ggplot
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("dplyr") #install dplyr
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggplot2) #ensure ggplot installed
library(dplyr) #ensure dplyr installed
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Tally how many vehicles in the mpg data fall into each class.
class_counts <- count(mpg, class)

# A pie chart here is a single stacked bar (one slice per class, height = n)
# redrawn in polar coordinates with the counts mapped to the angle.
ggplot(class_counts, aes(x = "", y = n, fill = class)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Vehicle Class Distribution") +
  theme_void()

Questions

  • Which vehicle class appears most common? SUVs appear most often.

  • Why can pie charts become difficult to interpret when there are many categories? The slices become too thin for us to compare their sizes accurately.

Create Pie Chart Given Values:

# Slice sizes given in the assignment.
Prop <- c(3, 7, 9, 1, 2)

# One label per slice, shared by the labelled charts below.
group_labels <- c("Gr-A", "Gr-B", "Gr-C", "Gr-D", "Gr-E")

# 1) Bare pie chart with default labels.
pie(x = Prop)

# 2) Same chart with the group labels attached.
pie(x = Prop, labels = group_labels)

# 3) Replace the solid fill with shading lines: `density` sets how closely
#    the lines are spaced and `angle` gives the line slope for each slice.
pie(
  x = Prop,
  labels = group_labels,
  density = 10,
  angle = c(20, 90, 30, 10, 0)
)

Task 2: Create a simple pie chart in base R using given values:

Donut Chart:

install.packages("scales") #download scales package
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(scales) #ensure scales downloaded
# Toy input: three categories and their raw counts.
data <- data.frame(
  category = c("A", "B", "C"),
  count = c(10, 60, 30)
)

# Derive the geometry of each ring segment from the counts. Each segment
# spans [ymin, ymax] on the cumulative-proportion scale.
df <- mutate(
  data,
  prop = count / sum(count), # share of the total
  ymax = cumsum(prop), # where the segment ends
  ymin = dplyr::lag(ymax, default = 0), # where it starts (previous segment's end)
  label_pos = (ymin + ymax) / 2, # midpoint of the segment, used for the label
  label = scales::percent(prop) # share formatted as a percentage string
)

# Rectangles between x = 3 and x = 4 become a ring once y is mapped to the
# angle; the x limits start at 2 to leave the hole in the middle.
ggplot(
  df,
  aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)
) +
  geom_rect() +
  geom_text(aes(x = 4.2, y = label_pos, label = label), size = 4) +
  coord_polar(theta = "y") +
  xlim(2, 4.5) +
  theme_void()

Questions

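  • Why do we need to calculate label positions manually for this plot? ggplot2 does not compute them automatically: geom_rect() only draws each slice from its ymin/ymax bounds, so we calculate the midpoint of each slice ourselves to place its label.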

  • Which label type is easier to read: percentages only, or category plus value? In terms of a pie chart, percentages are easier to read.

Task 3: Sunburst Plot

Create the following dataset:

# Two-level hierarchy encoded as "parent-child" strings, with one value per
# leaf. The sequences are assembled from their parent and child parts.
sunburst_data <- data.frame(
  sequence = paste(
    rep(c("Fruit", "Vegetable"), each = 3),
    c("Apples", "Bananas", "Oranges", "Carrots", "Broccoli", "Peppers"),
    sep = "-"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Create Sunburst Plot:

install.packages("sunburstR") #install sunburst package
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(sunburstR) #ensure sunburst package installed
# Draw the sunburst from the sequence/value table built above.
sunburst(data = sunburst_data)
Legend

Task 4: Treemap

Using this dataset:

# Same fruit/vegetable values as the sunburst data, but with the two
# hierarchy levels stored in separate columns.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)

Create Treemap

install.packages("treemap") #install treemap package
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(treemap) #ensure treemap installed

# Nested treemap: rectangles are grouped by category, split by subcategory,
# and sized by the `value` column.
treemap(
  dtf = sales_data,
  index = c("category", "subcategory"),
  vSize = "value",
  title = "Treemap of Category and Subcategory Counts"
)

  • How is a treemap similar to a sunburst chart? Both show categories and their subcategories sized by value.

  • How is it different? Sunburst charts are circular and more difficult to read, while treemaps are rectangular.

  • Which one do you think is easier to read? I think treemaps are easier to read and interpret.

Task 5: Dendrogram

install.packages("ggraph") #install ggraph
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("igraph") #install igraph
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("tidyverse") #install tidyverse
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggraph) #ensure ggraph installed
library(igraph) #ensure igraph installed
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(tidyverse) #ensure tidyverse installed
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%()       masks igraph::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor()     masks scales::col_factor()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ tidyr::crossing()       masks igraph::crossing()
## ✖ purrr::discard()        masks scales::discard()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::simplify()       masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Level 1 of the edge list: the root node fans out to four groups.
level1 <- data.frame(
  from = "origin",
  to = paste0("group_", 1:4)
)

# Level 2 of the edge list: each group fans out to three subgroups,
# giving twelve leaves in total.
level2 <- data.frame(
  from = rep(level1[["to"]], each = 3),
  to = paste0("subgroup_", 1:12)
)

# Stack both levels into one from/to table and turn it into a graph.
edges <- bind_rows(level1, level2)
graph <- graph_from_data_frame(edges)

# Plot 1: bare dendrogram with a point at every node.
ggraph(graph, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()

# Plot 2: label the leaves only. The text is rotated and nudged below the
# nodes, and the lower y limit is extended to make room for it.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.01
  ) +
  ylim(-0.4, NA)

# Plot 3: leaf labels plus a semi-transparent point on each leaf. The labels
# are nudged further down than in plot 2.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(label = name, filter = leaf),
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(aes(filter = leaf), alpha = 0.6) +
  ylim(-0.5, NA)

Task 6: Word Cloud

install.packages("janeaustenr") #install janeaustenr 
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("tidytext") #install tidytext for text cleanup
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("wordcloud2") #install wordcloud to create word clouds
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(janeaustenr) #ensure janeaustenr installed
library(tidytext) #ensure tidytext installed
library(wordcloud2) #ensure wordcloud installed

text <- austen_books() # load the Jane Austen novels dataset

# Split the `text` column into one word per row, then tally each word with
# the most frequent first.
word_counts <- text %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

data("stop_words") # load the stop-word table

# Drop every word that appears in the stop-word table. The join key is named
# explicitly so dplyr does not have to guess it (and does not print a
# "Joining with `by = ...`" message).
word_counts <- word_counts %>%
  anti_join(stop_words, by = "word")

Create a word cloud of the most commonly used words in Jane Austen's novels.

# Render the word/count table as a star-shaped word cloud.
wordcloud2(
  data = word_counts,
  size = 0.8,
  shape = "star"
)