install.packages("ggplot2") #install ggplot
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("dplyr") #install dplyr
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggplot2) #ensure ggplot installed
library(dplyr) #ensure dplyr installed
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Tally the rows of `mpg` per vehicle class; the tally column is `n`,
# which the plot below maps to the slice size.
class_counts <- count(mpg, class)

# Draw a single stacked bar of the class tallies and bend the y axis around
# a circle, which turns the stack into a pie chart.
ggplot(data = class_counts, mapping = aes(x = "", y = n, fill = class)) +
  geom_col(width = 1) +         # full-width bar so the pie has no hole
  coord_polar(theta = "y") +    # map bar height to angle
  labs(title = "Vehicle Class Distribution") +
  theme_void()                  # drop axes, grid and background
Questions
Which vehicle class appears most common? SUVs appear most often.
Why can pie charts become difficult to interpret when there are many categories? The slices become too thin to tell apart or to compare accurately.
# Slice sizes given for the exercise.
Prop <- c(3, 7, 9, 1, 2)

# 1) Plain pie chart with the default slice labels.
pie(Prop)

# 2) Same chart, labelled with the group names "Gr-A" ... "Gr-E".
pie(Prop, labels = paste0("Gr-", LETTERS[1:5]))

# 3) Labelled chart with hatched slices: `density` sets how closely the
#    shading lines are spaced and `angle` gives each slice its own slope.
pie(
  Prop,
  labels = paste0("Gr-", LETTERS[1:5]),
  density = 10,
  angle = c(20, 90, 30, 10, 0)
)
install.packages("scales") #download scales package
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(scales) #ensure scales downloaded
# Toy input: three categories with their raw counts.
data <- data.frame(
  category = c("A", "B", "C"),
  count = c(10, 60, 30)
)

# For every category, work out its share of the total and the band
# [ymin, ymax] its slice covers on the cumulative 0-1 scale.
df <- mutate(
  data,
  prop = count / sum(count),
  ymax = cumsum(prop),
  ymin = c(0, head(ymax, -1)),     # a slice starts where the previous one ends
  label_pos = (ymin + ymax) / 2,   # midpoint of the slice, used for the label
  label = percent(prop)            # share formatted as a percentage string
)

# Each slice is a rectangle between x = 3 and x = 4; in polar coordinates the
# rectangles become ring segments. The x range starts at 2, below the inner
# edge of the rectangles, which leaves the centre of the chart empty.
ggplot(df, aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)) +
  geom_rect() +
  geom_text(aes(x = 4.2, y = label_pos, label = label), size = 4) +  # labels just outside the ring
  coord_polar(theta = "y") +
  xlim(2, 4.5) +
  theme_void()
Questions
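Why do we need to calculate label positions manually for this plot? Because the slices are drawn as rectangles with geom_rect(), ggplot2 does not place the labels for us; we have to compute the midpoint of each slice (label_pos) ourselves.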
Which label type is easier to read: percentages only, or category plus value? In terms of a pie chart, percentages are easier to read.
Create the following dataset:
# Two-level hierarchy for the sunburst chart: every `sequence` entry is a
# "<group>-<item>" path and `value` is the size attached to that path.
sunburst_data <- data.frame(
  sequence = c(
    paste("Fruit", c("Apples", "Bananas", "Oranges"), sep = "-"),
    paste("Vegetable", c("Carrots", "Broccoli", "Peppers"), sep = "-")
  ),
  value = c(30, 20, 25, 15, 18, 12)
)
install.packages("sunburstR") #install sunburst package
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(sunburstR) #ensure sunburst package installed
sunburst(sunburst_data) # Render the sunburst chart from sunburst_data (sequence + value columns). NOTE(review): presumably sunburstR splits each sequence on "-" to get the hierarchy levels -- confirm against the package docs.
Using this dataset:
# Same figures as a tidy table: one row per subcategory, with its parent
# category and its value in separate columns.
sales_data <- data.frame(
  category = rep(c("Fruit", "Vegetable"), each = 3),
  subcategory = c(
    "Apples", "Bananas", "Oranges",
    "Carrots", "Broccoli", "Peppers"
  ),
  value = c(30, 20, 25, 15, 18, 12)
)
install.packages("treemap") #install treemap package
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(treemap) #ensure treemap installed
# Treemap of sales_data: rectangles are nested by category and then by
# subcategory, and each rectangle's area is taken from the `value` column.
treemap(
  sales_data,
  title = "Treemap of Category and Subcategory Counts",
  vSize = "value",
  index = c("category", "subcategory")
)
How is a treemap similar to a sunburst chart? It shows a range of categories sized by value.
How is it different? Sunburst charts are circular and more difficult to read, while treemaps are rectangular.
Which one do you think is easier to read? I think treemaps are easier to read and interpret.
install.packages("ggraph") #install ggraph
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("igraph") #install igraph
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("tidyverse") #install tidyverse
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggraph) #ensure ggraph installed
library(igraph) #ensure igraph installed
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(tidyverse) #ensure tidyverse installed
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%() masks igraph::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::compose() masks igraph::compose()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::simplify() masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Top level of the tree: one edge from the root ("origin") to each of the
# four groups.
level1 <- data.frame(
  from = rep("origin", 4),
  to = sprintf("group_%d", 1:4)
)

# Second level: every group is the parent of three consecutive subgroups,
# giving twelve leaves in total.
level2 <- data.frame(
  from = rep(level1[["to"]], each = 3),
  to = sprintf("subgroup_%d", 1:12)
)
# Stack both levels into one from/to edge list, then build the graph
# object from that edge list.
edges <- level1 %>% bind_rows(level2)
graph <- edges %>% graph_from_data_frame()
# Plot 1: bare dendrogram -- curved edges, a point on every node, no axes.
ggraph(graph, layout = "dendrogram", circular = FALSE) +
  geom_edge_diagonal() +
  geom_node_point() +
  theme_void()

# Plot 2: label the leaves only. The text is rotated to read vertically and
# nudged just below each leaf; the lower y limit is widened so the labels
# are not cut off.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(filter = leaf, label = name),
    angle = 90,
    hjust = 1,
    nudge_y = -0.01
  ) +
  ylim(-0.4, NA)

# Plot 3: as plot 2, plus a semi-transparent point on every leaf. The labels
# are nudged a little further down to leave room for the points.
ggraph(graph, layout = "dendrogram") +
  geom_edge_diagonal() +
  geom_node_text(
    aes(filter = leaf, label = name),
    angle = 90,
    hjust = 1,
    nudge_y = -0.04
  ) +
  geom_node_point(aes(filter = leaf), alpha = 0.6) +
  ylim(-0.5, NA)
install.packages("janeaustenr") #install janeaustenr
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("tidytext") #install tidytext for text cleanup
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
install.packages("wordcloud2") #install wordcloud to create word clouds
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(janeaustenr) #ensure janeaustenr installed
library(tidytext) #ensure tidytext installed
library(wordcloud2) #ensure wordcloud installed
# Keep the result of austen_books() under a short name; the pipeline below
# reads its `text` column.
text <- austen_books()

# Split the `text` column into one token per row (stored in a new `word`
# column), then tally each word and sort the tallies from most to least
# frequent.
word_counts <- text %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Load the stop-word table used for filtering.
data("stop_words")

# Drop every row whose word appears in stop_words. The join key is stated
# explicitly so the match does not depend on whichever column names the two
# tables happen to share, and so no "Joining with ..." message is emitted.
word_counts <- word_counts %>%
  anti_join(stop_words, by = "word")
Create a word cloud of the most commonly used words in Jane Austen's books.
# Draw the word cloud from the filtered word tallies, laid out in a star shape.
wordcloud2(word_counts, shape = "star", size = 0.8)