library(tidyverse)
library(lubridate)
library(scales)
library(ggrepel)
library(tidyquant)
library(jsonlite)
# Make the minimal theme the default for every ggplot2 plot in this document.
ggplot2::theme_set(ggplot2::theme_minimal())
# Switch the date/time locale to US English; invisible() hides the value
# that Sys.setlocale() returns.
invisible(Sys.setlocale(category = "LC_TIME", locale = "en_US.UTF-8"))
Let’s look solely at the packages that are listed on CRAN. These are:
Other packages which seem to have word cloud (WC) functionality are:
# Download the Berkshire Hathaway shareholder letters for 2013-2023 and reduce
# each one to a single string of space-separated alphabetic words.
# NOTE: `letters` masks base R's built-in `letters` constant; the name is kept
# because the rest of the script refers to it.
letters <- list()
for (y in 2013:2023) {
  # Progress indicator: print the year currently being processed.
  cat(y, "\n")
  url <- paste0("https://www.berkshirehathaway.com/letters/", y, "ltr.pdf")
  textout <- pdftools::pdf_text(url) |>
    # pdf_text() returns one string per page; join them into one string
    paste(collapse = " ") |>
    # collapse every run of whitespace (newlines, tabs, ...) to a single space
    stringr::str_replace_all("\\s+", " ") |>
    # replace every non-letter character (digits, punctuation) with a space
    stringr::str_replace_all("[^[:alpha:]]", " ") |>
    # blank out words that consist of a single character
    stringr::str_replace_all("\\b\\w\\b", " ") |>
    # squeeze the runs of spaces created above back down to one space
    stringr::str_replace_all("\\s+", " ")
  # Store the cleaned text under its year, e.g. letters[["2013"]].
  letters[[as.character(y)]] <- textout
}
# Cache the cleaned letters on disk. saveRDS() does not create missing
# directories, so make sure the target folder exists first.
dir.create("data/wordclouds", recursive = TRUE, showWarnings = FALSE)
saveRDS(letters, "data/wordclouds/letters.RDS")
letters <- readRDS("data/wordclouds/letters.RDS")
library(tidytext)
# Word frequencies across all letters combined.
# Tokens keep their original case (to_lower = FALSE), so stop words are
# filtered in both their lower-case and Title-Case forms. Only words that
# occur more than 10 times and are longer than one character are kept.
# tibble(value = ...) builds the one-column input explicitly instead of
# calling as_tibble() on a bare character vector.
word_freq <- tibble(value = paste(letters, collapse = " ")) |>
  unnest_tokens(word, value, to_lower = FALSE) |>
  filter(
    !(word %in% tidytext::stop_words$word),
    !(word %in% str_to_title(tidytext::stop_words$word))
  ) |>
  count(word, sort = TRUE) |>
  filter(n > 10, str_length(word) > 1)
library(ggwordcloud)
# Word cloud of the first 50 rows of word_freq, with text size mapped to n.
word_freq |>
  head(50) |>
  ggplot(aes(label = word, size = n)) +
  # area_corr = TRUE: font size is chosen so that the text area is
  # proportional to the size aesthetic (used together with scale_size_area)
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size caps the size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()
With colors:
# Same cloud as before, with each word coloured by its lower-cased first letter.
word_freq |>
  head(50) |>
  mutate(first_letter = str_to_lower(str_sub(word, 1, 1))) |>
  ggplot(aes(label = word, size = n, color = first_letter)) +
  # area_corr = TRUE: text area proportional to the size aesthetic
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size caps the size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()
With angles:
# Coloured cloud with rotated words. Angles are multiples of 45 degrees drawn
# from -90..90, with 0 (horizontal) weighted four times as heavily as each
# other angle. The seed fixes the random draw.
set.seed(123)
word_freq |>
  head(50) |>
  mutate(
    first_letter = str_to_lower(str_sub(word, 1, 1)),
    angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 1, 4, 1, 1))
  ) |>
  ggplot(aes(label = word, size = n, color = first_letter, angle = angle)) +
  # area_corr = TRUE: text area proportional to the size aesthetic
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size caps the size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()
# Coloured cloud where every word gets a random whole-degree angle between
# -70 and 70. The seed fixes the random draw.
set.seed(123)
word_freq |>
  head(50) |>
  mutate(
    first_letter = str_to_lower(str_sub(word, 1, 1)),
    angle = sample(-70:70, n(), replace = TRUE)
  ) |>
  ggplot(aes(label = word, size = n, color = first_letter, angle = angle)) +
  # area_corr = TRUE: text area proportional to the size aesthetic
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size caps the size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()
With ggplot facets:
# Per-letter word frequencies: one row per (year, word) pair.
# The result stays grouped by `y` (count() keeps the existing grouping), which
# the faceted plot relies on when it selects the top words per year.
word_freq_y <- letters |>
  # one row per letter: `y` is the list name (the year), `value` is the text;
  # taking names and values together avoids matching them up by position
  enframe(name = "y", value = "value") |>
  # `cols` is given explicitly; calling unnest() without it is deprecated
  unnest(cols = value) |>
  group_by(y) |>
  # keep the original case, as in the overall word_freq table
  unnest_tokens(word, value, to_lower = FALSE) |>
  # drop stop words in both lower-case and Title-Case form
  filter(
    !(word %in% tidytext::stop_words$word),
    !(word %in% str_to_title(tidytext::stop_words$word))
  ) |>
  count(word, sort = TRUE) |>
  # keep words seen more than 10 times in a letter and longer than 1 character
  filter(n > 10, str_length(word) > 1)
# One small word cloud per year. word_freq_y is grouped by y, so slice_max()
# keeps the 10 highest counts within each year (ties included). The seed fixes
# the random angles.
set.seed(123)
word_freq_y |>
  slice_max(n, n = 10) |>
  mutate(
    first_letter = str_to_lower(str_sub(word, 1, 1)),
    angle = sample(-70:70, n(), replace = TRUE)
  ) |>
  ggplot(aes(label = word, size = n, color = first_letter, angle = angle)) +
  # area_corr = TRUE: text area proportional to the size aesthetic
  geom_text_wordcloud(area_corr = TRUE) +
  # smaller maximum size than the single-panel clouds
  scale_size_area(max_size = 20) +
  theme_minimal() +
  facet_wrap(vars(y))
hwordcloud outputs an interactive htmlwidget. So it’s great for interactive markdown documents or Shiny apps. Not great for PDF reports.
library(hwordcloud)
# Interactive word cloud built from the first 50 rows of word_freq.
df <- head(word_freq, 50)
hwordcloud(
  text = df[["word"]],
  size = df[["n"]],
  width = "100%",
  height = "200px",
  subtitle = "Word Freq"
)
There does not seem to be an argument to remove or change the caption (Highcharts.com).
Package was last updated in September 2017.
The wordcloud function can handle linear model outputs; however, it can also be used with simple data (words and frequencies). Colors can be used as well.
library(modelwordcloud)
# Default modelwordcloud plot of rows 1-50 of word_freq.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n)
)
With some random coloring.
# Rows 1-50 again, with colours picked at random from the 8-colour
# "Accent" palette.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE
)
Lots of words. Compared to ggwordcloud, the generation is much quicker.
# Larger cloud: rows 1-200 of word_freq, random "Accent" colours.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:200, ], word),
  freq = pull(word_freq[1:200, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE
)
Even more words.
# Even larger cloud: rows 1-500 of word_freq, random "Accent" colours.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:500, ], word),
  freq = pull(word_freq[1:500, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE
)
With random rotation (20% rotated).
# Rows 1-500 with random "Accent" colours and rot_per = 0.2, i.e. 20% of the
# words rotated.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:500, ], word),
  freq = pull(word_freq[1:500, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE,
  rot_per = 0.2
)
library(wordcloud)
# Default wordcloud plot of rows 1-50 of word_freq.
wordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n)
)
With some random coloring.
# Rows 1-50 with colours picked at random from the 8-colour "Accent" palette.
wordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random.color = TRUE
)
Lots of words. Speed and output are very similar to modelwordcloud. Are they relying on the same underlying functions?
Colors are definitely better than modelwordcloud.
But one problem is that the most important word has gone missing: Berkshire.
Rotation is default (0.1=10%).
# Larger cloud: rows 1-200 of word_freq with random "Accent" colours.
# No rotation argument is passed, so the package default applies.
wordcloud::wordcloud(
  words = pull(word_freq[1:200, ], word),
  freq = pull(word_freq[1:200, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random.color = TRUE
)
library(wordcloud2)
# Interactive wordcloud2 widget over all of word_freq; the count column is
# renamed from `n` to `freq` before it is passed in.
word_freq |>
  rename(freq = n) |>
  wordcloud2::wordcloud2()
wordcloud2 relies on wordcloud2.js so it’s interactive (htmlwidget) only.
PubMedWordcloud inputs same format as wordcloud2 but output looks like from the wordcloud or modelwordcloud packages.
library(PubMedWordcloud)
# PubMedWordcloud plots of word_freq; the count column is renamed from `n`
# to `freq` before it is passed in.
# First call: package defaults.
plotWordCloud(rename(word_freq, freq = n))
# Second call: explicit layout settings and the "Accent" palette.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, and brewer.pal() is namespace-qualified like every other call in
# this script so it does not depend on RColorBrewer being attached.
plotWordCloud(
  rename(word_freq, freq = n),
  scale = c(3, 0.3), min.freq = 1, max.words = 200,
  random.order = FALSE, rot.per = 0.1, use.r.layout = TRUE,
  colors = RColorBrewer::brewer.pal(8, "Accent")
)
Gives errors when installing, I can’t be bothered, sorry.
For interactive wordclouds (e.g. in an interactive report or a Shiny app) you can use hwordcloud or wordcloud2. wordcloud2 wins as its output looks better and it’s more customizable.
For static wordclouds you can use ggwordcloud, wordcloud, modelwordcloud, or PubMedWordcloud. ggwordcloud comes in handy if you want to work in the ggplot2 environment. In particular, it supports facets (facet_wrap etc.), which is great for comparing sources. On the downside, ggwordcloud seems slower than the other packages, especially wordcloud, which I’d recommend for quickly creating a static wordcloud. Strangely, though, wordcloud drops some key words when too many words are used.
modelwordcloud is also good.