library(tidyverse)
library(lubridate)
library(scales)
library(ggrepel)
library(tidyquant)
library(jsonlite)
# Make theme_minimal() the default ggplot2 theme for the plots that follow.
theme_set(theme_minimal())
# Switch the LC_TIME locale to US English. invisible() hides the value
# returned by Sys.setlocale() so it is not printed.
# NOTE(review): the locale name "en_US.UTF-8" is platform-specific -- confirm
# it is available on the machine that renders this document.
invisible(Sys.setlocale("LC_TIME", "en_US.UTF-8"))

Let’s look solely at the packages that are listed on CRAN. These are:

ggwordcloud: A Word Cloud Geom for ‘ggplot2’
hwordcloud: Rendering Word Clouds
modelwordcloud: Model Word Clouds
wordcloud: Word Clouds
wordcloud2: Create Word Cloud by ‘htmlwidget’

Other packages which seem to have WC functionality are:

PubMedWordcloud: ‘Pubmed’ Word Clouds
MadanText: Persian Text Mining Tool for Frequency Analysis, Statistical Analysis, and Word Clouds

Sample text data

# Download the Berkshire Hathaway shareholder letters for 2013-2023 and keep
# each one as a single cleaned string, keyed by year.
# NOTE: this object masks base R's built-in `letters` constant.
letters <- list()

for (y in 2013:2023) {
  # progress indicator
  cat(y, "\n")
  url <- paste0("https://www.berkshirehathaway.com/letters/", y, "ltr.pdf")

  # read the PDF text and join everything into one string
  pages <- pdftools::pdf_text(url)
  textout <- paste(pages, collapse = " ")

  # collapse every run of whitespace (\n, \t, \r, ...) into a single space
  textout <- stringr::str_replace_all(textout, "\\s+", " ")
  # blank out every character that is not a letter
  textout <- stringr::str_replace_all(textout, "[^[:alpha:]]", " ")
  # blank out words that consist of a single character
  textout <- stringr::str_replace_all(textout, "\\b\\w\\b", " ")
  # squeeze the runs of spaces created above back down to one space
  textout <- stringr::str_replace_all(textout, "\\s+", " ")

  letters[[as.character(y)]] <- textout
}

# Cache the downloaded letters on disk so the download does not have to be
# repeated. saveRDS() does not create missing directories, so make sure the
# target folder exists first (a no-op when it is already there).
dir.create("data/wordclouds", recursive = TRUE, showWarnings = FALSE)
saveRDS(letters, "data/wordclouds/letters.RDS")

# Load the cached letters (named list: year -> cleaned letter text).
letters <- readRDS("data/wordclouds/letters.RDS")

library(tidytext)

# Word frequencies across all letters combined.
# The letters are pasted into one string and split into one word per row.
# Original capitalisation is kept (to_lower = FALSE), which is why stop words
# are removed both in their lower-case and in their Title Case form.
word_freq <- tibble(value = paste(letters, collapse = " ")) |>
  unnest_tokens(word, value, to_lower = FALSE) |>
  filter(
    !(word %in% tidytext::stop_words$word),
    !(word %in% str_to_title(tidytext::stop_words$word))
  ) |>
  # one row per distinct word, most frequent first
  count(word, sort = TRUE) |>
  # keep words that occur more than 10 times and have more than one character
  filter(n > 10, str_length(word) > 1)

Package comparison

ggwordcloud

library(ggwordcloud)

# Word cloud of the 50 most frequent words; text size encodes the count.
word_freq |>
  head(50) |>
  ggplot(aes(label = word, size = n)) +
  # area_corr: set the font size so that the word's area is proportional to
  # the size aesthetic when scale_size_area() is used
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size: size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()

With colors:

# Same cloud, with each word coloured by its (lower-cased) first letter.
word_freq |>
  head(50) |>
  mutate(first_letter = str_to_lower(str_sub(word, 1, 1))) |>
  ggplot(aes(label = word, size = n, color = first_letter)) +
  # area_corr: set the font size so that the word's area is proportional to
  # the size aesthetic when scale_size_area() is used
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size: size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()

With angles:

# Coloured cloud with words rotated in 45-degree steps (-90 to 90).
# The sampling weights make 0 degrees four times as likely as each other
# angle. The seed fixes the random angles.
set.seed(123)
word_freq |>
  head(50) |>
  mutate(
    first_letter = str_to_lower(str_sub(word, 1, 1)),
    angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 1, 4, 1, 1))
  ) |>
  ggplot(aes(label = word, size = n, color = first_letter, angle = angle)) +
  # area_corr: set the font size so that the word's area is proportional to
  # the size aesthetic when scale_size_area() is used
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size: size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()

# Coloured cloud where every word gets a random whole-degree rotation
# between -70 and 70. The seed fixes the random angles.
set.seed(123)
word_freq |>
  head(50) |>
  mutate(
    first_letter = str_to_lower(str_sub(word, 1, 1)),
    angle = sample(-70:70, n(), replace = TRUE)
  ) |>
  ggplot(aes(label = word, size = n, color = first_letter, angle = angle)) +
  # area_corr: set the font size so that the word's area is proportional to
  # the size aesthetic when scale_size_area() is used
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size: size of the largest word
  scale_size_area(max_size = 50) +
  theme_minimal()

With ggplot facets:

# Word frequencies per letter (one group per year `y`).
# The result is deliberately left grouped by `y`: the facet plot that follows
# uses slice_max() to pick the top words within each year.
word_freq_y <- letters |>
  # one row per letter, text in a list-column named `value`
  as_tibble_col() |>
  # flatten the list-column; `cols` is given explicitly because calling
  # unnest() without it is deprecated and raises a warning
  unnest(cols = value) |>
  # attach the year, taken from the names of the list
  mutate(y = names(letters)) |>
  group_by(y) |>
  # keep the original capitalisation, hence the two stop-word filters below
  unnest_tokens(word, value, to_lower = FALSE) |>
  filter(
    !(word %in% tidytext::stop_words$word),
    !(word %in% str_to_title(tidytext::stop_words$word))
  ) |>
  count(word, sort = TRUE) |>
  # keep words that occur more than 10 times and have more than one character
  filter(n > 10, str_length(word) > 1)

# One small word cloud per year: the top 10 words by count of each group in
# word_freq_y, randomly rotated between -70 and 70 degrees and coloured by
# first letter. The seed fixes the random angles.
set.seed(123)
word_freq_y |>
  slice_max(n, n = 10) |>
  mutate(
    first_letter = str_to_lower(str_sub(word, 1, 1)),
    angle = sample(-70:70, n(), replace = TRUE)
  ) |>
  ggplot(aes(label = word, size = n, color = first_letter, angle = angle)) +
  # area_corr: set the font size so that the word's area is proportional to
  # the size aesthetic when scale_size_area() is used
  geom_text_wordcloud(area_corr = TRUE) +
  # max_size: size of the largest word (smaller here than in the single plots)
  scale_size_area(max_size = 20) +
  theme_minimal() +
  facet_wrap(~y)

hwordcloud

hwordcloud outputs an interactive htmlwidget. So it’s great for interactive markdown documents or Shiny apps. Not great for PDF reports.

library(hwordcloud)

# Top 50 words as input for the interactive widget.
df <- head(word_freq, 50)

hwordcloud(
  text = df[["word"]],
  size = df[["n"]],
  width = "100%",
  height = "200px",
  subtitle = "Word Freq"
)

There does not seem to be an argument to remove or change the caption (Highcharts.com).

modelwordcloud

Package was last updated in September 2017.

The wordcloud function can handle linear model outputs; however, it can also be used with simple data (words and frequencies). Colors can be used as well.

library(modelwordcloud)

# Default modelwordcloud output for the first 50 rows of word_freq.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n)
)

With some random coloring.

# First 50 rows, with colours drawn at random from the 8-colour "Accent"
# palette.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE
)

Lots of words. Compared to ggwordcloud, the generation is much quicker.

# Same randomly coloured cloud, now with the first 200 rows of word_freq.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:200, ], word),
  freq = pull(word_freq[1:200, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE
)

Even more words.

# Same randomly coloured cloud, now with the first 500 rows of word_freq.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:500, ], word),
  freq = pull(word_freq[1:500, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE
)

With random rotation (20% rotated).

# First 500 rows, random colours, with rot_per = 0.2 for the share of
# rotated words.
modelwordcloud::wordcloud(
  words = pull(word_freq[1:500, ], word),
  freq = pull(word_freq[1:500, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random_color = TRUE,
  rot_per = 0.2
)

wordcloud

library(wordcloud)

# Default wordcloud output for the first 50 rows of word_freq.
wordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n)
)

With some random coloring.

# First 50 rows, with colours drawn at random from the 8-colour "Accent"
# palette.
wordcloud::wordcloud(
  words = pull(word_freq[1:50, ], word),
  freq = pull(word_freq[1:50, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random.color = TRUE
)

Lots of words. Speed and output are very similar to modelwordcloud. Are they relying on the same underlying functions?

Colors are definitely better than modelwordcloud.

But one problem is that the most important word has gone missing: Berkshire.

Rotation is left at its default (0.1, i.e. 10% of the words are rotated).

# Randomly coloured cloud with the first 200 rows of word_freq; the rotation
# share is not set here, so the package default applies.
wordcloud::wordcloud(
  words = pull(word_freq[1:200, ], word),
  freq = pull(word_freq[1:200, ], n),
  colors = RColorBrewer::brewer.pal(8, "Accent"),
  random.color = TRUE
)

wordcloud2

library(wordcloud2)

# The count column is renamed from `n` to `freq` before the table is passed
# to wordcloud2().
word_freq |>
  rename(freq = n) |>
  wordcloud2::wordcloud2()

wordcloud2 relies on wordcloud2.js so it’s interactive (htmlwidget) only.

PubMedWordcloud

PubMedWordcloud takes input in the same format as wordcloud2, but its output looks like that of the wordcloud or modelwordcloud packages.

library(PubMedWordcloud)

# Default PubMedWordcloud output; the count column is renamed from `n` to
# `freq` first.
word_freq |>
  rename(freq = n) |>
  plotWordCloud()

# Customised PubMedWordcloud output with all plot options spelled out.
# `TRUE` is written in full (the shorthand `T` can be reassigned), and
# brewer.pal() is namespace-qualified like everywhere else in this document,
# so the call does not depend on another package having attached
# RColorBrewer.
plotWordCloud(
  word_freq |> rename(freq = n),
  scale = c(3, 0.3),
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  use.r.layout = TRUE,
  colors = RColorBrewer::brewer.pal(8, "Accent")
)

MadanText

Gives errors when installing, I can’t be bothered, sorry.

Conclusion

For interactive wordclouds (e.g. in an interactive report or a Shiny app) you can use hwordcloud or wordcloud2. wordcloud2 wins as the output looks better and it’s more customizable.

For static wordclouds you can use ggwordcloud, wordcloud, modelwordcloud, or PubMedWordcloud. ggwordcloud comes in handy if you want to work in the ggplot2 environment. Its main advantage is that it supports facets (facet_wrap etc.), which is great for comparing sources. On the downside, ggwordcloud seems slower than the other packages, especially wordcloud. I’d recommend wordcloud for quickly creating a static wordcloud. Strangely, though, with wordcloud some key words go missing when too many words are used.

modelwordcloud is also good.

Comparison of Word Cloud R Packages

Martin Geissmann

2024-12-22