library(tidyverse)
library(DT)
library(tidytext)        # package for text analysis
library(readxl)          # reads excel files, the format I used for the data
  1. Read in the text and unnest the words.
manifestos <- read_excel("manifestos .xlsx")

manifestos
manifestos_words <- manifestos %>%
  unnest_tokens(word, text)

manifestos_words
NA

Here I read in the text and unnested the words.

  1. Generate a table that includes both lexical diversity and density, and the total number of words, of each document.
manifestos_words %>% 
  group_by(author) %>% 
  summarise(num_words = n(),
            lex_diversity = n_distinct(word), 
            lex_density = n_distinct(word)/n())

This is a table showing lexical diversity and density, and the total number of words, of each document.

  1. Generate a table with the mean word length of each document.
manifestos_words %>%
  group_by(author) %>% 
  mutate(word_length = nchar(word)) %>% 
  summarize(mean_word_length = mean(word_length)) %>% 
  arrange(-mean_word_length)

This is a table showing the mean word length of each document.

  1. Generate a graph with mini histograms of each document’s word lengths.
manifestos_words %>%
  mutate(word_length = nchar(word)) %>% 
  ggplot(aes(word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(author), scales = "free_y") +
  labs(title = "Word Lengths by Document")

These are histograms showing the mean word length in each document.

  1. Remove stop words and then create a graph with the most common words in each document.

manifestos_words %>%
  anti_join(stop_words) %>% 
  group_by(author) %>% 
  count(word, sort = T) %>%
  top_n(5) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining, by = "word"
Selecting by n

These are graphs showing the most common words in each document, not including stop words.

  1. Calculate tf-idfs and create a graph of the words with the highest tf-idfs in each document.
manifestos_word_counts <- suicide_notes %>%             # This counts each word per author
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE) 

total_words <- manifestos_word_counts %>%               # This counts total words per author
  group_by(author) %>% 
  summarize(total = sum(n))

manifestos_word_counts <- left_join(manifestos_word_counts, total_words)    # Joins the two
Joining, by = "author"
manifestos_tf_idf <- manifestos_word_counts %>%             # Calculates tf-idf
  bind_tf_idf(word, author, n)

manifestos_tf_idf %>%                                   # Displays it
  arrange(-tf_idf)                          
NA

This shows the calculation of the tf-idfs.

manifestos_tf_idf %>%
  arrange(-tf_idf) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(author) %>% 
  top_n(5) %>% 
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(title = "Most distinctive words in each manifesto")
Selecting by tf_idf

And here is the graph showing the tf-idfs.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KERUKQpsaWJyYXJ5KHRpZHl0ZXh0KSAgICAgICAgIyBwYWNrYWdlIGZvciB0ZXh0IGFuYWx5c2lzCmxpYnJhcnkocmVhZHhsKSAgICAgICAgICAjIHJlYWRzIGV4Y2VsIGZpbGVzLCB0aGUgZm9ybWF0IEkgdXNlZCBmb3IgdGhlIGRhdGEKCmBgYAoKMS4gUmVhZCBpbiB0aGUgdGV4dCBhbmQgdW5uZXN0IHRoZSB3b3Jkcy4gIAoKYGBge3J9Cm1hbmlmZXN0b3MgPC0gcmVhZF9leGNlbCgibWFuaWZlc3RvcyAueGxzeCIpCgptYW5pZmVzdG9zCmBgYAoKYGBge3J9Cm1hbmlmZXN0b3Nfd29yZHMgPC0gbWFuaWZlc3RvcyAlPiUKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpCgptYW5pZmVzdG9zX3dvcmRzCgpgYGAKCkhlcmUgSSByZWFkIGluIHRoZSB0ZXh0IGFuZCB1bm5lc3RlZCB0aGUgd29yZHMuCgoKMi4gR2VuZXJhdGUgYSB0YWJsZSB0aGF0IGluY2x1ZGVzIGJvdGggbGV4aWNhbCBkaXZlcnNpdHkgYW5kIGRlbnNpdHksIGFuZCB0aGUgdG90YWwgbnVtYmVyIG9mIHdvcmRzLCBvZiBlYWNoIGRvY3VtZW50LiAgCgpgYGB7cn0KbWFuaWZlc3Rvc193b3JkcyAlPiUgCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgc3VtbWFyaXNlKG51bV93b3JkcyA9IG4oKSwKICAgICAgICAgICAgbGV4X2RpdmVyc2l0eSA9IG5fZGlzdGluY3Qod29yZCksIAogICAgICAgICAgICBsZXhfZGVuc2l0eSA9IG5fZGlzdGluY3Qod29yZCkvbigpKQpgYGAKVGhpcyBpcyBhIHRhYmxlIHNob3dpbmcgbGV4aWNhbCBkaXZlcnNpdHkgYW5kIGRlbnNpdHksIGFuZCB0aGUgdG90YWwgbnVtYmVyIG9mIHdvcmRzLCBvZiBlYWNoIGRvY3VtZW50LgoKMy4gR2VuZXJhdGUgYSB0YWJsZSB3aXRoIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIG9mIGVhY2ggZG9jdW1lbnQuICAKCmBgYHtyfQptYW5pZmVzdG9zX3dvcmRzICU+JQogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSAlPiUgCiAgc3VtbWFyaXplKG1lYW5fd29yZF9sZW5ndGggPSBtZWFuKHdvcmRfbGVuZ3RoKSkgJT4lIAogIGFycmFuZ2UoLW1lYW5fd29yZF9sZW5ndGgpCmBgYApUaGlzIGlzIGEgdGFibGUgc2hvd2luZyB0aGUgbWVhbiB3b3JkIGxlbmd0aCBvZiBlYWNoIGRvY3VtZW50LiAKCgo0LiBHZW5lcmF0ZSBhIGdyYXBoIHdpdGggbWluaSBoaXN0b2dyYW1zIG9mIGVhY2ggZG9jdW1lbnQncyB3b3JkIGxlbmd0aHMuICAKCmBgYHtyfQptYW5pZmVzdG9zX3dvcmRzICU+JQogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkX2xlbmd0aCkpICsKICBnZW9tX2hpc3RvZ3JhbShiaW53aWR0aCA9IDEpICsKICBmYWNldF93cmFwKHZhcnMoYXV0aG9yKSwgc2NhbGVzID0gImZyZWVfeSIpICsKICBsYWJzKHRpdGxlID0gIldvcmQgTGVuZ3RocyBieSBEb2N1bWVudCIpCmBgYApUaGVzZSBhcmUgaGlzdG9ncmFtcyBzaG93aW5nIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIGluIGVhY2ggZG9jdW1lbnQuCgo1LiBSZW1vdmUgc3RvcCB3b3JkcyBhbmQgdGhlbiBjcmVhdGUgYSBncmFwaCB3aXRoIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyBpbiBlYWNoIGRvY3VtZW50LiAgCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICBjb3VudCh3b3JkLCBzb3J0ID0gVCkgJT4lCiAgdG9wX24oNSkgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgbXV0YXRlKHdvcmQgPSByZW9yZGVyKHdvcmQsIG4pKSAlPiUKICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAiTW9zdCBjb21tb24gd29yZHMiKSArCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgY29vcmRfZmxpcCgpCgpgYGAKVGhlc2UgYXJlIGdyYXBocyBzaG93aW5nIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyBpbiBlYWNoIGRvY3VtZW50LCBub3QgaW5jbHVkaW5nIHN0b3Agd29yZHMuCgo2LiBDYWxjdWxhdGUgdGYtaWRmcyBhbmQgY3JlYXRlIGEgZ3JhcGggb2YgdGhlIHdvcmRzIHdpdGggdGhlIGhpZ2hlc3QgdGYtaWRmcyBpbiBlYWNoIGRvY3VtZW50LiAKCgpgYGB7cn0KbWFuaWZlc3Rvc193b3JkX2NvdW50cyA8LSBzdWljaWRlX25vdGVzICU+JSAgICAgICAgICAgICAjIFRoaXMgY291bnRzIGVhY2ggd29yZCBwZXIgYXV0aG9yCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KSAlPiUKICBjb3VudChhdXRob3IsIHdvcmQsIHNvcnQgPSBUUlVFKSAKCnRvdGFsX3dvcmRzIDwtIG1hbmlmZXN0b3Nfd29yZF9jb3VudHMgJT4lICAgICAgICAgICAgICAgIyBUaGlzIGNvdW50cyB0b3RhbCB3b3JkcyBwZXIgYXV0aG9yCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgc3VtbWFyaXplKHRvdGFsID0gc3VtKG4pKQoKbWFuaWZlc3Rvc193b3JkX2NvdW50cyA8LSBsZWZ0X2pvaW4obWFuaWZlc3Rvc193b3JkX2NvdW50cywgdG90YWxfd29yZHMpICAgICMgSm9pbnMgdGhlIHR3bwoKbWFuaWZlc3Rvc190Zl9pZGYgPC0gbWFuaWZlc3Rvc193b3JkX2NvdW50cyAlPiUgICAgICAgICAgICAgIyBDYWxjdWxhdGVzIHRmLWlkZgogIGJpbmRfdGZfaWRmKHdvcmQsIGF1dGhvciwgbikKCm1hbmlmZXN0b3NfdGZfaWRmICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBEaXNwbGF5cyBpdAogIGFycmFuZ2UoLXRmX2lkZikgICAgICAgICAgICAgICAgICAgICAgICAgIAoKYGBgCgpUaGlzIHNob3dzIHRoZSBjYWxjdWxhdGlvbiBvZiB0aGUgdGYtaWRmcy4KCmBgYHtyfQptYW5pZmVzdG9zX3RmX2lkZiAlPiUKICBhcnJhbmdlKC10Zl9pZGYpICU+JQogIG11dGF0ZSh3b3JkID0gZmFjdG9yKHdvcmQsIGxldmVscyA9IHJldih1bmlxdWUod29yZCkpKSkgJT4lIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHRvcF9uKDUpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmQsIHRmX2lkZiwgZmlsbCA9IGF1dGhvcikpICsKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9ICJ0Zi1pZGYiKSArCiAgZmFjZXRfd3JhcCh+YXV0aG9yLCBzY2FsZXMgPSAiZnJlZSIpICsKICBjb29yZF9mbGlwKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArCiAgbGFicyh0aXRsZSA9ICJNb3N0IGRpc3RpbmN0aXZlIHdvcmRzIGluIGVhY2ggbWFuaWZlc3RvIikKCmBgYAoKQW5kIGhlcmUgaXMgdGhlIGdyYXBoIHNob3dpbmcgdGhlIHRmLWlkZnMuCgoKCgoKCgoKCgoKCgo=