library(tidyverse)
library(DT)
library(tidytext) # package for text analysis
library(readxl) # reads excel files, the format I used for the data
- Read in the text and unnest the words.
manifestos <- read_excel("manifestos .xlsx")
manifestos
manifestos_words <- manifestos %>%
unnest_tokens(word, text)
manifestos_words
NA
Here I read in the text and unnested the words.
- Generate a table that includes both lexical diversity and density, and the total number of words, of each document.
manifestos_words %>%
group_by(author) %>%
summarise(num_words = n(),
lex_diversity = n_distinct(word),
lex_density = n_distinct(word)/n())
This is a table showing lexical diversity and density, and the total number of words, of each document.
- Generate a table with the mean word length of each document.
manifestos_words %>%
group_by(author) %>%
mutate(word_length = nchar(word)) %>%
summarize(mean_word_length = mean(word_length)) %>%
arrange(-mean_word_length)
This is a table showing the mean word length of each document.
- Generate a graph with mini histograms of each document’s word lengths.
manifestos_words %>%
mutate(word_length = nchar(word)) %>%
ggplot(aes(word_length)) +
geom_histogram(binwidth = 1) +
facet_wrap(vars(author), scales = "free_y") +
labs(title = "Word Lengths by Document")

These are histograms showing the mean word length in each document.
- Remove stop words and then create a graph with the most common words in each document.
manifestos_words %>%
anti_join(stop_words) %>%
group_by(author) %>%
count(word, sort = T) %>%
top_n(5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Most common words") +
facet_wrap(vars(author), scales = "free") +
scale_fill_viridis_d() +
theme_minimal() +
coord_flip()
Joining, by = "word"
Selecting by n

These are graphs showing the most common words in each document, not including stop words.
- Calculate tf-idfs and create a graph of the words with the highest tf-idfs in each document.
manifestos_word_counts <- suicide_notes %>% # This counts each word per author
unnest_tokens(word, text) %>%
count(author, word, sort = TRUE)
total_words <- manifestos_word_counts %>% # This counts total words per author
group_by(author) %>%
summarize(total = sum(n))
manifestos_word_counts <- left_join(manifestos_word_counts, total_words) # Joins the two
Joining, by = "author"
manifestos_tf_idf <- manifestos_word_counts %>% # Calculates tf-idf
bind_tf_idf(word, author, n)
manifestos_tf_idf %>% # Displays it
arrange(-tf_idf)
NA
This shows the calculation of the tf-idfs.
manifestos_tf_idf %>%
arrange(-tf_idf) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(author) %>%
top_n(5) %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, scales = "free") +
coord_flip() +
theme_minimal() +
scale_fill_viridis_d() +
labs(title = "Most distinctive words in each manifesto")
Selecting by tf_idf

And here is the graph showing the tf-idfs.
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KERUKQpsaWJyYXJ5KHRpZHl0ZXh0KSAgICAgICAgIyBwYWNrYWdlIGZvciB0ZXh0IGFuYWx5c2lzCmxpYnJhcnkocmVhZHhsKSAgICAgICAgICAjIHJlYWRzIGV4Y2VsIGZpbGVzLCB0aGUgZm9ybWF0IEkgdXNlZCBmb3IgdGhlIGRhdGEKCmBgYAoKMS4gUmVhZCBpbiB0aGUgdGV4dCBhbmQgdW5uZXN0IHRoZSB3b3Jkcy4gIAoKYGBge3J9Cm1hbmlmZXN0b3MgPC0gcmVhZF9leGNlbCgibWFuaWZlc3RvcyAueGxzeCIpCgptYW5pZmVzdG9zCmBgYAoKYGBge3J9Cm1hbmlmZXN0b3Nfd29yZHMgPC0gbWFuaWZlc3RvcyAlPiUKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpCgptYW5pZmVzdG9zX3dvcmRzCgpgYGAKCkhlcmUgSSByZWFkIGluIHRoZSB0ZXh0IGFuZCB1bm5lc3RlZCB0aGUgd29yZHMuCgoKMi4gR2VuZXJhdGUgYSB0YWJsZSB0aGF0IGluY2x1ZGVzIGJvdGggbGV4aWNhbCBkaXZlcnNpdHkgYW5kIGRlbnNpdHksIGFuZCB0aGUgdG90YWwgbnVtYmVyIG9mIHdvcmRzLCBvZiBlYWNoIGRvY3VtZW50LiAgCgpgYGB7cn0KbWFuaWZlc3Rvc193b3JkcyAlPiUgCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgc3VtbWFyaXNlKG51bV93b3JkcyA9IG4oKSwKICAgICAgICAgICAgbGV4X2RpdmVyc2l0eSA9IG5fZGlzdGluY3Qod29yZCksIAogICAgICAgICAgICBsZXhfZGVuc2l0eSA9IG5fZGlzdGluY3Qod29yZCkvbigpKQpgYGAKVGhpcyBpcyBhIHRhYmxlIHNob3dpbmcgbGV4aWNhbCBkaXZlcnNpdHkgYW5kIGRlbnNpdHksIGFuZCB0aGUgdG90YWwgbnVtYmVyIG9mIHdvcmRzLCBvZiBlYWNoIGRvY3VtZW50LgoKMy4gR2VuZXJhdGUgYSB0YWJsZSB3aXRoIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIG9mIGVhY2ggZG9jdW1lbnQuICAKCmBgYHtyfQptYW5pZmVzdG9zX3dvcmRzICU+JQogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSAlPiUgCiAgc3VtbWFyaXplKG1lYW5fd29yZF9sZW5ndGggPSBtZWFuKHdvcmRfbGVuZ3RoKSkgJT4lIAogIGFycmFuZ2UoLW1lYW5fd29yZF9sZW5ndGgpCmBgYApUaGlzIGlzIGEgdGFibGUgc2hvd2luZyB0aGUgbWVhbiB3b3JkIGxlbmd0aCBvZiBlYWNoIGRvY3VtZW50LiAKCgo0LiBHZW5lcmF0ZSBhIGdyYXBoIHdpdGggbWluaSBoaXN0b2dyYW1zIG9mIGVhY2ggZG9jdW1lbnQncyB3b3JkIGxlbmd0aHMuICAKCmBgYHtyfQptYW5pZmVzdG9zX3dvcmRzICU+JQogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkX2xlbmd0aCkpICsKICBnZW9tX2hpc3RvZ3JhbShiaW53aWR0aCA9IDEpICsKICBmYWNldF93cmFwKHZhcnMoYXV0aG9yKSwgc2NhbGVzID0gImZyZWVfeSIpICsKICBsYWJzKHRpdGxlID0gIldvcmQgTGVuZ3RocyBieSBEb2N1bWVudCIpCmBgYApUaGVzZSBhcmUgaGlzdG9ncmFtcyBzaG93aW5nIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIGluIGVhY2ggZG9jdW1lbnQuCgo1LiBSZW1vdmUgc3RvcCB3b3JkcyBhbmQgdGhlbiBjcmVhdGUgYSBncmFwaCB3aXRoIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyBpbiBlYWNoIGRvY3VtZW50LiAgCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICBjb3VudCh3b3JkLCBzb3J0ID0gVCkgJT4lCiAgdG9wX24oNSkgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgbXV0YXRlKHdvcmQgPSByZW9yZGVyKHdvcmQsIG4pKSAlPiUKICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAiTW9zdCBjb21tb24gd29yZHMiKSArCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgY29vcmRfZmxpcCgpCgpgYGAKVGhlc2UgYXJlIGdyYXBocyBzaG93aW5nIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyBpbiBlYWNoIGRvY3VtZW50LCBub3QgaW5jbHVkaW5nIHN0b3Agd29yZHMuCgo2LiBDYWxjdWxhdGUgdGYtaWRmcyBhbmQgY3JlYXRlIGEgZ3JhcGggb2YgdGhlIHdvcmRzIHdpdGggdGhlIGhpZ2hlc3QgdGYtaWRmcyBpbiBlYWNoIGRvY3VtZW50LiAKCgpgYGB7cn0KbWFuaWZlc3Rvc193b3JkX2NvdW50cyA8LSBzdWljaWRlX25vdGVzICU+JSAgICAgICAgICAgICAjIFRoaXMgY291bnRzIGVhY2ggd29yZCBwZXIgYXV0aG9yCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KSAlPiUKICBjb3VudChhdXRob3IsIHdvcmQsIHNvcnQgPSBUUlVFKSAKCnRvdGFsX3dvcmRzIDwtIG1hbmlmZXN0b3Nfd29yZF9jb3VudHMgJT4lICAgICAgICAgICAgICAgIyBUaGlzIGNvdW50cyB0b3RhbCB3b3JkcyBwZXIgYXV0aG9yCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgc3VtbWFyaXplKHRvdGFsID0gc3VtKG4pKQoKbWFuaWZlc3Rvc193b3JkX2NvdW50cyA8LSBsZWZ0X2pvaW4obWFuaWZlc3Rvc193b3JkX2NvdW50cywgdG90YWxfd29yZHMpICAgICMgSm9pbnMgdGhlIHR3bwoKbWFuaWZlc3Rvc190Zl9pZGYgPC0gbWFuaWZlc3Rvc193b3JkX2NvdW50cyAlPiUgICAgICAgICAgICAgIyBDYWxjdWxhdGVzIHRmLWlkZgogIGJpbmRfdGZfaWRmKHdvcmQsIGF1dGhvciwgbikKCm1hbmlmZXN0b3NfdGZfaWRmICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBEaXNwbGF5cyBpdAogIGFycmFuZ2UoLXRmX2lkZikgICAgICAgICAgICAgICAgICAgICAgICAgIAoKYGBgCgpUaGlzIHNob3dzIHRoZSBjYWxjdWxhdGlvbiBvZiB0aGUgdGYtaWRmcy4KCmBgYHtyfQptYW5pZmVzdG9zX3RmX2lkZiAlPiUKICBhcnJhbmdlKC10Zl9pZGYpICU+JQogIG11dGF0ZSh3b3JkID0gZmFjdG9yKHdvcmQsIGxldmVscyA9IHJldih1bmlxdWUod29yZCkpKSkgJT4lIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHRvcF9uKDUpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmQsIHRmX2lkZiwgZmlsbCA9IGF1dGhvcikpICsKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9ICJ0Zi1pZGYiKSArCiAgZmFjZXRfd3JhcCh+YXV0aG9yLCBzY2FsZXMgPSAiZnJlZSIpICsKICBjb29yZF9mbGlwKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArCiAgbGFicyh0aXRsZSA9ICJNb3N0IGRpc3RpbmN0aXZlIHdvcmRzIGluIGVhY2ggbWFuaWZlc3RvIikKCmBgYAoKQW5kIGhlcmUgaXMgdGhlIGdyYXBoIHNob3dpbmcgdGhlIHRmLWlkZnMuCgoKCgoKCgoKCgoKCgo=