library(tidyverse)
library(DT)
library(tidytext) # package for text analysis
library(readxl) # reads excel files, the format I used for the data
- The first step is to read in the manifesto data and unnest the text into individual words.
# Load the manifesto spreadsheet and echo it so the raw rows can be inspected.
manifesto_notes <- read_excel(path = "manifestos.xlsx")
manifesto_notes

# Split the `text` column into word tokens stored in a new `word` column,
# then echo the tokenised table.
manifesto_words <- unnest_tokens(manifesto_notes, output = word, input = text)
manifesto_words
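- The second step was to create a table showing, for each author, the total number of words, the lexical diversity (number of distinct words), and the lexical density (distinct words divided by total words).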
# Per-author vocabulary summary:
#   num_words     - number of word tokens
#   lex_diversity - number of distinct words
#   lex_density   - distinct words divided by total tokens
manifesto_words %>%
  group_by(author) %>%
  summarise(
    num_words = n(),
    lex_diversity = n_distinct(word),
    # Reuses the two columns computed just above instead of recomputing them.
    lex_density = lex_diversity / num_words
  )
- The next step was to create a table showing the mean word length for each author.
# Average number of characters per word for each author, longest first.
manifesto_words %>%
  group_by(author) %>%
  summarise(mean_word_length = mean(nchar(word))) %>%
  arrange(desc(mean_word_length))
- Next, I created a separate graph for each manifesto in order to show its word length distribution.
# Histogram of word lengths (one bar per character count), one panel per
# author. The y axis is freed per panel so each panel scales to its own counts.
ggplot(
  data = mutate(manifesto_words, word_length = nchar(word)),
  mapping = aes(x = word_length)
) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~author, scales = "free_y") +
  ggtitle("Word Length Distributions of Manifesto, by Author")

- Next, I loaded a list of stop words in order to remove less meaningful words. Then I created a graph for each author to show their most commonly used words.
# Fetch the default stop-word lexicon and print the words it contains.
# NOTE(review): this name shadows the `stop_words` dataset that tidytext
# exports; it is kept because later code refers to it by this name.
stop_words <- get_stopwords()
stop_words[["word"]]
[1] "i" "me" "my" "myself" "we" "our" "ours" "ourselves"
[9] "you" "your" "yours" "yourself" "yourselves" "he" "him" "his"
[17] "himself" "she" "her" "hers" "herself" "it" "its" "itself"
[25] "they" "them" "their" "theirs" "themselves" "what" "which" "who"
[33] "whom" "this" "that" "these" "those" "am" "is" "are"
[41] "was" "were" "be" "been" "being" "have" "has" "had"
[49] "having" "do" "does" "did" "doing" "would" "should" "could"
[57] "ought" "i'm" "you're" "he's" "she's" "it's" "we're" "they're"
[65] "i've" "you've" "we've" "they've" "i'd" "you'd" "he'd" "she'd"
[73] "we'd" "they'd" "i'll" "you'll" "he'll" "she'll" "we'll" "they'll"
[81] "isn't" "aren't" "wasn't" "weren't" "hasn't" "haven't" "hadn't" "doesn't"
[89] "don't" "didn't" "won't" "wouldn't" "shan't" "shouldn't" "can't" "cannot"
[97] "couldn't" "mustn't" "let's" "that's" "who's" "what's" "here's" "there's"
[105] "when's" "where's" "why's" "how's" "a" "an" "the" "and"
[113] "but" "if" "or" "because" "as" "until" "while" "of"
[121] "at" "by" "for" "with" "about" "against" "between" "into"
[129] "through" "during" "before" "after" "above" "below" "to" "from"
[137] "up" "down" "in" "out" "on" "off" "over" "under"
[145] "again" "further" "then" "once" "here" "there" "when" "where"
[153] "why" "how" "all" "any" "both" "each" "few" "more"
[161] "most" "other" "some" "such" "no" "nor" "not" "only"
[169] "own" "same" "so" "than" "too" "very" "will"
# Bar chart of each author's five most frequent non-stop-words.
manifesto_words %>%
  # Explicit key: avoids the "Joining, by" message and accidental matches on
  # any other column the two tables might share.
  anti_join(stop_words, by = "word") %>%
  count(author, word, sort = TRUE) %>%
  group_by(author) %>%
  # Rank explicitly on the count column; ties at the cut-off are kept.
  top_n(5, n) %>%
  ungroup() %>%
  # Order bars within each facet. A plain reorder() ranks words globally, so
  # a word shared by several authors would be misplaced in some panels.
  mutate(word = reorder_within(word, n, author)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  # Strips the per-facet suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining, by = "word"
Selecting by n

- Lastly, I calculated tf-idf scores and created a graph of the words with the highest tf-idf in each document.
# Number of times each word occurs for each author, most frequent first.
manifesto_word_counts <- manifesto_notes %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

# Total number of word tokens per author.
total_words <- manifesto_word_counts %>%
  group_by(author) %>%
  summarise(total = sum(n))

# Attach each author's total to every one of their word rows. The join key is
# stated explicitly so it cannot silently change if the tables gain columns.
manifesto_word_counts <- left_join(
  manifesto_word_counts,
  total_words,
  by = "author"
)
Joining, by = "author"
# Compute tf-idf for every (author, word) pair, treating each author as one
# document. This step was missing: `manifesto_tf_idf` was used below without
# ever being created.
manifesto_tf_idf <- manifesto_word_counts %>%
  bind_tf_idf(word, author, n)

# Bar chart of the five highest tf-idf words for each author.
manifesto_tf_idf %>%
  group_by(author) %>%
  # Rank explicitly on tf_idf; ties at the cut-off are kept.
  top_n(5, tf_idf) %>%
  ungroup() %>%
  # Order bars within each facet rather than by a single global ranking.
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "tf-idf",
    title = "Most Distinctive Words in Each Manifesto"
  ) +
  facet_wrap(~author, scales = "free") +
  # Strips the per-facet suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d()
Selecting by tf_idf

LS0tCnRpdGxlOiAiTWFuaWZlc3RvIFRleHQgQW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShEVCkKbGlicmFyeSh0aWR5dGV4dCkgICAgICAgICMgcGFja2FnZSBmb3IgdGV4dCBhbmFseXNpcwpsaWJyYXJ5KHJlYWR4bCkgICAgICAgICAgIyByZWFkcyBleGNlbCBmaWxlcywgdGhlIGZvcm1hdCBJIHVzZWQgZm9yIHRoZSBkYXRhCgpgYGAKCgoxLiBUaGUgZmlyc3Qgc3RlcCBpbiB0byByZWFkIGluIHRoZSBtYW5pZmVzdG8gZGF0YSBhbmQgdW5uZXN0IHRoZSB3b3Jkcy4KYGBge3J9Cm1hbmlmZXN0b19ub3RlcyA8LSByZWFkX2V4Y2VsKCJtYW5pZmVzdG9zLnhsc3giKQoKbWFuaWZlc3RvX25vdGVzCmBgYAoKYGBge3J9Cm1hbmlmZXN0b193b3JkcyA8LSBtYW5pZmVzdG9fbm90ZXMgJT4lCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQoKbWFuaWZlc3RvX3dvcmRzCmBgYAoKCgoKMi4gVGhlIHNlY29uZCBzdGVwIHdhcyB0byBjcmVhdGUgYSB0YWJsZSBzaG93aW5nIGRpdmVyc2l0eSwgZGVuc2l0eSwgYW5kIHRvdGFsIG51bWJlciBvZiB3b3Jkcy4KCmBgYHtyfQptYW5pZmVzdG9fd29yZHMgJT4lIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcmlzZShudW1fd29yZHMgPSBuKCksCiAgICAgICAgICAgIGxleF9kaXZlcnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpLCAKICAgICAgICAgICAgbGV4X2RlbnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpL24oKSkKYGBgCgozLiBUaGUgbmV4dCBzdGVwIHdhcyB0byBjcmVhdGUgYSB0YWJsZSBzaG93aW5nIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIGJ5IGVhY2ggYXV0aG9yLgoKYGBge3J9Cm1hbmlmZXN0b193b3JkcyAlPiUKICBncm91cF9ieShhdXRob3IpICU+JSAKICBtdXRhdGUod29yZF9sZW5ndGggPSBuY2hhcih3b3JkKSkgJT4lIAogIHN1bW1hcml6ZShtZWFuX3dvcmRfbGVuZ3RoID0gbWVhbih3b3JkX2xlbmd0aCkpICU+JSAKICBhcnJhbmdlKC1tZWFuX3dvcmRfbGVuZ3RoKQpgYGAKCjQuIE5leHQsIEkgY3JlYXRlZCBpbmRpdmlkdWFsIGdyYXBocyBpbiBvcmRlciB0byBzaG93IHRoZSB3b3JkIGxlbmd0aCBkaXN0cmlidXRpb24gYmV0d2VlbiBlYWNoIG1hbmlmZXN0by4KCmBgYHtyfQptYW5pZmVzdG9fd29yZHMgJT4lCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZV95IikgKwpsYWJzKHRpdGxlID0gIldvcmQgTGVuZ3RoIERpc3RyaWJ1dGlvbnMgb2YgTWFuaWZlc3RvLCBieSBBdXRob3IiKQpgYGAKCgoKCjUuIE5leHQsIEkgcmFuIGEgc3RvcCB3b3JkIGNvbW1hbmQgaW4gb3JkZXIgdG8gZ2V0IHJpZCBvZiBsZXNzIG1lYW5pbmdmdWwgd29yZHMuIFRoZW4gSSBjcmVhdGVkIGEgZ3JhcGggZm9y
IGVhY2ggYXV0aG9yIHRvIHNob3cgdGhlaXIgbW9zdCBjb21tb25seSB1c2VkIHdvcmRzLgpgYGB7cn0Kc3RvcF93b3JkcyA8LSBnZXRfc3RvcHdvcmRzKCkKc3RvcF93b3JkcyR3b3JkCmBgYAoKCgpgYGB7cn0KbWFuaWZlc3RvX3dvcmRzICU+JQogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUgCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgY291bnQod29yZCwgc29ydCA9IFQpICU+JQogIHRvcF9uKDUpICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCBuLCBmaWxsID0gYXV0aG9yKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgY29tbW9uIHdvcmRzIikgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZSIpICsKICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsKICB0aGVtZV9taW5pbWFsKCkgKwogIGNvb3JkX2ZsaXAoKQpgYGAKCjYuIExhc3RseSwgSSBjYWxjdWxhdGUgdGYtaWRmcyBhbmQgY3JlYXRlZCBhIGdyYXBoIG9mIHRoZSB3b3JkcyB3aXRoIHRoZSBoaWdoZXN0IHRmLWlkZnMgaW4gZWFjaCBkb2N1bWVudC4gIAoKYGBge3J9Cm1hbmlmZXN0b193b3JkX2NvdW50cyA8LSBtYW5pZmVzdG9fbm90ZXMgJT4lICAgICAgICAgICAgICMgVGhpcyBjb3VudHMgZWFjaCB3b3JkIHBlciBhdXRob3IKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpICU+JQogIGNvdW50KGF1dGhvciwgd29yZCwgc29ydCA9IFRSVUUpIAoKdG90YWxfd29yZHMgPC0gbWFuaWZlc3RvX3dvcmRfY291bnRzICU+JSAgICAgICAgICAgICAgICMgVGhpcyBjb3VudHMgdG90YWwgd29yZHMgcGVyIGF1dGhvcgogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcml6ZSh0b3RhbCA9IHN1bShuKSkKCm1hbmlmZXN0b193b3JkX2NvdW50cyA8LSBsZWZ0X2pvaW4obWFuaWZlc3RvX3dvcmRfY291bnRzLCB0b3RhbF93b3JkcykgICAgIyBKb2lucyB0aGUgdHdvCmBgYAoKYGBge3J9Cm1hbmlmZXN0b190Zl9pZGYgJT4lCiAgYXJyYW5nZSgtdGZfaWRmKSAlPiUKICBtdXRhdGUod29yZCA9IGZhY3Rvcih3b3JkLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQpKSkpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICB0b3Bfbig1KSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYsIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKwogIGZhY2V0X3dyYXAofmF1dGhvciwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIGxhYnModGl0bGUgPSAiTW9zdCBEaXN0aW5jdGl2ZSBXb3JkcyBpbiBFYWNoIE1hbmlmZXN0byIpCmBgYAo=