library(tidyverse)
library(DT)
library(tidytext)        # package for text analysis
library(readxl)          # reads excel files, the format I used for the data
  1. The first step is to read in the manifesto data and unnest the words.
# Load the manifesto spreadsheet. The `author` and `text` columns are the
# ones the rest of the analysis relies on.
manifesto_notes <- read_excel(path = "manifestos.xlsx")

manifesto_notes

# Split the `text` column into tokens, one word per row, stored in `word`.
manifesto_words <- unnest_tokens(manifesto_notes, output = word, input = text)

manifesto_words
  2. The second step was to create a table showing the lexical diversity, lexical density, and total number of words for each author.
# Per-author vocabulary summary:
#   num_words     - total tokens written by the author
#   lex_diversity - number of distinct words used
#   lex_density   - distinct words divided by total tokens
manifesto_words %>%
  group_by(author) %>%
  summarise(
    num_words = n(),
    lex_diversity = n_distinct(word),
    # Reuse the two columns computed just above instead of recomputing them.
    lex_density = lex_diversity / num_words
  )
  3. The next step was to create a table showing the mean word length for each author.
# Average word length (in characters) per author, longest average first.
manifesto_words %>%
  group_by(author) %>%
  summarize(mean_word_length = mean(nchar(word))) %>%
  arrange(desc(mean_word_length))
  4. Next, I created a separate graph for each manifesto to show its word-length distribution.
# Histogram of word lengths (one bin per character count), with one panel
# per author. The y axis is freed so each panel uses its own count range.
ggplot(
  data = mutate(manifesto_words, word_length = nchar(word)),
  mapping = aes(x = word_length)
) +
  geom_histogram(binwidth = 1) +
  labs(title = "Word Length Distributions of Manifesto, by Author") +
  facet_wrap(~author, scales = "free_y")

  5. Next, I removed stop words in order to get rid of less meaningful words. Then I created a graph for each author to show their most commonly used words.
# Fetch the stop-word list, spelling out get_stopwords()' default arguments.
# NOTE: this assignment masks the `stop_words` dataset exported by tidytext;
# the name is kept because later steps refer to it.
stop_words <- get_stopwords(language = "en", source = "snowball")

# Show the stop words themselves.
stop_words[["word"]]
  [1] "i"          "me"         "my"         "myself"     "we"         "our"        "ours"       "ourselves" 
  [9] "you"        "your"       "yours"      "yourself"   "yourselves" "he"         "him"        "his"       
 [17] "himself"    "she"        "her"        "hers"       "herself"    "it"         "its"        "itself"    
 [25] "they"       "them"       "their"      "theirs"     "themselves" "what"       "which"      "who"       
 [33] "whom"       "this"       "that"       "these"      "those"      "am"         "is"         "are"       
 [41] "was"        "were"       "be"         "been"       "being"      "have"       "has"        "had"       
 [49] "having"     "do"         "does"       "did"        "doing"      "would"      "should"     "could"     
 [57] "ought"      "i'm"        "you're"     "he's"       "she's"      "it's"       "we're"      "they're"   
 [65] "i've"       "you've"     "we've"      "they've"    "i'd"        "you'd"      "he'd"       "she'd"     
 [73] "we'd"       "they'd"     "i'll"       "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
 [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"     "haven't"    "hadn't"     "doesn't"   
 [89] "don't"      "didn't"     "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"      "cannot"    
 [97] "couldn't"   "mustn't"    "let's"      "that's"     "who's"      "what's"     "here's"     "there's"   
[105] "when's"     "where's"    "why's"      "how's"      "a"          "an"         "the"        "and"       
[113] "but"        "if"         "or"         "because"    "as"         "until"      "while"      "of"        
[121] "at"         "by"         "for"        "with"       "about"      "against"    "between"    "into"      
[129] "through"    "during"     "before"     "after"      "above"      "below"      "to"         "from"      
[137] "up"         "down"       "in"         "out"        "on"         "off"        "over"       "under"     
[145] "again"      "further"    "then"       "once"       "here"       "there"      "when"       "where"     
[153] "why"        "how"        "all"        "any"        "both"       "each"       "few"        "more"      
[161] "most"       "other"      "some"       "such"       "no"         "nor"        "not"        "only"      
[169] "own"        "same"       "so"         "than"       "too"        "very"       "will"      
# Plot each author's five most frequent non-stop-words, one facet per author.
manifesto_words %>%
  # Name the join key explicitly so the join cannot silently pick up any
  # other column the two tables might happen to share.
  anti_join(stop_words, by = "word") %>%
  count(author, word, sort = TRUE) %>%
  group_by(author) %>%
  # slice_max() replaces the superseded top_n() and states the ranking
  # column outright. Ties are kept, so a facet can show more than five bars.
  slice_max(order_by = n, n = 5) %>%
  ungroup() %>%
  # reorder_within() sorts the bars inside each facet. A plain reorder()
  # ranks every word once across all authors, which mis-sorts any word that
  # appears in more than one facet.
  mutate(word = reorder_within(word, n, author)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  # Strips the per-facet suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining, by = "word"
Selecting by n

  6. Lastly, I calculated tf-idf values and created a graph of the words with the highest tf-idf in each document.
# Count how often each word occurs in each author's manifesto.
manifesto_word_counts <- manifesto_notes %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

# Total number of words per author.
total_words <- manifesto_word_counts %>%
  group_by(author) %>%
  summarize(total = sum(n))

# Attach each author's total to the per-word counts. The key is named
# explicitly so the join does not depend on which columns the two tables
# happen to share, and no "Joining, by = ..." message is emitted.
manifesto_word_counts <- left_join(
  manifesto_word_counts,
  total_words,
  by = "author"
)
Joining, by = "author"
# Compute tf-idf for every (author, word) pair, treating each author's
# manifesto as one document. This object was used below without ever being
# created, so the plot failed in a fresh session.
manifesto_tf_idf <- manifesto_word_counts %>%
  bind_tf_idf(word, author, n)

# Plot the five highest-tf-idf words for each author, one facet per author.
manifesto_tf_idf %>%
  group_by(author) %>%
  # Rank on tf_idf explicitly; top_n(5) silently used whichever column was
  # last. Ties are kept, so a facet can show more than five bars.
  slice_max(order_by = tf_idf, n = 5) %>%
  ungroup() %>%
  # Order the bars within each facet rather than globally across authors.
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Most Distinctive Words in Each Manifesto",
    x = NULL,
    y = "tf-idf"
  ) +
  facet_wrap(~author, scales = "free") +
  # Strips the per-facet suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d()
Selecting by tf_idf

LS0tCnRpdGxlOiAiTWFuaWZlc3RvIFRleHQgQW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShEVCkKbGlicmFyeSh0aWR5dGV4dCkgICAgICAgICMgcGFja2FnZSBmb3IgdGV4dCBhbmFseXNpcwpsaWJyYXJ5KHJlYWR4bCkgICAgICAgICAgIyByZWFkcyBleGNlbCBmaWxlcywgdGhlIGZvcm1hdCBJIHVzZWQgZm9yIHRoZSBkYXRhCgpgYGAKCgoxLiBUaGUgZmlyc3Qgc3RlcCBpbiB0byByZWFkIGluIHRoZSBtYW5pZmVzdG8gZGF0YSBhbmQgdW5uZXN0IHRoZSB3b3Jkcy4KYGBge3J9Cm1hbmlmZXN0b19ub3RlcyA8LSByZWFkX2V4Y2VsKCJtYW5pZmVzdG9zLnhsc3giKQoKbWFuaWZlc3RvX25vdGVzCmBgYAoKYGBge3J9Cm1hbmlmZXN0b193b3JkcyA8LSBtYW5pZmVzdG9fbm90ZXMgJT4lCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQoKbWFuaWZlc3RvX3dvcmRzCmBgYAoKCgoKMi4gVGhlIHNlY29uZCBzdGVwIHdhcyB0byBjcmVhdGUgYSB0YWJsZSBzaG93aW5nIGRpdmVyc2l0eSwgZGVuc2l0eSwgYW5kIHRvdGFsIG51bWJlciBvZiB3b3Jkcy4KCmBgYHtyfQptYW5pZmVzdG9fd29yZHMgJT4lIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcmlzZShudW1fd29yZHMgPSBuKCksCiAgICAgICAgICAgIGxleF9kaXZlcnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpLCAKICAgICAgICAgICAgbGV4X2RlbnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpL24oKSkKYGBgCgozLiBUaGUgbmV4dCBzdGVwIHdhcyB0byBjcmVhdGUgYSB0YWJsZSBzaG93aW5nIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIGJ5IGVhY2ggYXV0aG9yLgoKYGBge3J9Cm1hbmlmZXN0b193b3JkcyAlPiUKICBncm91cF9ieShhdXRob3IpICU+JSAKICBtdXRhdGUod29yZF9sZW5ndGggPSBuY2hhcih3b3JkKSkgJT4lIAogIHN1bW1hcml6ZShtZWFuX3dvcmRfbGVuZ3RoID0gbWVhbih3b3JkX2xlbmd0aCkpICU+JSAKICBhcnJhbmdlKC1tZWFuX3dvcmRfbGVuZ3RoKQpgYGAKCjQuIE5leHQsIEkgY3JlYXRlZCBpbmRpdmlkdWFsIGdyYXBocyBpbiBvcmRlciB0byBzaG93IHRoZSB3b3JkIGxlbmd0aCBkaXN0cmlidXRpb24gYmV0d2VlbiBlYWNoIG1hbmlmZXN0by4KCmBgYHtyfQptYW5pZmVzdG9fd29yZHMgJT4lCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZV95IikgKwpsYWJzKHRpdGxlID0gIldvcmQgTGVuZ3RoIERpc3RyaWJ1dGlvbnMgb2YgTWFuaWZlc3RvLCBieSBBdXRob3IiKQpgYGAKCgoKCjUuIE5leHQsIEkgcmFuIGEgc3RvcCB3b3JkIGNvbW1hbmQgaW4gb3JkZXIgdG8gZ2V0IHJpZCBvZiBsZXNzIG1lYW5pbmdmdWwgd29yZHMuIFRoZW4gSSBjcmVhdGVkIGEgZ3JhcGggZm9y
IGVhY2ggYXV0aG9yIHRvIHNob3cgdGhlaXIgbW9zdCBjb21tb25seSB1c2VkIHdvcmRzLgpgYGB7cn0Kc3RvcF93b3JkcyA8LSBnZXRfc3RvcHdvcmRzKCkKc3RvcF93b3JkcyR3b3JkCmBgYAoKCgpgYGB7cn0KbWFuaWZlc3RvX3dvcmRzICU+JQogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUgCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgY291bnQod29yZCwgc29ydCA9IFQpICU+JQogIHRvcF9uKDUpICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCBuLCBmaWxsID0gYXV0aG9yKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgY29tbW9uIHdvcmRzIikgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZSIpICsKICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsKICB0aGVtZV9taW5pbWFsKCkgKwogIGNvb3JkX2ZsaXAoKQpgYGAKCjYuIExhc3RseSwgSSBjYWxjdWxhdGUgdGYtaWRmcyBhbmQgY3JlYXRlZCBhIGdyYXBoIG9mIHRoZSB3b3JkcyB3aXRoIHRoZSBoaWdoZXN0IHRmLWlkZnMgaW4gZWFjaCBkb2N1bWVudC4gIAoKYGBge3J9Cm1hbmlmZXN0b193b3JkX2NvdW50cyA8LSBtYW5pZmVzdG9fbm90ZXMgJT4lICAgICAgICAgICAgICMgVGhpcyBjb3VudHMgZWFjaCB3b3JkIHBlciBhdXRob3IKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpICU+JQogIGNvdW50KGF1dGhvciwgd29yZCwgc29ydCA9IFRSVUUpIAoKdG90YWxfd29yZHMgPC0gbWFuaWZlc3RvX3dvcmRfY291bnRzICU+JSAgICAgICAgICAgICAgICMgVGhpcyBjb3VudHMgdG90YWwgd29yZHMgcGVyIGF1dGhvcgogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcml6ZSh0b3RhbCA9IHN1bShuKSkKCm1hbmlmZXN0b193b3JkX2NvdW50cyA8LSBsZWZ0X2pvaW4obWFuaWZlc3RvX3dvcmRfY291bnRzLCB0b3RhbF93b3JkcykgICAgIyBKb2lucyB0aGUgdHdvCmBgYAoKYGBge3J9Cm1hbmlmZXN0b190Zl9pZGYgJT4lCiAgYXJyYW5nZSgtdGZfaWRmKSAlPiUKICBtdXRhdGUod29yZCA9IGZhY3Rvcih3b3JkLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQpKSkpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICB0b3Bfbig1KSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYsIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKwogIGZhY2V0X3dyYXAofmF1dGhvciwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIGxhYnModGl0bGUgPSAiTW9zdCBEaXN0aW5jdGl2ZSBXb3JkcyBpbiBFYWNoIE1hbmlmZXN0byIpCmBgYAo=