library(tidyverse)
library(DT)
library(tidytext)
library(readxl)
Killer_notes <- read_excel("manifestos.xlsx")
Killer_notes
This loads the spreadsheet of manifestos into a data frame, one row per document.
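For readers without the spreadsheet, the code assumes the file has one row per manifesto with author and text columns; a quick way to confirm that structure is glimpse() from the tidyverse (the toy rows below are hypothetical):

glimpse(Killer_notes)
# A stand-in with the same assumed shape:
# Killer_notes <- tibble(
#   author = c("Author A", "Author B"),
#   text   = c("first manifesto text ...", "second manifesto text ...")
# )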
Killer_words <- Killer_notes %>%
unnest_tokens(word, text)
Killer_words
This tokenizes the text into one word per row, which makes it easier to analyze.
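A toy illustration of what unnest_tokens() does (the input sentence is hypothetical); by default it also lowercases the words and strips punctuation:

tibble(author = "demo", text = "The quick brown fox.") %>%
  unnest_tokens(word, text)
# Returns four rows: "the", "quick", "brown", "fox"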
Killer_words %>%
group_by(author) %>%
summarize(num_words = n(), lex_diversity = n_distinct(word))
This groups the words by author and computes each document's total word count and lexical diversity (the number of distinct words).
Killer_words %>%
group_by(author) %>%
summarise(num_words = n(),
lex_diversity = n_distinct(word),
lex_density = n_distinct(word)/n())
This table displays each author's total word count, lexical diversity, and lexical density (distinct words divided by total words).
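The lexical density arithmetic can be hand-checked on a toy vector, since it is just n_distinct() over length:

demo_words <- c("the", "cat", "sat", "on", "the", "mat")
n_distinct(demo_words)                       # 5 distinct words
n_distinct(demo_words) / length(demo_words)  # 5/6, about 0.83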
Killer_words %>%
group_by(author) %>%
mutate(word_length = nchar(word)) %>%
summarize(mean_word_length = mean(word_length)) %>%
arrange(-mean_word_length)
This table shows the mean word length in each document, sorted from longest average to shortest.
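The averages rest on nchar(), which counts characters per token; a one-line sanity check:

mean(nchar(c("dog", "house", "a")))  # (3 + 5 + 1) / 3 = 3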
Killer_words %>%
mutate(word_length = nchar(word)) %>%
ggplot(aes(word_length)) +
geom_histogram(binwidth = 1) +
facet_wrap(vars(author), scales = "free_y") +
labs(title = "word length distributions of Killer notes, by author")
These faceted histograms show the distribution of word lengths in each author's document.
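One optional refinement, sketched here rather than taken from the original analysis: overlay each author's mean word length as a dashed line by joining the histogram with the per-author summary computed above.

word_length_means <- Killer_words %>%
  mutate(word_length = nchar(word)) %>%
  group_by(author) %>%
  summarize(mean_word_length = mean(word_length))

Killer_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(aes(word_length)) +
  geom_histogram(binwidth = 1) +
  geom_vline(data = word_length_means,
             aes(xintercept = mean_word_length), linetype = "dashed") +
  facet_wrap(vars(author), scales = "free_y") +
  labs(title = "Word length distributions of Killer notes, by author")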
stop_words <- get_stopwords()
stop_words$word
[1] "i" "me" "my" "myself" "we" "our"
[7] "ours" "ourselves" "you" "your" "yours" "yourself"
[13] "yourselves" "he" "him" "his" "himself" "she"
[19] "her" "hers" "herself" "it" "its" "itself"
[25] "they" "them" "their" "theirs" "themselves" "what"
[31] "which" "who" "whom" "this" "that" "these"
[37] "those" "am" "is" "are" "was" "were"
[43] "be" "been" "being" "have" "has" "had"
[49] "having" "do" "does" "did" "doing" "would"
[55] "should" "could" "ought" "i'm" "you're" "he's"
[61] "she's" "it's" "we're" "they're" "i've" "you've"
[67] "we've" "they've" "i'd" "you'd" "he'd" "she'd"
[73] "we'd" "they'd" "i'll" "you'll" "he'll" "she'll"
[79] "we'll" "they'll" "isn't" "aren't" "wasn't" "weren't"
[85] "hasn't" "haven't" "hadn't" "doesn't" "don't" "didn't"
[91] "won't" "wouldn't" "shan't" "shouldn't" "can't" "cannot"
[97] "couldn't" "mustn't" "let's" "that's" "who's" "what's"
[103] "here's" "there's" "when's" "where's" "why's" "how's"
[109] "a" "an" "the" "and" "but" "if"
[115] "or" "because" "as" "until" "while" "of"
[121] "at" "by" "for" "with" "about" "against"
[127] "between" "into" "through" "during" "before" "after"
[133] "above" "below" "to" "from" "up" "down"
[139] "in" "out" "on" "off" "over" "under"
[145] "again" "further" "then" "once" "here" "there"
[151] "when" "where" "why" "how" "all" "any"
[157] "both" "each" "few" "more" "most" "other"
[163] "some" "such" "no" "nor" "not" "only"
[169] "own" "same" "so" "than" "too" "very"
[175] "will"
This retrieves the “stop words”: extremely common function words (pronouns, articles, prepositions, auxiliaries) that carry little meaning for analysis.
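If the manifestos contain recurring filler that this default list misses, the list can be extended before the anti-join; the extra words below are hypothetical:

custom_stops <- stop_words %>%
  bind_rows(tibble(word = c("also", "really"), lexicon = "custom"))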
Killer_words %>%
anti_join(stop_words)
Joining with `by = join_by(word)`
This removes those stop words.
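anti_join() keeps only the rows of the left table that have no match in the right table; a toy demonstration:

x <- tibble(word = c("i", "hate", "the", "system"))
y <- tibble(word = c("i", "the"))
anti_join(x, y, by = "word")  # keeps "hate" and "system"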
Killer_words %>%
anti_join(stop_words) %>%
group_by(author) %>%
count(word, sort = T)
Joining with `by = join_by(word)`
This counts how often each remaining word appears for each author; the n column holds the counts, sorted from most to least frequent.
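count(word, sort = TRUE) on grouped data is shorthand for grouping, tallying, and sorting by hand; roughly:

Killer_words %>%
  anti_join(stop_words) %>%
  group_by(author, word) %>%
  summarize(n = n(), .groups = "drop") %>%
  arrange(desc(n))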
Killer_words %>%
anti_join(stop_words) %>%
group_by(author) %>%
count(word, sort = T) %>%
top_n(5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Most common words") +
facet_wrap(vars(author), scales = "free") +
scale_fill_viridis_d() +
theme_minimal() +
coord_flip()
Joining with `by = join_by(word)`
Selecting by n
These faceted bar charts show the five most common non-stop words in each document.
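top_n() still works but is superseded in current dplyr; the same top-five selection with slice_max(), which likewise keeps ties, would look like:

Killer_words %>%
  anti_join(stop_words) %>%
  count(author, word, sort = TRUE) %>%
  group_by(author) %>%
  slice_max(n, n = 5) %>%
  ungroup()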
Killer_word_counts <- Killer_notes %>%
unnest_tokens(word, text) %>%
count(author, word, sort = TRUE)
total_words <- Killer_word_counts %>%
group_by(author) %>%
summarize(total = sum(n))
Killer_word_counts <- left_join(Killer_word_counts, total_words)
Joining with `by = join_by(author)`
Killer_tf_idf <- Killer_word_counts %>%
bind_tf_idf(word, author, n)
Killer_tf_idf %>%
arrange(-tf_idf)
Term frequency-inverse document frequency (tf-idf) measures how important a word is to one document relative to the rest of the collection. The table is sorted so the highest-scoring words appear first.
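bind_tf_idf() computes tf as a word's share of its document and idf as the natural log of the number of documents divided by the number of documents containing the word; the scores above can be re-derived by hand from the columns already built:

n_docs <- n_distinct(Killer_word_counts$author)

Killer_word_counts %>%
  group_by(word) %>%
  mutate(docs_with_word = n_distinct(author)) %>%
  ungroup() %>%
  mutate(tf = n / total,
         idf = log(n_docs / docs_with_word),
         tf_idf = tf * idf) %>%
  arrange(-tf_idf)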
Killer_tf_idf %>%
arrange(-tf_idf) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(author) %>%
top_n(5) %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, scales = "free") +
coord_flip() +
theme_minimal() +
scale_fill_viridis_d() +
labs(title = "Most distinctive words in each Killer note")
Selecting by tf_idf
These faceted bar charts show each document's most distinctive words, i.e., those with the highest tf-idf scores.
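One caveat with this plot: factor levels are shared across facets, so the within-facet bar ordering can break. tidytext's reorder_within() and scale_x_reordered() pair fixes the ordering per facet; a sketch:

Killer_tf_idf %>%
  group_by(author) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(x = NULL, y = "tf-idf",
       title = "Most distinctive words in each Killer note")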