# Attach the packages used throughout this notebook. No library() call appears
# anywhere else in it, so without these a fresh R session (or a knit) cannot
# find read_excel(), %>%, ggplot(), or unnest_tokens().
library(readxl)    # read_excel()
library(dplyr)     # %>%, group_by(), summarise(), count(), joins, top_n()
library(ggplot2)   # ggplot(), geom_*(), facet_wrap(), labs()
library(tidytext)  # unnest_tokens(), stop_words, bind_tf_idf()

# Read the spreadsheet from the current working directory.
manifestos <- read_excel("manifestos.xlsx")

# Print the imported table for a quick visual check.
manifestos
  1. Here is the text split into one word per row with `unnest_tokens()`.
# Split the `text` column into tokens; each token lands in its own row under
# a new `word` column.
manifestos_words <- unnest_tokens(manifestos, output = word, input = text)

# Show the tokenised table.
manifestos_words
NA

There are over 21,000 rows!

  2. Here is a table showing the total number of words, the lexical diversity, and the lexical density of each document.
# Per-author vocabulary statistics:
#   num_words     - number of token rows
#   lex_diversity - number of distinct words
#   lex_density   - distinct words divided by total words
manifestos_words %>%
  group_by(author) %>%
  summarise(
    num_words = n(),
    lex_diversity = n_distinct(word),
    # Reuses the two columns computed just above.
    lex_density = lex_diversity / num_words
  )

Breivik had the most words and the greatest lexical diversity, while Harper Mercer had the highest lexical density.

  3. Here is a table with the mean word length of each document.
# Average number of characters per word for each author, largest mean first.
manifestos_words %>%
  group_by(author) %>%
  summarise(mean_word_length = mean(nchar(word))) %>%
  arrange(desc(mean_word_length))

Breivik had the highest mean word length.

  4. Here is a graph with mini histograms of each document’s word lengths.
# Histogram of word lengths (in characters), one panel per author.
# scales = "free_y" gives every panel its own y-axis range.
manifestos_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(mapping = aes(x = word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~author, scales = "free_y") +
  ggtitle("Word Lengths by Author")

Most of the graphs have a similar shape, which I find very interesting.

  5. Here is a graph of the most common words in each document, with stop words removed.
# Five most frequent non-stop-words per author, one bar-chart panel per author.
manifestos_words %>%
  # Explicit join key: drop every row whose word is in the stop-word list.
  anti_join(stop_words, by = "word") %>%
  group_by(author) %>%
  count(word, sort = TRUE) %>%
  # Rank on the count column explicitly instead of relying on top_n()'s
  # "last column" default. Ties are kept, so a panel may show more than 5.
  top_n(5, wt = n) %>%
  ungroup() %>%
  # Order words within each author's panel. A single global reorder(word, n)
  # mis-sorts bars whenever the same word appears for several authors.
  mutate(word = reorder_within(word, n, author)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  # Removes the per-author suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining, by = "word"
Selecting by n

NA
NA

Based on these words, it seems that many of these authors had issues with the society they were living in, which probably drove their actions.

  6. Here are the calculated tf-idf values, followed by a graph of the words with the highest tf-idf in each document.
manifestos_word_counts <- manifestos %>%             # This counts each word per author
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

total_words <- manifestos_word_counts %>%               # This counts total words per author
  group_by(author) %>%
  summarize(total = sum(n))

# Joins the two on an explicit key. Without `by`, the join silently uses every
# shared column name, which would change the result if the tables ever gained
# another column in common.
manifestos_word_counts <- left_join(
  manifestos_word_counts,
  total_words,
  by = "author"
)
Joining, by = "author"
# Calculate tf-idf, treating each author as one document and `n` as the
# per-document word count.
manifestos_tf_idf <- bind_tf_idf(
  manifestos_word_counts,
  term = word,
  document = author,
  n = n
)

# Display the result, highest tf-idf first.
arrange(manifestos_tf_idf, desc(tf_idf))
NA
# Five highest tf-idf words per author, one bar-chart panel per author.
manifestos_tf_idf %>%
  group_by(author) %>%
  # Rank on tf_idf explicitly instead of relying on top_n()'s "last column"
  # default. Ties are kept, so a panel may show more than 5 words.
  top_n(5, wt = tf_idf) %>%
  ungroup() %>%
  # Order words within each author's panel so the bars are sorted per facet
  # even when the same word appears for more than one author.
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  # The documents are manifestos, so the title says so.
  labs(
    x = NULL,
    y = "tf-idf",
    title = "Most distinctive words in each manifesto"
  ) +
  facet_wrap(~author, scales = "free") +
  # Removes the per-author suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d()
Selecting by tf_idf

Harper Mercer had the least variety in tf-idf values.

LS0tDQp0aXRsZTogIlRleHQgQW5hbHlzaXMiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCg0KYGBge3J9DQptYW5pZmVzdG9zIDwtIHJlYWRfZXhjZWwoIm1hbmlmZXN0b3MueGxzeCIpDQoNCm1hbmlmZXN0b3MNCmBgYA0KMS4gSGVyZSBpcyB0aGUgdGV4dCB3aXRoIHVubmVzdCB3b3Jkcy4gIA0KYGBge3J9DQptYW5pZmVzdG9zX3dvcmRzIDwtIG1hbmlmZXN0b3MgJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkNCg0KbWFuaWZlc3Rvc193b3Jkcw0KDQpgYGANClRoZXJlIGFyZSBvdmVyIDIxLDAwMCByb3dzIQ0KDQoNCjIuIEhlcmUgaXMgYSB0YWJsZSB0aGF0IGluY2x1ZGVzIGJvdGggbGV4aWNhbCBkaXZlcnNpdHkgYW5kIGRlbnNpdHksIGFuZCB0aGUgdG90YWwgbnVtYmVyIG9mIHdvcmRzLCBvZiBlYWNoIGRvY3VtZW50LiANCmBgYHtyfQ0KbWFuaWZlc3Rvc193b3JkcyAlPiUgDQogIGdyb3VwX2J5KGF1dGhvcikgJT4lIA0KICBzdW1tYXJpc2UobnVtX3dvcmRzID0gbigpLA0KICAgICAgICAgICAgbGV4X2RpdmVyc2l0eSA9IG5fZGlzdGluY3Qod29yZCksIA0KICAgICAgICAgICAgbGV4X2RlbnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpL24oKSkNCmBgYA0KQnJlaXZpayBoYWQgdGhlIG1vc3Qgd29yZHMgYW5kIGxleCBkaXZlcnNpdHkgd2hpbGUgSGFycGVyIE1lcmNlciBoYWQgdGhlIGhpZ2hlc3QgbGV4IGRlbnNpdHkuDQoNCg0KMy4gSGVyZSB3ZSBoYXZlIGEgdGFibGUgd2l0aCB0aGUgbWVhbiB3b3JkIGxlbmd0aCBvZiBlYWNoIGRvY3VtZW50LiAgDQpgYGB7cn0NCm1hbmlmZXN0b3Nfd29yZHMgJT4lDQogIGdyb3VwX2J5KGF1dGhvcikgJT4lIA0KICBtdXRhdGUod29yZF9sZW5ndGggPSBuY2hhcih3b3JkKSkgJT4lIA0KICBzdW1tYXJpemUobWVhbl93b3JkX2xlbmd0aCA9IG1lYW4od29yZF9sZW5ndGgpKSAlPiUgDQogIGFycmFuZ2UoLW1lYW5fd29yZF9sZW5ndGgpDQpgYGANCkJyZWl2aWsgaGFkIHRoZSBoaWdoZXN0IG1lYW4gd29yZCBsZW5ndGguDQoNCg0KNC4gSGVyZSBpcyBhIGdyYXBoIHdpdGggbWluaSBoaXN0b2dyYW1zIG9mIGVhY2ggZG9jdW1lbnQncyB3b3JkIGxlbmd0aHMuIA0KYGBge3J9DQptYW5pZmVzdG9zX3dvcmRzICU+JQ0KICBtdXRhdGUod29yZF9sZW5ndGggPSBuY2hhcih3b3JkKSkgJT4lIA0KICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKw0KICBnZW9tX2hpc3RvZ3JhbShiaW53aWR0aCA9IDEpICsNCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlX3kiKSArDQogIGxhYnMoIHRpdGxlID0gIldvcmQgTGVuZ3RocyBieSBBdXRob3IiKQ0KDQpgYGANCk1vc3Qgb2YgdGhlIGdyYXBocyBoYXZlIGEgc2ltaWxhciBzaGFwZSB3aGljaCBJIGZpbmQgdmVyeSBpbnRlcmVzdGluZy4NCg0KDQo1LiBIZXJlIGlzIGEgZ3JhcGggd2l0aCByZW1vdmVkIHN0b3Agd29yZHMgYW5kICBhIGdyYXBoIHdpdGggdGhlIG1vc3QgY29tbW9uIHdvcmRzIGluIGVhY2gg
ZG9jdW1lbnQuICANCmBgYHtyfQ0KbWFuaWZlc3Rvc193b3JkcyAlPiUNCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpICU+JSANCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgDQogIGNvdW50KHdvcmQsIHNvcnQgPSBUKSAlPiUNCiAgdG9wX24oNSkgJT4lIA0KICB1bmdyb3VwKCkgJT4lIA0KICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwgbikpICU+JQ0KICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBhdXRob3IpKSArDQogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsNCiAgbGFicyh4ID0gTlVMTCwgeSA9ICJNb3N0IGNvbW1vbiB3b3JkcyIpICsNCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKw0KICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsNCiAgdGhlbWVfbWluaW1hbCgpICsNCiAgY29vcmRfZmxpcCgpDQogIA0KDQpgYGANCkJhc2VkIG9uIHRoZXNlIHdvcmRzIGl0IHNlZW1zIG1hbnkgb2YgdGhlc2UgcGVvcGxlIGhhZCBpc3N1ZXMgd2l0aCB0aGUgc29jaWV0eSB0aGV5IHdlcmUgbGl2aW5nIGluIHdoaWNoIHByb2JhYmx5IGNhdXNlZCB0aGVpciBhY3Rpb25zLg0KDQoNCjYuIEhlcmUgYXJlIHRoZSBjYWxjdWxhdGVkIHRmLWlkZnMgYW5kIGNyZWF0ZSBhIGdyYXBoIG9mIHRoZSB3b3JkcyB3aXRoIHRoZSBoaWdoZXN0IHRmLWlkZnMgaW4gZWFjaCBkb2N1bWVudC4gDQpgYGB7cn0NCm1hbmlmZXN0b3Nfd29yZF9jb3VudHMgPC0gbWFuaWZlc3RvcyAlPiUgICAgICAgICAgICAgDQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lDQogIGNvdW50KGF1dGhvciwgd29yZCwgc29ydCA9IFRSVUUpIA0KDQp0b3RhbF93b3JkcyA8LSBtYW5pZmVzdG9zX3dvcmRfY291bnRzICU+JSAgICAgICAgICAgICAgIA0KICBncm91cF9ieShhdXRob3IpICU+JSANCiAgc3VtbWFyaXplKHRvdGFsID0gc3VtKG4pKQ0KDQptYW5pZmVzdG9zX3dvcmRfY291bnRzIDwtIGxlZnRfam9pbihtYW5pZmVzdG9zX3dvcmRfY291bnRzLCB0b3RhbF93b3JkcykgICAgDQoNCm1hbmlmZXN0b3NfdGZfaWRmIDwtIG1hbmlmZXN0b3Nfd29yZF9jb3VudHMgJT4lICAgICAgICAgICAgIA0KICBiaW5kX3RmX2lkZih3b3JkLCBhdXRob3IsIG4pDQoNCm1hbmlmZXN0b3NfdGZfaWRmICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgDQogIGFycmFuZ2UoLXRmX2lkZikgICAgICAgICAgICAgICAgICAgICAgICAgIA0KDQpgYGANCg0KYGBge3J9DQptYW5pZmVzdG9zX3RmX2lkZiAlPiUNCiAgYXJyYW5nZSgtdGZfaWRmKSAlPiUNCiAgbXV0YXRlKHdvcmQgPSBmYWN0b3Iod29yZCwgbGV2ZWxzID0gcmV2KHVuaXF1ZSh3b3JkKSkpKSAlPiUgDQogIGdyb3VwX2J5KGF1dGhvcikgJT4lIA0KICB0b3Bfbig1KSAlPiUgDQogIGdncGxvdChhZXMod29yZCwgdGZfaWRmLCBmaWxsID0gYXV0aG9yKSkgKw0KICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArDQogIGxhYnMoeCA9IE5VTEwsIHkgPSAi
dGYtaWRmIikgKw0KICBmYWNldF93cmFwKH5hdXRob3IsIHNjYWxlcyA9ICJmcmVlIikgKw0KICBjb29yZF9mbGlwKCkgKw0KICB0aGVtZV9taW5pbWFsKCkgKw0KICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsNCiAgbGFicyh0aXRsZSA9ICJNb3N0IGRpc3RpbmN0aXZlIHdvcmRzIGluIGVhY2ggc3VpY2lkZSBub3RlIikNCg0KYGBgDQpIYXJwZXIgTWVyY2VyIGhhZCB0aGUgbGVhc3QgdmFyaWV0eSBpbiB0Zi1pZHMuDQo=