Kaitlin Kavlie PSYC-541 Lab #8: Text Analysis of Manifestos
- Using the first code below, I read the Manifestos texts into R and created a data set called ‘manifestos’. Then, with the second code, I unnested the words for each of the manifestos.
# Read the manifesto spreadsheet and show it.
manifestos <- read_excel("manifestos.xlsx")
manifestos

# Split the `text` column into tokens stored in a new `word` column,
# then show the tokenized table.
manifestos_words <- unnest_tokens(manifestos, output = word, input = text)
manifestos_words
NA
- By using the code shown below, I created a table that includes both lexical diversity and density, as well as the total number of words for each manifesto.
# Per-author summary: token count, number of distinct words, and the
# ratio of distinct words to tokens.
manifestos_words %>%
  group_by(author) %>%
  summarise(
    num_words = n(),
    lex_diversity = n_distinct(word),
    # Reuses the two columns computed just above.
    lex_density = lex_diversity / num_words
  )
- I produced a table with the mean word length of each of the manifestos by using the code shown below.
# Average number of characters per word for each author, longest first.
manifestos_words %>%
  group_by(author) %>%
  summarise(mean_word_length = mean(nchar(word))) %>%
  arrange(desc(mean_word_length))
- I generated a graph using the code chunk below, which includes mini histograms of each manifesto’s word lengths.
# One histogram of word lengths per author; each panel gets its own
# y-axis range while the x-axis is shared.
manifestos_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(mapping = aes(x = word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~author, scales = "free_y") +
  ggtitle("Word lengths by manifesto author")

- I removed stop words and then created a graph with the most common words in each document.
I used this first code to display the list of stop words.
# Print the stop-word table. Presumably this is the `stop_words` data set
# shipped with tidytext -- confirm that the package is attached before
# this chunk runs.
stop_words
Then I used the code directly below to remove the stop words from the manifestos_words data set.
# Drop every row whose word appears in the stop-word table. The result is
# printed, not stored, so `manifestos_words` itself is left unchanged.
# The join key is named explicitly so the match never depends on which
# other column names the two tables happen to share.
manifestos_words %>%
  anti_join(stop_words, by = "word")
The code chunk below was used to create a graph of the most used words in each of the manifestos.
# Bar chart of the five most frequent non-stop-words for each author.
manifestos_words %>%
  # Match on `word` only, stated explicitly rather than inferred.
  anti_join(stop_words, by = "word") %>%
  group_by(author) %>%
  count(word, sort = TRUE) %>%
  # Rank by the count column by name instead of "whatever column is last".
  top_n(5, wt = n) %>%
  ungroup() %>%
  # Order the bars by frequency.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words in each manifesto") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()

- Using the first code shown below, I calculated tf-idfs for each manifesto and produced a table displaying the results. Then, with the last code below, I created a graph of the words with the highest tf-idfs in each manifesto.
# Count how often each word occurs in each author's manifesto.
manifestos_word_counts <- manifestos %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

# Total number of word tokens per author.
total_words <- manifestos_word_counts %>%
  group_by(author) %>%
  summarize(total = sum(n))

# Attach each author's total to the per-word counts. The key is named
# explicitly so the join cannot silently pick up another shared column.
manifestos_word_counts <- left_join(
  manifestos_word_counts,
  total_words,
  by = "author"
)

# Compute tf-idf with `word` as the term, `author` as the document and
# `n` as the count.
manifestos_tf_idf <- manifestos_word_counts %>%
  bind_tf_idf(word, author, n)

# Show the table with the highest tf-idf values first.
manifestos_tf_idf %>%
  arrange(-tf_idf)
NA
With this final code below I created a graph of the words with the highest tf-idfs in each manifesto.
# Bar chart of the five highest tf-idf words for each author.
manifestos_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Freeze the factor levels in reverse tf-idf order so that, after
  # coord_flip(), the highest-scoring words sit at the top of each panel.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(author) %>%
  # Rank by tf_idf by name; the implicit form used whichever column
  # happened to be last in the table.
  top_n(5, wt = tf_idf) %>%
  # Hand ggplot a plain, ungrouped table.
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Most distinctive words in each manifesto",
    x = NULL,
    y = "tf-idf"
  ) +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_d()

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpLYWl0bGluIEthdmxpZSBQU1lDLTU0MQ0KTGFiICM4OiBUZXh0IEFuYWx5c2lzIG9mIE1hbmlmZXN0b3MNCg0KDQoNCg0KDQoxLiBVc2luZyB0aGUgZmlyc3QgY29kZSBiZWxvdyBJIHJlYWQgdGhlIE1hbmlmZXN0b3MgdGV4dHMgaW50byBSIGFuZCBjcmVhdGVkIGEgZGF0YSBzZXQgY2FsbGVkICdtYW5pZmVzdG9zJy4gVGhlbiB3aXRoIHRoZSBzZWNvbmQgY29kZSBJIHVubmVzdCB0aGUgd29yZHMgZm9yIGVhY2ggb2YgdGhlIG1hbmlmZXN0b3MuDQoNCg0KDQpgYGB7cn0NCm1hbmlmZXN0b3MgPC0gcmVhZF9leGNlbCgnbWFuaWZlc3Rvcy54bHN4JykNCg0KbWFuaWZlc3Rvcw0KYGBgDQoNCg0KDQoNCmBgYHtyfQ0KbWFuaWZlc3Rvc193b3JkcyA8LSBtYW5pZmVzdG9zICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpDQoNCm1hbmlmZXN0b3Nfd29yZHMNCg0KYGBgDQoNCg0KDQoNCjIuIEJ5IHVzaW5nIHRoZSBjb2RlIHNob3duIGJlbG93LCBJIGNyZWF0ZWQgYSB0YWJsZSB0aGF0IGluY2x1ZGVzIGJvdGggbGV4aWNhbCBkaXZlcnNpdHkgYW5kIGRlbnNpdHksIGFzIHdlbGwgYXMgdGhlIHRvdGFsIG51bWJlciBvZiB3b3JkcyBmb3IgZWFjaCBtYW5pZmVzdG8uICANCg0KYGBge3J9DQptYW5pZmVzdG9zX3dvcmRzICU+JSANCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgDQogIHN1bW1hcmlzZShudW1fd29yZHMgPSBuKCksDQogICAgICAgICAgICBsZXhfZGl2ZXJzaXR5ID0gbl9kaXN0aW5jdCh3b3JkKSwgDQogICAgICAgICAgICBsZXhfZGVuc2l0eSA9IG5fZGlzdGluY3Qod29yZCkvbigpKQ0KYGBgDQoNCg0KDQoNCjMuIEkgcHJvZHVjZWQgYSB0YWJsZSB3aXRoIHRoZSBtZWFuIHdvcmQgbGVuZ3RoIG9mIGVhY2ggb2YgdGhlIG1hbmlmZXN0b3MgYnkgdXNpbmcgdGhlIGNvZGUgc2hvd24gYmVsb3cuICANCg0KYGBge3J9DQptYW5pZmVzdG9zX3dvcmRzICU+JQ0KICBncm91cF9ieShhdXRob3IpICU+JSANCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSANCiAgc3VtbWFyaXplKG1lYW5fd29yZF9sZW5ndGggPSBtZWFuKHdvcmRfbGVuZ3RoKSkgJT4lIA0KICBhcnJhbmdlKC1tZWFuX3dvcmRfbGVuZ3RoKQ0KYGBgDQoNCg0KDQoNCg0KNC4gSSBnZW5lcmF0ZWQgYSBncmFwaCB1c2luZyB0aGUgY29kZSBjaHVuayBiZWxvdywgd2hpY2ggaW5jbHVkZXMgbWluaSBoaXN0b2dyYW1zIG9mIGVhY2ggbWFuaWZlc3RvJ3Mgd29yZCBsZW5ndGhzLg0KDQpgYGB7cn0NCm1hbmlmZXN0b3Nfd29yZHMgJT4lDQogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSAlPiUgDQogIGdncGxvdChhZXMod29yZF9sZW5ndGgpKSArDQogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKw0KICBmYWNldF93cmFwKHZhcnMoYXV0aG9yKSwgc2NhbGVzID0gImZyZWVfeSIpICsNCiAgbGFicyh0aXRsZSA9ICJXb3JkIGxl
bmd0aHMgYnkgbWFuaWZlc3RvIGF1dGhvciIpDQpgYGANCg0KDQoNCjUuIEkgcmVtb3ZlZCBzdG9wIHdvcmRzIGFuZCB0aGVuIGNyZWF0ZWQgYSBncmFwaCB3aXRoIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyBpbiBlYWNoIGRvY3VtZW50LiANCg0KSSB1c2VkIHRoaXMgZmlyc3QgY29kZSB0byBsb2FkIHRoZSBsaXN0IG9mIHN0b3Agd29yZHMuDQoNCmBgYHtyfQ0Kc3RvcF93b3Jkcw0KYGBgDQoNClRoZW4gSSB1c2VkIHRoZSBjb2RlIGRpcmVjdGx5IGJlbG93IHRvIHJlbW92ZSB0aGUgc3RvcCB3b3JkcyBmcm9tIHRoZSBtYW5pZmVzdG9zX3dvcmRzIGRhdGEgc2V0Lg0KDQpgYGB7cn0NCm1hbmlmZXN0b3Nfd29yZHMgJT4lDQogIGFudGlfam9pbihzdG9wX3dvcmRzKQ0KYGBgDQoNClRoZSBjb2RlIGNodW5rIGJlbG93IHdhcyB1c2VkIHRvIGNyZWF0ZSBhIGdyYXBoIG9mIHRoZSBtb3N0IHVzZWQgd29yZHMgaW4gZWFjaCBvZiB0aGUgbWFuaWZlc3Rvcy4NCg0KYGBge3J9DQptYW5pZmVzdG9zX3dvcmRzICU+JQ0KICBhbnRpX2pvaW4oc3RvcF93b3JkcykgJT4lIA0KICBncm91cF9ieShhdXRob3IpICU+JSANCiAgY291bnQod29yZCwgc29ydCA9IFQpICU+JQ0KICB0b3Bfbig1KSAlPiUgDQogIHVuZ3JvdXAoKSAlPiUgDQogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lDQogIGdncGxvdChhZXMod29yZCwgbiwgZmlsbCA9IGF1dGhvcikpICsNCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKw0KICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgY29tbW9uIHdvcmRzIGluIGVhY2ggbWFuaWZlc3RvIikgKw0KICBmYWNldF93cmFwKHZhcnMoYXV0aG9yKSwgc2NhbGVzID0gImZyZWUiKSArDQogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKw0KICB0aGVtZV9taW5pbWFsKCkgKw0KICBjb29yZF9mbGlwKCkNCg0KYGBgDQoNCg0KDQo2LiBVc2luZyB0aGUgZmlyc3QgY29kZSBzaG93biBiZWxvdywgSSBjYWxjdWxhdGVkIHRmLWlkZnMgZm9yIGVhY2ggbWFuaWZlc3RvIGFuZCBhIHRhYmxlIGRpc3BsYXlpbmcgdGhlIHJlc3VsdHMuIFRoZW4gd2l0aCB0aGUgbGFzdCBjb2RlIGJlbG93IEkgY3JlYXRlZCBhIGdyYXBoIG9mIHRoZSB3b3JkcyB3aXRoIHRoZSBoaWdoZXN0IHRmLWlkZnMgaW4gZWFjaCBtYW5pZmVzdG8gIA0KDQpgYGB7cn0NCm1hbmlmZXN0b3Nfd29yZF9jb3VudHMgPC0gbWFuaWZlc3RvcyAlPiUgICAgICAgICAgICAgDQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lDQogIGNvdW50KGF1dGhvciwgd29yZCwgc29ydCA9IFRSVUUpIA0KDQp0b3RhbF93b3JkcyA8LSBtYW5pZmVzdG9zX3dvcmRfY291bnRzICU+JSAgICAgICAgICAgICAgIA0KICBncm91cF9ieShhdXRob3IpICU+JSANCiAgc3VtbWFyaXplKHRvdGFsID0gc3VtKG4pKQ0KDQptYW5pZmVzdG9zX3dvcmRfY291bnRzIDwtIGxlZnRfam9pbihtYW5pZmVzdG9zX3dvcmRfY291bnRzLCB0b3RhbF93b3JkcykgICAgDQoNCm1hbmlmZXN0b3Nf
dGZfaWRmIDwtIG1hbmlmZXN0b3Nfd29yZF9jb3VudHMgJT4lICAgICAgICAgICAgIA0KICBiaW5kX3RmX2lkZih3b3JkLCBhdXRob3IsIG4pDQoNCm1hbmlmZXN0b3NfdGZfaWRmICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgDQogIGFycmFuZ2UoLXRmX2lkZikgICAgICAgICAgICAgICAgICAgICAgICAgIA0KDQpgYGANCg0KDQpXaXRoIHRoaXMgZmluYWwgY29kZSBiZWxvdyBJIGNyZWF0ZWQgYSBncmFwaCBvZiB0aGUgd29yZHMgd2l0aCB0aGUgaGlnaGVzdCB0Zi1pZGZzIGluIGVhY2ggbWFuaWZlc3RvLiAgDQoNCg0KYGBge3J9DQptYW5pZmVzdG9zX3RmX2lkZiAlPiUNCiAgYXJyYW5nZSgtdGZfaWRmKSAlPiUNCiAgbXV0YXRlKHdvcmQgPSBmYWN0b3Iod29yZCwgbGV2ZWxzID0gcmV2KHVuaXF1ZSh3b3JkKSkpKSAlPiUgDQogIGdyb3VwX2J5KGF1dGhvcikgJT4lIA0KICB0b3Bfbig1KSAlPiUgDQogIGdncGxvdChhZXMod29yZCwgdGZfaWRmLCBmaWxsID0gYXV0aG9yKSkgKw0KICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArDQogIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKw0KICBmYWNldF93cmFwKH5hdXRob3IsIHNjYWxlcyA9ICJmcmVlIikgKw0KICBjb29yZF9mbGlwKCkgKw0KICB0aGVtZV9taW5pbWFsKCkgKyANCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArDQogIGxhYnModGl0bGUgPSAiTW9zdCBkaXN0aW5jdGl2ZSB3b3JkcyBpbiBlYWNoIG1hbmlmZXN0byIpDQoNCmBgYA0KDQoNCg0KDQoNCg0KDQoNCg==