library(tidyverse)
library(DT)
library(tidytext) # package for text analysis
library(readxl) # reads excel files, the format I used for the data
Assignment
- Read in the text and unnest the words.
manifestos <- read_excel('manifestos.xlsx')
manifestos
NA
manifestos_words <- manifestos %>%
unnest_tokens(word, text)
manifestos_words
NA
NA
I used the codes above to first read in the Manifestos document to R, and to “unnest” the words within the spreadsheet.
- Generate a table that includes both lexical diversity and density, and the total number of words, of each document.
manifestos_words %>%
group_by(author) %>%
summarise(num_words = n(),
lex_diversity = n_distinct(word),
lex_density = n_distinct(word)/n())
NA
I used the code above to create a table that includes the lexical diversity and density, as well as showing a total word count for each document.
- Generate a table with the mean word length of each document.
manifestos_words %>%
group_by(author) %>%
mutate(word_length = nchar(word)) %>%
summarize(mean_word_length = mean(word_length)) %>%
arrange(-mean_word_length)
NA
NA
I used the code above to generate a table to show the mean word length for each document.
- Generate a graph with mini histograms of each document’s word lengths.
manifestos_words %>%
mutate(word_length = nchar(word)) %>%
ggplot(aes(word_length)) +
geom_histogram(binwidth = 1) +
facet_wrap(vars(author), scales = "free_y") +
labs(title = "Word Lengths per Manifesto")

The code above was used to create mini histograms to compare average word length for the different manifesto documents.
- Remove stop words and then create a graph with the most common words in each document.
manifestos_words %>%
anti_join(stop_words) %>%
group_by(author) %>%
count(word, sort = T) %>%
top_n(5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Most Common Words in Each Document") +
facet_wrap(vars(author), scales = "free") +
scale_fill_viridis_d() +
theme_minimal() +
coord_flip()
Joining, by = "word"Selecting by n

The code above was used to remove the stop words and then graph the most common words in each document. Cho used the most vulgar language of the group.
- Calculate tf-idfs and create a graph of the words with the highest tf-idfs in each document.
manifestos_word_counts <- manifestos %>%
unnest_tokens(word, text) %>%
count(author, word, sort = TRUE)
total_words <- manifestos_word_counts %>%
group_by(author) %>%
summarize(total = sum(n))
manifestos_word_counts <- left_join(manifestos_word_counts, total_words)
Joining, by = "author"
manifestos_tf_idf <- manifestos_word_counts %>%
bind_tf_idf(word, author, n)
manifestos_tf_idf %>%
arrange(-tf_idf)
NA
manifestos_tf_idf %>%
arrange(-tf_idf) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(author) %>%
top_n(5) %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, scales = "free") +
coord_flip() +
theme_minimal() +
scale_fill_viridis_d() +
labs(title = "Most Unique Words in Each Manifesto")
Selecting by tf_idf

With the codes above, I first calculated the tf-idfs and created a table to display the results. I then used the data to create a graph of the words with the highest tf-idfs in each manifesto.
LS0tCnRpdGxlOiAiSW50cm8gdG8gVGV4dCBBbmFseXNpcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KERUKQpsaWJyYXJ5KHRpZHl0ZXh0KSAgICAgICAgIyBwYWNrYWdlIGZvciB0ZXh0IGFuYWx5c2lzCmxpYnJhcnkocmVhZHhsKSAgICAgICAgICAjIHJlYWRzIGV4Y2VsIGZpbGVzLCB0aGUgZm9ybWF0IEkgdXNlZCBmb3IgdGhlIGRhdGEKCmBgYAoKCkFzc2lnbm1lbnQKCgoxLiBSZWFkIGluIHRoZSB0ZXh0IGFuZCB1bm5lc3QgdGhlIHdvcmRzLiAgCgpgYGB7cn0KbWFuaWZlc3RvcyA8LSByZWFkX2V4Y2VsKCdtYW5pZmVzdG9zLnhsc3gnKQoKbWFuaWZlc3RvcwoKYGBgCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgPC0gbWFuaWZlc3RvcyAlPiUKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpCgptYW5pZmVzdG9zX3dvcmRzCgoKYGBgCgpJIHVzZWQgdGhlIGNvZGVzIGFib3ZlIHRvIGZpcnN0IHJlYWQgaW4gdGhlIE1hbmlmZXN0b3MgZG9jdW1lbnQgdG8gUiwgYW5kIHRvICJ1bm5lc3QiIHRoZSB3b3JkcyB3aXRoaW4gdGhlIHNwcmVhZHNoZWV0LgoKCjIuIEdlbmVyYXRlIGEgdGFibGUgdGhhdCBpbmNsdWRlcyBib3RoIGxleGljYWwgZGl2ZXJzaXR5IGFuZCBkZW5zaXR5LCBhbmQgdGhlIHRvdGFsIG51bWJlciBvZiB3b3Jkcywgb2YgZWFjaCBkb2N1bWVudC4gCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcmlzZShudW1fd29yZHMgPSBuKCksCiAgICAgICAgICAgIGxleF9kaXZlcnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpLCAKICAgICAgICAgICAgbGV4X2RlbnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpL24oKSkKICAgICAgICAgICAgCmBgYAoKSSB1c2VkIHRoZSBjb2RlIGFib3ZlIHRvIGNyZWF0ZSBhIHRhYmxlIHRoYXQgaW5jbHVkZXMgdGhlIGxleGljYWwgZGl2ZXJzaXR5IGFuZCBkZW5zaXR5LCBhcyB3ZWxsIGFzIHNob3dpbmcgYSB0b3RhbCB3b3JkIGNvdW50IGZvciBlYWNoIGRvY3VtZW50LiAKCgozLiBHZW5lcmF0ZSBhIHRhYmxlIHdpdGggdGhlIG1lYW4gd29yZCBsZW5ndGggb2YgZWFjaCBkb2N1bWVudC4gCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSAKICBzdW1tYXJpemUobWVhbl93b3JkX2xlbmd0aCA9IG1lYW4od29yZF9sZW5ndGgpKSAlPiUgCiAgYXJyYW5nZSgtbWVhbl93b3JkX2xlbmd0aCkKCgpgYGAKCkkgdXNlZCB0aGUgY29kZSBhYm92ZSB0byBnZW5lcmF0ZSBhIHRhYmxlIHRvIHNob3cgdGhlIG1lYW4gd29yZCBsZW5ndGggZm9yIGVhY2ggZG9jdW1lbnQuCgo0LiBHZW5lcmF0ZSBhIGdyYXBoIHdpdGggbWluaSBoaXN0b2dyYW1zIG9mIGVhY2ggZG9jdW1lbnQncyB3b3JkIGxlbmd0aHMuCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZV95IikgKwogIGxhYnModGl0bGUgPSAiV29yZCBMZW5ndGhzIHBlciBNYW5pZmVzdG8iKQoKYGBgCgpUaGUgY29kZSBhYm92ZSB3YXMgdXNlZCB0byBjcmVhdGUgbWluaSBoaXN0b2dyYW1zIHRvIGNvbXBhcmUgYXZlcmFnZSB3b3JkIGxlbmd0aCBmb3IgdGhlIGRpZmZlcmVudCBtYW5pZmVzdG8gZG9jdW1lbnRzLgoKCjUuIFJlbW92ZSBzdG9wIHdvcmRzIGFuZCB0aGVuIGNyZWF0ZSBhIGdyYXBoIHdpdGggdGhlIG1vc3QgY29tbW9uIHdvcmRzIGluIGVhY2ggZG9jdW1lbnQuIAoKYGBge3J9CgptYW5pZmVzdG9zX3dvcmRzICU+JQogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUgCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgY291bnQod29yZCwgc29ydCA9IFQpICU+JQogIHRvcF9uKDUpICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCBuLCBmaWxsID0gYXV0aG9yKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgQ29tbW9uIFdvcmRzIGluIEVhY2ggRG9jdW1lbnQiKSArCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgY29vcmRfZmxpcCgpCgpgYGAKClRoZSBjb2RlIGFib3ZlIHdhcyB1c2VkIHRvIHJlbW92ZSB0aGUgc3RvcCB3b3JkcyBhbmQgdGhlbiBncmFwaCB0aGUgbW9zdCBjb21tb24gd29yZHMgaW4gZWFjaCBkb2N1bWVudC4gQ2hvIHVzZWQgdGhlIG1vc3QgdnVsZ2FyIGxhbmd1YWdlIG9mIHRoZSBncm91cC4gCgoKNi4gQ2FsY3VsYXRlIHRmLWlkZnMgYW5kIGNyZWF0ZSBhIGdyYXBoIG9mIHRoZSB3b3JkcyB3aXRoIHRoZSBoaWdoZXN0IHRmLWlkZnMgaW4gZWFjaCBkb2N1bWVudC4gIAoKCmBgYHtyfQptYW5pZmVzdG9zX3dvcmRfY291bnRzIDwtIG1hbmlmZXN0b3MgJT4lICAgICAgICAgICAgIAogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lCiAgY291bnQoYXV0aG9yLCB3b3JkLCBzb3J0ID0gVFJVRSkgCgp0b3RhbF93b3JkcyA8LSBtYW5pZmVzdG9zX3dvcmRfY291bnRzICU+JSAgICAgICAgICAgICAgIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcml6ZSh0b3RhbCA9IHN1bShuKSkKCm1hbmlmZXN0b3Nfd29yZF9jb3VudHMgPC0gbGVmdF9qb2luKG1hbmlmZXN0b3Nfd29yZF9jb3VudHMsIHRvdGFsX3dvcmRzKSAgICAKCm1hbmlmZXN0b3NfdGZfaWRmIDwtIG1hbmlmZXN0b3Nfd29yZF9jb3VudHMgJT4lICAgICAgICAgICAgIAogIGJpbmRfdGZfaWRmKHdvcmQsIGF1dGhvciwgbikKCm1hbmlmZXN0b3NfdGZfaWRmICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgYXJyYW5nZSgtdGZfaWRmKSAgICAgICAgICAgICAgICAgICAgICAgICAgCgpgYGAKCmBgYHtyfQoKbWFuaWZlc3Rvc190Zl9pZGYgJT4lCiAgYXJyYW5nZSgtdGZfaWRmKSAlPiUKICBtdXRhdGUod29yZCA9IGZhY3Rvcih3b3JkLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQpKSkpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICB0b3Bfbig1KSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYsIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKwogIGZhY2V0X3dyYXAofmF1dGhvciwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkgKyAKICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsKICBsYWJzKHRpdGxlID0gIk1vc3QgVW5pcXVlIFdvcmRzIGluIEVhY2ggTWFuaWZlc3RvIikKCgpgYGAKCldpdGggdGhlIGNvZGVzIGFib3ZlLCBJIGZpcnN0IGNhbGN1bGF0ZWQgdGhlIHRmLWlkZnMgYW5kIGNyZWF0ZWQgYSB0YWJsZSB0byBkaXNwbGF5IHRoZSByZXN1bHRzLiBJIHRoZW4gdXNlZCB0aGUgZGF0YSB0byBjcmVhdGUgYSBncmFwaCBvZiB0aGUgd29yZHMgd2l0aCB0aGUgaGlnaGVzdCB0Zi1pZGZzIGluIGVhY2ggbWFuaWZlc3RvLgoKCgoKCgoKCgo=