library(tidyverse)
library(DT)
library(tidytext)        # package for text analysis
library(readxl)          # reads excel files, the format I used for the data

Assignment

  1. Read in the text and unnest the words.
manifestos <- read_excel('manifestos.xlsx')

manifestos
NA

manifestos_words <- manifestos %>%
  unnest_tokens(word, text)

manifestos_words
NA
NA

I used the codes above to first read in the Manifestos document to R, and to “unnest” the words within the spreadsheet.

  1. Generate a table that includes both lexical diversity and density, and the total number of words, of each document.

manifestos_words %>% 
  group_by(author) %>% 
  summarise(num_words = n(),
            lex_diversity = n_distinct(word), 
            lex_density = n_distinct(word)/n())
NA

I used the code above to create a table that includes the lexical diversity and density, as well as showing a total word count for each document.

  1. Generate a table with the mean word length of each document.

manifestos_words %>%
  group_by(author) %>% 
  mutate(word_length = nchar(word)) %>% 
  summarize(mean_word_length = mean(word_length)) %>% 
  arrange(-mean_word_length)
NA
NA

I used the code above to generate a table to show the mean word length for each document.

  1. Generate a graph with mini histograms of each document’s word lengths.

manifestos_words %>%
  mutate(word_length = nchar(word)) %>% 
  ggplot(aes(word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(author), scales = "free_y") +
  labs(title = "Word Lengths per Manifesto")

The code above was used to create mini histograms to compare average word length for the different manifesto documents.

  1. Remove stop words and then create a graph with the most common words in each document.

manifestos_words %>%
  anti_join(stop_words) %>% 
  group_by(author) %>% 
  count(word, sort = T) %>%
  top_n(5) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most Common Words in Each Document") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining, by = "word"Selecting by n

The code above was used to remove the stop words and then graph the most common words in each document. Cho used the most vulgar language of the group.

  1. Calculate tf-idfs and create a graph of the words with the highest tf-idfs in each document.
manifestos_word_counts <- manifestos %>%             
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE) 

total_words <- manifestos_word_counts %>%               
  group_by(author) %>% 
  summarize(total = sum(n))

manifestos_word_counts <- left_join(manifestos_word_counts, total_words)    
Joining, by = "author"
manifestos_tf_idf <- manifestos_word_counts %>%             
  bind_tf_idf(word, author, n)

manifestos_tf_idf %>%                                   
  arrange(-tf_idf)                          
NA

manifestos_tf_idf %>%
  arrange(-tf_idf) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(author) %>% 
  top_n(5) %>% 
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  theme_minimal() + 
  scale_fill_viridis_d() +
  labs(title = "Most Unique Words in Each Manifesto")
Selecting by tf_idf

With the codes above, I first calculated the tf-idfs and created a table to display the results. I then used the data to create a graph of the words with the highest tf-idfs in each manifesto.

LS0tCnRpdGxlOiAiSW50cm8gdG8gVGV4dCBBbmFseXNpcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KERUKQpsaWJyYXJ5KHRpZHl0ZXh0KSAgICAgICAgIyBwYWNrYWdlIGZvciB0ZXh0IGFuYWx5c2lzCmxpYnJhcnkocmVhZHhsKSAgICAgICAgICAjIHJlYWRzIGV4Y2VsIGZpbGVzLCB0aGUgZm9ybWF0IEkgdXNlZCBmb3IgdGhlIGRhdGEKCmBgYAoKCkFzc2lnbm1lbnQKCgoxLiBSZWFkIGluIHRoZSB0ZXh0IGFuZCB1bm5lc3QgdGhlIHdvcmRzLiAgCgpgYGB7cn0KbWFuaWZlc3RvcyA8LSByZWFkX2V4Y2VsKCdtYW5pZmVzdG9zLnhsc3gnKQoKbWFuaWZlc3RvcwoKYGBgCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgPC0gbWFuaWZlc3RvcyAlPiUKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpCgptYW5pZmVzdG9zX3dvcmRzCgoKYGBgCgpJIHVzZWQgdGhlIGNvZGVzIGFib3ZlIHRvIGZpcnN0IHJlYWQgaW4gdGhlIE1hbmlmZXN0b3MgZG9jdW1lbnQgdG8gUiwgYW5kIHRvICJ1bm5lc3QiIHRoZSB3b3JkcyB3aXRoaW4gdGhlIHNwcmVhZHNoZWV0LgoKCjIuIEdlbmVyYXRlIGEgdGFibGUgdGhhdCBpbmNsdWRlcyBib3RoIGxleGljYWwgZGl2ZXJzaXR5IGFuZCBkZW5zaXR5LCBhbmQgdGhlIHRvdGFsIG51bWJlciBvZiB3b3Jkcywgb2YgZWFjaCBkb2N1bWVudC4gCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcmlzZShudW1fd29yZHMgPSBuKCksCiAgICAgICAgICAgIGxleF9kaXZlcnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpLCAKICAgICAgICAgICAgbGV4X2RlbnNpdHkgPSBuX2Rpc3RpbmN0KHdvcmQpL24oKSkKICAgICAgICAgICAgCmBgYAoKSSB1c2VkIHRoZSBjb2RlIGFib3ZlIHRvIGNyZWF0ZSBhIHRhYmxlIHRoYXQgaW5jbHVkZXMgdGhlIGxleGljYWwgZGl2ZXJzaXR5IGFuZCBkZW5zaXR5LCBhcyB3ZWxsIGFzIHNob3dpbmcgYSB0b3RhbCB3b3JkIGNvdW50IGZvciBlYWNoIGRvY3VtZW50LiAKCgozLiBHZW5lcmF0ZSBhIHRhYmxlIHdpdGggdGhlIG1lYW4gd29yZCBsZW5ndGggb2YgZWFjaCBkb2N1bWVudC4gCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSAKICBzdW1tYXJpemUobWVhbl93b3JkX2xlbmd0aCA9IG1lYW4od29yZF9sZW5ndGgpKSAlPiUgCiAgYXJyYW5nZSgtbWVhbl93b3JkX2xlbmd0aCkKCgpgYGAKCkkgdXNlZCB0aGUgY29kZSBhYm92ZSB0byBnZW5lcmF0ZSBhIHRhYmxlIHRvIHNob3cgdGhlIG1lYW4gd29yZCBsZW5ndGggZm9yIGVhY2ggZG9jdW1lbnQuCgo0LiBHZW5lcmF0ZSBhIGdyYXBoIHdpdGggbWluaSBoaXN0b2dyYW1zIG9mIGVhY2ggZG9jdW1lbnQncyB3b3JkIGxlbmd0aHMuCgpgYGB7cn0KCm1hbmlmZXN0b3Nfd29yZHMgJT4lCiAgbXV0YXRlKHdvcmRfbGVuZ3RoID0gbmNoYXIod29yZCkpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZV95IikgKwogIGxhYnModGl0bGUgPSAiV29yZCBMZW5ndGhzIHBlciBNYW5pZmVzdG8iKQoKYGBgCgpUaGUgY29kZSBhYm92ZSB3YXMgdXNlZCB0byBjcmVhdGUgbWluaSBoaXN0b2dyYW1zIHRvIGNvbXBhcmUgYXZlcmFnZSB3b3JkIGxlbmd0aCBmb3IgdGhlIGRpZmZlcmVudCBtYW5pZmVzdG8gZG9jdW1lbnRzLgoKCjUuIFJlbW92ZSBzdG9wIHdvcmRzIGFuZCB0aGVuIGNyZWF0ZSBhIGdyYXBoIHdpdGggdGhlIG1vc3QgY29tbW9uIHdvcmRzIGluIGVhY2ggZG9jdW1lbnQuIAoKYGBge3J9CgptYW5pZmVzdG9zX3dvcmRzICU+JQogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUgCiAgZ3JvdXBfYnkoYXV0aG9yKSAlPiUgCiAgY291bnQod29yZCwgc29ydCA9IFQpICU+JQogIHRvcF9uKDUpICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lCiAgZ2dwbG90KGFlcyh3b3JkLCBuLCBmaWxsID0gYXV0aG9yKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgQ29tbW9uIFdvcmRzIGluIEVhY2ggRG9jdW1lbnQiKSArCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgY29vcmRfZmxpcCgpCgpgYGAKClRoZSBjb2RlIGFib3ZlIHdhcyB1c2VkIHRvIHJlbW92ZSB0aGUgc3RvcCB3b3JkcyBhbmQgdGhlbiBncmFwaCB0aGUgbW9zdCBjb21tb24gd29yZHMgaW4gZWFjaCBkb2N1bWVudC4gQ2hvIHVzZWQgdGhlIG1vc3QgdnVsZ2FyIGxhbmd1YWdlIG9mIHRoZSBncm91cC4gCgoKNi4gQ2FsY3VsYXRlIHRmLWlkZnMgYW5kIGNyZWF0ZSBhIGdyYXBoIG9mIHRoZSB3b3JkcyB3aXRoIHRoZSBoaWdoZXN0IHRmLWlkZnMgaW4gZWFjaCBkb2N1bWVudC4gIAoKCmBgYHtyfQptYW5pZmVzdG9zX3dvcmRfY291bnRzIDwtIG1hbmlmZXN0b3MgJT4lICAgICAgICAgICAgIAogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lCiAgY291bnQoYXV0aG9yLCB3b3JkLCBzb3J0ID0gVFJVRSkgCgp0b3RhbF93b3JkcyA8LSBtYW5pZmVzdG9zX3dvcmRfY291bnRzICU+JSAgICAgICAgICAgICAgIAogIGdyb3VwX2J5KGF1dGhvcikgJT4lIAogIHN1bW1hcml6ZSh0b3RhbCA9IHN1bShuKSkKCm1hbmlmZXN0b3Nfd29yZF9jb3VudHMgPC0gbGVmdF9qb2luKG1hbmlmZXN0b3Nfd29yZF9jb3VudHMsIHRvdGFsX3dvcmRzKSAgICAKCm1hbmlmZXN0b3NfdGZfaWRmIDwtIG1hbmlmZXN0b3Nfd29yZF9jb3VudHMgJT4lICAgICAgICAgICAgIAogIGJpbmRfdGZfaWRmKHdvcmQsIGF1dGhvciwgbikKCm1hbmlmZXN0b3NfdGZfaWRmICU+JSAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgYXJyYW5nZSgtdGZfaWRmKSAgICAgICAgICAgICAgICAgICAgICAgICAgCgpgYGAKCmBgYHtyfQoKbWFuaWZlc3Rvc190Zl9pZGYgJT4lCiAgYXJyYW5nZSgtdGZfaWRmKSAlPiUKICBtdXRhdGUod29yZCA9IGZhY3Rvcih3b3JkLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQpKSkpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICB0b3Bfbig1KSAlPiUgCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYsIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGxhYnMoeCA9IE5VTEwsIHkgPSAidGYtaWRmIikgKwogIGZhY2V0X3dyYXAofmF1dGhvciwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkgKyAKICBzY2FsZV9maWxsX3ZpcmlkaXNfZCgpICsKICBsYWJzKHRpdGxlID0gIk1vc3QgVW5pcXVlIFdvcmRzIGluIEVhY2ggTWFuaWZlc3RvIikKCgpgYGAKCldpdGggdGhlIGNvZGVzIGFib3ZlLCBJIGZpcnN0IGNhbGN1bGF0ZWQgdGhlIHRmLWlkZnMgYW5kIGNyZWF0ZWQgYSB0YWJsZSB0byBkaXNwbGF5IHRoZSByZXN1bHRzLiBJIHRoZW4gdXNlZCB0aGUgZGF0YSB0byBjcmVhdGUgYSBncmFwaCBvZiB0aGUgd29yZHMgd2l0aCB0aGUgaGlnaGVzdCB0Zi1pZGZzIGluIGVhY2ggbWFuaWZlc3RvLgoKCgoKCgoKCgo=