library(tidyverse)
library(tidytext)
library(readxl) 

These are the packages that I used for this assignment.

inaug_speeches <- read_excel("R class/inaug_speeches.xlsx")
inaug_words <- inaug_speeches |>
  unnest_tokens(word, text)

Question 1:

inaug_words |> 
  group_by(author) |> 
  summarize(num_words = n(), lex_diversity = n_distinct(word),
            lexical_density =  lex_diversity/num_words)

This is a table showing the complexity of the number of words, diversity, and density of each President’s speech.

inaug_words |>
  mutate(word_length = nchar(word)) |> 
  ggplot(aes(word_length)) +
  facet_wrap(vars(author), scales = "free_y") +
  geom_histogram(binwidth = 1) +
  labs(title = "Word Lengths of Presidential Inaugeration Speeches")

This is a graph showing the distributions of word lengths for each Presidential speech.

Question 2:

inaug_words |> 
  group_by(author) |>
  count(word, sort = T) |>
  top_n(5) |>
  ungroup() |> 
  mutate(word = reorder(word, n)) |>
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~author, scales = "free") +           
  scale_fill_viridis_d() +                        
  theme_minimal() +                                
  labs(x = NULL, y = "Most common inaugural speech words")
Selecting by n

This is a graph showing the most common words used by each President during their speech.

stop_words %>% 
  filter(lexicon == "snowball") -> snowball
inaug_words %>% 
  anti_join(snowball) %>% 
  group_by(author) %>% 
  count(word, sort = T) %>% 
  top_n(5) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Commonality of Inauguration Speech Vocabulary") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining with `by = join_by(word)`Selecting by n

This is a graph showing the most common words used by each President during their speech excluding common words.

Question 3:

inaug_word_counts <- inaug_speeches |>             
  unnest_tokens(word, text) |>
  count(author, word, sort = TRUE) 

total_words <- inaug_word_counts |>               
  group_by(author) |> 
  summarize(total = sum(n))

inaug_word_counts <- left_join(inaug_word_counts, total_words)
inaug_tf_idf <- inaug_word_counts |>             
  bind_tf_idf(word, author, n)

inaug_tf_idf |>                                   
  arrange(-tf_idf) 

This is a table showing the word frequency (tf) in each President’s speech, the number of times that the word was used in other speeches (idf), and the significance of the word in relation to this speech compared to the use in other speeches.

inaug_tf_idf |>
  arrange(-tf_idf) |>
  mutate(word = factor(word, levels = rev(unique(word)))) |> 
  group_by(author) |> 
  top_n(3) |> 
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, scales = "free") +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(title = "Significant words for each of the Presidential inaugural speech") +
  coord_flip()
Selecting by tf_idf

This is a graph showing the most common significant words that each president used during their speeches.

Question 4:

bing <- get_sentiments("bing") 

inaug_words %>% 
  inner_join(bing) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  group_by(sentiment) %>% 
  top_n(10) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") + 
  labs(y = "Sentiment analysis of the most Common Positive and Negative Words Used", x = NULL) +
  scale_fill_viridis_d() +
  coord_flip() +
  theme_minimal()
Joining with `by = join_by(word)`Selecting by n

This is a graph showing the most common positive and negative words that were used in the speeches.

Question 5:

inaug_speeches %>% 
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% 
  select(bigram) -> inaug_bigrams
inaug_bigrams %>% 
  count(bigram, sort = T)

This is a table showing the most common bigrams used in the presidential speeches.

inaug_bigrams |> 
  separate(bigram, c("word1", "word2"), sep = " ") |> 
  filter(!word1 %in% snowball$word) |>
  filter(!word2 %in% snowball$word) |> 
  unite(bigram, word1, word2, sep = " ") |>
  count(bigram, sort = T)

This is a table showing the most commonly used bigrams from the speeches excluding common stop words.

Question 6:

first_word <- c("citizens", "government")  

inaug_bigrams |> 
  count(bigram, sort = T) |> 
  separate(bigram, c("word1", "word2"), sep = " ") |>       
  filter(word1 %in% first_word) |>                          
  count(word1, word2, wt = n, sort = TRUE) |>
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) |>     
  group_by(word1) |> 
  top_n(5) |> 
  ggplot(aes(word2, n, fill = word1)) +                          
  scale_fill_viridis_d() +                                           
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip() +
  theme_minimal()
Selecting by n

This is a graph showing the most frequent words used directly after the words “citizens” and “government” in the presidential speeches.

LS0tCnRpdGxlOiAiVGV4dCBBbmFseXNpcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShyZWFkeGwpIApgYGAKClRoZXNlIGFyZSB0aGUgcGFja2FnZXMgdGhhdCBJIHVzZWQgZm9yIHRoaXMgYXNzaWdubWVudC4KCmBgYHtyfQppbmF1Z19zcGVlY2hlcyA8LSByZWFkX2V4Y2VsKCJSIGNsYXNzL2luYXVnX3NwZWVjaGVzLnhsc3giKQpgYGAKCmBgYHtyfQppbmF1Z193b3JkcyA8LSBpbmF1Z19zcGVlY2hlcyB8PgogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkKYGBgCgpRdWVzdGlvbiAxOgoKYGBge3J9CmluYXVnX3dvcmRzIHw+IAogIGdyb3VwX2J5KGF1dGhvcikgfD4gCiAgc3VtbWFyaXplKG51bV93b3JkcyA9IG4oKSwgbGV4X2RpdmVyc2l0eSA9IG5fZGlzdGluY3Qod29yZCksCiAgICAgICAgICAgIGxleGljYWxfZGVuc2l0eSA9ICBsZXhfZGl2ZXJzaXR5L251bV93b3JkcykKYGBgCgpUaGlzIGlzIGEgdGFibGUgc2hvd2luZyB0aGUgY29tcGxleGl0eSBvZiB0aGUgbnVtYmVyIG9mIHdvcmRzLCBkaXZlcnNpdHksIGFuZCBkZW5zaXR5IG9mIGVhY2ggUHJlc2lkZW50J3Mgc3BlZWNoLiAKCmBgYHtyfQppbmF1Z193b3JkcyB8PgogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSB8PiAKICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZV95IikgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKwogIGxhYnModGl0bGUgPSAiV29yZCBMZW5ndGhzIG9mIFByZXNpZGVudGlhbCBJbmF1Z2VyYXRpb24gU3BlZWNoZXMiKQpgYGAKClRoaXMgaXMgYSBncmFwaCBzaG93aW5nIHRoZSBkaXN0cmlidXRpb25zIG9mIHdvcmQgbGVuZ3RocyBmb3IgZWFjaCBQcmVzaWRlbnRpYWwgc3BlZWNoLiAKClF1ZXN0aW9uIDI6CgpgYGB7cn0KaW5hdWdfd29yZHMgfD4gCiAgZ3JvdXBfYnkoYXV0aG9yKSB8PgogIGNvdW50KHdvcmQsIHNvcnQgPSBUKSB8PgogIHRvcF9uKDUpIHw+CiAgdW5ncm91cCgpIHw+IAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgfD4KICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGNvb3JkX2ZsaXAoKSArCiAgZmFjZXRfd3JhcCh+YXV0aG9yLCBzY2FsZXMgPSAiZnJlZSIpICsgICAgICAgICAgIAogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKyAgICAgICAgICAgICAgICAgICAgICAgIAogIHRoZW1lX21pbmltYWwoKSArICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgY29tbW9uIGluYXVndXJhbCBzcGVlY2ggd29yZHMiKQpgYGAKClRoaXMgaXMgYSBncmFwaCBzaG93aW5nIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyB1c2VkIGJ5IGVhY2ggUHJlc2lkZW50IGR1cmluZyB0aGVpciBzcGVlY2guIAoKYGBge3J9CnN0b3Bfd29yZHMgJT4lIAogIGZpbHRlcihsZXhpY29uID09ICJzbm93YmFsbCIpIC0+IHNub3diYWxsCmluYXVnX3dvcmRzICU+JSAKICBhbnRpX2pvaW4oc25vd2JhbGwpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICBjb3VudCh3b3JkLCBzb3J0ID0gVCkgJT4lIAogIHRvcF9uKDUpICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lIAogIGdncGxvdChhZXMod29yZCwgbiwgZmlsbCA9IGF1dGhvcikpICsKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9ICJDb21tb25hbGl0eSBvZiBJbmF1Z3VyYXRpb24gU3BlZWNoIFZvY2FidWxhcnkiKSArCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKVGhpcyBpcyBhIGdyYXBoIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIHdvcmRzIHVzZWQgYnkgZWFjaCBQcmVzaWRlbnQgZHVyaW5nIHRoZWlyIHNwZWVjaCBleGNsdWRpbmcgY29tbW9uIHdvcmRzLiAKClF1ZXN0aW9uIDM6IAoKYGBge3J9CmluYXVnX3dvcmRfY291bnRzIDwtIGluYXVnX3NwZWVjaGVzIHw+ICAgICAgICAgICAgIAogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgfD4KICBjb3VudChhdXRob3IsIHdvcmQsIHNvcnQgPSBUUlVFKSAKCnRvdGFsX3dvcmRzIDwtIGluYXVnX3dvcmRfY291bnRzIHw+ICAgICAgICAgICAgICAgCiAgZ3JvdXBfYnkoYXV0aG9yKSB8PiAKICBzdW1tYXJpemUodG90YWwgPSBzdW0obikpCgppbmF1Z193b3JkX2NvdW50cyA8LSBsZWZ0X2pvaW4oaW5hdWdfd29yZF9jb3VudHMsIHRvdGFsX3dvcmRzKQoKYGBgCgpgYGB7cn0KaW5hdWdfdGZfaWRmIDwtIGluYXVnX3dvcmRfY291bnRzIHw+ICAgICAgICAgICAgIAogIGJpbmRfdGZfaWRmKHdvcmQsIGF1dGhvciwgbikKCmluYXVnX3RmX2lkZiB8PiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgYXJyYW5nZSgtdGZfaWRmKSAKYGBgCgpUaGlzIGlzIGEgdGFibGUgc2hvd2luZyB0aGUgd29yZCBmcmVxdWVuY3kgKHRmKSBpbiBlYWNoIFByZXNpZGVudCdzIHNwZWVjaCwgdGhlIG51bWJlciBvZiB0aW1lcyB0aGF0IHRoZSB3b3JkIHdhcyB1c2VkIGluIG90aGVyIHNwZWVjaGVzIChpZGYpLCBhbmQgdGhlIHNpZ25pZmljYW5jZSBvZiB0aGUgd29yZCBpbiByZWxhdGlvbiB0byB0aGlzIHNwZWVjaCBjb21wYXJlZCB0byB0aGUgdXNlIGluIG90aGVyIHNwZWVjaGVzLiAKCmBgYHtyfQppbmF1Z190Zl9pZGYgfD4KICBhcnJhbmdlKC10Zl9pZGYpIHw+CiAgbXV0YXRlKHdvcmQgPSBmYWN0b3Iod29yZCwgbGV2ZWxzID0gcmV2KHVuaXF1ZSh3b3JkKSkpKSB8PiAKICBncm91cF9ieShhdXRob3IpIHw+IAogIHRvcF9uKDMpIHw+IAogIGdncGxvdChhZXMod29yZCwgdGZfaWRmLCBmaWxsID0gYXV0aG9yKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gInRmLWlkZiIpICsKICBmYWNldF93cmFwKH5hdXRob3IsIHNjYWxlcyA9ICJmcmVlIikgKwogIHRoZW1lX21pbmltYWwoKSArCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArCiAgbGFicyh0aXRsZSA9ICJTaWduaWZpY2FudCB3b3JkcyBmb3IgZWFjaCBvZiB0aGUgUHJlc2lkZW50aWFsIGluYXVndXJhbCBzcGVlY2giKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKVGhpcyBpcyBhIGdyYXBoIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIHNpZ25pZmljYW50IHdvcmRzIHRoYXQgZWFjaCBwcmVzaWRlbnQgdXNlZCBkdXJpbmcgdGhlaXIgc3BlZWNoZXMuIAoKUXVlc3Rpb24gNDoKCmBgYHtyfQpiaW5nIDwtIGdldF9zZW50aW1lbnRzKCJiaW5nIikgCgppbmF1Z193b3JkcyAlPiUgCiAgaW5uZXJfam9pbihiaW5nKSAlPiUgCiAgY291bnQod29yZCwgc2VudGltZW50LCBzb3J0ID0gVFJVRSkgJT4lIAogIGdyb3VwX2J5KHNlbnRpbWVudCkgJT4lIAogIHRvcF9uKDEwKSAlPiUgCiAgdW5ncm91cCgpICU+JSAKICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwgbikpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBzZW50aW1lbnQpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGZhY2V0X3dyYXAodmFycyhzZW50aW1lbnQpLCBzY2FsZXMgPSAiZnJlZSIpICsgCiAgbGFicyh5ID0gIlNlbnRpbWVudCBhbmFseXNpcyBvZiB0aGUgbW9zdCBDb21tb24gUG9zaXRpdmUgYW5kIE5lZ2F0aXZlIFdvcmRzIFVzZWQiLCB4ID0gTlVMTCkgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIGNvb3JkX2ZsaXAoKSArCiAgdGhlbWVfbWluaW1hbCgpCmBgYAoKVGhpcyBpcyBhIGdyYXBoIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIHBvc2l0aXZlIGFuZCBuZWdhdGl2ZSB3b3JkcyB0aGF0IHdlcmUgdXNlZCBpbiB0aGUgc3BlZWNoZXMuCgpRdWVzdGlvbiA1OgoKYGBge3J9CmluYXVnX3NwZWVjaGVzICU+JSAKICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpICU+JSAKICBzZWxlY3QoYmlncmFtKSAtPiBpbmF1Z19iaWdyYW1zCmluYXVnX2JpZ3JhbXMgJT4lIAogIGNvdW50KGJpZ3JhbSwgc29ydCA9IFQpCmBgYAoKVGhpcyBpcyBhIHRhYmxlIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIGJpZ3JhbXMgdXNlZCBpbiB0aGUgcHJlc2lkZW50aWFsIHNwZWVjaGVzLgoKYGBge3J9CmluYXVnX2JpZ3JhbXMgfD4gCiAgc2VwYXJhdGUoYmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIpLCBzZXAgPSAiICIpIHw+IAogIGZpbHRlcighd29yZDEgJWluJSBzbm93YmFsbCR3b3JkKSB8PgogIGZpbHRlcighd29yZDIgJWluJSBzbm93YmFsbCR3b3JkKSB8PiAKICB1bml0ZShiaWdyYW0sIHdvcmQxLCB3b3JkMiwgc2VwID0gIiAiKSB8PgogIGNvdW50KGJpZ3JhbSwgc29ydCA9IFQpCmBgYAoKVGhpcyBpcyBhIHRhYmxlIHNob3dpbmcgdGhlIG1vc3QgY29tbW9ubHkgdXNlZCBiaWdyYW1zIGZyb20gdGhlIHNwZWVjaGVzIGV4Y2x1ZGluZyBjb21tb24gc3RvcCB3b3Jkcy4gCgpRdWVzdGlvbiA2OgoKYGBge3J9CmZpcnN0X3dvcmQgPC0gYygiY2l0aXplbnMiLCAiZ292ZXJubWVudCIpICAKCmluYXVnX2JpZ3JhbXMgfD4gCiAgY291bnQoYmlncmFtLCBzb3J0ID0gVCkgfD4gCiAgc2VwYXJhdGUoYmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIpLCBzZXAgPSAiICIpIHw+ICAgICAgIAogIGZpbHRlcih3b3JkMSAlaW4lIGZpcnN0X3dvcmQpIHw+ICAgICAgICAgICAgICAgICAgICAgICAgICAKICBjb3VudCh3b3JkMSwgd29yZDIsIHd0ID0gbiwgc29ydCA9IFRSVUUpIHw+CiAgbXV0YXRlKHdvcmQyID0gZmFjdG9yKHdvcmQyLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQyKSkpKSB8PiAgICAgCiAgZ3JvdXBfYnkod29yZDEpIHw+IAogIHRvcF9uKDUpIHw+IAogIGdncGxvdChhZXMod29yZDIsIG4sIGZpbGwgPSB3b3JkMSkpICsgICAgICAgICAgICAgICAgICAgICAgICAgIAogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKyAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9IE5VTEwsIHRpdGxlID0gIldvcmQgZm9sbG93aW5nOiIpICsKICBmYWNldF93cmFwKH53b3JkMSwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgpUaGlzIGlzIGEgZ3JhcGggc2hvd2luZyB0aGUgbW9zdCBmcmVxdWVudCB3b3JkcyB1c2VkIGRpcmVjdGx5IGFmdGVyIHRoZSB3b3JkcyAiY2l0aXplbnMiIGFuZCAiZ292ZXJubWVudCIgaW4gdGhlIHByZXNpZGVudGlhbCBzcGVlY2hlcy4gCgoKCgo=