library(tidyverse)
library(tidytext)
library(readxl)
These are the packages that I used for this assignment.
inaug_speeches <- read_excel("R class/inaug_speeches.xlsx")
inaug_words <- inaug_speeches |>
unnest_tokens(word, text)
Question 1:
inaug_words |>
group_by(author) |>
summarize(num_words = n(), lex_diversity = n_distinct(word),
lexical_density = lex_diversity/num_words)
This is a table showing the complexity of the number of words,
diversity, and density of each President’s speech.
inaug_words |>
mutate(word_length = nchar(word)) |>
ggplot(aes(word_length)) +
facet_wrap(vars(author), scales = "free_y") +
geom_histogram(binwidth = 1) +
labs(title = "Word Lengths of Presidential Inaugeration Speeches")

This is a graph showing the distributions of word lengths for each
Presidential speech.
Question 2:
inaug_words |>
group_by(author) |>
count(word, sort = T) |>
top_n(5) |>
ungroup() |>
mutate(word = reorder(word, n)) |>
ggplot(aes(word, n, fill = author)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_wrap(~author, scales = "free") +
scale_fill_viridis_d() +
theme_minimal() +
labs(x = NULL, y = "Most common inaugural speech words")
Selecting by n

This is a graph showing the most common words used by each President
during their speech.
stop_words %>%
filter(lexicon == "snowball") -> snowball
inaug_words %>%
anti_join(snowball) %>%
group_by(author) %>%
count(word, sort = T) %>%
top_n(5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Commonality of Inauguration Speech Vocabulary") +
facet_wrap(vars(author), scales = "free") +
scale_fill_viridis_d() +
theme_minimal() +
coord_flip()
Joining with `by = join_by(word)`Selecting by n

This is a graph showing the most common words used by each President
during their speech excluding common words.
Question 3:
inaug_word_counts <- inaug_speeches |>
unnest_tokens(word, text) |>
count(author, word, sort = TRUE)
total_words <- inaug_word_counts |>
group_by(author) |>
summarize(total = sum(n))
inaug_word_counts <- left_join(inaug_word_counts, total_words)
inaug_tf_idf <- inaug_word_counts |>
bind_tf_idf(word, author, n)
inaug_tf_idf |>
arrange(-tf_idf)
This is a table showing the word frequency (tf) in each President’s
speech, the number of times that the word was used in other speeches
(idf), and the significance of the word in relation to this speech
compared to the use in other speeches.
inaug_tf_idf |>
arrange(-tf_idf) |>
mutate(word = factor(word, levels = rev(unique(word)))) |>
group_by(author) |>
top_n(3) |>
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, scales = "free") +
theme_minimal() +
scale_fill_viridis_d() +
labs(title = "Significant words for each of the Presidential inaugural speech") +
coord_flip()
Selecting by tf_idf

This is a graph showing the most common significant words that each
president used during their speeches.
Question 4:
bing <- get_sentiments("bing")
inaug_words %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(sentiment), scales = "free") +
labs(y = "Sentiment analysis of the most Common Positive and Negative Words Used", x = NULL) +
scale_fill_viridis_d() +
coord_flip() +
theme_minimal()
Joining with `by = join_by(word)`Selecting by n

This is a graph showing the most common positive and negative words
that were used in the speeches.
Question 5:
inaug_speeches %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
select(bigram) -> inaug_bigrams
inaug_bigrams %>%
count(bigram, sort = T)
This is a table showing the most common bigrams used in the
presidential speeches.
inaug_bigrams |>
separate(bigram, c("word1", "word2"), sep = " ") |>
filter(!word1 %in% snowball$word) |>
filter(!word2 %in% snowball$word) |>
unite(bigram, word1, word2, sep = " ") |>
count(bigram, sort = T)
This is a table showing the most commonly used bigrams from the
speeches excluding common stop words.
Question 6:
first_word <- c("citizens", "government")
inaug_bigrams |>
count(bigram, sort = T) |>
separate(bigram, c("word1", "word2"), sep = " ") |>
filter(word1 %in% first_word) |>
count(word1, word2, wt = n, sort = TRUE) |>
mutate(word2 = factor(word2, levels = rev(unique(word2)))) |>
group_by(word1) |>
top_n(5) |>
ggplot(aes(word2, n, fill = word1)) +
scale_fill_viridis_d() +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = NULL, title = "Word following:") +
facet_wrap(~word1, scales = "free") +
coord_flip() +
theme_minimal()
Selecting by n

This is a graph showing the most frequent words used directly after
the words “citizens” and “government” in the presidential speeches.
LS0tCnRpdGxlOiAiVGV4dCBBbmFseXNpcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShyZWFkeGwpIApgYGAKClRoZXNlIGFyZSB0aGUgcGFja2FnZXMgdGhhdCBJIHVzZWQgZm9yIHRoaXMgYXNzaWdubWVudC4KCmBgYHtyfQppbmF1Z19zcGVlY2hlcyA8LSByZWFkX2V4Y2VsKCJSIGNsYXNzL2luYXVnX3NwZWVjaGVzLnhsc3giKQpgYGAKCmBgYHtyfQppbmF1Z193b3JkcyA8LSBpbmF1Z19zcGVlY2hlcyB8PgogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkKYGBgCgpRdWVzdGlvbiAxOgoKYGBge3J9CmluYXVnX3dvcmRzIHw+IAogIGdyb3VwX2J5KGF1dGhvcikgfD4gCiAgc3VtbWFyaXplKG51bV93b3JkcyA9IG4oKSwgbGV4X2RpdmVyc2l0eSA9IG5fZGlzdGluY3Qod29yZCksCiAgICAgICAgICAgIGxleGljYWxfZGVuc2l0eSA9ICBsZXhfZGl2ZXJzaXR5L251bV93b3JkcykKYGBgCgpUaGlzIGlzIGEgdGFibGUgc2hvd2luZyB0aGUgY29tcGxleGl0eSBvZiB0aGUgbnVtYmVyIG9mIHdvcmRzLCBkaXZlcnNpdHksIGFuZCBkZW5zaXR5IG9mIGVhY2ggUHJlc2lkZW50J3Mgc3BlZWNoLiAKCmBgYHtyfQppbmF1Z193b3JkcyB8PgogIG11dGF0ZSh3b3JkX2xlbmd0aCA9IG5jaGFyKHdvcmQpKSB8PiAKICBnZ3Bsb3QoYWVzKHdvcmRfbGVuZ3RoKSkgKwogIGZhY2V0X3dyYXAodmFycyhhdXRob3IpLCBzY2FsZXMgPSAiZnJlZV95IikgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKwogIGxhYnModGl0bGUgPSAiV29yZCBMZW5ndGhzIG9mIFByZXNpZGVudGlhbCBJbmF1Z2VyYXRpb24gU3BlZWNoZXMiKQpgYGAKClRoaXMgaXMgYSBncmFwaCBzaG93aW5nIHRoZSBkaXN0cmlidXRpb25zIG9mIHdvcmQgbGVuZ3RocyBmb3IgZWFjaCBQcmVzaWRlbnRpYWwgc3BlZWNoLiAKClF1ZXN0aW9uIDI6CgpgYGB7cn0KaW5hdWdfd29yZHMgfD4gCiAgZ3JvdXBfYnkoYXV0aG9yKSB8PgogIGNvdW50KHdvcmQsIHNvcnQgPSBUKSB8PgogIHRvcF9uKDUpIHw+CiAgdW5ncm91cCgpIHw+IAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgfD4KICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBhdXRob3IpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGNvb3JkX2ZsaXAoKSArCiAgZmFjZXRfd3JhcCh+YXV0aG9yLCBzY2FsZXMgPSAiZnJlZSIpICsgICAgICAgICAgIAogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKyAgICAgICAgICAgICAgICAgICAgICAgIAogIHRoZW1lX21pbmltYWwoKSArICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICBsYWJzKHggPSBOVUxMLCB5ID0gIk1vc3QgY29tbW9uIGluYXVndXJhbCBzcGVlY2ggd29yZHMiKQpgYGAKClRoaXMgaXMgYSBncmFwaCBzaG93aW5nIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyB1c2VkIGJ5IGVhY2ggUHJlc2lkZW50IGR1cmluZyB0aGVpciBzcGVlY2guIAoKYGBge3J9CnN0b3Bfd29yZHMgJT4lIAogIGZpbHRlcihsZXhpY29uID09ICJzbm93YmFsbCIpIC0+IHNub3diYWxsCmluYXVnX3dvcmRzICU+JSAKICBhbnRpX2pvaW4oc25vd2JhbGwpICU+JSAKICBncm91cF9ieShhdXRob3IpICU+JSAKICBjb3VudCh3b3JkLCBzb3J0ID0gVCkgJT4lIAogIHRvcF9uKDUpICU+JSAKICB1bmdyb3VwKCkgJT4lIAogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lIAogIGdncGxvdChhZXMod29yZCwgbiwgZmlsbCA9IGF1dGhvcikpICsKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9ICJDb21tb25hbGl0eSBvZiBJbmF1Z3VyYXRpb24gU3BlZWNoIFZvY2FidWxhcnkiKSArCiAgZmFjZXRfd3JhcCh2YXJzKGF1dGhvciksIHNjYWxlcyA9ICJmcmVlIikgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKVGhpcyBpcyBhIGdyYXBoIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIHdvcmRzIHVzZWQgYnkgZWFjaCBQcmVzaWRlbnQgZHVyaW5nIHRoZWlyIHNwZWVjaCBleGNsdWRpbmcgY29tbW9uIHdvcmRzLiAKClF1ZXN0aW9uIDM6IAoKYGBge3J9CmluYXVnX3dvcmRfY291bnRzIDwtIGluYXVnX3NwZWVjaGVzIHw+ICAgICAgICAgICAgIAogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgfD4KICBjb3VudChhdXRob3IsIHdvcmQsIHNvcnQgPSBUUlVFKSAKCnRvdGFsX3dvcmRzIDwtIGluYXVnX3dvcmRfY291bnRzIHw+ICAgICAgICAgICAgICAgCiAgZ3JvdXBfYnkoYXV0aG9yKSB8PiAKICBzdW1tYXJpemUodG90YWwgPSBzdW0obikpCgppbmF1Z193b3JkX2NvdW50cyA8LSBsZWZ0X2pvaW4oaW5hdWdfd29yZF9jb3VudHMsIHRvdGFsX3dvcmRzKQoKYGBgCgpgYGB7cn0KaW5hdWdfdGZfaWRmIDwtIGluYXVnX3dvcmRfY291bnRzIHw+ICAgICAgICAgICAgIAogIGJpbmRfdGZfaWRmKHdvcmQsIGF1dGhvciwgbikKCmluYXVnX3RmX2lkZiB8PiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgYXJyYW5nZSgtdGZfaWRmKSAKYGBgCgpUaGlzIGlzIGEgdGFibGUgc2hvd2luZyB0aGUgd29yZCBmcmVxdWVuY3kgKHRmKSBpbiBlYWNoIFByZXNpZGVudCdzIHNwZWVjaCwgdGhlIG51bWJlciBvZiB0aW1lcyB0aGF0IHRoZSB3b3JkIHdhcyB1c2VkIGluIG90aGVyIHNwZWVjaGVzIChpZGYpLCBhbmQgdGhlIHNpZ25pZmljYW5jZSBvZiB0aGUgd29yZCBpbiByZWxhdGlvbiB0byB0aGlzIHNwZWVjaCBjb21wYXJlZCB0byB0aGUgdXNlIGluIG90aGVyIHNwZWVjaGVzLiAKCmBgYHtyfQppbmF1Z190Zl9pZGYgfD4KICBhcnJhbmdlKC10Zl9pZGYpIHw+CiAgbXV0YXRlKHdvcmQgPSBmYWN0b3Iod29yZCwgbGV2ZWxzID0gcmV2KHVuaXF1ZSh3b3JkKSkpKSB8PiAKICBncm91cF9ieShhdXRob3IpIHw+IAogIHRvcF9uKDMpIHw+IAogIGdncGxvdChhZXMod29yZCwgdGZfaWRmLCBmaWxsID0gYXV0aG9yKSkgKwogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsKICBsYWJzKHggPSBOVUxMLCB5ID0gInRmLWlkZiIpICsKICBmYWNldF93cmFwKH5hdXRob3IsIHNjYWxlcyA9ICJmcmVlIikgKwogIHRoZW1lX21pbmltYWwoKSArCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArCiAgbGFicyh0aXRsZSA9ICJTaWduaWZpY2FudCB3b3JkcyBmb3IgZWFjaCBvZiB0aGUgUHJlc2lkZW50aWFsIGluYXVndXJhbCBzcGVlY2giKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKVGhpcyBpcyBhIGdyYXBoIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIHNpZ25pZmljYW50IHdvcmRzIHRoYXQgZWFjaCBwcmVzaWRlbnQgdXNlZCBkdXJpbmcgdGhlaXIgc3BlZWNoZXMuIAoKUXVlc3Rpb24gNDoKCmBgYHtyfQpiaW5nIDwtIGdldF9zZW50aW1lbnRzKCJiaW5nIikgCgppbmF1Z193b3JkcyAlPiUgCiAgaW5uZXJfam9pbihiaW5nKSAlPiUgCiAgY291bnQod29yZCwgc2VudGltZW50LCBzb3J0ID0gVFJVRSkgJT4lIAogIGdyb3VwX2J5KHNlbnRpbWVudCkgJT4lIAogIHRvcF9uKDEwKSAlPiUgCiAgdW5ncm91cCgpICU+JSAKICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwgbikpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBzZW50aW1lbnQpKSArCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKwogIGZhY2V0X3dyYXAodmFycyhzZW50aW1lbnQpLCBzY2FsZXMgPSAiZnJlZSIpICsgCiAgbGFicyh5ID0gIlNlbnRpbWVudCBhbmFseXNpcyBvZiB0aGUgbW9zdCBDb21tb24gUG9zaXRpdmUgYW5kIE5lZ2F0aXZlIFdvcmRzIFVzZWQiLCB4ID0gTlVMTCkgKwogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKwogIGNvb3JkX2ZsaXAoKSArCiAgdGhlbWVfbWluaW1hbCgpCmBgYAoKVGhpcyBpcyBhIGdyYXBoIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIHBvc2l0aXZlIGFuZCBuZWdhdGl2ZSB3b3JkcyB0aGF0IHdlcmUgdXNlZCBpbiB0aGUgc3BlZWNoZXMuCgpRdWVzdGlvbiA1OgoKYGBge3J9CmluYXVnX3NwZWVjaGVzICU+JSAKICB1bm5lc3RfdG9rZW5zKGJpZ3JhbSwgdGV4dCwgdG9rZW4gPSAibmdyYW1zIiwgbiA9IDIpICU+JSAKICBzZWxlY3QoYmlncmFtKSAtPiBpbmF1Z19iaWdyYW1zCmluYXVnX2JpZ3JhbXMgJT4lIAogIGNvdW50KGJpZ3JhbSwgc29ydCA9IFQpCmBgYAoKVGhpcyBpcyBhIHRhYmxlIHNob3dpbmcgdGhlIG1vc3QgY29tbW9uIGJpZ3JhbXMgdXNlZCBpbiB0aGUgcHJlc2lkZW50aWFsIHNwZWVjaGVzLgoKYGBge3J9CmluYXVnX2JpZ3JhbXMgfD4gCiAgc2VwYXJhdGUoYmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIpLCBzZXAgPSAiICIpIHw+IAogIGZpbHRlcighd29yZDEgJWluJSBzbm93YmFsbCR3b3JkKSB8PgogIGZpbHRlcighd29yZDIgJWluJSBzbm93YmFsbCR3b3JkKSB8PiAKICB1bml0ZShiaWdyYW0sIHdvcmQxLCB3b3JkMiwgc2VwID0gIiAiKSB8PgogIGNvdW50KGJpZ3JhbSwgc29ydCA9IFQpCmBgYAoKVGhpcyBpcyBhIHRhYmxlIHNob3dpbmcgdGhlIG1vc3QgY29tbW9ubHkgdXNlZCBiaWdyYW1zIGZyb20gdGhlIHNwZWVjaGVzIGV4Y2x1ZGluZyBjb21tb24gc3RvcCB3b3Jkcy4gCgpRdWVzdGlvbiA2OgoKYGBge3J9CmZpcnN0X3dvcmQgPC0gYygiY2l0aXplbnMiLCAiZ292ZXJubWVudCIpICAKCmluYXVnX2JpZ3JhbXMgfD4gCiAgY291bnQoYmlncmFtLCBzb3J0ID0gVCkgfD4gCiAgc2VwYXJhdGUoYmlncmFtLCBjKCJ3b3JkMSIsICJ3b3JkMiIpLCBzZXAgPSAiICIpIHw+ICAgICAgIAogIGZpbHRlcih3b3JkMSAlaW4lIGZpcnN0X3dvcmQpIHw+ICAgICAgICAgICAgICAgICAgICAgICAgICAKICBjb3VudCh3b3JkMSwgd29yZDIsIHd0ID0gbiwgc29ydCA9IFRSVUUpIHw+CiAgbXV0YXRlKHdvcmQyID0gZmFjdG9yKHdvcmQyLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQyKSkpKSB8PiAgICAgCiAgZ3JvdXBfYnkod29yZDEpIHw+IAogIHRvcF9uKDUpIHw+IAogIGdncGxvdChhZXMod29yZDIsIG4sIGZpbGwgPSB3b3JkMSkpICsgICAgICAgICAgICAgICAgICAgICAgICAgIAogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKyAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICBnZW9tX2NvbChzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgbGFicyh4ID0gTlVMTCwgeSA9IE5VTEwsIHRpdGxlID0gIldvcmQgZm9sbG93aW5nOiIpICsKICBmYWNldF93cmFwKH53b3JkMSwgc2NhbGVzID0gImZyZWUiKSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgpUaGlzIGlzIGEgZ3JhcGggc2hvd2luZyB0aGUgbW9zdCBmcmVxdWVudCB3b3JkcyB1c2VkIGRpcmVjdGx5IGFmdGVyIHRoZSB3b3JkcyAiY2l0aXplbnMiIGFuZCAiZ292ZXJubWVudCIgaW4gdGhlIHByZXNpZGVudGlhbCBzcGVlY2hlcy4gCgoKCgo=