Text Analysis

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)        # package for text analysis
library(readxl)          # reads excel files, the format I used for the data

library(readxl)
# Read the speeches workbook once instead of parsing the same file twice.
# The code below relies on its `author` and `text` columns.
inaugural_speeches <- read_excel("inaug_speeches.xlsx")

# Keep the shorter name as an alias so anything that refers to it still works.
inaug_speeches <- inaugural_speeches

# Tokenise the `text` column into one row per word (new column `word`).
inaugural_words <- unnest_tokens(
  inaugural_speeches,
  output = word,
  input = text
)

# Per-author vocabulary summary:
#   num_words     - total tokens in the speech
#   lex_diversity - number of distinct tokens
#   lex_density   - distinct tokens as a share of all tokens
inaugural_words |>
  group_by(author) |>
  summarise(
    num_words = length(word),
    lex_diversity = length(unique(word)),
    lex_density = lex_diversity / num_words
  )

# A tibble: 7 × 4
  author     num_words lex_diversity lex_density
  <chr>          <int>         <int>       <dbl>
1 FDR             1881           709       0.377
2 Jefferson       1730           680       0.393
3 Kennedy         1365           534       0.391
4 Lincoln         3637          1011       0.278
5 Obama           2399           893       0.372
6 Reagan          2442           845       0.346
7 Washington      1420           593       0.418

#1. This table displays the number of words (num_words) each inaugural speech contained, the number of distinct words (lex_diversity) in each speech, and the density of distinct words (lex_density) in each speech. Density is calculated as the number of distinct words divided by the total number of words. Essentially, this table allows us to see how varied each President’s vocabulary was. While Abraham Lincoln used the highest number of distinct words (1011), he also had the lowest density (0.278), because his speech was by far the longest (3637 words). George Washington had the highest density (0.418) while using only 593 distinct words; Kennedy used the fewest distinct words overall (534).

# Histogram of word lengths (in characters), one panel per author.
# The y axis is freed per panel because the speeches differ in length.
inaugural_words |>
  mutate(word_length = nchar(word)) |>
  ggplot(aes(x = word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(vars(author), scales = "free_y") +
  # Fixed typo in the title ("Distrbution" -> "Distribution").
  labs(title = "Word Length Distribution for Each Inaugural Speech")

#2 The graphs above show the distribution of word lengths in each President’s inaugural speech. Referring back to question #1, we see that Abraham Lincoln had the greatest length of words.

# Five most frequent words per author (ties kept), one panel per author.
inaugural_words |>
  count(author, word, sort = TRUE) |>
  # slice_max() replaces the superseded top_n() and names the ranking column.
  slice_max(n, n = 5, by = author) |>
  # Order bars within each panel; a plain reorder() ranks words across all
  # authors at once, which scrambles panels that share words such as "the".
  mutate(word = reorder_within(word, n, author)) |>
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +   # strips the suffix reorder_within() adds to labels
  facet_wrap(~author, scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(x = NULL, y = "Most common words")

Selecting by n

#2 This chart displays the most common words used by each President in his inaugural speech. The most commonly used word for every President is “the”, and for most of them the second most common word is “of”.

# Keep only the "snowball" lexicon from tidytext's stop-word table.
# Written with `<-` so the assigned name is visible at the start of the statement.
snowball <- stop_words |>
  filter(lexicon == "snowball")

# Five most frequent words per author after dropping snowball stop words.
inaugural_words |>
  # Name the key so the join cannot silently pick up other shared columns.
  anti_join(snowball, by = "word") |>
  count(author, word, sort = TRUE) |>
  # slice_max() replaces the superseded top_n(); ties are kept, as before.
  slice_max(n, n = 5, by = author) |>
  # Order bars within each panel rather than across all authors.
  mutate(word = reorder_within(word, n, author)) |>
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words (stop words removed)") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip() +
  scale_x_reordered()     # strips the suffix reorder_within() adds to labels

Joining with `by = join_by(word)`
Selecting by n

#2 Similar to the previous chart, this chart also displays the most common words used by each President in his inaugural speech. However, this chart excludes ‘stop words’. We see that, for the most part, the most common words are ‘us’, ‘will’, or ‘constitution’.

# Word frequencies per author: tokenise the speeches, then tally each
# (author, word) pair, most frequent first.
inaugural_word_counts <- inaugural_speeches |>
  unnest_tokens(output = word, input = text) |>
  count(author, word, sort = TRUE)

# Total number of words per author, obtained by summing the per-word counts.
total_words <- count(
  inaugural_word_counts,
  author,
  wt = n,
  name = "total"
)

# Attach each author's total word count to the per-word counts.
# The key is named explicitly so the join does not rely on column guessing.
inaugural_word_counts <- left_join(
  inaugural_word_counts,
  total_words,
  by = "author"
)

Joining with `by = join_by(author)`

# Add tf, idf and tf_idf columns, treating each author's speech as one document.
inaugural_tf_idf <- bind_tf_idf(
  inaugural_word_counts,
  term = word,
  document = author,
  n = n
)

# Print the table with the highest tf-idf scores first.
arrange(inaugural_tf_idf, desc(tf_idf))

# A tibble: 5,265 × 7
   author    word           n total      tf   idf  tf_idf
   <chr>     <chr>      <int> <int>   <dbl> <dbl>   <dbl>
 1 FDR       helped         7  1881 0.00372 1.95  0.00724
 2 FDR       leadership     7  1881 0.00372 1.95  0.00724
 3 Lincoln   while         13  3637 0.00357 1.95  0.00696
 4 Kennedy   both          10  1365 0.00733 0.847 0.00621
 5 Kennedy   arms           4  1365 0.00293 1.95  0.00570
 6 FDR       money          5  1881 0.00266 1.95  0.00517
 7 Kennedy   sides          8  1365 0.00586 0.847 0.00497
 8 Lincoln   case           9  3637 0.00247 1.95  0.00482
 9 Lincoln   union         20  3637 0.00550 0.847 0.00466
10 Jefferson principle      6  1730 0.00347 1.25  0.00434
# ℹ 5,255 more rows

#3 This table ranks words by how distinctive they are to each President’s inaugural speech compared with the other Presidents’ speeches. For example, FDR used the word ‘helped’ a total of seven times (n = 7), giving a term frequency–inverse document frequency (tf-idf) of 0.0072. Its idf of 1.95 (= ln 7) means that, of the seven speeches, ‘helped’ appears only in FDR’s; that is, the word was relatively frequent in his speech and not used at all by the other Presidents.

# Five highest tf-idf words per author (ties kept), one panel per author.
inaugural_tf_idf |>
  # Rank on tf_idf explicitly; top_n() only used it because it was the last
  # column. `by = author` also avoids leaving the data grouped.
  slice_max(tf_idf, n = 5, by = author) |>
  # Order bars within each panel; one global set of factor levels misplaces
  # any word that appears in more than one author's top five.
  mutate(word = reorder_within(word, tf_idf, author)) |>
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, scales = "free") +
  coord_flip() +
  scale_x_reordered()     # strips the suffix reorder_within() adds to labels

Selecting by tf_idf

#3 This chart visualizes the results from the table above regarding the frequency and uniqueness of the words used in each President’s inaugural speech.

# Sentiment lexicon that is joined to the speech words by `word` below.
bing <- get_sentiments(lexicon = "bing")

# Ten most frequent words for each sentiment across all speeches (ties kept).
inaugural_words |>
  # Name the key so the join does not rely on column guessing.
  inner_join(bing, by = "word") |>
  count(word, sentiment, sort = TRUE) |>
  # slice_max() replaces the superseded top_n() and names the ranking column.
  slice_max(n, n = 10, by = sentiment) |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(
    y = "Inaugural Speeches: Words that contribute the most to each sentiment",
    x = NULL
  ) +
  scale_fill_viridis_d() +
  coord_flip() +
  theme_minimal()

Joining with `by = join_by(word)`
Selecting by n

#4 These two bar graphs display the words that contributed the most to positive and negative sentiment. In the left graph (negative sentiment), the words with the most power are ‘fear’ and ‘crisis’. In the right graph (positive sentiment), the words with the most power are ‘good’, ‘great’, and ‘freedom’.

# Split every speech into overlapping two-word sequences and keep only the
# bigram column. Written with `<-` so the assigned name is visible at the start of the statement.
inaugural_bigram <- inaugural_speeches |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
  select(bigram)

# Bigram frequencies across all speeches, most common first.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
inaugural_bigram |>
  count(bigram, sort = TRUE)

# A tibble: 10,876 × 2
   bigram       n
   <chr>    <int>
 1 of the     146
 2 in the      80
 3 of our      55
 4 to the      55
 5 and the     38
 6 to be       37
 7 it is       35
 8 by the      34
 9 for the     30
10 that the    29
# ℹ 10,866 more rows

#5 This table shows the bigrams, from most to least common, across all of the Presidents’ inaugural speeches.

# Words whose followers we want to inspect.
first_word <- c("american", "fellow")

# For each word in `first_word`, plot the five words that most often follow it
# (ties kept).
inaugural_bigram |>
  count(bigram, sort = TRUE) |>
  separate(bigram, c("word1", "word2"), sep = " ") |>   # split the two words
  filter(word1 %in% first_word) |>                      # keep the chosen leads
  count(word1, word2, wt = n, sort = TRUE) |>
  # slice_max() replaces the superseded top_n(); `by = word1` names the
  # grouping without leaving the data grouped.
  slice_max(n, n = 5, by = word1) |>
  # Order bars within each panel; one global set of factor levels misplaces
  # any word that follows both leading words.
  mutate(word2 = reorder_within(word2, n, word1)) |>
  ggplot(aes(word2, n, fill = word1)) +
  scale_fill_viridis_d() +                              # set the color palette
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following: american and fellow") +
  facet_wrap(~word1, scales = "free") +
  coord_flip() +
  scale_x_reordered() +   # strips the suffix reorder_within() adds to labels
  theme_minimal()

Selecting by n

#6 Across all of the inaugural speeches studied (n = 7), this graph shows the most common words that follow either ‘American’ or ‘Fellow’. In the left graph (American), the most common following word is ‘people’. In the right graph (Fellow), the most common following word is ‘citizens’.

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

Running Code

When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:

# Quarto template example: evaluates a simple sum and prints the result.
1 + 1

[1] 2

You can add options to executable code like this:

[1] 4

The echo: false option disables the printing of code (only output is displayed).