text_analysis

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)        # package for text analysis

Warning: package 'tidytext' was built under R version 4.3.3

library(readxl)          # reads excel files, the format I used for the data

# Load the speeches workbook.
inaug_speeches <- read_excel(path = "inaug_speeches.xlsx")

# Tokenise the `text` column into a `word` column.
inaug_words <- unnest_tokens(inaug_speeches, output = word, input = text)

# Per-author totals: token count, distinct-token count, and the ratio of
# distinct tokens to all tokens.
inaug_words |>
  group_by(author) |>
  summarise(
    num_words = n(),
    lex_diversity = n_distinct(word),
    lex_density = lex_diversity / num_words
  )

# A tibble: 7 × 4
  author     num_words lex_diversity lex_density
  <chr>          <int>         <int>       <dbl>
1 FDR             1881           709       0.377
2 Jefferson       1730           680       0.393
3 Kennedy         1365           534       0.391
4 Lincoln         3637          1011       0.278
5 Obama           2399           893       0.372
6 Reagan          2442           845       0.346
7 Washington      1420           593       0.418

Number of words, number of distinct words, and the ratio of distinct to total words (labelled lexical density) for each inaugural speech.

# Histogram of word lengths (in characters), one panel per author.
# Each panel gets its own y-axis because the speeches differ in length.
inaug_words |>
  mutate(word_length = nchar(word)) |>
  ggplot(mapping = aes(x = word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~author, scales = "free_y") +
  labs(title = "Word length distributions for each inaugural speech")

Histograms of word length for each president's inaugural speech.

  # Five most frequent words per author (ties kept), one panel per speech.
  # reorder_within() + scale_x_reordered() sort the bars inside each panel;
  # a plain reorder() ranks every word once for the whole plot, so words
  # shared by several authors end up out of order in individual panels.
  inaug_words |>
    count(author, word, sort = TRUE) |>
    slice_max(n, n = 5, by = author) |>
    mutate(word = reorder_within(word, n, author)) |>
    ggplot(aes(word, n, fill = author)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    facet_wrap(~author, scales = "free") +   # separate panel for each author
    scale_x_reordered() +                    # drop the per-panel suffix from labels
    scale_fill_viridis_d() +                 # viridis colour scheme
    theme_minimal() +                        # removes the gray background
    labs(x = NULL, y = "Most common words")

Selecting by n

Graph of the most common words used by each president during their inaugural speech.

# Keep only the stop words that belong to the "snowball" lexicon.
# Left-assignment puts the binding at the start of the pipeline where it is seen.
snowball <- stop_words |>
  filter(lexicon == "snowball")

# Five most frequent words per author after removing snowball stop words
# (ties kept), one panel per speech.
# The join key is stated explicitly so the anti-join cannot silently pick up
# any other column the two tables might share.
# reorder_within() + scale_x_reordered() sort the bars inside each panel;
# a plain reorder() ranks every word once for the whole plot.
inaug_words |>
  anti_join(snowball, by = join_by(word)) |>
  count(author, word, sort = TRUE) |>
  slice_max(n, n = 5, by = author) |>
  mutate(word = reorder_within(word, n, author)) |>
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words") +
  facet_wrap(vars(author), scales = "free") +
  scale_x_reordered() +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()

Joining with `by = join_by(word)`
Selecting by n

Graph of the most common words, excluding stop words, used by each president in their inaugural speech.

# Count every (author, word) pair, most frequent first.
inaug_word_counts <- inaug_speeches |>
  unnest_tokens(word, text) |>
  count(author, word, sort = TRUE)

# Total number of words per author.
total_words <- inaug_word_counts |>
  group_by(author) |>
  summarize(total = sum(n))

# Attach each author's total to every row of the per-word counts.
# The key is stated explicitly: an implicit join would use every column name
# the two tables share, which breaks silently if either table gains a column.
inaug_word_counts <- inaug_word_counts |>
  left_join(total_words, by = join_by(author))

Joining with `by = join_by(author)`

# Add tf, idf and tf_idf columns, using `word` as the term, `author` as the
# document and `n` as the count.
inaug_tf_idf <- bind_tf_idf(
  inaug_word_counts,
  term = word,
  document = author,
  n = n
)

# Show the rows with the largest tf-idf first.
arrange(inaug_tf_idf, desc(tf_idf))

# A tibble: 5,265 × 7
   author    word           n total      tf   idf  tf_idf
   <chr>     <chr>      <int> <int>   <dbl> <dbl>   <dbl>
 1 FDR       helped         7  1881 0.00372 1.95  0.00724
 2 FDR       leadership     7  1881 0.00372 1.95  0.00724
 3 Lincoln   while         13  3637 0.00357 1.95  0.00696
 4 Kennedy   both          10  1365 0.00733 0.847 0.00621
 5 Kennedy   arms           4  1365 0.00293 1.95  0.00570
 6 FDR       money          5  1881 0.00266 1.95  0.00517
 7 Kennedy   sides          8  1365 0.00586 0.847 0.00497
 8 Lincoln   case           9  3637 0.00247 1.95  0.00482
 9 Lincoln   union         20  3637 0.00550 0.847 0.00466
10 Jefferson principle      6  1730 0.00347 1.25  0.00434
# ℹ 5,255 more rows

Term frequency (tf), inverse document frequency (idf), and their product (tf-idf) for every word in each inaugural speech, sorted from highest tf-idf.

# Five highest tf-idf words per author (ties kept), one panel per speech.
# reorder_within() + scale_x_reordered() sort the bars inside each panel;
# a single global factor order places a word by its highest tf-idf across all
# authors, which misorders it in the panels of the other authors.
# slice_max(..., by = author) names the ranking column and leaves the data
# ungrouped before plotting.
inaug_tf_idf |>
  slice_max(tf_idf, n = 5, by = author) |>
  mutate(word = reorder_within(word, tf_idf, author)) |>
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~author, scales = "free") +
  scale_x_reordered() +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    x = NULL,
    y = "tf-idf",
    title = "Most distinctive words in each inaugural speech"
  ) +
  coord_flip()

Selecting by tf_idf

Graph of the words with the highest tf-idf (term frequency–inverse document frequency) in each president's inaugural speech.

# Fetch the "bing" sentiment lexicon and print it.
bing <- get_sentiments(lexicon = "bing")
print(bing)

# A tibble: 6,786 × 2
   word        sentiment
   <chr>       <chr>    
 1 2-faces     negative 
 2 abnormal    negative 
 3 abolish     negative 
 4 abominable  negative 
 5 abominably  negative 
 6 abominate   negative 
 7 abomination negative 
 8 abort       negative 
 9 aborted     negative 
10 aborts      negative 
# ℹ 6,776 more rows

# Keep only the words that have a match in `bing`, then tally each
# word/sentiment pair across all speeches, most frequent first.
count(
  inner_join(inaug_words, bing),
  word, sentiment,
  sort = TRUE
)

Joining with `by = join_by(word)`

# A tibble: 619 × 3
   word       sentiment     n
   <chr>      <chr>     <int>
 1 good       positive     21
 2 freedom    positive     19
 3 great      positive     19
 4 right      positive     18
 5 work       positive     18
 6 peace      positive     17
 7 free       positive     16
 8 well       positive     15
 9 confidence positive     11
10 happiness  positive     11
# ℹ 609 more rows

Most common positive and negative words (per the Bing sentiment lexicon) across all the inaugural speeches.

# Ten most frequent words for each sentiment (ties kept), pooled over all
# speeches and drawn as horizontal bars with one panel per sentiment.
inaug_words |>
  inner_join(bing) |>
  count(word, sentiment, sort = TRUE) |>
  group_by(sentiment) |>
  top_n(n = 10) |>
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  scale_fill_viridis_d() +
  labs(
    x = NULL,
    y = "inaugural speech: Words that contribute the most to each sentiment"
  ) +
  theme_minimal() +
  coord_flip()

Joining with `by = join_by(word)`
Selecting by n

Graph of the most common positive and negative words used by the presidents in their inaugural speeches.

# Split each speech into bigrams (token = "ngrams", n = 2) and keep only the
# bigram column. Left-assignment puts the binding at the start of the pipeline.
inaug_bigrams <- inaug_speeches |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
  select(bigram)

# Tally the bigrams, most frequent first.
inaug_bigrams |>
  count(bigram, sort = TRUE)

# A tibble: 10,876 × 2
   bigram       n
   <chr>    <int>
 1 of the     146
 2 in the      80
 3 of our      55
 4 to the      55
 5 and the     38
 6 to be       37
 7 it is       35
 8 by the      34
 9 for the     30
10 that the    29
# ℹ 10,866 more rows

Most common bigrams (two-word sequences) across all the inaugural speeches.

# Split each bigram into its two words, drop every bigram in which either word
# is a snowball stop word, glue the words back together and tally the result.
inaug_bigrams |>
  separate(col = bigram, into = c("word1", "word2"), sep = " ") |>
  filter(!(word1 %in% snowball$word | word2 %in% snowball$word)) |>
  unite(col = bigram, word1, word2, sep = " ") |>
  count(bigram, sort = TRUE)

# A tibble: 2,399 × 2
   bigram                 n
   <chr>              <int>
 1 let us                18
 2 fellow citizens       16
 3 united states         11
 4 american people        6
 5 federal government     4
 6 government can         4
 7 one section            4
 8 vice president         4
 9 will endure            4
10 among us               3
# ℹ 2,389 more rows

Most common bigrams across all the inaugural speeches, excluding any bigram that contains a stop word.

# First words of interest. They are matched exactly against `word1` below,
# so these need to be lowercase.
first_word <- c("president", "citizens")

# For each first word, plot the five words that follow it most often
# (ties kept), one panel per first word.
# Counting the (word1, word2) pairs directly gives the same totals as counting
# bigrams first and re-summing them.
# reorder_within() + scale_x_reordered() sort the bars inside each panel;
# a single global factor order misplaces second words shared by both panels.
inaug_bigrams |>
  separate(bigram, c("word1", "word2"), sep = " ") |>   # separate the two words
  filter(word1 %in% first_word) |>                      # keep first words from our list
  count(word1, word2, sort = TRUE) |>
  slice_max(n, n = 5, by = word1) |>
  mutate(word2 = reorder_within(word2, n, word1)) |>    # order words within each panel
  ggplot(aes(word2, n, fill = word1)) +
  scale_fill_viridis_d() +                              # set the color palette
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  scale_x_reordered() +                                 # drop the per-panel suffix from labels
  coord_flip() +
  theme_minimal()

Selecting by n

Graph of the words that most often followed "citizens" and "president" in the inaugural speeches.