text_analysis

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)        # package for text analysis
library(readxl)          # reads excel files, the format I used for the data
# Load the inaugural speeches workbook; later chunks use its `author` and
# `text` columns.
speech <- read_excel(
  path = "~/Documents/untitled folder/inaug_speeches.xlsx"
)
# Preview the speeches split into one word per row (result is only printed).
unnest_tokens(speech, output = word, input = text)
# A tibble: 14,874 × 2
   author     word        
   <chr>      <chr>       
 1 Washington among       
 2 Washington the         
 3 Washington vicissitudes
 4 Washington incident    
 5 Washington to          
 6 Washington life        
 7 Washington no          
 8 Washington event       
 9 Washington could       
10 Washington have        
# ℹ 14,864 more rows
# Tokenise every speech into one word per row; reused by the later analyses.
inaug_words <- unnest_tokens(speech, output = word, input = text)

# Per-author vocabulary summary: total tokens, distinct tokens, and the ratio
# of distinct tokens to total tokens.
inaug_words %>%
  group_by(author) %>%
  summarise(
    num_words = n(),
    lex_diversity = n_distinct(word),
    lexical_density = lex_diversity / num_words
  )
# A tibble: 7 × 4
  author     num_words lex_diversity lexical_density
  <chr>          <int>         <int>           <dbl>
1 FDR             1881           709           0.377
2 Jefferson       1730           680           0.393
3 Kennedy         1365           534           0.391
4 Lincoln         3637          1011           0.278
5 Obama           2399           893           0.372
6 Reagan          2442           845           0.346
7 Washington      1420           593           0.418

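The table above shows, for each president's speech, the total number of words, the number of distinct words (lexical diversity), and the ratio of distinct words to total words (lexical density).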

# Histogram of word lengths (characters per word), one panel per author.
# The y axis is free so speeches of different sizes stay readable.
inaug_words %>%
  mutate(word_length = nchar(word)) %>%
  ggplot(mapping = aes(x = word_length)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~author, scales = "free_y") +
  labs(title = "Word Length of Inaugural Speeches")

Above are histograms showing the distribution of word lengths (in characters) in each president's inaugural speech.

# Five most frequent words per author (ties kept), drawn as horizontal bars
# with one facet per author.
inaug_words %>%
  count(author, word, sort = TRUE) %>%
  group_by(author) %>%
  slice_max(n, n = 5) %>%
  ungroup() %>%
  # reorder_within() sorts the bars inside each facet. A plain reorder() ranks
  # a word by its average count over all authors, which mis-sorts words that
  # several authors share (e.g. "the", "of").
  mutate(word = reorder_within(word, n, author)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  # Strips the per-facet suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~author, scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    x = NULL,
    y = "Most Frequently Used Words In Presidential Inaugural Speeches"
  )
Selecting by n

The graphs above show, for each president, the words used most often in their inaugural speech.

# Stop words from the "snowball" lexicon only (stop_words ships with tidytext).
snowball <- stop_words %>%
  filter(lexicon == "snowball")

# Five most frequent non-stop-words per author (ties kept), one facet each.
inaug_words %>%
  # Explicit key: drop every token that appears in the stop-word list.
  anti_join(snowball, by = join_by(word)) %>%
  count(author, word, sort = TRUE) %>%
  group_by(author) %>%
  slice_max(n, n = 5) %>%
  ungroup() %>%
  # reorder_within() sorts the bars inside each facet. A plain reorder() ranks
  # a word by its average count over all authors, which mis-sorts shared words.
  mutate(word = reorder_within(word, n, author)) %>%
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Commonality of Inauguration Speech Vocabulary") +
  facet_wrap(vars(author), scales = "free") +
  # Strips the per-facet suffix that reorder_within() appends to the labels.
  scale_x_reordered() +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining with `by = join_by(word)`
Selecting by n

The graphs above show, for each president, the words used most often in their inaugural speech after excluding common stop words such as the, and, of, and which.

# Count how often each word occurs in each author's speech.
inaug_word_counts <- speech %>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)

# Total number of words per author.
total_words <- inaug_word_counts %>%
  group_by(author) %>%
  summarize(total = sum(n))

# Attach each author's total to every word row. The key is spelled out so the
# join cannot silently pick up another shared column, and the relationship
# check fails loudly if total_words ever holds duplicate authors.
inaug_word_counts <- left_join(
  inaug_word_counts,
  total_words,
  by = join_by(author),
  relationship = "many-to-one"
)
Joining with `by = join_by(author)`
# Add tf, idf and tf_idf columns, treating each author's speech as a document.
inaug_tf_idk <- bind_tf_idf(
  inaug_word_counts,
  term = word,
  document = author,
  n = n
)

# Show the rows with the highest tf-idf first.
inaug_tf_idk %>%
  arrange(desc(tf_idf))
# A tibble: 5,265 × 7
   author    word           n total      tf   idf  tf_idf
   <chr>     <chr>      <int> <int>   <dbl> <dbl>   <dbl>
 1 FDR       helped         7  1881 0.00372 1.95  0.00724
 2 FDR       leadership     7  1881 0.00372 1.95  0.00724
 3 Lincoln   while         13  3637 0.00357 1.95  0.00696
 4 Kennedy   both          10  1365 0.00733 0.847 0.00621
 5 Kennedy   arms           4  1365 0.00293 1.95  0.00570
 6 FDR       money          5  1881 0.00266 1.95  0.00517
 7 Kennedy   sides          8  1365 0.00586 0.847 0.00497
 8 Lincoln   case           9  3637 0.00247 1.95  0.00482
 9 Lincoln   union         20  3637 0.00550 0.847 0.00466
10 Jefferson principle      6  1730 0.00347 1.25  0.00434
# ℹ 5,255 more rows

The table shows how frequently each word is used within a president's speech (tf), how rare the word is across all of the speeches (idf, which is higher when fewer speeches contain the word), and their product (tf-idf), which measures how distinctive the word is to that particular speech compared with the other inaugural speeches.

# Three highest tf-idf words per author (ties kept), one facet per author.
inaug_tf_idk %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels while rows are sorted so bars are drawn by tf-idf.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(author) %>%
  # Name the ranking column explicitly instead of relying on top_n() picking
  # whichever column happens to be last.
  slice_max(tf_idf, n = 3) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Lexicon in Inaugural Speeches Based on Uniqueness",
    x = NULL,
    y = "tf-idf"
  ) +
  facet_wrap(~author, scales = "free") +
  theme_minimal() +
  scale_fill_viridis_d() +
  coord_flip()
Selecting by tf_idf

The graphs show, for each president, the three words with the highest tf-idf, that is, the words most distinctive to that president's inaugural speech.

# Bing sentiment lexicon: one sentiment label per word.
bing <- get_sentiments("bing")

# Ten most frequent positive and negative words (ties kept) over all speeches.
inaug_words %>%
  # Explicit key: keep only the tokens that appear in the lexicon.
  inner_join(bing, by = join_by(word)) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(y = "Most Common Positive and Negative Words Used", x = NULL) +
  scale_fill_viridis_d() +
  coord_flip() +
  theme_minimal()
Joining with `by = join_by(word)`
Selecting by n

The graph above shows the most common positive and negative words, as classified by the Bing sentiment lexicon, used across the inaugural speeches.

# Split every speech into overlapping two-word sequences and keep only the
# bigram column.
inaug_bigrams <- speech %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  select(bigram)

# Most frequent bigrams over all speeches.
inaug_bigrams %>%
  count(bigram, sort = TRUE)
# A tibble: 10,876 × 2
   bigram       n
   <chr>    <int>
 1 of the     146
 2 in the      80
 3 of our      55
 4 to the      55
 5 and the     38
 6 to be       37
 7 it is       35
 8 by the      34
 9 for the     30
10 that the    29
# ℹ 10,866 more rows

The table shows the bigrams most commonly used in inaugural speeches.

# Most frequent bigrams in which neither word is a snowball stop word.
inaug_bigrams %>%
  # Split on the single space between the two words. An empty separator breaks
  # the bigram into characters, so only first letters were being counted.
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% snowball$word) %>%
  filter(!word2 %in% snowball$word) %>%
  # Re-join with a space so the result reads as a bigram again.
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
Warning: Expected 2 pieces. Additional pieces discarded in 14867 rows [1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# A tibble: 26 × 2
   bigram     n
   <chr>  <int>
 1 t       2498
 2 o       1362
 3 w        995
 4 s        800
 5 b        724
 6 c        675
 7 p        636
 8 f        628
 9 m        534
10 h        479
# ℹ 16 more rows

The table above lists the bigrams most commonly used in the inaugural speeches after removing every bigram in which either word is a stop word.

# Leading words whose most common followers are plotted.
first_word <- c("america", "unite")

# For each leading word, the five words that most often follow it (ties kept).
inaug_bigrams %>%
  count(bigram, sort = TRUE) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% first_word) %>%
  # Re-aggregate by word pair, weighting by the bigram counts computed above.
  count(word1, word2, wt = n, sort = TRUE) %>%
  # Fix the factor levels while rows are sorted so bars are drawn by frequency.
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%
  group_by(word1) %>%
  slice_max(n, n = 5) %>%
  ungroup() %>%
  ggplot(aes(word2, n, fill = word1)) +
  scale_fill_viridis_d() +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip() +
  theme_minimal()
Selecting by n

The graph above shows the words that most often follow "america" and "unite" in the inaugural speeches, and how frequently each one occurs.