text_analysis

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)        # package for text analysis
library(readxl)          # reads excel files, the format I used for the data
inaug_speeches <- read_excel("~/Desktop/Stats/text analysis/inaug_speeches.xlsx")
pres_speech <- inaug_speeches |>
  unnest_tokens(word, text) 

Measures of text complexity

pres_speech |> 
  group_by(author) |> 
  summarize(num_words = n(), 
            lex_diversity = n_distinct(word),
            lex_density = lex_diversity/num_words) 
# A tibble: 7 × 4
  author     num_words lex_diversity lex_density
  <chr>          <int>         <int>       <dbl>
1 FDR             1881           709       0.377
2 Jefferson       1730           680       0.393
3 Kennedy         1365           534       0.391
4 Lincoln         3637          1011       0.278
5 Obama           2399           893       0.372
6 Reagan          2442           845       0.346
7 Washington      1420           593       0.418

1a. Above is a table of text-complexity measures for the inaugural speeches: the total number of words, the lexical diversity (number of distinct words), and the lexical density (distinct words divided by total words). FDR used 1881 words (709 distinct, density 0.377); Jefferson 1730 (680 distinct, 0.393); Kennedy 1365 (534 distinct, 0.391); Lincoln 3637 (1011 distinct, 0.278); Obama 2399 (893 distinct, 0.372); Reagan 2442 (845 distinct, 0.346); and Washington 1420 (593 distinct, 0.418). Lincoln's speech is by far the longest but has the lowest lexical density, while Washington's, one of the shortest, has the highest.
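
Since lexical density is just distinct words divided by total words, the same table can be ordered by it directly; a minimal sketch (not part of the original output) adds one arrange() call:

pres_speech |> 
  group_by(author) |> 
  summarize(num_words = n(), 
            lex_diversity = n_distinct(word),
            lex_density = lex_diversity/num_words) |> 
  arrange(desc(lex_density))       # Washington should come out on top, Lincoln last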

pres_speech |>
  mutate(word_length = nchar(word)) |> 
  distinct(word, word_length, author) |> 
  arrange(-word_length) 
# A tibble: 5,265 × 3
   word             word_length author    
   <chr>                  <int> <chr>     
 1 unconstitutional          16 Lincoln   
 2 constitutionally          16 Lincoln   
 3 misunderstanding          16 Reagan    
 4 administration's          16 Reagan    
 5 irresponsibility          16 Obama     
 6 recommendations           15 Washington
 7 representatives           15 Washington
 8 administrations           15 Jefferson 
 9 insurrectionary           15 Lincoln   
10 misconstruction           15 Lincoln   
# ℹ 5,255 more rows

1b. Above is a table of the longest words used in the inaugural speeches and the authors who used them. Five words tie for the longest at 16 characters: Lincoln's "unconstitutional" and "constitutionally", Reagan's "misunderstanding" and "administration's", and Obama's "irresponsibility".
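
To see each speaker's single longest word rather than the overall ranking, a sketch (an extension not run in the original) using dplyr's slice_max():

pres_speech |>
  mutate(word_length = nchar(word)) |> 
  distinct(author, word, word_length) |> 
  group_by(author) |> 
  slice_max(word_length, n = 1, with_ties = FALSE) |>    # one longest word per author
  arrange(desc(word_length))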

pres_speech |>
  group_by(author) |> 
  mutate(word_length = nchar(word)) |> 
  summarize(mean_word_length = mean(word_length)) |> 
  arrange(-mean_word_length)
# A tibble: 7 × 2
  author     mean_word_length
  <chr>                 <dbl>
1 Washington             4.94
2 Jefferson              4.74
3 FDR                    4.68
4 Lincoln                4.66
5 Reagan                 4.47
6 Obama                  4.44
7 Kennedy                4.39

1c. Above is a table of the mean word length for each speech: Washington 4.94 characters, Jefferson 4.74, FDR 4.68, Lincoln 4.66, Reagan 4.47, Obama 4.44, and Kennedy 4.39. The two earliest speeches (Washington and Jefferson) use the longest words on average, while the more recent speeches sit toward the bottom of the list.
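
Mean word length can be pulled around by a handful of very long words; a sketch (not part of the original analysis) reports the median alongside the mean:

pres_speech |>
  mutate(word_length = nchar(word)) |> 
  group_by(author) |> 
  summarize(mean_word_length = mean(word_length),
            median_word_length = median(word_length)) |>    # median is less sensitive to a few long words
  arrange(desc(mean_word_length))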

pres_speech |>
  mutate(word_length = nchar(word)) |> 
  ggplot(aes(word_length)) +
  facet_wrap(vars(author), scales = "free_y") +
  geom_histogram(binwidth = 1) +
  labs(title = "Word Length Distributions for Each Speech")

1d. Above is a graph of the word length distributions for each speech.

pres_speech |> 
  count(word, sort = T) 
# A tibble: 3,018 × 2
   word      n
   <chr> <int>
 1 the     969
 2 of      664
 3 and     538
 4 to      487
 5 in      272
 6 a       251
 7 that    237
 8 our     211
 9 we      196
10 be      181
# ℹ 3,008 more rows
pres_speech |> 
  group_by(author) |>
  count(word, sort = T) 
# A tibble: 5,265 × 3
# Groups:   author [7]
   author     word      n
   <chr>      <chr> <int>
 1 Lincoln    the     256
 2 Lincoln    of      146
 3 Lincoln    to      135
 4 FDR        the     130
 5 Jefferson  the     130
 6 Obama      the     130
 7 Reagan     the     123
 8 Obama      and     114
 9 Washington the     114
10 FDR        of      109
# ℹ 5,255 more rows
pres_speech |> 
  group_by(author) |>
  count(word, sort = T) |> 
  top_n(5) |>
  ungroup() |> 
  mutate(word = reorder(word, n)) |>
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~author, scales = "free") +           # creates separate graphs for each author
  scale_fill_viridis_d() +                         # uses a nicer color scheme
  theme_minimal() +                                # removes the gray background
  labs(x = NULL, y = "Most common words")
Selecting by n

2a. Above is a faceted graph of the five most common words in each speech. FDR's most common words were the (130), of (109), and (58), to (50), and in (44); Jefferson's were the (130), of (104), and (81), to (61), and which (25); Kennedy's were the (86), of (65), to (43), and (41), and we (30); Lincoln's were the (256), of (146), to (135), and (105), and in (77); Obama's were the (130), and (114), of (82), to (70), and our (67); Reagan's were the (123), and (92), of (90), to (80), and we (57); and Washington's were the (114), of (68), to (48), and (47), and which (36).
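
A side note on top_n(): it still works, but current dplyr documents it as superseded by slice_max(), which makes the ranking column explicit. A sketch of the same data step written that way:

pres_speech |> 
  count(author, word, sort = TRUE) |>     # same counts as group_by(author) |> count(word)
  group_by(author) |>
  slice_max(n, n = 5) |>                  # top five words per author, keeping ties
  ungroup()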

stop_words
# A tibble: 1,149 × 2
   word        lexicon
   <chr>       <chr>  
 1 a           SMART  
 2 a's         SMART  
 3 able        SMART  
 4 about       SMART  
 5 above       SMART  
 6 according   SMART  
 7 accordingly SMART  
 8 across      SMART  
 9 actually    SMART  
10 after       SMART  
# ℹ 1,139 more rows
stop_words |> 
  group_by(lexicon) |> 
  count()
# A tibble: 3 × 2
# Groups:   lexicon [3]
  lexicon      n
  <chr>    <int>
1 SMART      571
2 onix       404
3 snowball   174
stop_words |> 
  filter(lexicon == "snowball") -> snowball
pres_speech |>
  anti_join(snowball) |> 
  group_by(author) |> 
  count(word, sort = T) 
Joining with `by = join_by(word)`
# A tibble: 4,633 × 3
# Groups:   author [7]
   author  word             n
   <chr>   <chr>        <int>
 1 Reagan  will            33
 2 Lincoln can             28
 3 Lincoln will            27
 4 Reagan  us              25
 5 Lincoln constitution    24
 6 Obama   us              23
 7 Lincoln people          20
 8 Lincoln union           20
 9 Lincoln states          19
10 Obama   will            19
# ℹ 4,623 more rows
pres_speech |>
  anti_join(snowball) |> 
  group_by(author) |> 
  count(word, sort = T) |> 
  top_n(5) |> 
  ungroup() |> 
  mutate(word = reorder(word, n)) |>
  ggplot(aes(word, n, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Most common words with stop words") +
  facet_wrap(vars(author), scales = "free") +
  scale_fill_viridis_d() +
  theme_minimal() +
  coord_flip()
Joining with `by = join_by(word)`
Selecting by n

2b. Above is a graph of the most common words in each speech after removing the snowball stop words, for FDR, Jefferson, Kennedy, Lincoln, Obama, Reagan, and Washington. FDR's top words were can (11), will (10), must (9), national (9), and may (8); Jefferson's were will (12), government (12), us (10), may (8), let (7), fellow (7), and citizens (7); Kennedy's were let (16), us (12), can (9), world (8), and sides (8); Lincoln's were can (28), will (27), constitution (24), union (20), and people (20); Obama's were us (23), will (19), can (13), nation (12), and new (11); Reagan's were will (33), us (25), government (17), believe (10), and must (10); and Washington's were will (10), can (9), every (9), government (8), may (6), and public (6).
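
The snowball list is the smallest of the three lexicons (174 words, versus 571 for SMART), so it removes the fewest tokens. As a sketch (a hypothetical comparison, not run above), anti-joining against the full stop_words table shows how the top words shift when all three lexicons are applied:

pres_speech |>
  anti_join(stop_words, by = "word") |>    # all three lexicons, not just snowball
  group_by(author) |> 
  count(word, sort = TRUE)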

TF-IDF

pres_word_count<- inaug_speeches |>             # This counts each word per author
  unnest_tokens(word, text) |>
  count(author, word, sort = TRUE) 

total_words <- pres_word_count |>               # This counts total words per author
  group_by(author) |> 
  summarize(total = sum(n))

pres_word_count <- left_join(pres_word_count, total_words)    # Joins the two
Joining with `by = join_by(author)`
pres_tf_idf <- pres_word_count |>             # Calculates tf-idf
  bind_tf_idf(word, author, n)

pres_tf_idf |>                                   # Displays it
  arrange(-tf_idf)                          
# A tibble: 5,265 × 7
   author    word           n total      tf   idf  tf_idf
   <chr>     <chr>      <int> <int>   <dbl> <dbl>   <dbl>
 1 FDR       helped         7  1881 0.00372 1.95  0.00724
 2 FDR       leadership     7  1881 0.00372 1.95  0.00724
 3 Lincoln   while         13  3637 0.00357 1.95  0.00696
 4 Kennedy   both          10  1365 0.00733 0.847 0.00621
 5 Kennedy   arms           4  1365 0.00293 1.95  0.00570
 6 FDR       money          5  1881 0.00266 1.95  0.00517
 7 Kennedy   sides          8  1365 0.00586 0.847 0.00497
 8 Lincoln   case           9  3637 0.00247 1.95  0.00482
 9 Lincoln   union         20  3637 0.00550 0.847 0.00466
10 Jefferson principle      6  1730 0.00347 1.25  0.00434
# ℹ 5,255 more rows

3a. Above is a table of tf-idf scores identifying the most distinctive words in each speech (FDR, Jefferson, Kennedy, Lincoln, Obama, Reagan, and Washington). For each word the table shows the number of times it was used (n), the total words in that speech (total), the term frequency (tf), the inverse document frequency (idf), and their product (tf_idf).
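
bind_tf_idf() computes tf as a word's count divided by the total words in that speech, and idf as the natural log of the number of documents divided by the number of documents containing the word. A quick hand check of the first row (assuming, from the output, that "helped" appears in only one of the seven speeches):

tf  <- 7 / 1881       # "helped" occurs 7 times in FDR's 1881-word speech
idf <- log(7 / 1)     # 7 speeches total, word appears in 1 of them
tf * idf              # approximately 0.00724, matching the tf_idf in the first row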

pres_tf_idf |>
  arrange(-tf_idf) |>
  mutate(word = factor(word, levels = rev(unique(word)))) |> 
  group_by(author) |> 
  top_n(5) |> 
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, scales = "free") +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(title = "Most distinctive words in each Speech") +
  coord_flip()
Selecting by tf_idf

3b. Above is a graph of tf-idf scores for the most distinctive words in each speech (FDR, Jefferson, Kennedy, Lincoln, Obama, Reagan, and Washington). FDR's most distinctive words were helped (.0072), leadership (.0072), money (.0051), discipline (.0041), emergency (.0041), and respects (.0041). Jefferson's were principle (.0043), safety (.0036), industry (.0033), him (.0028), and honest (.0028). Kennedy's were both (.0062), arms (.0057), sides (.0049), final (.0042), and poverty (.0042). Lincoln's were while (.0069), case (.0048), union (.0046), laws (.0042), and a cluster of tied words at .0026: clause, expressly, object, plainly, section, slave, and surrender. Obama's were america (.0028), spirit (.0026), carried (.0024), ideals (.0024), and journey (.0024). Reagan's were dreams (.0031), followed by front, going, group, maintaining, productivity, tax, and weapon (each .0023). Washington's were myself (.0035), followed by article, communities, deliberations, department, established, expedient, immutable, impressions, liberties, peculiarly, pecuniary, providential, qualifications, and rendered (each .0027).

Sentiment analysis

bing <- get_sentiments("bing")
bing
# A tibble: 6,786 × 2
   word        sentiment
   <chr>       <chr>    
 1 2-faces     negative 
 2 abnormal    negative 
 3 abolish     negative 
 4 abominable  negative 
 5 abominably  negative 
 6 abominate   negative 
 7 abomination negative 
 8 abort       negative 
 9 aborted     negative 
10 aborts      negative 
# ℹ 6,776 more rows
pres_speech |> 
  inner_join(bing) |> 
  count(word, sentiment, sort = TRUE)
Joining with `by = join_by(word)`
# A tibble: 619 × 3
   word       sentiment     n
   <chr>      <chr>     <int>
 1 good       positive     21
 2 freedom    positive     19
 3 great      positive     19
 4 right      positive     18
 5 work       positive     18
 6 peace      positive     17
 7 free       positive     16
 8 well       positive     15
 9 confidence positive     11
10 happiness  positive     11
# ℹ 609 more rows
pres_speech |> 
  inner_join(bing) |> 
  count(word, sentiment, sort = TRUE) |> 
  group_by(sentiment) |>
  top_n(10) |>
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(y = "Speeches: Words that contribute the most to each sentiment",
       x = NULL) +
  scale_fill_viridis_d() +
  coord_flip() +
  theme_minimal()
Joining with `by = join_by(word)`
Selecting by n

  1. Above is a faceted graph of the words that contribute the most to each sentiment (negative and positive) across the speeches. The top negative words are fear (8), crisis (8), failure (6), struggle (5), slave (5), problems (5), object (5), issue (5), hard (5), enemies (5), destroy (5), dark (5), conflict (5), burden (5), and break (5). The top positive words are good (21), great (19), freedom (19), work (18), right (18), peace (17), free (16), well (15), support (11), liberty (11), happiness (11), and confidence (11).
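
A natural extension (a sketch, not part of the original output) is a net sentiment score per speech: count the positive and negative bing words for each author and take the difference:

pres_speech |> 
  inner_join(bing, by = "word") |> 
  count(author, sentiment) |> 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |> 
  mutate(net_sentiment = positive - negative) |>    # positive minus negative word counts
  arrange(desc(net_sentiment))
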
inaug_speeches|>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |> 
  select(bigram) -> pres_bigrams
pres_bigrams |> 
  count(bigram, sort = T)
# A tibble: 10,876 × 2
   bigram       n
   <chr>    <int>
 1 of the     146
 2 in the      80
 3 of our      55
 4 to the      55
 5 and the     38
 6 to be       37
 7 it is       35
 8 by the      34
 9 for the     30
10 that the    29
# ℹ 10,866 more rows

5a. Above is a table of the most common bigrams: of the (146), in the (80), of our (55), to the (55), and the (38), to be (37), it is (35), by the (34), for the (30), and that the (29).
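
The same unnest_tokens() call generalizes to longer n-grams by changing n; a sketch (not run in the original) for trigrams:

inaug_speeches |>
  unnest_tokens(trigram, text, token = "ngrams", n = 3) |>    # three-word sequences
  count(trigram, sort = TRUE)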

pres_bigrams |> 
  separate(bigram, c("word1", "word2"), sep = " ") |> 
  filter(!word1 %in% snowball$word) |>
  filter(!word2 %in% snowball$word) |> 
  unite(bigram, word1, word2, sep = " ") |> 
  count(bigram, sort = T)
# A tibble: 2,399 × 2
   bigram                 n
   <chr>              <int>
 1 let us                18
 2 fellow citizens       16
 3 united states         11
 4 american people        6
 5 federal government     4
 6 government can         4
 7 one section            4
 8 vice president         4
 9 will endure            4
10 among us               3
# ℹ 2,389 more rows

5b. Above is a table of the most common bigrams after removing stop words: let us (18), fellow citizens (16), united states (11), american people (6), federal government (4), government can (4), one section (4), vice president (4), will endure (4), and among us (3).
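
Because pres_bigrams kept only the bigram column, the author was dropped. A sketch (a hypothetical variation) that repeats the stop-word filtering while keeping author, so the cleaned bigrams can be compared across speeches:

inaug_speeches |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |> 
  separate(bigram, c("word1", "word2"), sep = " ") |> 
  filter(!word1 %in% snowball$word,
         !word2 %in% snowball$word) |> 
  unite(bigram, word1, word2, sep = " ") |> 
  count(author, bigram, sort = TRUE)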

first_word <- c("of", "in")                                  # these need to be lowercase

pres_bigrams |> 
  count(bigram, sort = T) |> 
  separate(bigram, c("word1", "word2"), sep = " ") |>       # separate the two words
  filter(word1 %in% first_word) |>                          # find first words from our list
  count(word1, word2, wt = n, sort = TRUE)
# A tibble: 388 × 3
   word1 word2     n
   <chr> <chr> <int>
 1 of    the     146
 2 in    the      80
 3 of    our      55
 4 of    a        24
 5 of    this     16
 6 in    a        14
 7 of    us       11
 8 in    our      10
 9 of    all      10
10 in    this      9
# ℹ 378 more rows
first_word <- c("of", "in")                                  # these need to be lowercase

pres_bigrams |> 
  count(bigram, sort = T) |> 
  separate(bigram, c("word1", "word2"), sep = " ") |>       # separate the two words
  filter(word1 %in% first_word) |>                          # find first words from our list
  count(word1, word2, wt = n, sort = TRUE) |> 
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) |>     # put the words in order
  group_by(word1) |> 
  top_n(5) |> 
  ggplot(aes(word2, n, fill = word1)) +                          
  scale_fill_viridis_d() +                                           # set the color palette
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip() +
  theme_minimal()
Selecting by n

  1. Above is a graph of the most commonly used leading words and the words that follow them. The words I decided to look at are "in" and "of". The purple panel shows "in" and the words that most often follow it: "the" (80), "a" (14), "our" (10), "this" (9), "all" (8), "any" (8), and "which" (8). The yellow panel shows "of" and the words that most often follow it: "the" (146), "our" (55), "a" (24), "this" (16), and "us" (11).
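
To look at different leading words, only the first_word vector needs to change; a sketch with a hypothetical alternative pair:

first_word <- c("we", "our")                               # hypothetical alternative leading words

pres_bigrams |> 
  count(bigram, sort = T) |> 
  separate(bigram, c("word1", "word2"), sep = " ") |> 
  filter(word1 %in% first_word) |> 
  count(word1, word2, wt = n, sort = TRUE)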